import redis
import hashlib
class IncrementalSpider(scrapy.Spider):
    """Spider that only schedules item pages it has not scheduled before.

    Every discovered item URL is reduced to an MD5 fingerprint and recorded
    in Redis under the key ``crawled:<fingerprint>``. A URL whose key is
    already present is skipped, so repeated runs only follow new links.

    The detail-page callback ``parse_item`` is referenced by ``parse`` but is
    not defined in this section; it must be provided elsewhere (for example
    by a subclass).
    """

    def __init__(self, *args, **kwargs):
        """Initialise the spider and open the Redis client used for de-duplication.

        All positional and keyword arguments are forwarded unchanged to the
        parent ``scrapy.Spider`` constructor. The Redis client is created with
        the library's default connection settings.
        """
        super().__init__(*args, **kwargs)
        self.redis = redis.Redis()

    def parse(self, response):
        """Yield a request for each not-yet-seen item link on a listing page.

        Args:
            response: The listing-page response; each ``div.item`` element is
                expected to contain an ``<a href=...>`` to the item page.

        Yields:
            scrapy.Request: One request per new item URL, handled by
            ``self.parse_item``.
        """
        for item in response.css('div.item'):
            href = item.css('a::attr(href)').get()
            if not href:
                # No link in this item (``get()`` returned None) or an empty
                # href: nothing to follow, and ``None.encode()`` would crash.
                continue

            # Resolve against the page URL: scrapy.Request rejects relative
            # URLs, and fingerprinting the absolute form keeps identical
            # relative hrefs found on different pages from colliding.
            url = response.urljoin(href)

            # Build the fingerprint used as the Redis key suffix.
            fingerprint = hashlib.md5(url.encode()).hexdigest()

            # SET ... NX checks and marks in one atomic round trip; it returns
            # a truthy value only when the key did not exist yet, i.e. the URL
            # has not been claimed by this or any other run sharing the store.
            # NOTE(review): the URL is marked when the request is scheduled,
            # not when it has been downloaded successfully, so a failed
            # download is not retried on later runs - confirm this is intended.
            if self.redis.set(f'crawled:{fingerprint}', 1, nx=True):
                yield scrapy.Request(url, callback=self.parse_item)