The previous article covered replacing the scrapy-redis queue with an in-memory queue. This article deals with the problem that the in-memory queue loses its data when the program restarts.
Solution
When the program stops normally, the requests still sitting in the queue can be written back into Redis, so that the next time the program starts it can pick them up and continue processing.
When writing middlewares and pipelines we often come across the spider_closed method. It is only called when the program shuts down cleanly, so all we need to do is implement this method in an extension and save the pending data back to Redis at shutdown.
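For context, here is a minimal sketch of the consuming side on restart, assuming the project uses scrapy-redis's RedisSpider with the default "%(name)s:start_urls" key; the spider name and parse logic are purely illustrative:

from scrapy_redis.spiders import RedisSpider

class MySpider(RedisSpider):
    name = 'my_spider'
    # Defaults to '%(name)s:start_urls', i.e. 'my_spider:start_urls',
    # which is the same key the extension below pushes the pending data into.
    redis_key = 'my_spider:start_urls'

    def parse(self, response):
        # Illustrative only: process the restored request's response.
        yield {'url': response.url}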
Code
import logging
import time

from scrapy import signals
from scrapy.exceptions import NotConfigured


class StopExtension(object):

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def __init__(self, crawler):
        self.crawler = crawler
        # The setting acts only as an on/off switch here: a value of 0 disables the extension.
        self.interval = crawler.settings.getfloat('CORE_METRICS_INTERVAL', 5)
        if not self.interval:
            raise NotConfigured
        cs = crawler.signals
        cs.connect(self.spider_closed, signal=signals.spider_closed)

    def set_redis_pipeline_right(self, server, key_name, data_list):
        # Push the pending data back into Redis, retrying up to 5 times.
        begin_time = time.time()
        for _ in range(5):
            try:
                with server.pipeline(transaction=False) as pipe:
                    for data in data_list:
                        pipe.rpush(key_name, data)
                    pipe.execute()
                logging.info(f'ocrStopExtension insert redis time:{time.time() - begin_time}')
                return
            except Exception as exc:
                logging.warning(f'ocrStopExtension redis write failed, retrying: {exc}')

    def handle_data(self, request):
        # Serialize the request here into whatever format the start_urls key expects.
        return ""

    def spider_closed(self, spider):
        engine = self.crawler.engine
        server = spider.server
        data_list = []
        # Drain the in-memory scheduler queue.
        while engine.slot.scheduler.queue:
            request = engine.slot.scheduler.queue.pop()
            data_list.append(self.handle_data(request))
            logging.info(f"ocrStopExtension scheduler --> {request}")
        # Requests currently in the downloader have left the queue but are not finished yet.
        for request in self.crawler.engine.downloader.active:
            data_list.append(self.handle_data(request))
            logging.info(f"ocrStopExtension downloader --> {request}")
        self.set_redis_pipeline_right(server, "%s:start_urls" % spider.name, data_list)
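To make this run, the extension still has to be registered in settings.py. A minimal sketch, assuming the class lives in myproject/extensions.py (the module path and priority value are illustrative):

# settings.py (sketch; the module path 'myproject.extensions' is an assumption)
EXTENSIONS = {
    'myproject.extensions.StopExtension': 500,
}

# Setting this to 0 disables the extension (NotConfigured is raised in __init__).
CORE_METRICS_INTERVAL = 5

Note that handle_data above returns an empty string as a placeholder; in practice it should return something the spider can turn back into a request on startup, for example request.url when the start_urls key holds plain URLs.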