I have 20 spiders in one project. Each spider has a different task and URL to crawl, but the data are similar, so I'm using a shared `items.py` and `pipelines.py` for all of them. In my pipeline class, I want a specified spider to stop crawling when certain conditions are satisfied.
I've tried

```python
raise DropItem("terminated by me")
```

and

```python
raise CloseSpider('terminate by me')
```

but both of them only affect the item or response currently being processed; the spider keeps crawling the next_page URL!
Part of my `pipelines.py`:
```python
import pymongo
from scrapy import log                    # legacy logging API, matching log.msg below
from scrapy.conf import settings          # legacy global settings access
from scrapy.exceptions import CloseSpider, DropItem


class MongoDBPipeline(object):
    def __init__(self):
        connection = pymongo.MongoClient(
            settings['MONGODB_SERVER'],
            settings['MONGODB_PORT']
        )
        db = connection[settings['MONGODB_DB']]
        self.collection = db[settings['MONGODB_COLLECTION']]

    def process_item(self, item, spider):
        raise CloseSpider('terminateby')
        raise DropItem("terminateby")  # unreachable; the two raises were tested one at a time
        valid = True
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
        if valid:
            self.collection.insert(dict(item))
            log.msg("Items added to MongoDB database!",
                    level=log.DEBUG, spider=spider)
        return item
```
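For context, what I actually want is something along these lines. This is only a sketch (the class name and the `'STOP'` condition are placeholders I made up), and I don't know whether driving the engine from a pipeline like this is the right approach:

```python
from scrapy.exceptions import DropItem


class StopAwarePipeline(object):
    """Sketch: close a spider from a pipeline via the crawler engine."""

    def process_item(self, item, spider):
        # Placeholder condition; substitute the real business rule here.
        if item.get('title') == 'STOP':
            # Ask the engine to close this spider. Requests already in
            # flight may still finish, but nothing new is scheduled.
            spider.crawler.engine.close_spider(spider, 'terminated by pipeline')
            raise DropItem('terminated by me')
        return item
```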
And my spider:
```python
import scrapy
import json
from Maio.items import MaioItem


class ZhilevanSpider(scrapy.Spider):
    name = 'tehran'
    allowed_domains = []
    start_urls = ['https://search.Maio.io/json/']
    place_code = str(1)
    next_pdate = None  # no pagination cursor on the first request

    def start_requests(self):
        request_body = {
            "id": 2,
            "jsonrpc": "2.0",
            "method": "getlist",
            "params": [[["myitem", 0, [self.place_code]]], self.next_pdate]
        }
        request_body = json.dumps(request_body)
        print(request_body)
        yield scrapy.Request(
            url='https://search.Maio.io/json/',
            method="POST",
            body=request_body,
            callback=self.parse,
            headers={'Content-type': 'application/json;charset=UTF-8'}
        )

    def parse(self, response):
        result = json.loads(response.body.decode('utf-8'))
        next_pdate = result["result"]["last_post_date"]
        print(result["result"]["last_post_date"])
        for item in result["result"]["post_list"]:
            print("title : {0}".format(item["title"]))
            ads = MaioItem()
            ads['title'] = item["title"]
            ads['desc'] = item["desc"]
            yield ads
        if next_pdate:
            request_body = {
                "id": 2,
                "jsonrpc": "2.0",
                "method": "getlist",
                "params": [[["myitem", 0, [self.place_code]]], next_pdate]
            }
            request_body = json.dumps(request_body)
            yield scrapy.Request(
                url='https://search.Maio.io/json/',
                method="POST",
                body=request_body,
                callback=self.parse,
                headers={'Content-type': 'application/json; charset=UTF-8'}
            )
```
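As far as I understand, `CloseSpider` is meant to be raised from a spider callback rather than from a pipeline. A sketch of what that would look like (the spider name and the empty-page condition are placeholders, not my real logic):

```python
import json

import scrapy
from scrapy.exceptions import CloseSpider


class StoppableSpider(scrapy.Spider):
    """Sketch: stop pagination by raising CloseSpider inside a callback."""
    name = 'stoppable'
    start_urls = ['https://search.Maio.io/json/']

    def parse(self, response):
        result = json.loads(response.body.decode('utf-8'))
        # Placeholder condition: stop when the API returns no more posts.
        if not result["result"]["post_list"]:
            # Raised from a spider callback (not a pipeline), CloseSpider
            # makes the engine close the spider instead of continuing.
            raise CloseSpider('no more posts')
        for item in result["result"]["post_list"]:
            yield {'title': item['title']}
```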
**Update**

Even when I put `sys.exit("SHUT DOWN EVERYTHING!")` in the pipeline, the next page still runs. I see the following in the log on every page:

```
sys.exit("SHUT DOWN EVERYTHING!")
SystemExit: SHUT DOWN EVERYTHING!
```
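(Side note: I know Scrapy ships a built-in CloseSpider extension that can be driven from settings, as in the sketch below, but those settings only close a spider on counts or timeouts, not on a custom condition like mine.)

```python
# settings.py (or a spider's custom_settings) - built-in CloseSpider extension.
# The threshold values here are illustrative, not from my project.
CLOSESPIDER_ITEMCOUNT = 100   # close the spider after 100 scraped items
CLOSESPIDER_PAGECOUNT = 50    # close the spider after 50 crawled responses
CLOSESPIDER_TIMEOUT = 3600    # close the spider after one hour
```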