Making the yield statement hold off until the next domain is crawled cannot be done: Scrapy crawls in parallel, and there is no way to make domain crawling happen serially.
What you can do is write a pipeline that accumulates the items and yields the entire structure when the spider closes, something like:
    # this assumes your item looks like the following
    from scrapy.item import Item, Field

    class MyItem(Item):
        domain = Field()
        hs = Field()


    import collections

    class DomainPipeline(object):

        def __init__(self):
            # one accumulated set of hs values per domain
            self.accumulator = collections.defaultdict(set)

        def process_item(self, item, spider):
            self.accumulator[item['domain']].update(item['hs'])
            return item

        def close_spider(self, spider):
            for domain, hs in self.accumulator.items():
                yield MyItem(domain=domain, hs=hs)
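For the pipeline to run at all you also have to register it in your project settings. A minimal sketch, assuming the class lives in a module like myproject.pipelines ('myproject' is just a placeholder for your actual project package):

    # settings.py -- 'myproject' is a placeholder, adjust the dotted path to your project
    ITEM_PIPELINES = {
        'myproject.pipelines.DomainPipeline': 300,
    }

(On older Scrapy versions ITEM_PIPELINES is a plain list of class paths rather than a dict with priorities.)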
usage:
>>> from scrapy.item import Item, Field
>>> class MyItem(Item):
... domain = Field()
... hs = Field()
...
>>> from collections import defaultdict
>>> accumulator = defaultdict(set)
>>> items = []
>>> for i in range(10):
... items.append(MyItem(domain='google.com', hs=[str(i)]))
...
>>> items
[{'domain': 'google.com', 'hs': ['0']}, {'domain': 'google.com', 'hs': ['1']}, {'domain': 'google.com', 'hs': ['2']}, {'domain': 'google.com', 'hs': ['3']}, {'domain': 'google.com', 'hs': ['4']}, {'domain': 'google.com', 'hs': ['5']}, {'domain': 'google.com', 'hs': ['6']}, {'domain': 'google.com', 'hs': ['7']}, {'domain': 'google.com', 'hs': ['8']}, {'domain': 'google.com', 'hs': ['9']}]
>>> for item in items:
... accumulator[item['domain']].update(item['hs'])
...
>>> accumulator
defaultdict(<type 'set'>, {'google.com': set(['1', '0', '3', '2', '5', '4', '7', '6', '9', '8'])})
>>> for domain, hs in accumulator.items():
... print MyItem(domain=domain, hs=hs)
...
{'domain': 'google.com',
'hs': set(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'])}
>>>