本文整理汇总了Python中seesaw.config.realize函数的典型用法代码示例。如果您正苦于以下问题:Python realize函数的具体用法?Python realize怎么用?Python realize使用的例子?那么恭喜您,这里精选的函数代码示例或许可以为您提供帮助。
在下文中一共展示了realize函数的20个代码示例,这些例子默认根据受欢迎程度排序。您可以为喜欢或者感觉有用的代码点赞,您的评价将有助于我们的系统推荐出更棒的Python代码示例。
示例1: data
def data(self, item):
    """Build the payload dictionary for ``item``.

    The payload always carries the realized ``self.downloader`` and the
    fixed ``"api_version"`` of ``"2"``.  The realized ``self.version`` is
    added under ``"version"`` only when ``self.version`` is truthy.
    """
    payload = dict(
        downloader=realize(self.downloader, item),
        api_version="2",
    )
    if not self.version:
        return payload
    payload["version"] = realize(self.version, item)
    return payload
开发者ID:ArchiveTeam,项目名称:seesaw-kit,代码行数:8,代码来源:tracker.py
示例2: stdin_data
def stdin_data(self, item):
    """Return the file list to feed to the child process, as UTF-8 bytes.

    Each realized entry of ``self.files`` is written on its own line as a
    path relative to the realized ``self.target_source_path``.
    """
    lines = []
    for entry in realize(self.files, item):
        # Realize the file first and the base directory second, once per
        # entry, mirroring the evaluation order of the inline expression.
        path = realize(entry, item)
        base = realize(self.target_source_path, item)
        lines.append("%s\n" % os.path.relpath(path, base))
    return "".join(lines).encode('utf-8')
开发者ID:VADemon,项目名称:seesaw-kit,代码行数:9,代码来源:externalprocess.py
示例3: process
def process(self, item):
    """Compute per-group byte totals and store the stats on ``item``.

    For every ``(group, files)`` pair in ``self.file_groups`` the sizes of
    the realized files are summed.  The resulting stats dictionary starts
    from ``self.defaults`` and gains ``"item"``, ``"bytes"`` and, when
    ``self.id_function`` is truthy, ``"id"``.  The realized stats are
    written to ``item["stats"]``; nothing is returned.
    """
    total_bytes = {}
    # ``items()`` exists on both Python 2 and Python 3, whereas
    # ``iteritems()`` raises AttributeError on Python 3.
    for group, files in self.file_groups.items():
        total_bytes[group] = sum(
            os.path.getsize(f) for f in realize(files, item)
        )

    stats = {}
    stats.update(self.defaults)
    stats["item"] = item["item_name"]
    stats["bytes"] = total_bytes

    if self.id_function:
        stats["id"] = self.id_function(item)

    item["stats"] = realize(stats, item)
开发者ID:chfoo,项目名称:isohunt-grab,代码行数:14,代码来源:pipeline.py
示例4: process
def process(self, item):
    """Start the external process for ``item`` and send it its stdin data.

    Runs inside ``self.task_cwd()``.  The child's output and end events are
    routed to ``self.on_subprocess_stdout`` and ``self.on_subprocess_end``;
    after the process is started, ``self.stdin_data(item)`` is written to
    its stdin, which is then closed.
    """
    with self.task_cwd():
        popen_options = dict(
            args=realize(self.args, item),
            env=realize(self.env, item),
            stdin=subprocess.PIPE,
            close_fds=True,
        )
        proc = AsyncPopen(**popen_options)

        # Hook up the callbacks before the process starts producing events.
        proc.on_output += functools.partial(
            self.on_subprocess_stdout, proc, item)
        proc.on_end += functools.partial(self.on_subprocess_end, item)

        proc.run()

        child_stdin = proc.stdin
        child_stdin.write(self.stdin_data(item))
        child_stdin.close()
开发者ID:daxelrod,项目名称:seesaw-kit,代码行数:16,代码来源:externalprocess.py
示例5: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a FurAffinity item.

    ``item['item_name']`` must have the form ``<type>:<value>`` with
    ``<type>`` being ``image`` or ``imagelogin``.  The parsed parts are
    stored back on the item as ``item_type`` and ``item_value``.

    Both item types queue the same 36 x 36 view URLs (``<value>`` followed
    by two characters from ``0-9a-z``); they differ only in cookie
    handling: ``image`` runs with ``--no-cookies`` while ``imagelogin``
    loads ``cookies.txt``.

    Returns the result of ``realize(wget_args, item)``.

    Raises:
        AssertionError: if the item name has no ``:`` or the type is not
            one of the accepted values (when assertions are enabled).
        Exception: ``'Unknown item'`` for an unsupported type when
            assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "furaffinity.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "furaffinity.net",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "furaffinity-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("furaffinity-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('image', 'imagelogin')

    # The two item types share their URL set; only the cookie flags differ.
    if item_type == 'image':
        cookie_args = ["--no-cookies"]
    elif item_type == 'imagelogin':
        cookie_args = ["--load-cookies", "cookies.txt"]
    else:
        raise Exception('Unknown item')

    # ``ascii_lowercase`` is locale-independent and available on both
    # Python 2 and 3, unlike ``string.lowercase``.
    suffixes = string.digits + string.ascii_lowercase
    wget_args.extend(
        'http://www.furaffinity.net/view/{0}{1}{2}/'.format(item_value, a, b)
        for a in suffixes
        for b in suffixes
    )
    wget_args.extend(cookie_args)

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:furaffinity-grab,代码行数:60,代码来源:pipeline.py
示例6: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a RuTracker item.

    ``item['item_name']`` is split at the first ``:`` into a type
    (``thread`` or ``forum``) and a value, both of which are stored back on
    the item.  For each decimal digit appended to the value, four URLs
    (the page itself plus three API endpoints) are queued.

    Returns the result of ``realize(wget_args, item)``.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "rutracker.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "rutracker.org",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "rutracker-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("rutracker-user: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    item_type, item_value = full_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('thread', 'forum')

    # URL templates per item type; {0} is the item value, {1} the digit.
    if item_type == 'thread':
        templates = (
            'http://rutracker.org/forum/viewtopic.php?t={0}{1}',
            'http://api.rutracker.org/v1/get_peer_stats?by=topic_id&val={0}{1}',
            'http://api.rutracker.org/v1/get_tor_hash?by=topic_id&val={0}{1}',
            'http://api.rutracker.org/v1/get_tor_topic_data?by=topic_id&val={0}{1}',
        )
    elif item_type == 'forum':
        templates = (
            'http://rutracker.org/forum/viewforum.php?f={0}{1}',
            'http://api.rutracker.org/v1/get_forum_name?by=forum_id&val={0}{1}',
            'http://api.rutracker.org/v1/get_forum_data?by=forum_id&val={0}{1}',
            'http://api.rutracker.org/v1/static/pvc/f/{0}{1}',
        )
    else:
        raise Exception('Unknown item')

    for digit in string.digits:
        for template in templates:
            wget_args.append(template.format(item_value, digit))

    if 'bind_address' in globals():
        address = globals()['bind_address']
        wget_args.extend(['--bind-address', address])
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:rutracker-grab,代码行数:60,代码来源:pipeline.py
示例7: process_body
def process_body(self, body, item):
    """Handle the tracker's JSON response and start the matching upload.

    ``body`` is parsed as JSON.  Without an ``"upload_target"`` key the
    item is scheduled for retry.  An ``rsync://`` target starts a
    ``RsyncUpload``; an ``http(s)://`` target starts a ``CurlUpload``, which
    requires exactly one realized file.  Any other target, or a wrong file
    count for Curl, fails the item.
    """
    data = json.loads(body)

    if "upload_target" not in data:
        item.log_output("Tracker did not provide an upload target.")
        self.schedule_retry(item)
        return

    target = data["upload_target"]
    files = realize(self.files, item)

    if re.match(r"^rsync://", target):
        item.log_output("Uploading with Rsync to %s" % target)
        inner_task = RsyncUpload(
            target,
            files,
            target_source_path=self.rsync_target_source_path,
            bwlimit=self.rsync_bwlimit,
            extra_args=self.rsync_extra_args,
            max_tries=1,
        )
    elif re.match(r"^https?://", target):
        item.log_output("Uploading with Curl to %s" % target)
        if len(files) != 1:
            item.log_output("Curl expects to upload a single file.")
            self.fail_item(item)
            return
        inner_task = CurlUpload(
            target,
            files[0],
            self.curl_connect_timeout,
            self.curl_speed_limit,
            self.curl_speed_time,
            max_tries=1,
        )
    else:
        item.log_output("Received invalid upload type.")
        self.fail_item(item)
        return

    # Forward the inner task's outcome to this task's own handlers.
    inner_task.on_complete_item += self._inner_task_complete_item
    inner_task.on_fail_item += self._inner_task_fail_item
    inner_task.enqueue(item)
开发者ID:chfoo,项目名称:isohunt-grab,代码行数:32,代码来源:pipeline.py
示例8: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a GameFront item.

    ``item['item_name']`` is split at the first ``:`` into a type (``file``
    or ``singlefile``) and a value, both stored back on the item.  A
    ``file`` item queues the value followed by each decimal digit; a
    ``singlefile`` item queues the single file page and additionally walks
    the site's token request flow through a ``requests`` session.

    Returns the result of ``realize(wget_args, item)``.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "gamefront.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "gamefront.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "gamefront-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("gamefront-user: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    item_type, item_value = full_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('file', 'singlefile')

    if item_type == 'file':
        for digit in string.digits:
            wget_args.append(
                'http://www.gamefront.com/files/{0}{1}'.format(item_value, digit))
    elif item_type == 'singlefile':
        wget_args.append('http://www.gamefront.com/files/{0}'.format(item_value))

        file_page = 'http://www.gamefront.com/files/' + item_value
        session1 = requests.Session()
        mainpage = session1.get(file_page).text

        # One search with a capturing group both detects and extracts the
        # token.
        token_match = re.search(r"plopMe\('[0-9]+',\s+'([^']+)'\)", mainpage)
        if token_match:
            plopme = token_match.group(1)
            print('Received token ' + plopme + '.')
            reply = session1.post(
                'http://www.gamefront.com/files/service/request',
                data = {'token':plopme},
                headers={'referer': file_page},
            ).text
            print('Received ' + reply + '.')
            # NOTE(review): the original indentation was lost; this request
            # is assumed to belong to the token branch -- confirm upstream.
            session1.get(
                'http://www.gamefront.com/files/service/thankyou?id=' + item_value,
                headers={'referer': file_page},
            )
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        address = globals()['bind_address']
        wget_args.extend(['--bind-address', address])
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:gamefront-grab,代码行数:59,代码来源:pipeline.py
示例9: enqueue
def enqueue(self, item):
    """Start ``item``, initialise its download state and process it.

    Sets ``item["tries"]`` to 1, stores the realized URL list under
    ``'WgetDownloadMany.urls'`` and resets the URL index and current URL
    before handing the item to ``self.process``.
    """
    self.start_item(item)
    description = item.description()
    item.log_output("Starting %s for %s\n" % (self, description))

    item["tries"] = 1
    urls = realize(self.unrealized_urls, item)
    initial_state = (
        ('WgetDownloadMany.urls', urls),
        ('WgetDownloadMany.urls_index', 0),
        ('WgetDownloadMany.current_url', None),
    )
    for key, value in initial_state:
        item[key] = value

    self.process(item)
开发者ID:ArchiveTeam,项目名称:puush-grab,代码行数:8,代码来源:pipeline.py
示例10: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a MusicBrainz item.

    ``item['item_name']`` must have the form ``<sort>:<item>:<file>``.  The
    ``<item>`` part is stored on the item as ``item_item``.  The URL list
    is downloaded from ``archive.org/download/<item>/<file>`` and every
    line of the response is queued as a URL.

    Returns the result of ``realize(wget_args, item)``.

    Raises:
        AssertionError: if the item name contains no ``:`` (when assertions
            are enabled).
        ValueError: if the item name does not split into three parts.
        Exception: if the list download does not return status 200.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "musicbrainz.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "musicbrainz-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("musicbrainz-user: %(item_name)s"),
    ]

    item_name = item["item_name"]
    assert ":" in item_name
    item_sort, item_item, item_file = item_name.split(":", 2)

    item["item_item"] = item_item

    # Build the list URL once so the error message reports the URL that was
    # actually requested (previously it showed an https:// variant).
    list_url = "http://archive.org/download/{0}/{1}".format(item_item, item_file)
    item_list = requests.get(list_url)

    if item_list.status_code != 200:
        raise Exception(
            "You received status code %d with URL %s"
            % (item_list.status_code, list_url)
        )

    wget_args.extend("{0}".format(url) for url in item_list.text.splitlines())

    if "bind_address" in globals():
        wget_args.extend(["--bind-address", globals()["bind_address"]])
        print("")
        print("*** Wget will bind address at {0} ***".format(globals()["bind_address"]))
        print("")

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:musicbrainz-grab,代码行数:58,代码来源:pipeline.py
示例11: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a Twitpic API item.

    ``item['item_name']`` must have the form ``imageapi:<value>``.  The
    parsed parts are stored back on the item as ``item_type`` and
    ``item_value``.  For every character in ``0-9a-z`` appended to the
    value, the media and first-page comments API URLs are queued.

    The user agent and the extra language header are picked at random from
    ``USER_AGENTS`` and ``ACCEPT_LANGUAGE_HEADERS``.

    Returns the result of ``realize(wget_args, item)``.

    Raises:
        AssertionError: if the item name has no ``:`` or the type is not
            ``imageapi`` (when assertions are enabled).
        Exception: ``'Unknown item'`` for an unsupported type when
            assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", random.choice(USER_AGENTS),
        "-nv",
        "--lua-script", "twitpic-api.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--no-cookies",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--span-hosts",
        "--waitretry", "30",
        "--domains", "twitpic.com,cloudfront.net,twimg.com,amazonaws.com",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "twitpic-api-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("twitpic-api-user: %(item_name)s"),
        "--header", "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "--header", "DNT: 1",
        "--header", random.choice(ACCEPT_LANGUAGE_HEADERS),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # The whitelist must match the type handled below.  It previously
    # listed ('image', 'user', 'tag', 'event'), none of which is handled,
    # so every item failed either here or in the ``else`` branch.
    assert item_type in ('imageapi',)

    if item_type == 'imageapi':
        # ``ascii_lowercase`` is locale-independent and available on both
        # Python 2 and 3, unlike ``string.lowercase``.
        for s in string.digits + string.ascii_lowercase:
            wget_args.append(
                'http://api.twitpic.com/2/media/show.json?id={0}{1}'.format(item_value, s))
            wget_args.append(
                'http://api.twitpic.com/2/comments/show.json?media_id={0}{1}&page=1'.format(item_value, s))
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:twitpic-api-grab,代码行数:58,代码来源:pipeline.py
示例12: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a portalgraphics item.

    ``item['item_name']`` must have the form ``<type>:<value>`` with
    ``<type>`` being ``image_id`` or ``user_id``.  The parsed parts are
    stored back on the item as ``item_type`` and ``item_value``.  An
    ``image_id`` item queues the illustration, movie-data and address pages
    (including language variants); a ``user_id`` item queues the profile
    page.

    Returns the result of ``realize(wget_args, item)``.

    Raises:
        AssertionError: if the item name has no ``:`` or the type is not
            one of the accepted values (when assertions are enabled).
        Exception: ``'Unknown item'`` for an unsupported type when
            assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--no-cookies",
        "--lua-script", "portalgraphics.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "portalgraphics.net",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "portalgraphics-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("portalgraphics-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    # Split only at the first colon: a maxsplit of 2 can yield three parts
    # and break the two-name unpacking when the value itself contains ':'.
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('image_id', 'user_id')

    if item_type == 'image_id':
        wget_args.append('http://www.portalgraphics.net/pg/illust/?image_id={0}'.format(item_value))
        wget_args.append('http://www.portalgraphics.net/pg/illust/?image_id={0}&lang=ja'.format(item_value))
        wget_args.append('http://www.portalgraphics.net/pg/illust/?image_id={0}&lang=en'.format(item_value))
        wget_args.append('http://www.portalgraphics.net/pg/movie/pg_player/res_movie_data.php?mid={0}'.format(item_value))
        wget_args.append('http://www.portalgraphics.net/pg/movie/pg_player/res_movie_data.php?mid={0}&lang=ja'.format(item_value))
        wget_args.append('http://www.portalgraphics.net/pg/movie/pg_player/res_movie_data.php?mid={0}&lang=en'.format(item_value))
        wget_args.append('http://www.portalgraphics.net/pg/movie/address.php?image%5Fid={0}'.format(item_value))
        wget_args.append('http://www.portalgraphics.net/pg/movie/address.php?image_id={0}'.format(item_value))
    elif item_type == 'user_id':
        wget_args.append('http://portalgraphics.net/pg/profile/?user_id={0}'.format(item_value))
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:portalgraphics-grab,代码行数:58,代码来源:pipeline.py
示例13: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a SourceForge item.

    ``item['item_name']`` must have the form ``project:<value>``.  The
    parsed parts are stored back on the item as ``item_type`` and
    ``item_value``, and the project's pages, REST endpoints and project
    subdomain are queued.

    Returns the result of ``realize(wget_args, item)``.

    Raises:
        AssertionError: if the item name has no ``:`` or the type is not
            ``project`` (when assertions are enabled).
        Exception: ``'Unknown item'`` for an unsupported type when
            assertions are disabled.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "sourceforge.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "sourceforge.net",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "sourceforge-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("sourceforge-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_type, item_value = item_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    # The trailing comma makes this a one-element tuple.  Without it,
    # ``('project')`` is a plain string and ``in`` is a substring test that
    # also accepts values such as ``''`` or ``'proj'``.
    assert item_type in ('project',)

    if item_type == 'project':
        wget_args.append('http://sourceforge.net/projects/{0}/'.format(item_value))
        wget_args.append('http://sourceforge.net/projects/{0}/?source=directory'.format(item_value))
        wget_args.append('http://sourceforge.net/projects/{0}/?source=directory-featured'.format(item_value))
        wget_args.append('http://sourceforge.net/projects/{0}/?source=frontpage&position=1'.format(item_value))
        wget_args.append('http://sourceforge.net/projects/{0}/?source=frontpage'.format(item_value))
        wget_args.append('http://sourceforge.net/projects/{0}/'.format(item_value))
        wget_args.append('http://sourceforge.net/p/{0}/'.format(item_value))
        wget_args.append('http://sourceforge.net/rest/p/{0}/'.format(item_value))
        wget_args.append('http://sourceforge.net/rest/p/{0}?doap'.format(item_value))
        wget_args.append('http://{0}.sourceforge.net/'.format(item_value))
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:sourceforge-grab,代码行数:57,代码来源:pipeline.py
示例14: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a Panoramio item.

    ``item['item_name']`` is split at the first ``:`` into a type
    (``photos`` or ``users``) and a value of the form ``<start>-<stop>``;
    both parts are stored back on the item.  For every id in the inclusive
    range a WARC header and the matching page URL are queued.

    Returns the result of ``realize(wget_args, item)``.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--no-cookies",
        "--lua-script", "panoramio.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "panoramio.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "panoramio-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("panoramio-item: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    item_type, item_value = full_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('photos', 'users')

    # Both types walk an inclusive id range; only the templates differ.
    if item_type == 'photos':
        header_template = 'panoramio-photo: {i}'
        url_template = 'http://www.panoramio.com/photo/{i}'
    elif item_type == 'users':
        header_template = 'panoramio-user: {i}'
        url_template = 'http://www.panoramio.com/user/{i}'
    else:
        raise Exception('Unknown item')

    start, stop = item_value.split('-')
    for number in range(int(start), int(stop)+1):
        wget_args.extend(['--warc-header', header_template.format(i=number)])
        wget_args.append(url_template.format(i=number))

    if 'bind_address' in globals():
        address = globals()['bind_address']
        wget_args.extend(['--bind-address', address])
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:panoramio-grab,代码行数:57,代码来源:pipeline.py
示例15: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a 500px item.

    ``item['item_name']`` is split at the first ``:`` into a type and a
    value, both stored back on the item.  A ``photos`` item carries a
    ``;``-separated id list and queues the photo page plus two API URLs
    per id; an ``all`` item carries a ``<start>-<end>`` range and queues
    only the photo page for each id in the inclusive range.

    Returns the result of ``realize(wget_args, item)``.
    """
    wget_args = [
        WGET_LUA,
        '-U', USER_AGENT,
        '-nv',
        '--no-cookies',
        '--lua-script', '500px.lua',
        '-o', ItemInterpolation('%(item_dir)s/wget.log'),
        '--no-check-certificate',
        '--output-document', ItemInterpolation('%(item_dir)s/wget.tmp'),
        '--truncate-output',
        '-e', 'robots=off',
        '--rotate-dns',
        '--recursive', '--level=inf',
        '--no-parent',
        '--page-requisites',
        '--timeout', '30',
        '--tries', 'inf',
        '--domains', '500px.com',
        '--span-hosts',
        '--waitretry', '30',
        '--warc-file', ItemInterpolation('%(item_dir)s/%(warc_file_base)s'),
        '--warc-header', 'operator: Archive Team',
        '--warc-header', '500px-dld-script-version: ' + VERSION,
        '--warc-header', ItemInterpolation('500px-item: %(item_name)s'),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    item_type, item_value = full_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    if item_type == 'photos':
        # Extra API URLs fetched per photo, after the photo page itself.
        api_templates = (
            'https://api.500px.com/v1/photos/{}/comments?sort=created_at&include_subscription=1&include_flagged=1&nested=1&page=1&rpp=30',
            'https://api.500px.com/v1/photos?image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14&expanded_user_info=true&include_tags=true&include_geo=true&include_equipment_info=true&include_licensing=true&include_releases=true&liked_by=1&following_sample=100&ids={}',
            # Disabled in the original:
            # 'https://api.500px.com/v1/photos/{}/navigation?from=user&formats=jpeg%2Clytro&image_size%5B%5D=1&image_size%5B%5D=2&image_size%5B%5D=32&image_size%5B%5D=31&image_size%5B%5D=33&image_size%5B%5D=34&image_size%5B%5D=35&image_size%5B%5D=36&image_size%5B%5D=2048&image_size%5B%5D=4&image_size%5B%5D=14',
        )
        for photo_id in item_value.split(';'):
            wget_args.extend(['--warc-header', '500px-photo: {}'.format(photo_id)])
            wget_args.append('https://500px.com/photo/{}'.format(photo_id))
            for template in api_templates:
                wget_args.append(template.format(photo_id))
    elif item_type == 'all':
        start, end = item_value.split('-')
        for photo_id in range(int(start), int(end)+1):
            wget_args.extend(['--warc-header', '500px-photo: {}'.format(photo_id)])
            wget_args.append('https://500px.com/photo/{}'.format(photo_id))
    else:
        raise Exception('Unknown item')

    if 'bind_address' in globals():
        address = globals()['bind_address']
        wget_args.extend(['--bind-address', address])
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:500px-grab,代码行数:57,代码来源:pipeline.py
示例16: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a Yuku item.

    ``item['item_name']`` has four ``:``-separated parts, for example
    ``yuku:10threads:deltasforest29697:17``.  The type, value and thread
    parts are stored back on the item.  A ``thread`` item queues the one
    topic URL; a ``10threads`` item queues the thread id followed by each
    decimal digit.

    Returns the result of ``realize(wget_args, item)``.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "yuku.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--rotate-dns",
        "--recursive", "--level=inf",
        "--no-parent",
        "--no-cookies",
        "--page-requisites",
        "--timeout", "30",
        "--tries", "inf",
        "--domains", "yuku.com",
        "--span-hosts",
        "--waitretry", "30",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "yuku-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("yuku-user: %(item_name)s"),
    ]

    full_name = item['item_name']
    assert ':' in full_name
    # Example item: yuku:10threads:deltasforest29697:17
    _prefix, item_type, item_value, item_thread = full_name.split(':', 3)

    item['item_type'] = item_type
    item['item_value'] = item_value
    item['item_thread'] = item_thread

    assert item_type in ('thread', '10threads')

    if item_type == 'thread':
        thread_suffixes = None
    elif item_type == '10threads':
        thread_suffixes = string.digits
    else:
        raise Exception('Unknown item')

    if thread_suffixes is None:
        wget_args.append('http://%s.yuku.com/topic/%s/'%(item_value, item_thread))
    else:
        for digit in thread_suffixes:
            wget_args.append('http://%s.yuku.com/topic/%s%s/'%(item_value, item_thread, digit))

    if 'bind_address' in globals():
        address = globals()['bind_address']
        wget_args.extend(['--bind-address', address])
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:yuku-grab,代码行数:56,代码来源:pipeline.py
示例17: realize
def realize(self, item):
    """Build the wpull argument list for an FTP item.

    ``item['item_name']`` must have the form ``<sort>:<item>:<file>``.  The
    ``<item>`` part is stored on the item as ``item_item``.  The URL list
    is downloaded from ``archive.org/download/<item>/<file>``; it must
    contain an ``ITEM_TOTAL_SIZE: <bytes>`` line that does not exceed
    ``MAX_SIZE``.  Every line starting with ``ftp://`` is cleaned up and
    queued; other lines are ignored.

    Returns the result of ``realize(wget_args, item)``.

    Raises:
        AssertionError: if the item name contains no ``:`` (when assertions
            are enabled).
        ValueError: if the item name does not split into three parts.
        AttributeError: if the list has no ``ITEM_TOTAL_SIZE`` line.
        Exception: if the list download does not return status 200, the
            item is larger than ``MAX_SIZE``, or a URL contains ``#``.
    """
    wget_args = [
        WPULL_EXE,
        "-nv",
        "--python-script", "ftp.py",
        "-o", ItemInterpolation("%(item_dir)s/wpull.log"),
        "--no-check-certificate",
        "--database", ItemInterpolation("%(item_dir)s/wpull.db"),
        "--delete-after",
        "--no-robots",
        "--no-cookies",
        "--rotate-dns",
        "--timeout", "60",
        "--tries", "inf",
        "--wait", "0.5",
        "--random-wait",
        "--waitretry", "5",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "ftp-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("ftp-user: %(item_name)s"),
    ]

    item_name = item['item_name']
    assert ':' in item_name
    item_sort, item_item, item_file = item_name.split(':', 2)

    item['item_item'] = item_item

    MAX_SIZE = 10737418240  # 10 * 1024**3 bytes

    # Build the list URL once so the error message reports the URL that was
    # actually requested (previously it showed an https:// variant).
    list_url = 'http://archive.org/download/{0}/{1}'.format(item_item, item_file)
    item_list = requests.get(list_url)

    if item_list.status_code != 200:
        raise Exception('You received status code %d with URL %s'%(item_list.status_code, list_url))

    itemsize = int(re.search(r'ITEM_TOTAL_SIZE: ([0-9]+)', item_list.text).group(1))
    if itemsize > MAX_SIZE:
        raise Exception('Item is %d bytes. This is larger then %d bytes.'%(itemsize, MAX_SIZE))

    for url in item_list.text.splitlines():
        if url.startswith('ftp://'):
            # This previously replaced '&' with itself, a no-op; decoding
            # the HTML entity is the assumed intent -- TODO confirm against
            # the upstream script.
            url = url.replace(' ', '%20').replace('&amp;', '&')
            url = urllib.unquote(url)
            if item_item == 'archiveteam_ftp_items_2015120102':
                # This one list carries a duplicated "downloads/" segment.
                url = url.replace('ftp://ftp.research.microsoft.com/downloads/downloads/', 'ftp://ftp.research.microsoft.com/downloads/')
            if '#' in url:
                raise Exception('%s containes a bad character.'%(url))
            else:
                wget_args.append("{0}".format(url))

    if 'bind_address' in globals():
        wget_args.extend(['--bind-address', globals()['bind_address']])
        print('')
        print('*** Wget will bind address at {0} ***'.format(
            globals()['bind_address']))
        print('')

    return realize(wget_args, item)
开发者ID:tobbez,项目名称:ftp-grab,代码行数:56,代码来源:pipeline.py
示例18: realize
def realize(self, item):
    """Build the Wget-Lua argument list for a Canvas item.

    ``item['item_name']`` is split at the first ``:`` into a type (``user``
    or ``homepage``) and a value, both stored back on the item.  A ``user``
    item queues the user's page and turns on unlimited recursion; a
    ``homepage`` item queues only the site root.

    Returns the result of ``realize(wget_args, item)``.
    """
    wget_args = [
        WGET_LUA,
        "-U", USER_AGENT,
        "-nv",
        "--lua-script", "canvas.lua",
        "-o", ItemInterpolation("%(item_dir)s/wget.log"),
        "--no-check-certificate",
        "--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
        "--truncate-output",
        "-e", "robots=off",
        "--no-cookies",
        "--rotate-dns",
        # Recursion is not enabled here; it is appended below for user
        # items only.
        "--no-parent",
        "--page-requisites",
        "--timeout", "60",
        "--tries", "inf",
        "--span-hosts",
        "--waitretry", "3600",
        "--domains", "canv.as,drawquest-export.s3-website-us-east-1.amazonaws.com",
        "--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
        "--warc-header", "operator: Archive Team",
        "--warc-header", "canvas-archive-dld-script-version: " + VERSION,
        "--warc-header", ItemInterpolation("canvas-user: %(item_name)s"),
        "--header", "Host: drawquest-export.s3-website-us-east-1.amazonaws.com",
    ]

    full_name = item['item_name']
    item_type, item_value = full_name.split(':', 1)

    item['item_type'] = item_type
    item['item_value'] = item_value

    assert item_type in ('user', 'homepage')

    if item_type == 'user':
        extra_args = [
            'http://canv.as/{0}/'.format(item_value),
            "--recursive", "--level=inf",
        ]
    elif item_type == 'homepage':
        extra_args = ['http://canv.as/']
    else:
        raise Exception('Unknown item')
    wget_args.extend(extra_args)

    if 'bind_address' in globals():
        address = globals()['bind_address']
        wget_args.extend(['--bind-address', address])
        print('')
        print('*** Wget will bind address at {0} ***'.format(address))
        print('')

    return realize(wget_args, item)
开发者ID:ArchiveTeam,项目名称:canvas-archive-grab,代码行数:55,代码来源:pipeline.py
示例19: realize
def realize(self, item):
wget_args = [
WGET_LUA,
"-U", USER_AGENT,
"-nv",
"--lua-script", "yahoomaps.lua",
"-o", ItemInterpolation("%(item_dir)s/wget.log"),
"--no-check-certificate",
"--output-document", ItemInterpolation("%(item_dir)s/wget.tmp"),
"--truncate-output",
"-e", "robots=off",
"--rotate-dns",
# "--recursive", "--level=inf",
"--no-parent",
# "--page-requisites",
"--timeout", "30",
"--tries", "inf",
"--domains", "yahoo.com,here.com",
"--span-hosts",
"--waitretry", "30",
"--warc-file", ItemInterpolation("%(item_dir)s/%(warc_file_base)s"),
"--warc-header", "operator: Archive Team",
"--warc-header", "sour
|
请发表评论