from bs4 import BeautifulSoup
import requests
import os
from multiprocessing import Pool
from urllib.request import urlretrieve
total_num = 3  # number of listing pages to walk (pages 2..total_num)
def url_open(url):
    web_data = requests.get(url)
    html = BeautifulSoup(web_data.text, 'lxml')
    return html
def get_all_cover_url(url):  # collect every gallery's cover URL
    url_list = []
    for num in range(2, total_num + 1):  # each listing page's URL
        total_url = url + 'page/{}'.format(num)
        html = url_open(total_url)
        url_list.extend(link.attrs['href'] for link in html.select('#pins > li > a'))
    # print(url_list)
    return url_list
#get_all_cover_url(start_url)
def from_cover_get_url_list(url):
    html = url_open(url)
    # pagination spans look like ['Prev', '2', '3', '4', '31', 'Next'];
    # the second-to-last entry is the number of images in the gallery
    img_num = int([i.get_text() for i in html.select('div.pagenavi > a > span')][-2])
    response = html.select('body > div.main > div.content > div.main-image > p > a > img')
    s_ = response[0].get('src')  # address of the first image
    img_url_list = [s_[:-6] + '0' + str(num) + s_[-4:]
                    if num < 10
                    else s_[:-6] + str(num) + s_[-4:]
                    for num in range(1, img_num + 1)]
    return img_url_list
    # print(img_url_list)
# from_cover_get_url_list(url='http://www.mzitu.com/84934')
def download(start_url='http://www.mzitu.com/', folder='妹子图'):
    try:
        os.mkdir(folder)
        os.chdir(folder)
    except FileExistsError:
        os.chdir(folder)
    all_cover_url = get_all_cover_url(start_url)  # links to every gallery
    for cover_url in all_cover_url:
        img_url_list = from_cover_get_url_list(cover_url)
        for img_url in img_url_list:
            # e.g. 'http://i.meizitu.net/2017/02/18a01.jpg'
            # keep only the basename; saving under the date path ('2017/02/...')
            # points at directories that do not exist and raises
            # FileNotFoundError: [Errno 2] No such file or directory
            filename = img_url.split('/')[-1]
            urlretrieve(img_url, filename)
            print(img_url)
if __name__ == '__main__':
    # pool = Pool()
    download()
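# A minimal sketch (an assumption, not used above) of how the otherwise-unused
# Pool import could parallelize the downloads: factor the per-gallery work into
# a hypothetical worker such as download_gallery(cover_url), then map the
# gallery URLs across worker processes.
#
#     def download_gallery(cover_url):
#         # fetch every image of one gallery into the current directory
#         for img_url in from_cover_get_url_list(cover_url):
#             urlretrieve(img_url, img_url.split('/')[-1])
#
#     with Pool(4) as pool:
#         pool.map(download_gallery, get_all_cover_url('http://www.mzitu.com/'))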