Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Welcome To Ask or Share your Answers For Others

Categories

0 votes
645 views
in Technique[技术] by (71.8m points)

这个爬虫不停报错,我一共写了两个爬虫,大家帮忙看一下

from bs4 import BeautifulSoup
import requests
import os
from multiprocessing import Pool
from urllib.request import urlretrieve

total_num = 3
def url_open(url):
    web_data = requests.get(url)
    html = BeautifulSoup(web_data.text, 'lxml')
    return html

def get_all_cover_url(url):#获得所有封面url
    for num in range(2,total_num+1):#每个页面的url
        total_url = url + 'page/{}'.format(num)
        html = url_open(total_url)
        url_list = [link.attrs['href'] for link in html.select('#pins > li > a')]
        #print(url_list)
        return url_list

#get_all_cover_url(start_url)

def from_cover_get_url_list(url):
    html = url_open(url)
    img_num = int([i.get_text() for i in html.select('div.pagenavi > a > span')][-2])#['?上一组', '2', '3', '4', '31', '下一页?']
    response = html.select('body > div.main > div.content > div.main-image > p > a > img')
    s_ = response[0].get('src')#第一个图片地址
    img_url_list = [s_[:-6] + '0' + str(num) + s_[-4:]
                    if num<10
                    else s_[:-6] + str(num) + s_[-4:]
                    for num in range(1,img_num+1)]
    return img_url_list
    #print(img_url_list)
#get_img_url_list(url='http://www.mzitu.com/84934')

def download(start_url='http://www.mzitu.com/',folder='妹子图'):
    try:
        os.mkdir(folder)
        os.chdir(folder)
    except:
        os.chdir(folder)
        all_cover_url = get_all_cover_url(start_url)#获得所有系列的链接
        for cover_url in all_cover_url:
            img_url_list = from_cover_get_url_list(cover_url)
            for img_url in img_url_list:
                #'http://i.meizitu.net/2017/02/18a01.jpg'
                filename = img_url[21:]
                urlretrieve(img_url,filename)
                print(img_url)

#FileNotFoundError: [Errno 2] No such file or directory: '2017/02/18a01.jpg'报错

if __name__ == '__main__':
    #pool = Pool()
    download()

与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
Welcome To Ask or Share your Answers For Others

1 Reply

0 votes
by (71.8m points)

好吧,很抱歉,我给你的评论是有问题的。我尝试的跑了你的代码。并且修改了一下当作测试的。如下代码是可以通过的。

filename = 'image_test.jpg' #img_url[21:]

图片描述


与恶龙缠斗过久,自身亦成为恶龙;凝视深渊过久,深渊将回以凝视…
OGeek|极客中国-欢迎来到极客的世界,一个免费开放的程序员编程交流平台!开放,进步,分享!让技术改变生活,让极客改变未来! Welcome to OGeek Q&A Community for programmer and developer-Open, Learning and Share
Click Here to Ask a Question

...