Picture_Gatherer
March 11, 2018
Basic practice script written while learning; the problem of the scraper being banned by the website is not yet solved.
"""Image gallery scraper for mzitu.com.

Walks listing pages 2-3, visits every gallery linked from each listing
page, and downloads up to 10 images per gallery into
test22/<gallery title>/<page>.jpg.

NOTE(review): the site rejects clients with the default ``requests``
User-Agent (the "ban" problem noted at the top of this file); a
browser-like User-Agent plus a Referer is now sent with every request
to mitigate this.
"""
import os

import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.mzitu.com'
OUT_DIR = 'test22'
# Browser-like headers: the Referer is required for the image CDN, and
# the User-Agent avoids the site's bot filtering.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'http://www.mzitu.com/',
}


def _fetch_soup(session, url):
    """GET *url* and return the response body parsed with lxml."""
    resp = session.get(url, headers=HEADERS)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'lxml')


def _download_gallery(session, gallery_url):
    """Download the first 10 pages of one gallery into OUT_DIR/<title>/.

    Stops early when the gallery has fewer than 10 pages (the
    ``main-image`` div is absent past the last page).
    """
    soup = _fetch_soup(session, gallery_url)
    # Page titles look like "<gallery name>: ..."; keep only the name.
    title = soup.find('title').text.split(':')[0]
    print(title)
    # NOTE(review): title may contain characters invalid in file names
    # on some platforms — consider sanitizing before mkdir.
    target = os.path.join(OUT_DIR, title)
    os.makedirs(target, exist_ok=True)  # don't crash on re-runs

    for num in range(1, 11):
        page_soup = _fetch_soup(session, gallery_url + '/' + str(num))
        main_div = page_soup.find('div', {'class': 'main-image'})
        if main_div is None:
            break  # gallery shorter than 10 pages
        for img in main_div.find_all('img'):
            data = session.get(img['src'], headers=HEADERS).content
            with open(os.path.join(target, str(num) + '.jpg'), 'wb') as f:
                f.write(data)


def main():
    """Scrape listing pages 2-3 and download every gallery found."""
    # A single Session reuses TCP connections across the many requests.
    with requests.Session() as session:
        for page in range(2, 4):
            listing = _fetch_soup(session, BASE_URL + '/page/' + str(page))
            post_list = listing.find('div', {'class': 'postlist'})
            for item in post_list.find_all('li'):
                _download_gallery(session, item.find('a')['href'])


if __name__ == '__main__':
    main()