Picture_Gatherer

基础操作,适合初学者使用;尚未解决被网站封禁(ban)的问题。

import requests
from bs4 import BeautifulSoup
import os

# Crawl list pages 2-3 of mzitu.com and, for every gallery linked from each
# list page, download the first 10 image pages into test22/<gallery title>/.
# NOTE: the site rejects bare clients, so every request carries a
# browser-like User-Agent and a same-site Referer (the original only sent a
# Referer on the image download itself, which is why it kept getting banned).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Referer': 'http://www.mzitu.com/',
}

for i in range(2, 4):
    # One list page holds a grid of gallery links inside div.postlist.
    list_url = 'http://www.mzitu.com/page/' + str(i)
    list_source = requests.get(list_url, headers=HEADERS).text
    list_soup = BeautifulSoup(list_source, 'lxml')

    page_urls = list_soup.find('div', {'class': 'postlist'}).find_all('li')

    for item in page_urls:
        gallery_url = item.find('a')['href']
        gallery_source = requests.get(gallery_url, headers=HEADERS).text
        gallery_soup = BeautifulSoup(gallery_source, 'lxml')

        # The <title> looks like "<gallery name>: <site suffix>" — keep the
        # part before the first colon as the folder name.
        title = gallery_soup.find('title').text.split(':')[0]
        print(title)
        # makedirs(exist_ok=True) creates the missing 'test22' parent and
        # does not crash when the script is re-run (os.mkdir failed on both).
        os.makedirs(os.path.join('test22', title), exist_ok=True)

        # Each gallery paginates its images as <gallery_url>/1 .. /N.
        for num in range(1, 11):
            page_url = gallery_url + '/' + str(num)
            page_source = requests.get(page_url, headers=HEADERS).text
            page_soup = BeautifulSoup(page_source, 'lxml')
            imgs = page_soup.find('div', {'class': 'main-image'}).find_all('img')

            for img in imgs:
                img_bytes = requests.get(img['src'], headers=HEADERS).content
                # NOTE(review): if a page ever held more than one <img>,
                # later ones would overwrite '<num>.jpg' — same as original.
                with open(os.path.join('test22', title, str(num) + '.jpg'), 'wb') as f:
                    f.write(img_bytes)

致谢:不二小段
https://www.bilibili.com/video/av18248096

Tags:

Add a Comment

Your email address will not be published. Required fields are marked *