# python scrapy img
#
# Original article: python scrapy img
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import os
import time


url="https://www.tooopen.com/search/logo.aspx?cate=0&type=0&sort=1&level=0&color=0&copyright=0&page=1"
ref="https://www.tooopen.com/search/logo.aspx"
# HTTP request headers
referer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': ref
               }

# The Referer header above defeats the site's hotlink protection

# Directory where downloaded images are saved
path = './data/'

# Resume-record file, holds a single "<page>|<line>" entry
data = './dl/.data'

# Read the saved resume record
def get_log(file):
    """Read the resume record from *file*.

    The record is a single line of the form "<page>|<line>".

    Returns:
        (page, line) tuple of ints; (1, 0) — i.e. start from the
        beginning — when the file is missing or malformed.
    """
    page = 1
    line = 0
    try:
        with open(file, 'r') as f:
            # Tuple assignment is atomic: on a parse failure both
            # defaults survive untouched.
            page, line = [int(i) for i in f.readline().split('|')]
    except (OSError, ValueError) as e:
        # Best-effort: a missing/corrupt record just restarts the crawl.
        print(e)
        print('读取记录失败,从初始开始')
    return page, line


# Save the resume record
def put_log(file, page, line):
    """Persist the resume record "<page>|<line>" to *file*.

    Creates the parent directory if needed.  Failures are reported but
    never raised, so a logging problem cannot abort the crawl.
    """
    dirname = os.path.dirname(file)
    # makedirs handles nested paths (os.mkdir would fail on them) and
    # exist_ok makes the call idempotent.
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    try:
        with open(file, "w") as f:
            f.write('{}|{}'.format(page, line))
    except OSError as e:
        print('保存记录失败:[{}]'.format(e))


# Find the maximum page number
def find_max_page(url):
    """Return the highest page number shown in the site's pagination bar."""
    response = requests.get(url, headers=referer)
    dom = BeautifulSoup(response.text, "html.parser")
    # The second-to-last <a> in the pager carries the last page number.
    pager_links = dom.find('div', class_='page-nav').find_all('a')
    return int(pager_links[-2].text)
def mkdir(path):
    """Ensure the parent directory of *path* exists.

    *path* is a file path; only its directory part is created.
    os.makedirs creates intermediate directories too (os.mkdir fails on
    a nested path) and exist_ok=True makes the call race-free and
    idempotent.  An empty dirname (bare filename) is a no-op.
    """
    dirname, filename = os.path.split(path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

def write(path, content):
    """Write *content* (bytes) to *path*, creating parent dirs as needed.

    Uses os.makedirs so nested directories work (the original os.mkdir
    raised FileNotFoundError for them) and exist_ok avoids a race when
    the directory already exists.
    """
    dirname, filename = os.path.split(path)
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'wb') as f:
        f.write(content)

if __name__ == "__main__":
    # BUG FIX: `url` already ends in "page=1", so the old
    # `same_url + str(n)` produced "...page=12", "...page=13", ... —
    # never the intended pages.  Strip the trailing page number and
    # append the real one instead.
    base_url = url.rsplit('=', 1)[0] + '='
    max_page = find_max_page(url)
    page, line = get_log(data)
    print('从{}页,{}行开始缓存'.format(page, line))
    for n in range(page, max_page + 1):
        page_url = base_url + str(n)
        start_html = requests.get(page_url, headers=referer)
        soup = BeautifulSoup(start_html.text, "html.parser")

        # Renamed from `all`, which shadowed the builtin.
        anchors = soup.find_all('a', class_='pic', target='_blank')
        # Start mid-list only when resuming from a saved record.
        for idx in range(line, len(anchors)):
            pic_url = anchors[idx].find('img').get("src")
            print(pic_url)
            img_resp = requests.get(pic_url, headers=referer)
            file_name = pic_url.split('/')[-1]
            # Use the module-level save directory for consistency.
            write(os.path.join(path, file_name), img_resp.content)
            put_log(data, n, idx)

        print('第', n, '页完成')
        line = 0  # subsequent pages always start at the first image
        time.sleep(1)  # be polite to the server