# Python image scraper ("python scrapy img")
# Original article link: python scrapy img
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import os
import time
# Search URL; ends with "page=" so the page number can be appended by main.
# Fixes two defects in the original URL:
#   - "©right=0" was mojibake of "&copyright=0" (the "&copy" prefix had been
#     decoded as the HTML entity for the copyright sign)
#   - the trailing "page=1" made `url + str(n)` yield "page=11", "page=12", ...
url = "https://www.tooopen.com/search/logo.aspx?cate=0&type=0&sort=1&level=0&color=0&copyright=0&page="
ref = "https://www.tooopen.com/search/logo.aspx"
# HTTP request headers; the Referer header defeats the site's hotlink check.
referer = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
    'Referer': ref
}
# Directory where downloaded images are stored
path = './data/'
# Progress-record file, holding "page|line" of the last saved image
data = './dl/.data'
# Read the saved progress record
def get_log(file):
    """Read the resume point from *file*.

    The file holds a single "page|line" record.  Returns ``(1, 0)``
    (i.e. start from the very beginning) when the file is missing or
    its content cannot be parsed.
    """
    page = 1
    line = 0
    try:
        with open(file, 'r') as f:
            record = f.readline()
            page, line = [int(i) for i in record.split('|')]
    # Narrowed from a bare `except Exception`: only a missing/unreadable
    # file (OSError) or malformed content (ValueError) means "start over".
    except (OSError, ValueError) as e:
        print(e)
        print('读取记录失败,从初始开始')
    return page, line
# Save the progress record
def put_log(file, page, line):
    """Persist the resume point as "page|line" into *file*.

    Creates the parent directory when needed.  Failures are reported
    on stdout but never raised (best-effort, matching the original).
    """
    try:
        # makedirs + exist_ok (vs the original exists-check + mkdir helper)
        # also handles nested, not-yet-existing parents and is race-free.
        os.makedirs(os.path.dirname(file) or '.', exist_ok=True)
        with open(file, "w") as f:
            f.write('{}|{}'.format(page, line))
    except Exception as e:
        print('保存记录失败:[{}]'.format(e))
# Find the maximum page number
def find_max_page(url):
    """Return the highest page number shown in the site's pagination bar."""
    response = requests.get(url, headers=referer)
    soup = BeautifulSoup(response.text, "html.parser")
    anchors = soup.find('div', class_='page-nav').find_all('a')
    # anchors[-2] is assumed to hold the last page number (the final <a>
    # presumably being a "next" link) — verify against the site's markup.
    return int(anchors[-2].text)
def mkdir(path):
    """Ensure the parent directory of *path* (a file path) exists."""
    dirname, filename = os.path.split(path)
    # makedirs + exist_ok fixes two defects of the original
    # `if not exists: os.mkdir`: it creates nested parents and avoids
    # the check-then-create race when another process creates the dir.
    if dirname:
        os.makedirs(dirname, exist_ok=True)
def write(path, content):
    """Write *content* (bytes) to *path*, creating the parent directory."""
    dirname, filename = os.path.split(path)
    # Same fix as mkdir(): handles nested parents, no TOCTOU race.
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'wb') as f:
        f.write(content)
if __name__ == "__main__":
    same_url = url
    max_page = find_max_page(url)
    # Resume from the position recorded by the previous run.
    page, line = get_log(data)
    print('从{}页,{}行开始缓存'.format(page, line))
    # find_max_page already returns int, so the redundant int() is gone.
    for n in range(page, max_page + 1):
        page_url = same_url + str(n)
        start_html = requests.get(page_url, headers=referer)
        soup = BeautifulSoup(start_html.text, "html.parser")
        # Renamed from `all`, which shadowed the builtin.
        links = soup.find_all('a', class_='pic', target='_blank')
        for idx in range(line, len(links)):
            pic_url = links[idx].find('img').get("src")
            print(pic_url)
            html = requests.get(pic_url, headers=referer)
            file_name = pic_url.split('/')[-1]
            write(os.path.join('./data', file_name), html.content)
            # Record progress after each image so a crash can resume here.
            put_log(data, n, idx)
        print('第', n, '页完成')
        line = 0  # only the first (resumed) page starts part-way through
        time.sleep(1)  # be polite to the server between pages