uinotes爬虫
uinotes爬虫
爬这个东西,scrapy扒库+url指定拿取图片App库 - UI Notes app->缩略图->单个大图
现在使用者想一次性获得此APP的大图不用一个一个点
其实用不到框架,但是自己练手把扒站也写了吧
普通爬虫
需求输入url 下载此app对应的所有大图
分析请求
首页会重定向;图片是从阿里云上海的 CDN 获取的,具体的 URL 格式暂时没看到在哪,继续往下看。
往下翻页明显是翻页式的,而且是简单的翻页,没带scroll cursor,翻了几十页,没什么问题
看请求头,有自定义的header,但是我并不想弄明白它的header是在干什么,cookie也是直接复制的
注意:详情页、获取原图、首页这三类请求的 header 并不一样,需要分别定义三个;其中哪些字段是必须的,我没试出来。
它的原图存在阿里云的 CDN 上,把 json 里的路径和 CDN 域名直接拼接成 URL 就行。点击查看原图时会请求 alicdn;通过在不同页面保存图片观察变化,发现分辨率是由 URL 后面的参数决定的。
反爬机制
爬取的过程中时不时会出现 requests.exceptions.ProxyError,也时不时会出现 Connection reset by peer,也就是不定时断线。
用 while True 给它重试一下就行了,每次失败后等待 10s 再重试。
调试过程中,几个小时内请求了约 3000 张图片,IP 就被封禁了,一个多小时后才解封;封禁期间网页是无法访问的。
代码实现
使用pickle进行本地的持久化,可以断点续爬
不多说
Click to see more
# -*-coding:utf-8-*-
# SettingCode here
__author__ = "a_little_rubbish"
__date__ = "2023/2/27 09:13"
# import your model here
import os.path
import random
import time
from typing import List
import requests
import pickle
from fastapi import FastAPI
# your class&function here
# Set of image URLs that have already been downloaded. save() consults it to
# skip duplicates, write_dumplicate() pickles it to url.pkl, and main() reads
# that file back at start-up.
urls = set()
# Request headers sent by spider() to the uinotes.com app-detail API.
# spider() overwrites 'Referer' with the URL of the app being crawled.
# NOTE(review): the Cookie and If-None-Match values look like they were copied
# from one browser session — confirm whether the API actually requires them.
HEADER = {
'Host': 'uinotes.com',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Accept-Encoding': 'gzip, deflate, br',
'Connection': 'keep-alive',
'Cookie': 'Hm_lvt_24d5bc972e9b9e9b2ba031648f98eceb=1677205860; Hm_lpvt_24d5bc972e9b9e9b2ba031648f98eceb=1677460645; __vtins__JrOcbAvQF047xVR4=%7B%22sid%22%3A%20%22416be1c5-321f-5983-88d5-84f67c028362%22%2C%20%22vd%22%3A%205%2C%20%22stt%22%3A%20891664%2C%20%22dr%22%3A%2020210%2C%20%22expires%22%3A%201677462444542%2C%20%22ct%22%3A%201677460644542%7D; __51uvsct__JrOcbAvQF047xVR4=3; __51vcke__JrOcbAvQF047xVR4=6d6af076-7332-536e-b956-d0e63e9a69cf; __51vuft__JrOcbAvQF047xVR4=1677205861743',
'Upgrade-Insecure-Requests': '1',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Referer': 'https://uinotes.com/app/146907778227438046',
'TE': 'trailers',
'If-None-Match': 'W/"1125-cxIIC4CA1SUrDsKKenx3oU7m8k4"'
}
# Request headers sent by save() when downloading an image from the
# uiwiki4.oss-cn-shanghai.aliyuncs.com storage host.
detail_header = {'Accept': 'image/avif,image/webp,*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Host': 'uiwiki4.oss-cn-shanghai.aliyuncs.com',
'Referer': 'https://uinotes.com/',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0'
}
# A third header set for uinotes.com page requests.
# NOTE(review): nothing in the visible code references HEADER_TITLE —
# presumably kept for the site's landing page; confirm before removing.
HEADER_TITLE = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'Connection': 'keep-alive',
'Cookie': 'Hm_lvt_24d5bc972e9b9e9b2ba031648f98eceb=1677205860; Hm_lpvt_24d5bc972e9b9e9b2ba031648f98eceb=1677467926; __vtins__JrOcbAvQF047xVR4=%7B%22sid%22%3A%20%22826916f9-3ae7-510d-8060-bf72a4852a62%22%2C%20%22vd%22%3A%2013%2C%20%22stt%22%3A%203593758%2C%20%22dr%22%3A%20791782%2C%20%22expires%22%3A%201677469725867%2C%20%22ct%22%3A%201677467925867%7D; __51uvsct__JrOcbAvQF047xVR4=4; __51vcke__JrOcbAvQF047xVR4=6d6af076-7332-536e-b956-d0e63e9a69cf; __51vuft__JrOcbAvQF047xVR4=1677205861743',
'Host': 'uinotes.com',
'If-None-Match': 'W/"105e-x8xiTTLRwcejqeYiMrKrWBDTS0I"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'cross-site',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/110.0'
}
def write_dumplicate():
    """Persist the module-level ``urls`` set to ``url.pkl``.

    The set is pickled into a temporary file first and then moved over
    ``url.pkl`` with ``os.replace``. If the process is killed while the
    checkpoint is being written, the previous ``url.pkl`` is therefore left
    intact instead of being truncated.
    """
    tmp_path = 'url.pkl.tmp'
    with open(tmp_path, 'wb') as f:
        pickle.dump(urls, f)  # serialize the set of downloaded image URLs
    # Swap the finished file into place in a single step.
    os.replace(tmp_path, 'url.pkl')
def save(res_list, appname):
    """Download the images described by *res_list* into the *appname* directory.

    Each element of *res_list* is a dict read through its ``'url'`` key (the
    image path appended to the storage host) and its ``'imageName'`` key (used
    as the file-name prefix). URLs already present in the module-level ``urls``
    set are skipped.

    Args:
        res_list: iterable of image dicts as returned by the app-detail API.
        appname: name of an existing directory to write the images into.

    Returns:
        The string ``"break"`` when a connection problem interrupts the
        download, so the caller can pause and retry the page; ``None`` once
        every element has been processed.
    """
    for source in res_list:
        url = "https://uiwiki4.oss-cn-shanghai.aliyuncs.com{image_uid}?x-oss-process=style/title".format(
            image_uid=source['url'])
        if url in urls:
            print("此url已存在,跳过")
            continue
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            res = requests.get(url, headers=detail_header, timeout=30)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # ConnectionError is the parent of ProxyError and also covers
            # "connection reset by peer".
            print("连接错误,停止10s")
            return "break"
        if not res.ok:
            # Do not store an HTTP error body as if it were an image; the URL
            # stays out of ``urls`` so a later run tries it again.
            print("下载失败,状态码:", res.status_code, url)
            continue
        imgsuffix = source['url'].split("/")[-1]
        filename = source['imageName'] + '-' + imgsuffix
        print("获取图片:", filename)
        with open(os.path.join(appname, filename), mode='wb') as f:
            f.write(res.content)
        # Record the URL only after the file is on disk, so a failed write is
        # not mistaken for a finished download.
        urls.add(url)
def spider(appurl):
    """Crawl every result page of one app and download all of its images.

    The app id is the last path segment of *appurl*. Pages are requested from
    the app-detail API starting at page 1 until the server answers with HTTP
    204 or an empty list. After each page the ``urls`` checkpoint is written
    with ``write_dumplicate()``. When a request (or an image download inside
    ``save()``) hits a connection problem, the crawl sleeps for 10 seconds and
    retries the same page.

    Args:
        appurl: URL of the app page, e.g. ``https://uinotes.com/app/<id>``.
    """
    # rstrip so that a trailing slash does not produce an empty app id.
    appid = appurl.rstrip('/').split('/')[-1]
    print("APP{appid}开始".format(appid=appid))
    # The Referer is the same for every page of this app, so set it once.
    HEADER['Referer'] = appurl
    page = 1
    while True:
        try:
            res = requests.get(
                "https://uinotes.com/uinotes-api/app-detail?uuid={appid}&page={page}".format(appid=appid, page=page),
                headers=HEADER, timeout=30)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            # ConnectionError is the parent of ProxyError and also covers
            # "connection reset by peer"; ``page`` is unchanged, so the same
            # page is requested again after the pause.
            print("获取页面错误,暂停10s")
            write_dumplicate()
            time.sleep(10)
            continue
        if res.status_code == 204:
            print("此APP{appid}结束".format(appid=appid))
            break
        items = res.json()
        # Random pause of up to 1.5 s between page requests.
        time.sleep((random.randint(0, 1500)) / 1000)
        print(f"获取page {page} : ", items)
        if not items:
            # An empty page has no first element to read the app name from;
            # treat it as the end of the listing.
            print("此APP{appid}结束".format(appid=appid))
            break
        appname = items[0]["appName"]
        if not os.path.exists(appname):
            os.makedirs(appname)
            print("创建文件夹", appname)
        if save(items, appname) == "break":
            # Download interrupted: checkpoint, pause, then redo this page.
            # Images that were already fetched are skipped via ``urls``.
            write_dumplicate()
            time.sleep(10)
            continue
        write_dumplicate()
        page += 1
def main(SOURCELIST):
    """Restore the download checkpoint, then crawl every app URL in turn.

    If ``url.pkl`` does not exist yet it is created from the current ``urls``
    set. The stored URLs are then merged into the module-level ``urls`` set,
    which is the object ``save()`` consults to skip finished downloads.
    Assigning the loaded set to a local name would leave that module-level set
    untouched and resuming would silently not work.

    Args:
        SOURCELIST: iterable of app page URLs, each passed to ``spider()``.
    """
    if not os.path.exists('url.pkl'):
        write_dumplicate()
    with open('url.pkl', 'rb') as f:
        # NOTE: pickle.load can execute arbitrary code from the file. This is
        # acceptable only because url.pkl is written by this program itself;
        # never point it at a file from an untrusted source.
        previous = pickle.load(f)
    # Mutate the shared set in place instead of rebinding a local variable.
    urls.update(previous)
    print("上次爬过:", urls)
    for i in SOURCELIST:
        spider(i)
# ASGI application object; the run command in this file serves it with
# uvicorn as ``main:app``.
app = FastAPI()


# POST / takes a JSON array of app page URLs and hands it to main().
# The crawl runs inside the request, so the reply is sent only after main()
# has returned. Documented with comments rather than a docstring so the
# generated API schema stays unchanged.
@app.post("/")
def root(app_url_list: List[str]):
    main(app_url_list)
    reply = {"message": "OK"}
    return reply
# nohup python -m uvicorn main:app --reload > runtime.log 2>&1 &
运行结果
/opt/anaconda3/envs/spiders/bin/python -m uvicorn main:app --reload
INFO: Will watch for changes in these directories: ['/Users/jack/code/py/uinotespider']
INFO: Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
INFO: Started reloader process [70977] using StatReload
INFO: Started server process [70979]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: 127.0.0.1:50483 - "GET /docs HTTP/1.1" 200 OK
INFO: 127.0.0.1:50483 - "GET /openapi.json HTTP/1.1" 200 OK
上次爬过: set()
APPstring开始
此APPstring结束
INFO: 127.0.0.1:50488 - "POST / HTTP/1.1" 200 OK
上次爬过: set()
APP012779878697506993开始
此APP012779878697506993结束
APP146907778227438046开始
获取page 1 : [{'uuid': '287632322506322094', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYMTT7DJ.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-001', 'order': '001'}, {'uuid': '587239999955940464', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYN3UWOG.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-002', 'order': '002'}, {'uuid': '322259292774089886', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYN2O6BF.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-003', 'order': '003'}, {'uuid': '556218580832726485', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYN0OEUU.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-004', 'order': '004'}, {'uuid': '395426589330760169', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNCXUVF.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-005', 'order': '005'}, {'uuid': '802571492955637764', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNJ43GD.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-006', 'order': '006'}, {'uuid': '995953259139436499', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52U204AH.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-007', 'order': '007'}, {'uuid': '100254973566281366', 'height': 520, 'width': 1170, 'url': 
'/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52U2PD5Q.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-008', 'order': '008'}, {'uuid': '416666334451018289', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52U7Y5AA.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-009', 'order': '009'}, {'uuid': '760622668671455588', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52U2FOII.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-010', 'order': '010'}, {'uuid': '218217284774724377', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52UGTPG9.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-011', 'order': '011'}, {'uuid': '426021415318981234', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52UB1A94.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-012', 'order': '012'}, {'uuid': '217555576664546594', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52U05OCC.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-013', 'order': '013'}, {'uuid': '068139017852427542', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52UVA59Z.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-014', 'order': '014'}, {'uuid': '074867591771388938', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52U0OAYL.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 
'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-015', 'order': '015'}, {'uuid': '238891650394776034', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52UFORZM.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-016', 'order': '016'}, {'uuid': '936839413021684790', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52UC1EL4.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-017', 'order': '017'}, {'uuid': '631014334916994354', 'height': 520, 'width': 1170, 'url': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEICQ52UUKL9Q.png', 'logoUrl': '/KWBJUPFRH7692/LEE2SY63EOTLS/LEE3VRYNI2SX0.png', 'app': '146907778227438046', 'appName': '京东金融', 'imageName': '京东金融-018', 'order': '018'}]
创建文件夹 京东金融
获取图片: 京东金融-001-LEE3VRYMTT7DJ.png
获取图片: 京东金融-002-LEE3VRYN3UWOG.png
获取图片: 京东金融-003-LEE3VRYN2O6BF.png
获取图片: 京东金融-004-LEE3VRYN0OEUU.png
获取图片: 京东金融-005-LEE3VRYNCXUVF.png
INFO: Shutting down
INFO: Waiting for connections to close. (CTRL+C to force quit)
Process finished with exit code 137 (interrupted by signal 9: SIGKILL)
Scrapy扒站
按照app分下载所有的大图,这个等以后整理爬虫大块的时候一块整理
Loading...