Pixiv Crawler Script

Pixiv should be no stranger to the veteran drivers out there. When you browse Pixiv and run into pictures you like, saving them one by one is a real pain. So how do we fix that? Bang-bang-ka-bang~~ the crawler makes its shining entrance! Below, I'll show how to crawl Pixiv images with a crawler.

After entering a search keyword and searching, we capture the following API endpoint in the browser's developer tools:
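A minimal probe of that endpoint, reconstructed from the request the full script makes below. The header values here are placeholders of my own (copy real ones from your browser), and per the notes at the end of the article, you may also need to send your login cookies:

import requests

# placeholder headers -- copy real values from your own browser session
headers = {
    "User-Agent": "Mozilla/5.0",
    "Referer": "https://www.pixiv.net/",
}
resp = requests.get(
    "https://www.pixiv.net/ajax/search/artworks/千恋万花",
    headers=headers,
    params={"word": "千恋万花", "mode": "safe", "p": "1", "s_mode": "s_tag"},
)
# the illustration ids of the search results live under body -> illustManga -> data
print([item["id"] for item in resp.json()["body"]["illustManga"]["data"]])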

Then we assemble the full script:

import os
import requests
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue
from tqdm import tqdm

# request headers, redacted in the original -- fill in your own browser headers
headers = {
    ...
}

s = requests.session()  # created while experimenting with session cookies; see the notes at the end

# login cookies, redacted -- required per the 5.1 note at the end
cookies = {
    ...
}

def create_directory(path):
    os.makedirs(path, exist_ok=True)
def geturl(q):
    url = "https://www.pixiv.net/ajax/search/artworks/千恋万花"
    # R18 works live under a different tag page, e.g.:
    # https://www.pixiv.net/tags/碧蓝航线/artworks?mode=r18&s_mode=s_tag
    params = {
        "word": "千恋万花",  # search keyword
        "order": "date_d",
        "mode": "safe",      # safe / r18
        "p": "1",            # page number
        "csw": "0",
        "s_mode": "s_tag",
        "type": "all",
        "lang": "zh",
        "version": "6c33903e9ee09f649515b0326775bf9913d930a1"
    }
    # per the 5.1 note at the end, the request must carry login cookies
    response = requests.get(url, headers=headers, params=params, cookies=cookies).json()
    # collect the illustration ids from the search results
    all_ids = [item['id'] for item in response['body']['illustManga']['data']]
    for illust_id in all_ids:
        url = f'https://www.pixiv.net/ajax/illust/{illust_id}/pages?lang=zh&version=6c33903e9ee09f649515b0326775bf9913d930a1'
        response = requests.get(url, headers=headers, cookies=cookies).json()
        # a work may have several pages; take the original-size url of each
        original_values = [item['urls']['original'] for item in response['body']]
        for oriurl in original_values:
            print(oriurl)
            q.put(oriurl)
    q.put('ok')  # sentinel: tells the consumer no more urls are coming


def downurl(url):
    # stream the response so we can show a live progress bar;
    # the headers must keep a pixiv.net Referer or the image server rejects the request
    response = requests.get(url, stream=True, headers=headers)
    total_size = int(response.headers.get('content-length', 0))
    filename = url.split('/')[-1]
    save_path = './千恋万花'
    save_file = os.path.join(save_path, filename)
    with open(save_file, 'wb') as file:
        with tqdm(total=total_size, unit='B', unit_scale=True) as progress_bar:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
                    progress_bar.update(len(chunk))
    print(f'{filename} download complete')



def load_url(q):
    # consumer: pull urls off the queue and fan the downloads out to a thread pool
    with ThreadPoolExecutor(max_workers=30) as executor:
        while True:
            url = q.get()
            if url == 'ok':  # sentinel from geturl
                break
            executor.submit(downurl, url)

if __name__ == '__main__':
    create_directory('./千恋万花')
    q = Queue()
    q1 = Process(target=geturl, args=(q,))    # producer: collects image urls
    q2 = Process(target=load_url, args=(q,))  # consumer: downloads them
    q1.start()
    q2.start()
    q1.join()  # wait for both processes so the script does not exit early
    q2.join()
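The script is a small producer-consumer pipeline: geturl runs in one process and pushes original-image URLs into a multiprocessing Queue, load_url runs in a second process and fans the downloads out across a 30-thread pool, and the literal string 'ok' serves as the end-of-stream sentinel that shuts the consumer down.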

#4.24: tried fetching the data without cookies and the crawl still worked; the headers seem to carry the verification as well. Also used a session to obtain cookie values, but did not end up using them.
#5.1: retracting the above -- crawling does still require cookies; carrying only the headers gets the page refused: {'error': True, 'message': '尚无此页', 'body': []}
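Given the 5.1 note, one way to carry the cookies is to attach them to the session object the script already creates. A minimal sketch, assuming you copy the cookie out of your logged-in browser's developer tools (the PHPSESSID name and every value here are placeholders of mine, not the author's originals):

import requests

s = requests.session()
# placeholder headers; copy real values from your own browser
s.headers.update({"User-Agent": "Mozilla/5.0", "Referer": "https://www.pixiv.net/"})
# hypothetical cookie name/value -- grab the real pair from your logged-in browser
s.cookies.update({"PHPSESSID": "your_session_cookie_here"})

resp = s.get(
    "https://www.pixiv.net/ajax/search/artworks/千恋万花",
    params={"word": "千恋万花", "mode": "safe", "p": "1"},
)
print(resp.json().get('error'))  # should be False once the cookie is accepted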

And with that, we're done!

Note that this tutorial implements only the basic functionality; plenty of features still need further polish...