Pixiv Scraper Script Created 2024-04-29 Updated 2024-08-08
Pixiv should be no stranger to the seasoned veterans out there. When we come across images we like while browsing, saving them one by one is a real chore. So how do we solve this problem?
Bang-bang-ka-bang~~ Alice! The crawler makes its shining entrance. Below is how to scrape Pixiv images with a crawler.
After typing in a search keyword and running the search, the request we capture is the following endpoint: https://www.pixiv.net/ajax/search/artworks/千恋万花 (the keyword is part of the path).
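The endpoint returns plain JSON, and the illustration IDs sit under body -> illustManga -> data. A minimal probe might look like the sketch below; the headers here are placeholders for the ones copied from the captured request, the query parameters are undocumented Pixiv internals that may change, and per the changelog note at the end of this post a logged-in cookie usually has to be attached as well:

import requests

# Probe the captured search endpoint (a sketch, not the final script).
headers = {"User-Agent": "Mozilla/5.0", "Referer": "https://www.pixiv.net/"}
params = {"word": "千恋万花", "mode": "safe", "s_mode": "s_tag",
          "type": "all", "p": "1", "lang": "zh"}
resp = requests.get(
    "https://www.pixiv.net/ajax/search/artworks/千恋万花",
    headers=headers, params=params,
).json()

# Each entry carries the illustration id used by the detail endpoints.
for item in resp["body"]["illustManga"]["data"]:
    print(item["id"], item.get("title"))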
Then……
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue
from tqdm import tqdm

headers = { ... }  # request headers copied from the captured browser request
s = requests.session()  # created while experimenting with cookie fetching; unused (see the changelog below)
cookies = { ... }  # login cookies; required per the 5.1 changelog note

def create_directory(path):
    os.makedirs(path, exist_ok=True)

def geturl(q):
    # Producer: collect the original-image URLs and feed them into the queue.
    url = "https://www.pixiv.net/ajax/search/artworks/千恋万花"
    # R-18 search URL kept for reference (unused):
    r18url = "https://www.pixiv.net/tags/碧蓝航线/artworks?mode=r18&s_mode=s_tag"
    params = {
        "word": "千恋万花",
        "order": "date_d",
        "mode": "safe",
        "p": "1",
        "csw": "0",
        "s_mode": "s_tag",
        "type": "all",
        "lang": "zh",
        "version": "6c33903e9ee09f649515b0326775bf9913d930a1",
    }
    response = requests.get(url, headers=headers, params=params, cookies=cookies).json()
    picId = response["body"]["illustManga"]["data"]
    all_ids = [item["id"] for item in picId]
    for id in all_ids:
        # The pages endpoint lists every image belonging to one illustration.
        url = f"https://www.pixiv.net/ajax/illust/{id}/pages?lang=zh&version=6c33903e9ee09f649515b0326775bf9913d930a1"
        response = requests.get(url, headers=headers, cookies=cookies).json()
        original_values = [item["urls"]["original"] for item in response["body"]]
        for oriurl in original_values:
            print(oriurl)
            q.put(oriurl)
    q.put("ok")  # sentinel: tells the consumer there are no more URLs

def downurl(url):
    # Stream one image to disk with a per-file progress bar.
    response = requests.get(url, stream=True, headers=headers)
    total_size = int(response.headers.get("content-length", 0))
    filename = url.split("/")[-1]
    save_path = "./千恋万花"
    save_file = os.path.join(save_path, filename)
    with open(save_file, "wb") as file:
        with tqdm(total=total_size, unit="B", unit_scale=True) as progress_bar:
            for chunk in response.iter_content(chunk_size=1024):
                if chunk:
                    file.write(chunk)
                    progress_bar.update(len(chunk))
    print(f"{filename} downloaded")

def load_url(q):
    # Consumer: pull URLs off the queue and download them in a thread pool.
    with ThreadPoolExecutor(max_workers=30) as executor:
        while True:
            url = q.get()
            if url == "ok":  # sentinel from the producer
                break
            executor.submit(downurl, url)

if __name__ == "__main__":
    create_directory("./千恋万花")
    q = Queue()
    q1 = Process(target=geturl, args=(q,))
    q2 = Process(target=load_url, args=(q,))
    q1.start()
    q2.start()
    q1.join()
    q2.join()
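A couple of practical notes: requests and tqdm are third-party packages (pip install requests tqdm), and the image host i.pximg.net is generally known to reject downloads that lack a Referer header pointing back to https://www.pixiv.net/, so the headers dict should include one.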
#4.24 Tried fetching the data without the cookie and scraping still worked; the headers appear to carry the verification as well. Also grabbed cookie values via a session, though they went unused.
#5.1 Retracting the above: scraping does still require carrying the cookie. With the headers alone, page access is refused: {'error': True, 'message': '尚无此页', 'body': []} ('尚无此页' = "no such page yet").
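For reference, a minimal way to carry the login cookie is to copy it out of the browser's DevTools after logging in. The sketch below assumes Pixiv's session cookie is still named PHPSESSID; the value is a placeholder:

import requests

headers = {"User-Agent": "Mozilla/5.0", "Referer": "https://www.pixiv.net/"}
# Copy the value from DevTools (Application -> Cookies -> pixiv.net)
# after logging in. PHPSESSID is assumed to be the session cookie name.
cookies = {"PHPSESSID": "<your-session-cookie>"}

resp = requests.get(
    "https://www.pixiv.net/ajax/search/artworks/千恋万花",
    headers=headers, cookies=cookies,
    params={"word": "千恋万花", "lang": "zh"},
)
data = resp.json()
# With a valid cookie, 'error' should come back False instead of the
# {'error': True, 'message': '尚无此页', ...} response quoted above.
print(data.get("error"), data.get("message"))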
And that's it, all done!
Note that this tutorial only implements the basic functionality; plenty of features still need further polishing...