import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor

# Use a session so TCP connections are reused across requests
session = requests.Session()

# Placeholders: fill in the site's cookies, request headers, and query params
cookies = {}

headers = {}

params = {}
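# Note (assumption): post_data() below hand-rolls a multipart/form-data body
# with the boundary ----WebKitFormBoundaryZpZ0W9jNRUM2lSFt, so the (emptied)
# headers dict presumably carried a matching Content-Type; without it the
# server cannot split the body into fields. An illustrative sketch:
# headers['Content-Type'] = (
#     'multipart/form-data; boundary=----WebKitFormBoundaryZpZ0W9jNRUM2lSFt'
# )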
# Crawl one listing page and return the post IDs found on it
def get_page_urls(page):
    xhurl = f'https://xhcya.com/category/acg/page/{page}'  # category listing URL
    try:
        resp = session.get(url=xhurl, headers=headers).text
        tree = etree.HTML(resp)
        urls = tree.xpath('//div[@class="inn-archive__item__container inn-card_post-thumbnail__item__container"]/a/@href')
        print(urls)
        # The post ID is the last path segment of each link
        return [url.split('/')[-1] for url in urls]
    except Exception as e:
        print(f"Error fetching page {page}: {e}")
        return []
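# Caveat (assumption about the site's URL format): if a link ends with a
# trailing slash (e.g. .../12345/), url.split('/')[-1] yields '' instead of
# the ID. A defensive variant, should that format appear:
#     return [url.rstrip('/').split('/')[-1] for url in urls]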
# Send one admin-ajax POST carrying the postId field as multipart form data
def post_data(post_id):
    try:
        data = (
            '------WebKitFormBoundaryZpZ0W9jNRUM2lSFt\r\n'
            'Content-Disposition: form-data; name="postId"\r\n'
            '\r\n'
            f'{post_id}\r\n'
            '------WebKitFormBoundaryZpZ0W9jNRUM2lSFt--\r\n'
        )
        response = session.post(url='https://xhcya.com/wp-admin/admin-ajax.php',
                                headers=headers, data=data,
                                params=params, cookies=cookies)
        print(f"Response for {post_id}: {response.text}")
    except Exception as e:
        print(f"Error posting data for {post_id}: {e}")
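# Alternative sketch (not part of the original script): let requests build the
# multipart body and boundary itself via files=; a (None, value) tuple sends a
# plain form field with no filename. This keeps the body and Content-Type in
# sync automatically, so headers= is deliberately omitted here.
def post_data_auto(post_id):
    try:
        response = session.post(
            url='https://xhcya.com/wp-admin/admin-ajax.php',
            files={'postId': (None, str(post_id))},
            params=params, cookies=cookies,
        )
        print(f"Response for {post_id}: {response.text}")
    except Exception as e:
        print(f"Error posting data for {post_id}: {e}")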
# Main function: crawl listing pages concurrently, then POST each collected ID
def main():
    with ThreadPoolExecutor(max_workers=20) as executor:
        # Fetch the links from every listing page
        futures = [executor.submit(get_page_urls, page) for page in range(1, 5)]  # number of pages to crawl
        post_ids = []
        for future in futures:
            post_ids.extend(future.result())

        # Fire a concurrent POST request for each collected post ID
        post_futures = [executor.submit(post_data, post_id) for post_id in post_ids]
        for future in post_futures:
            future.result()  # wait for all requests to finish
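
# Entry-point guard so the script actually runs when executed directly; the
# original flattened snippet defined main() but never invoked it.
if __name__ == '__main__':
    main()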