from lxml import html
import requests
import threading
import time
import queue
def craw(nums: str):
    # Fetch one page of the cnblogs home-page post list; nums is the page index.
    url = 'https://www.cnblogs.com/AggSite/AggSitePostList'
    headers = {
        'content-type': 'application/json; charset=UTF-8',
    }
    # Splice the page index into the JSON payload template.
    data1 = '{"CategoryType": "SiteHome","ParentCategoryId":0,"CategoryId":808,"PageIndex":'
    data2 = ',"TotalPostCount": 4000,"ItemListActionName": "AggSitePostList"}'
    data = data1 + nums + data2
    return requests.post(url=url, headers=headers, data=data).content.decode("utf-8")

def parse(res):
    # Extract post titles and links from the returned HTML fragment.
    x = html.fromstring(res)
    r = x.xpath('//a[@class="post-item-title"]')
    ls = []
    for s in r:
        bt = s.xpath("text()")[0]    # post title
        link = s.xpath("@href")[0]   # post URL
        ls.append(bt)
        ls.append(link)
    return ls
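
# Note that parse(craw("1")) returns a flat [title, link, title, link, ...]
# list for page 1: each pair is appended as two separate items, which is
# why do_parse below can simply write one item per line.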
def do_craw(nums_queue: queue.Queue, html_queue: queue.Queue):
    # Producer: pull a page index from nums_queue, fetch the page, and hand
    # the HTML to html_queue. queue.Queue is thread-safe, so no extra
    # locking is needed.
    while True:
        num = nums_queue.get()
        res = craw(num)
        html_queue.put(res)
        print(threading.current_thread().name, f"craw{num}")
def do_parse(html_queue: queue.Queue, fout):
    # Consumer: pull HTML from html_queue, parse it, and append the titles
    # and links to the output file.
    while True:
        res = html_queue.get()
        content = parse(res)
        for con in content:
            fout.write(str(con) + "\n")
        print(time.time())
        print(threading.current_thread().name, f"content_size:{len(content)}")
if __name__ == '__main__':
    # Plain: crawl all 200 pages sequentially on the main thread.
    for a in range(1, 201):
        craw(str(a))

    # Multi-threaded: one thread per page, started and then joined in bulk.
    threads = []
    for th in range(1, 201):
        threads.append(
            threading.Thread(target=craw, args=(str(th),))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # Decoupled producer-consumer: crawler threads feed html_queue,
    # parser threads drain it and write the results to disk.
    nums_queue = queue.Queue()
    html_queue = queue.Queue()
    for num in range(1, 201):
        nums_queue.put(str(num))
    for idx in range(200):
        t = threading.Thread(target=do_craw, args=(nums_queue, html_queue), name=f"craw{idx}")
        t.start()
    fout = open("data.txt", "w", encoding='utf-8')
    for idx in range(200):
        t = threading.Thread(target=do_parse, args=(html_queue, fout), name=f"parse{idx}")
        t.start()
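    # Note: do_craw/do_parse never return, so this section runs until the
    # process is interrupted (Ctrl-C); fout deliberately stays open for the
    # parser threads. With the *_v2 workers sketched above (an illustration,
    # not the original design), the launcher would additionally put one
    # SENTINEL per crawler thread into nums_queue, e.g.
    #     for _ in range(200):
    #         nums_queue.put(SENTINEL)
    # and then join every thread and close fout once both queues drain.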