Crawling cnblogs Blog Posts with Multithreading

from lxml import html
import requests
import threading
import time
import queue


def craw(nums: str):
    # Fetch one page of the cnblogs home-page post list; `nums` is the page index as a string
    url = 'https://www.cnblogs.com/AggSite/AggSitePostList'
    headers = {
        'content-type': 'application/json; charset=UTF-8',
    }
    # Splice the page index into the JSON request body expected by the API
    data1 = '{"CategoryType": "SiteHome","ParentCategoryId":0,"CategoryId":808,"PageIndex":'
    data2 = ',"TotalPostCount": 4000,"ItemListActionName": "AggSitePostList"}'
    data = data1 + nums + data2
    return requests.post(url=url, headers=headers, data=data).content.decode("utf-8")


def parse(res):
    # Extract post titles and links from the returned HTML fragment
    x = html.fromstring(res)
    r = x.xpath('//a[@class="post-item-title"]')
    ls = []
    for s in r:
        bt = s.xpath("text()")[0]    # post title
        link = s.xpath("@href")[0]   # post URL
        ls.append(bt)
        ls.append(link)
    return ls


def do_craw(nums_queue: queue.Queue, html_queue: queue.Queue):
    # Producer: pull a page index, fetch it, and push the raw HTML onto html_queue.
    # The loop never exits; it simply blocks on get() once nums_queue is empty.
    while True:
        num = nums_queue.get()
        res = craw(num)
        html_queue.put(res)
        print(threading.current_thread().name, f"craw{num}")


def do_parse(html_queue: queue.Queue, fout):
    # Consumer: pull raw HTML, parse it, and write the titles and links to the output file.
    # Like do_craw, this loops forever and blocks on get() when the queue is empty.
    while True:
        res = html_queue.get()
        content = parse(res)
        for con in content:
            fout.write(str(con) + "\n")
        print(time.time())
        print(threading.current_thread().name, f"content_size:{len(content)}")


if __name__ == '__main__':

    # Approach 1: plain sequential crawl of pages 1-200 (baseline; the results are discarded)
    for a in range(1, 201):
        craw(str(a))

    # Approach 2: one thread per page (the results are still discarded)
    threads = []
    for th in range(1, 201):
        threads.append(
            threading.Thread(target=craw, args=(str(th),))
        )

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()

    # Approach 3: decoupled producer/consumer pipeline connected by two queues
    nums_queue = queue.Queue()
    html_queue = queue.Queue()
    for num in range(1, 201):
        nums_queue.put(str(num))

    # Producer threads fetch pages and push the HTML onto html_queue
    for idx in range(200):
        t = threading.Thread(target=do_craw, args=(nums_queue, html_queue), name=f"craw{idx}")
        t.start()

    # Consumer threads parse the HTML and write titles/links to data.txt.
    # Because do_craw/do_parse never return, this version does not exit on its own
    # and fout is never closed; see the shutdown sketch after the listing.
    fout = open("data.txt", "w", encoding='utf-8')
    for idx in range(200):
        t = threading.Thread(target=do_parse, args=(html_queue, fout), name=f"parse{idx}")
        t.start()
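
As written, Approach 3 never finishes: do_craw and do_parse block on their queues forever, the script has to be interrupted by hand once the queues drain, and data.txt is never closed. Below is a minimal sketch of a shutdown-friendly variant. It reuses the imports and the craw/parse functions from the listing above; the None sentinels and the smaller thread counts (20 crawlers, 4 parsers) are illustrative choices, not part of the original code.

def do_craw2(nums_queue: queue.Queue, html_queue: queue.Queue):
    # Same producer as do_craw, but a None sentinel tells the worker to exit
    while True:
        num = nums_queue.get()
        if num is None:
            break
        html_queue.put(craw(num))


def do_parse2(html_queue: queue.Queue, fout):
    # Same consumer as do_parse, also stopping on a None sentinel
    while True:
        res = html_queue.get()
        if res is None:
            break
        for con in parse(res):
            fout.write(str(con) + "\n")


if __name__ == '__main__':
    nums_queue = queue.Queue()
    html_queue = queue.Queue()
    for num in range(1, 201):
        nums_queue.put(str(num))

    # Illustrative thread counts: 20 crawlers and 4 parsers instead of 200 each
    crawlers = [threading.Thread(target=do_craw2, args=(nums_queue, html_queue)) for _ in range(20)]
    with open("data.txt", "w", encoding="utf-8") as fout:
        parsers = [threading.Thread(target=do_parse2, args=(html_queue, fout)) for _ in range(4)]
        for t in crawlers + parsers:
            t.start()

        # One sentinel per crawler; they sit behind the 200 page numbers, so all
        # pages are fetched before any crawler sees its None and exits
        for _ in crawlers:
            nums_queue.put(None)
        for t in crawlers:
            t.join()

        # All HTML is now queued, so the parsers can be told to stop as well
        for _ in parsers:
            html_queue.put(None)
        for t in parsers:
            t.join()

Because each worker keeps pulling work until it sees a sentinel, a handful of threads is enough to cover all 200 pages; one thread per page is not required.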

Tags: Python学习
