循环切片+多线程+消息队列queue

# coding:utf-8
from bs4 import BeautifulSoup
import requests
from requests.packages import urllib3
from fake_useragent import UserAgent
import threading
from queue import Queue




# Silence urllib3's InsecureRequestWarning: the HTTP fetches in this file are
# issued with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)




class BatchGetTitle:
    """Fetch the ``<title>`` of a batch of domains with worker threads.

    Every domain is requested as ``http://<domain>`` and exactly one dict per
    domain is appended to ``self.result``:

    * ``{'url': domain, 'title': text}`` when a non-empty title was found;
    * ``{'url': domain, 'tag': reason}`` otherwise, where ``reason`` marks an
      HTTP status >= 400, a missing/empty title, or a failed request.
    """

    def __init__(self, domains) -> None:
        """Store ``domains`` and prepare headers, job queue and result list."""
        self.domains = domains

        # Send a browser-like User-Agent header with every request.
        self.headers = {
            "User-Agent": UserAgent().random,
        }

        # Work queue shared by all worker threads.
        self.jobs = Queue()

        # One dict per processed domain, appended as workers finish.
        self.result = []

    def _fetch(self, url):
        """Request ``url`` over HTTP and return the result dict for it.

        Any exception raised here is handled by the caller (``getTitle``).
        """
        r = requests.get(
            "http://" + url, headers=self.headers, timeout=3, verify=False
        )
        if r.status_code >= 400:
            return {'url': url, 'tag': '访问错误'}

        # Use the encoding detected from the body to avoid garbled text.
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, "html.parser")

        pagetitle = soup.find("title")
        if pagetitle is None:
            return {'url': url, 'tag': '无标题'}

        title = pagetitle.get_text().replace(' ', '')
        if not title:
            return {'url': url, 'tag': '无标题'}
        return {'url': url, 'title': title}

    def getTitle(self, q):
        """Worker loop: take domains from ``q`` until it is empty.

        Args:
            q: queue of domain strings; ``task_done()`` is called once for
                every item taken, whether the fetch succeeded or not.
        """
        # Local import: the file-level import only brings in ``Queue``.
        from queue import Empty

        while True:
            # A non-blocking get avoids the check-then-get race of
            # ``while not q.empty(): q.get()``, where another worker can take
            # the last item in between and leave this thread blocked forever.
            try:
                url = q.get_nowait()
            except Empty:
                return

            try:
                self.result.append(self._fetch(url))
            except Exception:
                # Best effort: any failure while fetching or parsing is
                # recorded as "unreachable" instead of killing the worker.
                self.result.append({'url': url, 'tag': '无法访问'})
            finally:
                q.task_done()

    def main(self, workers=10):
        """Queue every domain, process them in threads and return the results.

        Args:
            workers: number of worker threads to start (default 10).

        Returns:
            ``self.result``, holding one dict per domain.

        Raises:
            ValueError: if ``workers`` is smaller than 1.
        """
        if workers < 1:
            raise ValueError("workers must be >= 1")

        # Fill the queue before any worker starts so no worker sees it empty
        # while jobs are still being added.
        for domain in self.domains:
            self.jobs.put(domain)

        threads = [
            threading.Thread(target=self.getTitle, args=(self.jobs,))
            for _ in range(workers)
        ]
        for worker in threads:
            worker.start()

        # Wait until every queued job was marked done, then for the workers
        # themselves to exit.
        self.jobs.join()
        for worker in threads:
            worker.join()
        return self.result


def get_result(val, batch_size=2):
    """Process ``val`` in consecutive slices and collect every result.

    Each slice of at most ``batch_size`` domains is handed to a fresh
    ``BatchGetTitle``; the slice's result list is printed and then added to
    the combined list that is returned.

    Args:
        val: sliceable sequence of domain names.
        batch_size: number of domains per slice (default 2).

    Returns:
        A list with the result dicts of all slices, in slice order.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    collected = []
    for start in range(0, len(val), batch_size):
        batch = val[start:start + batch_size]
        batch_result = BatchGetTitle(batch).main()
        print(batch_result)
        collected.extend(batch_result)
    return collected



# Sample domains processed when this file is executed as a script.
obj = [
    "701.com",
    "9966.com",
    "66612a.com",
    "98.net",
    "amjs.com",
    "qq.com",
]

if __name__ == "__main__":
    # Run only on direct execution so that importing this module does not
    # start any network requests.
    print(get_result(obj))

Last updated