Program usage instructions:

Install the Python libraries the source code requires (requests and beautifulsoup4).

Usage:

python3 baidu.py 教育平台 300

教育平台 is the search keyword.

300 is the number of result pages to fetch. It is best not to exceed 300: past that point Baidu starts returning duplicate pages, which can trap the script in an endless loop. This is an issue on Baidu's side; you can also work around it yourself, for example as sketched below.
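One possible workaround, sketched here as an idea rather than a fix taken from the original script: stop requesting further pages as soon as a page contributes no new URLs, since duplicate pages add nothing new. collect_page below is an assumed variant of the script's collect() that returns one page's URLs instead of appending to a global list.

def collect_until_repeat(word, max_pages=300):
    # Sequential crawl that stops once Baidu starts repeating pages.
    # collect_page(word, offset) is hypothetical, not in the original script.
    seen = set()
    for page in range(max_pages):
        before = len(seen)
        seen.update(collect_page(word, page * 10))
        if len(seen) == before:   # no new URLs: results have started repeating
            break
    return seen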

In the source code: remove = ['http://baike.baidu.com','http://zhidao.baidu.com','http://baijiahao.baidu.com','http://wk.baidu.com']

The remove list holds URLs to strip from the results. There is no domain matching; it is a plain string comparison against each collected URL. A domain-based alternative is sketched below.
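If you want real domain matching instead, here is a minimal sketch using the standard urllib.parse module (blocked_domains and filter_by_domain are names of my own, not from the original script):

from urllib.parse import urlparse

blocked_domains = {"baike.baidu.com", "zhidao.baidu.com",
                   "baijiahao.baidu.com", "wk.baidu.com"}

def filter_by_domain(urls):
    # urlparse("http://baike.baidu.com/item/x").netloc == "baike.baidu.com"
    return [u for u in urls if urlparse(u).netloc not in blocked_domains]

This catches any URL on a blocked host, whereas the plain string match only removes entries exactly equal to one of the remove strings.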


Full code:

import re
import requests
import time
import sys
import random
from urllib.parse import quote, unquote
import threading
from bs4 import BeautifulSoup
result = []    # URLs gathered by all collector threads
threads = []
def collect(word, pn):
    # Fetch one page of mobile Baidu results; pn is the result offset (page * 10).
    word = quote(word, encoding="utf-8")  # percent-encode the keyword
    url = "http://m.baidu.com/ssid=0/from=0/bd_page_type=1/uid=0/baiduid=F0A715FCC08EDFEF3EF12FEDDC2EC810/pu=sz%40224_220%2Cta%40middle____/pu=sz%40224_220%2Cta%40middle___24_74.0/baiduid=31235B9FF0F7A756A7940620CAF109E1/s?ref=www_colorful&lid=12985577237012163036&word=" + word + "&pn=" + str(
        pn) + "&rn=10&tn=middle&prest=111081&st=111091&usm=0&sa=pp"
    headers = {
        "Content-Type": "application/x-www-form-urlencoded",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
    }
    response = requests.get(url, headers=headers, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')
    res = soup.find_all('div', class_="resitem")  # one div.resitem per search hit
    for i in res:
        try:
            absClass = i.find('div', class_="abs")
            siteurl = absClass.find('span', class_='site').get_text()  # displayed site domain
            siteurl = "http://" + siteurl
            title = i.find('a').get_text()
            result.append(siteurl)
            # print(title + " - " + siteurl)
        except AttributeError:
            # Some result blocks lack the abs/site span; skip them.
            pass
def save(result):
    # File name: timestamp plus two random numbers, so repeated runs don't collide.
    filename = time.strftime("%Y-%m-%d-%H-%M", time.localtime()) + "-" + str(
        random.randint(1, 100)) + "-" + str(random.randint(0, 9)) + ".txt"
    with open(filename, "a", encoding="utf-8") as fw:
        for i in result:
            fw.write(i.strip() + "\n")
def main(word, pn):
    # One thread per result page; Baidu's pn parameter is the offset (page * 10).
    for i in range(pn):
        offset = i * 10
        t = threading.Thread(target=collect, args=(word, offset))
        t.daemon = True   # setDaemon() is deprecated; assigning .daemon is enough
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    urlList = list(set(result))   # de-duplicate across threads
    remove = ['http://baike.baidu.com', 'http://zhidao.baidu.com',
              'http://baijiahao.baidu.com', 'http://wk.baidu.com']
    for i in remove:
        if i in urlList:          # plain string match, as described above
            urlList.remove(i)
    total = len(urlList)
    save(urlList)
    return total
if __name__ == "__main__":
    word = sys.argv[1]        # search keyword
    pn = int(sys.argv[2])     # number of result pages to fetch
    print("Collecting...")
    total = main(word, pn)
    print("Collected " + str(total) + " domains in this run")
    print("Done")