标签 百度url采集 下的文章

Baidu采集URL脚本


采集百度URL脚本-Python3

#!/usr/bin/env python3
# coding: utf-8
# @Author:YH's
# @Date: 2017年4月13日 14:43:21
# @Last Modified by :   YH's
# @Last Modified time:  2017年4月13日 14:44:03

import ast
from urllib import parse

import requests
from bs4 import BeautifulSoup

def writeFile(data):
    """Append every item of ``data`` to ``url.txt``, one item per line.

    Args:
        data: Iterable of strings (the collected URLs).

    The file is opened in append mode, so successive calls accumulate
    their lines instead of overwriting earlier results.
    """
    # The context manager guarantees the handle is flushed and closed even
    # if a write fails. UTF-8 is pinned so URLs containing non-ASCII
    # characters do not depend on the platform's default encoding.
    with open("url.txt", "a", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in data)

def BaiduSpider(keyword,page=1,timeout=10):
    """Collect result URLs from Baidu mobile search and save them via writeFile.

    Requests ``page`` result pages for ``keyword`` (the ``pn`` offset advances
    by 10 per page). On each page, every element whose ``tpl`` attribute is
    ``www_normal`` has its ``data-log`` attribute parsed, and the ``mu`` field
    of the parsed value is collected. The de-duplicated URLs are passed to
    ``writeFile``.

    Args:
        keyword: Search phrase; it is URL-quoted before being sent.
        page: Number of result pages to request.
        timeout: Seconds to wait for each HTTP response.

    Paging stops early when a request fails or a page lacks the expected
    result markup; URLs gathered up to that point are still written.
    A KeyboardInterrupt also saves what has been gathered, then exits with
    status 0.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    # A page is only parsed when at least one of these markers is present.
    result_markers = (
        "class=\"result c-result c-clk-recommend\"",
        "class=\"result c-result\"",
    )
    paths = []
    page_no = 1
    offset = 0
    end_offset = page * 10
    try:
        while offset < end_offset:
            url = "https://m.baidu.com/s?wd=%s&pn=%d" % (parse.quote(keyword), offset)
            print("请求页面:%d" % page_no)
            try:
                # Without a timeout a stalled connection would block forever.
                r = requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException as e:
                # Stop paging but keep whatever was collected so far.
                print("警告:请求失败:%s" % e)
                break
            if not any(marker in r.text for marker in result_markers):
                print("警告:没有匹配合适的URL地址")
                break
            soup = BeautifulSoup(r.text, "html.parser")
            for node in soup.find_all(tpl="www_normal"):
                raw = node.get('data-log')
                if not raw:
                    continue
                try:
                    # SECURITY: data-log comes from the remote page, so it is
                    # parsed with ast.literal_eval (literals only) instead of
                    # eval(), which would execute arbitrary expressions.
                    entry = ast.literal_eval(raw)
                except (ValueError, SyntaxError):
                    # Not a plain literal; skip this entry and keep going.
                    continue
                print('---------')
                print(entry)
                if not isinstance(entry, dict):
                    continue
                target = entry.get('mu')
                if isinstance(target, str) and target:
                    paths.append(target)
            page_no += 1
            offset += 10
    except KeyboardInterrupt:
        # Interrupted by the user: persist partial results before exiting.
        writeFile(list(dict.fromkeys(paths)))
        raise SystemExit(0)
    # dict.fromkeys drops duplicates while keeping first-seen order.
    writeFile(list(dict.fromkeys(paths)))

if __name__ == "__main__":
    # Script entry point: show the banner, read the search phrase and the
    # number of pages from stdin, then run the spider.
    #
    # The banner is a raw string because the ASCII art contains backslash
    # sequences that are invalid escapes in a normal string literal and make
    # newer Python versions emit a SyntaxWarning. The text is unchanged.
    banner = r'''
       _                             
 _   _| |__  ___ ___  ___ __ _ _ __  
| | | | '_ \/ __/ __|/ __/ _` | '_ \ 
| |_| | | | \__ \__ \ (_| (_| | | | |
 \__, |_| |_|___/___/\___\__,_|_| |_|
 |___/  

blog:https://huimaozi.win
'''
    print(banner)
    try:
        keyword = input("输入搜索内容:")
        page = int(input("输入页面数:"))
    except (ValueError,TypeError):
        # The page count was not a valid integer.
        print("警告:输入页面类型错误!")
        raise SystemExit(0)
    except (KeyboardInterrupt,EOFError):
        # Ctrl-C, or stdin closed before a value was entered.
        print("警告:没有输入数值")
        raise SystemExit(0)
    BaiduSpider(keyword,page)