Baidu URL Collection Script - Python 3
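The script below queries the mobile Baidu search endpoint (https://m.baidu.com/s) page by page, pulls the real target URL out of each normal result's data-log attribute, deduplicates the collected links, and appends them to url.txt. Pressing Ctrl-C mid-run saves whatever has been gathered so far before exiting.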
#!/usr/bin/env python3
# coding: utf-8
# @Author: YH's
# @Date: 2017-04-13 14:43:21
# @Last Modified by: YH's
# @Last Modified time: 2017-04-13 14:44:03
import ast
import sys
from urllib import parse

import requests
from bs4 import BeautifulSoup
def writeFile(data):
    # Append the collected URLs to url.txt, one per line.
    # The with-block guarantees the file is closed (the original
    # f.close was missing its parentheses and never ran).
    with open("url.txt", "a", encoding="utf-8") as f:
        for line in data:
            f.write(line + "\n")
def BaiduSpider(keyword, page=1):
    paths = []
    gpage = 0   # pn offset sent to Baidu (steps of 10)
    flag = 1    # human-readable page counter for progress output
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:52.0) Gecko/20100101 Firefox/52.0"}
    page *= 10  # each result page covers 10 entries
    while gpage < page:
        url = "https://m.baidu.com/s?"
        url += "wd=%s&" % parse.quote(keyword)
        url += "pn=%d" % gpage
        try:
            print("Requesting page %d" % flag)
            r = requests.get(url, headers=headers)
            # Stop once the response no longer contains result blocks.
            if r.text.find('class="result c-result c-clk-recommend"') == -1 and r.text.find('class="result c-result"') == -1:
                print("Warning: no matching URLs found")
                break
            soup = BeautifulSoup(r.text, "html.parser")
            # Ordinary web results carry the attribute tpl="www_normal".
            for i in soup.find_all(tpl="www_normal"):
                # data-log holds a Python-style dict literal whose 'mu'
                # key is the real target URL; literal_eval parses it
                # without the code-execution risk of eval().
                a = ast.literal_eval(i.get('data-log'))
                print('---------')
                print(a)
                paths.append(a['mu'])
        except KeyboardInterrupt:
            # Ctrl-C: save everything collected so far, then quit.
            writeFile(set(paths))
            sys.exit(0)
        flag += 1
        gpage += 10
    # Deduplicate before writing out.
    writeFile(set(paths))
if __name__ == "__main__":
    # Raw string so the trailing backslashes in the ASCII art are not
    # swallowed as line continuations or escape sequences.
    banner = r'''
       _
 _   _| |__  ___ ___  ___ __ _ _ __
| | | | '_ \/ __/ __|/ __/ _` | '_ \
| |_| | | | \__ \__ \ (_| (_| | | | |
 \__, |_| |_|___/___/\___\__,_|_| |_|
 |___/
    blog:https://huimaozi.win
    '''
    print(banner)
    try:
        keyword = input("Enter search keywords: ")
        page = int(input("Enter number of pages: "))
    except (ValueError, TypeError):
        print("Warning: the page count must be a number!")
        sys.exit(0)
    except KeyboardInterrupt:
        print("Warning: no input given")
        sys.exit(0)
    BaiduSpider(keyword, page)
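
For reference, the extraction step hinges on Baidu's data-log attribute, which holds a Python-style dict literal (single quotes, so json.loads would reject it). The snippet below is a minimal sketch using a made-up sample value to show why ast.literal_eval is the safe way to parse it; the real attribute carries more keys than shown here.

import ast

# Hypothetical data-log payload, shaped like what m.baidu.com attaches
# to each result block (the real value contains additional keys).
sample = "{'order':'3','mu':'https://example.com/page'}"

record = ast.literal_eval(sample)  # parses literals only; runs no code, unlike eval()
print(record['mu'])                # -> https://example.com/page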