#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import sys
import urllib.request
import urllib.error
from urllib import parse
import re
import datetime
import MySQLdb
import socket
# Database connection settings
use = ''       # database user name
password = ''  # database password
host = ''      # database host IP
database = ''  # database name
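# Hypothetical alternative (not used below): read the credentials from the
# environment instead of hard-coding them, e.g.
#     import os
#     use = os.environ.get('MYSQL_USER', '')
#     password = os.environ.get('MYSQL_PASSWORD', '')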
# Strip non-printable control characters (ASCII 0-31) from a string.
# Parameters:
#     s (in) the string to clean
# Returns: the cleaned string
def confir(s):
    for i in range(0, 32):
        s = s.replace(chr(i), '')
    return s
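# Illustrative example: confir('GB\x01/T\n19001') returns 'GB/T19001', since
# chr(1) and the newline both fall in the chr(0)..chr(31) range.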
# Open a database connection.
# Parameters: use      database user name
#             password database password
#             host     database host IP
#             database database name
# Returns: a connection handle on success
def openMysql(use, password, host, database):
    conn = MySQLdb.connect(host, use, password, database, charset='utf8')
    return conn
# Close a database connection.
# Parameters: the connection handle
# Returns: nothing
def closeMysql(conn):
    conn.close()
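# Determine this machine's outbound IP address. Connecting a UDP socket to a
# public host (8.8.8.8 is Google's public DNS server) transmits no packets,
# but afterwards getsockname() reports, as an (ip, port) tuple, the local
# address the OS would route through. This helper factors out logic that was
# previously duplicated inline in parsetable() and mainfunction().
def get_local_ip():
    skt = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    skt.connect(('8.8.8.8', 80))
    ip = skt.getsockname()[0]
    skt.close()
    return ip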
# Fetch a web page.
# Parameters:
#     url (in) the page URL
# Returns: a parsed BeautifulSoup document, or None on failure
def getHtml(url):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8'
        }
        req = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8', 'ignore')
        return BeautifulSoup(html, 'html.parser')
    except (urllib.error.URLError, AttributeError):
        return None
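# Usage sketch:
#     page = getHtml('http://news.baidu.com')
#     if page is not None:
#         print(page.title.getText())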
# Parse a result page and find the link to the next page.
# Parameters:
#     url the page to parse
# Returns: the next-page URL on success, " " if there is no next page
def tableutl(url):
    # Fetch the page content
    html = getHtml(url)
    endpage = " "
    if html is None:
        return endpage
    # Walk the pager and pick out the "next page" ("下一页>") link
    pages = html.find_all('p', id="page")
    for pg in pages:
        for pgurl in pg.find_all('a', class_="n"):
            if pgurl.getText() == "下一页>":
                endpage = pgurl.get('href')
                break
    return endpage
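# Note: the href found above is site-relative (for example "/ns?word=...&pn=20";
# the exact form is illustrative), so callers prepend "http://news.baidu.com",
# as __main__ does below.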
# Parse a Baidu news search result page and store every result in MySQL.
# Parameters:
#     url (in) a Baidu search result page
# Returns: nothing
def parsetable(url):
    html = getHtml(url)
    if html is None:
        return
    conn = openMysql(use, password, host, database)
    cursor = conn.cursor()
    # Create the results table if it does not exist
    createtabsql = """CREATE TABLE IF NOT EXISTS baidulistpage(
        listid VARCHAR(20),
        taskid VARCHAR(20),
        current_page_num VARCHAR(100),
        standard_encd VARCHAR(100),
        standard_name VARCHAR(100),
        release_department VARCHAR(100),
        Implementation_date VARCHAR(100),
        start VARCHAR(20),
        detail_link VARCHAR(255), -- widened from 56 so full result URLs fit
        acqui_status VARCHAR(28),
        collector_ip VARCHAR(56),
        acqui_time VARCHAR(56),
        current_page_visits VARCHAR(56));"""
    cursor.execute(createtabsql)
    conn.commit()
    # Record the collector machine's outbound IP address
    ip = get_local_ip()
    # One record per result; keys keep the original Chinese field names,
    # with English glosses alongside
    for divtag in html.find_all('div', id="wrapper_wrapper"):
        record = {
            'ListId': '',
            'Taskid': '',
            '当前页码': '',        # current page number
            '标准编号': '',        # standard number
            '标准名称': '',        # standard name
            '发布部门': '',        # issuing department
            '实施日期': '',        # implementation date
            '状态': '',            # status
            '详情链接': '',        # detail link
            '采集状态': '',        # collection status
            '采集机IP': '',        # collector IP
            '采集时间': '',        # collection time
            '当前页面访问次数': ''  # visits to this page
        }
        dt = datetime.datetime.now()
        record['详情链接'] = url
        record['采集机IP'] = ip
        record['采集时间'] = dt.strftime("%Y-%m-%d %H:%M:%S")
        for div in divtag.find_all('div', class_="result"):
            for h3 in div.find_all('h3'):
                for a in h3.find_all('a'):
                    record['详情链接'] = a.get('href')
                    record['标准名称'] = confir(a.getText().replace(" ", ""))
            for p in div.find_all('p'):
                # '\xa0' is &nbsp;; a double &nbsp; separates the issuing
                # department from the date, so turn it into ':' and split
                tr = confir(p.getText().replace(" ", "")).replace(u'\xa0\xa0', u':')
                fields = re.split(r":", tr)
                if len(fields) < 2:
                    record['实施日期'] = fields[0]
                else:
                    record['发布部门'] = fields[0]
                    record['实施日期'] = fields[1]
            # Insert one row per search result; values are passed as
            # parameters so the driver escapes them safely
            insertdatasql = ("INSERT INTO baidulistpage(listid,taskid,"
                "current_page_num,standard_encd,standard_name,"
                "release_department,Implementation_date,start,"
                "detail_link,acqui_status,collector_ip,acqui_time,"
                "current_page_visits) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);")
            cursor.execute(insertdatasql, [record['ListId'], record['Taskid'],
                record['当前页码'], record['标准编号'], record['标准名称'],
                record['发布部门'], record['实施日期'], record['状态'],
                record['详情链接'], record['采集状态'], record['采集机IP'],
                record['采集时间'], '1'])
    conn.commit()
    cursor.close()
    closeMysql(conn)
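# Usage sketch (the query string shown is parse.quote('吴恩达')):
#     parsetable('http://news.baidu.com/ns?word=%E5%90%B4%E6%81%A9%E8%BE%BE&tn=news&from=news&cl=2&rn=20&ct=1')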
# Record the search entry (keyword, entry URL, collector metadata) in MySQL.
# Parameters:
#     keyword (in) the search keyword
# Returns: nothing
def mainfunction(keyword):
    # Build the entry URL for the keyword
    initial = 'http://news.baidu.com'
    url = initial + "/ns?word=" + parse.quote(keyword) + "&tn=news&from=news&cl=2&rn=20&ct=1"
    record = {
        'TaskId': '',
        '网站名称': '',      # website name
        '类别名称': '',      # category name
        '检索词': '',        # search term
        '附加参数': '',      # additional parameters
        '入口链接': '',      # entry link
        '采集状态': '',      # collection status
        '采集时间': '',      # collection time
        '采集机IP': '',      # collector IP
        '采集机名称': '',    # collector host name
        '采集起始时间': '',  # collection start time
        '采集结束时间': '',  # collection end time
        '采集间隔': ''       # collection interval
    }
    # Record the collector machine's outbound IP address
    ip = get_local_ip()
    dt = datetime.datetime.now()
    html = getHtml(initial)
    if html is None:
        return
    title = html.find_all('title')
    record['网站名称'] = title[0].getText()
    record['类别名称'] = '新闻'
    record['检索词'] = keyword
    record['入口链接'] = url
    record['采集机IP'] = ip
    record['采集时间'] = dt.strftime("%Y-%m-%d %H:%M:%S")
    record['采集机名称'] = socket.getfqdn(socket.gethostname())
    conn = openMysql(use, password, host, database)
    cursor = conn.cursor()
    # Create the entry table if it does not exist
    createtabsql = """CREATE TABLE IF NOT EXISTS baiduentrance(
        task_id VARCHAR(50),
        website_name VARCHAR(100),
        class_name VARCHAR(100),
        search_term VARCHAR(100),
        additi_para VARCHAR(100),
        entrance_link VARCHAR(255), -- widened from 100 so long quoted keywords fit
        acqui_status VARCHAR(100),
        acqui_time VARCHAR(56),
        collector_ip VARCHAR(28),
        collector_name VARCHAR(56),
        acqui_start_time VARCHAR(56),
        acqui_end_time VARCHAR(56),
        collector_interval VARCHAR(56)
        );"""
    cursor.execute(createtabsql)
    conn.commit()
    record['采集状态'] = '已采集'  # mark as collected
    insertdatasql = ("INSERT INTO baiduentrance(task_id,website_name,"
        "class_name,search_term,additi_para,"
        "entrance_link,acqui_status,acqui_time,"
        "collector_ip,collector_name,acqui_start_time,"
        "acqui_end_time,collector_interval) VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);")
    cursor.execute(insertdatasql, [record['TaskId'], record['网站名称'],
        record['类别名称'], record['检索词'], record['附加参数'],
        record['入口链接'], record['采集状态'], record['采集时间'],
        record['采集机IP'], record['采集机名称'], record['采集起始时间'],
        record['采集结束时间'], record['采集间隔']])
    conn.commit()
    cursor.close()
    closeMysql(conn)
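# Usage sketch: mainfunction('吴恩达') records one entry row in baiduentrance
# describing this crawl (keyword, entry URL, collector host and IP).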
if __name__ == "__main__":
    keyword = sys.argv[1]
    url = "http://news.baidu.com/ns?word=" + parse.quote(keyword) + "&tn=news&from=news&cl=2&rn=20&ct=1"
    parsetable(url)
    url_1 = "http://news.baidu.com" + tableutl(url)
    mainfunction(keyword)
    # Follow "next page" links for up to 100 further pages
    for _ in range(1, 101):
        url_2 = url_1
        parsetable(url_2)
        nextpage = tableutl(url_2)
        if nextpage == " ":
            break  # no next-page link; stop crawling
        url_1 = "http://news.baidu.com" + nextpage
# Run with:
#     python3.4 baidunews.py 吴恩达