本文章仅用作学习python使用
与其他文章的爬虫区别之处在于不使用ocr识别
环境:
python 依赖项:
1.selenium
2.pandas
3.numpy
4.pymysql
软件依赖项:
chrome浏览器
chromedriver
mysql
如何检查chrome的版本
下载对应版本的chromedriver
python第三方库下载
功能模块一:登录
加载库:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
import time
import requests
from lxml.html import etree
import pandas as pd
import numpy as np
登录部分函数:
原文
此文已过时,查找元素部分已修正
请先写账号密码到account.txt,账号换行+密码
# Open Chrome on the Baidu login page and interactively walk the user
# through signing in.  Credentials come from account.txt: username on the
# first line, password on the second.  Sets the module-level `browser`.
def _read_account(path="account.txt"):
    """Return [username, password] read from *path*; exit on failure.

    The original duplicated this logic in two places, once reading
    "account.txt" and once "../baidu/account.txt" — unified here on the
    documented location, with the file closed via a context manager.
    """
    try:
        with open(path) as fileaccount:
            return [line.strip() for line in fileaccount.readlines()]
    except Exception as err:
        print(err)
        input("请正确在account.txt里面写入账号密码")
        exit()


def _fill_login():
    """Clear the username/password boxes, type the credentials, submit."""
    browser.find_element_by_id("TANGRAM__PSP_3__userName").clear()
    browser.find_element_by_id("TANGRAM__PSP_3__password").clear()
    account = _read_account()
    browser.find_element_by_id("TANGRAM__PSP_3__userName").send_keys(account[0])
    browser.find_element_by_id("TANGRAM__PSP_3__password").send_keys(account[1])
    # id="TANGRAM__PSP_3__submit" is the login button.
    browser.find_element_by_id("TANGRAM__PSP_3__submit").click()


def openbrowser():
    """Launch Chrome, log in to Baidu, and loop until the user confirms."""
    global browser
    url = "https://passport.baidu.com/v2/?login&tpl=mn&u=http%3A%2F%2Fwww.baidu.com%2F"
    browser = webdriver.Chrome()
    browser.get(url)
    # Switch the login widget to the classic username/password form.
    browser.find_element_by_xpath('//*[@id="TANGRAM__PSP_3__footerULoginBtn"]').click()
    _fill_login()
    print("等待网址加载完毕...")
    select = input("请观察浏览器网站是否已经登陆(y/n):")
    while 1:
        if select == "y" or select == "Y":
            print("登陆成功!")
            print("准备打开新的窗口...")
            break
        elif select == "n" or select == "N":
            selectno = input("账号密码错误请按0,验证码出现请按1...")
            if selectno == "0":
                # Wrong credentials: re-read account.txt and retry; the
                # loop then re-prompts for the outcome.
                _fill_login()
            elif selectno == "1":
                # A captcha appeared — let the user solve it by hand.
                input("请在浏览器中输入验证码并登陆...")
                select = input("请观察浏览器网站是否已经登陆(y/n):")
        else:
            print("请输入“y”或者“n”!")
            select = input("请观察浏览器网站是否已经登陆(y/n):")


openbrowser()
打开百度指数
# Open Baidu Index in a fresh window via JS, then point the driver at it.
js = 'window.open("http://index.baidu.com");'
browser.execute_script(js)
# window_handles lists every open window; the newest handle is last.
handles = browser.window_handles
# switch_to.window replaces the deprecated switch_to_window() helper
# (removed in Selenium 4).
browser.switch_to.window(handles[-1])
按输入的关键词点击搜索
def openone(keyword):
    """Type *keyword* into the Baidu-Index search box and run the search."""
    print(keyword)
    # The search input; clear any previous query before typing.
    search_input = browser.find_element_by_xpath('//*[@id="search-input-form"]/input[3]')
    search_input.clear()
    search_input.send_keys(keyword)
    # The search button:
    # <input type="submit" value="" id="searchWords" onclick="searchDemoWords()">
    browser.find_element_by_xpath('//*[@id="home"]/div[2]/div[2]/div/div[1]/div/div[2]/div/span/span').click()
    # Maximize so the canvas pixel offsets used later are stable.
    browser.maximize_window()
    time.sleep(3)
建立数据库连接:
数据库建立的脚本我会在后面贴出
# Shared MySQL connection used by every upload helper below.
# NOTE(review): host/user/password are intentionally left blank in the
# article — fill in real credentials (database `baidu`, utf8) before running.
import pymysql
conn =pymysql.connect(host='',user='',password='',db='baidu',charset="utf8")
cur=conn.cursor()
# Helper returning how many rows table `bjii` currently holds, so callers
# can pick the next free ID before inserting.
def nown():
    """Return COUNT(*) of table `bjii` as an int (uses the global cursor)."""
    cur.execute("select count(*) from `bjii`")
    rows = cur.fetchall()
    conn.commit()
    # fetchall() yields [(count,)] — index the tuple directly instead of
    # parsing its string repr (the original did int(str(asa[0])[1:-2]),
    # which breaks the moment the tuple formatting changes).
    return int(rows[0][0])


nown()
去重函数
def unique_list(listt):
    """Return the items of *listt* in first-seen order, duplicates dropped.

    Items may be unhashable (the callers pass lists), so membership is
    checked with == against the result built so far rather than a set.
    """
    result = []
    for item in listt:
        if not any(existing == item for existing in result):
            result.append(item)
    return result
爬取数据存入数据库:
def uploadone(keyword):
    """Scrape the search-index ("搜索指数") curve for *keyword* over
    2017-09-01 .. 2018-03-31 and insert one row per day into table `bjii`.

    Uses the module-level `browser`, `cur`, `conn`, plus nown() and
    unique_list().  Assumes openone(keyword) already loaded the chart —
    TODO confirm against the caller.
    """
    from selenium.webdriver.common.action_chains import ActionChains

    # sel: 0=7 days, 1=30 days, 2=90 days, 3=half a year.  Hard-coded to 3
    # in the article; the mapping is kept so it is easy to change.
    sel = 3
    if sel == 0:
        ind = 2
    elif sel == 1:
        ind = 3
    elif sel == 2:
        ind = 4
    else:
        ind = 5
    # Open the time-range dropdown and pick the chosen entry.
    browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[1]/div[1]/div[1]/div/div[2]/button').click()
    time.sleep(0.3)
    browser.find_element_by_xpath('/html/body/div[3]/div/div/div[' + str(ind) + ']').click()
    time.sleep(0.5)
    # Open the custom date-range picker.
    browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[1]/div[1]/div[1]/div/div[1]/button').click()
    time.sleep(0.3)
    # ly/lm: "previous year"/"previous month" arrows of the left calendar.
    ly = '/html/body/div[11]/div/div[2]/div/div/div[1]/span[1]/button[1]'
    lm = '/html/body/div[11]/div/div[2]/div/div/div[1]/span[2]/button[1]'
    # Day buttons for 2017-09-01 (left calendar) / 2018-03-31 (right).
    b170901 = '/html/body/div[11]/div/div[2]/div/div/div[2]/table/tbody/tr[1]/td[5]/button'
    b180330 = '/html/body/div[11]/div/div[3]/div/div/div[2]/table/tbody/tr[5]/td[6]/button'
    # endday: the end-date field; cof: the confirm button.
    endday = '/html/body/div[11]/div/div[1]/div[3]/div'
    cof = '/html/body/div[11]/div/div[1]/div[4]/div/span[1]'
    # Start date: back 1 year, back 2 months, click the 1st, focus endday.
    browser.find_element_by_xpath(ly).click()
    for _ in range(2):
        browser.find_element_by_xpath(lm).click()
    browser.find_element_by_xpath(b170901).click()
    time.sleep(0.05)
    browser.find_element_by_xpath(endday).click()
    time.sleep(0.05)
    # The picker re-renders after the start date; re-locate the arrows.
    ly1 = '/html/body/div[11]/div/div[3]/div/div/div[1]/span[1]/button[1]'
    lm1 = '/html/body/div[11]/div/div[3]/div/div/div[1]/span[2]/button[1]'
    # End date: back 1 year, back 3 months, click the 31st, confirm.
    browser.find_element_by_xpath(ly1).click()
    for _ in range(3):
        browser.find_element_by_xpath(lm1).click()
    browser.find_element_by_xpath(b180330).click()
    time.sleep(0.05)
    browser.find_element_by_xpath(cof).click()
    time.sleep(0.3)
    # The chart canvas container the mouse will sweep across.
    xoyelement = browser.find_element_by_xpath("/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/div/div[1]")
    # Range length in days.  The original compared `ind` against 0..3
    # (values it can never hold — ind is 2..5) and then unconditionally
    # forced day=180; map from `sel` instead, same result for the default.
    if sel == 0:
        day = 7
    elif sel == 1:
        day = 30
    elif sel == 2:
        day = 90
    else:
        day = 180
    liss = []
    x_0 = 0
    # 200 iterations gives slack past the right edge of the chart; the
    # per-day pixel widths below were measured by hand by the author.
    for _ in range(200):
        if day == 7:
            x_0 = x_0 + 202.33
        elif day == 30:
            x_0 = x_0 + 41.68
        elif day == 90:
            x_0 = x_0 + 13.64
        elif day >= 180:
            x_0 = x_0 + 6.78
        mouse_action = ActionChains(browser)
        # Hovering can fail when a floating element covers the point, so
        # the whole probe is guarded.  The original left the value-element
        # lookup (`dat1 = ...`) inside a comment, so str(dat1.text) raised
        # NameError every iteration and the bare except threw the data
        # away — restored here.
        try:
            mouse_action.move_to_element_with_offset(xoyelement, x_0, 100).perform()
            # Tooltip: .../div[1] holds the date, .../div[2]/div[2] the value.
            dat0 = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/div/div[2]/div[1]')
            dat1 = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/div/div[2]/div[2]/div[2]')
            key = str(dat0.text).split(' ')[0]
            data = str(dat1.text)
            # Only keep points where the tooltip actually showed a date.
            if key != '':
                liss.append([key, data])
        except Exception:
            # No tooltip at this x offset — skip the point.
            pass
    # Several x offsets land on the same day; dedupe preserving order
    # (entries are lists, hence unique_list() instead of set()).
    liss = unique_list(liss)
    # Continue IDs after the rows already present in the table.
    rowid = nown() + 1
    for key, data in liss:
        value = str(data).split(' ')[-1]
        # Parameterized INSERT — the original concatenated the scraped
        # strings straight into the SQL text, which breaks on quotes and
        # invites injection.
        cur.execute(
            "INSERT INTO `bjii` VALUES (%s, %s, %s, %s, %s, %s)",
            (rowid, '搜索指数', keyword, str(key), value, '北京'),
        )
        conn.commit()
        rowid += 1
    time.sleep(0.8)
第二部分,媒体指数数据(上面为搜索指数数据)
换个日期
# Date switcher for the media-index ("媒体指数") chart.
def change_data2():
    """Re-select the 2017-09-01 .. 2018-03-31 window on the media-index
    chart, replaying the exact click sequence the page expects."""
    def _click(xpath):
        browser.find_element_by_xpath(xpath).click()

    # Scroll target into view, open the range dropdown, pick the entry,
    # then open the custom date picker.
    _click('/html/body/div[1]/div[2]/div[2]/div[2]/div[3]/div[2]/div')
    _click('/html/body/div[1]/div[2]/div[2]/div[2]/div[3]/div[1]/div[1]/div[1]/div/div[2]/button')
    _click('/html/body/div[8]/div/div/div[4]')
    _click('/html/body/div[1]/div[2]/div[2]/div[2]/div[3]/div[1]/div[1]/div[1]/div/div[1]/button')
    # Left calendar: one year back, then seven months back.
    time.sleep(0.2)
    _click('/html/body/div[12]/div/div[2]/div/div/div[1]/span[1]/button[1]')
    for _ in range(7):
        _click('/html/body/div[12]/div/div[2]/div/div/div[1]/span[2]/button[1]')
    # Start day — clicked twice, as the original did, to force the switch.
    _click('/html/body/div[12]/div/div[2]/div/div/div[2]/table/tbody/tr[1]/td[7]/button')
    _click('/html/body/div[12]/div/div[2]/div/div/div[2]/table/tbody/tr[1]/td[7]/button')
    # Focus the end-date field.
    _click('/html/body/div[12]/div/div[1]/div[3]/div')
    # Right calendar: one year back, three months back, pick the end day.
    _click('/html/body/div[12]/div/div[3]/div/div/div[1]/span[1]/button[1]')
    for _ in range(3):
        _click('/html/body/div[12]/div/div[3]/div/div/div[1]/span[2]/button[1]')
    _click('/html/body/div[12]/div/div[3]/div/div/div[2]/table/tbody/tr[5]/td[6]/button')
    # Confirm.
    _click('/html/body/div[12]/div/div[1]/div[4]/div/span[1]')
鼠标从左到右来一遍存数据库
def uploadone2(keyword):
    """Scrape the media-index ("媒体指数") curve for *keyword* and insert
    one row per day into `bjii`.  Mirrors uploadone() for the second chart.
    """
    from selenium.webdriver.common.action_chains import ActionChains

    change_data2()
    day = 180  # range length is fixed to half a year by change_data2()
    liss1 = []
    x_0 = 1
    for _ in range(200):
        # Hand-measured pixel step per day for each range length.
        if day == 7:
            x_0 = x_0 + 202.33
        elif day == 30:
            x_0 = x_0 + 41.68
        elif day == 90:
            x_0 = x_0 + 13.64
        elif day >= 180:
            x_0 = x_0 + 6.3
        xoyelement = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[3]/div[1]/div[3]/div[1]/div[1]/div/div[1]/canvas')
        ActionChains(browser).move_to_element_with_offset(xoyelement, x_0, 100).perform()
        # Tooltip: .../div[1] holds the date, .../div[2]/div[2] the value.
        data1 = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[3]/div[1]/div[3]/div[1]/div[1]/div/div[2]/div[2]/div[2]')
        data0 = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[3]/div[1]/div[3]/div[1]/div[1]/div/div[2]/div[1]')
        try:
            key = str(data0.text).split(' ')[0]
            data = str(data1.text)
        except Exception:
            # The original fell through here and then used key/data anyway,
            # raising NameError on the first failure — skip the point.
            print('wrong')
            continue
        if key != '':
            liss1.append([key, data])
    # Dedupe while preserving order (entries are lists, so no set()).
    liss1 = unique_list(liss1)
    rowid = nown() + 1
    for key, data in liss1:
        value = str(data).split(' ')[-1]
        # Parameterized INSERT instead of string-concatenated SQL.
        cur.execute(
            "INSERT INTO `bjii` VALUES (%s, %s, %s, %s, %s, %s)",
            (rowid, '媒体指数', keyword, str(key), value, '北京'),
        )
        conn.commit()
        rowid += 1
    time.sleep(0.8)
性别 年龄分布
def atribution_data():
    """Read the age and gender panels and return a 7-element list:
    five age-bucket values ('19-','20-29','29-39','39-49','50+') followed
    by the male and female values."""
    from selenium.webdriver.common.action_chains import ActionChains

    # Open the crowd-attributes ("人群属性") tab.
    browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[1]/div/div[3]/span/span').click()
    time.sleep(0.5)
    # find_element raises when the node is absent, so the original's
    # if/else could never reach the "未载入完成" branch — use try/except.
    try:
        browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[1]/div[2]/div[2]/div[2]/ul/li[3]/div/div[3]/div/div')
        print('ok')
    except Exception:
        print('未载入完成')
    # Hand-measured x offsets of the five age bars on the canvas.
    list_pos1 = [[65, '19-'], [185, '20-29'], [300, '29-39'], [415, '39-49'], [530, '50+']]
    age_dstri = []
    for pos, label in list_pos1:
        browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[3]/div[1]/div[1]/span').click()
        xoyelement = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[1]/div[2]/div[1]/canvas')
        ActionChains(browser).move_to_element_with_offset(xoyelement, pos, 222).perform()
        data_dist1 = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[1]/div[2]/div[2]').text
        age_dstri.append([label, data_dist1])
    # Gender: hover each pie canvas and read its tooltip.
    xoyelement = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[3]/div[2]/div[1]/div[1]/canvas')
    ActionChains(browser).move_to_element_with_offset(xoyelement, 220, 85).perform()
    man = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[3]/div[2]/div[1]/div[2]').text
    xoyelement = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[3]/div[3]/div[1]/div[1]/canvas')
    ActionChains(browser).move_to_element_with_offset(xoyelement, 220, 10).perform()
    woman = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[3]/div[3]/div[1]/div[2]').text
    # Flatten: the five age values first, then man, then woman.
    finallist = [entry[1] for entry in age_dstri]
    finallist.extend([man, woman])
    return finallist
def update_atribution_data(keyword):
    """Insert the age/gender distribution for *keyword* into `distribution`.

    Column order: keyword, 19-, 20-29, 29-39, 39-49, 50+, man, woman.
    """
    add = atribution_data()
    # Parameterized query instead of string-concatenated SQL (the scraped
    # values are untrusted text and can contain quotes).
    cur.execute(
        "INSERT INTO `distribution` VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
        (keyword, add[0], add[1], add[2], add[3], add[4], add[5], add[6]),
    )
    conn.commit()
画像数据
def child_page_inde(listindex, listcount):
    """Sweep a drilled-down interest chart left to right, appending each
    newly seen tooltip (split on newlines) to *listcount* in place.

    *listindex* is only echoed when a duplicate tooltip is hit, serving
    as a progress marker.
    """
    from selenium.webdriver.common.action_chains import ActionChains

    xxx = 80
    for _ in range(40):
        canvas = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[3]/div[2]/div/div[3]/div[1]/canvas')
        ActionChains(browser).move_to_element_with_offset(canvas, xxx, 220).perform()
        mes = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[3]/div[2]/div/div[3]/div[2]/div').text.split('\n')
        if mes:
            if mes not in listcount:
                listcount.append(mes)
            else:
                print('-------------------------------')
                print(listindex)
                print(xxx)
                print('-------------------------------')
        xxx += 30
def intresting_distri():
    """Open the interest-distribution panel, hover the ten top-level bars
    to read their tooltips, then click into each category and harvest its
    sub-chart via child_page_inde().  Returns the accumulated rows."""
    from selenium.webdriver.common.action_chains import ActionChains

    browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[3]').click()
    time.sleep(0.5)
    contentlist = []
    # Pass 1: hover each bar; keep the tooltip lines minus the last one.
    xxx = 100
    for _ in range(10):
        canvas = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[3]/div[2]/div/div[2]/div[1]/canvas')
        chain = ActionChains(browser)
        chain.move_to_element_with_offset(canvas, xxx, 220).perform()
        mes = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[3]/div[2]/div/div[2]/div[2]/div').text.split('\n')
        contentlist.append(mes[:-1])
        xxx += 123
    # Pass 2: click into each category, scrape its child chart, go back.
    # list_index_ca holds per-category (bar count, start offset) pairs
    # measured by the author; they are passed through as progress markers.
    xxx = 100
    list_index_ca = [[8, 100], [10, 100], [10, 100], [10, 100], [8, 100], [8, 100], [5, 130], [10, 100], [9, 80], [10, 100]]
    for i in range(10):
        canvas = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[3]/div[2]/div/div[2]/div[1]/canvas')
        chain = ActionChains(browser)
        chain.move_to_element_with_offset(canvas, xxx, 220).perform()
        # Same chain object reused, exactly as the original did.
        chain.move_to_element_with_offset(canvas, xxx, 220).click().perform()
        time.sleep(1)
        child_page_inde(list_index_ca[i], contentlist)
        # The back link to the top-level chart.
        browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[3]/div[2]/div/div[2]/a').click()
        time.sleep(0.5)
        xxx += 123
    return contentlist
def upload_facedate(keyword):
    """Insert the de-duplicated interest rows into table `facedata`.

    Indices 0/2/4/6 of each row from intresting_distri() are stored,
    matching the original column mapping (classname, ivalue, 全网分布,
    TGI) — assumes that tooltip layout; TODO confirm against the page.
    """
    intresting_ = unique_list(intresting_distri())
    for add in intresting_:
        # Parameterized query instead of string-concatenated SQL.
        cur.execute(
            "INSERT INTO `facedata` VALUES (%s, %s, %s, %s, %s)",
            (add[0], keyword, add[2], add[4], add[6]),
        )
        conn.commit()
# Related keywords shown BEFORE the search.
def before_indexing():
    """Return [[name, width], ...] for the ten pre-search related words.

    The bar width from each item's style attribute stands in for the
    value, since the page renders the magnitude only as CSS width.
    """
    namell = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]').text.split('\n')
    widths = []
    for row in range(1, 11):
        style = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/ul/li[' + str(row) + ']/div/div[3]/div/div').get_attribute("style")
        # style looks like "width: 93%;" — keep what sits between ':' and ';'.
        widths.append(style.split(':')[1].split(';')[0])
    return [[name, width] for name, width in zip(namell, widths)]
# Related keywords shown AFTER the search.
def after_indexing():
    """Return [[name, width], ...] for the ten post-search related words.

    Same extraction as before_indexing(), pointed at the third panel.
    """
    namell = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[3]/div[1]/div[2]').text.split('\n')
    widths = []
    for row in range(1, 11):
        style = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[3]/div[1]/div[2]/ul/li[' + str(row) + ']/div/div[3]/div/div').get_attribute("style")
        widths.append(style.split(':')[1].split(';')[0])
    return [[name, width] for name, width in zip(namell, widths)]
# Combine the before/after related-keyword lists for one date and store
# them as a single `relationt` row.
def rel_addone_data(date, keyword):
    """Insert one `relationt` row: keyword, date, and the before/after
    related-word lists serialized as "name_value_,name_value_,..." (the
    trailing comma is trimmed, matching the original format)."""
    stringbefor = ""
    for pair in before_indexing():
        for part in pair:
            stringbefor += part + "_"
        stringbefor += ","
    stringaftr = ""
    for pair in after_indexing():
        for part in pair:
            stringaftr += part + "_"
        stringaftr += ","
    # The original opened a brand-new pymysql connection on every call and
    # never closed it (a leak) — and the locals only shadowed the
    # module-level conn/cur anyway.  Reuse the shared connection, with a
    # parameterized INSERT instead of string-concatenated SQL.
    cur.execute(
        "INSERT INTO `relationt` VALUES (%s, %s, %s, %s)",
        (keyword, date, stringbefor[:-1], stringaftr[:-1]),
    )
    print("relationt <-", keyword, date)
    conn.commit()
# Walk every point of the trend chart and upload that date's related words.
def upload_relation_data(keyword):
    """Click through the 52 x-positions of the trend chart; for each,
    read the hovered date and store its related keywords via
    rel_addone_data()."""
    from selenium.webdriver.common.action_chains import ActionChains

    browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[1]/div/div[2]/span/span').click()
    time.sleep(0.8)
    browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[2]/div[2]/div[1]/div[1]/div[2]/ul/li[10]/div/div[2]').click()
    # (The original also read a header element's .text here and discarded
    # it — a no-op, dropped along with the unused `timestr` variable.)
    xx1 = 151
    for _ in range(52):
        canvas = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[1]/div[2]/div[3]/div[1]/canvas[2]')
        chain = ActionChains(browser)
        chain.move_to_element_with_offset(canvas, xx1, 420).perform()
        # The hovered tooltip text is the date for this point.
        mes = browser.find_element_by_xpath('/html/body/div[1]/div[2]/div[2]/div[2]/div[2]/div[1]/div[2]/div[3]/div[2]').text
        chain.move_to_element_with_offset(canvas, xx1, 420).click().perform()
        time.sleep(0.3)
        rel_addone_data(mes, keyword)
        # ~18.5 px between consecutive data points, measured by the author.
        xx1 += 18.5
整合调用
def index_page_one(keyword):
    """Open index.baidu.com, search *keyword*, and run every upload step."""
    # Refresh the shared DB connection.  The original assigned plain
    # locals named conn/cur here, which only shadowed the module-level
    # ones (the helpers kept using the old globals) and leaked a
    # connection per call; `global` makes the refresh actually take.
    global conn, cur
    conn = pymysql.connect(host='', user='', password='', db='baidu', charset="utf8")
    cur = conn.cursor()
    time.sleep(0.3)
    browser.get("http://index.baidu.com")
    time.sleep(0.3)
    openone(keyword)
    time.sleep(0.5)
    uploadone(keyword)
    uploadone2(keyword)
    time.sleep(0.5)
    update_atribution_data(keyword)
    time.sleep(0.5)
    upload_facedate(keyword)
    time.sleep(0.5)
    upload_relation_data(keyword)
from selenium.webdriver.common.action_chains import ActionChains

# Drive the scraper over every keyword in `shoplist` (defined elsewhere —
# presumably loaded before this section runs; confirm before use).
# Failed keywords are collected in errorlist for a later retry.
errorlist = []
for iii, shop_keyword in enumerate(shoplist):
    print(iii)
    try:
        index_page_one(shop_keyword)
    except Exception:
        # Narrowed from a bare except (which also swallowed KeyboardInterrupt).
        # Reset to the index home page and remember the failed keyword.
        browser.get("http://index.baidu.com")
        print('none')
        errorlist.append(shop_keyword)
数据库建立语句:
先建立一个数据库xxx
-- Schema for the Baidu-Index scraper.  Four tables:
--   bjii          daily index values (search + media) per keyword
--   distribution  age / gender breakdown per keyword
--   facedata      interest-profile rows per keyword
--   relationt     before/after related-keyword lists per date
-- NOTE(review): the Python code connects with db='baidu' (and the dump
-- header below says Source Schema: baidu) while this USE targets `xxx` —
-- make the database names agree before running.
use xxx;
/*
Navicat Premium Data Transfer
Source Server : web
Source Server Type : MySQL
Source Server Version : 80016
Source Host : localhost:3306
Source Schema : baidu
Target Server Type : MySQL
Target Server Version : 80016
File Encoding : 65001
Date: 22/06/2019 23:34:48
*/
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for bjii
-- ----------------------------
-- One row per (keyword, date, index-type); filled by uploadone() and
-- uploadone2().  `type` is '搜索指数' or '媒体指数'; `data` is the day.
DROP TABLE IF EXISTS `bjii`;
CREATE TABLE `bjii` (
`id` int(11) NOT NULL AUTO_INCREMENT,
`type` varchar(40) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`keyword` varchar(40) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`data` date NULL DEFAULT NULL,
`value` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`place` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 3482 CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for distribution
-- ----------------------------
-- One row per keyword; columns mirror the list order returned by
-- atribution_data(): five age buckets, then man/woman.
DROP TABLE IF EXISTS `distribution`;
CREATE TABLE `distribution` (
`keyword` varchar(40) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`19-` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`20-29` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`29-39` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`39-49` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`50+` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`man` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`woman` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for facedata
-- ----------------------------
-- Interest-profile rows written by upload_facedate().
DROP TABLE IF EXISTS `facedata`;
CREATE TABLE `facedata` (
`classname` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`keyword` varchar(40) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`ivalue` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`全网分布` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`TGI` varchar(20) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
-- ----------------------------
-- Table structure for relationt
-- ----------------------------
-- Before/after related-keyword lists per (keyword, date), serialized as
-- "name_value_,name_value_,..." strings by rel_addone_data().
DROP TABLE IF EXISTS `relationt`;
CREATE TABLE `relationt` (
`keyword` varchar(40) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`startdate` date NULL DEFAULT NULL,
`beforelist` varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL,
`afterlist` varchar(1000) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NULL DEFAULT NULL
) ENGINE = InnoDB CHARACTER SET = utf8mb4 COLLATE = utf8mb4_0900_ai_ci ROW_FORMAT = Dynamic;
SET FOREIGN_KEY_CHECKS = 1;
至此完成功能
遗憾:在爬取前后相关词语时有一些bug
tips :不同的兴趣柱状图与TGI类别是会改变的。