想批量下载万方数据库的文献,看了一下其html源码不好玩啊.
其一篇文献的下载的链接.
<a onclick="upload('4','hwjs201802015','chi','WF','Cs、O激活对GaAsP光阴极光谱响应特性的影响','0','perio')" id="ddownb" class="read"><i class="icon icon_down" title="下载"></i>下 载</a>
onclick 事件
onclick 事件会在对象被点击时发生。
请注意, onclick 与 onmousedown 不同。单击事件是在同一元素上发生了鼠标按下事件之后又发生了鼠标放开事件时才发生的。
语法:onclick="SomeJavaScriptCode"
找到upload函数。
/* Download: builds the /search/downLoad.do URL for one article and opens it with window.open. */
function upload(page_cnt,id,language,source_db,title,isoa,type,resourceType){
// Disabled login check, left exactly as found in the page source:
/* var user = $("#user").val();
if(user == "{}"){
getloginurl("ALL");
}else{*/
// The title is URI-encoded twice before it is placed in the query string.
title=window.encodeURI(window.encodeURI(title));
// The `type` argument is overwritten with the value of the #document_type element.
var type = $("#document_type").val();
// "standards" is normalised to "standard".
if(type == "standards"){
type="standard";
}
// `type` fills both the resourceType and the type query parameters;
// the resourceType argument itself is never read.
window.open("/search/downLoad.do?page_cnt="+page_cnt+"&language="+language+"&resourceType="+type+"&source="+source_db+"&resourceId="+id+"&resourceTitle="+title+"&isoa="+isoa+"&type="+type);
/*}*/
}
/* Online reading: builds the /search/onlineread.do URL for one article and opens it with window.open. */
function onlineReading(page_cnt,id,language,source_db,title,isoa,type,resourceType){
// Disabled login check, left exactly as found in the page source:
/* var user = $("#user").val();
if(user == "{}"){
getloginurl("ALL");
}else{*/
// The title is URI-encoded twice before it is placed in the query string.
title=window.encodeURI(window.encodeURI(title));
// The `type` argument is overwritten with the value of the #document_type element.
var type = $("#document_type").val();
// "standards" is normalised to "standard".
if(type == "standards"){
type="standard";
}
// `type` fills both the resourceType and the type query parameters;
// the resourceType argument itself is never read.
window.open("/search/onlineread.do?page_cnt="+page_cnt+"&language="+language+"&resourceType="+type+"&source="+source_db+"&resourceId="+id+"&resourceTitle="+title+"&isoa="+isoa+"&type="+type);
/*}*/
}
以Spectral Efficiency and Power Allocation for Mixed-ADC Massive MIMO System这篇文献为例子,其下载事件为:
<a onclick="upload('16','zgtx201803009','eng','WF','Spectral Efficiency and Power Allocation for Mixed-ADC Massive MIMO System','0','perio')" id="ddownb" class="read"><i class="icon icon_down" title="下载"></i>下 载</a>
点击下载,获取了一个url, 好像随机生成hash值
先用浏览器访问下url,如果可以得到数据,就可以使用requests的get方法,如果不能就使用post方法
想采用requests库得到html文本.
def get_html(url):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The ``requests.Response`` on success, with ``encoding`` set to the
        response's ``apparent_encoding``. ``None`` when the request fails or
        the server answers with an HTTP error status; the error is printed.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
    }
    try:
        # Certificate verification is switched off, as in the original code.
        r = requests.get(url, headers=header, verify=False)
        # raise_for_status must be *called*; the bare attribute access used
        # before was a no-op, so 4xx/5xx pages were treated as successes.
        r.raise_for_status()
    except requests.RequestException as e:
        print("has error:"+str(e))
        return None
    r.encoding = r.apparent_encoding
    return r
万方数据库一页显示20条结果
<input type="hidden" id="pSize" value="20">
<!-- 分页参数 -->
搜索页的url
http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&showType=&searchWord=cd&isTriggerTag=
http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&pageSize=20&page=2&searchWord=cd&order=correlation&showType=detail&isCheck=check&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all
http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&pageSize=20&page=3&searchWord=cd&order=correlation&showType=detail&isCheck=check&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all
搜索的结果在strong标签中.采用正则表达式,可以得到搜索结果.
找到<strong>168533</strong>条结果。<br> </div>
def getNum(key):
    """Return the number of search hits the result page reports for *key*.

    Args:
        key: Search keyword, appended to the search URL as given.

    Returns:
        The hit count as an ``int``, or ``None`` (after printing a message)
        when the page cannot be fetched or shows no hit counter.
    """
    head = "http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&showType=&searchWord="
    end = "&isTriggerTag="
    response = get_html(head + key + end)
    # get_html() returns None on failure, so the response itself has to be
    # checked before .text is touched.
    if response is None:
        print("没有文献")
        return None
    found = re.search(r'\s*找到<strong>(.*?)</strong>条结果', response.text)
    if found is None:
        # The counter is missing from the page.
        print("没有文献")
        return None
    # Drop thousands separators, if any, before converting.
    return int(found.group(1).replace(',', ''))
根据关键字key和总结果可以构建出搜索的页面.
def search_key(key):
    """Build the URLs of every result page for the keyword *key*.

    The first page uses the plain search URL; pages 2..N use the paged URL
    form. N is derived from the hit count returned by ``getNum``.

    Args:
        key: Search keyword, inserted into the URLs as given.

    Returns:
        A list of result-page URLs (first page first), or ``None`` when the
        hit count could not be determined.
    """
    page_size = 20  # results per page; must match pageSize in the paged URL
    head = "http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&showType=&searchWord="
    end = "&isTriggerTag="
    allurl = [head + key + end]

    # getNum() already downloads the first result page, so it is not fetched
    # a second time here.
    num = getNum(key)
    if num is None:
        print("text empty")
        return None
    print("找到了:", num)

    # Ceiling division: a partially filled last page still counts.
    page = -(-num // page_size)
    head = 'http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&pageSize=20&page='
    end = '&searchWord='+key+'&order=correlation&showType=detail&isCheck=check&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all'
    allurl.extend(head + str(i) + end for i in range(2, page + 1))

    l = len(allurl)
    print('第', l, "页")
    print(allurl[0])
    print(allurl[l-1])
    return allurl
这是每一页的具体url
def get_url(urls):
    """Collect the article links found on one search-result page.

    Args:
        urls: URL of a single result page.

    Returns:
        A list of absolute URLs, one per ``<a href="/link.do?...">`` on the
        page; empty when the page cannot be fetched.
    """
    base = 'http://www.wanfangdata.com.cn//link.do'
    response = get_html(urls)
    if response is None:
        print("length=", 0)
        return []
    # '.' and '?' are regex metacharacters and must be escaped to match the
    # literal "/link.do?". The query string, including its '?', is captured
    # so that base + capture is a complete URL.
    re0 = r'<a\b[^>]*\bhref="/link\.do(\?[^"]+)'
    allUrl = [base + query for query in re.findall(re0, response.text)]
    print("length=", len(allUrl))
    return allUrl
总共的页数已经得到了,但是js卡住了,不知道怎么生成相关的下载href。已经根据这个文件,下载下来相关的pdf文档了。
def get_pdf(url, path="/home/dflx/文档/python/6.pdf"):
    """Download *url* and store the raw response body at *path*.

    Args:
        url: Direct download address of the document.
        path: Destination file; defaults to the location that was
            previously hard-coded.

    Returns:
        None. Nothing is written when the download fails.
    """
    response = get_html(url)
    # get_html() returns None on failure; writing would raise otherwise.
    if response is None:
        return
    with open(path, 'wb') as f:
        f.write(response.content)
    print("successf")
所以现在做不下去了,于是决定先把每一篇文章的题目、页数等重要信息爬取下来,写入excel文件,看一看。
"""
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun May 20 10:05:29 2018
@author: dflx
"""
import requests
import time
import re
import os
from bs4 import BeautifulSoup
import bs4
from urllib import parse
from multiprocessing import Pool
import xlwt
def get_html(url):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The ``requests.Response`` on success, with ``encoding`` set to the
        response's ``apparent_encoding``. ``None`` when the request fails or
        the server answers with an HTTP error status; the error is printed.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.90 Safari/537.36 2345Explorer/9.3.2.17331',
    }
    try:
        # Certificate verification is switched off, as in the original code.
        r = requests.get(url, headers=header, verify=False)
        # raise_for_status must be *called*; the bare attribute access used
        # before was a no-op, so 4xx/5xx pages were treated as successes.
        r.raise_for_status()
    except requests.RequestException as e:
        print("has error:"+str(e))
        return None
    r.encoding = r.apparent_encoding
    return r
def getNum(key):
    """Return the number of search hits the result page reports for *key*.

    Args:
        key: Search keyword, appended to the search URL as given.

    Returns:
        The hit count as an ``int``, or ``None`` (after printing a message)
        when the page cannot be fetched or shows no hit counter.
    """
    head = "http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&showType=&searchWord="
    end = "&isTriggerTag="
    response = get_html(head + key + end)
    # get_html() returns None on failure, so the response itself has to be
    # checked before .text is touched.
    if response is None:
        print("没有文献")
        return None
    found = re.search(r'\s*找到<strong>(.*?)</strong>条结果', response.text)
    if found is None:
        # The counter is missing from the page.
        print("没有文献")
        return None
    # Drop thousands separators, if any, before converting.
    return int(found.group(1).replace(',', ''))
def search_key(key):
    """Build the URLs of every result page for the keyword *key*.

    The first page uses the plain search URL; pages 2..N use the paged URL
    form. N is derived from the hit count returned by ``getNum``.

    Args:
        key: Search keyword, inserted into the URLs as given.

    Returns:
        A list of result-page URLs (first page first), or ``None`` when the
        hit count could not be determined.
    """
    page_size = 20  # results per page; must match pageSize in the paged URL
    head = "http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&showType=&searchWord="
    end = "&isTriggerTag="
    allurl = [head + key + end]

    # getNum() already downloads the first result page, so it is not fetched
    # a second time here.
    num = getNum(key)
    if num is None:
        print("text empty")
        return None
    print("找到了:", num)

    # Ceiling division: a partially filled last page still counts.
    page = -(-num // page_size)
    head = 'http://www.wanfangdata.com.cn/search/searchList.do?searchType=all&pageSize=20&page='
    end = '&searchWord='+key+'&order=correlation&showType=detail&isCheck=check&isHit=&isHitUnit=&firstAuthor=false&rangeParame=all'
    allurl.extend(head + str(i) + end for i in range(2, page + 1))

    l = len(allurl)
    print('第', l, "页")
    print(allurl[0])
    print(allurl[l-1])
    return allurl
def get_pdf(url, path="/home/dflx/文档/python/6.pdf"):
    """Download *url* and store the raw response body at *path*.

    Args:
        url: Direct download address of the document.
        path: Destination file; defaults to the location that was
            previously hard-coded.

    Returns:
        None. Nothing is written when the download fails.
    """
    response = get_html(url)
    # get_html() returns None on failure; writing would raise otherwise.
    if response is None:
        return
    with open(path, 'wb') as f:
        f.write(response.content)
    print("successf")
def get_information(url):
    """Scrape the metadata of one article detail page into a flat list.

    The list starts with the label '文章的题目' followed by the page title.
    When present, it then holds the Chinese abstract (label + text, read
    from the second <textarea>) and the English abstract (label + text,
    read from the hidden "abstract_content" <div>). Last come the texts
    collected from the <li> items of the second <ul> on the page.

    Args:
        url: URL of the article detail page.

    Returns:
        A list of strings with empty entries removed; an empty list when the
        page cannot be fetched.
    """
    response = get_html(url)
    # get_html() returns None on failure.
    if response is None:
        return []
    wenben = response.text
    soup = BeautifulSoup(wenben, 'html.parser')

    information = ['文章的题目', soup.title.string]
    print("文章的题目:", soup.title.string)

    print("--------中文摘要------------")
    abstract = soup.find_all('textarea')
    # The abstract is taken from index 1, so at least two <textarea> tags
    # are required; a non-empty check alone allowed an IndexError.
    if len(abstract) > 1:
        print(len(abstract), abstract[1].string)
        if abstract[1].string is not None:
            information.append("中文摘要")
            information.append(abstract[1].string)

    print("--------中文摘要------------")
    re0 = r'<div id="abstract_content" style="line-height:24px; padding:5px 0 10px;display: none">\s*(.*?)\s*</div'
    engabs = re.findall(re0, wenben)
    print(engabs)
    if engabs:
        information.append("英文摘要:")
        information.append(engabs[0])

    tag_all = soup.find_all('ul')
    # The detail fields are read from the second <ul>; skip them when the
    # page has fewer lists instead of raising IndexError.
    if len(tag_all) > 1:
        soup_txt = BeautifulSoup(str(tag_all[1]), 'html.parser')
        print(type(soup_txt))
        a_tag = soup_txt.find_all('li')
        print(type(a_tag))
        if a_tag:
            print(type(a_tag[0]))
        print(len(a_tag))

        # Collect one string per child tag of every <li>.
        for smtag in a_tag:
            for stag in smtag.children:
                # Non-tag children (bare text between tags) carry no field.
                if not isinstance(stag, bs4.element.Tag):
                    continue
                pieces = [
                    str(xx.string)
                    for xx in stag.children
                    if type(xx) is not bs4.element.Comment
                    and xx.string is not None
                    and xx.string != '\n'
                ]
                information.append(''.join(piece + ' ' for piece in pieces))

    # Drop empty entries, then trim surrounding whitespace from every entry
    # that contains a space.
    information = [item for item in information if item != '']
    return [item.strip() if ' ' in item else item for item in information]
def get_url(urls):
    """Collect the article links found on one search-result page.

    Args:
        urls: URL of a single result page.

    Returns:
        A list of absolute URLs, one per ``<a href="/link.do?...">`` on the
        page; empty when the page cannot be fetched.
    """
    base = 'http://www.wanfangdata.com.cn//link.do'
    response = get_html(urls)
    if response is None:
        print("length=", 0)
        return []
    # '.' and '?' are regex metacharacters and must be escaped to match the
    # literal "/link.do?". The query string, including its '?', is captured
    # so that base + capture is a complete URL.
    re0 = r'<a\b[^>]*\bhref="/link\.do(\?[^"]+)'
    allUrl = [base + query for query in re.findall(re0, response.text)]
    print("length=", len(allUrl))
    return allUrl
def writeAll(key, path="/home/dflx/文档/informa.xls"):
    """Write the metadata of every article matching *key* to an .xls file.

    Each article takes one row; its fields, as returned by
    ``get_information``, fill the columns from left to right.

    Args:
        key: Search keyword.
        path: Output workbook; defaults to the location that was previously
            hard-coded.

    Returns:
        None. Nothing is written when the result pages cannot be listed.
    """
    file = xlwt.Workbook()
    sheet1 = file.add_sheet('1')

    # search_key() queries the hit count itself, so the separate getNum()
    # call (whose result was discarded) is gone.
    pages = search_key(key)
    if pages is None:
        return

    row = 0
    for page in pages:
        for url in get_url(page):
            # Fetch the metadata of one article and write it as one row.
            print("写入第", row, '篇文章')
            information = get_information(url)
            for j, value in enumerate(information):
                sheet1.write(row, j, value)
            row += 1
            print('一篇写入成功')
            # Saved after every article -- presumably so that an interrupted
            # run keeps the rows written so far.
            file.save(path)

    # Report the total number of articles, not just the last page's links.
    print("一共", row)
    print('successful')
def writrWord(key):
    """Write the metadata of every article matching *key* to a .docx file.

    Per article: a heading made of the first two fields, the article URL,
    a paragraph with the third field followed by the fourth in bold, then
    the remaining fields two per paragraph (a trailing unpaired field gets
    a paragraph of its own).

    Args:
        key: Search keyword.

    Returns:
        None. Nothing is written when the result pages cannot be listed.
    """
    # python-docx is used by this function but was never imported anywhere,
    # which raised NameError on the first line.
    import docx

    path = "/home/dflx/文档/information.docx"
    file = docx.Document()

    pages = search_key(key)
    if pages is None:
        return

    row = 0
    for page in pages:
        for url in get_url(page):
            # Fetch the metadata of one article and append it to the document.
            print("写入第", row, '篇文章')
            information = get_information(url)
            t = len(information)
            if t < 4:
                # The heading and bold paragraph need the first four fields.
                print('信息不完整,跳过:', url)
                continue

            file.add_heading(information[0] + information[1])
            file.add_paragraph(url)
            p = file.add_paragraph(information[2])
            p.add_run(information[3]).bold = True

            # Remaining fields in pairs, starting at index 4. The previous
            # odd-length branch stopped one pair early and could read the
            # loop variable without the loop having run.
            for i in range(4, t - 1, 2):
                file.add_paragraph(information[i] + information[i + 1])
            if t % 2:
                file.add_paragraph(information[t - 1])

            file.save(path)
            row += 1
            print('一篇写入成功')
def write(information, row):
    """Write a single article into a fresh workbook.

    A new workbook with one sheet named '1' is created, the items of
    *information* are placed in consecutive columns of line *row*, and the
    workbook is saved to the fixed .xls path.

    Args:
        information: Sequence of cell values for one article.
        row: Zero-based row index to write to.
    """
    path = "/home/dflx/文档/informa.xls"
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('1')
    print(" 一共有", len(information), '列 ', row, '行')
    for column, value in enumerate(information):
        sheet.write(row, column, value)
    workbook.save(path)
    print('successful')
def main(key='网络爬虫的价值'):
    """Search for *key* and export every hit's metadata via ``writeAll``.

    Args:
        key: Search keyword; defaults to the keyword that was previously
            hard-coded.
    """
    # To try a single article instead, fetch its detail page and write one
    # row, e.g.:
    #   infors = get_information('http://www.wanfangdata.com.cn/details/detail.do?_type=perio&id=jsjyszgc201801022')
    #   write(infors, 0)
    # search_key(key), getNum(key) and get_url(result_page_url) can be called
    # on their own to inspect the intermediate results.
    writeAll(key)
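# Downloading the full text is not finished yet; ul0 and ul below are sample download URLs obtained by clicking the download button.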
ul0='http://f.wanfangdata.com.cn/www/%E4%BB%A5%E8%AE%A1%E7%AE%97%E6%9C%BA%E7%AD%89%E7%BA%A7%E8%80%83%E8%AF%95%E4%B8%BA%E5%AF%BC%E5%90%91%E7%9A%84%E9%AB%98%E8%81%8C%E9%99%A2%E6%A0%A1%E8%AE%A1%E7%AE%97%E6%9C%BA%E5%9F%BA%E7%A1%80%E8%AF%BE%E7%A8%8B%E6%95%99%E5%AD%A6%E7%AD%96%E7%95%A5%E7%A0%94%E7%A9%B6.ashx?type=degree&resourceId=Y3208264&resourceTitle=%25E4%25BB%25A5%25E8%25AE%25A1%25E7%25AE%2597%25E6%259C%25BA%25E7%25AD%2589%25E7%25BA%25A7%25E8%2580%2583%25E8%25AF%2595%25E4%25B8%25BA%25E5%25AF%25BC%25E5%2590%2591%25E7%259A%2584%25E9%25AB%2598%25E8%2581%258C%25E9%2599%25A2%25E6%25A0%25A1%25E8%25AE%25A1%25E7%25AE%2597%25E6%259C%25BA%25E5%259F%25BA%25E7%25A1%2580%25E8%25AF%25BE%25E7%25A8%258B%25E6%2595%2599%25E5%25AD%25A6%25E7%25AD%2596%25E7%2595%25A5%25E7%25A0%2594%25E7%25A9%25B6&transaction=%7B%22id%22%3Anull%2C%22transferOutAccountsStatus%22%3Anull%2C%22transaction%22%3A%7B%22id%22%3A%22998142638869729280%22%2C%22status%22%3A1%2C%22createDateTime%22%3Anull%2C%22payDateTime%22%3A1526810724357%2C%22authToken%22%3A%22TGT-10950440-vQmYpbn7uZ4A9qjhbck0dktj3fqAqupVGvshhbN6HPj0nAuIdK-my.wanfangdata.com.cn%22%2C%22user%22%3A%7B%22accountType%22%3A%22Group%22%2C%22key%22%3A%22hzkjdx%22%7D%2C%22transferIn%22%3A%7B%22accountType%22%3A%22Income%22%2C%22key%22%3A%22ThesisFulltext%22%7D%2C%22transferOut%22%3A%7B%22GTimeLimit.hzkjdx%22%3A30.0%7D%2C%22turnover%22%3A30.0%2C%22productDetail%22%3A%22degree_Y3208264%22%2C%22productTitle%22%3Anull%2C%22userIP%22%3A%22218.199.110.68%22%2C%22organName%22%3Anull%2C%22memo%22%3Anull%2C%22webTransactionRequest%22%3Anull%2C%22signature%22%3A%22GCqmQM0YOUI2CKzr554pVH%2FRC3soMlPbyvtC4tDHMX2dFNcJToPYtfnKoiJ%2BcO5hFzCfo84x16Es%5Cnl5g7%2BwbZUItqh741LwlHDFzBKYaBAm4WjXU3%2BD%2FiMfl%2FzEMMZAqHTF2S%2FsONyYcUol4hry%2FLCXOD%5CnJTy%2FMI7LU2Km%2FySe0eU%3D%22%2C%22delete%22%3Afalse%7D%2C%22isCache%22%3Afalse%7D'
ul='http://f.wanfangdata.com.cn/www/Spectral+Efficiency+and+Power+Allocation+for+Mixed-ADC+Massive+MIMO+System.ashx?type=perio&resourceId=zgtx201803009&resourceTitle=Spectral%2BEfficiency%2Band%2BPower%2BAllocation%2Bfor%2BMixed-ADC%2BMassive%2BMIMO%2BSystem&transaction=%7B%22id%22%3Anull%2C%22transferOutAccountsStatus%22%3Anull%2C%22transaction%22%3A%7B%22id%22%3A%22998101496136486912%22%2C%22status%22%3A1%2C%22createDateTime%22%3Anull%2C%22payDateTime%22%3A1526800915165%2C%22authToken%22%3A%22TGT-10848458-zHl3CXey47UjQav6HqMOisw3CZqNxO6NBjA4fvtzkCQ1tXPRcu-my.wanfangdata.com.cn%22%2C%22user%22%3A%7B%22accountType%22%3A%22Group%22%2C%22key%22%3A%22hbdesf%22%7D%2C%22transferIn%22%3A%7B%22accountType%22%3A%22Income%22%2C%22key%22%3A%22PeriodicalFulltext%22%7D%2C%22transferOut%22%3A%7B%22GTimeLimit.hbdesf%22%3A3.0%7D%2C%22turnover%22%3A3.0%2C%22productDetail%22%3A%22perio_zgtx201803009%22%2C%22productTitle%22%3Anull%2C%22userIP%22%3A%22202.110.130.244%22%2C%22organName%22%3Anull%2C%22memo%22%3Anull%2C%22webTransactionRequest%22%3Anull%2C%22signature%22%3A%22I6p3Hq9DM8nnf3U1DVVw4lZcQAF1mxcJWmNcnUpeTMY5I6jkhJtlDHrujdJa6SsKqZ26E52RnHDO%5CntPqYeEFZ6laDAwSRs0U3xwr%2FU3CS7w8zuvg8XyHEym9ufvCyJElsxwP0fSq5GMI0EaNwv45SoqQ7%5CnVI1Bhel0QUD1KVa0TFQ%3D%22%2C%22delete%22%3Afalse%7D%2C%22isCache%22%3Afalse%7D'
#get_pdf(ul0)
参考文献
requests浏览器登录后手动设置cookies
Python将浏览器cookies共享给requests库
Python爬虫利器一之Requests库的用法
python的requests在网络请求中添加cookies参数
Chrome浏览器如何查看 & 编辑Cookie?
Python学习日记12|用python3多进程批量下载pdf文件
用Python和selenium下载pdf文件
浏览器下载文件时资源链接的获取方法
用python爬虫批量下载pdf
使用python爬虫抓取学术论文
实现a标签中的各种点击(onclick)事件的方法
URLConnection抓取万方数据上的文献数据
学习笔记之万方数据爬取
Python 爬虫如何获取 JS 生成的 URL 和网页内容?