Here we use the IEEE conferences PIMRC'17, WCNC'17, and GLOBECOM'17 as examples. Libraries needed: BeautifulSoup from bs4 and requests (plus lxml, since the code below passes 'lxml' to BeautifulSoup as the parser).
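Before diving in, a tiny self-contained illustration of what BeautifulSoup does with a CSS selector; the HTML string here is made up for demonstration, while in the scripts below the real pages are fetched with requests:
from bs4 import BeautifulSoup

# A made-up snippet mimicking the structure of a result-list entry.
html = '<ul><li><div class="txt"><h3><a><span>Some Paper Title</span></a></h3></div></li></ul>'
soup = BeautifulSoup(html, 'lxml')
for span in soup.select('li > div.txt > h3 > a > span'):
    print(span.get_text())  # -> Some Paper Title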
STEP1
downtitle.py
In this file we write the function that fetches the titles.
def get_title(conference, url, total_page):
    from bs4 import BeautifulSoup
    import requests
    import time

    # one output file per conference
    if conference == 'wcnc':
        f2 = open("wcnc2017.txt", "a")
    elif conference == 'pimrc':
        f2 = open("pimrc2017.txt", "a")
    elif conference == 'globecom':
        f2 = open("globecom2017.txt", "a")
    else:
        raise ValueError('unknown conference: ' + conference)

    for i in range(1, total_page+1):  # assume at most total_page pages; this could also be detected automatically
        # time.sleep(5)  # optional throttling between requests
        url_this = url + str(i)
        wb_data = requests.get(url_this)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # parse the list of titles on this result page
        titles = soup.select('#results-blk > div > ul > li > div.txt > h3 > a > span')
        print(titles)
        for title in titles:
            temp = str(title.get_text())
            print(temp)
            if len(temp) > 20:  # skip short, non-title entries
                f2.write(temp + '\n')
        print(conference + ' page: ' + str(i))
    f2.close()
STEP2
The main file writes each conference's titles into a txt file. I obtained the three URLs below by adding filter conditions on the corresponding conference pages and then stripping off the page number; each page shows 100 entries.
import downtitle

PIMRC17 = 'https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8288420&punumber%3D8288420%26filter%3DAND%28p_IS_Number%3A8292162%29%26rowsPerPage%3D100&rowsPerPage=100&pageNumber='
GLOBE17 = 'https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=8253768&punumber%3D8253768%26filter%3DAND%28p_IS_Number%3A8253909%29%26rowsPerPage%3D100&rowsPerPage=100&pageNumber='
WCNC17 = 'https://ieeexplore.ieee.org/xpl/mostRecentIssue.jsp?punumber=7924448&punumber%3D7924448%26filter%3DAND%28p_IS_Number%3A7925429%29%26pageNumber%3D4%26rowsPerPage%3D100&pageNumber='

downtitle.get_title('wcnc', WCNC17, 5)
downtitle.get_title('pimrc', PIMRC17, 7)
downtitle.get_title('globecom', GLOBE17, 11)
STEP3
Count how many titles fall under each topic. Note that capitalization can be hard to handle when matching, so when matching strings we take the stem of the word and sometimes drop the first letter before matching (a case-insensitive alternative is sketched after the code below):
# 10 topics to count
filewcnc = open("wcnc2017.txt", "r")
filepimrc = open("pimrc2017.txt", "r")
fileglobecom = open("globecom2017.txt", "r")

count_D2D = 0
count_self_organization = 0
count_vehicle = 0
count_cooperation = 0
count_incentive = 0
count_interference = 0
count_bigdata = 0
count_crowdsourcing = 0
count_privacy = 0
count_IoT = 0

# open each topic file once in append mode (instead of reopening it for every title)
index_of_D2D = open("D2d中继.txt", "a")                      # D2D / relay
index_of_self_organization = open("自组织和传感器.txt", "a")  # self-organization and sensors
index_of_vehicle = open("车联网.txt", "a")                    # vehicular networks
index_of_cooperation = open("协作通信.txt", "a")              # cooperative communication
index_of_incentive = open("合作激励.txt", "a")                # cooperation incentives
index_of_interference = open("干扰协调管理缓解.txt", "a")     # interference coordination / management / mitigation
index_of_bigdata = open("大数据.txt", "a")                    # big data
index_of_crowdsourcing = open("众包.txt", "a")                # crowdsourcing
index_of_privacy = open("安全隐私.txt", "a")                  # security and privacy
index_of_IoT = open("物联网.txt", "a")                        # Internet of Things

for i in range(1, 4):
    if i == 1:
        filetoscrapy = filewcnc
        conference_name = 'WCNC_2017'
    elif i == 2:
        filetoscrapy = filepimrc
        conference_name = 'PIMRC_2017'
    elif i == 3:
        filetoscrapy = fileglobecom
        conference_name = 'GLOBECOM_2017'
    for temp in filetoscrapy:
        # D2D / relay
        justify_D2D = ('D2D' in temp) or ('elay' in temp) or (temp.count('evice') >= 2)
        if justify_D2D:
            index_of_D2D.write(conference_name + ', ' + temp)
            count_D2D += 1
        # self-organization and sensors
        justify_self_organization = ('d-hoc' in temp) or ('ensor' in temp) or (('elf' in temp) and ('rganiz' in temp))
        if justify_self_organization:
            index_of_self_organization.write(conference_name + ', ' + temp)
            count_self_organization += 1
        # vehicular networks
        justify_vehicle = ('ehicular' in temp) or ('ehicle' in temp)
        if justify_vehicle:
            index_of_vehicle.write(conference_name + ', ' + temp)
            count_vehicle += 1
        # cooperative communication
        justify_cooperation = ('ollab' in temp) or ('ooperat' in temp) or ('oordinat' in temp)
        if justify_cooperation:
            index_of_cooperation.write(conference_name + ', ' + temp)
            count_cooperation += 1
        # cooperation incentives
        justify_incentive = ('ncentive' in temp)
        if justify_incentive:
            index_of_incentive.write(conference_name + ', ' + temp)
            count_incentive += 1
        # interference coordination / management / mitigation
        justify_interference = ('nterference' in temp) and (('itigat' in temp) or ('anage' in temp) or ('ancell' in temp))
        if justify_interference:
            index_of_interference.write(conference_name + ', ' + temp)
            count_interference += 1
        # big data
        justify_bigdata = (('Big' in temp) and ('Data' in temp)) or ('ig Data' in temp) or ('ig data' in temp)
        if justify_bigdata:
            index_of_bigdata.write(conference_name + ', ' + temp)
            count_bigdata += 1
        # crowdsourcing
        justify_crowdsourcing = ('rowdsourcing' in temp)
        if justify_crowdsourcing:
            index_of_crowdsourcing.write(conference_name + ', ' + temp)
            count_crowdsourcing += 1
        # security and privacy ('afety' fixes the original 'afty' typo)
        justify_privacy = ('rivacy' in temp) or ('ecurity' in temp) or ('afety' in temp) or ('safe' in temp) or ('Safe' in temp)
        if justify_privacy:
            index_of_privacy.write(conference_name + ', ' + temp)
            count_privacy += 1
        # Internet of Things
        justify_IoT = ('IoT' in temp) or (('hings' in temp) and ('nternet' in temp))
        if justify_IoT:
            index_of_IoT.write(conference_name + ', ' + temp)
            count_IoT += 1

# report the counts and release all file handles
print('D2D/relay:', count_D2D, ' self-organization/sensor:', count_self_organization,
      ' vehicular:', count_vehicle, ' cooperation:', count_cooperation,
      ' incentive:', count_incentive, ' interference:', count_interference,
      ' big data:', count_bigdata, ' crowdsourcing:', count_crowdsourcing,
      ' privacy/security:', count_privacy, ' IoT:', count_IoT)
for f in (filewcnc, filepimrc, fileglobecom,
          index_of_D2D, index_of_self_organization, index_of_vehicle,
          index_of_cooperation, index_of_incentive, index_of_interference,
          index_of_bigdata, index_of_crowdsourcing, index_of_privacy, index_of_IoT):
    f.close()
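An alternative to dropping first letters is to sidestep capitalization entirely by lower-casing both the title and the keyword before comparing. A minimal sketch (the helper name and keyword lists are just illustrations, not part of the scripts above):
def matches_topic(title, keywords):
    # case-insensitive substring match: lower-case both sides before comparing
    t = title.lower()
    return any(k.lower() in t for k in keywords)

print(matches_topic('Interference Mitigation for D2D Communications', ['interference']))  # True
print(matches_topic('A Survey of Vehicular Networks', ['vehicle', 'vehicular']))          # True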
IEEE's permission handling for user downloads is very odd: when I downloaded from my computer, every PDF came out corrupted, even when I used headers carrying cookies. I'll look into this later.
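For reference, this is roughly what a cookie-carrying request looks like with requests; the URL, header, and cookie values below are placeholders I made up, and as noted above this kind of request still gave me corrupted files:
import requests

# Placeholder values: the real PDF URL and the cookies from a logged-in
# IEEE Xplore session would have to be filled in here.
pdf_url = 'PUT_THE_PDF_URL_HERE'
headers = {'User-Agent': 'Mozilla/5.0'}
cookies = {'cookie_name': 'value_copied_from_the_browser'}

resp = requests.get(pdf_url, headers=headers, cookies=cookies)
with open('paper.pdf', 'wb') as f:
    f.write(resp.content)  # in my case this was still not a valid PDF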