想要统计一下中国药品临床试验机构的信息,尝试从国家官网上爬一下机构信息。主要是get http请求得到html,再解析html。
一个坑:请求必须携带 cookie,而且 cookie 还会过期。
得到cookie的方法有几种:
1.先无cookie请求一次拿到cookie
2.用selenium库的webdriver浏览器获取cookie
3.从本地Chrome浏览器保存cookie的地方读取cookie
最终使用方法3成功爬到了数据。
import os
import shutil
import sqlite3
import subprocess

import requests
import win32crypt
# Chrome's live cookie store (a SQLite database). It is locked while Chrome
# is running, so it is copied to DIST_COOKIE_FILENAME before being read.
SOUR_COOKIE_FILENAME = r'C:\Users\Administrator\AppData\Local\Google\Chrome\User Data\Default\Cookies'
# Local scratch copy of the cookie database; removed again after use.
DIST_COOKIE_FILENAME = '.\python-chrome-cookies'
def get_chrome_cookies(url):
    """Return Chrome's cookies for host *url* as a ``{name: value}`` dict.

    Copies Chrome's SQLite cookie database to a scratch file (the live file
    is locked while Chrome runs), decrypts each cookie whose ``host_key``
    equals *url* with the Windows DPAPI, and removes the copy afterwards.

    :param url: exact ``host_key`` to match, e.g. ``'app1.sfda.gov.cn'``
    :return: dict mapping cookie name to decrypted cookie value
    """
    # shutil.copyfile replaces the original `subprocess copy ... shell=True`,
    # which only worked under Windows cmd and silently ignored failures.
    shutil.copyfile(SOUR_COOKIE_FILENAME, DIST_COOKIE_FILENAME)
    ret_dict = {}
    try:
        conn = sqlite3.connect(DIST_COOKIE_FILENAME)
        try:
            query = "SELECT host_key, name, path, value, encrypted_value FROM cookies"
            for host_key, name, path, value, encrypted_value in conn.execute(query):
                # Exact host match only (no substring matching by design).
                if host_key != url:
                    continue
                # DPAPI returns (description, plaintext_bytes); we want the bytes.
                ret = win32crypt.CryptUnprotectData(encrypted_value, None, None, None, 0)
                ret_dict[name] = ret[1].decode()
        finally:
            conn.close()
    finally:
        # Always remove the temporary copy, even if decryption raised.
        os.remove(DIST_COOKIE_FILENAME)
    return ret_dict
# --- demo: fetch one detail page using cookies lifted from local Chrome ---
DOMAIN_NAME = 'app1.sfda.gov.cn'
get_url = 'http://app1.sfda.gov.cn/datasearchcnda/face3/content.jsp?tableId=19&tableName=TABLE19&Id=12'
coo = get_chrome_cookies(DOMAIN_NAME)
# Drop the stale session id if present; pop() with a default avoids the
# KeyError that `del coo['JSESSIONID']` raised when Chrome had no such cookie.
coo.pop('JSESSIONID', None)
print(coo)
# A timeout keeps the script from hanging forever on an unresponsive server.
response = requests.get(get_url, cookies=coo, timeout=30)
html = response.content
import requests
from bs4 import BeautifulSoup
from requests.cookies import RequestsCookieJar
import os
from http import cookiejar
from requests import get,post,Session
import getcookiefromchrom
#c = s.get(url, headers=hd).cookies.get_dict()
##cookie_jar = RequestsCookieJar()
##cookie_jar.set(c[key],c[value],domain='app1.sfda.gov.cn')
#
#coo1 = []
#for key,value in c.items():
# coo1.append( key + '=' + value )
#coo = coo1[0] + '; ' + coo1[1]
#print(coo)
#coo = '''FSSBBIl1UgzbN7N82S=BPJFXa0U1qQCeacuNUgV6YKkAYGSwD4y.4Dy3mtfxONtpTa4RNcUn65uSw7c627S; FSSBBIl1UgzbN7N82T=2FS0dUr3d9UbyRRs03JNNjqS.0QUWsid3t2iX3GLlFIO0yQyYmRYe0UrK4G1ifv7Yzylbf_slsKAiOs05bskKsYhOax5XbeN7020u2yy1gURLlihUXobjh7ob_m81KIYupHLBge4JaH0RMH726dMN9YBBhADQvt1owS6eF4CtDl50M0Klw9Nw.eWivti6lwplcKM6fZE4C8ZyNq2PDvTW67um_NQbztEpcpeBRf2VEmpQVI4At7qui9TWvNDb5QzF.6oJAQ2HzshkstyOGu8bIlbL'''
# Host whose cookies are lifted from the local Chrome profile.
DOMAIN_NAME = 'app1.sfda.gov.cn'
co = getcookiefromchrom.get_chrome_cookies(DOMAIN_NAME)
# The first anti-bot token is static; the second one expires, so it is read
# fresh from Chrome's cookie store on every run.
coo = 'FSSBBIl1UgzbN7N82S=BPJFXa0U1qQCeacuNUgV6YKkAYGSwD4y.4Dy3mtfxONtpTa4RNcUn65uSw7c627S; FSSBBIl1UgzbN7N82T=' + co['FSSBBIl1UgzbN7N82T']
# Request headers: the cookie is sent via the header rather than `cookies=`.
hd = {
    'Connection': 'keep-alive',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cookie': coo,
}
# Accumulates one {field_name: field_value} dict per institution page.
result = []
# Walk candidate record ids and scrape each institution's detail table.
# `page_id` replaces the original outer `i`, which was shadowed by the inner
# `for i in range(7)` loops — any use of `i` after them saw the wrong value.
for page_id in range(5000):
    url = 'http://app1.sfda.gov.cn/datasearchcnda/face3/content.jsp?tableId=19&tableName=TABLE19&Id=' + str(page_id)
    # Send the anti-bot cookie via the request headers; the timeout keeps a
    # dead server from hanging the whole crawl.
    r1 = requests.get(url, headers=hd, timeout=30)
    bs = BeautifulSoup(r1.content, 'html.parser')
    tr = bs.find_all('tr')
    # Empirically, 0 rows means the request failed and a 5-row table is the
    # site's empty "no data" shell — skip both and keep crawling.
    if len(tr) == 0 or len(tr) == 5:
        print('stop' + str(page_id))
        continue
    print(page_id)
    keys = []
    values = []
    # Rows 1..7 hold the institution's attributes: the first <td> of each row
    # is the field name, the second <td> is the field value. Scanning the
    # cells once per row replaces the original double pass over the rows.
    for row in tr[1:8]:
        cells = row.find_all('td')
        keys.append(cells[0].text)
        values.append(cells[1].text)
    result.append(dict(zip(keys, values)))