Web Crawler Learning: Task 4

1. Switch to the username/password login form

js1 = 'document.querySelector("#j_loginTab1").style.display="none";'
browser.execute_script(js1)
time.sleep(1)
js2 = 'document.querySelector("#j_loginTab2").style.display="block";'
browser.execute_script(js2)
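
If the fixed time.sleep() pauses prove unreliable, an explicit wait can be used instead. This is only a sketch and not part of the original script: it assumes the password form lives in the element with id j_loginTab2 (as the JS above suggests) and uses Selenium's standard WebDriverWait / expected_conditions helpers.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser.execute_script(js1)
browser.execute_script(js2)
# Block for up to 10 seconds until the username/password tab becomes visible.
WebDriverWait(browser, 10).until(
    EC.visibility_of_element_located((By.ID, "j_loginTab2"))
)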


2. Locate the username and password fields, then click the login button

input_name = browser.find_element_by_name('username')
input_name.clear()
input_name.send_keys('********')
input_pass = browser.find_element_by_name('password')
input_pass.clear()
input_pass.send_keys('********')
browser.find_element_by_xpath('//*[@class="form__button"]/button').click()
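
Note that the find_element_by_* helpers used above come from Selenium 3 and were removed in Selenium 4. On a current Selenium install the same locators would be written with the By API, roughly as follows (a sketch using the same element names and XPath as above):

from selenium.webdriver.common.by import By

input_name = browser.find_element(By.NAME, 'username')
input_pass = browser.find_element(By.NAME, 'password')
browser.find_element(By.XPATH, '//*[@class="form__button"]/button').click()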


3. After logging in successfully, fetch the page and use XPath to locate the content to be scraped

user = tree.xpath('//div[@id="postcontainer"]//div[@class="auth"]/a/text()')

content = tree.xpath('//td[@class="postbody"]')
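
To make the two XPath expressions concrete, here is a small self-contained sketch run against a made-up HTML fragment that only mimics the forum's markup (the real page is more complex). It shows that the first query returns author names as strings, the second returns the post cells as elements, and string(.) flattens each cell's nested tags into plain text:

from lxml import etree

sample = '''
<div id="postcontainer">
  <table><tr>
    <td><div class="auth"><a>user_a</a></div></td>
    <td class="postbody">First post <b>body</b>.</td>
  </tr></table>
</div>
'''
tree = etree.HTML(sample)
users = tree.xpath('//div[@id="postcontainer"]//div[@class="auth"]/a/text()')
posts = tree.xpath('//td[@class="postbody"]')
for name, cell in zip(users, posts):
    # string(.) returns the full text content of the cell, tags stripped.
    print(name.strip() + ":" + cell.xpath('string(.)').strip())
# prints: user_a:First post body.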


# -*- coding:utf-8 -*-
import requests, json, re, random, time

from bs4 import BeautifulSoup
from selenium import webdriver
from lxml import etree


class getUrl(object):
    """docstring for getUrl"""

    def __init__(self):
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                          "(KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate, sdch",
            "Accept-Language": "zh-CN,zh;q=0.8"
        }

    def run(self):
        browser = webdriver.Chrome()
        browser.get('https://auth.dxy.cn/accounts/login?service=http://www.dxy.cn/bbs/index.html')
        time.sleep(1)

        # Step 1: hide the default login tab and show the username/password form.
        js1 = 'document.querySelector("#j_loginTab1").style.display="none";'
        browser.execute_script(js1)
        time.sleep(1)
        js2 = 'document.querySelector("#j_loginTab2").style.display="block";'
        browser.execute_script(js2)

        # Step 2: fill in the credentials and submit the form.
        input_name = browser.find_element_by_name('username')
        input_name.clear()
        input_name.send_keys('*********')
        input_pass = browser.find_element_by_name('password')
        input_pass.clear()
        input_pass.send_keys('*******')
        browser.find_element_by_xpath('//*[@class="form__button"]/button').click()
        time.sleep(10)

        # Collect the session cookies set after login (not used further in this
        # script, but they could be handed to requests).
        cookie = browser.get_cookies()
        cookie_dict = {i['name']: i['value'] for i in cookie}

        # Step 3: open the target thread and parse the rendered page with XPath.
        browser.get("http://www.dxy.cn/bbs/thread/626626#626626")
        html = browser.page_source
        tree = etree.HTML(html)
        user = tree.xpath('//div[@id="postcontainer"]//div[@class="auth"]/a/text()')
        content = tree.xpath('//td[@class="postbody"]')

        # Write each "author: post body" pair to a local text file.
        with open("DXY_records.txt", 'a', encoding="utf-8") as dir_file:
            for i in range(0, len(user)):
                result = user[i].strip() + ":" + content[i].xpath('string(.)').strip()
                dir_file.write(result + "\n")
                dir_file.write('*' * 80 + "\n")

        print('*' * 5 + "End" + '*' * 5)


if __name__ == '__main__':
    geturl = getUrl()
    geturl.run()
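
The script collects cookie_dict after logging in but never uses it; everything is scraped through the browser. One possible follow-up, sketched below and not part of the original code, is to hand those cookies to requests so that further pages can be fetched without keeping the browser open. It assumes the forum accepts the Selenium session cookies for plain GET requests; the headers dict defined in __init__ could be reused here as well.

import requests
from lxml import etree

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0"})
session.cookies.update(cookie_dict)  # cookie_dict built from browser.get_cookies()

resp = session.get("http://www.dxy.cn/bbs/thread/626626#626626")
tree = etree.HTML(resp.text)
user = tree.xpath('//div[@id="postcontainer"]//div[@class="auth"]/a/text()')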
