The original requirement was to scrape every video together with its likes, comments, view count, link, and page owner, and then sort the results by view count. Due to time constraints only part of the code is finished; the basic logic is in place and the rest is still a work in progress. A rough sketch of the missing sorting step is appended after the spider code below.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from lxml import etree
class FacebookSpider(object):
    def __init__(self):
        print("==============================")
        print("      Getting Cookies!        ")
        print("==============================")
        # Launch Chrome (Selenium 3 style: path to chromedriver passed directly)
        self.browser = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        # browser.implicitly_wait(10)
        # browser.set_window_size(0,0)
        # Open the Facebook login page
        self.browser.get('https://www.facebook.com/login.php?login_attempt=1&lwv=110/')
        # Fill in the account name and password
        self.browser.find_element_by_id('email').clear()
        self.browser.find_element_by_id('email').send_keys('your account')
        self.browser.find_element_by_id('pass').clear()
        self.browser.find_element_by_id('pass').send_keys('your password')
        # Simulate clicking the login button; two different ways to locate it
        try:
            self.browser.find_element_by_xpath('//button[@id="loginbutton"]').send_keys(Keys.ENTER)
        except Exception:
            self.browser.find_element_by_xpath('//input[@tabindex="4"]').send_keys(Keys.ENTER)
        # Wait for the post-login page to load before touching it
        time.sleep(10)
        self.browser.find_element_by_xpath('//a[@href="https://www.facebook.com/?ref=logo"]').send_keys(Keys.ENTER)
    def send_request(self):
        # Open the video list page of the target Facebook page
        self.browser.get("https://www.facebook.com/pg/mommonshoes/videos/?ref=page_internal/")
        soup = etree.HTML(self.browser.page_source)
        # Collect the relative links to the individual video pages
        link_list = soup.xpath("//div[@class='_3v4h _48gm _50f3 _50f7']/a/@href")
        item_list = []
        for link in link_list:
            print(link)
            item_list.append(link)
        return item_list
    def send_requestss(self, full_url):
        # Open a single video page and give it time to load
        self.browser.get(full_url)
        time.sleep(10)
        soup = etree.HTML(self.browser.page_source)
        # Likes and view count of the video
        zan = soup.xpath("//span[@class='_1g5v']/span/text()")
        views = soup.xpath("//span[@class='_44bh']/text()")
        print(zan)
        print(views)
    def main(self):
        item_list = self.send_request()
        for item in item_list:
            full_url = "https://www.facebook.com" + item
            print(full_url)
            self.send_requestss(full_url)
        # Close the browser only after all video pages have been visited
        self.browser.close()
if __name__ == '__main__':
    spider = FacebookSpider()
    spider.main()
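The sorting requirement from the note at the top is still missing, so here is a minimal sketch of what that step could look like once the spider collects structured results instead of just printing them. Everything in it is an illustrative assumption, not part of the original spider: the count_to_int helper, the sort_by_views function, the dict keys, and the sample numbers are all hypothetical, and the count strings are only assumed to look like the rough "1.2K" / "3,456" text Facebook renders.

# Sketch of the unfinished "sort by view count" step (not the original code).
# Assumption: each scraped video would be collected as a dict with hypothetical
# keys 'url' and 'views', where 'views' is the raw count string from the page.

def count_to_int(text):
    """Turn a rough Facebook-style count string into an integer (hypothetical helper)."""
    text = text.strip().replace(',', '')
    if text.endswith('K'):
        return int(float(text[:-1]) * 1000)
    if text.endswith('M'):
        return int(float(text[:-1]) * 1000000)
    return int(float(text)) if text else 0

def sort_by_views(videos):
    """Sort the scraped video dicts by view count, highest first."""
    return sorted(videos, key=lambda v: count_to_int(v['views']), reverse=True)

# Tiny usage example with made-up numbers, only to show the intended ordering:
sample = [
    {'url': '/video/1', 'views': '3,456'},
    {'url': '/video/2', 'views': '1.2K'},
    {'url': '/video/3', 'views': '987'},
]
for video in sort_by_views(sample):
    print(video['url'], video['views'])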