任务
问题拆解
# Windows版的代码请在这下载:https://video.mugglecode.com/net5.py
# 以下为Mac/Linux/可在麻瓜编程在线运行的代码:
from selenium import webdriver
import time
# 运行前先下载 chrome driver,下载地址是:https://sites.google.com/a/chromium.org/chromedriver/downloads,点击【Latest Release: ChromeDriver x.xx】进入下载
def start_chrome(executable_path='./chromedriver'):
    """Create a Selenium-driven Chrome browser and return its driver.

    Args:
        executable_path: Location of the chromedriver binary. Defaults to
            ``./chromedriver``; on Windows pass ``./chromedriver.exe``.

    Returns:
        The ``webdriver.Chrome`` instance that was created.
    """
    driver = webdriver.Chrome(executable_path=executable_path)
    driver.start_client()
    return driver
def find_strangers(browser=None):
    """Look up the blue buttons inside the follower-list entries.

    Args:
        browser: Selenium driver to search with. When omitted, the
            module-level ``driver`` created by the main script is used.

    Returns:
        The elements matching
        ``div.ContentItem-extra > button.Button--blue``.
    """
    if browser is None:
        # Fall back to the global browser so existing no-argument calls work.
        browser = driver
    btn_sel = 'div.ContentItem-extra > button.Button--blue'
    return browser.find_elements_by_css_selector(btn_sel)
def add_fren():
    """Placeholder that does nothing yet.

    NOTE(review): the name suggests an "add friend" step; no behavior is
    implemented here.
    """
    pass
# Landing page opened first so the user can log in.
url = 'https://www.zhihu.com/'
# Replace with your own Zhihu URL: open "My homepage" -> "Followers".
follower_url = 'https://www.zhihu.com/people/xxx/followers'

while True:
    driver = start_chrome()
    try:
        driver.get(url)
        if not driver.get_cookies():
            # NOTE(review): push() is not defined anywhere in the visible
            # file, so this branch raises NameError if it is ever taken.
            push()
        time.sleep(20)  # wait for the manual login
        driver.get(follower_url)
        time.sleep(6)  # wait for loading page & users
        strangers = find_strangers()
        for s in strangers:
            s.click()
            time.sleep(3)
        print('Done!')
        time.sleep(3000)
    finally:
        # Each round starts a fresh browser; shut this one down so Chrome
        # instances do not pile up across iterations.
        driver.quit()
# Leftover idea from the author: js_execute('xxx.click()')
商品上架提醒:这一部分需要考虑的是商品(样品)页面上对应标签的名称,也就是用来定位元素的选择器。另外,pyautogui 库里的 alert 方法可以用来弹窗提醒。
from selenium.webdriver import Chrome
import pyautogui
import webbrowser
import time
class PageObserver:
    """Reload a page in Chrome and report when it looks purchasable."""

    def __init__(self, url, target_sel):
        """Open a Chrome browser and remember what to watch.

        Args:
            url: Page that is reloaded on every check.
            target_sel: CSS selector; the page only counts as changed while
                nothing on it matches this selector.
        """
        self.driver = Chrome(executable_path='./chromedriver')
        self.url = url
        self.target_sel = target_sel
        # Seconds to wait after each page load before inspecting the page.
        self.request_time = 20

    def is_changed(self):
        """Reload the page and check whether it changed.

        Returns:
            True when no element matches ``target_sel`` and at least one
            element with id ``update-cart`` exists, otherwise False.
        """
        self.driver.get(self.url)
        time.sleep(self.request_time)
        oos_el = self.driver.find_elements_by_css_selector(self.target_sel)
        buy_button = self.driver.find_elements_by_id('update-cart')
        # Both lookups give lists; an empty list means nothing was found.
        print(oos_el)
        print(buy_button)
        # Always answer with a real bool instead of falling through to None.
        return bool(not oos_el and buy_button)
def alert(message='The bag is available'):
    """Show a pyautogui alert box.

    Args:
        message: Text passed to ``pyautogui.alert``. Defaults to the
            availability notice used by the watcher script.
    """
    pyautogui.alert(message)
# find changes
url = 'https://www.strathberry.com/products/east-west-mini-tri-colour-navy-ruby-vanilla'
target_sel = 'div.oos.swatch-container.swatch-3-colours.active-colour'
# The "fake" pair below is what the observer is actually created with.
fake_url = 'https://www.strathberry.com/products/east-west-mini-black-with-eyelets'
fake_target_sel = 'oss.swatch-container.swatch-1-colours.active-colour'
target = PageObserver(url=fake_url, target_sel=fake_target_sel)

while True:
    if target.is_changed():
        alert()
        webbrowser.open(fake_url)
        target.driver.close()
        # The browser window is closed now, so another is_changed() call
        # would fail; stop polling once the user has been notified.
        break
    print('Nope!')
拉勾网职位提醒
爬取时用面向对象还是面向过程,可以用下面的代码对比一下。
面向过程的写法如下面第一段所示:
面向对象可以看作是面向过程的一种扩展,也是一种更友好的组织形式。
# get_page() -> parse_page() -> filter_job() -> send()
'''
raw_html = []
for i in range(1, 30):
    page = get_page()
    raw_html.append(page)
all_jobs = []
for html in raw_html:
    jobs = parse(html)
    all_jobs.append(jobs)
for job in all_jobs:
    result = filter_job(job)
    if result:
        send(job)
'''
# Spider -> Parser -> Job
'''
s = Spider()
raw_pages = s.crawl(url)
p = Parser(raw_pages)
jobs = p.get_jobs()
for j in jobs:
    if j.is_today():
        j.send_to_me()
'''
from selenium.webdriver import Chrome
from bs4 import BeautifulSoup
import time
# https://www.lagou.com/zhaopin/qukuailian/12/
class Spider:
    """Open numbered listing pages in Chrome and keep their HTML."""

    def __init__(self, index_url, page_range):
        """Store the crawl settings and start the browser.

        Args:
            index_url: URL prefix; the page number and a trailing ``/`` are
                appended to it for every request.
            page_range: Number of the last page to fetch; pages start at 1.
        """
        # Kept as an exclusive upper bound so it can be fed to range().
        self.page_range = page_range + 1
        self.index_url = index_url
        self.raw_pages = []
        self.boot()

    def boot(self):
        """Start the Chrome browser used for crawling."""
        self.chrome = Chrome(executable_path='./chromedriver')
        self.chrome.start_client()

    def crawl(self, wait_seconds=3):
        """Visit every page and collect its HTML source.

        Args:
            wait_seconds: Pause after each page load before the source is
                read. Defaults to 3.

        Returns:
            ``self.raw_pages``, the list the page sources are appended to.
        """
        for num in range(1, self.page_range):
            full_url = f'{self.index_url}{num}/'
            self.chrome.get(full_url)
            print('Wait for loading page')
            time.sleep(wait_seconds)
            # Pages are only stored here; they are parsed later in one pass
            # instead of being processed as soon as they load.
            self.raw_pages.append(self.chrome.page_source)
        print('Done')
        return self.raw_pages
class Parser:
    """Turn raw listing-page HTML into job records."""

    def __init__(self, raw_pages):
        """Store the pages and parse them immediately.

        Args:
            raw_pages: Iterable of HTML strings, one per listing page.
        """
        self.raw_pages = raw_pages
        self.jobs = []
        self.parse()

    def parse(self):
        """Append one dict with ``time``/``comp``/``link`` keys per job."""
        time_sel = 'ul span.format-time'
        comp_sel = 'ul .company_name > a'
        link_sel = 'ul a.position_link'
        for html in self.raw_pages:
            # Note the parser name: it is 'html.parser'.
            soup = BeautifulSoup(html, 'html.parser')
            time_els = soup.select(time_sel)
            comp_els = soup.select(comp_sel)
            link_els = soup.select(link_sel)
            # NOTE(review): the three lists are paired by position and zip
            # stops at the shortest one, so a listing that lacks one of the
            # elements would shift or drop entries; confirm against the page.
            for time_el, comp_el, link_el in zip(time_els, comp_els, link_els):
                # A dict keeps the fields easy to look up by name.
                self.jobs.append({
                    'time': time_el.text,
                    'comp': comp_el.text,
                    'link': link_el.get('href'),
                })

    def get_jobs(self):
        """Return a ``Job`` object for every parsed record."""
        return [Job(j) for j in self.jobs]
class Job:
    """A single job record with its time, company and link."""

    def __init__(self, data):
        """Copy the fields out of a parsed record.

        Args:
            data: Mapping with optional ``time``, ``comp`` and ``link``
                keys; missing keys become ``None``.
        """
        self.time = data.get('time')
        self.comp = data.get('comp')
        self.link = data.get('link')

    def is_today(self):
        """Return True when the time text contains a colon.

        NOTE(review): presumably postings from today show a clock time
        (``HH:MM``) while older ones show a date; confirm against the site.
        A missing time counts as "not today" instead of raising TypeError.
        """
        return bool(self.time) and ':' in self.time

    def send(self):
        """Placeholder; not implemented yet."""
        pass

    def save_into_csv(self):
        """Placeholder; not implemented yet."""
        pass
s = Spider(
    index_url='https://www.lagou.com/zhaopin/qukuailian/',
    page_range=2,
)
try:
    s.crawl()
finally:
    # The HTML is kept in s.raw_pages, so the browser is no longer needed;
    # quit it even if crawling failed so Chrome is not left running.
    s.chrome.quit()

p = Parser(s.raw_pages)
jobs = p.get_jobs()
for j in jobs:
    if j.is_today():
        print(j.comp, j.link)