"""
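Scrolls a search-results page with Selenium and collects pin links and image URLs.

Note: this was really used for fetching URLs from
https://artsandculture.google.com/entity/m0bwbv?categoryid=art-movement
(the script body below opens a huaban.com search page).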
"""
from selenium import webdriver
import time
import os
from bs4 import BeautifulSoup
# NOTE: webdriver.Chrome() needs chromedriver to be discoverable on PATH.
# The original author did this by appending a local directory, e.g.:
#   os.environ["PATH"] += os.pathsep + r'D:\google-art-downloader-master'

# Search page that is harvested; the query is the URL-encoded Chinese term
# for "ink wash painting".
SEARCH_URL = 'http://huaban.com/search/?q=%E6%B0%B4%E5%A2%A8%E7%94%BB&type=pins'

# Output files: one collected value per line.
PIN_OUTPUT_FILE = "huaban_pin_asserts_all.txt"
IMG_OUTPUT_FILE = "huaban_img_asserts_all.txt"

PIN_HREF_PREFIX = '/pin'   # anchors whose href starts with this are kept
SCROLL_STEP_PX = 1000      # pixels scrolled down after every scan
SCROLL_PAUSE_SECONDS = 1   # pause after each scroll so new content can load
MAX_IDLE_ROUNDS = 100      # stop after this many scans with no new image


def is_pin_anchor(href, img_src):
    """Return True when an anchor should be recorded.

    Args:
        href: Value of the anchor's ``href`` attribute, or None if absent.
        img_src: ``src`` of the image nested in the anchor, or None if the
            anchor has no image or the image has no ``src``.

    Returns:
        True only if ``href`` starts with ``PIN_HREF_PREFIX`` and an image
        source is present.
    """
    return (
        href is not None
        and href.startswith(PIN_HREF_PREFIX)
        and img_src is not None
    )


def collect_pins(browser, url, max_idle_rounds=MAX_IDLE_ROUNDS):
    """Open *url* and keep scrolling, gathering pin links and image URLs.

    The page is re-parsed after every scroll. Scrolling stops once the number
    of distinct image URLs has stayed unchanged for ``max_idle_rounds``
    consecutive scans.

    Args:
        browser: A Selenium WebDriver instance.
        url: Address of the page to open.
        max_idle_rounds: Consecutive scans without a new image before stopping.

    Returns:
        A ``(pin_links, image_urls)`` tuple of sets of strings.
    """
    browser.get(url)
    pin_links = set()
    image_urls = set()
    idle_rounds = 0
    previous_image_count = 0

    while idle_rounds < max_idle_rounds:
        soup = BeautifulSoup(browser.page_source, 'lxml')
        for anchor in soup.find_all('a'):
            href = anchor.get("href")
            image = anchor.img
            img_src = image.get("src") if image is not None else None
            if is_pin_anchor(href, img_src):
                pin_links.add(href)
                image_urls.add(img_src)

        browser.execute_script("window.scrollBy(0,%d)" % SCROLL_STEP_PX)
        print("pin numbers:", len(pin_links))
        print("img number", len(image_urls))
        time.sleep(SCROLL_PAUSE_SECONDS)

        # Count scans that found nothing new; any new image resets the count.
        if len(image_urls) == previous_image_count:
            idle_rounds += 1
        else:
            idle_rounds = 0
        previous_image_count = len(image_urls)

    return pin_links, image_urls


def save_lines(path, lines):
    """Write every item of *lines* to *path*, one per line, UTF-8 encoded.

    Items are written in sorted order so repeated runs over the same data
    produce identical files.
    """
    with open(path, 'w', encoding="utf8") as write_file:
        write_file.writelines(
            text + "\n" for text in sorted(str(line) for line in lines)
        )


def main():
    """Run the scrape against SEARCH_URL and write both result files."""
    browser = webdriver.Chrome()
    try:
        pin_links, image_urls = collect_pins(browser, SEARCH_URL)
    finally:
        # Always shut the driver down, even if scraping fails part-way.
        browser.quit()
    save_lines(PIN_OUTPUT_FILE, pin_links)
    save_lines(IMG_OUTPUT_FILE, image_urls)


if __name__ == "__main__":
    main()