多线程爬取

import time

import requests
from lxml import etree
from selenium import webdriver
from kaisha import str2url
from threading import Thread

# Shared Chrome WebDriver session, created when this module is loaded and
# used by get_page(). The chromedriver path is specific to the author's machine.
# NOTE(review): the `executable_path` keyword was deprecated in Selenium 4 in
# favour of a Service object — confirm against the installed Selenium version.
browser = webdriver.Chrome(
    executable_path="/Users/apple/Desktop/tool/chromedriver",
)

def get_page():
    """Load the Xiami chart page in the shared browser and return its HTML.

    Returns:
        The browser's ``page_source``, read one second after navigation.
    """
    chart_url = 'https://www.xiami.com/chart'
    browser.get(chart_url)
    # Fixed pause before reading the DOM; presumably gives the page's scripts
    # time to render the chart rows — TODO confirm one second is sufficient.
    time.sleep(1)
    html = browser.page_source
    return html

def get_mp3(url, timeout=10):
    """Download the resource at *url* and return its raw bytes.

    Args:
        url: Address to fetch with an HTTP GET.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument. Without it a stalled server would block the calling
            worker thread indefinitely.

    Returns:
        The response body as ``bytes`` when the status code is 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get`` (connection errors, timeouts, ...).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre"
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.content

def process_mp3(mp3_url, mp3_title):
    """Resolve, download and store a single track.

    Args:
        mp3_url: Value of the row's ``data-mp3`` attribute. It is passed
            through ``str2url`` before being requested (presumably that helper
            decodes it into a real URL — verify against ``kaisha``).
        mp3_title: Track title, used as the output file name.

    Returns:
        None. When the download fails nothing is written.
    """
    mp3_url = str2url(mp3_url)
    print(mp3_url, mp3_title)
    mp3_content = get_mp3(mp3_url)
    if mp3_content is None:
        # get_mp3 reports a non-200 response as None; writing None would
        # leave an empty file behind and raise TypeError in this thread.
        print('download failed, skipped:', mp3_title)
        return
    save_mp3(mp3_content, mp3_title)

def save_mp3(mp3_content, mp3_title):
    """Write *mp3_content* to ``./mp3/<mp3_title>.mp3``.

    Args:
        mp3_content: Bytes to store.
        mp3_title: Track title used as the file name. Path separators in it
            are replaced with ``_`` so the file always lands directly inside
            ``./mp3``.

    Returns:
        None.
    """
    # Function-scope import so this block does not depend on a file-level
    # `import os`.
    import os

    # The output directory may not exist on a first run.
    os.makedirs('./mp3', exist_ok=True)
    # A title such as "AC/DC" would otherwise point into a missing subfolder.
    safe_title = str(mp3_title).replace('/', '_').replace('\\', '_')
    with open('./mp3/%s.mp3' % safe_title, 'wb') as f:
        f.write(mp3_content)

def parse_page(html):
    """Start one download thread per song row found in *html* and wait for them.

    Every ``<tr class="songwrapper">`` element is inspected for its
    ``data-mp3`` and ``data-title`` attributes; each row that has both is
    handed to ``process_mp3`` on its own thread.

    Args:
        html: HTML source of the chart page.

    Returns:
        None, after every started thread has finished.
    """
    etree_html = etree.HTML(html)
    rows = etree_html.xpath('//tr[@class="songwrapper"]')
    threads = []
    for row in rows:
        mp3_urls = row.xpath('./@data-mp3')
        mp3_titles = row.xpath('./@data-title')
        if not mp3_urls or not mp3_titles:
            # Row lacks one of the attributes; indexing [0] would raise
            # IndexError and abort the whole run.
            continue
        thread = Thread(target=process_mp3, args=(mp3_urls[0], mp3_titles[0]))
        threads.append(thread)
    for thread in threads:
        thread.start()
    # Block until all downloads are done so callers can rely on completion.
    for thread in threads:
        thread.join()

def main():
    """Fetch the chart page, then download every track listed on it."""
    try:
        html = get_page()
    finally:
        # The browser is only needed to obtain the page source; quit it so
        # the Chrome/chromedriver session is not left running.
        browser.quit()
    parse_page(html)

# Run the scraper only when this file is executed as a script. The double
# underscores are required: `name == 'main'` raises NameError.
if __name__ == '__main__':
    main()

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容