import json
import os
import re
from hashlib import md5
from urllib.parse import urlencode

import bs4
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
def get_page_index(offset=0, keyword='xxx'):
    """Request one page of the search index and return the raw response body.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Value sent as the ``keyword`` query parameter.

    Returns:
        The response text when the server answers with HTTP 200; ``None`` on
        any other status code or when ``requests`` raises a
        ``RequestException``.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 3,
    }
    # NOTE(review): the base URL is an empty string, so the request target is
    # only the query string. Presumably the real endpoint was removed from
    # this file -- fill it in before use.
    url = '' + urlencode(params)
    try:
        response = requests.get(url)
    except RequestException:
        print('请求索引页出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(html):
    """Yield the ``article_url`` of every entry in an index-page JSON payload.

    Args:
        html: JSON text of the index response. ``None`` or an empty string
            is accepted and yields nothing.

    Yields:
        ``item.get('article_url')`` for each dict found in the top-level
        ``data`` list; ``None`` is yielded for an entry without that key.

    Raises:
        json.JSONDecodeError: If ``html`` is non-empty but not valid JSON.
    """
    if not html:
        # The fetch step signals failure with None; there is nothing to parse.
        return
    payload = json.loads(html)
    if not isinstance(payload, dict):
        return
    # ``data`` may be absent or null; treat both as an empty list.
    for item in payload.get('data') or ():
        if isinstance(item, dict):
            yield item.get('article_url')
def get_page_detail(url):
    """Download a detail page and hand back its body text.

    Args:
        url: Address of the page to request.

    Returns:
        The response text for an HTTP 200 answer; ``None`` for any other
        status code or when ``requests`` raises a ``RequestException``.
    """
    try:
        resp = requests.get(url)
        return resp.text if resp.status_code == 200 else None
    except RequestException:
        print('请求详细页出错')
        return None
def parse_page_detail(html):
    """Print the page title and the ``var gallery = ...;`` payload, if present.

    Args:
        html: Markup of a detail page. ``None`` or an empty string is
            ignored.

    Returns:
        None. The title text and the captured gallery expression are printed.
    """
    if not html:
        return
    soup = BeautifulSoup(html, 'lxml')
    title_tags = soup.select('title')
    if title_tags:
        # Pages without a <title> would otherwise raise IndexError.
        print(title_tags[0].get_text())
    # re.S (DOTALL) lets ``.`` span newlines inside the script block; the
    # lowercase ``re.s`` does not exist and raised AttributeError.
    images_pattern = re.compile(r'var gallery = (.*?);', re.S)
    result = images_pattern.search(html)
    if result:
        print(result.group(1))
def save_to_mongo(result):
    """Insert ``result`` into the ``MONGO_TABLE`` collection of ``db``.

    NOTE(review): ``db`` and ``MONGO_TABLE`` are not defined in the visible
    part of this file; they must exist at module level before this is called.

    Args:
        result: A single document (mapping) or a list of documents.

    Returns:
        ``True`` after a successful insert (a confirmation line is printed),
        ``False`` if the driver returns a falsy result.
    """
    collection = db[MONGO_TABLE]
    # Collection.insert() was removed in PyMongo 4; insert_one/insert_many
    # cover the "document or list of documents" input it accepted.
    if isinstance(result, list):
        outcome = collection.insert_many(result)
    else:
        outcome = collection.insert_one(result)
    if outcome:
        print('存储到MongoDB成功', result)
        return True
    return False
def download_image(url):
    """Download an image and return its raw bytes.

    Args:
        url: Address of the image to request.

    Returns:
        The response body as ``bytes`` for an HTTP 200 answer; ``None`` for
        any other status code or when ``requests`` raises a
        ``RequestException``.
    """
    try:
        response = requests.get(url)
    except RequestException:
        print('请求图片错误', url)
        return None
    if response.status_code == 200:
        # Binary payload: use ``content`` (bytes). ``add_text`` is not an
        # attribute of a requests Response and raised AttributeError.
        return response.content
    return None
def save_image(content):
    """Write image bytes to ``<cwd>/<md5 of content>.jpg`` unless it exists.

    The file name is the MD5 hex digest of ``content``, so identical images
    map to the same path and are written only once.

    Args:
        content: Raw image bytes. ``None`` or empty bytes are ignored.

    Returns:
        None.
    """
    if not content:
        # download_image returns None on failure; nothing to hash or write.
        return
    file_name = '{0}.jpg'.format(md5(content).hexdigest())
    file_path = os.path.join(os.getcwd(), file_name)
    if not os.path.exists(file_path):
        # The context manager closes the file; no explicit close() needed.
        with open(file_path, 'wb') as f:
            f.write(content)
def main():
    """Fetch the search index page and print the raw response."""
    # Called without arguments: get_page_index supplies the offset and
    # keyword itself, so passing positionals here is unnecessary.
    html = get_page_index()
    print(html)


if __name__ == '__main__':
    main()