#!/usr/bin/env python
# encoding=utf8
import sys
import bs4
from bs4 import BeautifulSoup
import requests
import json
import re
# Genre names that the genre crawler skips instead of scraping.
gIgnoreGenreList = [
    '报刊杂志',  # "Newsstand"
    '贴纸',  # "Stickers"
    '商品指南',  # "Catalogs"
]
# https://www.apple.com/cn/itunes/charts/paid-apps/
# https://www.apple.com/cn/itunes/charts/free-apps/
def parse_appstore_page(cate_url, out_file):
    """Scrape an Apple charts page and save its app list as JSON.

    The page is expected to contain
    ``div#main > section[class="section apps grid"] > ul`` with one list
    item per app, each holding an ``<h3><a href=...>name</a></h3>`` title.

    Args:
        cate_url: URL of the charts page to download.
        out_file: Path of the output file. It receives one JSON array of
            ``{"app_name": ..., "app_detail_url": ...}`` objects followed by
            a newline. Nothing is written when it is empty or None.

    Raises:
        AttributeError: If the page lacks the expected container elements.
    """
    html = requests.get(cate_url).content
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    # Drill down to the <ul> that holds the charted apps.
    grid = soup.find("div", id="main")
    grid = grid.find("section", class_="section apps grid")
    app_items = grid.find("ul")

    app_result = []
    for child in app_items.children:
        # .children also yields text nodes; only element nodes are entries.
        if not isinstance(child, bs4.element.Tag):
            continue
        heading = child.find("h3")
        app_info = heading.find("a") if heading is not None else None
        if app_info is None:
            # No <h3><a> title link, so this element is not an app entry.
            continue
        app_result.append({
            "app_name": app_info.string,
            "app_detail_url": app_info.get("href"),
        })

    if out_file:
        print("save result to file :%s" % out_file)
        result_string = json.dumps(app_result, ensure_ascii=False)
        # The context manager closes the file even if a write fails.
        with open(out_file, "w") as f:
            f.write(result_string)
            f.write('\n')
# https://itunes.apple.com/cn/genre/ios/id36?mt=8
def parse_genre_page(genre_url, limit=10, out_file="genre_result.txt"):
    """Scrape the genre index page and the apps of the genres it links to.

    The genre links are read from the direct ``<ul>`` children of
    ``div#main > div#content > div#genre-nav > div.grid3-column``. Genres
    whose name appears in ``gIgnoreGenreList`` are skipped; for the others
    the genre page is scraped with ``parse_genre_content``.

    Args:
        genre_url: URL of the genre index page.
        limit: Passed through to ``parse_genre_content`` as the maximum
            number of apps to collect per genre.
        out_file: Path of the output file. It receives one JSON array of
            ``{"name": ..., "url": ..., "app_list": [...]}`` objects followed
            by a newline. Nothing is written when it is empty or None.

    Raises:
        AttributeError: If the page lacks the expected container elements.
    """
    html = requests.get(genre_url).content
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    nav = soup.find("div", id="main")
    nav = nav.find("div", id="content")
    nav = nav.find("div", id="genre-nav")
    nav = nav.find("div", class_="grid3-column")

    cate_result = []
    # Only the <ul> elements directly under the grid are genre columns.
    for one_ul in nav.find_all("ul", recursive=False):
        for cate in one_ul.children:
            # .children also yields text nodes, which have no <a> to read.
            if not isinstance(cate, bs4.element.Tag):
                continue
            cate_info = cate.find("a")
            if cate_info is None:
                continue
            cate_name = cate_info.string
            if cate_name in gIgnoreGenreList:
                print("ingore cate :" + cate_name)
                continue
            cate_url = cate_info.get("href")
            print("processing genre %s." % cate_name)
            cate_result.append({
                "name": cate_name,
                "url": cate_url,
                "app_list": parse_genre_content(cate_url, limit),
            })
            # NOTE(review): stops after the first scraped genre of each
            # column. The original indentation was lost, so the scope of
            # this break is an assumption -- confirm before relying on it.
            break

    if out_file:
        print("save result to file :%s" % out_file)
        result_string = json.dumps(cate_result, ensure_ascii=False)
        # The context manager closes the file even if a write fails.
        with open(out_file, "w") as f:
            f.write(result_string)
            f.write('\n')
# https://itunes.apple.com/cn/genre/ios-导航/id6010?mt=8
# Returns a list: [{app_name: xxxx, app_detail_url: xxxxx}]
# limit is the maximum number of apps to return; -1 means no limit
def parse_genre_content(content_url, limit=-1):
    """Scrape one genre page and return details for the apps it lists.

    Apps are read from the ``<ul>`` inside each ``<div>`` found under
    ``div#main > div#content > div#selectedgenre > div.grid3-column``. The
    detail page of every collected app is fetched as well.

    Args:
        content_url: URL of the genre page.
        limit: Maximum number of apps to return. A negative value means no
            limit.

    Returns:
        A list of dicts with the keys ``app_name``, ``app_detail_url`` and
        ``app_id``, merged with whatever ``parse_detail_page`` returns for
        that app.

    Raises:
        AttributeError: If the page lacks the expected container elements.
    """
    html = requests.get(content_url).content
    soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')
    grid = soup.find("div", id="main")
    grid = grid.find("div", id="content")
    grid = grid.find("div", id="selectedgenre")
    grid = grid.find("div", class_="grid3-column")

    app_result = []
    for column in grid.find_all("div"):
        app_ul = column.find("ul")
        if app_ul is None:
            # A div without an app list has nothing to contribute.
            continue
        for app in app_ul.children:
            # .children also yields text nodes; only element nodes are apps.
            if not isinstance(app, bs4.element.Tag):
                continue
            if limit > -1 and len(app_result) >= limit:
                # Limit reached: stop scanning the remaining columns too.
                return app_result
            app_info = app.find("a")
            if app_info is None:
                continue
            app_url = app_info.get("href")
            one_app = {
                "app_name": app_info.string,
                "app_detail_url": app_url,
                "app_id": parse_appid(app_url),
            }
            app_detail = parse_detail_page(app_url)
            if app_detail is not None:
                print(app_detail)
                one_app.update(app_detail)
            app_result.append(one_app)
    return app_result
# https://itunes.apple.com/cn/app/高德地图-精准导航-出行必备/id461703208?mt=8
# Returns {latest_version: x.x.x, update_date: xxxxxxx, system_version: 7.0}
def parse_detail_page(detail_url):
    """Fetch an app detail page and extract its version metadata.

    Args:
        detail_url: URL of the app's detail page.

    Returns:
        A dict with the keys ``update_date``, ``latest_version`` and
        ``system_version``.
    """
    page = requests.get(detail_url).content
    soup = BeautifulSoup(page, 'html.parser', from_encoding='utf-8')
    main_node = soup.find("main", class_="is-app-theme")

    # "What's new" section: last update time and latest version number.
    whats_new = main_node.find(
        "section",
        class_="l-content-width section section--bordered whats-new")
    update_date = whats_new.find("time").string
    latest_version = parse_appver(whats_new.find("p").string)

    # App information list: the fourth <div> carries the label with the
    # minimum iOS version the app supports.
    information = main_node.find(
        "dl",
        class_=["information-list", "information-list--app", "medium-columns"])
    compatibility_div = information.select("div")[3]
    version_label = compatibility_div.select("div > span")[0].string

    return {
        "update_date": update_date,
        "latest_version": latest_version,
        "system_version": parse_systver(version_label),
    }
def parse_appid(detail_url):
    """Return the last path segment of *detail_url*, up to its "?".

    Example: ".../app/name/id461703208?mt=8" gives "id461703208".

    Raises:
        AttributeError: If the last path segment is not followed by "?"
            (the search finds nothing and returns None).
    """
    # Raw string so the backslash reaches the regex engine untouched; in a
    # plain literal "\?" is an invalid string escape and triggers a warning.
    return re.search(r"[^/]+(?!.*/)(?=[\?]+)", detail_url).group(0)
def parse_systver(version_label):
    """Extract the "major.minor" iOS version number from *version_label*.

    Example: "xxx iOS 8.0 asdasdadasd" gives "8.0".

    Raises:
        AttributeError: If the label contains no "iOS" followed by one
            character and a "digits.digits" number.
    """
    # Raw string so the regex escapes are not read as string escapes.
    return re.search(r"iOS.(\d+\.\d+)", version_label).group(1)
def parse_appver(version_label):
    """Extract the version number from a label such as "版本 10.5.0".

    Returns everything after the "版本" prefix and the single whitespace
    character that follows it.

    Raises:
        AttributeError: If the label does not contain that prefix.
    """
    # The backslash is doubled rather than using a raw literal because the
    # u prefix cannot be combined with r on Python 3; the regex is unchanged.
    return re.search(u"版本\\s(.+)", version_label).group(1)
if __name__ == '__main__':
    # Python 2-only idiom: reload sys, then make utf8 the process-wide
    # default string encoding.
    reload(sys)
    sys.setdefaultencoding('utf8')

    # Alternative entry points for the charts pages:
    # parse_appstore_page('https://www.apple.com/cn/itunes/charts/free-apps/', 'free-apps.txt')
    # parse_appstore_page('https://www.apple.com/cn/itunes/charts/paid-apps/', 'paid-apps.txt')

    # Crawl the genre index, collecting one app per genre.
    genre_index_url = 'https://itunes.apple.com/cn/genre/ios/id36?mt=8'
    parse_genre_page(genre_index_url, limit=1, out_file='appstore-genre.txt')
# 爬取appstore应用信息
# 最后编辑于 :
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...