数据获取
爬取新浪新闻，获取新闻标题和内容
import requests
import json
from bs4 import BeautifulSoup
import pandas as pd
# Fetch the international-news listing from the Sina rolling-news API.
# The show_num query parameter sets how many items one request returns.
res = requests.get(
    'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gjxw&level==1||=2&show_ext=1&show_all=1&show_num=500&tag=1&format=json',
    timeout=30,  # requests waits indefinitely by default; do not hang forever
)
# Surface HTTP errors here instead of as a confusing JSON parse failure below.
res.raise_for_status()
data = json.loads(res.text)  # decode the JSON payload into a dict
def get_article(url, timeout=30):
    """Download a news page and return the text of its article body.

    Args:
        url: Address of the news page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The whitespace-stripped text of the page's ``div#artibody`` element,
        or an empty string when the page contains no such element.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    res = requests.get(url, timeout=timeout)
    # Decode the body as UTF-8 rather than the encoding requests guesses.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'lxml')
    body = soup.select_one('div#artibody')
    if body is None:
        # A page without the expected container should not abort a whole
        # crawl with an IndexError; report "no article text" instead.
        return ''
    return body.text.strip()
# Walk the listing, download each linked page, and collect one record per
# news item holding its title and its article text.
news = []
for item in data['result']['data']:
    body_text = get_article(item['url'])
    record = {'title': item['title'], 'article': body_text}
    news.append(record)
# Turn the list of records into a DataFrame (columns come from the dict keys).
df = pd.DataFrame(news)
使用结巴分词
import jieba

# titles[k] and articles[k] describe the same row of df.
titles = []
articles = []
for _, row in df.iterrows():
    tokens = jieba.cut(row.article)
    # Store the segmented article as its tokens joined by single spaces.
    articles.append(' '.join(tokens))
    titles.append(row.title)
建立词频矩阵
from sklearn.feature_extraction.text import CountVectorizer
# Build the term-count matrix from the space-separated token strings.
# NOTE(review): CountVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so single-character words are dropped --
# confirm that is acceptable for this corpus.
# NOTE(review): "vertorizer" looks like a misspelling of "vectorizer"; the
# name is left as-is because it is a module-level name other code may read.
vertorizer = CountVectorizer()
# X has one row per entry of `articles` and one column per vocabulary term.
X = vertorizer.fit_transform(articles)
计算余弦相似度
from sklearn.metrics import pairwise

# Pairwise cosine similarity between the rows of X (every article against
# every article). The result keeps the module-level name `cosine_similarity`
# that the clustering step reads; the function is called through the
# `pairwise` module so the imported function is not overwritten by its own
# output.
cosine_similarity = pairwise.cosine_similarity(X, X)
使用kmeans分群
from sklearn.cluster import KMeans
# Configure k-means with 10 clusters and k-means++ seeding; random_state is
# fixed so repeated runs start from the same seed.
# NOTE(review): n_init is not set and its default differs between
# scikit-learn releases, so labels may vary across versions -- confirm.
c = KMeans(n_clusters = 10,init='k-means++',random_state=123)
# Each row of the similarity matrix is used as that article's feature vector;
# k_data[i] is the cluster label assigned to row i.
k_data = c.fit_predict(cosine_similarity)
分群结果
import numpy
# Convert the title list to an array so it can be indexed with a boolean mask.
titles_ary = numpy.array(titles)
# Print the titles whose cluster label is 0.
print(titles_ary[k_data ==0])