字典特征抽取
#文本特征抽取
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
# Dictionary feature extraction demo: turn a list of dict records into a
# numeric feature matrix and wrap it in a labelled DataFrame.
dict_ = [
    {'name': 'name1', 'hobby': 'h1', 'age': 18},
    {'name': 'name2', 'hobby': 'h2', 'age': 30},
    {'name': 'name1', 'hobby': 'h3', 'age': 28},
]
dictv = DictVectorizer()
# fit_transform returns a sparse matrix (string-valued fields are one-hot
# encoded); toarray() converts it to a dense ndarray for display.
data = dictv.fit_transform(dict_).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. list() keeps `col` a plain list
# as before.
col = list(dictv.get_feature_names_out())
df = pd.DataFrame(data=data, columns=col)
image.png
文本内容抽取
pip install jieba
from sklearn.feature_extraction.text import CountVectorizer
import jieba
# Text feature extraction demo: segment Chinese sentences with jieba, then
# count word occurrences per sentence with CountVectorizer.
li = [
    '饿了吗,我下面给你吃,美国',
    '容我插一下嘴,美国',
    '做我的女朋友一定要喜欢吃海鲜,美国',
    '每天只想与你做四件事,一日三餐,美国',
    '我有一个大的荷尔蒙想要放,美国',
]
# Chinese text has no spaces between words, so each sentence is segmented
# with jieba.lcut and re-joined with spaces before vectorizing.
jieba_data = [' '.join(jieba.lcut(sentence)) for sentence in li]
cv = CountVectorizer()
# NOTE: CountVectorizer's default token_pattern only keeps tokens of two or
# more characters, so single-character words are not counted.
data = cv.fit_transform(jieba_data).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. list() keeps `col` a plain list
# as before.
col = list(cv.get_feature_names_out())
df = pd.DataFrame(data=data, columns=col)
image.png
重要程度分析 Tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF demo: weight each word of the space-joined, jieba-segmented
# sentences in `jieba_data` by TF-IDF instead of using raw counts.
tfidf = TfidfVectorizer()
# fit_transform returns a sparse matrix; toarray() makes it dense.
tf_data = tfidf.fit_transform(jieba_data).toarray()
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. list() keeps `col` a plain list
# as before.
col = list(tfidf.get_feature_names_out())
df2 = pd.DataFrame(data=tf_data, columns=col)
image.png