In the previous tutorial we covered how to clean the US presidential debate transcript in OpenRefine. Once cleaned, the text can be loaded into R for text analysis. In this tutorial we walk through that analysis step by step and present the results as charts. Today we start with word frequency statistics, positive/negative word clouds built with a sentiment lexicon, and a summary of how long each debater spoke.
1. Word Frequency Statistics
First load the packages we need in R:
library(tidytext)
library(tidyverse)
library(wordcloud2)
library(stm)
library(RColorBrewer)
library(wordcloud)
library(ggplot2)
library(geometry)
library(Rtsne)
library(rsvd)
library(syuzhet)
library(reshape2)
Read the cleaned debate transcript into R, tokenize the speeches into words, remove common English stop words, and count word frequencies:
data(stop_words)
text <- read_csv("2020byden and trump.csv")
tidy_content <- text %>%
  select(1, 2) %>%                   # keep the speaker and speech-content columns
  unnest_tokens(word, 发言内容) %>%   # split each speech into individual words
  anti_join(stop_words)              # remove common English stop words
tidy_content <- tidy_content %>%
  count(发言者, word, sort = TRUE)    # word counts per speaker, most frequent first
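Before cleaning further, it helps to glance at the imported table and the resulting counts. The column names used below (发言者, 发言内容, 发言时长) are the ones prepared in the OpenRefine tutorial and used throughout this post:
# Quick sanity check of the imported table and the per-speaker word counts
glimpse(text)           # expected columns: 发言者, 发言内容, 发言时长
head(tidy_content, 10)  # one row per speaker-word pair, n = frequency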
Looking at the word counts from the previous step, we find that many uninformative tokens such as it’s and don’t are still there (they contain curly apostrophes, so the stop-word list does not catch them). We remove them with filter(). The tokens also include numbers; using a regular expression we replace every digit sequence with "+++" and then filter those tokens out as well.
# Exclude these tokens; this could also be done with a regular expression
tidy_content <- tidy_content %>%
  filter(!word %in% c('it’s', "don’t", "he’s", "i’m", "that’s", "didn’t", "00", 'they’re', 'you’re', 'they’ve', 'aren’t',
                      'we’re', 'we’ve', 'i’ll', 'here’s', 'you’ve', 'what’s', '01', 'i’ve', 'can’t', 'doesn’t', 'there’s'))
# Replace digit sequences with "+++" via regex, then drop the "+++" tokens
tidy_content <- tidy_content %>%
  mutate(word = str_replace_all(word, '\\d+', "+++")) %>%
  filter(!word %in% c("+++", "+++,+++", "+++.+++", "+++th"))
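As the comment above says, the exclusion list could also be handled with a regular expression. A minimal sketch of that idea, kept in a separate object (tidy_content_regex is my own name) so it does not change the pipeline above; note it is more aggressive than the manual list, since it also drops possessives such as people’s:
# Regex-based alternative to the manual exclusion list (sketch)
tidy_content_regex <- tidy_content %>%
  filter(!str_detect(word, "['’]")) %>%   # drop contractions and possessives
  filter(!str_detect(word, "\\d"))        # drop any token containing a digit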
Now we can plot. First select the words that Biden and Trump each said at least 10 times (n > 9), then draw the bar charts:
# Plot 1: words said at least 10 times by Biden and by Trump
biden <- tidy_content %>%
  filter(发言者 == 'Vice President Joe Biden' & n > 9) %>%
  mutate(word = reorder(word, n))
ggplot(biden, aes(x = word, y = n)) +
  geom_col(fill = "blue") +
  labs(x = "单词", y = "数量") +
  theme_bw() +
  scale_y_continuous(limits = c(0, 80)) +
  coord_flip()
trump <- tidy_content %>%
  filter(发言者 == 'President Donald J. Trump' & n > 9) %>%
  mutate(word = reorder(word, n))
ggplot(trump, aes(x = word, y = n)) +
  geom_col(fill = "red") +
  labs(x = "单词", y = "数量") +
  theme_bw() +
  scale_y_continuous(limits = c(0, 80)) +
  coord_flip()
ggplot(tidy_content %>% filter(发言者 != 'Chris Wallace' & n > 9),
       aes(word, n, fill = 发言者)) +
  labs(x = "单词", y = "数量") +
  theme_bw() +
  # scale_y_continuous(limits = c(0, 80)) +
  geom_col() +
  coord_flip()
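A variation you may prefer (not in the original tutorial) is to give each speaker their own panel instead of stacking the bars, which makes the two vocabularies easier to compare side by side:
# Optional sketch: same data, one facet per speaker instead of stacked bars
ggplot(tidy_content %>% filter(发言者 != 'Chris Wallace' & n > 9),
       aes(word, n, fill = 发言者)) +
  geom_col(show.legend = FALSE) +
  labs(x = "单词", y = "数量") +
  theme_bw() +
  facet_wrap(~发言者) +
  coord_flip()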
[Figure biden tf.png: Biden's word frequencies]
[Figure trump tf.png: Trump's word frequencies]
[Figure biden&turmp.png: word frequency comparison, Biden vs. Trump]
2. Positive and Negative Sentiment Word Clouds
Join Biden's and Trump's words with the "bing" sentiment lexicon to classify each speaker's words as positive or negative, and show the result as a comparison word cloud.
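For reference, the bing lexicon bundled with tidytext is just a two-column table: one English word per row, labelled either positive or negative. A quick look before joining:
# Inspect the bing sentiment lexicon (columns: word, sentiment)
get_sentiments("bing") %>% count(sentiment)
get_sentiments("bing") %>% head()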
# Plot 2: positive/negative sentiment word clouds
biden1 <- text %>%
  select(1, 2) %>%
  unnest_tokens(word, 发言内容) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c('it’s', "don’t", "he’s", "i’m", "that’s", "didn’t", "00", 'they’re', 'you’re', 'they’ve', 'aren’t',
                      'we’re', 'we’ve', 'i’ll', 'here’s', 'you’ve', 'what’s', '01', 'i’ve', 'can’t', 'doesn’t', 'there’s')) %>%
  mutate(word = str_replace_all(word, '\\d+', "+++")) %>%
  filter(!word %in% c("+++", "+++,+++", "+++.+++", "+++th")) %>%
  filter(发言者 == 'Vice President Joe Biden')
biden_sentiments <- biden1 %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0)
comparison.cloud(biden_sentiments, colors = c("gray20", "gray80"), max.words = 500)
trump1 <- text %>%
  select(1, 2) %>%
  unnest_tokens(word, 发言内容) %>%
  anti_join(stop_words) %>%
  filter(!word %in% c('it’s', "don’t", "he’s", "i’m", "that’s", "didn’t", "00", 'they’re', 'you’re', 'they’ve', 'aren’t',
                      'we’re', 'we’ve', 'i’ll', 'here’s', 'you’ve', 'what’s', '01', 'i’ve', 'can’t', 'doesn’t', 'there’s')) %>%
  mutate(word = str_replace_all(word, '\\d+', "+++")) %>%
  filter(!word %in% c("+++", "+++,+++", "+++.+++", "+++th")) %>%
  filter(发言者 == 'President Donald J. Trump')
trump_sentiments <- trump1 %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0)
comparison.cloud(trump_sentiments, colors = c("gray20", "gray80"), max.words = 500)
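Besides the word clouds, it can help to see the overall balance as numbers. A small sketch (the object name sentiment_totals is my own) that reuses biden1 and trump1 to count positive and negative words per speaker:
# Sketch: total positive vs. negative word counts for each candidate
sentiment_totals <- bind_rows(biden1, trump1) %>%
  inner_join(get_sentiments("bing")) %>%
  count(发言者, sentiment)
sentiment_totals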
[Figure biden.png: Biden sentiment word cloud]
[Figure trump.png: Trump sentiment word cloud]
3. Speaking Time Chart
# Plot 3: total speaking time per speaker
time <- text %>%
  group_by(发言者) %>%
  summarise(发言时长 = sum(发言时长, na.rm = TRUE) / 60)  # sum speaking time and convert to minutes
ggplot(time, aes(发言者, 发言时长, fill = 发言者)) +
  geom_col() +
  labs(y = '发言时长(分)')
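As an optional extra (not part of the original post), the same summary can be expressed as each speaker's share of the total speaking time; share is a hypothetical column name:
# Sketch: speaking time as a share of the total
time %>% mutate(share = 发言时长 / sum(发言时长))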
[Figure 发言时间.png: speaking time per speaker]
In this tutorial we did a simple analysis of word frequencies, positive/negative sentiment word clouds, and speaking time. Next time we will continue with tf-idf scores to dig into each speaker's main talking points, and look further at how sentiment changes over the course of the debate.