参考：《文本数据挖掘——基于R语言》

library(pacman)
p_load(dplyr, stringr, purrr)

1、读取数据

可以用任意文本数据代替，只需包含两列：一列为文档名或编号，另一列为文本内容。

# Load the raw storage-bottle listing CSV. The file is read without a header
# row, so every column is named explicitly afterwards.
storagebottles <- read.csv("dataset/ali/storagebottles0905.csv",
                           header = FALSE) %>%
  set_names(c("sku_name", "sku_price", "sku_sale_volume", "sku_score",
              "sku_ship", "sku_isNewin", "sku_isPromotion",
              "sku_isTopselling", "shop_name", "sku_link", "category4")) %>%
  # With no columns listed, distinct() compares whole rows, so `.keep_all`
  # has no effect and is omitted.
  distinct()

# Clean the raw listing and keep a single row per product id.
storagebottles <- storagebottles %>%
  # Keep only rows with a name, a "US..." price, an aliexpress link and a
  # sales figure that mentions "sold".
  filter(
    !is.na(sku_name),
    str_detect(sku_price, "^US"),
    str_detect(sku_link, "aliexpress"),
    str_detect(sku_sale_volume, "sold")
  ) %>%
  mutate(
    category = "home",
    category2 = "Home Storage",
    category3 = "Storage Bottles & Jars",
    # The product id is the 16-digit run in the link; str_extract() already
    # returns character, so no as.character() conversion is needed.
    sku_id = str_extract(sku_link, "\\d{16}"),
    sku_link = paste0("http:", sku_link),
    # sku_sale_volume is text, so sorting it directly is lexicographic
    # ("9 sold" > "1000 sold"). Parse the count to rank rows numerically.
    # Assumes values look like "123 sold" / "1,234 sold" / "1000+ sold" --
    # TODO confirm against the raw data.
    sold_n = as.numeric(
      str_extract(str_remove_all(sku_sale_volume, ","), "\\d+")
    )
  ) %>%
  group_by(sku_id, .drop = TRUE) %>%
  # Keep the row with the highest parsed sales count for each product id.
  slice_max(order_by = sold_n, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  # Drop the helper column so the resulting columns match the original.
  select(-sold_n)

# Working table for the text steps: product id plus product title.
df <- storagebottles %>%
  select(sku_id, sku_name)

2、文本纠错

p_load(hunspell)

# Spell-check the first product title.
# NOTE(review): the whole title is passed as a single string; hunspell_check()
# presumably expects individual words -- confirm this is the intended usage.
df$sku_name[1] %>%
  hunspell_check()

## [1] FALSE

# Find the misspelled words in the first product title.
bad <- df$sku_name[1] %>%
  hunspell()
bad %>%
  pluck(1) %>%
  print()

## [1] "pcs"

# Suggested corrections for each flagged word.
bad[[1]] %>%
  hunspell_suggest()

## [[1]]
##  [1] "cps"  "cs"   "pecs" "pics" "pis"  "pas"  "pct"  "pus"  "p cs" "PCs"

3、切分

p_load(tokenizers, tidytext)

# Join the first two titles into one string, separated by a full-width
# period, to use as sample text for the tokenizers.
txt <- df$sku_name[1:2] %>%
  paste0(collapse = "。")
print(txt)

## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles。1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"

# Paragraph tokenization, using the full-width period as the break marker.
txt %>%
  tokenize_paragraphs(paragraph_break = "。")

## [[1]]
## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles"
## [2] "1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"

# Sentence tokenization.
txt %>%
  tokenize_sentences()

## [[1]]
## [1] "1000pcs 8*32mm 0.5ml Plastic Centrifuge Tube Test Tubing Vial Clear Plastic Container Home Garden Storage Bottles。"
## [2] "1000pcs 6*22mm 0.2ml Plastic Bottles Gardening Storage Container Transparent Plastic Vials PCR Centrifuge Tube"

# Word tokenization; by default this also strips punctuation and lowercases.
txt %>%
  tokenize_words()

## [[1]]
##  [1] "1000pcs"     "8"           "32mm"        "0.5ml"       "plastic"    
##  [6] "centrifuge"  "tube"        "test"        "tubing"      "vial"       
## [11] "clear"       "plastic"     "container"   "home"        "garden"     
## [16] "storage"     "bottles"     "1000pcs"     "6"           "22mm"       
## [21] "0.2ml"       "plastic"     "bottles"     "gardening"   "storage"    
## [26] "container"   "transparent" "plastic"     "vials"       "pcr"        
## [31] "centrifuge"  "tube"

# Alternatively keep punctuation and drop numbers; simplify = TRUE returns a
# plain vector for a single input. TRUE/FALSE are spelled out because T and F
# are ordinary variables that can be reassigned.
tokenize_words(
  txt,
  strip_punct = FALSE,
  strip_numeric = TRUE,
  simplify = TRUE
)

##  [1] "1000pcs"     "*"           "32mm"        "0.5ml"       "plastic"    
##  [6] "centrifuge"  "tube"        "test"        "tubing"      "vial"       
## [11] "clear"       "plastic"     "container"   "home"        "garden"     
## [16] "storage"     "bottles"     "。"          "1000pcs"     "*"          
## [21] "22mm"        "0.2ml"       "plastic"     "bottles"     "gardening"  
## [26] "storage"     "container"   "transparent" "plastic"     "vials"      
## [31] "pcr"         "centrifuge"  "tube"

# Bigram tokenization; simplify = TRUE returns a vector instead of a list.
# TRUE is spelled out because T is an ordinary, reassignable variable.
tokenize_ngrams(txt, n = 2, simplify = TRUE)

##  [1] "1000pcs 8"             "8 32mm"                "32mm 0.5ml"           
##  [4] "0.5ml plastic"         "plastic centrifuge"    "centrifuge tube"      
##  [7] "tube test"             "test tubing"           "tubing vial"          
## [10] "vial clear"            "clear plastic"         "plastic container"    
## [13] "container home"        "home garden"           "garden storage"       
## [16] "storage bottles"       "bottles 1000pcs"       "1000pcs 6"            
## [19] "6 22mm"                "22mm 0.2ml"            "0.2ml plastic"        
## [22] "plastic bottles"       "bottles gardening"     "gardening storage"    
## [25] "storage container"     "container transparent" "transparent plastic"  
## [28] "plastic vials"         "vials pcr"             "pcr centrifuge"       
## [31] "centrifuge tube"

# 4. Character-level tokenization
# TRUE is spelled out because T is an ordinary, reassignable variable.
tokenize_characters(txt, simplify = TRUE)

##   [1] "1" "0" "0" "0" "p" "c" "s" "8" "3" "2" "m" "m" "0" "5" "m" "l" "p" "l"
##  [19] "a" "s" "t" "i" "c" "c" "e" "n" "t" "r" "i" "f" "u" "g" "e" "t" "u" "b"
##  [37] "e" "t" "e" "s" "t" "t" "u" "b" "i" "n" "g" "v" "i" "a" "l" "c" "l" "e"
##  [55] "a" "r" "p" "l" "a" "s" "t" "i" "c" "c" "o" "n" "t" "a" "i" "n" "e" "r"
##  [73] "h" "o" "m" "e" "g" "a" "r" "d" "e" "n" "s" "t" "o" "r" "a" "g" "e" "b"
##  [91] "o" "t" "t" "l" "e" "s" "1" "0" "0" "0" "p" "c" "s" "6" "2" "2" "m" "m"
## [109] "0" "2" "m" "l" "p" "l" "a" "s" "t" "i" "c" "b" "o" "t" "t" "l" "e" "s"
## [127] "g" "a" "r" "d" "e" "n" "i" "n" "g" "s" "t" "o" "r" "a" "g" "e" "c" "o"
## [145] "n" "t" "a" "i" "n" "e" "r" "t" "r" "a" "n" "s" "p" "a" "r" "e" "n" "t"
## [163] "p" "l" "a" "s" "t" "i" "c" "v" "i" "a" "l" "s" "p" "c" "r" "c" "e" "n"
## [181] "t" "r" "i" "f" "u" "g" "e" "t" "u" "b" "e"

5、扩展缩写

p_load(qdap)

# Expand contractions (e.g. "isn't" -> "is not").
"MR. Jones isn't going." %>%
  replace_contraction()

## [1] "MR. Jones is not going."

# Expand abbreviations (e.g. "MR." -> "Mister").
"MR. Jones isn't going." %>%
  replace_abbreviation()

## [1] "Mister Jones isn't going."

# Spell out numbers as words.
1 %>%
  replace_number()

## [1] "one"

# Spell out ordinals as words.
"3rd" %>%
  replace_ordinal()

## [1] "third"

# Replace symbols with their word equivalents.
"&" %>%
  replace_symbol()

## [1] "and"

6、词干提取

# Stem the words of the first product title; simplify = TRUE returns a plain
# character vector. TRUE is spelled out because T is reassignable.
stem <- tokenize_word_stems(df$sku_name[1], simplify = TRUE)
print(stem)

##  [1] "1000pcs"   "8"         "32mm"      "0.5ml"     "plastic"   "centrifug"
##  [7] "tube"      "test"      "tube"      "vial"      "clear"     "plastic"  
## [13] "contain"   "home"      "garden"    "storag"    "bottl"

7、词形还原

模型下载地址：
英文：https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/english-ewt-ud-2.5-191206.udpipe
中文：https://raw.githubusercontent.com/jwijffels/udpipe.models.ud.2.5/master/inst/udpipe-ud-2.5-191206/chinese-gsd-ud-2.5-191206.udpipe

p_load(udpipe)

# Download the English UDPipe model from GitHub into model/ (a Chinese model
# can be fetched the same way). The download can fail on a poor network, so an
# already-downloaded copy is reused instead of being fetched again.
udmodel <- udpipe_download_model(language = "english",
                                 model_dir = "model/",
                                 overwrite = FALSE)

# Stop with a readable message rather than handing a bad path to the loader.
if (isTRUE(udmodel$download_failed)) {
  stop("UDPipe English model download failed: ", udmodel$download_message,
       call. = FALSE)
}

# Load the model
en_model <- udpipe_load_model(udmodel$file_model)

# Lemmatization: annotate with the English model and keep each token next to
# its lemma.
# NOTE(review): the input here is `stem` (already-stemmed tokens), not the
# original title -- confirm this is intended, since stems may not lemmatize.
udpipe_annotate(object = en_model, x = stem) %>%
  as_tibble() %>%
  select(c(token, lemma))

## token  lemma
## <chr>  <chr>
## 1000pcs  1000pcs         
## 8        8           
## 32mm 32mm            
## 0.5ml    0.5ml           
## plastic  plastic         
## centrifug    centrifug           
## tube tube            
## test test            
## tube tube            
## vial vial
## clear    clear           
## plastic  plastic         
## contain  contain         
## home home            
## garden   garden          
## storag   storag          
## bottl    bottl

# Download the Chinese UDPipe model into model/, reusing an existing copy so
# reruns do not depend on the network.
udmodel <- udpipe_download_model(language = "chinese",
                                 model_dir = "model/",
                                 overwrite = FALSE)

# Stop with a readable message rather than handing a bad path to the loader.
if (isTRUE(udmodel$download_failed)) {
  stop("UDPipe Chinese model download failed: ", udmodel$download_message,
       call. = FALSE)
}

# Load the model
cn_model <- udpipe_load_model(udmodel$file_model)

# Chinese lemmatization; the text is converted to UTF-8 before annotation.
# Keep each token next to its lemma.
"别人笑我忒疯癫" %>%
  iconv(to = "UTF-8") %>%
  udpipe_annotate(object = cn_model) %>%
  as_tibble() %>%
  select(c(token, lemma))

## token  lemma
## <chr>  <chr>
## 别    别           
## 人笑   人笑          
## 我忒   我忒          
## 疯    疯           
## 癫    癫

8、词性标注

# Part-of-speech tagging of the first product title: keep each token next to
# its universal POS tag.
df$sku_name[1] %>%
  udpipe_annotate(object = en_model) %>%
  as_tibble() %>%
  select(c(token, upos))

## token  upos
## <chr>  <chr>
## 1000 NUM         
## pcs  NOUN            
## 8    NUM         
## *    PUNCT           
## 32   NUM         
## mm   NOUN            
## 0.5  NUM         
## ml   NOUN            
## Plastic  PROPN           
## Centrifuge   PROPN   
……（太多就不一一列出来了）

PROPN表示专有名词，AUX表示助动词，ADJ表示形容词，DET表示限定词，NOUN表示名词，PUNCT表示标点符号

ADJ: adjective
ADP: adposition
ADV: adverb
AUX: auxiliary
CCONJ: coordinating conjunction
DET: determiner
INTJ: interjection
NOUN: noun
NUM: numeral
PART: particle
PRON: pronoun
PROPN: proper noun
PUNCT: punctuation
SCONJ: subordinating conjunction
SYM: symbol
VERB: verb
X: other

9、批量文本预处理

p_load(tidytext)

# Batch preprocessing: split every title into one row per token, writing the
# stemmed tokens to a new `stem` column in place of `sku_name`.
df <- df %>%
  unnest_tokens(
    output = stem,               # name of the output column
    input = sku_name,            # column holding the text
    token = tokenize_word_stems  # tokenizer that also stems
  )
print(df)

## # A tibble: 21,765 × 2
##    sku_id           stem     
##    <chr>            <chr>    
##  1 2251801564728378 1000pcs  
##  2 2251801564728378 8        
##  3 2251801564728378 32mm     
##  4 2251801564728378 0.5ml    
##  5 2251801564728378 plastic  
##  6 2251801564728378 centrifug
##  7 2251801564728378 tube     
##  8 2251801564728378 test     
##  9 2251801564728378 tube     
## 10 2251801564728378 vial     
## # … with 21,755 more rows

109-文本分析之文本预处理