前言
LDA是文本挖掘中最常用的主题模型之一。可以这样理解:一篇文章由若干个主题混合而成,每个主题又由一组高频词汇来刻画;也可以把主题看作一个bucket,里面装着一堆出现频率较高的词。
本文基于搜狗130万条数据,使用Spark训练LDA主题模型并进行分析。
准备数据
Sogou数据下载地址:https://github.com/qianzhengyang/AllDataPackages/blob/master/%E6%96%87%E6%9C%AC%E8%AF%AD%E6%96%99.md
数据格式如下(注意:下文的createDataFrame按“id、空白、正文”的格式逐行解析,即用第一段空白把每行切成id和正文两部分,因此需要先把下面的JSON转换成该格式):
{"id":1358656,"title":"组图:女足世界杯美国对阵挪威 球迷到场助兴","content":"(图5)腾讯体育讯 北京时间9月30日,本届女足世界杯将迎来了第三四名的决战,由美国队对阵挪威队。比赛还未开始,已经聚集了众多的球迷。"}
{"id":1358657,"title":"图文:季前赛雄鹿备战掘金 阿联跃跃欲试","content":"来源:搜狐体育 作者:张仪搜狐体育讯 美国密尔沃基当地时间10月22日(北京时间23日),雄鹿在圣弗朗西斯训练中心进行了训练,备战23日晚客场对掘金的季前赛。图为训练中的精彩瞬间。(摄/张仪 发自密尔沃基)(责任编辑:严国平)"}
{"id":1358658,"title":"德国队主场净吞三蛋","content":"网易体育10月18日消息 北京时间今天凌晨,2008年欧洲杯预选赛D组再战三场,结果在其中一场重头戏中,已经拿到明年欧洲杯决赛圈参赛资格的德国队在安联球场0-3惨败于捷克。于是,捷克队也步德国后尘,成功晋级明年欧洲杯决赛圈比赛。说明:点击了此复制图片按钮以后,只需要放一论坛就可以了"}
{"id":1358659,"title":"组图:男篮vs悉尼国王 队员场上激烈拼抢","content":"您所在的位置: 腾讯首页 > 体育频道 > 图片滚动图集 > 正文(图4)腾讯体育讯 北京时间10月2日,中国男篮vs悉尼国王队,图为双方队员场上激烈拼抢。分类信息赛场瞬间企业服务招商信息热点信息热门推荐体育"}
数据路径
// Path of the stop-word list; getStopRemove reads it line by line via sparkContext.textFile.
private[this] val stopword = "/data/stop-words-pos.txt"
// Path of the Sogou corpus; createDataFrame reads it line by line via sparkContext.textFile.
private[this] val sogoupath = "/data/Sogoutext.data"
分词与停用词过滤
对文章进行分词以及用stopRemover去除停用词。
/**
 * Builds the tokenized, stop-word-filtered corpus as a DataFrame.
 *
 * Every line of `sogoupath` is split on its first run of whitespace into an id and a
 * content part; lines that do not yield both parts are skipped. The content is tokenized
 * with `Segment`, documents without any token are dropped, and stop words are removed
 * with the remover returned by `getStopRemove`.
 *
 * @param spark active Spark session used to read the corpus and create the DataFrame
 * @return a DataFrame with a string column `id` and an array-of-string column `words`
 */
def createDataFrame(spark: SparkSession): DataFrame = {
  val docs = spark.sparkContext
    .textFile(sogoupath)
    .map(line => line.split("\\s+", 2))
    .filter(parts => parts.length > 1)
    .map(parts => Doc(parts(0), parts(1).trim))

  // One Segment is created per partition rather than per record.
  val tokenRows = docs.mapPartitions { partition =>
    val segmenter = new Segment
    partition
      // NOTE(review): .get is invoked unchecked on tokenLine's result; confirm it cannot be empty/failed.
      .map(doc => (doc.id, segmenter.tokenLine(doc.content).get))
      .filter(idAndTokens => idAndTokens._2.nonEmpty)
      .map(idAndTokens => Row(idAndTokens._1, idAndTokens._2.toArray))
  }

  val rawSchema = new StructType()
    .add("id", StringType)
    .add("rawwords", ArrayType(StringType))

  // Create the DataFrame holding the raw tokens.
  val rawDf = spark.createDataFrame(tokenRows, rawSchema)

  // Filter stop words: "rawwords" -> "words", then discard the raw column.
  getStopRemove(spark, "rawwords", "words")
    .transform(rawDf)
    .drop("rawwords")
}
/**
 * Creates a StopWordsRemover whose stop-word list is loaded from the `stopword` file.
 *
 * @param spark     active Spark session used to read the stop-word file
 * @param inputcol  name of the column holding the raw tokens
 * @param outputcol name of the column that receives the filtered tokens
 * @return the configured (not yet applied) StopWordsRemover
 */
def getStopRemove(spark: SparkSession, inputcol: String, outputcol: String): StopWordsRemover = {
  // Each trimmed line of the file is one stop word; the whole list is collected to the driver.
  val words: Array[String] = spark.sparkContext
    .textFile(stopword)
    .map(line => line.trim)
    .collect()

  new StopWordsRemover()
    .setStopWords(words)
    .setInputCol(inputcol)
    .setOutputCol(outputcol)
}
CountVectorizerModel特征表示
// Tokenized, stop-word-filtered corpus (columns: id, words).
val dataFrame = createDataFrame(spark)
// Fit a CountVectorizerModel that turns each document's "words" into a term-count vector in "features".
val cvModel: CountVectorizerModel = {
  val vectorizer = new CountVectorizer()
    .setMinDF(3) // per the Spark docs, a value >= 1 is the minimum number of documents a term must appear in
    .setInputCol("words")
    .setOutputCol("features")
  vectorizer.fit(dataFrame)
}
// Keep only the vectorized representation; the token column is no longer needed.
val cvDf = cvModel.transform(dataFrame).drop("words")
LDA模型训练
/**
 * Fits an LDA topic model on the "features" column of `df` using the online optimizer.
 *
 * Document and topic concentration are both fixed to 3 and the seed is fixed, so repeated
 * runs use the same configuration.
 *
 * @param df   DataFrame containing a "features" vector column
 * @param k    number of topics
 * @param iter maximum number of iterations
 * @return the fitted LDA model
 */
def train(df: DataFrame, k: Int, iter: Int): LDAModel = {
  val estimator = new LDA()
    .setOptimizer("online")
    .setFeaturesCol("features")
    .setK(k)
    .setMaxIter(iter)
    .setDocConcentration(3)
    .setTopicConcentration(3)
    .setSeed(1313133L)

  estimator.fit(df)
}
LDA主题特征相似度
用训练完的LDA模型对文档进行转换之后,取出topicDistribution列做相似度计算。
由于文档向量的数量很大,两两比较时可以通过广播或分块的方式来计算。
/**
 * Prints, for 10 documents, the other documents whose LDA topic distribution is most
 * similar to theirs.
 *
 * Every document's topic distribution is collected and broadcast; each document is then
 * compared against all others with `cosSimilarity`. Only scores above `minScore` are kept,
 * sorted in descending order and truncated to the 100 best. Documents without any match are
 * dropped. Each printed record is `(id, "(otherId,score)\t(otherId,score)...")`.
 *
 * @param spark active Spark session (used to create the broadcast variable)
 * @param df    DataFrame with an `id` string column and the feature column expected by `model`
 * @param model fitted LDA model used to compute `topicDistribution`
 */
def similarCalc(spark: SparkSession, df: DataFrame, model: LDAModel): Unit = {
  // Maximum number of similar documents reported per document.
  val topN = 100

  val topicVectors = model.transform(df).rdd.map { row =>
    (row.getAs[String]("id"), row.getAs[DenseVector]("topicDistribution"))
  }
  // The RDD is consumed twice (collect + mapPartitions); cache it so that the
  // LDA inference in model.transform is not executed a second time.
  topicVectors.cache()

  val allVectors = spark.sparkContext.broadcast(topicVectors.collect())
  try {
    val similarity = topicVectors.mapPartitions { docs =>
      val candidates = allVectors.value
      docs.map { case (docId, docVec) =>
        val scored = candidates.collect {
          // Skip the document itself.
          case (otherId, otherVec) if otherId != docId =>
            (otherId, cosSimilarity(otherVec, docVec))
        }.filter(_._2 > minScore)
        val best = scored.sortWith((a, b) => a._2 > b._2).take(topN)
        (docId, best.mkString("\t"))
      }.filter(_._2.nonEmpty)
    }
    similarity.take(10).foreach(println)
  } finally {
    // Release the broadcast copy and the cached vectors once the result has been printed.
    allVectors.destroy()
    topicVectors.unpersist()
  }
}
结果:
(10881,(10971,0.9999515808325444) (10877,0.9998280626566719) (10980,0.9995914557793235) (10966,0.9991007196188407) (10873,0.9987575059499324) (10875,0.9981125831642206) (10981,0.9972441479916212) (10985,0.9969863934224734) (10967,0.9964373392422373) (10963,0.9955499846964887) (10972,0.9951080932747084) (10964,0.9949256988391596) (10968,0.9948354276982817) (10970,0.9947821586233754) (10979,0.9941977034513715) (10975,0.9939260467610985) (10986,0.9938606629891327) (10977,0.9938557900935678) (10983,0.993786203004832) (10969,0.9937041840584083) (10982,0.9935910079282171) (10965,0.9933793777836343) (10976,0.9741354788233058) (10878,0.9654222524920384) (10984,0.9301626909256506) (10978,0.800250563530617) (10887,0.7187256781870788) (10973,0.6406047256395061) (10890,0.6318892634657478))
(10882,(10883,0.6081910200908042) (12852,0.6077947899678519) (12862,0.6039295088761559) (10872,0.6037768935995764) (12854,0.603750877835594) (10879,0.6037352832221344) (12864,0.6037104590359069) (12352,0.6036872108282482) (12853,0.6036817395891366) (12848,0.6036282469847594) (12858,0.6035508326372964) (12850,0.6035074312165228) (10884,0.6034977757744621) (12354,0.6034884996602544) (10885,0.603479205418295) (12860,0.6034733253158702) (10870,0.6034694003981216) (12339,0.6034115738513248) (10974,0.6034104714142132) (12351,0.6033943000908086) (12865,0.6033828258967537) (12851,0.6033776674635098) (12355,0.6033532576517068) (12350,0.6033511665515813) (12346,0.6033215932869845) (12342,0.6033186524952903) (12861,0.603313546402084) (12856,0.6033046406389434) (12348,0.6032985527867994) (12849,0.6032900888159755) (12859,0.6032710970543572) (12358,0.6032700847740925) (10889,0.6032673400542778) (12855,0.6032514669000055) (12341,0.6032477086400966) (12857,0.6032364580507691) (12359,0.6032271639871424) (12345,0.603223988720094) (12360,0.6032163551947679) (12357,0.6032156057336366) (10871,0.6032125473327076) (1,0.6031998557427022) (12353,0.603193427106624) (10874,0.6031859289729451) (12349,0.6031590878725842) (12347,0.6031574882630748) (10876,0.6031251389014955) (12344,0.6031087393356553) (12340,0.6031037125144678) (10880,0.6030807289196441) (10888,0.6030248207807695) (12343,0.6025163277996423) (12356,0.5883342844675824) (10890,0.5184170910080879) (10973,0.5152434623993652) (10887,0.47814692224991767) (12863,0.44887103061525074) (10978,0.42884466449650926) (10886,0.39387338046764797) (10984,0.3003794089089516))
(10883,(12852,0.9999990338192377) (12862,0.9999070352479472) (10872,0.9999004569114167) (12854,0.9998992466445294) (10879,0.9998988185680989) (12864,0.9998977252915594) (12352,0.999896632022754) (12853,0.9998959248356877) (12848,0.9998934707007257) (12858,0.9998899532220626) (12850,0.9998882356307417) (10884,0.9998881458916872) (12354,0.9998877099075276) (10885,0.9998872391417131) (10870,0.9998866867755247) (12860,0.999886607621506) (10974,0.9998844302402172) (12339,0.9998838605384653) (12351,0.9998831045615648) (12865,0.9998824333754488) (12851,0.9998821053566576) (12355,0.9998810479101061) (12350,0.9998810366390131) (12342,0.9998795332445202) (12346,0.999879491599391) (12861,0.9998791631533249) (12856,0.9998785871686101) (12348,0.9998785386397566) (12849,0.9998778717230853) (12859,0.9998772474800274) (12358,0.99987706849682) (12855,0.999875988881857) (12341,0.9998759590480496) (12359,0.9998752803266486) (12857,0.9998752648040228) (12345,0.9998748202141837) (12357,0.999874700100957) (12360,0.999874449107237) (10871,0.9998742015657709) (1,0.9998737506491303) (12353,0.9998733647230904) (10874,0.9998728675247125) (12349,0.9998715951415093) (12347,0.9998715386830703) (10876,0.9998700545638085) (12344,0.9998691207908174) (12340,0.9998688729721046) (10880,0.9998676633938013) (10888,0.9998648322799082) (10889,0.9990855465693352) (12343,0.9967447336626574) (12356,0.9726285382124998) (10890,0.8495728871018591) (10973,0.8435680516737657) (10887,0.7815866660055631) (12863,0.7270731966522094) (10978,0.6972068278405774) (10886,0.6381510532611705) (10882,0.6080948800704287) (10984,0.4832362410283633) (10878,0.38218092911598234) (10976,0.34884624301069744))
(10884,(12850,0.9999999997088058) (12354,0.9999999995892376) (10885,0.9999999981968531) (10870,0.9999999952908332) (12860,0.9999999945012847) (12858,0.9999999916413216) (10974,0.999999969383069) (12339,0.9999999597782901) (12351,0.9999999445642892) (12848,0.9999999337403473) (12865,0.9999999289318888) (12851,0.9999999205655592) (12355,0.9999998909696552) (12350,0.9999998906766265) (12853,0.9999998584481459) (12342,0.9999998405524404) (12346,0.9999998389430155) (12352,0.9999998322236485) (12861,0.9999998267758284) (12856,0.9999998041946623) (12348,0.9999998023948466) (12864,0.9999997852177291) (12849,0.9999997744639244) (12859,0.9999997470427826) (12358,0.9999997388294248) (10879,0.999999732021543) (12854,0.9999997090339147) (12855,0.9999996867031629) (12341,0.9999996853147701) (12359,0.9999996501915464) (12857,0.9999996493353429) (10872,0.999999640356477) (12345,0.9999996254963571) (12357,0.9999996188338959) (12360,0.9999996049416748) (10871,0.9999995909180615) (1,0.9999995648562423) (12353,0.9999995419212083) (10874,0.9999995115245175) (12349,0.9999994297099485) (12347,0.9999994259469382) (10876,0.9999993227429314) (12344,0.9999992537963381) (12340,0.9999992349751484) (10880,0.9999991400151099) (12862,0.9999991257735172) (10888,0.9999988980099624) (12852,0.9999078975600806) (10883,0.9998879772680086) (10889,0.9986262789434801) (12343,0.9958540431180917) (12356,0.971244235629662) (10890,0.8437490820486102) (10973,0.837630881739796) (10887,0.7747066904153979) (12863,0.7229748909645948) (10978,0.6892861316681685) (10886,0.6331908532307197) (10882,0.603401418205036) (10984,0.47359033999192685) (10878,0.3719979229480692) (10976,0.3384584573847307))
(10885,(12354,0.9999999995694454) (10870,0.9999999992176765) (12860,0.9999999987232177) (10884,0.9999999982908657) (12850,0.9999999976385263) (12858,0.9999999828691492) (10974,0.9999999819776897) (12339,0.9999999745241481) (12351,0.9999999621849927) (12865,0.999999949090943) (12851,0.9999999419615594) (12355,0.999999916355272) (12350,0.9999999161074041) (12848,0.9999999111016402) (12342,0.9999998716212422) (12346,0.9999998701610178) (12861,0.9999998592227418) (12856,0.9999998387829284) (12348,0.9999998371721253) (12853,0.9999998260015481) (12849,0.9999998117111579) (12352,0.999999796949108) (12859,0.9999997866142839) (12358,0.9999997790586824) (12864,0.9999997455232281) (12855,0.9999997309117378) (12341,0.9999997296385773) (12359,0.9999996970219883) (12857,0.9999996962074635) (10879,0.9999996878802172) (12345,0.9999996740053331) (12357,0.9999996677924657) (12854,0.9999996631336585) (12360,0.9999996548103757) (10871,0.9999996416904342) (1,0.9999996172815426) (12353,0.9999995957519154) (10872,0.9999995894993084) (10874,0.9999995671619137) (12349,0.9999994899661991) (12347,0.9999994864083053) (10876,0.9999993885639694) (12344,0.9999993229676492) (12340,0.999999305034582) (10880,0.9999992143956363) (12862,0.9999990474092798) (10888,0.9999989824315164) (12852,0.9999071086957289) (10883,0.9998871074841843) (10889,0.9986240817703873) (12343,0.9958501874925795) (12356,0.9712383807081328) (10890,0.8437263193727491) (10973,0.8376076851543639) (10887,0.7746798959236126) (12863,0.7229584866302433) (10978,0.6892553704390012) (10886,0.6331711365885218) (10882,0.603382879932175) (10984,0.4735530383279062) (10878,0.37195859931637293) (10976,0.33841835644805957))
(10886,(12356,0.7992414948778147) (5,0.781044697244012) (2,0.7807864557922221) (4,0.7786308753570542) (3,0.778630733294385) (6,0.7786299199613398) (8,0.7762272442916341) (10,0.7760134771151203) (7,0.775423712180227) (9,0.7754236710895442) (10883,0.6381614526143734) (12852,0.6377496242193051) (12862,0.6336570971698485) (10872,0.6334958653194538) (12854,0.6334691024803991) (10879,0.6334523709005836) (12864,0.6334254494112045) (12352,0.6334010134031557) (12853,0.6333950894006496) (12848,0.6333385782091747) (12858,0.6332566144392426) (12850,0.6332112890819779) (10884,0.6332011749220386) (12354,0.6331912055869935) (10885,0.6331814184807963) (12860,0.6331749648176341) (10870,0.6331710060085364) (12339,0.6331100039344104) (10974,0.6331089858280525) (12351,0.633091825571039) (12865,0.6330793167635556) (12851,0.6330737564608168) (12355,0.6330481472725538) (12350,0.6330465309275164) (12346,0.6330150200552578) (12342,0.6330118976789237) (12861,0.6330058976437005) (12856,0.6329965695184069) (12348,0.6329905369455953) (12849,0.6329810830077939) (12859,0.6329610213640231) (12358,0.6329603797560607) (12855,0.6329403285126153) (12341,0.632936534010707) (12857,0.6329243825403899) (12359,0.6329146984413102) (12345,0.632911649369519) (12360,0.6329033856603088) (12357,0.6329025131374101) (10871,0.63289932761057) (1,0.6328864651325474) (12353,0.6328798209806055) (10874,0.6328711478041692) (12349,0.6328430712946982) (12347,0.632841349753367) (10876,0.6328068045565227) (12344,0.6327895332601488) (12340,0.6327843900222467) (10880,0.6327598691806352) (10888,0.6327006845915486) (10889,0.6326499250987697) (12343,0.6316463557789259) (10890,0.5408352282207873) (10973,0.5374589929732085) (10887,0.49802534977163904) (12863,0.4692313553184523) (10978,0.44578092421467613) (10882,0.3938188729752155) (10984,0.3100674974764472))
(10887,(10973,0.994314771500258) (10890,0.9930491119242065) (10978,0.9921023407995474) (10984,0.9238047144225626) (10878,0.8751276895780896) (10976,0.857193707119378) (12343,0.8290175525660679) (10889,0.8067760736343141) (10883,0.7815994971615375) (12852,0.7809128559572485) (12862,0.7753105432108696) (10872,0.7750925801781304) (12854,0.7750508429950066) (10879,0.7750449274396629) (12864,0.7750102001229872) (12352,0.7749733529970323) (12853,0.7749372978759953) (12848,0.77485820942797) (10967,0.7747961293112682) (12858,0.7747487927161804) (10884,0.7747109133098183) (12850,0.7747035936500525) (12354,0.7746974363970609) (10885,0.7746819588147463) (10870,0.7746623112943075) (12860,0.7746526004000972) (10974,0.7746077564077867) (12339,0.7745732242704519) (12351,0.77455210742735) (12865,0.7745286035492642) (12851,0.774516511196227) (12350,0.7744895320251082) (12355,0.7744877098519344) (12342,0.7744464134462536) (12346,0.7744401668824283) (12861,0.7744323251548347) (12348,0.7744162943542923) (12856,0.7744112726260332) (12849,0.7743898553045404) (12859,0.7743794821261328) (12358,0.7743705010315216) (12341,0.774337504481435) (12855,0.7743343131131284) (12359,0.7743264707622729) (12857,0.774313445733646) (12357,0.7743091594428778) (12345,0.7743047200981402) (12360,0.7742940917242199) (10871,0.774285280484513) (1,0.7742765580731964) (12353,0.7742634427569837) (10874,0.7742460066576058) (12349,0.7742106487039982) (12347,0.7742096460454493) (10876,0.7741705503064884) (12344,0.7741408393881863) (12340,0.7741338057492329) (10880,0.7740980920509803) (10888,0.7740181952970691) (10985,0.7704928375428792) (10981,0.7683246235115162) (12356,0.7526834479215058) (10873,0.7524643757287608) (10971,0.7255230308450507) (10881,0.7187182580326889) (10877,0.7057060611547824) (10980,0.6987511569235237) (10966,0.6895198955712055) (10875,0.6746712891133851) (10963,0.6500827780256353) (10972,0.646533877507589) (10964,0.6451280367295233) (10968,0.6444563267135811) (10970,0.6440632989361309) 
(10979,0.6397596912052681) (10975,0.6378384496191243) (10986,0.6373824569730782) (10977,0.6373484669497657) (10983,0.636864770766553) (10969,0.6362983397077224) (10982,0.6355228694530675) (10965,0.6340906224512567) (12863,0.5705586261702986) (10886,0.49801972692138996) (10882,0.47807442394787253))
(10888,(10880,0.9999999850367409) (12340,0.9999999693716692) (12344,0.9999999654650613) (10876,0.9999999485791712) (12347,0.9999999147265096) (12349,0.9999999132592778) (10874,0.9999998769103083) (12353,0.9999998609778157) (1,0.999999847887038) (10871,0.9999998317810851) (12360,0.9999998226287751) (12357,0.9999998129917418) (12345,0.9999998083911885) (12857,0.999999790469711) (12359,0.9999997898748034) (12341,0.9999997611180722) (12855,0.9999997596819247) (12358,0.9999997098321389) (12859,0.9999997010983954) (12849,0.9999996692825246) (12348,0.9999996337968116) (12856,0.9999996309566975) (12861,0.9999995986146245) (12346,0.9999995793903051) (12342,0.9999995770280722) (12350,0.9999994829158332) (12355,0.9999994821189238) (12851,0.9999994097008333) (12865,0.9999993863427444) (12351,0.9999993369131364) (12339,0.9999992786067364) (10974,0.9999992326287956) (12860,0.9999990441300951) (10870,0.999999037144616) (10885,0.9999989855183657) (12354,0.999998940326547) (10884,0.9999988975069509) (12850,0.9999988882769539) (12858,0.9999987089378525) (12848,0.9999982961859165) (12853,0.9999979698530737) (12352,0.999997870843702) (12864,0.9999977106684467) (10879,0.9999975436864519) (12854,0.999997475928617) (10872,0.9999972803199306) (12862,0.9999960617810486) (12852,0.9998866496905116) (10883,0.9998646567251652) (10889,0.9985685161168957) (12343,0.9957535002439031) (12356,0.9710948481743853) (10890,0.8431608405363578) (10973,0.8370314645551611) (10887,0.7740144787851332) (12863,0.7225594574113094) (10978,0.6884916825607849) (10886,0.6326903672123694) (10882,0.6029284364968666) (10984,0.4726273167563025) (10878,0.37098283788121755) (10976,0.3374234582557949))
(10889,(12343,0.9992521095401322) (10883,0.9990861419632784) (12852,0.9990483914547083) (12862,0.9986746045645686) (10872,0.9986572266391658) (12854,0.9986538662413362) (10879,0.9986534189012262) (12864,0.9986506242622887) (12352,0.9986476455451662) (12853,0.998644688916849) (12848,0.9986382765476228) (12858,0.9986293666839882) (10884,0.9986263067192936) (12850,0.9986256865719693) (12354,0.998625201950222) (10885,0.9986239304214257) (10870,0.9986223124097807) (12860,0.9986215014384304) (10974,0.9986178463420589) (12339,0.9986149734468968) (12351,0.9986132323695992) (12865,0.9986112853230364) (12851,0.9986102822159116) (12350,0.99860805370892) (12355,0.9986079003716035) (12342,0.9986044771611686) (12346,0.9986039538358534) (12861,0.9986033037331433) (12348,0.9986019723392072) (12856,0.9986015496709174) (12849,0.9985997664411902) (12859,0.9985989068565455) (12358,0.9985981558811723) (12341,0.9985954013226882) (12855,0.9985951330171163) (12359,0.998594481979854) (12857,0.9985933888757706) (12357,0.998593033449406) (12345,0.9985926601035754) (12360,0.998591770388323) (10871,0.9985910320979362) (1,0.9985903017730724) (12353,0.9985892018672912) (10874,0.9985877387221037) (12349,0.9985847678147205) (12347,0.9985846833831312) (10876,0.9985813904787498) (12344,0.9985788855446294) (12340,0.9985782916936056) (10880,0.9985752739920648) (10888,0.9985685012177888) (12356,0.9698392587571757) (10890,0.8707117974013717) (10973,0.8650991305112242) (10887,0.8067725638815297) (10978,0.7262974147807277) (12863,0.7226033436348559) (10886,0.6326396197602078) (10882,0.6031710917090656) (10984,0.5190862207185997) (10878,0.4201213384277405) (10976,0.38729119744433826))
(10890,(10973,0.9999358409551456) (10887,0.9930481435732126) (10978,0.9704444549870703) (12343,0.88907654144425) (10984,0.8723191715861481) (10889,0.870712819690559) (10883,0.8495816998665513) (12852,0.8490021659338793) (12862,0.8442598441870163) (10872,0.8440747747269942) (12854,0.8440393342726553) (10879,0.8440343013994782) (12864,0.8440048087838492) (12352,0.8439735175901738) (12853,0.8439429126485675) (12848,0.8438757409176043) (12858,0.843782798568394) (10884,0.8437505940644784) (12850,0.8437443906618924) (12354,0.8437391451170455) (10885,0.8437259977842962) (10870,0.8437093102171035) (12860,0.8437010711894019) (10974,0.8436629388351456) (12339,0.8436336266374165) (12351,0.8436156820267312) (12865,0.843595716882016) (12851,0.8435854458878199) (12350,0.8435625134568857) (12355,0.8435609687161052) (12342,0.8435258714066243) (12346,0.8435205718909713) (12861,0.8435139050818841) (12348,0.8435002783621696) (12856,0.8434960221441591) (12849,0.8434778225009935) (12859,0.8434689933088669) (12358,0.8434613676057615) (12341,0.8434333269480858) (12855,0.8434306220595659) (12359,0.8434239340220752) (12857,0.8434128873231468) (12357,0.8434092223830298) (12345,0.8434054639933277) (12360,0.8433964308469525) (10871,0.8433889452547438) (1,0.8433815240577839) (12353,0.8433703808513451) (10874,0.8433555672979065) (12349,0.8433255126542399) (12347,0.8433246592543874) (10876,0.8432914221446742) (12344,0.8432661736617711) (12340,0.8432601948729104) (10880,0.8432298395332307) (10888,0.8431619203495035) (12356,0.8197677607817924) (10878,0.812089390763502) (10976,0.7906258730194181) (10967,0.6949975434269322) (10985,0.6901083631440336) (10981,0.6876474428705006) (10873,0.6697110075317415) (10971,0.6394773841537952) (10881,0.6318839848884114) (12863,0.6190724508454095) (10877,0.6174080738106444) (10980,0.6096973502948118) (10966,0.599493210714436) (10875,0.5831031926652726) (10963,0.5561303940001459) (10972,0.5522480698935739) (10964,0.5507115321138731) (10968,0.5499784186204018) 
(10970,0.5495495131132638) (10979,0.5448496632804671) (10975,0.5427541275837334) (10986,0.5422571373331398) (10977,0.5422200621281829) (10983,0.5416924825631915) (10969,0.541074955981184) (10886,0.5408272503978503) (10982,0.5402294660364149) (10965,0.5386684733969618) (10882,0.5183364203418986))
(10963,(10972,0.9999861094641637) (10964,0.9999739958468103) (10968,0.9999700083751769) (10970,0.9999669220880782) (10979,0.9998955734347648) (10975,0.9998598868179772) (10986,0.9998526132095077) (10977,0.999851777814232) (10983,0.9998394136235381) (10969,0.9998264264097145) (10982,0.9998054681201918) (10965,0.9997652976273314) (10875,0.99944644002508) (10966,0.9985705957655786) (10980,0.9978269809522996) (10877,0.9971244094119827) (10881,0.9955524690927513) (10971,0.9945767567492477) (10873,0.9896211223163791) (10981,0.9858267322293528) (10985,0.9852551057557405) (10967,0.9840654869832612) (10976,0.9485739741907806) (10878,0.9365967956047563) (10984,0.8914757180032622) (10978,0.7402719154195435) (10887,0.6501039177916546) (10973,0.5655071637757126) (10890,0.5561503066889326))
LDA聚类可视化
设定一个最小阈值(比如0.1),只保留每个文档中权重大于该阈值的主题,再据此构建“文档-主题”图,进行聚类以及可视化。
/**
 * Builds a document-topic graph from the LDA topic distributions and writes it out for
 * visualization via `toVisualFile(graph, visualizepath)`.
 *
 * Vertex ids: topic `i` is vertex `i` (label `topic-i`); the document with numeric id `d`
 * is vertex `d + k` (label `doc-d`), so document vertices do not collide with the `k`
 * topic vertices as long as document ids are non-negative. An edge document -> topic,
 * weighted by the topic's probability, is created for every topic whose probability is
 * greater than `minWeight`.
 *
 * Document ids are parsed with `toLong`, so a non-numeric `id` fails the job with a
 * NumberFormatException.
 *
 * @param spark     active Spark session
 * @param df        DataFrame with an `id` string column and the feature column expected by `model`
 * @param model     fitted LDA model used to compute `topicDistribution`
 * @param k         number of topics; also the offset added to document ids
 * @param minWeight a topic is linked to a document only if its weight exceeds this value
 */
def cluster(spark: SparkSession, df: DataFrame, model: LDAModel, k: Int, minWeight: Double = 0.1): Unit = {
  val transformed = model.transform(df).drop("features")
  transformed.show(false)

  // (document vertex id, Array[(topic index, topic weight)])
  val topics = transformed.rdd.map { row =>
    val docVertexId = row.getAs[String]("id").toLong + k
    val weights = row.getAs[DenseVector]("topicDistribution").toArray
    (docVertexId, weights.zipWithIndex.map(_.swap))
  }
  // Reused for both the vertex and the edge RDD.
  topics.persist(StorageLevel.MEMORY_AND_DISK)

  try {
    val topicNodes = spark.sparkContext.makeRDD((0 until k).map(i => (i.toLong, s"topic-$i")))
    val docNodes = topics.map { case (docVertexId, _) => (docVertexId, s"doc-${docVertexId - k}") }
    val nodes = topicNodes ++ docNodes

    val edges = topics.flatMap { case (docVertexId, weighted) =>
      weighted.collect {
        case (topicIdx, weight) if weight > minWeight => Edge(docVertexId, topicIdx.toLong, weight)
      }
    }

    val graph = Graph(nodes, edges)
    toVisualFile(graph, visualizepath)
  } finally {
    // Release the cached topic distributions once the graph has been written.
    topics.unpersist()
  }
}