/// Splits `text` into the tokens reported by `CFStringTokenizer` using
/// `kCFStringTokenizerUnitWordBoundary` and the current locale.
///
/// - Parameter text: The string to tokenize.
/// - Returns: The tokens in the order the tokenizer reports them. Empty when
///   the tokenizer cannot be created or reports no token.
func wordBoundary(_ text: String) -> [String] {
    // CFString ranges are expressed in UTF-16 code units, so both the range
    // handed to the tokenizer and the ranges it returns must be interpreted
    // through the UTF-16 view rather than as Character counts/offsets.
    let utf16 = text.utf16
    guard let tokenizer = CFStringTokenizerCreate(
        kCFAllocatorDefault,
        text as CFString,
        CFRangeMake(0, utf16.count),
        kCFStringTokenizerUnitWordBoundary,
        CFLocaleCopyCurrent()
    ) else {
        return []
    }

    var boundaries: [String] = []
    while true {
        CFStringTokenizerAdvanceToNextToken(tokenizer)
        let range = CFStringTokenizerGetCurrentTokenRange(tokenizer)
        // A zero-length current range ends the scan (same stop condition as before).
        guard range.length > 0 else { break }

        let lower = utf16.index(utf16.startIndex, offsetBy: range.location)
        let upper = utf16.index(lower, offsetBy: range.length)
        boundaries.append(String(text[lower..<upper]))
    }
    return boundaries
}
/// Splits `text` into sentences and tokenizes each sentence with `wordBoundary(_:)`.
///
/// Sentences are found by `CFStringTokenizer` using
/// `kCFStringTokenizerUnitSentence` and the current locale.
///
/// - Parameter text: The string to split.
/// - Returns: One single-entry dictionary per sentence, in the order the
///   tokenizer reports them, mapping the sentence text to the result of
///   `wordBoundary(_:)` for that sentence. Empty when the tokenizer cannot be
///   created or reports no sentence.
func wordBoundariesWithSentences(_ text: String) -> [Dictionary<String, Array<String>>] {
    // CFString ranges are expressed in UTF-16 code units, so both the range
    // handed to the tokenizer and the ranges it returns must be interpreted
    // through the UTF-16 view rather than as Character counts/offsets.
    let utf16 = text.utf16
    guard let tokenizer = CFStringTokenizerCreate(
        kCFAllocatorDefault,
        text as CFString,
        CFRangeMake(0, utf16.count),
        kCFStringTokenizerUnitSentence,
        CFLocaleCopyCurrent()
    ) else {
        return []
    }

    var result: [[String: [String]]] = []
    while true {
        CFStringTokenizerAdvanceToNextToken(tokenizer)
        let range = CFStringTokenizerGetCurrentTokenRange(tokenizer)
        // A zero-length current range ends the scan (same stop condition as before).
        guard range.length > 0 else { break }

        let lower = utf16.index(utf16.startIndex, offsetBy: range.location)
        let upper = utf16.index(lower, offsetBy: range.length)
        let sentence = String(text[lower..<upper])
        result.append([sentence: wordBoundary(sentence)])
    }
    return result
}
iOS 分词
©著作权归作者所有,转载或内容合作请联系作者
- 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
- 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
- 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
推荐阅读更多精彩内容
- 引言 技术无关, 可跳过. 最近在写一个独立项目,基于斗鱼直播平台的开放接口, 对斗鱼的弹幕进行实时的分析,最近抽...
- 背景 在垃圾短信过滤应用 SMSFilters 中,需要使用 Jieba 分词库来对短信进行分词,然后使用 TF-...