utf-8
/**
 * Split a string into a list of tokens, one per character, with runs of
 * consecutive digits merged into a single number token.
 *
 * The length of each character is taken from the number of leading 1-bits
 * of its first byte (the UTF-8 lead-byte convention); a byte with no leading
 * 1-bit is a one-byte character. The input is presumably UTF-8 encoded --
 * no validation of continuation bytes is performed.
 *
 * Rules applied to each character, in order:
 *  - if it is a digit and the most recently stored token is a number, it is
 *    appended to that token;
 *  - otherwise, if an identical token is already stored, it is dropped;
 *  - otherwise it is stored as a new token.
 *
 * @param t_str  text to tokenize.
 * @return tokens in order of first appearance.
 */
vector<string> Similarity::s2v(string t_str)
{
    // The backslash must be escaped in the C++ literal: "\d" is not a valid
    // escape sequence and compilers degrade it to a plain 'd'.
    static const boost::regex re("\\d+");

    vector<string> wanted;
    string::size_type i = 0;
    while (i < t_str.length()) {
        const char c = t_str[i];
        const unsigned short b = 0x80;

        // Count the leading 1-bits of the first byte to get the byte length
        // of the character; the shift reaches zero after 8 steps, which
        // bounds the loop even for a 0xFF byte.
        int head = 0;
        while ((c & (b >> head)) != 0) {
            head += 1;
        }
        if (head == 0) head = 1;

        // substr clamps the length, so a truncated trailing character is safe.
        const string candiate = t_str.substr(i, head);

        // Advance past the whole character right away so that every early
        // `continue` below still skips its continuation bytes.
        i += head;

        if (!wanted.empty()) {
            // A digit directly following a number token extends that number.
            if (boost::regex_match(candiate, re) && boost::regex_match(wanted.back(), re)) {
                wanted.back() += candiate;
                continue;
            }

            // Drop characters that have already been recorded.
            bool repeat = false;
            for (const auto& item : wanted) {
                if (item == candiate) {
                    repeat = true;
                    break;
                }
            }
            if (repeat)
                continue;
        }
        wanted.push_back(candiate);
    }
    return wanted;
}