正则表达式编程

编程中经常会出现对字符串的处理，自己编写处理程序通常是低效的，也是一种重复造轮子的行为，如果掌握使用正则表达式处理字符串，可以极大提高编程效率。

正则表达式的语法就不作说明，这里主要类比在c++和python中如何使用正则表达式进行编程。

match 匹配

match 函数通常需要两个参数，pattern 和 s，其中 pattern 为正则表达式，s 表示待匹配字符串，返回值在c++和python中并不相同。下面是两个功能类似的例子：

c++ 版

#include <iostream>
#include <regex>
#include <string>

using namespace std;

// Reports, for each sample file name, whether the ENTIRE name consists of
// one or more lowercase letters followed by ".txt".
// regex_match yields a bool, which is streamed as 1 (match) or 0 (no match).
int main(int argc, char* argv[]){
    const regex txt_regex("[a-z]+\\.txt");
    const string fnames[] = {"foo.txt", "bar.txt", "test.txtx", "0a.txt", "AAA.txt"};
    for(const string& fname: fnames){
        const bool matched = regex_match(fname, txt_regex);
        cout << fname << ": " << matched << '\n';
    }
    return 0;
}

输出为

foo.txt: 1
bar.txt: 1
test.txtx: 0
0a.txt: 0
AAA.txt: 0

python 版

import re

# Sample file names to test against the pattern.
fnames = ["foo.txt", "bar.txt", "test.txtx", "0a.txt", "AAA.txt"]

# One or more lowercase letters followed by a literal ".txt".
# The raw string r"[a-z]+\.txt" is the same pattern as "[a-z]+\\.txt".
txt_regex = re.compile(r"[a-z]+\.txt")
for fname in fnames:
    # match() only anchors at the start of the string, so "test.txtx" still
    # matches on its "test.txt" prefix; it returns None when nothing matches.
    print(f"{fname}: {txt_regex.match(fname)}")

输出为：

foo.txt: <_sre.SRE_Match object; span=(0, 7), match='foo.txt'>
bar.txt: <_sre.SRE_Match object; span=(0, 7), match='bar.txt'>
test.txtx: <_sre.SRE_Match object; span=(0, 8), match='test.txt'>
0a.txt: None
AAA.txt: None

对比 c++ 和 python 中 match 的用法，可以看出c++返回的是一个 bool 类型的值，而python如果能够匹配，就返回匹配实例，否则返回None。而且c++默认是对整个字符串匹配，而python只要前一部分能够匹配就认为匹配成功（python 中如果需要整个字符串匹配，可以使用 fullmatch）。

c++ 匹配到的结果可以通过传入类型为 smatch 的参数获取，例如传入的参数名为 sm，则 sm.length() 返回匹配的字符串长度，sm.size() 为捕获的变量数，其中第一个为匹配到的整个字符串，之后为正则表达式中用()捕捉到的内容，捕捉到的内容可以用类似 sm[1] 获取，也可以用 sm.format("... $1 ...") 直接用于格式化。

c++ 的 regex_match 的其他用法（参考： <https://blog.csdn.net/mycwq/article/details/18838151>）

#include <iostream>
#include <regex>
#include <string>

using namespace std;

// Demonstrates three regex_match overloads on the string "xsubject":
// whole string, iterator range, and iterator range with captured results.
int main(){
    string s = "xsubject";
    regex e("(sub)(.*)");
    // Whole-string match: the leading 'x' is not covered by the pattern,
    // so this branch prints nothing.
    if(regex_match(s, e)){
        cout << "string object matched" << endl;
    }
    // Match a sub-range: skipping the first character leaves "subject".
    if(regex_match(s.begin()+1, s.end(), e)){
        cout << "range matched" << endl;
    }
    // Retrieve the match results. smatch holds string::const_iterator,
    // hence cbegin()/cend() here.
    smatch sm;
    if(regex_match(s.cbegin()+1, s.cend(), sm, e)){
        cout << sm.str() << endl;
        // sm.size() is unsigned, so the index uses the matching size_type
        // to avoid a signed/unsigned comparison.
        // sm[0] is the whole match; sm[1..] are the capture groups.
        for(smatch::size_type i=0; i<sm.size(); ++i){
            cout << "\t" << i << ": " << sm[i] << endl;
        }
    }
    return 0;
}

输出结果为：

range matched
subject
        0: subject
        1: sub
        2: ject

search 搜索

match 和 search 的主要区别：match 要求整个字符串匹配（python 的 match 则是从字符串开头开始匹配），而 search 是在字符串中搜索能够匹配的子串。

#include <iostream>
#include <regex>
#include <string>
using namespace std;

// Finds every word beginning with "sub" in the sample sentence and prints
// the whole match, its capture groups, and a formatted summary line.
int main(){
    const regex e("\\b(sub)([^ ]*)");
    string s("this subject has a submarine as subsequence");
    smatch m;
    while(regex_search(s, m, e)){
        // m iterates as: whole match, then each capture group in order.
        for(const auto& group: m){
            cout << group.str() << ", ";
        }
        // Equivalent to:
        // cout << " --> ([^ ]*) match " << m[2] << endl;
        cout << m.format(" --> ([^ ]*) match $2") << endl;
        // Continue searching in the text after the current match.
        s = m.suffix().str();
    }
    return 0;
}

输出为：

subject, sub, ject,  --> ([^ ]*) match ject
submarine, sub, marine,  --> ([^ ]*) match marine
subsequence, sub, sequence,  --> ([^ ]*) match sequence

等价的 python 代码为：

import re

s = "this subject has a submarine as subsequence"
e = re.compile("\\b(sub)([^ ]*)")

# Repeatedly search from the end of the previous match until nothing is found.
pos = 0
while True:
    m = e.search(s, pos)
    if m is None:
        break
    # Whole match first, then every capture group, each followed by ", ".
    print(*((m.group(),) + m.groups()), sep=", ", end=", ")
    print("--> ([^ ]*) match", m[2])
    pos = m.end()

replace 替换

#include <iostream>
#include <regex>
#include <string>
using namespace std;

// Prints the sample sentence, then the same sentence with every
// "th" + non-whitespace run replaced by the bracketed run.
int main(){
    const string s = "This is string, that is right;";
    // icase lets the pattern also match the capitalised "Th" in "This";
    // group 1 is the non-whitespace text following "th".
    const regex e("th([^\\s]+)", regex::icase);
    cout << s << endl
         << regex_replace(s, e, "[$1]") << endl;
}

输出为：

This is string, that is right;
[is] is string, [at] is right;

等价的 python 代码为：

import re

s = "This is string, that is right;"
# Case-insensitive: group 1 is the non-whitespace text following "th".
e = re.compile("th([^\\s]+)", re.IGNORECASE)


def _bracket(mt):
    """Return the first capture group wrapped in square brackets."""
    return "[%s]" % mt[1]


print(s)
print(e.sub(_bracket, s))

总结

下面用两段代码对正则表达式在c++和python中进行总结：

#include <iostream>
#include <regex>
#include <string>
using namespace std;

// Summary demo: one match, one search loop and one replace, all using the
// same pattern ("th" at a word boundary followed by non-whitespace).
int main(){
    const string s("this is me, that\tis you!");
    const regex ex("\\bth([^\\s]+)");
    smatch mt;
    // Match: only the first three characters ("thi") are considered.
    const auto head_end = s.cbegin()+3;
    if(regex_match(s.cbegin(), head_end, mt, ex)){
        cout << mt.str() << endl;
    }
    // Search: print group 1 of every hit, continuing after each match.
    for(string rest = s; regex_search(rest, mt, ex); rest = mt.suffix()){
        cout << mt[1] << endl;
    }
    // Replace: every hit is reduced to its first capture group.
    cout << regex_replace(s, ex, "$1") << endl;
}

python

import re

s = "this is me, that\tis you!"
ex = re.compile("\\bth([^\\s]+)")

# Match: only the slice s[0:3] ("thi") is considered.
mt = ex.match(s, 0, 3)
print(mt.group())

# Search: print group 1 of every hit, resuming after the previous match.
pos = 0
while True:
    mt = ex.search(s, pos)
    if mt is None:
        break
    print(mt[1])
    pos = mt.end()


def _first_group(m):
    """Return the first capture group of a match."""
    return m[1]


# Replace: every hit is reduced to its first capture group.
print(ex.sub(_first_group, s))

上述两段代码的输出都为：

thi
is
at
is is me, at    is you!

练习

下面是网易的一道笔试题，用c++完成，题目要求为解析简易版的markdown文档，文档的要求为：

以#、##、###、... 开头的行，去掉最前面的#；
以 +\t 开头的行，去掉最前面的 +\t；
将超链接替换为其内容，例如将 abc[xxx](url.com) 替换为 abcxxx；

c++实现代码为：

#include <iostream>
#include <sstream>
#include <regex>
#include <string>
using namespace std;


// Strips a simplified subset of Markdown markup from `s`, line by line:
//   - a leading run of '#' (heading marker) is removed;
//   - a leading "+\t" (list marker) is removed;
//   - "[text](url)" hyperlinks are replaced by "text".
// Returns the processed lines, each terminated by '\n' (a final newline is
// emitted even when the last input line had none).
string decode_markdown(string s){
    // Constructing a std::regex is expensive, so each pattern is compiled
    // once and reused across calls instead of being rebuilt every time.
    // #, ##, ### ... mark a heading
    static const regex e_title("^(#+)");
    // +\t marks a list item
    static const regex e_list("^\\+\\t");
    // [text](url) marks a hyperlink; group 1 captures the link text
    static const regex e_href("\\[([^\\]]*)\\]\\([^\\)]*\\)");

    string ret;
    // Every replacement only shrinks a line, so the input size (plus one
    // possible added newline) is an upper bound on the output size.
    ret.reserve(s.size() + 1);
    string line;
    istringstream sf(s);
    while(getline(sf, line)){
        line = regex_replace(line, e_title, "");
        line = regex_replace(line, e_list, "");
        // Append in place rather than building a temporary `ns + "\n"`.
        ret += regex_replace(line, e_href, "$1");
        ret += '\n';
    }

    return ret;
}

// Runs decode_markdown on a sample document containing headings, list
// items and hyperlinks, and prints the result.
int main(){
    // Adjacent string literals are concatenated by the compiler.
    const string s =
        "#hello\n##world\n+\tlist1\n+\tlist "
        "[this is href](abc.com) suffix\nsome thing "
        "[another ref](aaa.com) else\n";
    const string decoded = decode_markdown(s);
    cout << decoded << endl;
}

输出为：

hello
world
list1
list this is href suffix
some thing another ref else

对应的python代码为：

import re

# Sample document: two headings, two list items (one containing a link)
# and a plain line containing a link.
s = ("#hello\n##world\n+\tlist1\n+\tlist " +
        "[this is href](abc.com) suffix\nsome thing " +
        "[another ref](aaa.com) else\n")
result = ""
# A leading run of '#' marks a heading.
e_title = re.compile("^(#+)")
# "+\t" marks a list item only at the start of a line, so the pattern is
# anchored with '^'; unanchored it would also strip "+\t" in mid-line.
e_list = re.compile("^\\+\\t")
# [text](url) is a hyperlink; group 1 captures the link text.
e_href = re.compile("\\[([^\\]]*)\\]\\([^\\)]*\\)")
# splitlines() does not yield a trailing empty string for the final '\n'
# (split('\n') does), so no spurious blank line is appended to the result.
for line in s.splitlines():
    ns = e_title.sub("", line)
    ns = e_list.sub("", ns)
    ns = e_href.sub(lambda mt: mt[1], ns)
    result += ns + '\n'
print(result)