1. 文本中url地址抽取
import re  # every snippet in these notes calls re.findall; import before first use

# Extract URLs from free text.
# The pattern has two alternatives, each in its own capture group, so
# re.findall() returns (with_scheme, without_scheme) tuples in which exactly
# one member is non-empty for every match:
#   group 1: "http://" or "https://" followed by URL characters or %XX escapes
#   group 2: scheme-less text shaped like "name.domain/path"
# Inside "[$-_@.&+]", "$-_" is a character RANGE (0x24-0x5F), not three literals.
# NOTE(review): the "." right after "[a-zA-Z]+" in group 2 is unescaped, so it
# matches any character; left unchanged because escaping it would alter which
# inputs match (e.g. a bare "baidu.com" currently matches via group 2).
# Raw string: "\w", "\." and "\/" are regex escapes, not Python string escapes.
regex = r"(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*,]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)|([a-zA-Z]+.\w+\.+[a-zA-Z0-9\/_]+)"
text = '百度的网站地址是:http://baidu.com'
url = re.findall(regex, text)
print(url)
"""
[('http://baidu.com', '')]
"""
2. Email 地址抽取
# Extract e-mail addresses from free text.
# Shape: local part (letters, digits, "_", ".", "+", "-"), "@", a first domain
# label, then one or more ".label" groups. Each label after a dot must be
# non-empty, so a sentence-ending "." is not swallowed into the address and
# consecutive dots in the domain are rejected.
# The inner group is non-capturing so findall() still returns plain strings.
# Raw string: "\." is a regex escape, not a Python string escape.
regex = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+)"
text = "我的邮箱是test@163.com"
email = re.findall(regex, text)
print(email)
"""
['test@163.com']
"""
3. 身份证号码抽取
# Extract 18-character ID numbers: 17 digits followed by a digit or X/x.
# - "[\dXx]" lists the allowed final characters; "|" inside a character class
#   is a literal pipe, not alternation, so it must not appear there.
# - The lookarounds require that the match is not embedded in a longer run of
#   digits, so a 20-digit number does not yield a bogus 18-digit "ID".
# Raw string: "\d" is a regex escape, not a Python string escape.
regex = r"(?<!\d)\d{17}[\dXx](?!\d)"
text = "我的身份证号码是352221199604050372"
idnumber = re.findall(regex, text)
print(idnumber)
"""
['352221199604050372']
"""
4. ip地址抽取
# Extract dotted-quad IPv4 addresses from free text.
# One octet is "25[0-5]|2[0-4]\d|[01]?\d?\d", i.e. 0-255. Order matters: regex
# alternation takes the first alternative that succeeds, so the three-digit
# forms must come first or a final octet such as "255" is cut short to "25".
# "[01]" lists the optional leading digit; a comma inside a character class is
# a literal comma, so it must not appear there.
# The lookarounds stop the match from starting or ending inside a longer run
# of digits (e.g. "1.1.1.256" yields nothing rather than "1.1.1.25").
# All groups are non-capturing so findall() returns whole addresses.
regex = r"(?<!\d)(?:(?:25[0-5]|2[0-4]\d|[01]?\d?\d)\.){3}(?:25[0-5]|2[0-4]\d|[01]?\d?\d)(?!\d)"
text = '我的ip是127.0.0.1'
ip = re.findall(regex, text)
print(ip)
"""
['127.0.0.1']
"""
5. 手机号码抽取
# Extract 11-digit mobile numbers: a leading "1", a second digit 3-9, then
# nine more digits.
# - The second digit accepts the whole 3-9 range so that 16x/19x numbers are
#   found as well as the older 13x/14x/15x/17x/18x ones.
# - The lookarounds require that the match is not embedded in a longer run of
#   digits; without them an 18-digit ID number could yield a false "phone
#   number" from its middle.
# Raw string: "\d" is a regex escape, not a Python string escape.
regex = r"(?<!\d)1[3-9]\d{9}(?!\d)"
text = "我的手机号是18896827613"
phoneNumber = re.findall(regex, text)
print(phoneNumber)
"""
['18896827613']
"""
6. 中文信息抽取
import re

# Sample sentence mixing Chinese characters with Latin-letter words.
text = '我是wo shi 中国人 zhong guo ren'
# One match per character in the \u4e00-\u9fa5 range (inside the CJK Unified
# Ideographs block); everything else in the text is skipped.
regex = '[\u4e00-\u9fa5]'
ch_character = re.compile(regex).findall(text)
print(ch_character)
"""
['我', '是', '中', '国', '人']
"""
7. 英文信息抽取
# Collect every ASCII letter in the sample sentence, one list entry per letter.
regex = '[a-zA-Z]'
text = '我是wo shi 中国人 zhong guo ren'
# The pattern has no groups, so each match's full text is the letter itself.
en_character = [hit.group() for hit in re.finditer(regex, text)]
print(en_character)
"""
['w', 'o', 's', 'h', 'i', 'z', 'h', 'o', 'n', 'g', 'g', 'u', 'o', 'r', 'e', 'n']
"""
8. 数字抽取
# Collect every ASCII digit in the sample text, one list entry per digit
# (so "2019" comes back as four separate characters, not one number).
regex = '[0-9]'
text = '北京2019口号富强'
number = re.findall(pattern=regex, string=text)
print(number)
"""
['2', '0', '1', '9']
"""