re模块使用案例


1. 提取

import re

ret = re.findall(正则表达式,被提取的字符串)
# 返回类型是列表
# 没有匹配返回空列表

2. 匹配

import re

ret = re.match(正则表达式,被匹配的字符串)
# 匹配成功返回 【class 're.Match'】对象(注意:re.match 只从字符串的开头开始匹配,开头不符合就算不成功)
# 匹配不成功,返回None
# 使用if判断
if ret: # 等价于 if ret is not None:(不能写成 if ret == True:,Match 对象与 True 比较的结果是 False)
    xxxxxxx

3. 替换

import re


ret = re.sub(正则表达式,替换成的字符串,被匹配的字符串)

练习:把下面的 HTML 保存为 index.html,然后用正则表达式提取其中的邮箱

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    <footer>
        <div>
            <div class="email">
                Email:kefu@CSDN.net
            </div>
            <div class="tel">
                手机号:400-660-0108
            </div>
        </div>
    </footer>
</body>
</html>
import re


# Read the sample page in one go; the file only needs to stay open for the read.
with open('index.html', 'r', encoding='utf-8') as f:
    html = f.read()
print(html)

# Non-greedy capture of whatever sits between the email <div> and the first </div>.
pattern_1 = '<div class="email">(.*?)</div>'
ret_1 = re.findall(pattern_1, html)
print(ret_1)
# 输出是空列表!
# 原因是 . 默认匹配除换行符 \n 以外的任意字符(\r 是可以匹配的),而这里 <div> 和 </div> 之间有 \n;可以像下面这样在匹配之前把换行符替换掉,也可以给 re.findall 传入 flags=re.S(即 re.DOTALL)让 . 也能匹配换行符
# 改成这个
import re


with open('index.html', 'r', encoding='utf-8') as f:
    # Drop every newline right after reading so the page becomes a single line.
    html = re.sub('\n', '', f.read())
print(html)

# Same non-greedy pattern as before, now applied to the one-line page text.
pattern_1 = '<div class="email">(.*?)</div>'
ret_1 = re.findall(pattern_1, html)
print(ret_1)

优化一下输出

import re


# Read the page, then remove the newlines so that `.` in the pattern can
# run across what used to be several lines.
with open('index.html', 'r', encoding='utf-8') as f:
    html = f.read()
html = re.sub('\n', '', html)
print(html)

# Non-greedy group: text between the email <div> and the first closing </div>.
pattern_1 = '<div class="email">(.*?)</div>'
ret_1 = re.findall(pattern_1, html)

# findall returns an empty list when nothing matches, so guard the index
# instead of letting ret_1[0] raise IndexError.
if ret_1:
    # strip() removes the indentation whitespace captured around the address.
    print(ret_1[0].strip())
else:
    print('no <div class="email"> block found')

# The r prefix makes a raw string, so backslashes reach the regex engine unescaped.
# Rule: the first character is a letter; the rest are letters, digits or
# underscores; 6-16 characters in total.
# \Z (instead of $) anchors at the very end of the string: $ would also match
# just before a trailing newline and let 'k123456\n' pass validation.
password_pattern = r'^[a-zA-Z][a-zA-Z0-9_]{5,15}\Z'

pass1 = '1234567'  # starts with a digit -> no match
pass2 = 'k123456'  # valid -> match
pass3 = 'k123'     # too short -> no match

print(re.match(password_pattern, pass1))
print(re.match(password_pattern, pass2))
print(re.match(password_pattern, pass3))
# re.match() returns a Match object (truthy) or None (falsy), so the result
# can be tested directly with `if`.

  • re.findall
  • re.match
  • re.sub

是三种最常用的正则表达式方法,在数据清洗和爬虫中非常常用