跳到主要内容

Python 正则表达式实践指南

基本匹配模式

常见字符串匹配与提取。

import re

def validate_email(email):
    """Return True if *email* looks like a syntactically valid address.

    The check is purely structural: a local part, an ``@``, a domain and
    an alphabetic top-level label of at least two letters.

    Args:
        email: The candidate address string.

    Returns:
        bool: True when the whole string matches the pattern.
    """
    pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    # fullmatch instead of match: with match(), ``$`` also succeeds just
    # before a trailing newline, so "user@example.com\n" was accepted.
    return re.fullmatch(pattern, email) is not None

def find_phone_numbers(text):
    """Return all 10-digit phone numbers found in *text*.

    Accepted shapes are ``DDD-DDD-DDDD``, ``DDD.DDD.DDDD`` and
    ``DDDDDDDDDD`` (each separator is optional and may be ``-`` or ``.``).

    Args:
        text: The string to scan.

    Returns:
        list[str]: The matched numbers, in order of appearance.
    """
    # The digit lookarounds stop the pattern from matching a 10-digit
    # slice of a longer digit run (e.g. a 16-digit card number or an ID).
    pattern = r'(?<!\d)\d{3}[-.]?\d{3}[-.]?\d{4}(?!\d)'
    return re.findall(pattern, text)

def extract_urls(text):
    """Return all http(s) URLs found in *text*.

    The match covers the host plus an optional port, path, query string
    and fragment. Commas, semicolons, quotes and parentheses end a URL,
    and sentence punctuation trailing the URL is not included.

    Args:
        text: The string to scan.

    Returns:
        list[str]: The URLs, in order of appearance.
    """
    # The previous character class lacked "/", ":", "?", "#", "=" and "&",
    # so every URL was cut down to its scheme and host.
    pattern = r'https?://(?:[-\w.~:/?#@&=+$*]|%[\da-fA-F]{2})+'
    # ".", ":" and "?" are valid inside a URL but at the very end they are
    # almost always punctuation of the surrounding sentence.
    return [url.rstrip('.:?') for url in re.findall(pattern, text)]

字符串替换

文本替换与格式化。

def mask_sensitive_info(text):
    """Return *text* with card numbers, e-mail addresses and phone numbers masked.

    Card numbers are replaced with ``****-****-****-****``; e-mail
    addresses and phone numbers are replaced with ``*****``.

    Args:
        text: The string to sanitize.

    Returns:
        str: A copy of *text* with every match replaced by its mask.
    """
    # Order matters: rules run from most to least specific. If the phone
    # rule ran first it would consume the first ten digits of a contiguous
    # 16-digit card number ("*****345678") and leak the rest, and it would
    # mask only the local part of an all-digit e-mail address.
    rules = (
        (r'\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}', "****-****-****-****"),
        (r'[\w\.-]+@[\w\.-]+\.\w+', "*****"),
        (r'\d{3}[-.]?\d{3}[-.]?\d{4}', "*****"),
    )

    masked_text = text
    for pattern, replacement in rules:
        masked_text = re.sub(pattern, replacement, masked_text)
    return masked_text

模式分组

使用分组提取特定信息。

def parse_datetime(text):
    """Extract the first ``YYYY-MM-DD HH:MM:SS`` timestamp from *text*.

    Args:
        text: The string to search.

    Returns:
        dict | None: A dict with integer ``year``, ``month``, ``day``,
        ``hour``, ``minute`` and ``second`` entries, or None when no
        timestamp is present. Field ranges are not validated.
    """
    found = re.search(
        r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})', text)
    if found is None:
        return None

    # The capture groups appear in exactly this order in the pattern.
    field_names = ("year", "month", "day", "hour", "minute", "second")
    return {name: int(raw) for name, raw in zip(field_names, found.groups())}

文本分割

基于正则表达式分割文本。

def split_sentences(text):
    """Split *text* into sentences at ``.``, ``!`` or ``?`` followed by whitespace.

    The terminating punctuation and the whitespace after it are consumed
    by the split. A final sentence that is not followed by whitespace
    keeps its punctuation.

    Args:
        text: The string to split.

    Returns:
        list[str]: The non-empty sentence fragments, in order.
    """
    pattern = r'[.!?]+\s+'
    # re.split yields "" for empty input and after a trailing
    # "punctuation + whitespace" run; those are not sentences, so drop them.
    return [part for part in re.split(pattern, text) if part.strip()]

def tokenize_text(text):
    """Break *text* into lower-cased word tokens.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: Runs of word characters, lower-cased, in order of
        appearance; empty fragments from the split are discarded.
    """
    fragments = re.split(r'\W+', text)
    # filter(None, ...) drops the empty strings re.split produces at the
    # edges when the text starts or ends with a non-word character.
    return list(map(str.lower, filter(None, fragments)))

模式编译

预编译常用的正则表达式，以便重复使用并提高性能。

class TextValidator:
    """Validate e-mail addresses, phone numbers and URLs with precompiled patterns.

    Each ``validate_*`` method requires the pattern to cover the whole
    input string, so a valid value followed by extra text is rejected.
    """

    def __init__(self):
        """Compile the patterns once so every call reuses them."""
        self.email_pattern = re.compile(r'^[\w\.-]+@[\w\.-]+\.\w+$')
        self.phone_pattern = re.compile(r'\d{3}[-.]?\d{3}[-.]?\d{4}')
        # Host, then an optional port and an optional path/query/fragment.
        # Whitespace is not allowed anywhere in the URL.
        self.url_pattern = re.compile(
            r'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
            r'(?::\d+)?(?:[/?#]\S*)?')

    def validate_email(self, email):
        """Return True if *email* is, in its entirety, a valid address."""
        # fullmatch: match() let ``$`` succeed before a trailing newline.
        return self.email_pattern.fullmatch(email) is not None

    def validate_phone(self, phone):
        """Return True if *phone* is exactly one 10-digit phone number."""
        # fullmatch: match() only anchored the start, so "555-123-4567xyz"
        # and longer digit runs were accepted.
        return self.phone_pattern.fullmatch(phone) is not None

    def validate_url(self, url):
        """Return True if *url* is, in its entirety, an http(s) URL."""
        # fullmatch: match() accepted any string that merely started with
        # a URL-looking prefix.
        return self.url_pattern.fullmatch(url) is not None

高级匹配

复杂模式匹配与提取。

def extract_html_tags(html):
    """Find simple ``<tag ...>content</tag>`` pairs in *html*.

    Args:
        html: The markup string to scan.

    Returns:
        list[tuple[str, str]]: One ``(tag_name, inner_content)`` tuple per
        match, in order of appearance. Content may span multiple lines.
    """
    # \1 requires the closing tag to repeat the captured opening tag name;
    # DOTALL lets the lazy content group cross newlines.
    tag_re = re.compile(r'<(\w+)[^>]*>(.*?)</\1>', re.DOTALL)
    return [found.groups() for found in tag_re.finditer(html)]

def find_words_context(text, word, context_words=5):
    """Find each occurrence of *word* together with its neighbouring words.

    Matching is case-insensitive and never crosses a newline. Successive
    matches do not overlap.

    Args:
        text: The string to search.
        word: The literal word to look for (regex metacharacters are
            treated as plain text).
        context_words: Maximum number of words to include on each side.

    Returns:
        list[str]: One snippet per occurrence, containing up to
        *context_words* words before and after *word*.

    Raises:
        ValueError: If *context_words* is negative.
    """
    if context_words < 0:
        raise ValueError("context_words must be non-negative")

    # Literal regex braces must be doubled inside an f-string; the former
    # ``{0,{context_words}}`` was evaluated as the Python tuple (0, {5}).
    before = fr'(?:\b\w+[^\w\n]+){{0,{context_words}}}'
    after = fr'(?:[^\w\n]+\w+){{0,{context_words}}}'
    # Lookarounds instead of \b so that a word starting or ending with a
    # non-word character (e.g. "c++") can still be matched.
    target = fr'(?<!\w){re.escape(word)}(?!\w)'
    return re.findall(before + target + after, text, re.IGNORECASE)