Python 正则表达式实践指南
基本匹配模式
常见字符串匹配与提取。
import re
def validate_email(email):
    """Return True if *email* is a syntactically plausible e-mail address.

    The check is purely lexical: a local part made of letters, digits and
    ``._%+-``, an ``@``, a domain made of letters, digits, dots and hyphens,
    and a final label of at least two ASCII letters.

    Args:
        email: The string to validate.

    Returns:
        bool: True when the whole string matches the pattern, else False.
    """
    pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    # fullmatch (rather than match + '$') rejects a trailing newline:
    # '$' also matches just before a final '\n', so "a@b.com\n" used to pass.
    return re.fullmatch(pattern, email) is not None
def find_phone_numbers(text):
    """Return all 10-digit phone numbers found in *text*.

    Recognised shapes are ``DDD-DDD-DDDD``, ``DDD.DDD.DDDD`` and
    ``DDDDDDDDDD`` (each separator is optional and may be ``-`` or ``.``).

    Args:
        text: The text to scan.

    Returns:
        list[str]: The matched numbers, in order of appearance, exactly as
        they are written in *text*.
    """
    # The lookarounds keep the pattern from matching a 10-digit slice of a
    # longer digit run (order IDs, card numbers, timestamps, ...).
    pattern = r'(?<!\d)\d{3}[-.]?\d{3}[-.]?\d{4}(?!\d)'
    return re.findall(pattern, text)
def extract_urls(text):
    """Return all ``http://`` / ``https://`` URLs found in *text*.

    The URL is matched through its path, port, query string and fragment,
    not only the host name. Punctuation that belongs to the surrounding
    sentence (a trailing ``.``, ``,``, ``;``, ``:``, ``!``, ``?``, ``'`` or
    an unbalanced ``)``) is trimmed from the end of each match.

    Args:
        text: The text to scan.

    Returns:
        list[str]: The URLs in order of appearance.
    """
    # Unreserved + reserved URL characters, or a percent-encoded byte.
    pattern = r"https?://(?:[-\w.~:/?#\[\]@!$&'()*+,;=]|%[\da-fA-F]{2})+"
    trailing = ".,;:!?'"
    urls = []
    for url in re.findall(pattern, text):
        url = url.rstrip(trailing)
        # A closing paren with no opening partner belongs to the prose,
        # e.g. "(see http://example.com/a)"; balanced ones are kept.
        while url.endswith(")") and url.count(")") > url.count("("):
            url = url[:-1].rstrip(trailing)
        # Skip matches that were nothing but a scheme plus punctuation.
        if url.partition("://")[2]:
            urls.append(url)
    return urls
字符串替换
文本替换与格式化。
def mask_sensitive_info(text):
    """Return *text* with card numbers, e-mail addresses and phone numbers masked.

    Card numbers (16 digits, optionally grouped by ``-`` or whitespace)
    become ``****-****-****-****``; e-mail addresses and 10-digit phone
    numbers become ``*****``.

    Args:
        text: The text to sanitise.

    Returns:
        str: A copy of *text* with every recognised item replaced.
    """
    # Order matters: the most specific pattern must run first. If the phone
    # pattern ran before the others it would consume the first ten digits of
    # an un-separated card number (leaving the last six visible) and the
    # numeric local part of an address such as 5551234567@example.com
    # (leaving the domain visible).
    rules = (
        (r'\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}', "****-****-****-****"),  # card
        (r'[\w\.-]+@[\w\.-]+\.\w+', "*****"),                                # email
        (r'\d{3}[-.]?\d{3}[-.]?\d{4}', "*****"),                             # phone
    )
    masked_text = text
    for pattern, replacement in rules:
        masked_text = re.sub(pattern, replacement, masked_text)
    return masked_text
模式分组
使用分组提取特定信息。
def parse_datetime(text):
    """Extract the first ``YYYY-MM-DD HH:MM:SS`` timestamp from *text*.

    Args:
        text: The text to search.

    Returns:
        dict | None: A dict with integer ``year``, ``month``, ``day``,
        ``hour``, ``minute`` and ``second`` entries for the first
        timestamp-shaped substring, or None when there is no such substring
        or its fields do not form a real date/time (e.g. month 13, hour 25,
        or 30 February).
    """
    # Local import keeps this function self-contained.
    from datetime import datetime

    pattern = r'(\d{4})-(\d{2})-(\d{2}) (\d{2}):(\d{2}):(\d{2})'
    match = re.search(pattern, text)
    if match is None:
        return None

    year, month, day, hour, minute, second = map(int, match.groups())
    try:
        # The constructor performs the calendar/range validation for us.
        datetime(year, month, day, hour, minute, second)
    except ValueError:
        return None

    return {
        "year": year,
        "month": month,
        "day": day,
        "hour": hour,
        "minute": minute,
        "second": second,
    }
文本分割
基于正则表达式分割文本。
def split_sentences(text):
    """Split *text* into sentences at ``.``, ``!`` or ``?`` followed by whitespace.

    Each sentence keeps its terminating punctuation. Leading and trailing
    whitespace is ignored, so no empty strings are produced.

    Args:
        text: The text to split.

    Returns:
        list[str]: The sentences in order; an empty list for empty or
        whitespace-only input.
    """
    stripped = text.strip()
    if not stripped:
        return []
    # Splitting on the whitespace *after* a terminator (lookbehind) keeps
    # the punctuation attached. Consuming it, as before, dropped it from
    # every sentence except the last one.
    return re.split(r'(?<=[.!?])\s+', stripped)
def tokenize_text(text):
    """Break *text* into lowercase word tokens.

    The text is split on runs of non-word characters; the empty pieces that
    the split produces at the edges are discarded and every remaining piece
    is lowercased.
    """
    fragments = re.split(r'\W+', text)
    # filter(None, ...) drops the empty strings left by leading/trailing
    # separators before the pieces are lowercased.
    return [fragment.lower() for fragment in filter(None, fragments)]
模式编译
编译常用正则表达式提高性能。
class TextValidator:
    """Validate e-mail addresses, phone numbers and URLs with precompiled patterns.

    The patterns are compiled once per instance and exposed as the
    ``email_pattern``, ``phone_pattern`` and ``url_pattern`` attributes.
    Every ``validate_*`` method requires the *entire* input to match.
    """

    def __init__(self):
        self.email_pattern = re.compile(r'^[\w\.-]+@[\w\.-]+\.\w+$')
        self.phone_pattern = re.compile(r'\d{3}[-.]?\d{3}[-.]?\d{4}')
        # Optional userinfo, host, optional port, then an optional
        # path/query/fragment that must not contain whitespace. Covering
        # the full URL is what allows validate_url to use fullmatch.
        self.url_pattern = re.compile(
            r'https?://(?:[^\s/?#@]+@)?(?:[-\w.]|(?:%[\da-fA-F]{2}))+'
            r'(?::\d+)?(?:[/?#]\S*)?')

    def validate_email(self, email):
        """Return True if *email* as a whole looks like an e-mail address."""
        # fullmatch: '$' alone would also accept a trailing newline.
        return self.email_pattern.fullmatch(email) is not None

    def validate_phone(self, phone):
        """Return True if *phone* as a whole is a 10-digit phone number."""
        # fullmatch: match() only anchors the start, so input such as
        # "555-123-4567 plus anything" used to validate.
        return self.phone_pattern.fullmatch(phone) is not None

    def validate_url(self, url):
        """Return True if *url* as a whole is an http(s) URL without whitespace."""
        return self.url_pattern.fullmatch(url) is not None
高级匹配
复杂模式匹配与提取。
def extract_html_tags(html):
    """Return ``(tag_name, inner_text)`` pairs for simple paired tags in *html*.

    This is a regex heuristic, not an HTML parser: each opening tag is
    paired with the nearest closing tag of the same name, so same-name
    nesting and void/self-closing elements are not handled. Matches do not
    overlap, so elements nested inside an already matched element are part
    of its inner text rather than separate results.

    Args:
        html: The markup to scan.

    Returns:
        list[tuple[str, str]]: One ``(name, content)`` tuple per match, with
        the name exactly as written in the opening tag. Content may span
        multiple lines.
    """
    # \b after the name stops the engine from backtracking to a prefix of
    # it: without it "<br><b>x</b>" pairs "<br>" (read as tag "b" with
    # attribute text "r") with "</b>".
    pattern = r'<(\w+)\b[^>]*>(.*?)</\1>'
    # IGNORECASE lets "<DIV>" pair with "</div>"; HTML tag names are
    # case-insensitive.
    return re.findall(pattern, html, re.DOTALL | re.IGNORECASE)
def find_words_context(text, word, context_words=5):
    """Find each occurrence of *word* together with its neighbouring words.

    Matching is case-insensitive and *word* is treated as literal text.
    Context never crosses a newline.

    Args:
        text: The text to search.
        word: The word (or literal phrase) to look for.
        context_words: Maximum number of words to include on each side.

    Returns:
        list[str]: One string per non-overlapping occurrence, consisting of
        up to *context_words* preceding words, the word itself, and up to
        *context_words* following words. Empty when *word* is empty or
        absent.

    Raises:
        ValueError: If *context_words* is negative.
    """
    if context_words < 0:
        raise ValueError("context_words must be non-negative")
    if not word:
        return []

    limit = int(context_words)
    # Literal regex braces must be doubled inside an f-string. The previous
    # single-brace form rendered the tuple repr "(0, {5})" into the pattern
    # instead of the quantifier "{0,5}".
    pattern = (
        rf'(?:\b\w+[^\w\n]+){{0,{limit}}}'       # up to `limit` words before
        rf'(?<!\w){re.escape(word)}(?!\w)'       # the word, as a whole token
        rf'(?:[^\w\n]+\w+\b){{0,{limit}}}'       # up to `limit` words after
    )
    return re.findall(pattern, text, re.IGNORECASE)