#文本清洗与规范化:停用词过滤、正则表达式与词干提取
#1. 为什么需要文本清洗?
#1.1 脏数据的问题
原始网页文本:
"<p>【推荐】🔥 限时优惠!仅剩 3 天!!点击👉 https://example.com 🎁</p>"
问题:
├── HTML 标签:<p>、<a>
├── 特殊字符:🔥、👉、🎁
├── 乱码符号:【】、!!
├── URL 链接:https://...
├── 表情符号:emoji
└── 重复标点:!!
清洗后:["推荐", "限时", "优惠", "仅剩", "3", "天", "点击"]
#2. 基础清洗操作
#2.1 去除 HTML 标签
import re
from html import unescape
def remove_html(text):
    """Strip HTML tags from *text* and decode HTML entities.

    Fixes over the original:
    - drops the hard in-function dependency on the third-party ``lxml``
      package (the old code ran BOTH the regex and lxml on the same text,
      and ``lxml.html.fromstring`` raises on empty input);
    - actually uses ``html.unescape`` (imported at the top of the file) so
      entities such as ``&amp;`` are decoded, consistent with
      ``TextPreprocessor.clean_html`` below.

    A more robust, parser-based alternative remains
    ``lxml.html.fromstring(text).text_content()`` when lxml is available.
    """
    # Remove anything that looks like a tag: '<' ... '>'.
    text = re.sub(r'<[^>]+>', '', text)
    # Decode character entities left behind (&amp; -> &, &lt; -> <, ...).
    return unescape(text)
# Demo: strip tags from a small HTML fragment.
text = "<p>这是一段<strong>HTML</strong>文本</p>"
print(remove_html(text))
# Output: 这是一段HTML文本
#2.2 Removing URLs and e-mail addresses
def remove_urls_and_emails(text):
    """Remove URLs and e-mail addresses from *text*.

    Bug fix: the previous e-mail pattern ``r'\\S+@\\S+\\.\\S+'`` was far too
    greedy — ``\\S`` also matches CJK characters and full-width punctuation,
    so for input like ``"联系邮箱:contact@example.com,官网:"`` the ENTIRE
    line was deleted instead of just the address. The character classes
    below match only the characters legal in common e-mail addresses.
    """
    # Scheme-prefixed URLs first, then bare www. forms.
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'www\.\S+', '', text)
    # E-mail addresses: local-part @ domain . TLD (ASCII subset only).
    text = re.sub(r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}', '', text)
    return text
# Demo: scrub a line mixing an e-mail address and a URL.
text = "联系邮箱:contact@example.com,官网:https://www.example.com"
print(remove_urls_and_emails(text))
# Intended output: 联系邮箱:,官网:
# NOTE(review): with the greedy r'\S+@\S+\.\S+' e-mail pattern above, \S also
# matches the surrounding CJK text, so the whole line may be removed — verify.
#2.3 Removing special characters and punctuation
def remove_special_chars(text, keep_spaces=True):
    """Keep only CJK (BMP range U+4E00–U+9FA5) characters, ASCII letters,
    digits and — when *keep_spaces* is true — whitespace; drop the rest.
    """
    allowed = r'\u4e00-\u9fa5a-zA-Z0-9'
    if keep_spaces:
        allowed += r'\s'
    # Negated character class: delete everything outside the allowed set.
    return re.sub('[^' + allowed + ']', '', text)
# Demo: strip emoji, punctuation and symbols.
text = "限时优惠🔥!仅剩3天!!👉 https://t.cn/xxx 🎁"
print(remove_special_chars(text))
# Claimed output: 限时优惠仅剩3天
# NOTE(review): the letters of the URL ("httpstcnxxx") survive this filter —
# remove URLs before stripping special characters, or verify the claim.
#2.4 Full-width to half-width conversion
def fullwidth_to_halfwidth(text):
    """Convert full-width (zenkaku) characters to their half-width forms.

    Maps the ideographic space U+3000 to an ASCII space and shifts the
    full-width ASCII variants U+FF01–U+FF5E down by 0xFEE0 onto ASCII.
    Characters outside those ranges pass through unchanged.
    """
    converted = []
    for ch in text:
        code = ord(ch)
        if code == 0x3000:               # ideographic (full-width) space
            code = 0x20
        elif 0xFF01 <= code <= 0xFF5E:   # full-width ASCII variants
            code -= 0xFEE0
        converted.append(chr(code))
    return "".join(converted)
print(fullwidth_to_halfwidth("HELLO 123"))
# Output: HELLO 123 (full-width letters/digits mapped to ASCII)
#2.5 Unicode normalization
import unicodedata
def normalize_unicode(text):
    """Normalize *text* to Unicode NFC (composed) form.

    Ensures visually identical strings — e.g. 'e' + combining accent
    versus the precomposed 'é' — compare equal after normalization.
    """
    composed = unicodedata.normalize("NFC", text)
    return composed
# 消除重复字符
def remove_duplicate_chars(text, max_repeat=2):
    """Collapse any run of one character longer than *max_repeat*
    repetitions down to exactly *max_repeat* repetitions.
    """
    # (.)\1{k,} matches a char followed by >= k more copies of itself.
    run = re.compile('(.)' + r'\1' + '{%d,}' % max_repeat)
    replacement = r'\1' * max_repeat
    return run.sub(replacement, text)
print(remove_duplicate_chars("我我我我太开心了"))
# Output: 我我太开心了 (runs longer than max_repeat=2 are collapsed)
#3. Stopwords
#3.1 中文停用词表
# Common Chinese stopwords: function words, pronouns and interjections.
# NOTE(review): the original header labeled these "punctuation", which they
# are not; also '的', '了', '着' appear twice below — duplicates are
# harmless in a set literal but could be removed.
STOPWORDS_ZH = {
# Structural particles and conjunctions
'的', '了', '和', '是', '就', '都', '而', '及', '与', '着',
'或', '一个', '没有', '我们', '你们', '他们', '这个', '那个',
# Modal / interjection particles
'啊', '呀', '呢', '吧', '吗', '哦', '哈', '嗯', '哎',
# Aspect particles and the structural 的/地/得 trio
'了', '着', '过', '的', '地', '得',
}
def remove_stopwords(tokens, stopwords):
    """Return *tokens* with every entry found in *stopwords* dropped,
    preserving the original order of the remaining tokens."""
    kept = []
    for tok in tokens:
        if tok not in stopwords:
            kept.append(tok)
    return kept
# Demo: tokenize with jieba, then filter stopwords.
# NOTE(review): jieba is a third-party dependency (pip install jieba).
import jieba
text = "我今天在图书馆学习自然语言处理技术"
tokens = jieba.lcut(text)
filtered = remove_stopwords(tokens, STOPWORDS_ZH)
print(filtered)
# Claimed: ['今天', '图书馆', '学习', '自然语言处理', '技术']
# NOTE(review): neither '我' nor '在' is in STOPWORDS_ZH above, so they
# would survive the filter — verify this expected output.
#3.2 Custom stopword lists
# 从文件加载停用词
def load_stopwords(filepath):
    """Load a stopword set from a UTF-8 text file, one word per line.

    Lines are stripped of surrounding whitespace; blank lines are skipped.
    """
    with open(filepath, encoding="utf-8") as fh:
        words = {line.strip() for line in fh}
    # A blank line strips to "" — drop it rather than filter per-line.
    words.discard("")
    return words
stopwords = load_stopwords("stopwords_zh.txt")  # raises FileNotFoundError if the file is absent
#4. Stemming and lemmatization
#4.1 Stemming
# NLTK stemmers — nltk is a third-party dependency, and nltk.download()
# fetches model data over the network at run time.
import nltk
from nltk.stem import PorterStemmer, LancasterStemmer
nltk.download("punkt")
stemmer = PorterStemmer()
words = ["running", "runs", "ran", "runner", "easily", "fairly"]
stems = [stemmer.stem(w) for w in words]
print(stems)
# Porter output: ['run', 'run', 'ran', 'runner', 'easili', 'fairli']
# Lancaster stemmer: more aggressive truncation than Porter.
lancaster = LancasterStemmer()
print([lancaster.stem(w) for w in words])
# Lancaster output: ['run', 'run', 'run', 'run', 'eas', 'fair']
#4.2 Lemmatization
# WordNet-based lemmatization: reduces words to dictionary lemmas.
# NOTE(review): requires the third-party nltk package plus run-time
# downloads of the "wordnet" and tagger data.
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
lemmatizer = WordNetLemmatizer()
# Lemmatize with an explicit POS tag: "v"=verb, "a"=adjective, "n"=noun.
print(lemmatizer.lemmatize("running", "v")) # run
print(lemmatizer.lemmatize("better", "a")) # good
print(lemmatizer.lemmatize("studies", "n")) # study
#5. Full text-preprocessing pipeline
import jieba
import re
import unicodedata
from functools import reduce
class TextPreprocessor:
    """Chinese-oriented text-cleaning pipeline.

    Chains HTML stripping, URL removal, Unicode NFC normalization,
    special-character filtering and whitespace collapsing, then tokenizes
    with jieba and drops stopwords plus single-character tokens.
    """

    def __init__(self, stopwords=None):
        # Fall back to an empty set so membership tests always work.
        self.stopwords = stopwords or set()

    def clean_html(self, text):
        """Drop HTML tags, then decode HTML character entities."""
        stripped = re.sub(r'<[^>]+>', '', text)
        return unescape(stripped)

    def clean_urls(self, text):
        """Remove scheme-prefixed URLs and bare www. links."""
        no_scheme = re.sub(r'https?://\S+', '', text)
        return re.sub(r'www\.\S+', '', no_scheme)

    def clean_special_chars(self, text, keep_spaces=True):
        """Keep only CJK characters, ASCII letters, digits and,
        optionally, whitespace."""
        if keep_spaces:
            pattern = r'[^\u4e00-\u9fa5a-zA-Z0-9\s]'
        else:
            pattern = r'[^\u4e00-\u9fa5a-zA-Z0-9]'
        return re.sub(pattern, '', text)

    def normalize_unicode(self, text):
        """Normalize to Unicode NFC (composed) form."""
        return unicodedata.normalize("NFC", text)

    def remove_extra_spaces(self, text):
        """Collapse whitespace runs to single spaces and trim the ends."""
        return re.sub(r'\s+', ' ', text).strip()

    def tokenize_zh(self, text):
        """Segment Chinese text with jieba (third-party dependency)."""
        return jieba.lcut(text)

    def remove_stopwords(self, tokens):
        """Drop stopwords and all single-character tokens."""
        return [tok for tok in tokens
                if tok not in self.stopwords and len(tok) > 1]

    def full_pipeline(self, text):
        """Run the whole pipeline: clean -> tokenize -> filter."""
        for step in (self.clean_html,
                     self.clean_urls,
                     self.normalize_unicode,
                     self.clean_special_chars,
                     self.remove_extra_spaces):
            text = step(text)
        return self.remove_stopwords(self.tokenize_zh(text))
# Demo: run the full pipeline on a noisy HTML snippet.
processor = TextPreprocessor(stopwords=STOPWORDS_ZH)
result = processor.full_pipeline("""
<p>🔥 限时优惠!仅剩3天!!
来我们的官网 https://example.com 了解更多!
<p>
""")
print(result)
# Claimed: ['限时', '优惠', '仅剩', '天', '官网', '了解']
# NOTE(review): remove_stopwords drops tokens of length 1, so '天' cannot
# appear in the output — verify this expected result.
#6. Summary
# Text-cleaning cheat sheet.
# NOTE(review): these lines execute at import time and reuse the
# module-level `text` variable — they are illustrative snippets,
# not a self-contained runnable example.
# Regex
re.sub(r'<[^>]+>', '', text) # strip HTML tags
re.sub(r'https?://\S+', '', text) # strip URLs
re.sub(r'[^\w\s]', '', text) # strip punctuation
# Unicode
unicodedata.normalize("NFC", text) # normalization
unicodedata.category(char) # character category
# NOTE(review): `char` is undefined at module level — NameError when run.
# Tokenization
jieba.lcut(text) # Chinese segmentation (jieba)
nltk.word_tokenize(text) # English tokenization (needs punkt data)
# Stemming
PorterStemmer().stem(word) # English stemming
# NOTE(review): `word` is likewise undefined here — NameError when run.
# 💡 Best practice: cleaning is task-dependent, not "the cleaner the
# better". For sentiment analysis, "!!!" and emoji like 😠 may carry
# signal — keep them selectively.
🔗 扩展阅读

