#文本分类实战:基于BERT的企业级情感分析引擎完整开发指南
电商客服收到上千条售后评价,怎么快速筛选需要优先处理的负面反馈?社交媒体新品发布后的舆情如何?这套基于BERT的轻量化企业级中文情感分析引擎就能快速解决!
- 二分类准确率 > 95%
- 单条响应 < 100ms
- 支持电商/社交/客服三类文本
- 可扩展细粒度情感
- 模型:bert-base-chinese
- 训练:Transformers + PyTorch
- 部署:FastAPI + Docker
- 监控:按需接入Grafana
#目录
#数据管道搭建
#数据清洗与探索
高质量的数据是模型性能的基础,我们先做业务相关的文本预处理:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from typing import List, Tuple
class DataPreprocessor:
    """Text cleaning and stratified splitting utilities for the sentiment pipeline."""

    def __init__(self):
        # In a real project, load the HIT / Baidu stopword lists instead of this stub.
        self.stop_words = {'的', '了', '在', '是', '很', '也', '这', '那', '都'}

    def clean_text(self, text: str) -> str:
        """Keep only CJK characters, Latin letters and digits, then normalize whitespace."""
        # 1. Replace everything outside [CJK, a-z, A-Z, 0-9] with a space.
        kept = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9]', ' ', text)
        # 2. Collapse runs of whitespace and trim the ends.
        return re.sub(r'\s+', ' ', kept).strip()

    def split_data(self, df: pd.DataFrame, text_col: str = 'text', label_col: str = 'label') -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Stratified 70/15/15 train/val/test split; class ratios are preserved in each part."""
        train, holdout = train_test_split(df, test_size=0.3, random_state=42, stratify=df[label_col])
        val, test = train_test_split(holdout, test_size=0.5, random_state=42, stratify=holdout[label_col])
        return train, val, test
# Simulated business data (load from CSV / database / API in production).
def load_demo_data():
    """Return a tiny labelled demo DataFrame (label: 1 = positive, 0 = negative)."""
    samples = [
        ('这个产品质量很好,值得推荐!', 1),
        ('服务态度很差,完全不推荐。', 0),
        ('物流很快,包装也不错', 1),
        ('质量一般,性价比不高', 0),
        ('非常满意,下次还会购买', 1),
        ('商品与描述不符,很失望', 0),
    ]
    return pd.DataFrame(samples, columns=['text', 'label'])
# Run the demo pipeline: load sample data and add a cleaned-text column.
df = load_demo_data()
preprocessor = DataPreprocessor()
df['cleaned_text'] = df['text'].apply(preprocessor.clean_text)
#分层划分与轻量增强
为了防止过拟合,我们做分层划分,同时对小样本做轻量同义词替换增强:
import random
# Lightweight data augmentation (install `pip install synonyms` for better coverage in production).
def light_augment(text: str, n_aug: int = 2) -> List[str]:
    """Generate *n_aug* augmented variants of *text* via random synonym substitution.

    Each occurrence of a dictionary word is independently replaced with
    probability 0.2. Fix: the original implementation iterated single
    characters (``list(text)``), so the multi-character entries "喜欢" and
    "讨厌" could never match; we now scan the string by substring, trying the
    longest key first at each position.
    """
    synonyms_dict = {
        "好": ["优秀", "棒", "不错", "良好"],
        "坏": ["差", "糟糕", "不好", "恶劣"],
        "喜欢": ["喜爱", "欣赏", "钟爱", "青睐"],
        "讨厌": ["厌恶", "反感", "嫌弃", "不满"]
    }
    # Longest keys first so a multi-character word wins over any shorter prefix.
    keys = sorted(synonyms_dict, key=len, reverse=True)
    augmented = []
    for _ in range(n_aug):
        pieces = []
        i = 0
        while i < len(text):
            for key in keys:
                # Replace a matched dictionary word with probability 0.2.
                if text.startswith(key, i) and random.random() < 0.2:
                    pieces.append(random.choice(synonyms_dict[key]))
                    i += len(key)
                    break
            else:
                # No replacement at this position: keep the original character.
                pieces.append(text[i])
                i += 1
        augmented.append(''.join(pieces))
    return augmented
# Split the data and augment the TRAINING set only — augmenting val/test would
# leak near-duplicates into evaluation and inflate metrics.
train_df, val_df, test_df = preprocessor.split_data(df)
augmented_train = []
for _, row in train_df.iterrows():
    # Keep the original sample...
    augmented_train.append({'cleaned_text': row['cleaned_text'], 'label': row['label']})
    # ...plus its augmented variants, all inheriting the original label.
    for aug in light_augment(row['cleaned_text']):
        augmented_train.append({'cleaned_text': aug, 'label': row['label']})
augmented_train_df = pd.DataFrame(augmented_train)
#BERT模型快速微调
#加载预训练与数据Token化
使用Hugging Face的Trainer API,快速搭建微调流程:
from transformers import (
AutoTokenizer, AutoModelForSequenceClassification,
Trainer, TrainingArguments, EarlyStoppingCallback
)
from datasets import Dataset
import torch
from sklearn.metrics import accuracy_score, f1_score
# Load the pretrained Chinese BERT plus a fresh 2-class classification head
# (downloads from the Hugging Face hub on first run).
MODEL_NAME = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
# Batch tokenization for Dataset.map.
def tokenize_function(examples):
    """Tokenize a batch of cleaned texts to fixed-length (128) model inputs."""
    texts = examples["cleaned_text"]
    # Pad/truncate every example to exactly 128 tokens so batches stack cleanly.
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)
# Convert to Hugging Face Dataset format; drop the raw text column and rename
# 'label' -> 'labels' (the column name Trainer expects for the loss).
train_ds = Dataset.from_pandas(augmented_train_df[['cleaned_text', 'label']])
val_ds = Dataset.from_pandas(val_df[['cleaned_text', 'label']])
tokenized_train = train_ds.map(tokenize_function, batched=True).remove_columns(['cleaned_text']).rename_column('label', 'labels')
tokenized_val = val_ds.map(tokenize_function, batched=True).remove_columns(['cleaned_text']).rename_column('label', 'labels')
#训练配置与调优
配置早停机制、混合精度训练(如果有GPU),快速迭代:
# Evaluation metrics passed to the Trainer.
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one Trainer evaluation pass."""
    logits, labels = eval_pred
    # argmax over the class dimension gives the predicted class ids.
    preds = torch.tensor(logits).argmax(dim=-1).numpy()
    accuracy = accuracy_score(labels, preds)
    weighted_f1 = f1_score(labels, preds, average='weighted')
    return {"accuracy": accuracy, "f1": weighted_f1}
# Training configuration: early stopping + mixed precision when a GPU is available.
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./sentiment_model/logs",
    logging_steps=20,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",  # must match evaluation_strategy for load_best_model_at_end
    save_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    seed=42,
    fp16=torch.cuda.is_available(),  # GPU mixed-precision training
    report_to=None  # optional: switch to 'tensorboard'/'wandb'
)
# Build the Trainer; early stopping halts after 2 evaluations without F1 improvement.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)
# **Tip**: uncomment the lines below when running in a GPU environment.
# trainer.train()
# trainer.save_model("./sentiment_model/best_model")
# tokenizer.save_pretrained("./sentiment_model/best_model")
#模型评估与工程化
#关键指标与轻量可视化
训练完成后,在测试集上做最终评估:
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# **实战提示**:取消下方注释,在加载最佳模型后运行
# test_ds = Dataset.from_pandas(test_df[['cleaned_text', 'label']])
# tokenized_test = test_ds.map(tokenize_function, batched=True).remove_columns(['cleaned_text']).rename_column('label', 'labels')
# preds = trainer.predict(tokenized_test).predictions.argmax(-1)
# print(classification_report(test_df['label'], preds, target_names=['负面', '正面']))
# sns.heatmap(confusion_matrix(test_df['label'], preds), annot=True, cmap='Blues')
# plt.show()
#FastAPI服务化
用FastAPI快速搭建高性能API,支持单条和批量预测:
from fastapi import FastAPI
from pydantic import BaseModel
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Create the FastAPI application (title/version show up in the /docs UI).
app = FastAPI(title="企业级情感分析API", version="1.0.0")
# Request / response schemas (pydantic models).
class SingleRequest(BaseModel):
    """Request body for /predict: one text to classify."""
    text: str
class BatchRequest(BaseModel):
    """Request body for /batch_predict: a list of texts to classify."""
    texts: list[str]
class SentimentResponse(BaseModel):
    """Prediction result: the input text, predicted label and its confidence."""
    text: str
    sentiment: str
    confidence: float
# **实战提示**:取消下方注释,加载最佳模型
# MODEL_PATH = "./sentiment_model/best_model"
# tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
# model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
# model.eval()
# label_map = {0: "负面", 1: "正面"}
# Single-text prediction endpoint.
@app.post("/predict", response_model=SentimentResponse)
async def predict(req: SingleRequest):
    """Classify one text. Placeholder keyword heuristic — swap in real model inference."""
    # TODO(review): replace this heuristic with the fine-tuned model's prediction.
    is_positive = ("好" in req.text) or ("棒" in req.text)
    label = "正面" if is_positive else "负面"
    return {"text": req.text, "sentiment": label, "confidence": 0.85}
# Batch prediction endpoint.
@app.post("/batch_predict")
async def batch_predict(req: BatchRequest):
    """Classify a list of texts with the same placeholder heuristic as /predict."""
    # TODO(review): replace with real batched model inference.
    results = []
    for item in req.texts:
        label = "正面" if ("好" in item or "棒" in item) else "负面"
        results.append({"text": item, "sentiment": label, "confidence": 0.85})
    return results
# Liveness probe for container orchestrators / load balancers.
@app.get("/")
async def health_check():
    return {"status": "healthy"}
#Docker容器化部署
保证服务的可移植性和可扩展性:
# Dockerfile
FROM python:3.10-slim
WORKDIR /app
# System build dependencies (gcc is needed by some wheels); trim the apt cache
# to keep the image small.
RUN apt-get update && apt-get install -y gcc && rm -rf /var/lib/apt/lists/*
# Install Python dependencies first so Docker layer caching skips this step
# when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application code and the pre-downloaded model (avoids re-downloading
# the model on every build).
COPY main.py .
COPY ./sentiment_model/best_model /app/model
# Expose the API port.
EXPOSE 8000
# Launch the service. Fix: the article heading that was fused onto this line
# would have broken the JSON exec-form CMD; it now sits on its own line below.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
# requirements.txt
fastapi==0.109.0
uvicorn==0.27.0
transformers==4.37.2
torch==2.2.0
datasets==2.17.1
scikit-learn==1.4.0
pandas==2.2.0
#实战总结与最佳实践
#核心要点
- 数据为王:先标注/清洗1万+条业务相关文本,保证质量
- 模型适配:优先用bert-base-chinese,速度不够再换distilbert-base-chinese
- 快速迭代:用Trainer早停机制,3-5轮就能收敛
- 工程优先:用FastAPI做服务,Docker做容器,保证可移植性
- 持续监控:定期检查数据漂移和模型准确率,及时微调
#相关教程
#扩展阅读
📂 所属阶段:第四阶段 — 预训练模型与迁移学习(应用篇)

