#文本分类实战:基于 BERT 实现企业级情感分析引擎
📂 所属阶段:第四阶段 — 预训练模型与迁移学习(应用篇)
🔗 相关章节:Hugging Face 实战 · 命名实体识别 (NER)
#1. 项目概述
目标:构建一个中文情感分析引擎
准确率目标:> 95%
数据:酒店评论(好评/差评)
模型:BERT-base-chinese

#2. 数据准备
# data_loader.py
import pandas as pd
from sklearn.model_selection import train_test_split
def load_data(csv_path="hotel_reviews.csv", test_size=0.2, random_state=42):
    """Load labeled reviews from CSV and write stratified train/test splits.

    Args:
        csv_path: Path to the input CSV. Must contain a "label" column
            (used for stratification); the text column is passed through.
        test_size: Fraction of rows held out for the test split.
        random_state: Seed so the split is reproducible.

    Returns:
        Tuple (train_df, test_df) of pandas DataFrames.

    Side effects:
        Writes the splits to train.json / test.json as JSON records.
    """
    df = pd.read_csv(csv_path)
    print(f"总样本数: {len(df)}")
    print(df["label"].value_counts())

    # Stratify on the label so both splits keep the same class ratio.
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df["label"]
    )
    # force_ascii=False keeps Chinese text human-readable in the JSON files.
    train_df.to_json("train.json", orient="records", force_ascii=False)
    test_df.to_json("test.json", orient="records", force_ascii=False)
    return train_df, test_df

#3. 数据增强
# data_augmentation.py
from transformers import pipeline
def augment_data(texts, labels, n_augments=2):
    """Generate augmented copies of each text via synonym replacement.

    Args:
        texts: Iterable of input strings.
        labels: Iterable of labels aligned with `texts`.
        n_augments: Number of augmented variants to create per text.

    Returns:
        List of {"text": ..., "label": ...} dicts, one per variant.
    """
    # NOTE(review): the original built a T5 text2text `pipeline` here with a
    # garbled model id and never used it — only synonym replacement was ever
    # applied — so the dead (and crash-prone) pipeline was removed.
    augmented = []
    for text, label in zip(texts, labels):
        for _ in range(n_augments):
            aug_text = synonym_replacement(text)
            augmented.append({"text": aug_text, "label": label})
    return augmented


def synonym_replacement(text, n=3):
    """Replace up to `n` sentiment keywords in `text` with a random synonym.

    Only the first occurrence of each dictionary word is replaced; words
    not present in the small built-in synonym table are left untouched.
    """
    import random  # hoisted out of the loop (was re-imported per word)

    synonyms = {"好": ["棒", "不错", "优秀"], "差": ["糟糕", "失望", "不行"]}
    for word, syns in synonyms.items():
        if word in text and n > 0:
            text = text.replace(word, random.choice(syns), 1)
            n -= 1
    return text

#4. 模型微调
# train.py
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
def train():
    """Fine-tune bert-base-chinese for binary sentiment classification.

    Loads train.json / test.json (produced by load_data), tokenizes to a
    fixed length of 128, trains for 3 epochs with per-epoch evaluation,
    and saves the best checkpoint AND its tokenizer to ./best_model.
    """
    # Data: JSON records with "text" and "label" fields.
    dataset = load_dataset("json", data_files={
        "train": "train.json", "test": "test.json"
    })

    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-chinese", num_labels=2
    )

    def tokenize(batch):
        # Fixed-length padding keeps batches rectangular; 128 tokens is
        # enough for short hotel reviews.
        return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

    dataset = dataset.map(tokenize, batched=True)
    dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    def compute_metrics(eval_pred):
        # eval_pred is a (logits, labels) pair of numpy arrays.
        logits, labels = eval_pred
        preds = torch.argmax(torch.tensor(logits), dim=-1)
        acc = accuracy_score(labels, preds)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average="binary"
        )
        return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

    args = TrainingArguments(
        output_dir="./sentiment_model",
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,   # reload the checkpoint with the best F1
        metric_for_best_model="f1",
        fp16=True,  # NOTE(review): requires a CUDA GPU — confirm target hardware
        logging_steps=50,
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["test"],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,  # so checkpoints include the tokenizer files
    )
    trainer.train()
    trainer.save_model("./best_model")
    # Fix: app.py calls AutoTokenizer.from_pretrained("./best_model"), so
    # the tokenizer must be written there too (save_model alone omits it
    # unless the tokenizer was passed to the Trainer).
    tokenizer.save_pretrained("./best_model")
# 输出示例
"""
Epoch 1: F1 = 0.93
Epoch 2: F1 = 0.95
Epoch 3: F1 = 0.96
→ 最终 F1 = 0.96,超过目标!
"""

#5. 评估与混淆矩阵
# evaluate.py
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
def evaluate(model_path, y_true=None, y_pred=None):
    """Print a classification report and save a confusion-matrix heatmap.

    Args:
        model_path: Directory of the fine-tuned model (loaded here; the
            prediction loop itself is elided in this tutorial).
        y_true: Gold labels (0 = 负面, 1 = 正面). The original referenced
            this name without ever defining it, which raised NameError —
            it is now an explicit parameter.
        y_pred: Predicted labels aligned with y_true.

    Side effects:
        Prints the sklearn classification report and writes
        confusion_matrix.png to the working directory.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    # ... run `model` over the test set to fill y_true / y_pred ...

    # Per-class precision / recall / F1.
    print(classification_report(y_true, y_pred, target_names=["负面", "正面"]))

    # Confusion matrix rendered as an annotated heatmap.
    cm = confusion_matrix(y_true, y_pred)
    plt.figure()  # fresh figure so repeated calls don't overlay heatmaps
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=["负面", "正面"], yticklabels=["负面", "正面"])
    plt.xlabel("预测")
    plt.ylabel("真实")
    plt.savefig("confusion_matrix.png")
    plt.close()

#6. 模型部署
# app.py
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Load the fine-tuned checkpoint once at import time so every request
# reuses the same model instance.
model_name = "./best_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # disable dropout etc. for deterministic inference
# Class-index -> display name (0 = 负面/negative, 1 = 正面/positive).
LABELS = ["负面", "正面"]
def predict(text):
    """Classify one text; return its label, confidence, and class probabilities."""
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():  # inference only — no gradients needed
        logits = model(**encoded).logits

    probabilities = torch.softmax(logits, dim=-1).numpy()[0]
    # argmax over probabilities picks the same class as argmax over logits
    # (softmax is monotonic).
    best = int(probabilities.argmax())

    prob_map = {}
    for idx, p in enumerate(probabilities):
        prob_map[LABELS[idx]] = round(float(p), 4)

    return {
        "text": text,
        "sentiment": LABELS[best],
        "confidence": round(float(probabilities[best]), 4),
        "probs": prob_map,
    }
# FastAPI service: expose the classifier over HTTP.
from fastapi import FastAPI

app = FastAPI()


@app.post("/predict")
def api_predict(text: str):
    # NOTE(review): a bare `str` parameter on a POST route is treated by
    # FastAPI as a *query* parameter, not the request body — confirm this
    # is the intended client contract (a Pydantic body model is more common).
    return predict(text)

#7. 小结
情感分析引擎流程:
1. 数据准备 → CSV → 划分 train/test
2. 数据增强 → 同义词替换 / Back Translation
3. 模型选择 → BERT-base-chinese
4. 微调训练 → 3 epochs, F1 > 95%
5. 评估分析 → 混淆矩阵 / 分类报告
6. 部署上线 → FastAPI API
2026 年进阶:用微调+RLHF 做更细粒度的多情感分类(喜、怒、哀、乐等)
