文本分类实战:基于 BERT 实现企业级情感分析引擎

📂 所属阶段:第四阶段 — 预训练模型与迁移学习(应用篇)
🔗 相关章节:Hugging Face 实战 · 命名实体识别 (NER)


1. 项目概述

目标:构建一个中文情感分析引擎
准确率目标:> 95%
数据:酒店评论(好评/差评)
模型:BERT-base-chinese

2. 数据准备

# data_loader.py
import pandas as pd
from sklearn.model_selection import train_test_split

def load_data(csv_path="hotel_reviews.csv", test_size=0.2, random_state=42,
              train_path="train.json", test_path="test.json"):
    """Load labelled hotel reviews, make a stratified train/test split,
    and persist both splits as JSON-records files.

    All parameters are optional; the defaults reproduce the original
    hard-coded behavior exactly, so existing callers are unaffected.

    Args:
        csv_path: input CSV; must contain a "label" column (used for
            stratification) — text column assumed alongside it.
        test_size: fraction of rows held out for the test split.
        random_state: seed for a reproducible split.
        train_path / test_path: output JSON files (records orientation).

    Returns:
        (train_df, test_df) pandas DataFrames.
    """
    df = pd.read_csv(csv_path)
    print(f"总样本数: {len(df)}")
    print(df["label"].value_counts())

    # Stratify on the label so both splits keep the original class ratio.
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=df["label"]
    )
    # force_ascii=False keeps CJK characters human-readable in the JSON.
    train_df.to_json(train_path, orient="records", force_ascii=False)
    test_df.to_json(test_path, orient="records", force_ascii=False)

    return train_df, test_df

3. 数据增强

# data_augmentation.py
from transformers import pipeline

def augment_data(texts, labels, n_augments=2):
    """Create `n_augments` synonym-replaced variants of each (text, label) pair.

    Fix: the original built a text2text-generation pipeline (loading a full
    T5 checkpoint, with a garbled model id) and then never used it — dead
    code costing startup time and memory on every call. It is removed; the
    actual augmentation is plain synonym replacement via
    synonym_replacement().

    Args:
        texts: iterable of input strings.
        labels: iterable of labels, aligned with `texts`.
        n_augments: number of variants generated per source example.

    Returns:
        List of {"text": ..., "label": ...} dicts; zip() truncates to the
        shorter of texts/labels, as before.
    """
    augmented = []
    for text, label in zip(texts, labels):
        for _ in range(n_augments):
            aug_text = synonym_replacement(text)
            augmented.append({"text": aug_text, "label": label})

    return augmented

def synonym_replacement(text, n=3):
    """Replace up to `n` known sentiment words with a random synonym.

    Fix: the `import random` was inside the loop body; hoisted to the top
    of the function so it is not re-executed per iteration.

    Args:
        text: input string.
        n: maximum number of replacements performed.

    Returns:
        The (possibly) modified string. For each dictionary word present,
        only its first occurrence is replaced (count=1), left to right.
    """
    import random

    synonyms = {"好": ["棒", "不错", "优秀"], "差": ["糟糕", "失望", "不行"]}
    for word, syns in synonyms.items():
        if word in text and n > 0:
            # count=1 keeps the original single-occurrence behavior.
            text = text.replace(word, random.choice(syns), 1)
            n -= 1
    return text

4. 模型微调

# train.py
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch

def train():
    """Fine-tune bert-base-chinese for binary sentiment classification.

    Pipeline: load the JSON splits produced by load_data(), tokenize to
    fixed-length-128 tensors, train 3 epochs with per-epoch evaluation,
    and keep the checkpoint with the best F1.
    """
    # Raw splits written by the data-preparation step.
    raw = load_dataset("json", data_files={
        "train": "train.json", "test": "test.json"
    })

    tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-chinese", num_labels=2
    )

    def tokenize(batch):
        # Fixed-length padding keeps every batch rectangular (helps fp16).
        return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

    encoded = raw.map(tokenize, batched=True)
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    def compute_metrics(eval_pred):
        # eval_pred unpacks to (logits, labels) arrays from the Trainer.
        logits, labels = eval_pred
        preds = torch.argmax(torch.tensor(logits), dim=-1)
        precision, recall, f1, _ = precision_recall_fscore_support(
            labels, preds, average="binary"
        )
        acc = accuracy_score(labels, preds)
        return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

    # Evaluate + checkpoint every epoch; restore the best-F1 weights at the end.
    training_args = TrainingArguments(
        output_dir="./sentiment_model",
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        fp16=True,
        logging_steps=50,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded["train"],
        eval_dataset=encoded["test"],
        compute_metrics=compute_metrics,
    )

    trainer.train()
    trainer.save_model("./best_model")

# 输出示例
"""
Epoch 1: F1 = 0.93
Epoch 2: F1 = 0.95
Epoch 3: F1 = 0.96
→ 最终 F1 = 0.96,超过 0.95 的目标!
"""

5. 评估与混淆矩阵

# evaluate.py
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def evaluate(model_path):
    """Load a fine-tuned checkpoint and report test-set quality.

    Prints a per-class classification report and saves a confusion-matrix
    heatmap to confusion_matrix.png.

    Fix vs. original: the heatmap was drawn on the implicit current figure
    with no cleanup, so repeated calls stacked plots on top of each other
    and leaked matplotlib figures; we now open a fresh figure and close it
    after saving.

    NOTE(review): y_true / y_pred must come from the omitted prediction
    step below — this snippet does not run as-is until that is filled in.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Predict on the test set
    # ... (prediction code omitted)

    # Per-class precision/recall/F1 report.
    print(classification_report(y_true, y_pred, target_names=["负面", "正面"]))

    cm = confusion_matrix(y_true, y_pred)
    plt.figure()  # fresh figure: don't draw on top of earlier plots
    sns.heatmap(cm, annot=True, fmt="d", xticklabels=["负面", "正面"], yticklabels=["负面", "正面"])
    plt.xlabel("预测")
    plt.ylabel("真实")
    plt.savefig("confusion_matrix.png")
    plt.close()  # release the figure now that it is on disk

6. 模型部署

# app.py
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Inference setup: load the fine-tuned checkpoint once at import time so
# every request reuses the same tokenizer/model (no per-call reload).
model_name = "./best_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.eval()  # disable dropout etc. for deterministic inference

# Index -> human-readable class name. Order must match the label ids used
# at training time (presumably 0 = negative, 1 = positive — confirm
# against the training data encoding).
LABELS = ["负面", "正面"]

def predict(text):
    """Classify one text; return the label, its confidence, and the full
    class distribution (probabilities rounded to 4 decimals)."""
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        logits = model(**encoded).logits
        distribution = torch.softmax(logits, dim=-1).numpy()[0]
        best = torch.argmax(logits, dim=-1).item()
    per_class = {LABELS[i]: round(float(p), 4) for i, p in enumerate(distribution)}
    return {
        "text": text,
        "sentiment": LABELS[best],
        "confidence": round(float(distribution[best]), 4),
        "probs": per_class,
    }

# FastAPI service: thin HTTP wrapper around predict().
from fastapi import FastAPI
app = FastAPI()

@app.post("/predict")
def api_predict(text: str):
    # NOTE(review): a bare `str` parameter on a POST route is treated by
    # FastAPI as a *query* parameter (/predict?text=...), not the request
    # body — confirm this is the intended API shape.
    return predict(text)

7. 小结

情感分析引擎流程:

1. 数据准备 → CSV → 划分 train/test
2. 数据增强 → 同义词替换 / Back Translation
3. 模型选择 → BERT-base-chinese
4. 微调训练 → 3 epochs, F1 > 95%
5. 评估分析 → 混淆矩阵 / 分类报告
6. 部署上线 → FastAPI API

2026 年进阶:用微调+RLHF 做更细粒度的多情感分类(喜、怒、哀、乐等)