#实战项目一:智能客服工单分类系统
📂 所属阶段:第六阶段 — 工业级 NLP 项目实战
🔗 相关章节:BERT 家族详解 · Hugging Face 实战
#1. 项目需求
目标:对客服工单进行自动分类
分类体系:
├── 咨询类(产品咨询、价格咨询)
├── 投诉类(服务投诉、质量投诉)
├── 售后类(退换货、维修申请)
├── 技术类(Bug 反馈、功能建议)
└── 其他类
目标:准确率 > 90%,F1 > 85%

#2. 数据处理
# data_loader.py
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
def load_and_preprocess(csv_path):
    """Load the ticket CSV, clean the text, encode labels, and split train/test.

    Args:
        csv_path: Path to a CSV file with at least ``title``, ``content`` and
            ``category`` columns.

    Returns:
        Tuple ``(train_df, test_df, label2id, id2label)`` — a stratified
        80/20 split plus the label <-> id mappings derived from the data.
    """
    df = pd.read_csv(csv_path)
    print(f"原始数据分布:\n{df['category'].value_counts()}")

    # Text cleaning: concatenate title + body, then normalize (see clean_text).
    df["text"] = df["title"] + " " + df["content"]
    df["text"] = df["text"].apply(clean_text)

    # Label encoding. NOTE(review): enumerate(unique()) makes the id order
    # depend on the row order of the CSV — persist label2id alongside the
    # model so the serving code uses the exact same mapping.
    label2id = {cat: i for i, cat in enumerate(df["category"].unique())}
    id2label = {v: k for k, v in label2id.items()}
    df["label"] = df["category"].map(label2id)

    # Stratified 80/20 split keeps per-class proportions in both partitions.
    train_df, test_df = train_test_split(
        df, test_size=0.2, stratify=df["label"], random_state=42
    )

    # BUG FIX: the original code ran SMOTE over BERT embeddings here but
    # discarded X_resampled/y_resampled, so it only wasted an embedding pass.
    # SMOTE on embedding vectors also cannot feed a fine-tuning pipeline that
    # tokenizes raw text, so the dead code was removed. To rebalance classes,
    # oversample rows of train_df (or use class weights in the loss) instead.
    return train_df, test_df, label2id, id2label
def clean_text(text):
    """Normalize a ticket string: strip URLs and collapse digit runs to ' NUM '."""
    import re

    # BUG FIX: the original patterns were r"http\\S+" and r"\\d+". Inside a
    # raw string, "\\" is a literal backslash, so those patterns looked for
    # the literal characters "\S" / "\d" and never matched a URL or a digit.
    # Single backslashes are correct in raw strings.
    text = re.sub(r"http\S+", "", text)   # drop URLs
    text = re.sub(r"\d+", " NUM ", text)  # digit normalization
    return text.strip()


# 3. Model selection
# model_selector.py
"""
模型对比实验
目标:选择最适合工单分类的模型
"""
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from transformers import AutoModelForSequenceClassification
def _make_tfidf_lr():
    """Classical baseline: TF-IDF features + logistic regression."""
    return LogisticRegression(max_iter=1000)


def _make_bert_base():
    """Chinese BERT-base fine-tuned as a 5-way sequence classifier."""
    return AutoModelForSequenceClassification.from_pretrained(
        "bert-base-chinese", num_labels=5
    )


def _make_roberta_wwm():
    """Chinese RoBERTa with whole-word masking, 5-way classifier head."""
    return AutoModelForSequenceClassification.from_pretrained(
        "hfl/chinese-roberta-wwm-ext", num_labels=5
    )


# Candidate models for the bake-off; values are zero-arg factories so nothing
# heavy is instantiated until a candidate is actually trained.
models = {
    "TF-IDF + LR": _make_tfidf_lr,
    "BERT-base-chinese": _make_bert_base,
    "RoBERTa-wwm": _make_roberta_wwm,
}

# Experiment results (sample numbers from the comparison runs).
results = {
    "TF-IDF + LR": {"accuracy": 0.82, "f1": 0.79},
    "BERT-base-chinese": {"accuracy": 0.93, "f1": 0.91},
    "RoBERTa-wwm": {"accuracy": 0.95, "f1": 0.94},
}

# RoBERTa-wwm is chosen (highest F1).
# 4. Full training pipeline
# train.py — fine-tune the selected checkpoint on the ticket dataset.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import load_dataset

checkpoint = "hfl/chinese-roberta-wwm-ext"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=5)

# Training configuration: evaluate and checkpoint every epoch, then reload
# the checkpoint with the best F1 when training finishes.
training_args = TrainingArguments(
    output_dir="./ticket_classifier",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    warmup_ratio=0.1,             # linear LR warmup over the first 10% of steps
    weight_decay=0.01,
    fp16=True,                    # mixed precision for GPU throughput
    evaluation_strategy="epoch",
    save_strategy="epoch",        # must match evaluation_strategy for best-model reload
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# NOTE(review): train_dataset, test_dataset and compute_metrics are assumed
# to be defined elsewhere in the project — confirm they are in scope here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model("./best_ticket_classifier")

# 5. API deployment
# api.py — FastAPI service wrapping the fine-tuned ticket classifier.
from fastapi import FastAPI
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

app = FastAPI()

# Load the fine-tuned artifacts once at startup and freeze them in eval mode.
MODEL_DIR = "./best_ticket_classifier"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.eval()

# NOTE(review): this hard-coded order must match the training-time label2id
# mapping — verify it against the saved model config before deploying.
CATEGORIES = ["咨询类", "投诉类", "售后类", "技术类", "其他类"]
@app.post("/classify")
def classify_ticket(ticket: dict):
    """Classify one ticket and return the predicted category with scores.

    Expects a JSON body with ``title`` and ``content`` string fields.
    Missing fields default to "" instead of raising KeyError (HTTP 500).

    Returns:
        dict with ``category`` (top label), ``confidence`` (top probability,
        rounded to 4 dp) and ``all_scores`` (per-category probabilities).
    """
    # Robustness fix: .get with defaults so a malformed payload degrades
    # gracefully rather than producing an unhandled KeyError.
    text = ticket.get("title", "") + " " + ticket.get("content", "")
    # Truncate to 256 tokens — long tickets keep only their leading text.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=-1).numpy()[0]
    # int(...) — probs.argmax() is a numpy scalar, which would not be
    # JSON-serializable if it ever leaked into the response.
    pred_id = int(probs.argmax())
    return {
        "category": CATEGORIES[pred_id],
        "confidence": round(float(probs[pred_id]), 4),
        "all_scores": {cat: round(float(p), 4) for cat, p in zip(CATEGORIES, probs)},
    }


# 6. Summary
工单分类系统流程:
1. 需求分析 → 5 类分类
2. 数据清洗 → 文本预处理、标签编码
3. 不平衡处理 → SMOTE
4. 模型选型 → RoBERTa-wwm
5. 训练微调 → F1 = 0.94
6. API 部署 → FastAPI

💡 最佳实践:企业级 NLP 系统不仅仅是模型,还需要数据管道、监控系统、AB 测试。微调只是第一步,上线后的持续优化更重要。
🔗 扩展阅读

