Hugging Face 实战:Transformers 库、Pipeline 与预训练模型

📂 所属阶段:第四阶段 — 预训练模型与迁移学习(应用篇)
🔗 相关章节:BERT 家族详解 · 文本分类实战:情感分析引擎


1. Hugging Face 生态系统

Hugging Face = NLP 领域的 GitHub

核心产品:
├── Transformers:预训练模型库
├── Datasets:数据集库
├── Tokenizers:高性能分词器
├── Accelerate:分布式训练
├── Hub:模型共享平台(100万+模型)
└── Spaces:Demo 托管平台

2. Pipeline(最简单的方式)

pip install transformers torch

from transformers import pipeline

# Sentiment analysis — a working classifier in one line.
sentiment = pipeline("sentiment-analysis")
result = sentiment("I love natural language processing!")
print(result)
# Example output: [{'label': 'POSITIVE', 'score': 0.9998}]

# Chinese sentiment analysis via an explicitly chosen Hub checkpoint.
sentiment_zh = pipeline(
    "sentiment-analysis",
    model="uer/roberta-base-finetuned-chinanews-chinese",
)
print(sentiment_zh("这个产品太棒了,必须推荐!"))
# Example output: [{'label': 'positive', 'score': 0.98}]

# Extractive question answering: the answer is a span copied out of `context`.
qa = pipeline("question-answering")
result = qa(
    question="什么是自然语言处理?",
    context="自然语言处理是人工智能的重要分支,研究计算机与人类语言的交互。",
)
print(result)
# Example output (start/end are character offsets of the answer span):
# {'score': 0.95, 'start': 0, 'end': 6, 'answer': '自然语言处理'}

# Open-ended text generation with GPT-2.
generator = pipeline("text-generation", model="gpt2")
generated = generator("Once upon a time", max_length=50)
print(generated[0]["generated_text"])

# English-to-Chinese machine translation.
translator = pipeline("translation_en_to_zh", model="Helsinki-NLP/opus-mt-en-zh")
translated = translator("Hello, world!")
print(translated[0]["translation_text"])

3. 使用中文预训练模型

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Chinese BERT checkpoint (character-level WordPiece vocabulary).
model_name = "bert-base-chinese"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# NOTE(review): bert-base-chinese ships no classifier weights, so the
# sequence-classification head created here is randomly initialized — the
# prediction printed below is meaningless until the model is fine-tuned.
# This snippet only demonstrates the inference API.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)

# Chinese text classification: tokenize one sentence into a PyTorch batch of 1.
text = "这部电影太精彩了,必须推荐!"
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)

# Inference only — disable autograd to save memory and compute.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    # Softmax over the label dimension gives class probabilities.
    probs = torch.softmax(logits, dim=-1)
    pred = torch.argmax(probs, dim=-1)

print(f"预测: {'正面' if pred.item() == 1 else '负面'}, 置信度: {probs.max().item():.4f}")

4. 完整微调流程

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# 1. Load data.
# FIX: the original repo id "tyqiangz_sentiment-chinese" / "sentiment datasets"
# does not exist on the Hub. The real dataset is tyqiangz/multilingual-sentiments
# with the "chinese" config (labels: 0=positive, 1=neutral, 2=negative).
dataset = load_dataset("tyqiangz/multilingual-sentiments", "chinese", split="train")
dataset = dataset.train_test_split(test_size=0.1)

# 2. Load model. The classification head is freshly initialized and is
# trained by the Trainer below.
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels must match the dataset's label set (3 classes, see above).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# 3. Tokenize. Fixed-length padding keeps every batch the same shape.
def tokenize(batch):
    """Tokenize a batch of examples to input_ids / attention_mask (max 128 tokens)."""
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

dataset = dataset.map(tokenize, batched=True)
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# 4. Training arguments.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",  # NOTE: renamed to `eval_strategy` in newer transformers releases
    save_strategy="epoch",        # must match evaluation_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir="./logs",
    fp16=True,  # mixed-precision speed-up (requires a CUDA GPU)
)

# 5. Evaluation metrics. The Trainer hands compute_metrics NumPy arrays,
# so plain .argmax works directly — no torch tensor round-trip needed.
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) pair."""
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }

# 6. Train and evaluate.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.evaluate()

5. Datasets 数据集操作

from datasets import load_dataset, concatenate_datasets

# Load the first 1000 rows of the Chinese Amazon-reviews training split.
dataset = load_dataset("mteb/amazon_reviews_multi", "zh", split="train[:1000]")

# Inspect one record and the schema.
print(dataset[0])
print(dataset.column_names)

# Preprocessing: clip each text to 200 characters, keep only text + label.
def preprocess(row):
    return {"text": row["text"][:200], "label": row["label"]}

dataset = dataset.map(preprocess, remove_columns=dataset.column_names)

# Drop near-empty texts (10 characters or fewer).
dataset = dataset.filter(lambda row: len(row["text"]) > 10)

# Deterministic shuffle for reproducibility.
dataset = dataset.shuffle(seed=42)

# Hold out 20% as a test split.
splits = dataset.train_test_split(test_size=0.2)
train, test = splits["train"], splits["test"]

# Persist to disk (note: saves the full shuffled dataset, not the split views).
dataset.save_to_disk("./my_dataset")

6. 小结

# Hugging Face quick-reference
# (pseudo-code cheat sheet — the `...` placeholders are not runnable as-is)

# Pipeline (the simplest entry point)
pipeline("sentiment-analysis")

# Load a tokenizer / model from the Hub
AutoTokenizer.from_pretrained("bert-base-chinese")
AutoModel.from_pretrained("bert-base-chinese")

# Fine-tune with the Trainer API
Trainer(
    model=model,
    args=TrainingArguments(...),
    train_dataset=...,
    eval_dataset=...,
)

💡 最佳实践:能用 Pipeline 就用 Pipeline,能用微调就微调,不要轻易从零训练模型。


🔗 扩展阅读