#Hugging Face 实战:Transformers 库、Pipeline 与预训练模型
📂 所属阶段:第四阶段 — 预训练模型与迁移学习(应用篇)
🔗 相关章节:BERT 家族详解 · 文本分类实战:情感分析引擎
#1. Hugging Face 生态系统
Hugging Face = NLP 领域的 GitHub
核心产品:
├── Transformers:预训练模型库
├── Datasets:数据集库
├── Tokenizers:高性能分词器
├── Accelerate:分布式训练
├── Hub:模型共享平台(10万+模型)
└── Spaces:Demo 托管平台
#2. Pipeline(最简单的方式)
pip install transformers torch
from transformers import pipeline

# Sentiment analysis (a single line of code!)
sentiment = pipeline("sentiment-analysis")
result = sentiment("I love natural language processing!")
print(result)
# [{'label': 'POSITIVE', 'score': 0.9998}]

# Chinese sentiment analysis: pick an explicitly Chinese checkpoint,
# the default pipeline model is English-only.
sentiment_zh = pipeline(
    "sentiment-analysis",
    model="uer/roberta-base-finetuned-chinanews-chinese",
)
print(sentiment_zh("这个产品太棒了,必须推荐!"))
# [{'label': 'positive', 'score': 0.98}]

# Extractive question answering: the answer is a span of `context`.
qa = pipeline("question-answering")
result = qa(
    question="什么是自然语言处理?",
    context="自然语言处理是人工智能的重要分支,研究计算机与人类语言的交互。",
)
print(result)
# {'score': 0.95, 'start': 0, 'end': 30, 'answer': '自然语言处理'}

# Text generation.
# NOTE: `max_length` counts prompt + generated tokens and is deprecated for
# generation; `max_new_tokens` bounds only the newly generated part.
generator = pipeline("text-generation", model="gpt2")
print(generator("Once upon a time", max_new_tokens=40)[0]["generated_text"])

# Translation (English -> Chinese)
translator = pipeline("translation_en_to_zh", model="Helsinki-NLP/opus-mt-en-zh")
print(translator("Hello, world!")[0]["translation_text"])
#3. 使用中文预训练模型
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Chinese BERT with a 2-class classification head.
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)
# NOTE: the classification head on top of bert-base-chinese is freshly
# (randomly) initialized — the prediction below is only meaningful after
# fine-tuning. Switch to eval mode so dropout is disabled at inference.
model.eval()

# Chinese text classification
text = "这部电影太精彩了,必须推荐!"
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
with torch.no_grad():  # no gradients needed for inference
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.softmax(logits, dim=-1)
    pred = torch.argmax(probs, dim=-1)
print(f"预测: {'正面' if pred.item() == 1 else '负面'}, 置信度: {probs.max().item():.4f}")
#4. 完整微调流程
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# 1. Load data.
# FIX: "tyqiangz_sentiment-chinese" is not a valid Hub id (owner and name
# must be slash-separated, and the config name may not contain spaces).
# The Chinese sentiment data lives at tyqiangz/multilingual-sentiments
# under the "chinese" config.
dataset = load_dataset("tyqiangz/multilingual-sentiments", "chinese", split="train")
# Hold out 10% for evaluation.
dataset = dataset.train_test_split(test_size=0.1)
# 2. 加载模型
model_name = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# 3. 分词
def tokenize(batch):
return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)
dataset = dataset.map(tokenize, batched=True)
dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
# 4. 训练参数
training_args = TrainingArguments(
output_dir="./results",
num_train_epochs=3,
per_device_train_batch_size=16,
per_device_eval_batch_size=32,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
metric_for_best_model="accuracy",
logging_dir="./logs",
fp16=True, # 加速(需要 GPU)
)
# 5. Evaluation metrics
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a transformers evaluation step.

    The Trainer passes ``eval_pred`` as a (logits, labels) pair of numpy
    arrays, so we take the argmax with numpy directly instead of the
    original's needless ``torch.tensor`` round-trip.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average="weighted"),
    }
# 6. 训练
trainer = Trainer(
model=model,
args=training_args,
train_dataset=dataset["train"],
eval_dataset=dataset["test"],
compute_metrics=compute_metrics,
)
trainer.train()
trainer.evaluate()#5. Datasets 数据集操作
from datasets import load_dataset, concatenate_datasets

# Load the first 1000 training rows of a Chinese review dataset.
dataset = load_dataset("mteb/amazon_reviews_multi", "zh", split="train[:1000]")

# Inspect a row and the schema.
print(dataset[0])
print(dataset.column_names)

# Preprocess: keep a truncated text plus the label, drop everything else.
def preprocess(example):
    return {"text": example["text"][:200], "label": example["label"]}

dataset = dataset.map(preprocess, remove_columns=dataset.column_names)

# Filter out rows with very short texts.
dataset = dataset.filter(lambda x: len(x["text"]) > 10)

# Deterministic shuffle.
dataset = dataset.shuffle(seed=42)

# Split (note: only `train`/`test` hold the split; `dataset` stays unsplit).
train, test = dataset.train_test_split(test_size=0.2).values()

# Persist the (unsplit) dataset to disk.
dataset.save_to_disk("./my_dataset")
#6. 小结
# Hugging Face 速查
# Pipeline(最简单)
pipeline("sentiment-analysis")
# 加载模型
AutoTokenizer.from_pretrained("bert-base-chinese")
AutoModel.from_pretrained("bert-base-chinese")
# 训练
Trainer(
model=model,
args=TrainingArguments(...),
train_dataset=...,
eval_dataset=...,
)💡 最佳实践:能用 Pipeline 就用 Pipeline,能用微调就微调,不要轻易从零训练模型。
🔗 扩展阅读

