# PyTorch Basics and NLP Adaptation: Building Your First Text Classifier

📂 Stage: Stage 2, Deep Learning and Sequence Models (Advanced)
🔗 Related chapters: Word Vector Spaces · Recurrent Neural Networks (RNN)
## 1. PyTorch Basics

### 1.1 What Is a Tensor?

```python
import torch
import numpy as np

# Create from a Python list
x = torch.tensor([1.0, 2.0, 3.0])
print(x.shape, x.dtype)
# torch.Size([3]) torch.float32

# Create from a NumPy array
np_array = np.array([[1, 2], [3, 4]])
x = torch.from_numpy(np_array).float()
print(x)
# tensor([[1., 2.],
#         [3., 4.]])

# Common creation helpers
torch.zeros(3, 5)         # all-zero matrix
torch.randn(3, 5)         # standard normal distribution
torch.arange(0, 10, 2)    # [0, 2, 4, 6, 8]
torch.linspace(0, 1, 10)  # 10 evenly spaced values from 0 to 1
```
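One detail worth learning early: every tensor lives on a specific device, and operands must be on the same device. A minimal sketch (the `cuda` check and `.to()` call are standard PyTorch; the shapes are arbitrary):

```python
import torch

# Pick a device: GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

x = torch.randn(3, 5)
x = x.to(device)                       # move an existing tensor
y = torch.zeros(3, 5, device=device)   # or create one there directly

print((x + y).device)  # both operands on one device, so this works
```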
### 1.2 Tensor Operations

```python
# Basic arithmetic
a = torch.tensor([[1, 2], [3, 4]])
b = torch.tensor([[5, 6], [7, 8]])
print(a + b)    # element-wise addition
print(a @ b)    # matrix multiplication
print(a * b)    # element-wise multiplication
print(a.sum())  # sum of all elements

# Shape manipulation
x = torch.randn(32, 10, 100)    # (batch, seq_len, features)
print(x.view(32, -1).shape)     # (32, 1000), flattened
print(x.transpose(0, 1).shape)  # (10, 32, 100), dims swapped
```
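A common pitfall when combining these two: `view` requires a contiguous memory layout, and `transpose` breaks contiguity. A quick sketch of the failure and the standard fixes (`.contiguous()` and `.reshape()` are core PyTorch):

```python
x = torch.randn(32, 10, 100)
t = x.transpose(0, 1)  # (10, 32, 100), a non-contiguous view

# t.view(10, -1)       # would raise a RuntimeError
print(t.contiguous().view(10, -1).shape)  # copy to contiguous memory first
print(t.reshape(10, -1).shape)            # or let reshape handle the copy
```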
## 2. Autograd

### 2.1 requires_grad

```python
# Create a tensor that tracks gradients
x = torch.tensor([2.0, 3.0], requires_grad=True)
print(x.grad)  # None

# Compute y = x^2 + 2x + 1
y = x**2 + 2*x + 1

# Backpropagation
y.sum().backward()
print(x.grad)  # dy/dx = 2x + 2 → [6., 8.]
```
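The flip side of gradient tracking: during inference you usually want it off. A minimal sketch using the standard `torch.no_grad()` context and `detach()`:

```python
x = torch.tensor([2.0, 3.0], requires_grad=True)

with torch.no_grad():   # operations inside are not tracked
    y = x**2 + 2*x + 1
print(y.requires_grad)  # False

z = (x**2).detach()     # cut an existing tensor out of the graph
print(z.requires_grad)  # False
```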
### 2.2 A Simple MLP

```python
import torch.nn as nn

class SimpleClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Usage
model = SimpleClassifier(768, 128, 2)
print(model)
```
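A quick sanity check before any training: push a dummy batch through the model and confirm the output shape (the batch size below is arbitrary):

```python
dummy = torch.randn(4, 768)  # 4 fake samples, 768 features each
logits = model(dummy)
print(logits.shape)          # torch.Size([4, 2]), one logit per class
```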
## 3. Text Classification in Practice

### 3.1 Data Preparation

```python
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        # Tokenize and convert to IDs
        tokens = self.tokenizer.tokenize(text)[:self.max_len]
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        # Pad to max_len
        if len(token_ids) < self.max_len:
            token_ids += [0] * (self.max_len - len(token_ids))
        return {
            "input_ids": torch.tensor(token_ids),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Usage (X_train, y_train, and tokenizer must already exist)
dataset = TextDataset(X_train, y_train, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
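```

The snippet above assumes `X_train`, `y_train`, and a tokenizer already exist. One way to produce them, sketched with the imported `train_test_split` and, as an assumption rather than part of the original, a Hugging Face tokenizer whose `tokenize()` / `convert_tokens_to_ids()` methods match what `TextDataset` calls:

```python
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer  # assumption: an HF-style tokenizer

# Toy corpus; in practice, load your own labeled data
texts = ["great movie", "terrible plot", "loved it", "waste of time"]
labels = [1, 0, 1, 0]

X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.25, random_state=42
)

# Its [PAD] token id is 0, matching the padding value used above
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
```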
### 3.2 A Simple Text Classification Model

```python
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=64, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, input_ids):
        # input_ids: (batch, seq_len)
        embedded = self.embedding(input_ids)  # (batch, seq_len, embed_dim)
        # Mean pooling over the sequence
        pooled = embedded.mean(dim=1)         # (batch, embed_dim)
        logits = self.fc(pooled)              # (batch, num_classes)
        return logits

# Training
# vocab_size must cover every token id the tokenizer can emit
model = TextClassifier(vocab_size=10000)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    for batch in dataloader:
        input_ids = batch["input_ids"]
        labels = batch["label"]

        logits = model(input_ids)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch}, Loss: {loss.item():.4f}")
```
## 4. Summary

PyTorch NLP cheat sheet:

```python
# Tensor creation
torch.tensor([1.0, 2.0])
torch.zeros(3, 5)
torch.randn(3, 5)

# Model definition
class Net(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.layer = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        return self.layer(x)

# Training loop
optimizer.zero_grad()
output = model(inputs)
loss = criterion(output, targets)
loss.backward()
optimizer.step()
```

💡 Advice for 2026: unless you are doing this purely to learn, go straight to a Hugging Face pretrained model with light fine-tuning; it works far better than training from scratch.
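As a taste of that route, a minimal sketch with the `transformers` `pipeline` API (the model here is the library's default for this task, an assumption rather than a recommendation):

```python
from transformers import pipeline

# Downloads a pretrained sentiment model on first use
clf = pipeline("sentiment-analysis")
print(clf("This tutorial made PyTorch finally click for me."))
# e.g. [{'label': 'POSITIVE', 'score': 0.99...}]
```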