PyTorch Fundamentals for NLP: Building Your First Text Classifier

📂 Stage: Phase 2 - Deep Learning and Sequence Models (Advanced)
🔗 Related chapters: Word Vector Spaces · Recurrent Neural Networks (RNN)


1. PyTorch Basics

1.1 What Is a Tensor?

A tensor is PyTorch's core data structure: an n-dimensional array much like a NumPy ndarray, except that it can live on a GPU and can track gradients for automatic differentiation.

import torch
import numpy as np

# Create from a Python list
x = torch.tensor([1.0, 2.0, 3.0])
print(x.shape, x.dtype)
# torch.Size([3]) torch.float32

# Create from a NumPy array (.float() casts int64 -> float32)
np_array = np.array([[1, 2], [3, 4]])
x = torch.from_numpy(np_array).float()
print(x)
# tensor([[1., 2.],
#         [3., 4.]])

# Common factory functions
torch.zeros(3, 5)           # 3x5 tensor of zeros
torch.randn(3, 5)           # samples from the standard normal distribution
torch.arange(0, 10, 2)      # [0, 2, 4, 6, 8]
torch.linspace(0, 1, 10)    # 10 evenly spaced points from 0 to 1
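
Two practical details worth knowing before moving on: torch.from_numpy shares memory with the source array (the .float() call above returns a new tensor, which breaks that link), and nothing moves to the GPU unless you ask. A minimal sketch:

# from_numpy shares memory with the NumPy array
arr = np.array([1.0, 2.0, 3.0])
t = torch.from_numpy(arr)
arr[0] = 99.0
print(t)  # tensor([99.,  2.,  3.], dtype=torch.float64) -- same storage

# Tensors (and models) must be moved to the GPU explicitly
device = "cuda" if torch.cuda.is_available() else "cpu"
t = t.to(device)
print(t.device)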

1.2 Tensor Operations

# Basic arithmetic
a = torch.tensor([[1, 2], [3, 4]])
b = torch.tensor([[5, 6], [7, 8]])

print(a + b)    # element-wise addition
print(a @ b)    # matrix multiplication
print(a * b)    # element-wise multiplication
print(a.sum())  # sum of all elements

# Shape manipulation
x = torch.randn(32, 10, 100)  # (batch, seq_len, features)
print(x.view(32, -1).shape)    # (32, 1000) flatten all but the batch dim
print(x.transpose(0, 1).shape) # (10, 32, 100) swap dims 0 and 1
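
One pitfall here: view requires contiguous memory, and transpose returns a non-contiguous view of the same storage. When in doubt, reshape (which copies if necessary) is the safer choice:

xt = x.transpose(0, 1)            # non-contiguous: same storage, new strides
# xt.view(10, -1)                 # would raise RuntimeError: view size is not compatible
print(xt.reshape(10, -1).shape)             # (10, 3200) -- copies if needed
print(xt.contiguous().view(10, -1).shape)   # (10, 3200) -- or make it contiguous first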

2. Automatic Differentiation (Autograd)

2.1 requires_grad

# Create a tensor that tracks gradients
x = torch.tensor([2.0, 3.0], requires_grad=True)
print(x.grad)  # None

# Compute y = x^2 + 2x + 1
y = x**2 + 2*x + 1

# Backward pass (backward() needs a scalar, hence the sum)
y.sum().backward()

print(x.grad)  # dy/dx = 2x + 2 → [6., 8.]
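
Two autograd behaviors worth seeing once before training real models: gradients accumulate across backward() calls (which is exactly why training loops call zero_grad()), and torch.no_grad() turns off graph construction for inference:

x = torch.tensor([2.0, 3.0], requires_grad=True)

# Gradients accumulate: two backward passes double x.grad
(x**2).sum().backward()
(x**2).sum().backward()
print(x.grad)   # tensor([ 8., 12.]) -- 2 * (2x), not 2x
x.grad.zero_()  # reset, just like optimizer.zero_grad() does for parameters

# no_grad: no graph is built, so nothing here can be backpropagated
with torch.no_grad():
    y = x * 2
print(y.requires_grad)  # False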

2.2 A Simple MLP

import torch.nn as nn

class SimpleClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Usage
model = SimpleClassifier(768, 128, 2)
print(model)
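
A quick sanity check with a dummy batch (768 matches the input_size chosen above, e.g. the dimension of a sentence embedding):

dummy = torch.randn(4, 768)  # a fake batch of 4 samples
logits = model(dummy)
print(logits.shape)          # torch.Size([4, 2]) -- one score per class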

3. Text Classification in Practice

3.1 Data Preparation

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        # Tokenize, then map tokens to vocabulary IDs
        tokens = self.tokenizer.tokenize(text)[:self.max_len]
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)

        # Pad with 0 up to max_len
        if len(token_ids) < self.max_len:
            token_ids += [0] * (self.max_len - len(token_ids))

        return {
            "input_ids": torch.tensor(token_ids),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Usage: split the raw data, then wrap the training split
# (assumes texts, labels, and a tokenizer exist; see the tokenizer sketch below)
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2)
dataset = TextDataset(X_train, y_train, tokenizer)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
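
The snippet above assumes raw texts and labels lists plus a tokenizer object. The tokenize / convert_tokens_to_ids calls match the Hugging Face tokenizer interface, so one way to obtain a suitable tokenizer (the checkpoint name is just an example; pick one for your language) would be:

from transformers import AutoTokenizer

# Any BERT-style tokenizer works; its [PAD] token id is 0, matching the padding above
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")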

3.2 A Simple Text Classification Model

class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=64, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, input_ids):
        # input_ids: (batch, seq_len)
        embedded = self.embedding(input_ids)  # (batch, seq_len, embed_dim)
        # Mean pooling over the sequence dimension
        pooled = embedded.mean(dim=1)          # (batch, embed_dim)
        logits = self.fc(pooled)                # (batch, num_classes)
        return logits
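
One caveat with the mean pooling above: it also averages over padding positions. Their embeddings are zero vectors (thanks to padding_idx=0), but they still inflate the denominator for short texts. A sketch of a length-aware alternative, assuming pad id 0:

def masked_mean(embedded, input_ids, pad_id=0):
    # embedded: (batch, seq_len, embed_dim) -- zero out padding, divide by true length
    mask = (input_ids != pad_id).unsqueeze(-1).float()  # (batch, seq_len, 1)
    summed = (embedded * mask).sum(dim=1)               # (batch, embed_dim)
    counts = mask.sum(dim=1).clamp(min=1.0)             # avoid division by zero
    return summed / counts

Swapping it in just means calling masked_mean(embedded, input_ids) in forward instead of embedded.mean(dim=1).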

# Training
model = TextClassifier(vocab_size=10000)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(10):
    for batch in dataloader:
        input_ids = batch["input_ids"]
        labels = batch["label"]

        logits = model(input_ids)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

4. Summary

# PyTorch NLP cheat sheet

# Tensor creation
torch.tensor([1.0, 2.0])
torch.zeros(3, 5)
torch.randn(3, 5)

# Model definition
class Net(nn.Module):
    def __init__(self, in_features, out_features):
        super().__init__()
        self.layer = nn.Linear(in_features, out_features)
    def forward(self, x):
        return self.layer(x)

# Training loop
optimizer.zero_grad()
output = model(inputs)
loss = criterion(output, target)
loss.backward()
optimizer.step()

💡 A note for 2026: unless your goal is to learn the fundamentals, go straight to a pretrained Hugging Face model with light fine-tuning; it will beat anything trained from scratch by a wide margin.
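
As a taste of that route, a sketch using the transformers pipeline API (the task name picks a default English checkpoint; in practice you would name a model suited to your language and data):

from transformers import pipeline

clf = pipeline("sentiment-analysis")  # downloads a default pretrained classifier
print(clf("PyTorch finally makes sense to me!"))
# e.g. [{'label': 'POSITIVE', 'score': 0.999}]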


🔗 Further Reading