#Long Short-Term Memory Networks (LSTM/GRU): Solving Vanishing Gradients and Capturing Long-Range Dependencies
📂 Stage: Phase 2 - Deep Learning and Sequence Models (Advanced)
🔗 Related chapters: Recurrent Neural Networks (RNN) · Sequence-to-Sequence Models (Seq2Seq)
#1. Core Ideas of LSTM
#1.1 The Gating Mechanism
LSTM = Long Short-Term Memory
Core idea: introduce "gates" (Gate) to control the flow of information.
Three gates:
Forget Gate: decides what information to discard
Input Gate: decides what new information to store
Output Gate: decides what information to output
Key structure: the cell state (Cell State) → an information highway
The cell state works like a conveyor belt: information flows along it largely untouched!
#1.2 LSTM Equations
"""
LSTM computation flow:
Forget gate (decides what to discard):
f_t = σ(W_f · [h_{t-1}, x_t] + b_f)
Input gate (decides what to write):
i_t = σ(W_i · [h_{t-1}, x_t] + b_i)
C̃_t = tanh(W_C · [h_{t-1}, x_t] + b_C)
Cell state update (information flow):
C_t = f_t * C_{t-1} + i_t * C̃_t
Output gate (decides what to output):
o_t = σ(W_o · [h_{t-1}, x_t] + b_o)
h_t = o_t * tanh(C_t)
"""#1.3 PyTorch LSTM 实现
#1.3 PyTorch LSTM Implementation
import torch
import torch.nn as nn
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=256, num_layers=2,
                 dropout=0.3, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        # bidirectional → 2 × hidden_dim features
        self.fc = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)  # (B, L, E)
        output, (h_n, c_n) = self.lstm(embedded)
        # h_n: (num_layers * num_directions, B, H)
        # Concatenate both directions of the last layer
        forward_h = h_n[-2]   # last layer, forward direction
        backward_h = h_n[-1]  # last layer, backward direction
        hidden = torch.cat([forward_h, backward_h], dim=-1)
        logits = self.fc(hidden)
        return logits
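As a quick sanity check, the classifier above can be run on a random batch of token ids (the vocabulary size, batch size, and sequence length here are hypothetical):

model = LSTMClassifier(vocab_size=10000)      # hypothetical vocabulary size
dummy_ids = torch.randint(1, 10000, (8, 32))  # (batch=8, seq_len=32), ids > 0 to avoid padding_idx
logits = model(dummy_ids)
print(logits.shape)                           # torch.Size([8, 2])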
#2. GRU: A Simplified LSTM
#2.1 Advantages of GRU
GRU = Gated Recurrent Unit
Simpler than LSTM, with only two gates:
Reset Gate: controls how new input is combined with the old memory
Update Gate: controls how much of the old memory to keep
Fewer parameters and faster training, with results often on par with LSTM; the standard update equations are sketched below for comparison
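For comparison with the LSTM equations in Section 1.2, the commonly used GRU update can be written in the same notation (a sketch of the standard formulation; some references, including PyTorch's documentation, swap which term the update gate z_t weights):

"""
GRU computation flow:
Update gate (how much old memory to carry forward):
z_t = σ(W_z · [h_{t-1}, x_t] + b_z)
Reset gate (how much old memory feeds the new candidate):
r_t = σ(W_r · [h_{t-1}, x_t] + b_r)
Candidate hidden state:
h̃_t = tanh(W_h · [r_t * h_{t-1}, x_t] + b_h)
Hidden state update (no separate cell state):
h_t = (1 - z_t) * h_{t-1} + z_t * h̃_t
"""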
#2.2 PyTorch GRU Implementation
class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim=256, hidden_dim=256,
                 num_layers=2, dropout=0.3, num_classes=2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.gru = nn.GRU(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
            dropout=dropout if num_layers > 1 else 0,
        )
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        _, hidden = self.gru(embedded)
        # Concatenate both directions of the last layer
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=-1)
        return self.fc(hidden)
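To make "fewer parameters" concrete, the two classifiers defined above can be compared directly (the vocabulary size is hypothetical; note the LSTM classifier also has a larger fully connected head, so the gap is not due to the recurrent layers alone):

def count_params(m):
    """Number of trainable parameters."""
    return sum(p.numel() for p in m.parameters() if p.requires_grad)

lstm_model = LSTMClassifier(vocab_size=10000)
gru_model = GRUClassifier(vocab_size=10000)
print("LSTM classifier params:", count_params(lstm_model))
print("GRU classifier params:", count_params(gru_model))
# A GRU layer has 3 gate blocks vs. the LSTM's 4, so its recurrent weights are roughly 3/4 the size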
#3. Hands-On: Sentiment Classification
import torch
from torch.utils.data import DataLoader
# Training function
def train_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss, correct, total = 0, 0, 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["label"].to(device)
        optimizer.zero_grad()
        logits = model(input_ids)
        loss = criterion(logits, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # gradient clipping
        optimizer.step()
        total_loss += loss.item()
        correct += (logits.argmax(1) == labels).sum().item()
        total += labels.size(0)
    return total_loss / len(dataloader), correct / total
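A matching evaluation pass can reuse the same batch format on a validation loader (a minimal sketch, assuming the same "input_ids"/"label" fields as train_epoch):

@torch.no_grad()
def eval_epoch(model, dataloader, criterion, device):
    model.eval()
    total_loss, correct, total = 0, 0, 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        labels = batch["label"].to(device)
        logits = model(input_ids)
        total_loss += criterion(logits, labels).item()
        correct += (logits.argmax(1) == labels).sum().item()
        total += labels.size(0)
    return total_loss / len(dataloader), correct / total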
# Inference
def predict(model, text, tokenizer, device):
    model.eval()
    with torch.no_grad():
        tokens = tokenizer.tokenize(text)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        ids = torch.tensor([ids]).to(device)
        logits = model(ids)
        prob = torch.softmax(logits, dim=-1)
        return {"positive": prob[0, 1].item(), "negative": prob[0, 0].item()}
# Usage (assumes a tokenizer and a train_loader have been prepared beforehand)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTMClassifier(vocab_size=tokenizer.vocab_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
for epoch in range(10):
    loss, acc = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f"Epoch {epoch}: Loss={loss:.4f}, Acc={acc:.4f}")
# Test
print(predict(model, "这部电影太精彩了!", tokenizer, device))  # "This movie is fantastic!"
# {'positive': 0.96, 'negative': 0.04}
#4. Summary
LSTM vs. GRU:
LSTM: three gates (forget / input / output) + a cell state
→ more expressive, but more parameters
GRU: two gates (reset / update), no separate cell state
→ lighter, with comparable results
Recommendations:
→ Simple tasks: GRU (faster)
→ Complex tasks: LSTM (stronger)
→ In 2026: just use a Transformer or a pretrained model!
💡 Practical note: although LSTM/GRU remain important academically, in 2026 NLP practice Transformer-based models such as BERT and GPT have largely taken their place. LSTMs are now used mainly in scenarios that require strictly sequential processing (e.g., speech recognition).
🔗 Further Reading

