# Deep Learning for Sequence Modeling: Attention Mechanism Theory

## 1. Technical Analysis

### 1.1 Overview of Attention Mechanisms

An attention mechanism lets a model focus on different parts of its input when producing each output element:
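For reference, the core computation can be written in the standard scaled dot-product form (the same formulation implemented in Section 2.1):

$$
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V
$$

where $d_k$ is the key dimension; scaling by $\sqrt{d_k}$ keeps the logits in a range where the softmax does not saturate.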
**Types of attention**

- **Self-attention**: dependencies within a single sequence
- **Multi-head attention**: several attention heads attending in parallel
- **Cross-attention**: interaction between two different sequences

**Core idea**: compute attention weights, then take a weighted sum of the values to produce the output (see the formula above).

### 1.2 Comparison of Attention Mechanisms

| Type | Complexity | Capability | Typical use |
|------|------------|------------|-------------|
| Dot-product attention | O(n²·d) | Standard | General purpose |
| Multi-head attention | O(n²·d) (heads split the model dimension) | Multiple views | Transformer |
| Linear attention | O(n·d) | Long sequences | Long documents |
| Sparse attention | O(n·log n·d) | Long sequences | Efficient computation |
### 1.3 The Transformer Architecture

- **Encoder**: processes the input sequence
- **Decoder**: generates the output sequence
- **Attention**: the core component connecting them

**Key innovations**: self-attention, positional encoding, residual connections.

## 2. Core Implementations

### 2.1 Self-Attention

```python
import numpy as np


class ScaledDotProductAttention:
    """Scaled dot-product attention over NumPy arrays (supports batched / multi-head inputs)."""

    def forward(self, Q, K, V, mask=None):
        d_k = Q.shape[-1]
        # Similarity scores, scaled to keep the softmax well-conditioned.
        scores = np.matmul(Q, np.swapaxes(K, -2, -1)) / np.sqrt(d_k)
        if mask is not None:
            # Positions where mask == 0 are excluded from attention.
            scores = np.where(mask == 0, -1e9, scores)
        attn_weights = self._softmax(scores, axis=-1)
        output = np.matmul(attn_weights, V)
        return output, attn_weights

    def _softmax(self, x, axis=-1):
        exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return exp_x / np.sum(exp_x, axis=axis, keepdims=True)


class MultiHeadAttention:
    """Multi-head attention: project, split into heads, attend, merge, project."""

    def __init__(self, d_model, num_heads):
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads
        self.W_q = np.random.randn(d_model, d_model)
        self.W_k = np.random.randn(d_model, d_model)
        self.W_v = np.random.randn(d_model, d_model)
        self.W_o = np.random.randn(d_model, d_model)

    def split_heads(self, x):
        # (batch, seq, d_model) -> (batch, heads, seq, d_k)
        batch_size = x.shape[0]
        return x.reshape(batch_size, -1, self.num_heads, self.d_k).transpose(0, 2, 1, 3)

    def forward(self, Q, K, V, mask=None):
        batch_size = Q.shape[0]
        Q = self.split_heads(Q @ self.W_q)
        K = self.split_heads(K @ self.W_k)
        V = self.split_heads(V @ self.W_v)
        output, attn_weights = ScaledDotProductAttention().forward(Q, K, V, mask)
        # Merge heads back: (batch, heads, seq, d_k) -> (batch, seq, d_model)
        output = output.transpose(0, 2, 1, 3).reshape(batch_size, -1, self.d_model)
        output = output @ self.W_o
        return output, attn_weights
```

### 2.2 Transformer Encoder

```python
class TransformerEncoderLayer:
    """One encoder layer: self-attention and a feed-forward block, each with a residual + LayerNorm."""

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFFN(d_model, d_ff)
        self.norm1 = LayerNorm(d_model)
        self.norm2 = LayerNorm(d_model)
        self.dropout = dropout

    def forward(self, x, mask=None):
        attn_output, _ = self.self_attn.forward(x, x, x, mask)
        x = self.norm1.forward(x + self._dropout(attn_output))
        ff_output = self.feed_forward.forward(x)
        x = self.norm2.forward(x + self._dropout(ff_output))
        return x

    def _dropout(self, x):
        # Inverted dropout: scale at train time so no rescaling is needed at inference.
        if self.dropout > 0:
            mask = np.random.rand(*x.shape) > self.dropout
            return x * mask / (1 - self.dropout)
        return x


class PositionWiseFFN:
    """Position-wise feed-forward network with a GELU nonlinearity."""

    def __init__(self, d_model, d_ff):
        self.fc1 = np.random.randn(d_model, d_ff)
        self.fc2 = np.random.randn(d_ff, d_model)

    def forward(self, x):
        return self._gelu(x @ self.fc1) @ self.fc2

    def _gelu(self, x):
        # tanh approximation of GELU
        return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)))


class LayerNorm:
    """Layer normalization over the last (feature) dimension."""

    def __init__(self, d_model, eps=1e-5):
        self.gamma = np.ones(d_model)
        self.beta = np.zeros(d_model)
        self.eps = eps

    def forward(self, x):
        mean = np.mean(x, axis=-1, keepdims=True)
        var = np.var(x, axis=-1, keepdims=True)
        x_normalized = (x - mean) / np.sqrt(var + self.eps)
        return self.gamma * x_normalized + self.beta


class PositionalEncoding:
    """Sinusoidal positional encoding; expects inputs shaped (seq_len, batch, d_model)."""

    def __init__(self, d_model, max_len=5000):
        self.encoding = self._compute_positional_encoding(d_model, max_len)

    def _compute_positional_encoding(self, d_model, max_len):
        position = np.arange(max_len).reshape(-1, 1)
        div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        pe = np.zeros((max_len, 1, d_model))
        pe[:, 0, 0::2] = np.sin(position * div_term)
        pe[:, 0, 1::2] = np.cos(position * div_term)
        return pe

    def forward(self, x):
        return x + self.encoding[:x.shape[0]]
```
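A minimal smoke test of the sketches above, mainly to make the tensor shapes concrete (the dimensions and random inputs below are illustrative only):

```python
import numpy as np

# Illustrative shapes: a batch of 2 sequences of length 10, model width 64.
batch_size, seq_len, d_model = 2, 10, 64
x = np.random.randn(batch_size, seq_len, d_model)

# Multi-head self-attention on its own.
mha = MultiHeadAttention(d_model=d_model, num_heads=8)
out, attn = mha.forward(x, x, x)
print(out.shape)   # (2, 10, 64)
print(attn.shape)  # (2, 8, 10, 10): one weight matrix per head

# A full encoder layer (attention + FFN, each with residual + LayerNorm).
layer = TransformerEncoderLayer(d_model=d_model, num_heads=8, d_ff=256, dropout=0.0)
y = layer.forward(x)
print(y.shape)     # (2, 10, 64)

# Positional encoding expects (seq_len, batch, d_model).
pe = PositionalEncoding(d_model=d_model)
x_pe = pe.forward(x.transpose(1, 0, 2))
print(x_pe.shape)  # (10, 2, 64)
```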
### 2.3 Efficient Attention Variants

```python
class LinearAttention:
    """Linear attention: cost scales linearly in sequence length by reordering the matrix products."""

    def forward(self, Q, K, V):
        # Softmax feature maps keep all entries positive.
        Q = self._softmax(Q, axis=-1)
        K = self._softmax(K, axis=-1)
        # Aggregate keys and values once, then apply to every query.
        context = np.matmul(np.swapaxes(K, -2, -1), V)   # (..., d, d_v)
        Z = np.sum(K, axis=-2, keepdims=True)            # (..., 1, d)
        # Normalize each query by its total key mass.
        output = np.matmul(Q, context) / (np.matmul(Q, np.swapaxes(Z, -2, -1)) + 1e-9)
        return output

    def _softmax(self, x, axis=-1):
        exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return exp_x / np.sum(exp_x, axis=axis, keepdims=True)


class SparseAttention:
    """Local (sliding-window) sparse attention: each position attends only to its neighbours."""

    def __init__(self, window_size=5):
        self.window_size = window_size

    def forward(self, Q, K, V):
        d_k = Q.shape[-1]
        scores = np.matmul(Q, np.swapaxes(K, -2, -1)) / np.sqrt(d_k)
        mask = self._create_sparse_mask(Q.shape[-2], self.window_size)
        # Positions outside the window are pushed to -inf before the softmax.
        scores = scores * mask + (1 - mask) * (-1e9)
        attn_weights = self._softmax(scores, axis=-1)
        output = np.matmul(attn_weights, V)
        return output

    def _create_sparse_mask(self, seq_len, window_size):
        mask = np.zeros((seq_len, seq_len))
        for i in range(seq_len):
            start = max(0, i - window_size)
            end = min(seq_len, i + window_size + 1)
            mask[i, start:end] = 1
        return mask

    def _softmax(self, x, axis=-1):
        exp_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
        return exp_x / np.sum(exp_x, axis=axis, keepdims=True)
```

## 3. Performance Comparison

### 3.1 Attention Mechanisms

| Type | Time complexity | Space complexity | Suitable sequence length |
|------|-----------------|------------------|--------------------------|
| Standard attention | O(n²·d) | O(n²) | < 1,000 |
| Linear attention | O(n·d) | O(n·d) | > 10,000 |
| Sparse attention | O(n·log n·d) | O(n·log n) | > 1,000 |
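A rough way to see the scaling behind this table with the NumPy sketches from Section 2 (timings are machine-dependent; the sequence lengths and window size below are illustrative choices):

```python
import time
import numpy as np

def time_attention(attn, seq_len, d_model=64, repeats=3):
    """Average wall-clock time of attn.forward on random (seq_len, d_model) inputs."""
    Q = np.random.randn(seq_len, d_model)
    K = np.random.randn(seq_len, d_model)
    V = np.random.randn(seq_len, d_model)
    start = time.perf_counter()
    for _ in range(repeats):
        attn.forward(Q, K, V)
    return (time.perf_counter() - start) / repeats

for n in (256, 1024, 2048):
    t_std = time_attention(ScaledDotProductAttention(), n)
    t_lin = time_attention(LinearAttention(), n)
    t_sparse = time_attention(SparseAttention(window_size=64), n)
    print(f"n={n:5d}  standard={t_std:.4f}s  linear={t_lin:.4f}s  sparse={t_sparse:.4f}s")
```

Note that the `SparseAttention` sketch still materializes the full n×n score matrix, so it illustrates the masking pattern rather than the asymptotic savings in the table; a production implementation would only compute scores inside each window.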
### 3.2 Transformer Variants

| Model | Sequence length | Performance | Compute cost |
|-------|-----------------|-------------|--------------|
| Vanilla Transformer | 512 | Baseline | Baseline |
| Longformer | 4,096 | High | Medium |
| Reformer | 16,384 | Medium | High |
| Linformer | 100,000 | Medium | Low |
### 3.3 Effect of the Number of Attention Heads

| Heads | Model capacity | Training speed | Quality |
|-------|----------------|----------------|---------|
| 4 | Low | Fast | Medium |
| 8 | Medium | Medium | High |
| 16 | High | Slow | Very high |
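For the `MultiHeadAttention` sketch in Section 2.1, the head count changes how the model dimension is partitioned rather than the raw parameter count: more heads means more, but lower-dimensional, attention subspaces. A quick illustration (d_model = 768 is just an example value):

```python
d_model = 768  # example model width

for num_heads in (4, 8, 16):
    d_k = d_model // num_heads
    # W_q, W_k, W_v, W_o are each d_model x d_model, independent of num_heads.
    params = 4 * d_model * d_model
    print(f"heads={num_heads:2d}  per-head dim d_k={d_k:3d}  attention params={params:,}")
```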
## 4. Best Practices

### 4.1 Choosing an Attention Mechanism

```python
def choose_attention_mechanism(seq_len, task_type):
    """Pick an attention variant based on sequence length (task_type is currently unused)."""
    if seq_len > 10000:
        return 'linear'
    elif seq_len > 2000:
        return 'sparse'
    else:
        return 'standard'


class AttentionMechanismSelector:
    """Instantiate an attention class by name, forwarding any constructor parameters."""

    @staticmethod
    def select(config):
        mechanisms = {
            'standard': MultiHeadAttention,
            'linear': LinearAttention,
            'sparse': SparseAttention,
        }
        return mechanisms[config['type']](**config.get('params', {}))
```

### 4.2 Transformer Configuration

```python
class TransformerConfigGenerator:
    """Return a reference hyperparameter set for a given task family."""

    @staticmethod
    def from_task(task_type):
        configs = {
            'nlp': {'d_model': 768, 'num_heads': 12, 'd_ff': 3072, 'layers': 12},
            'vision': {'d_model': 512, 'num_heads': 8, 'd_ff': 2048, 'layers': 6},
            'long_seq': {'d_model': 512, 'num_heads': 8, 'd_ff': 2048, 'layers': 12,
                         'attention': 'sparse'},
        }
        return configs.get(task_type, configs['nlp'])
```
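A short sketch of how these two helpers might be combined; the wiring below, including passing `d_model` and `num_heads` through the selector's `params`, is illustrative rather than part of the original API:

```python
# Pull a reference configuration for an NLP task...
config = TransformerConfigGenerator.from_task('nlp')

# ...decide on an attention variant from the expected sequence length...
seq_len = 512
attn_type = choose_attention_mechanism(seq_len, 'nlp')  # 'standard' for short sequences

# ...and instantiate it. Only the 'standard' variant needs d_model / num_heads here.
params = {}
if attn_type == 'standard':
    params = {'d_model': config['d_model'], 'num_heads': config['num_heads']}

attention = AttentionMechanismSelector.select({'type': attn_type, 'params': params})
print(type(attention).__name__)  # MultiHeadAttention
```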
## 5. Summary

Attention is the core of the Transformer:

- **Self-attention**: captures dependencies within a sequence
- **Multi-head attention**: learns features from multiple views
- **Efficient attention**: handles long sequences
- **Positional encoding**: injects order information

Key takeaways from the comparisons above:
- Linear attention suits very long sequences (> 10,000 tokens)
- Sparse attention balances quality and efficiency on long sequences
- 12 heads is a standard configuration for NLP tasks
- In general, choose the attention mechanism according to the expected sequence length