The Standard Transformer Architecture
The standard transformer for time-series prediction (which we'll call ST) consists of four main components:
- Input projection: Projects raw features (d_features) into model dimension (d_model)
- Positional encoding: Adds temporal order information to each time step
- Transformer encoder layers: N layers of multi-head self-attention + feed-forward
- Output head: Projects the final representation to the prediction targets
import torch
import torch.nn as nn
import math
class TemporalTransformer(nn.Module):
    """Transformer encoder mapping a feature window to multi-horizon predictions.

    Pipeline: linear input projection -> fixed sinusoidal positional encoding
    -> stack of ``nn.TransformerEncoderLayer`` -> linear output head applied
    to the representation of the last time step.

    Args:
        d_features: Number of input features per time step (e.g., 30).
        d_model: Transformer internal dimension.
        n_heads: Number of attention heads.
        n_layers: Number of transformer encoder layers.
        d_ff: Feed-forward hidden size inside each encoder layer.
        seq_len: Maximum input sequence length (days) the positional
            encoding table is built for.
        n_horizons: Number of output targets (ret5, ret10, ret20).
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(
        self,
        d_features: int,        # number of input features (e.g., 30)
        d_model: int = 128,     # transformer internal dimension
        n_heads: int = 4,       # attention heads
        n_layers: int = 3,      # transformer layers
        d_ff: int = 256,        # feed-forward hidden size
        seq_len: int = 60,      # input sequence length (days)
        n_horizons: int = 3,    # output targets (ret5, ret10, ret20)
        dropout: float = 0.1,
    ):
        super().__init__()
        self.d_model = d_model

        # 1. Input projection
        self.input_proj = nn.Linear(d_features, d_model)

        # 2. Positional encoding (fixed sinusoidal)
        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len).unsqueeze(1).float()
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer cosine column than sine column,
        # so trim div_term to the number of odd-indexed columns.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Buffer: moves with .to(device) and is saved in the state dict,
        # but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, seq_len, d_model)

        # 3. Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            batch_first=True,  # (batch, seq, features)
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # 4. Output head — predicts returns at multiple horizons.
        # NOTE(review): the head definition was missing from this listing;
        # a single linear projection is the minimal form that matches how
        # forward() uses it. Confirm against the original implementation.
        self.output_head = nn.Linear(d_model, n_horizons)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict all horizons from the last time step of the encoded window.

        Args:
            x: Input of shape (batch, steps, d_features), with
                ``steps <= seq_len``.

        Returns:
            Tensor of shape (batch, n_horizons).

        Raises:
            ValueError: If the input has more time steps than the positional
                encoding table was built for.
        """
        n_steps = x.size(1)
        max_steps = self.pe.size(1)
        if n_steps > max_steps:
            raise ValueError(
                f"input has {n_steps} time steps but the model supports at most {max_steps}"
            )
        x = self.input_proj(x)        # → (batch, steps, d_model)
        x = x + self.pe[:, :n_steps]  # add positional encoding for the steps present
        x = self.encoder(x)           # transformer layers
        # Use the LAST time step's representation to predict
        return self.output_head(x[:, -1, :])  # → (batch, n_horizons)
Hyperparameters: where to start
| Hyperparameter | Starting value | Rationale |
|---|---|---|
| d_model | 128 | Sufficient capacity without memory issues |
| n_heads | 4 | 4 attention heads: each attends to different temporal patterns |
| n_layers | 3 | Enough depth; financial data rarely benefits from >4 layers |
| seq_len | 60 | ~3 months of daily data — captures medium-term patterns |
| batch_size | 256 | Large enough for gradient stability, not too large for GPU |
| learning_rate | 1e-3 | Adam default, well-tested starting point |
| weight_decay | 1e-4 | Mild L2 regularization |
| epochs | 50 (with early stopping) | Stop when val_loss stops improving |
Training Loop
def train_epoch(model, dataloader, optimizer, criterion):
    """Run one optimization pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Module to train; switched to training mode.
        dataloader: Iterable yielding ``(batch_features, batch_targets)`` pairs.
            It does not need to support ``len()``.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called
            once per batch.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.

    Returns:
        The unweighted mean of the per-batch loss values as a float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    n_batches = 0
    for batch_features, batch_targets in dataloader:
        optimizer.zero_grad()
        predictions = model(batch_features)
        loss = criterion(predictions, batch_targets)
        loss.backward()
        # Gradient clipping — prevents exploding gradients on noisy data
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss += loss.item()
        n_batches += 1
    # Count batches instead of calling len(dataloader) so plain iterables work
    # and an empty loader fails with a clear message.
    if n_batches == 0:
        raise ValueError("dataloader yielded no batches")
    return total_loss / n_batches
def validate(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without tracking gradients.

    Args:
        model: Module to evaluate; switched to eval mode.
        dataloader: Iterable yielding ``(batch_features, batch_targets)`` pairs.
            It does not need to support ``len()``.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.

    Returns:
        A tuple ``(mean_loss, predictions, targets)``: the unweighted mean of
        the per-batch losses as a float, and all predictions and targets
        concatenated along the first dimension on the CPU.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    n_batches = 0
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch_features, batch_targets in dataloader:
            preds = model(batch_features)
            total_loss += criterion(preds, batch_targets).item()
            # Move to CPU so the accumulated results do not hold device memory.
            all_preds.append(preds.cpu())
            all_targets.append(batch_targets.cpu())
            n_batches += 1
    # Fail early with a clear message rather than inside torch.cat / division.
    if n_batches == 0:
        raise ValueError("dataloader yielded no batches")
    preds_flat = torch.cat(all_preds)
    targets_flat = torch.cat(all_targets)
    return total_loss / n_batches, preds_flat, targets_flat
# Training loop with early stopping.
# Expects `model`, `train_loader`, `val_loader`, `optimizer`, and `criterion`
# to be defined before this point.
best_val_loss = float('inf')
patience = 10          # epochs to wait without improvement before stopping
patience_counter = 0   # consecutive epochs without a new best val_loss
for epoch in range(50):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, preds, targets = validate(model, val_loader, criterion)
    if val_loss < best_val_loss:
        # New best: remember it and reset the patience window.
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
Understanding Validation Loss
In our experiment, the Standard Transformer showed a specific pathology: its best val_loss came at epoch 1, and performance degraded in every subsequent epoch. At first glance this looks like ordinary overfitting — the model memorizing the training set rather than generalizing. But the deeper problem wasn't overfitting — it was collapse.
The Collapse: Constant Mean Prediction
After training, we ran the ST model on the validation set and examined the predictions. The results were alarming:
Variance Ratio (ST): 0.003