The Standard Transformer Architecture

The standard transformer for time-series prediction (which we'll call ST) consists of four main components:

Input projection: Projects raw features (d_features) into model dimension (d_model)
Positional encoding: Adds temporal order information to each time step
Transformer encoder layers: N layers of multi-head self-attention + feed-forward
Output head: Projects the final representation to the prediction targets

import torch
import torch.nn as nn
import math

class TemporalTransformer(nn.Module):
    """Standard transformer encoder (ST) for multi-horizon time-series prediction.

    Pipeline: linear input projection -> fixed sinusoidal positional encoding
    -> ``n_layers`` transformer encoder layers -> LayerNorm + linear head
    applied to the representation of the last time step.

    Args:
        d_features: Number of raw input features per time step.
        d_model: Internal transformer dimension. Must be divisible by
            ``n_heads`` (enforced by ``nn.TransformerEncoderLayer``).
        n_heads: Number of attention heads.
        n_layers: Number of stacked encoder layers.
        d_ff: Hidden size of the feed-forward sub-layer.
        seq_len: Maximum input sequence length; the positional-encoding
            table is precomputed for this many positions.
        n_horizons: Number of prediction targets produced per sample.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(
        self,
        d_features: int,        # number of input features (e.g., 30)
        d_model: int = 128,     # transformer internal dimension
        n_heads: int = 4,       # attention heads
        n_layers: int = 3,      # transformer layers
        d_ff: int = 256,        # feed-forward hidden size
        seq_len: int = 60,      # input sequence length (days)
        n_horizons: int = 3,    # output targets (ret5, ret10, ret20)
        dropout: float = 0.1,
    ):
        super().__init__()
        self.d_model = d_model

        # 1. Input projection
        self.input_proj = nn.Linear(d_features, d_model)

        # 2. Positional encoding (fixed sinusoidal)
        pe = torch.zeros(seq_len, d_model)
        position = torch.arange(0, seq_len).unsqueeze(1).float()
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one fewer cosine column than sine
        # column, so trim the frequency vector to the number of odd indices.
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Registered as a buffer: moves with .to(device) and is saved in the
        # state dict, but is not a trainable parameter.
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, seq_len, d_model)

        # 3. Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=d_ff,
            dropout=dropout,
            batch_first=True,  # (batch, seq, features)
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=n_layers)

        # 4. Output head — predicts returns at multiple horizons
        self.output_head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, n_horizons),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Predict all horizons from a window of features.

        Args:
            x: Input of shape ``(batch, seq, d_features)`` with
                ``seq <= seq_len``. A longer sequence fails when the
                positional encoding is added, because the table only holds
                ``seq_len`` positions.

        Returns:
            Tensor of shape ``(batch, n_horizons)``.
        """
        # x: (batch, seq, d_features)
        x = self.input_proj(x)              # → (batch, seq, d_model)
        # Slice the table to the actual input length so windows shorter than
        # seq_len are accepted instead of failing on a shape mismatch.
        x = x + self.pe[:, : x.size(1)]     # add positional encoding
        x = self.encoder(x)                 # transformer layers

        # Use the LAST time step's representation to predict
        out = self.output_head(x[:, -1, :])  # → (batch, n_horizons)
        return out

Hyperparameters: where to start

Hyperparameter	Starting value	Rationale
d_model	128	Sufficient capacity without memory issues
n_heads	4	4 attention heads: each attends to different temporal patterns
n_layers	3	Enough depth; financial data rarely benefits from >4 layers
seq_len	60	~3 months of daily data — captures medium-term patterns
batch_size	256	Large enough for gradient stability, not too large for GPU
learning_rate	1e-3	Adam default, well-tested starting point
weight_decay	1e-4	Mild L2 regularization
epochs	50 (with early stopping)	Stop when val_loss stops improving

Training Loop

def train_epoch(model, dataloader, optimizer, criterion):
    """Run one optimisation pass over ``dataloader``.

    Args:
        model: Module to train; switched to training mode.
        dataloader: Iterable yielding ``(batch_features, batch_targets)``
            pairs. It does not need to implement ``__len__``.
        optimizer: Optimizer holding ``model``'s parameters.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.

    Returns:
        Mean of the per-batch loss values (unweighted by batch size).

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    n_batches = 0
    for batch_features, batch_targets in dataloader:
        optimizer.zero_grad()
        predictions = model(batch_features)
        loss = criterion(predictions, batch_targets)

        loss.backward()
        # Gradient clipping — prevents exploding gradients on noisy data
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item()
        # Count batches here rather than calling len(dataloader), so loaders
        # without a length (generators, iterable datasets) also work.
        n_batches += 1

    if n_batches == 0:
        raise ValueError("train_epoch() received an empty dataloader")
    return total_loss / n_batches

def validate(model, dataloader, criterion):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: Module to evaluate; switched to evaluation mode.
        dataloader: Iterable yielding ``(batch_features, batch_targets)``
            pairs. It does not need to implement ``__len__``.
        criterion: Callable ``criterion(predictions, targets)`` returning a
            scalar loss tensor.

    Returns:
        A tuple ``(mean_loss, preds, targets)``: the mean of the per-batch
        loss values (unweighted by batch size), and all predictions and
        targets concatenated along the first dimension on the CPU.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.eval()
    total_loss = 0.0
    n_batches = 0
    all_preds, all_targets = [], []

    with torch.no_grad():
        for batch_features, batch_targets in dataloader:
            preds = model(batch_features)
            total_loss += criterion(preds, batch_targets).item()
            # Collect on the CPU so the full validation set does not
            # accumulate on the accelerator.
            all_preds.append(preds.cpu())
            all_targets.append(batch_targets.cpu())
            n_batches += 1

    # Fail with a clear message instead of letting torch.cat choke on an
    # empty list.
    if n_batches == 0:
        raise ValueError("validate() received an empty dataloader")

    preds_flat = torch.cat(all_preds)
    targets_flat = torch.cat(all_targets)
    return total_loss / n_batches, preds_flat, targets_flat

# Training loop with early stopping
# Relies on model, train_loader, val_loader, optimizer and criterion being
# defined before this point.
max_epochs = 50
best_val_loss = float('inf')
patience = 10           # epochs without improvement tolerated before stopping
patience_counter = 0    # consecutive epochs since the last improvement

for epoch in range(max_epochs):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, preds, targets = validate(model, val_loader, criterion)

    if val_loss < best_val_loss:
        # New best validation loss: record it and reset the counter.
        best_val_loss = val_loss
        patience_counter = 0
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
# NOTE: no checkpoint is kept here, so after the loop the model holds the
# weights of the last epoch run, not those of the best validation epoch.

Understanding Validation Loss

Validation Loss (val_loss)

The loss function value computed on the held-out validation set — data the model never saw during training. Lower val_loss means smaller prediction errors on unseen data. If train_loss decreases but val_loss increases, the model is overfitting (memorizing training data, not learning generalizable patterns).

In our experiment, the Standard Transformer showed a specific pathology: best val_loss at epoch 1, then degrading performance in subsequent epochs. This means the model was learning to memorize the training set rather than generalizing. But the deeper problem wasn't overfitting — it was collapse.

The Collapse: Constant Mean Prediction

After training, we ran the ST model on the validation set and examined the predictions. The results were alarming:

0.003 Variance Ratio (ST)