C-Transformer
Cache-Optimized Transformers in C (x86)
GEMM Kernels

Matrix multiplication implementations with different optimization strategies.

Macros

#define ATTN_SCORES_ACCESS(scores_ptr, h, i, j, aligned_context_window)    scores_ptr[((h) * (aligned_context_window) * (aligned_context_window)) + ((i) * (aligned_context_window)) + (j)]
 

Functions

void gemm_naive_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 Naive parallel GEMM implementation (reference baseline)
 
void gemm_avx512_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 AVX-512 optimized GEMM with vectorized inner loops.
 
void gemm_fine_grained_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 Cache-blocked GEMM with fine-grained parallelism.
 
void gemm_blocked_serial (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void layernorm_naive_serial (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_forward_rolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_forward_unrolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)
 
void layernorm_token_parallel (TransformerModel *M, size_t input_offset, size_t weight_offset, size_t bias_offset, size_t mean_cache_offset, size_t rstd_cache_offset, size_t output_offset, float eps)
 Token-parallel Layer Normalization with AVX-512 optimization.
 
void layernorm_naive_serial_matched_precision (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, float eps)
 
static void * aligned_alloc_64 (size_t size)
 
void debug_math_comparison (TransformerModel *M)
 
void run_layernorm_benchmark_precision_matched (TransformerModel *M)
 
void run_layernorm_benchmark_performance (TransformerModel *M)
 
static void qkv_micro_kernel_blocked_4x16_polished (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, float *__restrict Q_output_4, float *__restrict K_output_4, float *__restrict V_output_4, int embed_dim)
 
static void qkv_token_kernel_4x16_blocked_polished (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, float *__restrict Q_output, float *__restrict K_output, float *__restrict V_output, int embed_dim)
 
void qkv_projection (TransformerModel *M, size_t layer_idx)
 
static void qkv_micro_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx, int output_start_dim)
 
static void qkv_token_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx)
 
void qkv_projection_head_major (TransformerModel *M, int layer_idx)
 
double compare_arrays (const float *a, const float *b, size_t size, const char *name)
 
void convert_token_major_to_head_major_layer (const float *token_major_base, float *head_major_base, TransformerModel *M)
 
void benchmark_qkv_dual_comparison (TransformerModel *M)
 
void compute_attention_scores_head_major (TransformerModel *M, int layer_idx)
 
void apply_causal_softmax_head_major (TransformerModel *M, int layer_idx)
 
void compute_attention_output_head_major (TransformerModel *M, int layer_idx)
 
void attention_head_major_complete (TransformerModel *M, int layer_idx)
 Complete multi-head attention with head-major layout (self-attention)
 
void test_attention_head_major_after_qkv (TransformerModel *M)
 
void attention_projection_with_concat (TransformerModel *M, int layer_idx)
 Production attention projection with concat: Head-major → Token-major → GEMM.
 
void benchmark_attention_projection_complete (TransformerModel *M)
 
void add_gpt2_token_and_positional_embeddings (TransformerModel *M, size_t token_ids_offset, size_t output_offset)
 
void residual_add_token_parallel (TransformerModel *M, size_t input_offset, size_t residual_offset, size_t output_offset)
 
void gelu_activation_token_parallel (TransformerModel *M, size_t data_offset)
 
void mlp_token_parallel (TransformerModel *M, size_t input_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t fc1_output_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t output_offset)
 
void embed_tokens (TransformerModel *M, int32_t *token_ids, int num_tokens)
 
void compute_logits_last_token_optimized (TransformerModel *M, int position)
 
void transformer_layer_forward (TransformerModel *M, int layer_idx, size_t layer_input_offset)
 
void run_comprehensive_benchmark (TransformerModel *M)
 
int load_model_weights (TransformerModel *M, const char *weight_file)
 Load weights into already-allocated TransformerModel.
 
int read_model_metadata (TransformerModel *M, const char *weight_file)
 Read model metadata from weight file header.
 
int sample_token (float *logits, int vocab_size, float temperature)
 
void generate (TransformerModel *M, int *prompt, int prompt_len, int max_tokens)
 
void zero_gradients (TransformerModel *M)
 
void cache_forward_activations (TransformerModel *M)
 Copy forward-pass activations to gradient storage for the backward pass. This preserves the forward computations needed for gradient calculation.
 
void backward_residual_connection (TransformerModel *M, size_t d_output_offset, size_t d_input_offset, size_t d_transform_offset)
 
void backward_embedding_layer (TransformerModel *M)
 
void backward_final_layernorm (TransformerModel *M)
 
void backward_fc2 (TransformerModel *M, size_t d_output_offset, size_t fc2_input_copy_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_gelu (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset)
 
void backward_fc1 (TransformerModel *M, size_t d_output_offset, size_t fc1_input_copy_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_gelu_fast (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset)
 
void backward_layernorm (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t gamma_copy_offset, size_t beta_copy_offset, size_t mean_copy_offset, size_t rstd_copy_offset, size_t d_input_offset, size_t d_gamma_offset, size_t d_beta_offset)
 
void add_gradient (TransformerModel *M, size_t source_offset, size_t dest_offset)
 
void backward_attention_projection (TransformerModel *M, size_t d_output_offset, size_t attention_output_copy_offset, size_t proj_weight_offset, size_t proj_bias_offset, size_t d_attention_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_attention_weighted_values (TransformerModel *M, size_t d_output_offset, size_t attention_weights_offset, size_t v_output_offset, size_t d_weights_offset, size_t d_v_offset)
 
void backward_causal_softmax (TransformerModel *M, size_t d_scores_offset, size_t weights_copy_offset, size_t scores_copy_offset)
 
void backward_qk_matmul (TransformerModel *M, size_t d_scores_offset, size_t q_copy_offset, size_t k_copy_offset, size_t d_q_offset, size_t d_k_offset)
 
void backward_linear (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t weight_offset, size_t bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_lm_head (TransformerModel *M)
 
void backward_transformer_layer (TransformerModel *M, int layer_idx)
 
void compute_cross_entropy_loss (TransformerModel *M, int32_t *target_tokens, float *loss_out)
 Compute cross-entropy loss and gradients w.r.t logits.
 
void training_step (TransformerModel *M, int32_t *input_tokens, int32_t *target_tokens, float learning_rate)
 
void update_all_weights_sgd (TransformerModel *M, float learning_rate)
 
int main (int argc, char **argv)
 

Variables

size_t TrulyOptimalLayer::layer_start_canary_offset
 
size_t TrulyOptimalLayer::ln1_weight_offset
 
size_t TrulyOptimalLayer::ln1_bias_offset
 
size_t TrulyOptimalLayer::ln1_mean_offset
 
size_t TrulyOptimalLayer::ln1_rstd_offset
 
size_t TrulyOptimalLayer::layer_input_offset
 
size_t TrulyOptimalLayer::ln1_output_offset
 
size_t TrulyOptimalLayer::q_weight_offset
 
size_t TrulyOptimalLayer::q_bias_offset
 
size_t TrulyOptimalLayer::q_output_offset
 
size_t TrulyOptimalLayer::k_weight_offset
 
size_t TrulyOptimalLayer::k_bias_offset
 
size_t TrulyOptimalLayer::k_output_offset
 
size_t TrulyOptimalLayer::v_weight_offset
 
size_t TrulyOptimalLayer::v_bias_offset
 
size_t TrulyOptimalLayer::v_output_offset
 
size_t TrulyOptimalLayer::attention_scores_offset
 
size_t TrulyOptimalLayer::proj_weight_offset
 
size_t TrulyOptimalLayer::proj_bias_offset
 
size_t TrulyOptimalLayer::attention_output_offset
 
size_t TrulyOptimalLayer::residual1_output_offset
 
size_t TrulyOptimalLayer::ln2_weight_offset
 
size_t TrulyOptimalLayer::ln2_bias_offset
 
size_t TrulyOptimalLayer::ln2_mean_offset
 
size_t TrulyOptimalLayer::ln2_rstd_offset
 
size_t TrulyOptimalLayer::ln2_output_offset
 
size_t TrulyOptimalLayer::fc1_weight_offset
 
size_t TrulyOptimalLayer::fc1_bias_offset
 
size_t TrulyOptimalLayer::fc1_output_offset
 
size_t TrulyOptimalLayer::fc2_weight_offset
 
size_t TrulyOptimalLayer::fc2_bias_offset
 
size_t TrulyOptimalLayer::mlp_output_offset
 
size_t TrulyOptimalLayer::residual2_output_offset
 
size_t TrulyOptimalLayer::layer_end_canary_offset
 
size_t LayerGradients::residual2_copy_offset
 
size_t LayerGradients::d_residual2_offset
 
size_t LayerGradients::mlp_output_copy_offset
 
size_t LayerGradients::d_mlp_output_offset
 
size_t LayerGradients::fc2_input_copy_offset
 
size_t LayerGradients::fc2_weights_copy_offset
 
size_t LayerGradients::fc2_bias_copy_offset
 
size_t LayerGradients::d_fc2_input_offset
 
size_t LayerGradients::d_fc2_weights_offset
 
size_t LayerGradients::d_fc2_bias_offset
 
size_t LayerGradients::fc1_output_copy_offset
 
size_t LayerGradients::d_fc1_output_offset
 
size_t LayerGradients::ln2_output_copy_offset
 
size_t LayerGradients::fc1_weights_copy_offset
 
size_t LayerGradients::fc1_bias_copy_offset
 
size_t LayerGradients::d_ln2_output_offset
 
size_t LayerGradients::d_fc1_weights_offset
 
size_t LayerGradients::d_fc1_bias_offset
 
size_t LayerGradients::ln2_input_copy_offset
 
size_t LayerGradients::ln2_mean_copy_offset
 
size_t LayerGradients::ln2_rstd_copy_offset
 
size_t LayerGradients::ln2_gamma_copy_offset
 
size_t LayerGradients::ln2_beta_copy_offset
 
size_t LayerGradients::d_ln2_input_offset
 
size_t LayerGradients::d_ln2_gamma_offset
 
size_t LayerGradients::d_ln2_beta_offset
 
size_t LayerGradients::residual1_copy_offset
 
size_t LayerGradients::d_residual1_offset
 
size_t LayerGradients::attention_output_copy_offset
 
size_t LayerGradients::proj_weights_copy_offset
 
size_t LayerGradients::proj_bias_copy_offset
 
size_t LayerGradients::d_attention_output_offset
 
size_t LayerGradients::d_proj_weights_offset
 
size_t LayerGradients::d_proj_bias_offset
 
size_t LayerGradients::attention_weights_copy_offset
 
size_t LayerGradients::v_output_copy_offset
 
size_t LayerGradients::d_attention_weights_offset
 
size_t LayerGradients::d_v_output_offset
 
size_t LayerGradients::attention_scores_copy_offset
 
size_t LayerGradients::d_attention_scores_offset
 
size_t LayerGradients::q_output_copy_offset
 
size_t LayerGradients::k_output_copy_offset
 
size_t LayerGradients::d_q_output_offset
 
size_t LayerGradients::d_k_output_offset
 
size_t LayerGradients::ln1_output_copy_offset
 
size_t LayerGradients::q_weights_copy_offset
 
size_t LayerGradients::q_bias_copy_offset
 
size_t LayerGradients::k_weights_copy_offset
 
size_t LayerGradients::k_bias_copy_offset
 
size_t LayerGradients::v_weights_copy_offset
 
size_t LayerGradients::v_bias_copy_offset
 
size_t LayerGradients::d_ln1_output_offset
 
size_t LayerGradients::d_q_weights_offset
 
size_t LayerGradients::d_q_bias_offset
 
size_t LayerGradients::d_k_weights_offset
 
size_t LayerGradients::d_k_bias_offset
 
size_t LayerGradients::d_v_weights_offset
 
size_t LayerGradients::d_v_bias_offset
 
size_t LayerGradients::ln1_input_copy_offset
 
size_t LayerGradients::ln1_mean_copy_offset
 
size_t LayerGradients::ln1_rstd_copy_offset
 
size_t LayerGradients::ln1_gamma_copy_offset
 
size_t LayerGradients::ln1_beta_copy_offset
 
size_t LayerGradients::d_ln1_input_offset
 
size_t LayerGradients::d_ln1_gamma_offset
 
size_t LayerGradients::d_ln1_beta_offset
 
size_t GradientStorage::backprop_base
 
size_t GradientStorage::total_gradient_floats
 
size_t GradientStorage::logits_copy_offset
 
size_t GradientStorage::actual_tokens_offset
 
size_t GradientStorage::d_logits_offset
 
size_t GradientStorage::final_output_copy_offset
 
size_t GradientStorage::d_final_output_offset
 
size_t GradientStorage::d_embed_weights_offset
 
size_t GradientStorage::final_ln_input_copy_offset
 
size_t GradientStorage::final_ln_mean_copy_offset
 
size_t GradientStorage::final_ln_rstd_copy_offset
 
size_t GradientStorage::final_ln_gamma_copy_offset
 
size_t GradientStorage::final_ln_beta_copy_offset
 
size_t GradientStorage::d_final_ln_input_offset
 
size_t GradientStorage::d_final_ln_gamma_offset
 
size_t GradientStorage::d_final_ln_beta_offset
 
LayerGradients * GradientStorage::layers
 
size_t GradientStorage::d_pos_embed_offset
 
size_t GradientStorage::layer_backprop_stride
 
char TransformerModel::magic [8]
 Magic string "BUMPWGT2" for file validation.
 
uint32_t TransformerModel::version
 Weight file format version.
 
uint32_t TransformerModel::model_type
 Model architecture: 0=GPT2, 1=LLAMA, etc.
 
int TransformerModel::num_layers
 Number of transformer layers (e.g., 12 for GPT-2)
 
int TransformerModel::vocab_size
 Vocabulary size (e.g., 50257 for GPT-2)
 
int TransformerModel::embed_dim
 Embedding dimension (e.g., 768 for GPT-2 small)
 
int TransformerModel::context_window
 Maximum sequence length (e.g., 1024)
 
size_t TransformerModel::aligned_embed_dim
 embed_dim rounded up to 64-byte alignment (in floats)
 
size_t TransformerModel::aligned_head_dim
 head_dim rounded up to 64-byte alignment (in floats)
 
size_t TransformerModel::aligned_attn_context_window
 context_window padded to prevent false sharing
 
int TransformerModel::num_cores
 Number of CPU cores to use (OpenMP threads)
 
int TransformerModel::tokens_per_core
 Tokens assigned per core: context_window / num_cores.
 
int TransformerModel::num_attention_heads
 Number of attention heads (e.g., 12 for GPT-2)
 
int TransformerModel::head_dim
 Dimension per head: embed_dim / num_attention_heads.
 
float * TransformerModel::memory_base
 Base pointer to single contiguous memory block.
 
size_t TransformerModel::total_floats
 Total size of memory block in float elements.
 
size_t TransformerModel::layer_stride
 Byte offset between consecutive layer memory blocks.
 
size_t TransformerModel::token_emb_offset
 Token embedding table [vocab_size × aligned_embed_dim].
 
size_t TransformerModel::pos_emb_offset
 Positional embedding table [context_window × aligned_embed_dim].
 
size_t TransformerModel::embedded_input_offset
 Combined token+pos embeddings [context_window × aligned_embed_dim].
 
size_t TransformerModel::layers_start_offset
 Start of first transformer layer memory.
 
TrulyOptimalLayer * TransformerModel::layers
 Array of per-layer offset structures.
 
size_t TransformerModel::final_ln_weight_offset
 Final LayerNorm gamma [aligned_embed_dim].
 
size_t TransformerModel::final_ln_bias_offset
 Final LayerNorm beta [aligned_embed_dim].
 
size_t TransformerModel::final_ln_mean_offset
 Final LayerNorm mean [context_window].
 
size_t TransformerModel::final_ln_rstd_offset
 Final LayerNorm rstd [context_window].
 
size_t TransformerModel::final_output_offset
 Final normalized output [context_window × aligned_embed_dim].
 
size_t TransformerModel::lm_head_weight_offset
 Language model head (weight-tied to token_emb_offset)
 
size_t TransformerModel::logits_offset
 Output logits [context_window × vocab_size].
 
GradientStorage TransformerModel::gradients
 Gradient and activation cache memory (training only)
 
bool TransformerModel::training_enabled
 Whether gradient storage is allocated.
 
float TransformerModel::learning_rate
 SGD learning rate for weight updates.
 
uint8_t TransformerModel::checksum [32]
 SHA256 checksum of weight file.
 
uint8_t TransformerModel::reserved [32]
 Reserved for future extensions.
 

Detailed Description

Matrix multiplication implementations with different optimization strategies.

Macro Definition Documentation

◆ ATTN_SCORES_ACCESS

#define ATTN_SCORES_ACCESS (   scores_ptr,
  h,
  i,
  j,
  aligned_context_window 
)     scores_ptr[((h) * (aligned_context_window) * (aligned_context_window)) + ((i) * (aligned_context_window)) + (j)]
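For illustration, a minimal sketch that walks the causal region of every head's score matrix through this macro; the function and its parameters are hypothetical examples, and the equivalent manual index is shown in the comment.

/* Hypothetical example: zero the causal (j <= i) region of each head's scores. */
void zero_causal_scores_example(float *scores, int num_heads,
                                int aligned_context_window)
{
    for (int h = 0; h < num_heads; h++)
        for (int i = 0; i < aligned_context_window; i++)
            for (int j = 0; j <= i; j++)
                /* same as: scores[h*W*W + i*W + j] with W = aligned_context_window */
                ATTN_SCORES_ACCESS(scores, h, i, j, aligned_context_window) = 0.0f;
}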

Function Documentation

◆ add_gpt2_token_and_positional_embeddings()

void add_gpt2_token_and_positional_embeddings ( TransformerModel M,
size_t  token_ids_offset,
size_t  output_offset 
)

◆ add_gradient()

void add_gradient ( TransformerModel M,
size_t  source_offset,
size_t  dest_offset 
)

ADD GRADIENT (accumulate gradients from residual path)

This is used when gradients from multiple paths need to be summed. For example, at a residual connection, gradients flow through both:

  1. The transformation path (MLP or attention)
  2. The skip connection path

Both gradients need to be added together.
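For reference, a minimal sketch of the accumulation this entry describes, assuming the usual flat float arena addressed by memory_base plus per-tensor offsets; the count parameter is a hypothetical stand-in for the gradient's element count.

#include <stddef.h>

/* Sketch: dest += source, element by element (accumulate, never overwrite). */
void add_gradient_sketch(float *memory_base,
                         size_t source_offset,
                         size_t dest_offset,
                         size_t count)   /* hypothetical: number of floats */
{
    const float *src = memory_base + source_offset;
    float       *dst = memory_base + dest_offset;
    for (size_t i = 0; i < count; i++)
        dst[i] += src[i];
}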

◆ aligned_alloc_64()

static void * aligned_alloc_64 ( size_t  size)
inline static

◆ apply_causal_softmax_head_major()

void apply_causal_softmax_head_major ( TransformerModel M,
int  layer_idx 
)

◆ attention_head_major_complete()

void attention_head_major_complete ( TransformerModel M,
int  layer_idx 
)

Complete multi-head attention with head-major layout (self-attention)

Computes scaled dot-product attention using head-major memory layout for optimal cache locality. Each attention head operates independently, enabling head-level parallelism and cache-efficient processing.

Parameters
MTransformer model with memory layout and configuration
layer_idxLayer index for accessing Q/K/V tensors

Head-Level Parallelism Strategy: Unlike token-parallel operations (LayerNorm, GELU), attention parallelizes across HEADS because each head's computation is independent.

Memory Layout (Head-Major):

Q, K, V Tensors: [num_heads][context_window][head_dim]
Head 0: [Token0: 64f] [Token1: 64f] ... [TokenN: 64f]
Head 1: [Token0: 64f] [Token1: 64f] ... [TokenN: 64f]
...
Head 11: [Token0: 64f] [Token1: 64f] ... [TokenN: 64f]
Each head's data is CONTIGUOUS (no interleaving with other heads)

Why Head-Major Layout?:

  • Perfect Locality for Q·K^T: All data for one head fits in L2 cache
  • Attention Matrix in Cache: For 64-dim head, 1024 tokens:
    • Score matrix: 1024 × 1024 × 4 bytes = 4MB (fits in L3)
    • Per-head Q: 1024 × 64 × 4 bytes = 256KB (fits in L2)
    • Per-head K: 1024 × 64 × 4 bytes = 256KB (fits in L2)
  • No Strided Access: Sequential reads within each head
  • Head Parallelism: 12 independent heads = 12 parallel tasks

Three-Phase Attention Algorithm:

Phase 1: Compute Attention Scores (Q·K^T / √d_k)

For each head h in parallel:
For each query token i:
For each key token j (where j <= i for causal masking):
scores[h][i][j] = (Q[h][i] · K[h][j]) / sqrt(head_dim)
  • FLOPs: num_heads × T × (T+1)/2 × head_dim × 2
  • Memory: Streaming reads of Q and K
  • Cache: Each score row fits in L1 (1024 floats = 4 KB); the full per-head score matrix (1024 × 1024 floats = 4 MB) lives in L3

Phase 2: Causal Softmax

For each head h in parallel:
For each query token i:
scores[h][i][0:i+1] = softmax(scores[h][i][0:i+1])
scores[h][i][i+1:T] = 0 (causal mask)
  • Prevents attending to future tokens (autoregressive)
  • Row-wise softmax for numerical stability
  • FLOPs: ≈ num_heads × T × (T+1)/2 × 5 (max, subtract, exp, sum, divide)

Phase 3: Weighted Sum of Values (Softmax·V)

For each head h in parallel:
For each query token i:
output[h][i] = Σ_{j=0}^{i} scores[h][i][j] * V[h][j]
  • FLOPs: num_heads × T × (T+1)/2 × head_dim × 2
  • Produces per-head attention output in head-major layout

Stride Pattern Access: Access to Q[h][t][d]:

offset = h * (context_window * aligned_head_dim) +
t * aligned_head_dim +
d
  • aligned_head_dim ensures 64-byte alignment (prevents false sharing)
  • Sequential access within a head (hardware prefetcher friendly)
  • Each head occupies separate cache lines

Performance Characteristics:

  • Compute: O(num_heads × T² × head_dim)
  • Memory: O(num_heads × T²) for attention scores
  • Parallelism: Scales with min(num_heads, num_cores)
  • Cache: L2/L3 critical (must fit score matrix + Q/K/V for one head)

Comparison: Token-Parallel vs Head-Parallel:

Operation | Parallelism | Memory Pattern | Cache Footprint
LayerNorm | Token       | Sequential     | 3 KB per token
Attention | Head        | Strided        | 512 KB per head
MLP       | Token       | Sequential     | 3 KB per token

Why NOT Token-Parallel for Attention?:

  • Attention requires ALL token pairs (Q[i] · K[j] for all i,j)
  • Token parallelism would require synchronization at score matrix
  • Head-major layout allows independent head computation
Note
This function orchestrates all three attention phases
See also
compute_attention_scores_head_major Phase 1: Q·K^T
apply_causal_softmax_head_major Phase 2: Softmax with causal mask
compute_attention_output_head_major Phase 3: Attention·V
Q_ACCESS Head-major memory access macro

@performance Achieves 100-200 GFLOPS on attention computation (Xeon Gold 6248)
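To make the phase structure concrete, here is a minimal sketch of Phase 1 for a single head; the helper name and the flat Q/K/score pointers are illustrative assumptions, and the stride arithmetic simply follows the offset formula above.

#include <math.h>
#include <stddef.h>

/* Sketch of Phase 1 (Q·K^T / sqrt(d_k)) for one head, head-major layout:
   Q, K are this head's [context_window][aligned_head_dim] slices,
   scores is this head's [aligned_context_window][aligned_context_window] block. */
static void head_scores_sketch(const float *Q, const float *K, float *scores,
                               int context_window, int head_dim,
                               size_t aligned_head_dim,
                               size_t aligned_context_window)
{
    const float inv_sqrt_dk = 1.0f / sqrtf((float)head_dim);
    for (int i = 0; i < context_window; i++) {
        const float *q = Q + (size_t)i * aligned_head_dim;
        for (int j = 0; j <= i; j++) {              /* causal: keys j <= i only */
            const float *k = K + (size_t)j * aligned_head_dim;
            float dot = 0.0f;
            for (int d = 0; d < head_dim; d++)
                dot += q[d] * k[d];
            scores[(size_t)i * aligned_context_window + j] = dot * inv_sqrt_dk;
        }
    }
}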

◆ attention_projection_with_concat()

void attention_projection_with_concat ( TransformerModel M,
int  layer_idx 
)

Production attention projection with concat: Head-major → Token-major → GEMM.

This function implements the concat strategy that proved 3.2x faster than direct head-major projection on hyperthreaded systems.

Parameters
MTransformer model
layer_idxLayer index to process

Memory flow:

  1. Input: Head-major attention [head][token][head_dim]
  2. Concat: Convert to token-major contiguous [token][embed_dim]
  3. GEMM: Standard matrix multiplication (proven 100 GFLOPS)
  4. Output: Token-major projection result [token][embed_dim]
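A minimal sketch of step 2 (the concat from head-major to token-major), under the assumption of the layouts listed above; the function and buffer names are illustrative, not the ones used in the code.

#include <stddef.h>

/* Sketch: gather each token's per-head slices into one contiguous
   embed_dim-wide row, ready for a standard token-major GEMM. */
static void concat_heads_sketch(const float *head_major,   /* [head][token][aligned_head_dim] */
                                float *token_major,        /* [token][embed_dim] scratch      */
                                int num_heads, int context_window,
                                int head_dim, size_t aligned_head_dim)
{
    for (int t = 0; t < context_window; t++)
        for (int h = 0; h < num_heads; h++) {
            const float *src = head_major +
                ((size_t)h * context_window + t) * aligned_head_dim;
            float *dst = token_major + (size_t)t * (num_heads * head_dim)
                                     + (size_t)h * head_dim;
            for (int d = 0; d < head_dim; d++)
                dst[d] = src[d];
        }
}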

◆ backward_attention_projection()

void backward_attention_projection ( TransformerModel M,
size_t  d_output_offset,
size_t  attention_output_copy_offset,
size_t  proj_weight_offset,
size_t  proj_bias_offset,
size_t  d_attention_offset,
size_t  d_weight_offset,
size_t  d_bias_offset 
)

BACKWARD THROUGH ATTENTION OUTPUT PROJECTION

Forward: output[T×D] = attention[T×D] @ W_proj[D×D] + b_proj[D]

This is after concatenating all heads back to [T×D] format

Backward computes:

  1. d_attention[T×D] = d_output[T×D] @ W_proj^T[D×D]
  2. d_W_proj[D×D] = attention^T[D×T] @ d_output[T×D]
  3. d_b_proj[D] = sum(d_output) over T

◆ backward_attention_weighted_values()

void backward_attention_weighted_values ( TransformerModel M,
size_t  d_output_offset,
size_t  attention_weights_offset,
size_t  v_output_offset,
size_t  d_weights_offset,
size_t  d_v_offset 
)

BACKWARD THROUGH ATTENTION WEIGHTED VALUES

Forward: attention_output[h,t,d] = sum_over_s(attention_weights[h,t,s] * V[h,s,d])

This operates in HEAD-MAJOR layout

Backward computes:

  1. d_attention_weights[h,t,s] = sum_over_d(d_output[h,t,d] * V[h,s,d])
  2. d_V[h,s,d] = sum_over_t(attention_weights[h,t,s] * d_output[h,t,d])

◆ backward_causal_softmax()

void backward_causal_softmax ( TransformerModel M,
size_t  d_scores_offset,
size_t  weights_copy_offset,
size_t  scores_copy_offset 
)

BACKWARD THROUGH CAUSAL SOFTMAX

Forward: attention_weights[h,i,j] = softmax(scores[h,i,:]) with causal mask

Softmax Jacobian for row i:
∂softmax[i,j]/∂score[i,k] = softmax[i,j] * (δ[j,k] - softmax[i,k])
where δ[j,k] is the Kronecker delta (1 if j == k, 0 otherwise)

For a row, if y = softmax(x), then: dx = y * (dy - dot(y, dy))

Causal mask means we only compute for j <= i
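The per-row rule dx = y * (dy - dot(y, dy)) maps directly onto a short loop. A sketch for one causal row follows; the names are illustrative, not the actual kernel.

/* Sketch: softmax backward for a single causal row.
   y  = softmax output (attention weights) for row i, entries 0..i
   dy = incoming gradient for that row
   dx = gradient w.r.t. the pre-softmax scores */
static void softmax_row_backward_sketch(const float *y, const float *dy,
                                        float *dx, int row_len /* = i + 1 */)
{
    float dot = 0.0f;
    for (int j = 0; j < row_len; j++)
        dot += y[j] * dy[j];                 /* dot(y, dy) */
    for (int j = 0; j < row_len; j++)
        dx[j] = y[j] * (dy[j] - dot);        /* y * (dy - dot(y, dy)) */
    /* entries beyond row_len stay zero: the causal mask blocks j > i */
}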

◆ backward_embedding_layer()

void backward_embedding_layer ( TransformerModel M)

◆ backward_fc1()

void backward_fc1 ( TransformerModel M,
size_t  d_output_offset,
size_t  fc1_input_copy_offset,
size_t  fc1_weight_offset,
size_t  fc1_bias_offset,
size_t  d_input_offset,
size_t  d_weight_offset,
size_t  d_bias_offset 
)

BACKWARD THROUGH FC1 (Feed-Forward Layer 1)

FORWARD PASS (for reference):
Input:  fc1_input [T × D]        (output from LayerNorm2)
Weight: W_fc1 [D × 4D]           (projects from D to 4D)
Bias:   b_fc1 [4D]
Output: fc1_output [T × 4D] = fc1_input @ W_fc1 + b_fc1

BACKWARD PASS:
Given: d_output [T × 4D] (gradient from GELU backward)

Need to compute:

  1. d_input [T × D] = d_output @ W_fc1^T
  2. d_W_fc1 [D × 4D] = fc1_input^T @ d_output (accumulated)
  3. d_b_fc1 [4D] = sum over T of d_output (accumulated)

DIMENSION FLOW:
d_output[T×4D] ──> @ W_fc1^T[4D×D]      ──> d_input[T×D]
d_output[T×4D] ──> fc1_input^T[D×T] @   ──> d_W_fc1[D×4D]
d_output[T×4D] ──> sum over T           ──> d_b_fc1[4D]

HPC CONSIDERATIONS:

  • FC1 expands dimensions (D -> 4D), so weight matrix is large
  • Memory bandwidth critical for weight gradient accumulation
  • Consider chunking for better cache reuse
  • Token parallelism for d_input computation

◆ backward_fc2()

void backward_fc2 ( TransformerModel M,
size_t  d_output_offset,
size_t  fc2_input_copy_offset,
size_t  fc2_weight_offset,
size_t  fc2_bias_offset,
size_t  d_input_offset,
size_t  d_weight_offset,
size_t  d_bias_offset 
)

BACKWARD THROUGH FC2 (Feed-Forward Layer 2)

FORWARD PASS (for reference):
Input:  fc2_input [T × 4D]       (after GELU activation)
Weight: W_fc2 [4D × D]           (projects from 4D back to D)
Bias:   b_fc2 [D]
Output: fc2_output [T × D] = fc2_input @ W_fc2 + b_fc2

BACKWARD PASS:
Given: d_output [T × D] (gradient from the residual connection)

Need to compute:

  1. d_input [T × 4D] = d_output @ W_fc2^T
  2. d_W_fc2 [4D × D] = fc2_input^T @ d_output (accumulated)
  3. d_b_fc2 [D] = sum over T of d_output (accumulated)

DIMENSION FLOW:
d_output[T×D] ──> @ W_fc2^T[D×4D]       ──> d_input[T×4D]
d_output[T×D] ──> fc2_input^T[4D×T] @   ──> d_W_fc2[4D×D]
d_output[T×D] ──> sum over T            ──> d_b_fc2[D]

HPC CONSIDERATIONS:

  • Token-parallel for d_input computation (each thread handles tokens)
  • Reduction required for weight/bias gradients (atomic ops or local accumulation)
  • Memory bandwidth bound due to large weight matrix (4D×D)
  • Cache blocking beneficial for weight gradient accumulation

◆ backward_final_layernorm()

void backward_final_layernorm ( TransformerModel M)

◆ backward_gelu()

void backward_gelu ( TransformerModel M,
size_t  d_output_offset,
size_t  input_copy_offset,
size_t  d_input_offset 
)

BACKWARD THROUGH GELU ACTIVATION

FORWARD PASS (for reference):
GELU(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))

Approximation used in practice:
GELU(x) ≈ 0.5 * x * (1 + tanh(0.7978845608 * (x + 0.044715 * x³)))

BACKWARD PASS (derivative):
d/dx[GELU(x)] = 0.5 * tanh(g(x)) + 0.5 * x * sech²(g(x)) * g'(x) + 0.5
where: g(x)  = 0.7978845608 * (x + 0.044715 * x³)
       g'(x) = 0.7978845608 * (1 + 3 * 0.044715 * x²)

Simplified form:
GELU'(x) = 0.5 * (1 + tanh(g(x))) + 0.5 * x * sech²(g(x)) * g'(x)
         = 0.5 * (1 + tanh(g(x))) + 0.5 * x * (1 - tanh²(g(x))) * g'(x)

DIMENSION FLOW:
d_output [T × 4D] ──> × GELU'(input) ──> d_input [T × 4D]

HPC CONSIDERATIONS:

  • Element-wise operation (embarrassingly parallel)
  • Compute bound (tanh is expensive)
  • Can fuse with surrounding operations for better cache usage
  • Consider using fast tanh approximations for speed
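A scalar sketch of the derivative above, using the same constants; the function name is illustrative, and the real kernel may vectorize or approximate tanh.

#include <math.h>
#include <stddef.h>

/* Sketch: elementwise GELU backward, d_input = d_output * GELU'(x). */
static void gelu_backward_sketch(const float *x, const float *d_output,
                                 float *d_input, size_t n)
{
    const float c = 0.7978845608f;           /* sqrt(2/pi) */
    const float a = 0.044715f;
    for (size_t i = 0; i < n; i++) {
        float xi    = x[i];
        float g     = c * (xi + a * xi * xi * xi);
        float gp    = c * (1.0f + 3.0f * a * xi * xi);
        float t     = tanhf(g);
        float dgelu = 0.5f * (1.0f + t) + 0.5f * xi * (1.0f - t * t) * gp;
        d_input[i]  = d_output[i] * dgelu;
    }
}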

◆ backward_gelu_fast()

void backward_gelu_fast ( TransformerModel M,
size_t  d_output_offset,
size_t  input_copy_offset,
size_t  d_input_offset 
)

Alternative: fast approximation using a precomputed GELU derivative. This version trades accuracy for speed by using a simpler approximation.

◆ backward_layernorm()

void backward_layernorm ( TransformerModel M,
size_t  d_output_offset,
size_t  input_copy_offset,
size_t  gamma_copy_offset,
size_t  beta_copy_offset,
size_t  mean_copy_offset,
size_t  rstd_copy_offset,
size_t  d_input_offset,
size_t  d_gamma_offset,
size_t  d_beta_offset 
)

BACKWARD THROUGH LAYERNORM

FORWARD (for reference):
x_hat = (x - mean) * rstd        (rstd = reciprocal standard deviation)
y     = gamma * x_hat + beta

BACKWARD: Given d_y, compute d_x, d_gamma, d_beta

The math (per token):
d_x     = (rstd / D) * [D * d_y * gamma - sum(d_y * gamma) - x_hat * sum(d_y * gamma * x_hat)]
d_gamma = sum_over_tokens(d_y * x_hat)
d_beta  = sum_over_tokens(d_y)
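A per-token sketch of these three formulas, assuming one token's row of D values and caller-zeroed d_gamma/d_beta accumulators; the names are illustrative.

/* Sketch: LayerNorm backward for one token of width D.
   x, mean, rstd are cached forward values; d_gamma/d_beta accumulate over tokens. */
static void layernorm_token_backward_sketch(const float *x, const float *d_y,
                                            const float *gamma,
                                            float mean, float rstd,
                                            float *d_x, float *d_gamma,
                                            float *d_beta, int D)
{
    float sum_dyg = 0.0f, sum_dyg_xhat = 0.0f;
    for (int i = 0; i < D; i++) {
        float xhat = (x[i] - mean) * rstd;
        float dyg  = d_y[i] * gamma[i];
        sum_dyg      += dyg;
        sum_dyg_xhat += dyg * xhat;
        d_gamma[i]   += d_y[i] * xhat;       /* accumulated across tokens */
        d_beta[i]    += d_y[i];
    }
    for (int i = 0; i < D; i++) {
        float xhat = (x[i] - mean) * rstd;
        float dyg  = d_y[i] * gamma[i];
        d_x[i] = (rstd / (float)D) *
                 ((float)D * dyg - sum_dyg - xhat * sum_dyg_xhat);
    }
}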

◆ backward_linear()

void backward_linear ( TransformerModel M,
size_t  d_output_offset,
size_t  input_copy_offset,
size_t  weight_offset,
size_t  bias_offset,
size_t  d_input_offset,
size_t  d_weight_offset,
size_t  d_bias_offset 
)

BACKWARD THROUGH LINEAR LAYER (GENERIC)

Forward: output = input @ W + bias

This handles the Q, K, and V projections, which are all linear layers. Note: for QKV, the output is in head-major format while the input is token-major.

Backward:
d_input += d_output @ W^T   (accumulate because Q, K, and V all contribute)
d_W     += input^T @ d_output
d_bias  += sum(d_output)
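A naive sketch of the three accumulations for the forward shape output[T×OUT] = input[T×IN] @ W[IN×OUT] + bias; all names and the row-major layout are illustrative assumptions.

#include <stddef.h>

/* Sketch: generic linear-layer backward.
   d_input += d_output @ W^T, d_W += input^T @ d_output,
   d_bias += column sums of d_output. */
static void linear_backward_sketch(const float *input, const float *W,
                                   const float *d_output,
                                   float *d_input, float *d_W, float *d_bias,
                                   int T, int IN, int OUT)
{
    for (int t = 0; t < T; t++) {
        const float *dout = d_output + (size_t)t * OUT;
        const float *in   = input    + (size_t)t * IN;
        float       *din  = d_input  + (size_t)t * IN;
        for (int o = 0; o < OUT; o++) {
            float g = dout[o];
            d_bias[o] += g;
            for (int i = 0; i < IN; i++) {
                din[i]                   += g * W[(size_t)i * OUT + o];
                d_W[(size_t)i * OUT + o] += g * in[i];
            }
        }
    }
}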

◆ backward_lm_head()

void backward_lm_head ( TransformerModel M)

◆ backward_qk_matmul()

void backward_qk_matmul ( TransformerModel M,
size_t  d_scores_offset,
size_t  q_copy_offset,
size_t  k_copy_offset,
size_t  d_q_offset,
size_t  d_k_offset 
)

BACKWARD THROUGH Q @ K^T

Forward: scores[h,i,j] = sum_d(Q[h,i,d] * K[h,j,d]) / sqrt(head_dim)

Backward:
d_Q[h,i,d] = sum_j(d_scores[h,i,j] * K[h,j,d]) / sqrt(head_dim)
d_K[h,j,d] = sum_i(d_scores[h,i,j] * Q[h,i,d]) / sqrt(head_dim)

Note: Causal mask means d_scores[h,i,j] = 0 for j > i

◆ backward_residual_connection()

void backward_residual_connection ( TransformerModel M,
size_t  d_output_offset,
size_t  d_input_offset,
size_t  d_transform_offset 
)

RESIDUAL CONNECTION - FORWARD & BACKWARD PROPAGATION

CONCEPT: A residual connection (skip connection) allows gradients to flow directly through the network by adding the input to the output of a transformation. This addresses the vanishing gradient problem in deep networks.

FORWARD PASS:
input x feeds two paths: an identity (skip) path and a transform path F(x); the two are summed, so output = F(x) + x

Mathematical form: output = input + transform(input)

In transformers specifically:
output = input + MultiHeadAttention(LayerNorm(input))
output = input + FFN(LayerNorm(input))

BACKWARD PASS:

Given: d_output = ∂L/∂output (gradient from the layer above)
Need:  d_input = ∂L/∂input and d_transform = ∂L/∂transform

Since output = input + transform, by the chain rule:
∂output/∂input = 1     (derivative of input w.r.t itself)
∂output/∂transform = 1 (derivative of transform w.r.t itself)

Therefore:
d_input     = d_output × 1 = d_output
d_transform = d_output × 1 = d_output

BACKWARD FLOW:
d_output flows unchanged into both d_transform (toward the transformation) and d_input (directly through the skip connection).

KEY INSIGHT: The gradient d_output flows EQUALLY through both paths:

  1. Through the transformation (to update its parameters)
  2. Directly to the input (skip connection)

This is why residual connections help with vanishing gradients: even if the transformation has small gradients, the skip path ensures gradients can flow directly to earlier layers.

IMPLEMENTATION NOTES:

  • Both gradients receive the SAME value (d_output)
  • Use += (accumulation) not = (assignment) in case gradients already exist from other paths
  • This simple operation is crucial for training deep networks
Parameters
MTransformer model
d_output_offsetGradient from the layer above
d_input_offsetWhere to accumulate gradient for input path
d_transform_offsetWhere to accumulate gradient for transform path

◆ backward_transformer_layer()

void backward_transformer_layer ( TransformerModel M,
int  layer_idx 
)

◆ benchmark_attention_projection_complete()

void benchmark_attention_projection_complete ( TransformerModel M)

◆ benchmark_qkv_dual_comparison()

void benchmark_qkv_dual_comparison ( TransformerModel M)

◆ cache_forward_activations()

void cache_forward_activations ( TransformerModel M)

Copy forward-pass activations to gradient storage for the backward pass. This preserves the forward computations needed for gradient calculation.

◆ compare_arrays()

double compare_arrays ( const float *  a,
const float *  b,
size_t  size,
const char *  name 
)

◆ compute_attention_output_head_major()

void compute_attention_output_head_major ( TransformerModel M,
int  layer_idx 
)

◆ compute_attention_scores_head_major()

void compute_attention_scores_head_major ( TransformerModel M,
int  layer_idx 
)

◆ compute_cross_entropy_loss()

void compute_cross_entropy_loss ( TransformerModel M,
int32_t *  target_tokens,
float *  loss_out 
)

Compute cross-entropy loss and gradients w.r.t logits.

Loss = -sum(log(p[correct])) / context_length

Gradient:
dL/dlogit[i] = p[i] - 1   (for the correct token)
dL/dlogit[i] = p[i]       (for all other tokens)
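A sketch for one position, assuming the logits have already been converted to probabilities p by a softmax; the caller would average the returned values over context_length. Names are illustrative.

#include <math.h>

/* Sketch: cross-entropy for one position given softmax probabilities p.
   Returns -log(p[target]) and writes dL/dlogit into d_logits. */
static float cross_entropy_position_sketch(const float *p, int vocab_size,
                                           int target, float *d_logits)
{
    for (int i = 0; i < vocab_size; i++)
        d_logits[i] = p[i];                 /* dL/dlogit[i] = p[i]        */
    d_logits[target] -= 1.0f;               /* ... minus 1 for the target */
    return -logf(p[target] + 1e-30f);       /* tiny epsilon guards log(0) */
}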

◆ compute_logits_last_token_optimized()

void compute_logits_last_token_optimized ( TransformerModel M,
int  position 
)

◆ convert_token_major_to_head_major_layer()

void convert_token_major_to_head_major_layer ( const float *  token_major_base,
float *  head_major_base,
TransformerModel M 
)

◆ debug_math_comparison()

void debug_math_comparison ( TransformerModel M)

◆ embed_tokens()

void embed_tokens ( TransformerModel M,
int32_t *  token_ids,
int  num_tokens 
)

◆ gelu_activation_token_parallel()

void gelu_activation_token_parallel ( TransformerModel M,
size_t  data_offset 
)

◆ gemm_avx512_parallel()

void gemm_avx512_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

AVX-512 optimized GEMM with vectorized inner loops.

Parameters
AInput matrix A [M x K]
BInput matrix B [N x K] (transposed)
biasBias vector [N]
COutput matrix C [M x N]
MNumber of rows in A and C
NNumber of columns in B and C
KInner dimension

@performance

  • Target: 200-400 GFLOPS on modern Xeon
  • 16-wide SIMD operations
  • FMA instruction utilization

@optimization_details

  • Uses _mm512_fmadd_ps for 2 FLOPs per element (32 FLOPs per 16-wide instruction)
  • 16-element vectorization of inner loop
  • Handles remainder elements with scalar code
See also
gemm_naive_parallel for reference implementation
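A sketch of the kind of 16-wide FMA inner loop described above, for the documented B-transposed layout (C = A·Bᵀ + bias); it omits the OpenMP parallelism, bias, and remainder handling, so it is an illustration rather than the actual kernel.

#include <immintrin.h>

/* Sketch: one (i, j) dot product of length K, vectorized 16 floats at a time.
   A is [M×K] row-major, B is [N×K] (transposed), so row i of A meets row j of B.
   Assumes K is a multiple of 16 for brevity. */
static float dot_avx512_sketch(const float *a_row, const float *b_row, int K)
{
    __m512 acc = _mm512_setzero_ps();
    for (int k = 0; k < K; k += 16) {
        __m512 av = _mm512_loadu_ps(a_row + k);
        __m512 bv = _mm512_loadu_ps(b_row + k);
        acc = _mm512_fmadd_ps(av, bv, acc);      /* acc += av * bv */
    }
    return _mm512_reduce_add_ps(acc);            /* horizontal sum */
}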

◆ gemm_blocked_serial()

void gemm_blocked_serial ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

◆ gemm_fine_grained_parallel()

void gemm_fine_grained_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Cache-blocked GEMM with fine-grained parallelism.

Parameters
AInput matrix A [M x K]
BInput matrix B [N x K] (transposed)
biasBias vector [N]
COutput matrix C [M x N]
MNumber of rows in A and C
NNumber of columns in B and C
KInner dimension

@performance

  • Target: 300-500 GFLOPS
  • Best performance for large matrices
  • Optimal cache utilization

@implementation_notes

  • 64x64 blocking for L1 cache optimization
  • Collapse(3) OpenMP directive for maximum parallelism
  • Atomic updates for thread safety

@benchmark_results Tested on 8192x8192 matrices:

  • Naive: 85 GFLOPS
  • This impl: 474 GFLOPS (5.6x speedup)
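A sketch of 64×64 blocking with a collapse(3) OpenMP loop and atomic accumulation, in the spirit of the notes above; edge handling and the exact accumulation strategy are simplified assumptions.

#include <stddef.h>

#define BLK 64   /* 64×64 blocking for L1, as noted above */

/* Sketch: blocked GEMM, C[M×N] += A[M×K] · B[N×K]^T.
   Caller initializes C with the bias (or zeros) beforehand. Because two
   k-blocks touch the same C tile, the update uses an atomic add. */
static void gemm_blocked_sketch(const float *A, const float *B, float *C,
                                int M, int N, int K)
{
    #pragma omp parallel for collapse(3)
    for (int ib = 0; ib < M; ib += BLK)
        for (int jb = 0; jb < N; jb += BLK)
            for (int kb = 0; kb < K; kb += BLK)
                for (int i = ib; i < ib + BLK && i < M; i++)
                    for (int j = jb; j < jb + BLK && j < N; j++) {
                        float sum = 0.0f;
                        for (int k = kb; k < kb + BLK && k < K; k++)
                            sum += A[(size_t)i * K + k] * B[(size_t)j * K + k];
                        #pragma omp atomic
                        C[(size_t)i * N + j] += sum;
                    }
}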

◆ gemm_naive_parallel()

void gemm_naive_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Naive parallel GEMM implementation (reference baseline)

Parameters
AInput matrix A [M x K]
BInput matrix B [N x K] (transposed)
biasBias vector [N]
COutput matrix C [M x N]
MNumber of rows in A and C
NNumber of columns in B and C
KInner dimension (columns of A, rows of B)

@performance

  • Baseline performance: ~50-100 GFLOPS
  • Used as reference for accuracy validation
  • Simple OpenMP parallelization
Note
This is the golden reference - all other implementations are validated against this
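For orientation, a sketch of what such a baseline typically looks like for the documented layout (B stored [N × K], one OpenMP loop over output rows); this is an illustration, not the exact source.

#include <stddef.h>

/* Sketch: C[M×N] = A[M×K] · B[N×K]^T + bias[N], row-parallel. */
static void gemm_naive_sketch(const float *A, const float *B, const float *bias,
                              float *C, int M, int N, int K)
{
    #pragma omp parallel for
    for (int i = 0; i < M; i++)
        for (int j = 0; j < N; j++) {
            float sum = bias ? bias[j] : 0.0f;
            for (int k = 0; k < K; k++)
                sum += A[(size_t)i * K + k] * B[(size_t)j * K + k];
            C[(size_t)i * N + j] = sum;
        }
}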

◆ generate()

void generate ( TransformerModel M,
int *  prompt,
int  prompt_len,
int  max_tokens 
)

◆ layernorm_forward_rolled_slice()

void layernorm_forward_rolled_slice ( const float *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
float *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

◆ layernorm_forward_unrolled_slice()

void layernorm_forward_unrolled_slice ( const float *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
float *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
float  eps 
)

◆ layernorm_naive_serial()

void layernorm_naive_serial ( const float *  input,
const float *  gamma,
const float *  beta,
float *  output,
float *  mean_cache,
float *  rstd_cache,
int  tokens,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

◆ layernorm_naive_serial_matched_precision()

void layernorm_naive_serial_matched_precision ( const float *  input,
const float *  gamma,
const float *  beta,
float *  output,
float *  mean_cache,
float *  rstd_cache,
int  tokens,
int  d_model,
float  eps 
)

◆ layernorm_token_parallel()

void layernorm_token_parallel ( TransformerModel M,
size_t  input_offset,
size_t  weight_offset,
size_t  bias_offset,
size_t  mean_cache_offset,
size_t  rstd_cache_offset,
size_t  output_offset,
float  eps 
)

Token-parallel Layer Normalization with AVX-512 optimization.

Performs Layer Normalization across tokens using token-level parallelism. Each CPU core processes a contiguous slice of tokens independently, achieving perfect cache locality and zero synchronization overhead.

Parameters
MTransformer model containing memory layout and parallelism config
input_offsetOffset to input tensor [context_window × aligned_embed_dim]
weight_offsetOffset to gamma weights [aligned_embed_dim]
bias_offsetOffset to beta biases [aligned_embed_dim]
mean_cache_offsetOffset to mean cache [context_window] (for backward pass)
rstd_cache_offsetOffset to rstd cache [context_window] (for backward pass)
output_offsetOffset to output tensor [context_window × aligned_embed_dim]
epsEpsilon for numerical stability (typically 1e-5)

Token-Level Parallelism Strategy:

Memory Layout (Token-Major):
┌──────────────┬──────────────┬──────────────┬──────────────┐
│ Token 0 │ Token 1 │ Token 2 │ Token 3 │
│ [768 floats] │ [768 floats] │ [768 floats] │ [768 floats] │
└──────────────┴──────────────┴──────────────┴──────────────┘
│<─ Core 0 ──>│<─ Core 1 ──>│<─ Core 2 ──>│<─ Core 3 ──>│

Why Token-Parallel?:

  • Perfect Locality: Each token's data (768 floats) is contiguous in memory
  • Zero Sync: Tokens are independent, no barriers or atomics needed
  • Cache Efficiency: Each core streams through sequential memory
  • Linear Scaling: Speedup = num_cores (measured 7.8x on 8 cores)

Memory Access Pattern (per core):

Core 0 processes tokens [0, tokens_per_core):
- Read: input[0*768], input[1*768], ..., input[N*768] (sequential)
- Write: output[0*768], output[1*768], ..., output[N*768] (sequential)
- Gamma/Beta: Shared read-only (broadcast to all cores)

Algorithm (per token):

  1. Pass 1: Compute mean across embed_dim using AVX-512
  2. Pass 2: Compute variance using FMA (fused multiply-add)
  3. Pass 3: Normalize, scale by gamma, shift by beta

AVX-512 Optimization:

  • Processes 16 floats per instruction (4x16 unrolling)
  • Uses FMA for variance: acc = diff * diff + acc (2 FLOPs per element per instruction)
  • Aligned loads: _mm512_load_ps (requires 64-byte alignment)
  • Prefetching: Hints to load next cache line while computing current

Performance Characteristics:

  • Compute: 9 * embed_dim FLOPs per token
  • Memory: 3 * embed_dim reads + embed_dim writes per token
  • Bandwidth: Achieves 50-100 GB/s per core (streaming bandwidth)
  • Latency: ~5 μs per token on modern Xeon (768-dim, 3.0 GHz)

Cache Behavior:

  • L1 Data Cache: Holds ~4 tokens (768 floats = 3KB per token, 32KB L1)
  • L2 Cache: Holds ~80 tokens (256KB L2)
  • Prefetcher: Detects sequential pattern, hides DRAM latency

Why Aligned Embed Dim?: Padding to 64-byte boundaries ensures:

  • No false sharing between cores writing adjacent tokens
  • Aligned SIMD loads (faster than unaligned)
  • Clean cache line ownership (no partial cache line reads)
Note
This is a core building block used in every transformer layer
See also
layernorm_forward_rolled_slice Per-core slice processing kernel
TrulyOptimalLayer For offset definitions within a layer

@performance Measured 7.8x speedup on 8-core Xeon vs serial baseline
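A scalar per-token sketch of the three-pass algorithm above (mean, variance, normalize), without the AVX-512 vectorization or the aligned-stride handling; names are illustrative.

#include <math.h>

/* Sketch: LayerNorm for one token row of d_model values.
   Caches mean and rstd for the backward pass, as the real kernel does. */
static void layernorm_token_sketch(const float *x, const float *gamma,
                                   const float *beta, float *out,
                                   float *mean_cache, float *rstd_cache,
                                   int d_model, float eps)
{
    float mean = 0.0f;
    for (int i = 0; i < d_model; i++) mean += x[i];   /* pass 1: mean      */
    mean /= (float)d_model;

    float var = 0.0f;
    for (int i = 0; i < d_model; i++) {               /* pass 2: variance  */
        float diff = x[i] - mean;
        var += diff * diff;
    }
    var /= (float)d_model;

    float rstd = 1.0f / sqrtf(var + eps);
    for (int i = 0; i < d_model; i++)                 /* pass 3: normalize */
        out[i] = (x[i] - mean) * rstd * gamma[i] + beta[i];

    *mean_cache = mean;
    *rstd_cache = rstd;
}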

◆ load_model_weights()

int load_model_weights ( TransformerModel M,
const char *  weight_file 
)

Load weights into already-allocated TransformerModel.

This assumes:

  1. read_model_metadata() has been called
  2. layout_transformer() has been called to allocate memory
  3. You've verified you have enough RAM
Parameters
MPointer to initialized and allocated TransformerModel
weight_filePath to the .weights file
Returns
0 on success, -1 on failure
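A hedged usage sketch of the three-step contract above; layout_transformer is referenced in these notes but its exact signature is an assumption here, and error handling is minimal.

#include <stdio.h>
#include <string.h>

/* Sketch: typical load sequence following the contract described above. */
int load_sketch(TransformerModel *M, const char *path)
{
    memset(M, 0, sizeof(*M));
    if (read_model_metadata(M, path) != 0) {    /* 1. header only, no allocation */
        fprintf(stderr, "bad weight file header\n");
        return -1;
    }
    /* 2. caller checks that the reported size fits in RAM, then: */
    layout_transformer(M);                      /* allocates the arena (signature assumed) */
    if (load_model_weights(M, path) != 0) {     /* 3. stream the weights in */
        fprintf(stderr, "weight load failed\n");
        return -1;
    }
    return 0;
}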

◆ main()

int main ( int  argc,
char **  argv 
)

◆ mlp_token_parallel()

void mlp_token_parallel ( TransformerModel M,
size_t  input_offset,
size_t  fc1_weight_offset,
size_t  fc1_bias_offset,
size_t  fc1_output_offset,
size_t  fc2_weight_offset,
size_t  fc2_bias_offset,
size_t  output_offset 
)

◆ qkv_micro_kernel_blocked_4x16_polished()

static void qkv_micro_kernel_blocked_4x16_polished ( const float *__restrict  input_token,
const float *__restrict  Q_weights_block,
const float *__restrict  K_weights_block,
const float *__restrict  V_weights_block,
const float *__restrict  Q_bias_4,
const float *__restrict  K_bias_4,
const float *__restrict  V_bias_4,
float *__restrict  Q_output_4,
float *__restrict  K_output_4,
float *__restrict  V_output_4,
int  embed_dim 
)
inline static

◆ qkv_micro_kernel_head_major_4x16()

static void qkv_micro_kernel_head_major_4x16 ( const float *__restrict  input_token,
const float *__restrict  Q_weights_block,
const float *__restrict  K_weights_block,
const float *__restrict  V_weights_block,
const float *__restrict  Q_bias_4,
const float *__restrict  K_bias_4,
const float *__restrict  V_bias_4,
TransformerModel M,
float *__restrict  q_output_base,
float *__restrict  k_output_base,
float *__restrict  v_output_base,
int  embed_dim,
int  token_idx,
int  output_start_dim 
)
inline static

◆ qkv_projection()

void qkv_projection ( TransformerModel M,
size_t  layer_idx 
)

◆ qkv_projection_head_major()

void qkv_projection_head_major ( TransformerModel M,
int  layer_idx 
)

◆ qkv_token_kernel_4x16_blocked_polished()

static void qkv_token_kernel_4x16_blocked_polished ( const float *__restrict  input_token,
const float *__restrict  Q_weights,
const float *__restrict  K_weights,
const float *__restrict  V_weights,
const float *__restrict  Q_bias,
const float *__restrict  K_bias,
const float *__restrict  V_bias,
float *__restrict  Q_output,
float *__restrict  K_output,
float *__restrict  V_output,
int  embed_dim 
)
static

◆ qkv_token_kernel_head_major_4x16()

static void qkv_token_kernel_head_major_4x16 ( const float *__restrict  input_token,
const float *__restrict  Q_weights,
const float *__restrict  K_weights,
const float *__restrict  V_weights,
const float *__restrict  Q_bias,
const float *__restrict  K_bias,
const float *__restrict  V_bias,
TransformerModel M,
float *__restrict  q_output_base,
float *__restrict  k_output_base,
float *__restrict  v_output_base,
int  embed_dim,
int  token_idx 
)
static

◆ read_model_metadata()

int read_model_metadata ( TransformerModel M,
const char *  weight_file 
)

Read model metadata from weight file header.

This function ONLY reads the header and populates model dimensions. It does NOT allocate memory - that's your decision to make separately.

Parameters
MPointer to zero-initialized TransformerModel struct
weight_filePath to the .weights file
Returns
0 on success, -1 on failure

◆ residual_add_token_parallel()

void residual_add_token_parallel ( TransformerModel M,
size_t  input_offset,
size_t  residual_offset,
size_t  output_offset 
)

◆ run_comprehensive_benchmark()

void run_comprehensive_benchmark ( TransformerModel M)

◆ run_layernorm_benchmark_performance()

void run_layernorm_benchmark_performance ( TransformerModel M)

◆ run_layernorm_benchmark_precision_matched()

void run_layernorm_benchmark_precision_matched ( TransformerModel M)

◆ sample_token()

int sample_token ( float *  logits,
int  vocab_size,
float  temperature 
)

◆ test_attention_head_major_after_qkv()

void test_attention_head_major_after_qkv ( TransformerModel M)

◆ training_step()

void training_step ( TransformerModel M,
int32_t *  input_tokens,
int32_t *  target_tokens,
float  learning_rate 
)

◆ transformer_layer_forward()

void transformer_layer_forward ( TransformerModel M,
int  layer_idx,
size_t  layer_input_offset 
)

◆ update_all_weights_sgd()

void update_all_weights_sgd ( TransformerModel M,
float  learning_rate 
)

◆ zero_gradients()

void zero_gradients ( TransformerModel M)

Variable Documentation

◆ actual_tokens_offset

size_t GradientStorage::actual_tokens_offset

◆ aligned_attn_context_window

size_t TransformerModel::aligned_attn_context_window

context_window padded to prevent false sharing

◆ aligned_embed_dim

size_t TransformerModel::aligned_embed_dim

embed_dim rounded up to 64-byte alignment (in floats)

◆ aligned_head_dim

size_t TransformerModel::aligned_head_dim

head_dim rounded up to 64-byte alignment (in floats)

◆ attention_output_copy_offset

size_t LayerGradients::attention_output_copy_offset

◆ attention_output_offset

size_t TrulyOptimalLayer::attention_output_offset

◆ attention_scores_copy_offset

size_t LayerGradients::attention_scores_copy_offset

◆ attention_scores_offset

size_t TrulyOptimalLayer::attention_scores_offset

◆ attention_weights_copy_offset

size_t LayerGradients::attention_weights_copy_offset

◆ backprop_base

size_t GradientStorage::backprop_base

◆ checksum

uint8_t TransformerModel::checksum[32]

SHA256 checksum of weight file.

◆ context_window

int TransformerModel::context_window

Maximum sequence length (e.g., 1024)

◆ d_attention_output_offset

size_t LayerGradients::d_attention_output_offset

◆ d_attention_scores_offset

size_t LayerGradients::d_attention_scores_offset

◆ d_attention_weights_offset

size_t LayerGradients::d_attention_weights_offset

◆ d_embed_weights_offset

size_t GradientStorage::d_embed_weights_offset

◆ d_fc1_bias_offset

size_t LayerGradients::d_fc1_bias_offset

◆ d_fc1_output_offset

size_t LayerGradients::d_fc1_output_offset

◆ d_fc1_weights_offset

size_t LayerGradients::d_fc1_weights_offset

◆ d_fc2_bias_offset

size_t LayerGradients::d_fc2_bias_offset

◆ d_fc2_input_offset

size_t LayerGradients::d_fc2_input_offset

◆ d_fc2_weights_offset

size_t LayerGradients::d_fc2_weights_offset

◆ d_final_ln_beta_offset

size_t GradientStorage::d_final_ln_beta_offset

◆ d_final_ln_gamma_offset

size_t GradientStorage::d_final_ln_gamma_offset

◆ d_final_ln_input_offset

size_t GradientStorage::d_final_ln_input_offset

◆ d_final_output_offset

size_t GradientStorage::d_final_output_offset

◆ d_k_bias_offset

size_t LayerGradients::d_k_bias_offset

◆ d_k_output_offset

size_t LayerGradients::d_k_output_offset

◆ d_k_weights_offset

size_t LayerGradients::d_k_weights_offset

◆ d_ln1_beta_offset

size_t LayerGradients::d_ln1_beta_offset

◆ d_ln1_gamma_offset

size_t LayerGradients::d_ln1_gamma_offset

◆ d_ln1_input_offset

size_t LayerGradients::d_ln1_input_offset

◆ d_ln1_output_offset

size_t LayerGradients::d_ln1_output_offset

◆ d_ln2_beta_offset

size_t LayerGradients::d_ln2_beta_offset

◆ d_ln2_gamma_offset

size_t LayerGradients::d_ln2_gamma_offset

◆ d_ln2_input_offset

size_t LayerGradients::d_ln2_input_offset

◆ d_ln2_output_offset

size_t LayerGradients::d_ln2_output_offset

◆ d_logits_offset

size_t GradientStorage::d_logits_offset

◆ d_mlp_output_offset

size_t LayerGradients::d_mlp_output_offset

◆ d_pos_embed_offset

size_t GradientStorage::d_pos_embed_offset

◆ d_proj_bias_offset

size_t LayerGradients::d_proj_bias_offset

◆ d_proj_weights_offset

size_t LayerGradients::d_proj_weights_offset

◆ d_q_bias_offset

size_t LayerGradients::d_q_bias_offset

◆ d_q_output_offset

size_t LayerGradients::d_q_output_offset

◆ d_q_weights_offset

size_t LayerGradients::d_q_weights_offset

◆ d_residual1_offset

size_t LayerGradients::d_residual1_offset

◆ d_residual2_offset

size_t LayerGradients::d_residual2_offset

◆ d_v_bias_offset

size_t LayerGradients::d_v_bias_offset

◆ d_v_output_offset

size_t LayerGradients::d_v_output_offset

◆ d_v_weights_offset

size_t LayerGradients::d_v_weights_offset

◆ embed_dim

int TransformerModel::embed_dim

Embedding dimension (e.g., 768 for GPT-2 small)

◆ embedded_input_offset

size_t TransformerModel::embedded_input_offset

Combined token+pos embeddings [context_window × aligned_embed_dim].

◆ fc1_bias_copy_offset

size_t LayerGradients::fc1_bias_copy_offset

◆ fc1_bias_offset

size_t TrulyOptimalLayer::fc1_bias_offset

◆ fc1_output_copy_offset

size_t LayerGradients::fc1_output_copy_offset

◆ fc1_output_offset

size_t TrulyOptimalLayer::fc1_output_offset

◆ fc1_weight_offset

size_t TrulyOptimalLayer::fc1_weight_offset

◆ fc1_weights_copy_offset

size_t LayerGradients::fc1_weights_copy_offset

◆ fc2_bias_copy_offset

size_t LayerGradients::fc2_bias_copy_offset

◆ fc2_bias_offset

size_t TrulyOptimalLayer::fc2_bias_offset

◆ fc2_input_copy_offset

size_t LayerGradients::fc2_input_copy_offset

◆ fc2_weight_offset

size_t TrulyOptimalLayer::fc2_weight_offset

◆ fc2_weights_copy_offset

size_t LayerGradients::fc2_weights_copy_offset

◆ final_ln_beta_copy_offset

size_t GradientStorage::final_ln_beta_copy_offset

◆ final_ln_bias_offset

size_t TransformerModel::final_ln_bias_offset

Final LayerNorm beta [aligned_embed_dim].

◆ final_ln_gamma_copy_offset

size_t GradientStorage::final_ln_gamma_copy_offset

◆ final_ln_input_copy_offset

size_t GradientStorage::final_ln_input_copy_offset

◆ final_ln_mean_copy_offset

size_t GradientStorage::final_ln_mean_copy_offset

◆ final_ln_mean_offset

size_t TransformerModel::final_ln_mean_offset

Final LayerNorm mean [context_window].

◆ final_ln_rstd_copy_offset

size_t GradientStorage::final_ln_rstd_copy_offset

◆ final_ln_rstd_offset

size_t TransformerModel::final_ln_rstd_offset

Final LayerNorm rstd [context_window].

◆ final_ln_weight_offset

size_t TransformerModel::final_ln_weight_offset

Final LayerNorm gamma [aligned_embed_dim].

◆ final_output_copy_offset

size_t GradientStorage::final_output_copy_offset

◆ final_output_offset

size_t TransformerModel::final_output_offset

Final normalized output [context_window × aligned_embed_dim].

◆ gradients

GradientStorage TransformerModel::gradients

Gradient and activation cache memory (training only)

◆ head_dim

int TransformerModel::head_dim

Dimension per head: embed_dim / num_attention_heads.

◆ k_bias_copy_offset

size_t LayerGradients::k_bias_copy_offset

◆ k_bias_offset

size_t TrulyOptimalLayer::k_bias_offset

◆ k_output_copy_offset

size_t LayerGradients::k_output_copy_offset

◆ k_output_offset

size_t TrulyOptimalLayer::k_output_offset

◆ k_weight_offset

size_t TrulyOptimalLayer::k_weight_offset

◆ k_weights_copy_offset

size_t LayerGradients::k_weights_copy_offset

◆ layer_backprop_stride

size_t GradientStorage::layer_backprop_stride

◆ layer_end_canary_offset

size_t TrulyOptimalLayer::layer_end_canary_offset

◆ layer_input_offset

size_t TrulyOptimalLayer::layer_input_offset

◆ layer_start_canary_offset

size_t TrulyOptimalLayer::layer_start_canary_offset

◆ layer_stride

size_t TransformerModel::layer_stride

Byte offset between consecutive layer memory blocks.

◆ layers [1/2]

LayerGradients* GradientStorage::layers

◆ layers [2/2]

TrulyOptimalLayer* TransformerModel::layers

Array of per-layer offset structures.

◆ layers_start_offset

size_t TransformerModel::layers_start_offset

Start of first transformer layer memory.

◆ learning_rate

float TransformerModel::learning_rate

SGD learning rate for weight updates.

◆ lm_head_weight_offset

size_t TransformerModel::lm_head_weight_offset

Language model head (weight-tied to token_emb_offset)

◆ ln1_beta_copy_offset

size_t LayerGradients::ln1_beta_copy_offset

◆ ln1_bias_offset

size_t TrulyOptimalLayer::ln1_bias_offset

◆ ln1_gamma_copy_offset

size_t LayerGradients::ln1_gamma_copy_offset

◆ ln1_input_copy_offset

size_t LayerGradients::ln1_input_copy_offset

◆ ln1_mean_copy_offset

size_t LayerGradients::ln1_mean_copy_offset

◆ ln1_mean_offset

size_t TrulyOptimalLayer::ln1_mean_offset

◆ ln1_output_copy_offset

size_t LayerGradients::ln1_output_copy_offset

◆ ln1_output_offset

size_t TrulyOptimalLayer::ln1_output_offset

◆ ln1_rstd_copy_offset

size_t LayerGradients::ln1_rstd_copy_offset

◆ ln1_rstd_offset

size_t TrulyOptimalLayer::ln1_rstd_offset

◆ ln1_weight_offset

size_t TrulyOptimalLayer::ln1_weight_offset

◆ ln2_beta_copy_offset

size_t LayerGradients::ln2_beta_copy_offset

◆ ln2_bias_offset

size_t TrulyOptimalLayer::ln2_bias_offset

◆ ln2_gamma_copy_offset

size_t LayerGradients::ln2_gamma_copy_offset

◆ ln2_input_copy_offset

size_t LayerGradients::ln2_input_copy_offset

◆ ln2_mean_copy_offset

size_t LayerGradients::ln2_mean_copy_offset

◆ ln2_mean_offset

size_t TrulyOptimalLayer::ln2_mean_offset

◆ ln2_output_copy_offset

size_t LayerGradients::ln2_output_copy_offset

◆ ln2_output_offset

size_t TrulyOptimalLayer::ln2_output_offset

◆ ln2_rstd_copy_offset

size_t LayerGradients::ln2_rstd_copy_offset

◆ ln2_rstd_offset

size_t TrulyOptimalLayer::ln2_rstd_offset

◆ ln2_weight_offset

size_t TrulyOptimalLayer::ln2_weight_offset

◆ logits_copy_offset

size_t GradientStorage::logits_copy_offset

◆ logits_offset

size_t TransformerModel::logits_offset

Output logits [context_window × vocab_size].

◆ magic

char TransformerModel::magic[8]

Magic string "BUMPWGT2" for file validation.

◆ memory_base

float* TransformerModel::memory_base

Base pointer to single contiguous memory block.

◆ mlp_output_copy_offset

size_t LayerGradients::mlp_output_copy_offset

◆ mlp_output_offset

size_t TrulyOptimalLayer::mlp_output_offset

◆ model_type

uint32_t TransformerModel::model_type

Model architecture: 0=GPT2, 1=LLAMA, etc.

◆ num_attention_heads

int TransformerModel::num_attention_heads

Number of attention heads (e.g., 12 for GPT-2)

◆ num_cores

int TransformerModel::num_cores

Number of CPU cores to use (OpenMP threads)

◆ num_layers

int TransformerModel::num_layers

Number of transformer layers (e.g., 12 for GPT-2)

◆ pos_emb_offset

size_t TransformerModel::pos_emb_offset

Positional embedding table [context_window × aligned_embed_dim].

◆ proj_bias_copy_offset

size_t LayerGradients::proj_bias_copy_offset

◆ proj_bias_offset

size_t TrulyOptimalLayer::proj_bias_offset

◆ proj_weight_offset

size_t TrulyOptimalLayer::proj_weight_offset

◆ proj_weights_copy_offset

size_t LayerGradients::proj_weights_copy_offset

◆ q_bias_copy_offset

size_t LayerGradients::q_bias_copy_offset

◆ q_bias_offset

size_t TrulyOptimalLayer::q_bias_offset

◆ q_output_copy_offset

size_t LayerGradients::q_output_copy_offset

◆ q_output_offset

size_t TrulyOptimalLayer::q_output_offset

◆ q_weight_offset

size_t TrulyOptimalLayer::q_weight_offset

◆ q_weights_copy_offset

size_t LayerGradients::q_weights_copy_offset

◆ reserved

uint8_t TransformerModel::reserved[32]

Reserved for future extensions.

◆ residual1_copy_offset

size_t LayerGradients::residual1_copy_offset

◆ residual1_output_offset

size_t TrulyOptimalLayer::residual1_output_offset

◆ residual2_copy_offset

size_t LayerGradients::residual2_copy_offset

◆ residual2_output_offset

size_t TrulyOptimalLayer::residual2_output_offset

◆ token_emb_offset

size_t TransformerModel::token_emb_offset

Token embedding table [vocab_size × aligned_embed_dim].

◆ tokens_per_core

int TransformerModel::tokens_per_core

Tokens assigned per core: context_window / num_cores.

◆ total_floats

size_t TransformerModel::total_floats

Total size of memory block in float elements.

◆ total_gradient_floats

size_t GradientStorage::total_gradient_floats

◆ training_enabled

bool TransformerModel::training_enabled

Whether gradient storage is allocated.

◆ v_bias_copy_offset

size_t LayerGradients::v_bias_copy_offset

◆ v_bias_offset

size_t TrulyOptimalLayer::v_bias_offset

◆ v_output_copy_offset

size_t LayerGradients::v_output_copy_offset

◆ v_output_offset

size_t TrulyOptimalLayer::v_output_offset

◆ v_weight_offset

size_t TrulyOptimalLayer::v_weight_offset

◆ v_weights_copy_offset

size_t LayerGradients::v_weights_copy_offset

◆ version

uint32_t TransformerModel::version

Weight file format version.

◆ vocab_size

int TransformerModel::vocab_size

Vocabulary size (e.g., 50257 for GPT-2)