C-Transformer
Cache-Optimized Transformers in C (x86)
GEMM Kernels

Matrix multiplication implementations with different optimization strategies.

Macros

#define ATTN_SCORES_ACCESS(scores_ptr, h, i, j, aligned_context_window)    scores_ptr[((h) * (aligned_context_window) * (aligned_context_window)) + ((i) * (aligned_context_window)) + (j)]
 

Functions

void gemm_naive_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 Naive parallel GEMM implementation (reference baseline)
 
void gemm_avx512_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 AVX-512 optimized GEMM with vectorized inner loops.
 
void gemm_fine_grained_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 Cache-blocked GEMM with fine-grained parallelism.
 
void gemm_blocked_serial (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void layernorm_naive_serial (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_forward_rolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_forward_unrolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)
 
void layernorm_token_parallel (TransformerModel *M, size_t input_offset, size_t weight_offset, size_t bias_offset, size_t mean_cache_offset, size_t rstd_cache_offset, size_t output_offset, float eps)
 Token-parallel Layer Normalization with AVX-512 optimization.
 
void layernorm_naive_serial_matched_precision (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, float eps)
 
static void * aligned_alloc_64 (size_t size)
 
void debug_math_comparison (TransformerModel *M)
 
void run_layernorm_benchmark_precision_matched (TransformerModel *M)
 
void run_layernorm_benchmark_performance (TransformerModel *M)
 
static void qkv_micro_kernel_blocked_4x16_polished (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, float *__restrict Q_output_4, float *__restrict K_output_4, float *__restrict V_output_4, int embed_dim)
 
static void qkv_token_kernel_4x16_blocked_polished (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, float *__restrict Q_output, float *__restrict K_output, float *__restrict V_output, int embed_dim)
 
void qkv_projection (TransformerModel *M, size_t layer_idx)
 
static void qkv_micro_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx, int output_start_dim)
 
static void qkv_token_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx)
 
void qkv_projection_head_major (TransformerModel *M, int layer_idx)
 
double compare_arrays (const float *a, const float *b, size_t size, const char *name)
 
void convert_token_major_to_head_major_layer (const float *token_major_base, float *head_major_base, TransformerModel *M)
 
void benchmark_qkv_dual_comparison (TransformerModel *M)
 
void compute_attention_scores_head_major (TransformerModel *M, int layer_idx)
 
void apply_causal_softmax_head_major (TransformerModel *M, int layer_idx)
 
void compute_attention_output_head_major (TransformerModel *M, int layer_idx)
 
void attention_head_major_complete (TransformerModel *M, int layer_idx)
 Complete multi-head attention with head-major layout (self-attention)
 
void test_attention_head_major_after_qkv (TransformerModel *M)
 
void attention_projection_with_concat (TransformerModel *M, int layer_idx)
 Production attention projection with concat: Head-major → Token-major → GEMM.
 
void benchmark_attention_projection_complete (TransformerModel *M)
 
void add_gpt2_token_and_positional_embeddings (TransformerModel *M, size_t token_ids_offset, size_t output_offset)
 
void residual_add_token_parallel (TransformerModel *M, size_t input_offset, size_t residual_offset, size_t output_offset)
 
void gelu_activation_token_parallel (TransformerModel *M, size_t data_offset)
 
void mlp_token_parallel (TransformerModel *M, size_t input_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t fc1_output_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t output_offset)
 
void embed_tokens (TransformerModel *M, int32_t *token_ids, int num_tokens)
 
void compute_logits_last_token_optimized (TransformerModel *M, int position)
 
void transformer_layer_forward (TransformerModel *M, int layer_idx, size_t layer_input_offset)
 
void run_comprehensive_benchmark (TransformerModel *M)
 
int load_model_weights (TransformerModel *M, const char *weight_file)
 Load weights into already-allocated TransformerModel.
 
int read_model_metadata (TransformerModel *M, const char *weight_file)
 Read model metadata from weight file header.
 
int sample_token (float *logits, int vocab_size, float temperature)
 
void generate (TransformerModel *M, int *prompt, int prompt_len, int max_tokens)
 
void zero_gradients (TransformerModel *M)
 
void cache_forward_activations (TransformerModel *M)
 Copy forward-pass activations to gradient storage for the backward pass. This preserves the forward computations needed for gradient calculation.
 
void backward_residual_connection (TransformerModel *M, size_t d_output_offset, size_t d_input_offset, size_t d_transform_offset)
 
void backward_embedding_layer (TransformerModel *M)
 
void backward_final_layernorm (TransformerModel *M)
 
void backward_fc2 (TransformerModel *M, size_t d_output_offset, size_t fc2_input_copy_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_gelu (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset)
 
void backward_fc1 (TransformerModel *M, size_t d_output_offset, size_t fc1_input_copy_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_gelu_fast (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset)
 
void backward_layernorm (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t gamma_copy_offset, size_t beta_copy_offset, size_t mean_copy_offset, size_t rstd_copy_offset, size_t d_input_offset, size_t d_gamma_offset, size_t d_beta_offset)
 
void add_gradient (TransformerModel *M, size_t source_offset, size_t dest_offset)
 
void backward_attention_projection (TransformerModel *M, size_t d_output_offset, size_t attention_output_copy_offset, size_t proj_weight_offset, size_t proj_bias_offset, size_t d_attention_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_attention_weighted_values (TransformerModel *M, size_t d_output_offset, size_t attention_weights_offset, size_t v_output_offset, size_t d_weights_offset, size_t d_v_offset)
 
void backward_causal_softmax (TransformerModel *M, size_t d_scores_offset, size_t weights_copy_offset, size_t scores_copy_offset)
 
void backward_qk_matmul (TransformerModel *M, size_t d_scores_offset, size_t q_copy_offset, size_t k_copy_offset, size_t d_q_offset, size_t d_k_offset)
 
void backward_linear (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t weight_offset, size_t bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_lm_head (TransformerModel *M)
 
void backward_transformer_layer (TransformerModel *M, int layer_idx)
 
void compute_cross_entropy_loss (TransformerModel *M, int32_t *target_tokens, float *loss_out)
 Compute cross-entropy loss and gradients w.r.t logits.
 
void training_step (TransformerModel *M, int32_t *input_tokens, int32_t *target_tokens, float learning_rate)
 
void update_all_weights_sgd (TransformerModel *M, float learning_rate)
 
int main (int argc, char **argv)
 

Variables

size_t TrulyOptimalLayer::layer_start_canary_offset
 
size_t TrulyOptimalLayer::ln1_weight_offset
 
size_t TrulyOptimalLayer::ln1_bias_offset
 
size_t TrulyOptimalLayer::ln1_mean_offset
 
size_t TrulyOptimalLayer::ln1_rstd_offset
 
size_t TrulyOptimalLayer::layer_input_offset
 
size_t TrulyOptimalLayer::ln1_output_offset
 
size_t TrulyOptimalLayer::q_weight_offset
 
size_t TrulyOptimalLayer::q_bias_offset
 
size_t TrulyOptimalLayer::q_output_offset
 
size_t TrulyOptimalLayer::k_weight_offset
 
size_t TrulyOptimalLayer::k_bias_offset
 
size_t TrulyOptimalLayer::k_output_offset
 
size_t TrulyOptimalLayer::v_weight_offset
 
size_t TrulyOptimalLayer::v_bias_offset
 
size_t TrulyOptimalLayer::v_output_offset
 
size_t TrulyOptimalLayer::attention_scores_offset
 
size_t TrulyOptimalLayer::proj_weight_offset
 
size_t TrulyOptimalLayer::proj_bias_offset
 
size_t TrulyOptimalLayer::attention_output_offset
 
size_t TrulyOptimalLayer::residual1_output_offset
 
size_t TrulyOptimalLayer::ln2_weight_offset
 
size_t TrulyOptimalLayer::ln2_bias_offset
 
size_t TrulyOptimalLayer::ln2_mean_offset
 
size_t TrulyOptimalLayer::ln2_rstd_offset
 
size_t TrulyOptimalLayer::ln2_output_offset
 
size_t TrulyOptimalLayer::fc1_weight_offset
 
size_t TrulyOptimalLayer::fc1_bias_offset
 
size_t TrulyOptimalLayer::fc1_output_offset
 
size_t TrulyOptimalLayer::fc2_weight_offset
 
size_t TrulyOptimalLayer::fc2_bias_offset
 
size_t TrulyOptimalLayer::mlp_output_offset
 
size_t TrulyOptimalLayer::residual2_output_offset
 
size_t TrulyOptimalLayer::layer_end_canary_offset
 
size_t LayerGradients::residual2_copy_offset
 
size_t LayerGradients::d_residual2_offset
 
size_t LayerGradients::mlp_output_copy_offset
 
size_t LayerGradients::d_mlp_output_offset
 
size_t LayerGradients::fc2_input_copy_offset
 
size_t LayerGradients::fc2_weights_copy_offset
 
size_t LayerGradients::fc2_bias_copy_offset
 
size_t LayerGradients::d_fc2_input_offset
 
size_t LayerGradients::d_fc2_weights_offset
 
size_t LayerGradients::d_fc2_bias_offset
 
size_t LayerGradients::fc1_output_copy_offset
 
size_t LayerGradients::d_fc1_output_offset
 
size_t LayerGradients::ln2_output_copy_offset
 
size_t LayerGradients::fc1_weights_copy_offset
 
size_t LayerGradients::fc1_bias_copy_offset
 
size_t LayerGradients::d_ln2_output_offset
 
size_t LayerGradients::d_fc1_weights_offset
 
size_t LayerGradients::d_fc1_bias_offset
 
size_t LayerGradients::ln2_input_copy_offset
 
size_t LayerGradients::ln2_mean_copy_offset
 
size_t LayerGradients::ln2_rstd_copy_offset
 
size_t LayerGradients::ln2_gamma_copy_offset
 
size_t LayerGradients::ln2_beta_copy_offset
 
size_t LayerGradients::d_ln2_input_offset
 
size_t LayerGradients::d_ln2_gamma_offset
 
size_t LayerGradients::d_ln2_beta_offset
 
size_t LayerGradients::residual1_copy_offset
 
size_t LayerGradients::d_residual1_offset
 
size_t LayerGradients::attention_output_copy_offset
 
size_t LayerGradients::proj_weights_copy_offset
 
size_t LayerGradients::proj_bias_copy_offset
 
size_t LayerGradients::d_attention_output_offset
 
size_t LayerGradients::d_proj_weights_offset
 
size_t LayerGradients::d_proj_bias_offset
 
size_t LayerGradients::attention_weights_copy_offset
 
size_t LayerGradients::v_output_copy_offset
 
size_t LayerGradients::d_attention_weights_offset
 
size_t LayerGradients::d_v_output_offset
 
size_t LayerGradients::attention_scores_copy_offset
 
size_t LayerGradients::d_attention_scores_offset
 
size_t LayerGradients::q_output_copy_offset
 
size_t LayerGradients::k_output_copy_offset
 
size_t LayerGradients::d_q_output_offset
 
size_t LayerGradients::d_k_output_offset
 
size_t LayerGradients::ln1_output_copy_offset
 
size_t LayerGradients::q_weights_copy_offset
 
size_t LayerGradients::q_bias_copy_offset
 
size_t LayerGradients::k_weights_copy_offset
 
size_t LayerGradients::k_bias_copy_offset
 
size_t LayerGradients::v_weights_copy_offset
 
size_t LayerGradients::v_bias_copy_offset
 
size_t LayerGradients::d_ln1_output_offset
 
size_t LayerGradients::d_q_weights_offset
 
size_t LayerGradients::d_q_bias_offset
 
size_t LayerGradients::d_k_weights_offset
 
size_t LayerGradients::d_k_bias_offset
 
size_t LayerGradients::d_v_weights_offset
 
size_t LayerGradients::d_v_bias_offset
 
size_t LayerGradients::ln1_input_copy_offset
 
size_t LayerGradients::ln1_mean_copy_offset
 
size_t LayerGradients::ln1_rstd_copy_offset
 
size_t LayerGradients::ln1_gamma_copy_offset
 
size_t LayerGradients::ln1_beta_copy_offset
 
size_t LayerGradients::d_ln1_input_offset
 
size_t LayerGradients::d_ln1_gamma_offset
 
size_t LayerGradients::d_ln1_beta_offset
 
size_t GradientStorage::backprop_base
 
size_t GradientStorage::total_gradient_floats
 
size_t GradientStorage::logits_copy_offset
 
size_t GradientStorage::actual_tokens_offset
 
size_t GradientStorage::d_logits_offset
 
size_t GradientStorage::final_output_copy_offset
 
size_t GradientStorage::d_final_output_offset
 
size_t GradientStorage::d_embed_weights_offset
 
size_t GradientStorage::final_ln_input_copy_offset
 
size_t GradientStorage::final_ln_mean_copy_offset
 
size_t GradientStorage::final_ln_rstd_copy_offset
 
size_t GradientStorage::final_ln_gamma_copy_offset
 
size_t GradientStorage::final_ln_beta_copy_offset
 
size_t GradientStorage::d_final_ln_input_offset
 
size_t GradientStorage::d_final_ln_gamma_offset
 
size_t GradientStorage::d_final_ln_beta_offset
 
LayerGradients * GradientStorage::layers
 
size_t GradientStorage::d_pos_embed_offset
 
size_t GradientStorage::layer_backprop_stride
 
char TransformerModel::magic [8]
 Magic string "BUMPWGT2" for file validation.
 
uint32_t TransformerModel::version
 Weight file format version.
 
uint32_t TransformerModel::model_type
 Model architecture: 0=GPT2, 1=LLAMA, etc.
 
int TransformerModel::num_layers
 Number of transformer layers (e.g., 12 for GPT-2)
 
int TransformerModel::vocab_size
 Vocabulary size (e.g., 50257 for GPT-2)
 
int TransformerModel::embed_dim
 Embedding dimension (e.g., 768 for GPT-2 small)
 
int TransformerModel::context_window
 Maximum sequence length (e.g., 1024)
 
size_t TransformerModel::aligned_embed_dim
 embed_dim rounded up to 64-byte alignment (in floats)
 
size_t TransformerModel::aligned_head_dim
 head_dim rounded up to 64-byte alignment (in floats)
 
size_t TransformerModel::aligned_attn_context_window
 context_window padded to prevent false sharing
 
int TransformerModel::num_cores
 Number of CPU cores to use (OpenMP threads)
 
int TransformerModel::tokens_per_core
 Tokens assigned per core: context_window / num_cores.
 
int TransformerModel::num_attention_heads
 Number of attention heads (e.g., 12 for GPT-2)
 
int TransformerModel::head_dim
 Dimension per head: embed_dim / num_attention_heads.
 
float * TransformerModel::memory_base
 Base pointer to single contiguous memory block.
 
size_t TransformerModel::total_floats
 Total size of memory block in float elements.
 
size_t TransformerModel::layer_stride
 Byte offset between consecutive layer memory blocks.
 
size_t TransformerModel::token_emb_offset
 Token embedding table [vocab_size × aligned_embed_dim].
 
size_t TransformerModel::pos_emb_offset
 Positional embedding table [context_window × aligned_embed_dim].
 
size_t TransformerModel::embedded_input_offset
 Combined token+pos embeddings [context_window × aligned_embed_dim].
 
size_t TransformerModel::layers_start_offset
 Start of first transformer layer memory.
 
TrulyOptimalLayer * TransformerModel::layers
 Array of per-layer offset structures.
 
size_t TransformerModel::final_ln_weight_offset
 Final LayerNorm gamma [aligned_embed_dim].
 
size_t TransformerModel::final_ln_bias_offset
 Final LayerNorm beta [aligned_embed_dim].
 
size_t TransformerModel::final_ln_mean_offset
 Final LayerNorm mean [context_window].
 
size_t TransformerModel::final_ln_rstd_offset
 Final LayerNorm rstd [context_window].
 
size_t TransformerModel::final_output_offset
 Final normalized output [context_window × aligned_embed_dim].
 
size_t TransformerModel::lm_head_weight_offset
 Language model head (weight-tied to token_emb_offset)
 
size_t TransformerModel::logits_offset
 Output logits [context_window × vocab_size].
 
GradientStorage TransformerModel::gradients
 Gradient and activation cache memory (training only)
 
bool TransformerModel::training_enabled
 Whether gradient storage is allocated.
 
float TransformerModel::learning_rate
 SGD learning rate for weight updates.
 
uint8_t TransformerModel::checksum [32]
 SHA256 checksum of weight file.
 
uint8_t TransformerModel::reserved [32]
 Reserved for future extensions.
 

Detailed Description

Matrix multiplication implementations with different optimization strategies.

Macro Definition Documentation

◆ ATTN_SCORES_ACCESS

#define ATTN_SCORES_ACCESS (   scores_ptr,
  h,
  i,
  j,
  aligned_context_window 
)     scores_ptr[((h) * (aligned_context_window) * (aligned_context_window)) + ((i) * (aligned_context_window)) + (j)]
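For illustration, a minimal sketch that walks the causal region of every head's score matrix through this macro; the function and its parameters are hypothetical examples, and the equivalent manual index is shown in the comment.

/* Hypothetical example: zero the causal (j <= i) region of each head's scores. */
void zero_causal_scores_example(float *scores, int num_heads,
                                int aligned_context_window)
{
    for (int h = 0; h < num_heads; h++)
        for (int i = 0; i < aligned_context_window; i++)
            for (int j = 0; j <= i; j++)
                /* same as: scores[h*W*W + i*W + j] with W = aligned_context_window */
                ATTN_SCORES_ACCESS(scores, h, i, j, aligned_context_window) = 0.0f;
}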

Function Documentation

◆ add_gpt2_token_and_positional_embeddings()

void add_gpt2_token_and_positional_embeddings ( TransformerModel M,
size_t  token_ids_offset,
size_t  output_offset 
)

◆ add_gradient()

void add_gradient ( TransformerModel M,
size_t  source_offset,
size_t  dest_offset 
)

ADD GRADIENT (accumulate gradients from residual path)

This is used when gradients from multiple paths need to be summed. For example, at a residual connection, gradients flow through both:

  1. The transformation path (MLP or attention)
  2. The skip connection path

Both gradients need to be added together.
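For reference, a minimal sketch of the accumulation this entry describes, assuming the usual flat float arena addressed by memory_base plus per-tensor offsets; the count parameter is a hypothetical stand-in for the gradient's element count.

#include <stddef.h>

/* Sketch: dest += source, element by element (accumulate, never overwrite). */
void add_gradient_sketch(float *memory_base,
                         size_t source_offset,
                         size_t dest_offset,
                         size_t count)   /* hypothetical: number of floats */
{
    const float *src = memory_base + source_offset;
    float       *dst = memory_base + dest_offset;
    for (size_t i = 0; i < count; i++)
        dst[i] += src[i];
}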

◆ aligned_alloc_64()

static void * aligned_alloc_64 ( size_t  size)
inline static

◆ apply_causal_softmax_head_major()

void apply_causal_softmax_head_major ( TransformerModel M,
int  layer_idx 
)

◆ attention_head_major_complete()

void attention_head_major_complete ( TransformerModel M,
int  layer_idx 
)

Complete multi-head attention with head-major layout (self-attention)

Computes scaled dot-product attention using head-major memory layout for optimal cache locality. Each attention head operates independently, enabling head-level parallelism and cache-efficient processing.

Parameters
MTransformer model with memory layout and configuration
layer_idxLayer index for accessing Q/K/V tensors

Head-Level Parallelism Strategy: Unlike token-parallel operations (LayerNorm, GELU), attention parallelizes across HEADS because each head's computation is independent.

Memory Layout (Head-Major):

Q, K, V Tensors: [num_heads][context_window][head_dim]
Head 0: [Token0: 64f] [Token1: 64f] ... [TokenN: 64f]
Head 1: [Token0: 64f] [Token1: 64f] ... [TokenN: 64f]
...
Head 11: [Token0: 64f] [Token1: 64f] ... [TokenN: 64f]
Each head's data is CONTIGUOUS (no interleaving with other heads)

Why Head-Major Layout?:

  • Perfect Locality for Q·K^T: All data for one head fits in L2 cache
  • Attention Matrix in Cache: For 64-dim head, 1024 tokens:
    • Score matrix: 1024 × 1024 × 4 bytes = 4MB (fits in L3)
    • Per-head Q: 1024 × 64 × 4 bytes = 256KB (fits in L2)
    • Per-head K: 1024 × 64 × 4 bytes = 256KB (fits in L2)
  • No Strided Access: Sequential reads within each head
  • Head Parallelism: 12 independent heads = 12 parallel tasks

Three-Phase Attention Algorithm:

Phase 1: Compute Attention Scores (Q·K^T / √d_k)

For each head h in parallel:
For each query token i:
For each key token j (where j <= i for causal masking):
scores[h][i][j] = (Q[h][i] · K[h][j]) / sqrt(head_dim)
  • FLOPs: num_heads × T × (T+1)/2 × head_dim × 2
  • Memory: Streaming reads of Q and K
  • Cache: Each score row fits in L1 (1024 floats = 4 KB); the full per-head score matrix (1024 × 1024 floats = 4 MB) lives in L3

Phase 2: Causal Softmax

For each head h in parallel:
For each query token i:
scores[h][i][0:i+1] = softmax(scores[h][i][0:i+1])
scores[h][i][i+1:T] = 0 (causal mask)
  • Prevents attending to future tokens (autoregressive)
  • Row-wise softmax for numerical stability
  • FLOPs: ≈ num_heads × T × (T+1)/2 × 5 (max, subtract, exp, sum, divide)

Phase 3: Weighted Sum of Values (Softmax·V)

For each head h in parallel:
For each query token i:
output[h][i] = Σ_{j=0}^{i} scores[h][i][j] * V[h][j]
  • FLOPs: num_heads × T × (T+1)/2 × head_dim × 2
  • Produces per-head attention output in head-major layout

Stride Pattern Access: Access to Q[h][t][d]:

offset = h * (context_window * aligned_head_dim) +
t * aligned_head_dim +
d
  • aligned_head_dim ensures 64-byte alignment (prevents false sharing)
  • Sequential access within a head (hardware prefetcher friendly)
  • Each head occupies separate cache lines

Performance Characteristics:

  • Compute: O(num_heads × T² × head_dim)
  • Memory: O(num_heads × T²) for attention scores
  • Parallelism: Scales with min(num_heads, num_cores)
  • Cache: L2/L3 critical (must fit score matrix + Q/K/V for one head)

Comparison: Token-Parallel vs Head-Parallel:

Operation | Parallelism | Memory Pattern | Cache Footprint
LayerNorm | Token       | Sequential     | 3 KB per token
Attention | Head        | Strided        | 512 KB per head
MLP       | Token       | Sequential     | 3 KB per token

Why NOT Token-Parallel for Attention?:

  • Attention requires ALL token pairs (Q[i] · K[j] for all i,j)
  • Token parallelism would require synchronization at score matrix
  • Head-major layout allows independent head computation
Note
This function orchestrates all three attention phases
See also
compute_attention_scores_head_major Phase 1: Q·K^T
apply_causal_softmax_head_major Phase 2: Softmax with causal mask
compute_attention_output_head_major Phase 3: Attention·V
Q_ACCESS Head-major memory access macro

@performance Achieves 100-200 GFLOPS on attention computation (Xeon Gold 6248)
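To make the phase structure concrete, here is a minimal sketch of Phase 1 for a single head; the helper name and the flat Q/K/score pointers are illustrative assumptions, and the stride arithmetic simply follows the offset formula above.

#include <math.h>
#include <stddef.h>

/* Sketch of Phase 1 (Q·K^T / sqrt(d_k)) for one head, head-major layout:
   Q, K are this head's [context_window][aligned_head_dim] slices,
   scores is this head's [aligned_context_window][aligned_context_window] block. */
static void head_scores_sketch(const float *Q, const float *K, float *scores,
                               int context_window, int head_dim,
                               size_t aligned_head_dim,
                               size_t aligned_context_window)
{
    const float inv_sqrt_dk = 1.0f / sqrtf((float)head_dim);
    for (int i = 0; i < context_window; i++) {
        const float *q = Q + (size_t)i * aligned_head_dim;
        for (int j = 0; j <= i; j++) {              /* causal: keys j <= i only */
            const float *k = K + (size_t)j * aligned_head_dim;
            float dot = 0.0f;
            for (int d = 0; d < head_dim; d++)
                dot += q[d] * k[d];
            scores[(size_t)i * aligned_context_window + j] = dot * inv_sqrt_dk;
        }
    }
}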

◆ attention_projection_with_concat()

void attention_projection_with_concat ( TransformerModel M,
int  layer_idx 
)

Production attention projection with concat: Head-major → Token-major → GEMM.

This function implements the concat strategy that proved 3.2x faster than direct head-major projection on hyperthreaded systems.

Parameters
MTransformer model
layer_idxLayer index to process

Memory flow:

  1. Input: Head-major attention [head][token][head_dim]
  2. Concat: Convert to token-major contiguous [token][embed_dim]
  3. GEMM: Standard matrix multiplication (proven 100 GFLOPS)
  4. Output: Token-major projection result [token][embed_dim]
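A minimal sketch of step 2 (the concat from head-major to token-major), under the assumption of the layouts listed above; the function and buffer names are illustrative, not the ones used in the code.

#include <stddef.h>

/* Sketch: gather each token's per-head slices into one contiguous
   embed_dim-wide row, ready for a standard token-major GEMM. */
static void concat_heads_sketch(const float *head_major,   /* [head][token][aligned_head_dim] */
                                float *token_major,        /* [token][embed_dim] scratch      */
                                int num_heads, int context_window,
                                int head_dim, size_t aligned_head_dim)
{
    for (int t = 0; t < context_window; t++)
        for (int h = 0; h < num_heads; h++) {
            const float *src = head_major +
                ((size_t)h * context_window + t) * aligned_head_dim;
            float *dst = token_major + (size_t)t * (num_heads * head_dim)
                                     + (size_t)h * head_dim;
            for (int d = 0; d < head_dim; d++)
                dst[d] = src[d];
        }
}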

◆ backward_attention_projection()

void backward_attention_projection ( TransformerModel M,
size_t  d_output_offset,
size_t  attention_output_copy_offset,
size_t  proj_weight_offset,
size_t  proj_bias_offset,
size_t  d_attention_offset,
size_t  d_weight_offset,
size_t  d_bias_offset 
)

BACKWARD THROUGH ATTENTION OUTPUT PROJECTION

Forward: output[T×D] = attention[T×D] @ W_proj[D×D] + b_proj[D]

This is after concatenating all heads back to [T×D] format

Backward computes:

  1. d_attention[T×D] = d_output[T×D] @ W_proj^T[D×D]
  2. d_W_proj[D×D] = attention^T[D×T] @ d_output[T×D]
  3. d_b_proj[D] = sum(d_output) over T

◆ backward_attention_weighted_values()

void backward_attention_weighted_values ( TransformerModel M,
size_t  d_output_offset,
size_t  attention_weights_offset,
size_t  v_output_offset,
size_t  d_weights_offset,
size_t  d_v_offset 
)

BACKWARD THROUGH ATTENTION WEIGHTED VALUES

Forward: attention_output[h,t,d] = sum_over_s(attention_weights[h,t,s] * V[h,s,d])

This operates in HEAD-MAJOR layout

Backward computes:

  1. d_attention_weights[h,t,s] = sum_over_d(d_output[h,t,d] * V[h,s,d])
  2. d_V[h,s,d] = sum_over_t(attention_weights[h,t,s] * d_output[h,t,d])

◆ backward_causal_softmax()

void backward_causal_softmax ( TransformerModel M,
size_t  d_scores_offset,
size_t  weights_copy_offset,
size_t  scores_copy_offset 
)

BACKWARD THROUGH CAUSAL SOFTMAX

Forward: attention_weights[h,i,j] = softmax(scores[h,i,:]) with causal mask

Softmax Jacobian for row i:
∂softmax[i,j]/∂score[i,k] = softmax[i,j] * (δ[j,k] - softmax[i,k])
where δ[j,k] is the Kronecker delta (1 if j == k, 0 otherwise)

For a row, if y = softmax(x), then: dx = y * (dy - dot(y, dy))

Causal mask means we only compute for j <= i
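The per-row rule dx = y * (dy - dot(y, dy)) maps directly onto a short loop. A sketch for one causal row follows; the names are illustrative, not the actual kernel.

/* Sketch: softmax backward for a single causal row.
   y  = softmax output (attention weights) for row i, entries 0..i
   dy = incoming gradient for that row
   dx = gradient w.r.t. the pre-softmax scores */
static void softmax_row_backward_sketch(const float *y, const float *dy,
                                        float *dx, int row_len /* = i + 1 */)
{
    float dot = 0.0f;
    for (int j = 0; j < row_len; j++)
        dot += y[j] * dy[j];                 /* dot(y, dy) */
    for (int j = 0; j < row_len; j++)
        dx[j] = y[j] * (dy[j] - dot);        /* y * (dy - dot(y, dy)) */
    /* entries beyond row_len stay zero: the causal mask blocks j > i */
}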

◆ backward_embedding_layer()

void backward_embedding_layer ( TransformerModel M)

◆ backward_fc1()

void backward_fc1 ( TransformerModel M,
size_t  d_output_offset,
size_t  fc1_input_copy_offset,
size_t  fc1_weight_offset,
size_t  fc1_bias_offset,
size_t  d_input_offset,
size_t  d_weight_offset,
size_t  d_bias_offset 
)

BACKWARD THROUGH FC1 (Feed-Forward Layer 1)

FORWARD PASS (for reference):
Input:  fc1_input [T × D]        (output from LayerNorm2)
Weight: W_fc1 [D × 4D]           (projects from D to 4D)
Bias:   b_fc1 [4D]
Output: fc1_output [T × 4D] = fc1_input @ W_fc1 + b_fc1

BACKWARD PASS:
Given: d_output [T × 4D] (gradient from GELU backward)

Need to compute:

  1. d_input [T × D] = d_output @ W_fc1^T
  2. d_W_fc1 [D × 4D] = fc1_input^T @ d_output (accumulated)
  3. d_b_fc1 [4D] = sum over T of d_output (accumulated)

DIMENSION FLOW:
d_output[T×4D] ──> @ W_fc1^T[4D×D]      ──> d_input[T×D]
d_output[T×4D] ──> fc1_input^T[D×T] @   ──> d_W_fc1[D×4D]
d_output[T×4D] ──> sum over T           ──> d_b_fc1[4D]

HPC CONSIDERATIONS:

  • FC1 expands dimensions (D -> 4D), so weight matrix is large
  • Memory bandwidth critical for weight gradient accumulation
  • Consider chunking for better cache reuse
  • Token parallelism for d_input computation

◆ backward_fc2()

void backward_fc2 ( TransformerModel M,
size_t  d_output_offset,
size_t  fc2_input_copy_offset,
size_t  fc2_weight_offset,
size_t  fc2_bias_offset,
size_t  d_input_offset,
size_t  d_weight_offset,
size_t  d_bias_offset 
)

BACKWARD THROUGH FC2 (Feed-Forward Layer 2)

FORWARD PASS (for reference):
Input:  fc2_input [T × 4D]       (after GELU activation)
Weight: W_fc2 [4D × D]           (projects from 4D back to D)
Bias:   b_fc2 [D]
Output: fc2_output [T × D] = fc2_input @ W_fc2 + b_fc2

BACKWARD PASS:
Given: d_output [T × D] (gradient from the residual connection)

Need to compute:

  1. d_input [T × 4D] = d_output @ W_fc2^T
  2. d_W_fc2 [4D × D] = fc2_input^T @ d_output (accumulated)
  3. d_b_fc2 [D] = sum over T of d_output (accumulated)

DIMENSION FLOW:
d_output[T×D] ──> @ W_fc2^T[D×4D]       ──> d_input[T×4D]
d_output[T×D] ──> fc2_input^T[4D×T] @   ──> d_W_fc2[4D×D]
d_output[T×D] ──> sum over T            ──> d_b_fc2[D]

HPC CONSIDERATIONS:

  • Token-parallel for d_input computation (each thread handles tokens)
  • Reduction required for weight/bias gradients (atomic ops or local accumulation)
  • Memory bandwidth bound due to large weight matrix (4D×D)
  • Cache blocking beneficial for weight gradient accumulation

◆ backward_final_layernorm()

void backward_final_layernorm ( TransformerModel M)

◆ backward_gelu()

void backward_gelu ( TransformerModel M,
size_t  d_output_offset,
size_t  input_copy_offset,
size_t  d_input_offset 
)

BACKWARD THROUGH GELU ACTIVATION

FORWARD PASS (for reference):
GELU(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))

Approximation used in practice:
GELU(x) ≈ 0.5 * x * (1 + tanh(0.7978845608 * (x + 0.044715 * x³)))

BACKWARD PASS (derivative):
d/dx[GELU(x)] = 0.5 * tanh(g(x)) + 0.5 * x * sech²(g(x)) * g'(x) + 0.5
where: g(x)  = 0.7978845608 * (x + 0.044715 * x³)
       g'(x) = 0.7978845608 * (1 + 3 * 0.044715 * x²)

Simplified form:
GELU'(x) = 0.5 * (1 + tanh(g(x))) + 0.5 * x * sech²(g(x)) * g'(x)
         = 0.5 * (1 + tanh(g(x))) + 0.5 * x * (1 - tanh²(g(x))) * g'(x)

DIMENSION FLOW:
d_output [T × 4D] ──> × GELU'(input) ──> d_input [T × 4D]

HPC CONSIDERATIONS:

  • Element-wise operation (embarrassingly parallel)
  • Compute bound (tanh is expensive)
  • Can fuse with surrounding operations for better cache usage
  • Consider using fast tanh approximations for speed
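A scalar sketch of the derivative above, using the same constants; the function name is illustrative, and the real kernel may vectorize or approximate tanh.

#include <math.h>
#include <stddef.h>

/* Sketch: elementwise GELU backward, d_input = d_output * GELU'(x). */
static void gelu_backward_sketch(const float *x, const float *d_output,
                                 float *d_input, size_t n)
{
    const float c = 0.7978845608f;           /* sqrt(2/pi) */
    const float a = 0.044715f;
    for (size_t i = 0; i < n; i++) {
        float xi    = x[i];
        float g     = c * (xi + a * xi * xi * xi);
        float gp    = c * (1.0f + 3.0f * a * xi * xi);
        float t     = tanhf(g);
        float dgelu = 0.5f * (1.0f + t) + 0.5f * xi * (1.0f - t * t) * gp;
        d_input[i]  = d_output[i] * dgelu;
    }
}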

◆ backward_gelu_fast()

void backward_gelu_fast ( TransformerModel M,
size_t  d_output_offset,
size_t  input_copy_offset,
size_t  d_input_offset 
)

Alternative: fast approximation using a precomputed GELU derivative. This version trades accuracy for speed by using a simpler approximation.

◆ backward_layernorm()

void backward_layernorm ( TransformerModel M,
size_t  d_output_offset,
size_t  input_copy_offset,
size_t  gamma_copy_offset,
size_t  beta_copy_offset,
size_t  mean_copy_offset,
size_t  rstd_copy_offset,
size_t  d_input_offset,
size_t  d_gamma_offset,
size_t  d_beta_offset 
)

BACKWARD THROUGH LAYERNORM

FORWARD (for reference):
x_hat = (x - mean) * rstd        (rstd = reciprocal standard deviation)
y     = gamma * x_hat + beta

BACKWARD: Given d_y, compute d_x, d_gamma, d_beta

The math (per token):
d_x     = (rstd / D) * [D * d_y * gamma - sum(d_y * gamma) - x_hat * sum(d_y * gamma * x_hat)]
d_gamma = sum_over_tokens(d_y * x_hat)
d_beta  = sum_over_tokens(d_y)
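A per-token sketch of these three formulas, assuming one token's row of D values and caller-zeroed d_gamma/d_beta accumulators; the names are illustrative.

/* Sketch: LayerNorm backward for one token of width D.
   x, mean, rstd are cached forward values; d_gamma/d_beta accumulate over tokens. */
static void layernorm_token_backward_sketch(const float *x, const float *d_y,
                                            const float *gamma,
                                            float mean, float rstd,
                                            float *d_x, float *d_gamma,
                                            float *d_beta, int D)
{
    float sum_dyg = 0.0f, sum_dyg_xhat = 0.0f;
    for (int i = 0; i < D; i++) {
        float xhat = (x[i] - mean) * rstd;
        float dyg  = d_y[i] * gamma[i];
        sum_dyg      += dyg;
        sum_dyg_xhat += dyg * xhat;
        d_gamma[i]   += d_y[i] * xhat;       /* accumulated across tokens */
        d_beta[i]    += d_y[i];
    }
    for (int i = 0; i < D; i++) {
        float xhat = (x[i] - mean) * rstd;
        float dyg  = d_y[i] * gamma[i];
        d_x[i] = (rstd / (float)D) *
                 ((float)D * dyg - sum_dyg - xhat * sum_dyg_xhat);
    }
}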

◆ backward_linear()

void backward_linear ( TransformerModel M,
size_t  d_output_offset,
size_t  input_copy_offset,
size_t  weight_offset,
size_t  bias_offset,
size_t  d_input_offset,
size_t  d_weight_offset,
size_t  d_bias_offset 
)

BACKWARD THROUGH LINEAR LAYER (GENERIC)

Forward: output = input @ W + bias

This handles the Q, K, and V projections, which are all linear layers. Note: for QKV, the output is in head-major format while the input is token-major.

Backward:
d_input += d_output @ W^T   (accumulate because Q, K, and V all contribute)
d_W     += input^T @ d_output
d_bias  += sum(d_output)
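A naive sketch of the three accumulations for the forward shape output[T×OUT] = input[T×IN] @ W[IN×OUT] + bias; all names and the row-major layout are illustrative assumptions.

#include <stddef.h>

/* Sketch: generic linear-layer backward.
   d_input += d_output @ W^T, d_W += input^T @ d_output,
   d_bias += column sums of d_output. */
static void linear_backward_sketch(const float *input, const float *W,
                                   const float *d_output,
                                   float *d_input, float *d_W, float *d_bias,
                                   int T, int IN, int OUT)
{
    for (int t = 0; t < T; t++) {
        const float *dout = d_output + (size_t)t * OUT;
        const float *in   = input    + (size_t)t * IN;
        float       *din  = d_input  + (size_t)t * IN;
        for (int o = 0; o < OUT; o++) {
            float g = dout[o];
            d_bias[o] += g;
            for (int i = 0; i < IN; i++) {
                din[i]                   += g * W[(size_t)i * OUT + o];
                d_W[(size_t)i * OUT + o] += g * in[i];
            }
        }
    }
}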

◆ backward_lm_head()

void backward_lm_head ( TransformerModel M)

◆ backward_qk_matmul()

void backward_qk_matmul ( TransformerModel M,
size_t  d_scores_offset,
size_t  q_copy_offset,
size_t  k_copy_offset,
size_t  d_q_offset,
size_t  d_k_offset 
)

BACKWARD THROUGH Q @ K^T

Forward: scores[h,i,j] = sum_d(Q[h,i,d] * K[h,j,d]) / sqrt(head_dim)

Backward:
d_Q[h,i,d] = sum_j(d_scores[h,i,j] * K[h,j,d]) / sqrt(head_dim)
d_K[h,j,d] = sum_i(d_scores[h,i,j] * Q[h,i,d]) / sqrt(head_dim)

Note: Causal mask means d_scores[h,i,j] = 0 for j > i

◆ backward_residual_connection()

void backward_residual_connection ( TransformerModel M,
size_t  d_output_offset,
size_t  d_input_offset,
size_t  d_transform_offset 
)

RESIDUAL CONNECTION - FORWARD & BACKWARD PROPAGATION

CONCEPT: A residual connection (skip connection) allows gradients to flow directly through the network by adding the input to the output of a transformation. This addresses the vanishing gradient problem in deep networks.

FORWARD PASS:
input x feeds two paths: an identity (skip) path and a transform path F(x); the two are summed, so output = F(x) + x

Mathematical form: output = input + transform(input)

In transformers specifically:
output = input + MultiHeadAttention(LayerNorm(input))
output = input + FFN(LayerNorm(input))

BACKWARD PASS:

Given: d_output = ∂L/∂output (gradient from the layer above)
Need:  d_input = ∂L/∂input and d_transform = ∂L/∂transform

Since output = input + transform, by the chain rule:
∂output/∂input = 1     (derivative of input w.r.t itself)
∂output/∂transform = 1 (derivative of transform w.r.t itself)

Therefore:
d_input     = d_output × 1 = d_output
d_transform = d_output × 1 = d_output

BACKWARD FLOW:
d_output flows unchanged into both d_transform (toward the transformation) and d_input (directly through the skip connection).

KEY INSIGHT: The gradient d_output flows EQUALLY through both paths:

  1. Through the transformation (to update its parameters)
  2. Directly to the input (skip connection)

This is why residual connections help with vanishing gradients: even if the transformation has small gradients, the skip path ensures gradients can flow directly to earlier layers.

IMPLEMENTATION NOTES:

  • Both gradients receive the SAME value (d_output)
  • Use += (accumulation) not = (assignment) in case gradients already exist from other paths
  • This simple operation is crucial for training deep networks
Parameters
MTransformer model
d_output_offsetGradient from the layer above
d_input_offsetWhere to accumulate gradient for input path
d_transform_offsetWhere to accumulate gradient for transform path

◆ backward_transformer_layer()

void backward_transformer_layer ( TransformerModel M,
int  layer_idx 
)

◆ benchmark_attention_projection_complete()

void benchmark_attention_projection_complete ( TransformerModel M)

◆ benchmark_qkv_dual_comparison()

void benchmark_qkv_dual_comparison ( TransformerModel M)

◆ cache_forward_activations()

void cache_forward_activations ( TransformerModel M)

Copy forward-pass activations to gradient storage for the backward pass. This preserves the forward computations needed for gradient calculation.

◆ compare_arrays()

double compare_arrays ( const float *  a,
const float *  b,
size_t  size,
const char *  name 
)

◆ compute_attention_output_head_major()

void compute_attention_output_head_major ( TransformerModel M,
int  layer_idx 
)

◆ compute_attention_scores_head_major()

void compute_attention_scores_head_major ( TransformerModel M,
int  layer_idx 
)

◆ compute_cross_entropy_loss()

void compute_cross_entropy_loss ( TransformerModel M,
int32_t *  target_tokens,
float *  loss_out 
)

Compute cross-entropy loss and gradients w.r.t logits.

Loss = -sum(log(p[correct])) / context_length

Gradient:
dL/dlogit[i] = p[i] - 1   (for the correct token)
dL/dlogit[i] = p[i]       (for all other tokens)
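A sketch for one position, assuming the logits have already been converted to probabilities p by a softmax; the caller would average the returned values over context_length. Names are illustrative.

#include <math.h>

/* Sketch: cross-entropy for one position given softmax probabilities p.
   Returns -log(p[target]) and writes dL/dlogit into d_logits. */
static float cross_entropy_position_sketch(const float *p, int vocab_size,
                                           int target, float *d_logits)
{
    for (int i = 0; i < vocab_size; i++)
        d_logits[i] = p[i];                 /* dL/dlogit[i] = p[i]        */
    d_logits[target] -= 1.0f;               /* ... minus 1 for the target */
    return -logf(p[target] + 1e-30f);       /* tiny epsilon guards log(0) */
}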

◆ compute_logits_last_token_optimized()

void compute_logits_last_token_optimized ( TransformerModel M,
int  position 
)

◆ convert_token_major_to_head_major_layer()

void convert_token_major_to_head_major_layer ( const float *  token_major_base,
float *  head_major_base,
TransformerModel M 
)

◆ debug_math_comparison()

void debug_math_comparison ( TransformerModel M)

◆ embed_tokens()

void embed_tokens ( TransformerModel M,
int32_t *  token_ids,
int  num_tokens 
)

◆ gelu_activation_token_parallel()

void gelu_activation_token_parallel ( TransformerModel M,
size_t  data_offset 
)

◆ gemm_avx512_parallel()

void gemm_avx512_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

AVX-512 optimized GEMM with vectorized inner loops.

Parameters
AInput matrix A [M x K]
BInput matrix B [N x K] (transposed)
biasBias vector [N]
COutput matrix C [M x N]
MNumber of rows in A and C
NNumber of columns in B and C
KInner dimension

@performance

  • Target: 200-400 GFLOPS on modern Xeon
  • 16-wide SIMD operations
  • FMA instruction utilization

@optimization_details

  • Uses _mm512_fmadd_ps for 2 FLOPs per element (32 FLOPs per 16-wide instruction)
  • 16-element vectorization of inner loop
  • Handles remainder elements with scalar code
See also
gemm_naive_parallel for reference implementation
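A sketch of the kind of 16-wide FMA inner loop described above, for the documented B-transposed layout (C = A·Bᵀ + bias); it omits the OpenMP parallelism, bias, and remainder handling, so it is an illustration rather than the actual kernel.

#include <immintrin.h>

/* Sketch: one (i, j) dot product of length K, vectorized 16 floats at a time.
   A is [M×K] row-major, B is [N×K] (transposed), so row i of A meets row j of B.
   Assumes K is a multiple of 16 for brevity. */
static float dot_avx512_sketch(const float *a_row, const float *b_row, int K)
{
    __m512 acc = _mm512_setzero_ps();
    for (int k = 0; k < K; k += 16) {
        __m512 av = _mm512_loadu_ps(a_row + k);
        __m512 bv = _mm512_loadu_ps(b_row + k);
        acc = _mm512_fmadd_ps(av, bv, acc);      /* acc += av * bv */
    }
    return _mm512_reduce_add_ps(acc);            /* horizontal sum */
}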

◆ gemm_blocked_serial()

void gemm_blocked_serial ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

◆ gemm_fine_grained_parallel()

void gemm_fine_grained_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Cache-blocked GEMM with fine-grained parallelism.

Parameters
AInput matrix A [M x K]
BInput matrix B [N x K] (transposed)
biasBias vector [N]
COutput matrix C [M x N]
MNumber of rows in A and C
NNumber of columns in B and C
KInner dimension

@performance

  • Target: 300-500 GFLOPS
  • Best performance for large matrices
  • Optimal cache utilization

@implementation_notes

  • 64x64 blocking for L1 cache optimization
  • Collapse(3) OpenMP directive for maximum parallelism
  • Atomic updates for thread safety

@benchmark_results Tested on 8192x8192 matrices:

  • Naive: 85 GFLOPS
  • This impl: 474 GFLOPS (5.6x speedup)
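A sketch of 64×64 blocking with a collapse(3) OpenMP loop and atomic accumulation, in the spirit of the notes above; edge handling and the exact accumulation strategy are simplified assumptions.

#include <stddef.h>

#define BLK 64   /* 64×64 blocking for L1, as noted above */

/* Sketch: blocked GEMM, C[M×N] += A[M×K] · B[N×K]^T.
   Caller initializes C with the bias (or zeros) beforehand. Because two
   k-blocks touch the same C tile, the update uses an atomic add. */
static void gemm_blocked_sketch(const float *A, const float *B, float *C,
                                int M, int N, int K)
{
    #pragma omp parallel for collapse(3)
    for (int ib = 0; ib < M; ib += BLK)
        for (int jb = 0; jb < N; jb += BLK)
            for (int kb = 0; kb < K; kb += BLK)
                for (int i = ib; i < ib + BLK && i < M; i++)
                    for (int j = jb; j < jb + BLK && j < N; j++) {
                        float sum = 0.0f;
                        for (int k = kb; k < kb + BLK && k < K; k++)
                            sum += A[(size_t)i * K + k] * B[(size_t)j * K + k];
                        #pragma omp atomic
                        C[(size_t)i * N + j] += sum;
                    }
}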

◆ gemm_naive_parallel()

void gemm_naive_parallel ( const float *  A,
const float *  B,
const float *  bias,
float *  C,
int  M,
int  N,
int  K 
)

Naive parallel GEMM implementation (reference baseline)

Parameters
AInput matrix A [M x K]
BInput matrix B [N x K] (transposed)
biasBias vector [N]
COutput matrix C [M x N]
MNumber of rows in A and C
NNumber of columns in B and C
KInner dimension (columns of A, rows of B)

@performance

  • Baseline performance: ~50-100 GFLOPS
  • Used as reference for accuracy validation
  • Simple OpenMP parallelization
Note
This is the golden reference - all other implementations are validated against this
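For orientation, a sketch of what such a baseline typically looks like for the documented layout (B stored [N × K], one OpenMP loop over output rows); this is an illustration, not the exact source.

#include <stddef.h>

/* Sketch: C[M×N] = A[M×K] · B[N×K]^T + bias[N], row-parallel. */
static void gemm_naive_sketch(const float *A, const float *B, const float *bias,
                              float *C, int M, int N, int K)
{
    #pragma omp parallel for
    for (int i = 0; i < M; i++)
        for (int j = 0; j < N; j++) {
            float sum = bias ? bias[j] : 0.0f;
            for (int k = 0; k < K; k++)
                sum += A[(size_t)i * K + k] * B[(size_t)j * K + k];
            C[(size_t)i * N + j] = sum;
        }
}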

◆ generate()

void generate ( TransformerModel M,
int *  prompt,
int  prompt_len,
int  max_tokens 
)

◆ layernorm_forward_rolled_slice()

void layernorm_forward_rolled_slice ( const float *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
float *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

◆ layernorm_forward_unrolled_slice()

void layernorm_forward_unrolled_slice ( const float *__restrict  input_slice_base,
const float *__restrict  gamma,
const float *__restrict  beta,
float *__restrict  output_slice_base,
float *__restrict  mean_cache_slice,
float *__restrict  rstd_cache_slice,
int  num_tokens_in_slice,
int  d_model,
float  eps 
)

◆ layernorm_naive_serial()

void layernorm_naive_serial ( const float *  input,
const float *  gamma,
const float *  beta,
float *  output,
float *  mean_cache,
float *  rstd_cache,
int  tokens,
int  d_model,
int  aligned_embed_dim,
float  eps 
)

◆ layernorm_naive_serial_matched_precision()

void layernorm_naive_serial_matched_precision ( const float *  input,
const float *  gamma,
const float *  beta,
float *  output,
float *  mean_cache,
float *  rstd_cache,
int  tokens,
int  d_model,
float  eps 
)

◆ layernorm_token_parallel()

void layernorm_token_parallel ( TransformerModel M,
size_t  input_offset,
size_t  weight_offset,
size_t  bias_offset,
size_t  mean_cache_offset,
size_t  rstd_cache_offset,
size_t  output_offset,
float  eps 
)

Token-parallel Layer Normalization with AVX-512 optimization.

Performs Layer Normalization across tokens using token-level parallelism. Each CPU core processes a contiguous slice of tokens independently, achieving perfect cache locality and zero synchronization overhead.

Parameters
MTransformer model containing memory layout and parallelism config
input_offsetOffset to input tensor [context_window × aligned_embed_dim]
weight_offsetOffset to gamma weights [aligned_embed_dim]
bias_offsetOffset to beta biases [aligned_embed_dim]
mean_cache_offsetOffset to mean cache [context_window] (for backward pass)
rstd_cache_offsetOffset to rstd cache [context_window] (for backward pass)
output_offsetOffset to output tensor [context_window × aligned_embed_dim]
epsEpsilon for numerical stability (typically 1e-5)

Token-Level Parallelism Strategy:

Memory Layout (Token-Major):
┌──────────────┬──────────────┬──────────────┬──────────────┐
│ Token 0 │ Token 1 │ Token 2 │ Token 3 │
│ [768 floats] │ [768 floats] │ [768 floats] │ [768 floats] │
└──────────────┴──────────────┴──────────────┴──────────────┘
│<─ Core 0 ──>│<─ Core 1 ──>│<─ Core 2 ──>│<─ Core 3 ──>│

Why Token-Parallel?:

  • Perfect Locality: Each token's data (768 floats) is contiguous in memory
  • Zero Sync: Tokens are independent, no barriers or atomics needed
  • Cache Efficiency: Each core streams through sequential memory
  • Linear Scaling: Speedup = num_cores (measured 7.8x on 8 cores)

Memory Access Pattern (per core):

Core 0 processes tokens [0, tokens_per_core):
- Read: input[0*768], input[1*768], ..., input[N*768] (sequential)
- Write: output[0*768], output[1*768], ..., output[N*768] (sequential)
- Gamma/Beta: Shared read-only (broadcast to all cores)

Algorithm (per token):

  1. Pass 1: Compute mean across embed_dim using AVX-512
  2. Pass 2: Compute variance using FMA (fused multiply-add)
  3. Pass 3: Normalize, scale by gamma, shift by beta

AVX-512 Optimization:

  • Processes 16 floats per instruction (4x16 unrolling)
  • Uses FMA for variance: acc = diff * diff + acc (2 FLOPs per element per instruction)
  • Aligned loads: _mm512_load_ps (requires 64-byte alignment)
  • Prefetching: Hints to load next cache line while computing current

Performance Characteristics:

  • Compute: 9 * embed_dim FLOPs per token
  • Memory: 3 * embed_dim reads + embed_dim writes per token
  • Bandwidth: Achieves 50-100 GB/s per core (streaming bandwidth)
  • Latency: ~5 μs per token on modern Xeon (768-dim, 3.0 GHz)

Cache Behavior:

  • L1 Data Cache: Holds ~4 tokens (768 floats = 3KB per token, 32KB L1)
  • L2 Cache: Holds ~80 tokens (256KB L2)
  • Prefetcher: Detects sequential pattern, hides DRAM latency

Why Aligned Embed Dim?: Padding to 64-byte boundaries ensures:

  • No false sharing between cores writing adjacent tokens
  • Aligned SIMD loads (faster than unaligned)
  • Clean cache line ownership (no partial cache line reads)
Note
This is a core building block used in every transformer layer
See also
layernorm_forward_rolled_slice Per-core slice processing kernel
TrulyOptimalLayer For offset definitions within a layer

@performance Measured 7.8x speedup on 8-core Xeon vs serial baseline
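A scalar per-token sketch of the three-pass algorithm above (mean, variance, normalize), without the AVX-512 vectorization or the aligned-stride handling; names are illustrative.

#include <math.h>

/* Sketch: LayerNorm for one token row of d_model values.
   Caches mean and rstd for the backward pass, as the real kernel does. */
static void layernorm_token_sketch(const float *x, const float *gamma,
                                   const float *beta, float *out,
                                   float *mean_cache, float *rstd_cache,
                                   int d_model, float eps)
{
    float mean = 0.0f;
    for (int i = 0; i < d_model; i++) mean += x[i];   /* pass 1: mean      */
    mean /= (float)d_model;

    float var = 0.0f;
    for (int i = 0; i < d_model; i++) {               /* pass 2: variance  */
        float diff = x[i] - mean;
        var += diff * diff;
    }
    var /= (float)d_model;

    float rstd = 1.0f / sqrtf(var + eps);
    for (int i = 0; i < d_model; i++)                 /* pass 3: normalize */
        out[i] = (x[i] - mean) * rstd * gamma[i] + beta[i];

    *mean_cache = mean;
    *rstd_cache = rstd;
}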

◆ load_model_weights()

int load_model_weights ( TransformerModel M,
const char *  weight_file 
)

Load weights into already-allocated TransformerModel.

This assumes:

  1. read_model_metadata() has been called
  2. layout_transformer() has been called to allocate memory
  3. You've verified you have enough RAM
Parameters
MPointer to initialized and allocated TransformerModel
weight_filePath to the .weights file
Returns
0 on success, -1 on failure
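A hedged usage sketch of the three-step contract above; layout_transformer is referenced in these notes but its exact signature is an assumption here, and error handling is minimal.

#include <stdio.h>
#include <string.h>

/* Sketch: typical load sequence following the contract described above. */
int load_sketch(TransformerModel *M, const char *path)
{
    memset(M, 0, sizeof(*M));
    if (read_model_metadata(M, path) != 0) {    /* 1. header only, no allocation */
        fprintf(stderr, "bad weight file header\n");
        return -1;
    }
    /* 2. caller checks that the reported size fits in RAM, then: */
    layout_transformer(M);                      /* allocates the arena (signature assumed) */
    if (load_model_weights(M, path) != 0) {     /* 3. stream the weights in */
        fprintf(stderr, "weight load failed\n");
        return -1;
    }
    return 0;
}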

◆ main()

int main ( int  argc,
char **  argv 
)

◆ mlp_token_parallel()

void mlp_token_parallel ( TransformerModel M,
size_t  input_offset,
size_t  fc1_weight_offset,
size_t  fc1_bias_offset,
size_t  fc1_output_offset,
size_t  fc2_weight_offset,
size_t  fc2_bias_offset,
size_t  output_offset 
)

◆ qkv_micro_kernel_blocked_4x16_polished()

static void qkv_micro_kernel_blocked_4x16_polished ( const float *__restrict  input_token,
const float *__restrict  Q_weights_block,
const float *__restrict  K_weights_block,
const float *__restrict  V_weights_block,
const float *__restrict  Q_bias_4,
const float *__restrict  K_bias_4,
const float *__restrict  V_bias_4,
float *__restrict  Q_output_4,
float *__restrict  K_output_4,
float *__restrict  V_output_4,
int  embed_dim 
)
inline static

◆ qkv_micro_kernel_head_major_4x16()

static void qkv_micro_kernel_head_major_4x16 ( const float *__restrict  input_token,
const float *__restrict  Q_weights_block,
const float *__restrict  K_weights_block,
const float *__restrict  V_weights_block,
const float *__restrict  Q_bias_4,
const float *__restrict  K_bias_4,
const float *__restrict  V_bias_4,
TransformerModel M,
float *__restrict  q_output_base,
float *__restrict  k_output_base,
float *__restrict  v_output_base,
int  embed_dim,
int  token_idx,
int  output_start_dim 
)
inline static

◆ qkv_projection()

void qkv_projection ( TransformerModel M,
size_t  layer_idx 
)

◆ qkv_projection_head_major()

void qkv_projection_head_major ( TransformerModel M,
int  layer_idx 
)

◆ qkv_token_kernel_4x16_blocked_polished()

static void qkv_token_kernel_4x16_blocked_polished ( const float *__restrict  input_token,
const float *__restrict  Q_weights,
const float *__restrict  K_weights,
const float *__restrict  V_weights,
const float *__restrict  Q_bias,
const float *__restrict  K_bias,
const float *__restrict  V_bias,
float *__restrict  Q_output,
float *__restrict  K_output,
float *__restrict  V_output,
int  embed_dim 
)
static

◆ qkv_token_kernel_head_major_4x16()

static void qkv_token_kernel_head_major_4x16 ( const float *__restrict  input_token,
const float *__restrict  Q_weights,
const float *__restrict  K_weights,
const float *__restrict  V_weights,
const float *__restrict  Q_bias,
const float *__restrict  K_bias,
const float *__restrict  V_bias,
TransformerModel M,
float *__restrict  q_output_base,
float *__restrict  k_output_base,
float *__restrict  v_output_base,
int  embed_dim,
int  token_idx 
)
static

◆ read_model_metadata()

int read_model_metadata ( TransformerModel M,
const char *  weight_file 
)

Read model metadata from weight file header.

This function ONLY reads the header and populates model dimensions. It does NOT allocate memory - that's your decision to make separately.

Parameters
MPointer to zero-initialized TransformerModel struct
weight_filePath to the .weights file
Returns
0 on success, -1 on failure

◆ residual_add_token_parallel()

void residual_add_token_parallel ( TransformerModel M,
size_t  input_offset,
size_t  residual_offset,
size_t  output_offset 
)

◆ run_comprehensive_benchmark()

void run_comprehensive_benchmark ( TransformerModel M)

◆ run_layernorm_benchmark_performance()

void run_layernorm_benchmark_performance ( TransformerModel M)

◆ run_layernorm_benchmark_precision_matched()

void run_layernorm_benchmark_precision_matched ( TransformerModel M)

◆ sample_token()

int sample_token ( float *  logits,
int  vocab_size,
float  temperature 
)

◆ test_attention_head_major_after_qkv()

void test_attention_head_major_after_qkv ( TransformerModel M)

◆ training_step()

void training_step ( TransformerModel M,
int32_t *  input_tokens,
int32_t *  target_tokens,
float  learning_rate 
)

◆ transformer_layer_forward()

void transformer_layer_forward ( TransformerModel M,
int  layer_idx,
size_t  layer_input_offset 
)

◆ update_all_weights_sgd()

void update_all_weights_sgd ( TransformerModel M,
float  learning_rate 
)

◆ zero_gradients()

void zero_gradients ( TransformerModel M)

Variable Documentation

◆ actual_tokens_offset

size_t GradientStorage::actual_tokens_offset

◆ aligned_attn_context_window

size_t TransformerModel::aligned_attn_context_window

context_window padded to prevent false sharing

◆ aligned_embed_dim

size_t TransformerModel::aligned_embed_dim

embed_dim rounded up to 64-byte alignment (in floats)

◆ aligned_head_dim

size_t TransformerModel::aligned_head_dim

head_dim rounded up to 64-byte alignment (in floats)

◆ attention_output_copy_offset

size_t LayerGradients::attention_output_copy_offset

◆ attention_output_offset

size_t TrulyOptimalLayer::attention_output_offset

◆ attention_scores_copy_offset

size_t LayerGradients::attention_scores_copy_offset

◆ attention_scores_offset

size_t TrulyOptimalLayer::attention_scores_offset

◆ attention_weights_copy_offset

size_t LayerGradients::attention_weights_copy_offset

◆ backprop_base

size_t GradientStorage::backprop_base

◆ checksum

uint8_t TransformerModel::checksum[32]

SHA256 checksum of weight file.

◆ context_window

int TransformerModel::context_window

Maximum sequence length (e.g., 1024)

◆ d_attention_output_offset

size_t LayerGradients::d_attention_output_offset

◆ d_attention_scores_offset

size_t LayerGradients::d_attention_scores_offset

◆ d_attention_weights_offset

size_t LayerGradients::d_attention_weights_offset

◆ d_embed_weights_offset

size_t GradientStorage::d_embed_weights_offset

◆ d_fc1_bias_offset

size_t LayerGradients::d_fc1_bias_offset

◆ d_fc1_output_offset

size_t LayerGradients::d_fc1_output_offset

◆ d_fc1_weights_offset

size_t LayerGradients::d_fc1_weights_offset

◆ d_fc2_bias_offset

size_t LayerGradients::d_fc2_bias_offset

◆ d_fc2_input_offset

size_t LayerGradients::d_fc2_input_offset

◆ d_fc2_weights_offset

size_t LayerGradients::d_fc2_weights_offset

◆ d_final_ln_beta_offset

size_t GradientStorage::d_final_ln_beta_offset

◆ d_final_ln_gamma_offset

size_t GradientStorage::d_final_ln_gamma_offset

◆ d_final_ln_input_offset

size_t GradientStorage::d_final_ln_input_offset

◆ d_final_output_offset

size_t GradientStorage::d_final_output_offset

◆ d_k_bias_offset

size_t LayerGradients::d_k_bias_offset

◆ d_k_output_offset

size_t LayerGradients::d_k_output_offset

◆ d_k_weights_offset

size_t LayerGradients::d_k_weights_offset

◆ d_ln1_beta_offset

size_t LayerGradients::d_ln1_beta_offset

◆ d_ln1_gamma_offset

size_t LayerGradients::d_ln1_gamma_offset

◆ d_ln1_input_offset

size_t LayerGradients::d_ln1_input_offset

◆ d_ln1_output_offset

size_t LayerGradients::d_ln1_output_offset

◆ d_ln2_beta_offset

size_t LayerGradients::d_ln2_beta_offset

◆ d_ln2_gamma_offset

size_t LayerGradients::d_ln2_gamma_offset

◆ d_ln2_input_offset

size_t LayerGradients::d_ln2_input_offset

◆ d_ln2_output_offset

size_t LayerGradients::d_ln2_output_offset

◆ d_logits_offset

size_t GradientStorage::d_logits_offset

◆ d_mlp_output_offset

size_t LayerGradients::d_mlp_output_offset

◆ d_pos_embed_offset

size_t GradientStorage::d_pos_embed_offset

◆ d_proj_bias_offset

size_t LayerGradients::d_proj_bias_offset

◆ d_proj_weights_offset

size_t LayerGradients::d_proj_weights_offset

◆ d_q_bias_offset

size_t LayerGradients::d_q_bias_offset

◆ d_q_output_offset

size_t LayerGradients::d_q_output_offset

◆ d_q_weights_offset

size_t LayerGradients::d_q_weights_offset

◆ d_residual1_offset

size_t LayerGradients::d_residual1_offset

◆ d_residual2_offset

size_t LayerGradients::d_residual2_offset

◆ d_v_bias_offset

size_t LayerGradients::d_v_bias_offset

◆ d_v_output_offset

size_t LayerGradients::d_v_output_offset

◆ d_v_weights_offset

size_t LayerGradients::d_v_weights_offset

◆ embed_dim

int TransformerModel::embed_dim

Embedding dimension (e.g., 768 for GPT-2 small)

◆ embedded_input_offset

size_t TransformerModel::embedded_input_offset

Combined token+pos embeddings [context_window × aligned_embed_dim].

◆ fc1_bias_copy_offset

size_t LayerGradients::fc1_bias_copy_offset

◆ fc1_bias_offset

size_t TrulyOptimalLayer::fc1_bias_offset

◆ fc1_output_copy_offset

size_t LayerGradients::fc1_output_copy_offset

◆ fc1_output_offset

size_t TrulyOptimalLayer::fc1_output_offset

◆ fc1_weight_offset

size_t TrulyOptimalLayer::fc1_weight_offset

◆ fc1_weights_copy_offset

size_t LayerGradients::fc1_weights_copy_offset

◆ fc2_bias_copy_offset

size_t LayerGradients::fc2_bias_copy_offset

◆ fc2_bias_offset

size_t TrulyOptimalLayer::fc2_bias_offset

◆ fc2_input_copy_offset

size_t LayerGradients::fc2_input_copy_offset

◆ fc2_weight_offset

size_t TrulyOptimalLayer::fc2_weight_offset

◆ fc2_weights_copy_offset

size_t LayerGradients::fc2_weights_copy_offset

◆ final_ln_beta_copy_offset

size_t GradientStorage::final_ln_beta_copy_offset

◆ final_ln_bias_offset

size_t TransformerModel::final_ln_bias_offset

Final LayerNorm beta [aligned_embed_dim].

◆ final_ln_gamma_copy_offset

size_t GradientStorage::final_ln_gamma_copy_offset

◆ final_ln_input_copy_offset

size_t GradientStorage::final_ln_input_copy_offset

◆ final_ln_mean_copy_offset

size_t GradientStorage::final_ln_mean_copy_offset

◆ final_ln_mean_offset

size_t TransformerModel::final_ln_mean_offset

Final LayerNorm mean [context_window].

◆ final_ln_rstd_copy_offset

size_t GradientStorage::final_ln_rstd_copy_offset

◆ final_ln_rstd_offset

size_t TransformerModel::final_ln_rstd_offset

Final LayerNorm rstd [context_window].

◆ final_ln_weight_offset

size_t TransformerModel::final_ln_weight_offset

Final LayerNorm gamma [aligned_embed_dim].

◆ final_output_copy_offset

size_t GradientStorage::final_output_copy_offset

◆ final_output_offset

size_t TransformerModel::final_output_offset

Final normalized output [context_window × aligned_embed_dim].

◆ gradients

GradientStorage TransformerModel::gradients

Gradient and activation cache memory (training only)

◆ head_dim

int TransformerModel::head_dim

Dimension per head: embed_dim / num_attention_heads.

◆ k_bias_copy_offset

size_t LayerGradients::k_bias_copy_offset

◆ k_bias_offset

size_t TrulyOptimalLayer::k_bias_offset

◆ k_output_copy_offset

size_t LayerGradients::k_output_copy_offset

◆ k_output_offset

size_t TrulyOptimalLayer::k_output_offset

◆ k_weight_offset

size_t TrulyOptimalLayer::k_weight_offset

◆ k_weights_copy_offset

size_t LayerGradients::k_weights_copy_offset

◆ layer_backprop_stride

size_t GradientStorage::layer_backprop_stride

◆ layer_end_canary_offset

size_t TrulyOptimalLayer::layer_end_canary_offset

◆ layer_input_offset

size_t TrulyOptimalLayer::layer_input_offset

◆ layer_start_canary_offset

size_t TrulyOptimalLayer::layer_start_canary_offset

◆ layer_stride

size_t TransformerModel::layer_stride

Byte offset between consecutive layer memory blocks.

◆ layers [1/2]

LayerGradients* GradientStorage::layers

◆ layers [2/2]

TrulyOptimalLayer* TransformerModel::layers

Array of per-layer offset structures.

◆ layers_start_offset

size_t TransformerModel::layers_start_offset

Start of first transformer layer memory.

◆ learning_rate

float TransformerModel::learning_rate

SGD learning rate for weight updates.

◆ lm_head_weight_offset

size_t TransformerModel::lm_head_weight_offset

Language model head (weight-tied to token_emb_offset)

◆ ln1_beta_copy_offset

size_t LayerGradients::ln1_beta_copy_offset

◆ ln1_bias_offset

size_t TrulyOptimalLayer::ln1_bias_offset

◆ ln1_gamma_copy_offset

size_t LayerGradients::ln1_gamma_copy_offset

◆ ln1_input_copy_offset

size_t LayerGradients::ln1_input_copy_offset

◆ ln1_mean_copy_offset

size_t LayerGradients::ln1_mean_copy_offset

◆ ln1_mean_offset

size_t TrulyOptimalLayer::ln1_mean_offset

◆ ln1_output_copy_offset

size_t LayerGradients::ln1_output_copy_offset

◆ ln1_output_offset

size_t TrulyOptimalLayer::ln1_output_offset

◆ ln1_rstd_copy_offset

size_t LayerGradients::ln1_rstd_copy_offset

◆ ln1_rstd_offset

size_t TrulyOptimalLayer::ln1_rstd_offset

◆ ln1_weight_offset

size_t TrulyOptimalLayer::ln1_weight_offset

◆ ln2_beta_copy_offset

size_t LayerGradients::ln2_beta_copy_offset

◆ ln2_bias_offset

size_t TrulyOptimalLayer::ln2_bias_offset

◆ ln2_gamma_copy_offset

size_t LayerGradients::ln2_gamma_copy_offset

◆ ln2_input_copy_offset

size_t LayerGradients::ln2_input_copy_offset

◆ ln2_mean_copy_offset

size_t LayerGradients::ln2_mean_copy_offset

◆ ln2_mean_offset

size_t TrulyOptimalLayer::ln2_mean_offset

◆ ln2_output_copy_offset

size_t LayerGradients::ln2_output_copy_offset

◆ ln2_output_offset

size_t TrulyOptimalLayer::ln2_output_offset

◆ ln2_rstd_copy_offset

size_t LayerGradients::ln2_rstd_copy_offset

◆ ln2_rstd_offset

size_t TrulyOptimalLayer::ln2_rstd_offset

◆ ln2_weight_offset

size_t TrulyOptimalLayer::ln2_weight_offset

◆ logits_copy_offset

size_t GradientStorage::logits_copy_offset

◆ logits_offset

size_t TransformerModel::logits_offset

Output logits [context_window × vocab_size].

◆ magic

char TransformerModel::magic[8]

Magic string "BUMPWGT2" for file validation.

◆ memory_base

float* TransformerModel::memory_base

Base pointer to single contiguous memory block.

◆ mlp_output_copy_offset

size_t LayerGradients::mlp_output_copy_offset

◆ mlp_output_offset

size_t TrulyOptimalLayer::mlp_output_offset

◆ model_type

uint32_t TransformerModel::model_type

Model architecture: 0=GPT2, 1=LLAMA, etc.

◆ num_attention_heads

int TransformerModel::num_attention_heads

Number of attention heads (e.g., 12 for GPT-2)

◆ num_cores

int TransformerModel::num_cores

Number of CPU cores to use (OpenMP threads)

◆ num_layers

int TransformerModel::num_layers

Number of transformer layers (e.g., 12 for GPT-2)

◆ pos_emb_offset

size_t TransformerModel::pos_emb_offset

Positional embedding table [context_window × aligned_embed_dim].

◆ proj_bias_copy_offset

size_t LayerGradients::proj_bias_copy_offset

◆ proj_bias_offset

size_t TrulyOptimalLayer::proj_bias_offset

◆ proj_weight_offset

size_t TrulyOptimalLayer::proj_weight_offset

◆ proj_weights_copy_offset

size_t LayerGradients::proj_weights_copy_offset

◆ q_bias_copy_offset

size_t LayerGradients::q_bias_copy_offset

◆ q_bias_offset

size_t TrulyOptimalLayer::q_bias_offset

◆ q_output_copy_offset

size_t LayerGradients::q_output_copy_offset

◆ q_output_offset

size_t TrulyOptimalLayer::q_output_offset

◆ q_weight_offset

size_t TrulyOptimalLayer::q_weight_offset

◆ q_weights_copy_offset

size_t LayerGradients::q_weights_copy_offset

◆ reserved

uint8_t TransformerModel::reserved[32]

Reserved for future extensions.

◆ residual1_copy_offset

size_t LayerGradients::residual1_copy_offset

◆ residual1_output_offset

size_t TrulyOptimalLayer::residual1_output_offset

◆ residual2_copy_offset

size_t LayerGradients::residual2_copy_offset

◆ residual2_output_offset

size_t TrulyOptimalLayer::residual2_output_offset

◆ token_emb_offset

size_t TransformerModel::token_emb_offset

Token embedding table [vocab_size × aligned_embed_dim].

◆ tokens_per_core

int TransformerModel::tokens_per_core

Tokens assigned per core: context_window / num_cores.

◆ total_floats

size_t TransformerModel::total_floats

Total size of memory block in float elements.

◆ total_gradient_floats

size_t GradientStorage::total_gradient_floats

◆ training_enabled

bool TransformerModel::training_enabled

Whether gradient storage is allocated.

◆ v_bias_copy_offset

size_t LayerGradients::v_bias_copy_offset

◆ v_bias_offset

size_t TrulyOptimalLayer::v_bias_offset

◆ v_output_copy_offset

size_t LayerGradients::v_output_copy_offset

◆ v_output_offset

size_t TrulyOptimalLayer::v_output_offset

◆ v_weight_offset

size_t TrulyOptimalLayer::v_weight_offset

◆ v_weights_copy_offset

size_t LayerGradients::v_weights_copy_offset

◆ version

uint32_t TransformerModel::version

Weight file format version.

◆ vocab_size

int TransformerModel::vocab_size

Vocabulary size (e.g., 50257 for GPT-2)