C-Transformer
Cache-Optimized Transformers in C (x86)
Matrix multiplication implementations with different optimization strategies.
Macros
| #define | ATTN_SCORES_ACCESS(scores_ptr, h, i, j, aligned_context_window) scores_ptr[((h) * (aligned_context_window) * (aligned_context_window)) + ((i) * (aligned_context_window)) + (j)] |
Functions
| void | gemm_naive_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K) |
| Naive parallel GEMM implementation (reference baseline) | |
| void | gemm_avx512_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K) |
| AVX-512 optimized GEMM with vectorized inner loops. | |
| void | gemm_fine_grained_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K) |
| Cache-blocked GEMM with fine-grained parallelism. | |
| void | gemm_blocked_serial (const float *A, const float *B, const float *bias, float *C, int M, int N, int K) |
| void | layernorm_naive_serial (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps) |
| void | layernorm_forward_rolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps) |
| void | layernorm_forward_unrolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps) |
| void | layernorm_token_parallel (TransformerModel *M, size_t input_offset, size_t weight_offset, size_t bias_offset, size_t mean_cache_offset, size_t rstd_cache_offset, size_t output_offset, float eps) |
| Token-parallel Layer Normalization with AVX-512 optimization. | |
| void | layernorm_naive_serial_matched_precision (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, float eps) |
| static void * | aligned_alloc_64 (size_t size) |
| void | debug_math_comparison (TransformerModel *M) |
| void | run_layernorm_benchmark_precision_matched (TransformerModel *M) |
| void | run_layernorm_benchmark_performance (TransformerModel *M) |
| static void | qkv_micro_kernel_blocked_4x16_polished (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, float *__restrict Q_output_4, float *__restrict K_output_4, float *__restrict V_output_4, int embed_dim) |
| static void | qkv_token_kernel_4x16_blocked_polished (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, float *__restrict Q_output, float *__restrict K_output, float *__restrict V_output, int embed_dim) |
| void | qkv_projection (TransformerModel *M, size_t layer_idx) |
| static void | qkv_micro_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx, int output_start_dim) |
| static void | qkv_token_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx) |
| void | qkv_projection_head_major (TransformerModel *M, int layer_idx) |
| double | compare_arrays (const float *a, const float *b, size_t size, const char *name) |
| void | convert_token_major_to_head_major_layer (const float *token_major_base, float *head_major_base, TransformerModel *M) |
| void | benchmark_qkv_dual_comparison (TransformerModel *M) |
| void | compute_attention_scores_head_major (TransformerModel *M, int layer_idx) |
| void | apply_causal_softmax_head_major (TransformerModel *M, int layer_idx) |
| void | compute_attention_output_head_major (TransformerModel *M, int layer_idx) |
| void | attention_head_major_complete (TransformerModel *M, int layer_idx) |
| Complete multi-head attention with head-major layout (self-attention) | |
| void | test_attention_head_major_after_qkv (TransformerModel *M) |
| void | attention_projection_with_concat (TransformerModel *M, int layer_idx) |
| Production attention projection with concat: Head-major → Token-major → GEMM. | |
| void | benchmark_attention_projection_complete (TransformerModel *M) |
| void | add_gpt2_token_and_positional_embeddings (TransformerModel *M, size_t token_ids_offset, size_t output_offset) |
| void | residual_add_token_parallel (TransformerModel *M, size_t input_offset, size_t residual_offset, size_t output_offset) |
| void | gelu_activation_token_parallel (TransformerModel *M, size_t data_offset) |
| void | mlp_token_parallel (TransformerModel *M, size_t input_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t fc1_output_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t output_offset) |
| void | embed_tokens (TransformerModel *M, int32_t *token_ids, int num_tokens) |
| void | compute_logits_last_token_optimized (TransformerModel *M, int position) |
| void | transformer_layer_forward (TransformerModel *M, int layer_idx, size_t layer_input_offset) |
| void | run_comprehensive_benchmark (TransformerModel *M) |
| int | load_model_weights (TransformerModel *M, const char *weight_file) |
| Load weights into already-allocated TransformerModel. | |
| int | read_model_metadata (TransformerModel *M, const char *weight_file) |
| Read model metadata from weight file header. | |
| int | sample_token (float *logits, int vocab_size, float temperature) |
| void | generate (TransformerModel *M, int *prompt, int prompt_len, int max_tokens) |
| void | zero_gradients (TransformerModel *M) |
| void | cache_forward_activations (TransformerModel *M) |
| Copy forward-pass activations to gradient storage for the backward pass. This preserves the forward computations needed for gradient calculation. | |
| void | backward_residual_connection (TransformerModel *M, size_t d_output_offset, size_t d_input_offset, size_t d_transform_offset) |
| void | backward_embedding_layer (TransformerModel *M) |
| void | backward_final_layernorm (TransformerModel *M) |
| void | backward_fc2 (TransformerModel *M, size_t d_output_offset, size_t fc2_input_copy_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset) |
| void | backward_gelu (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset) |
| void | backward_fc1 (TransformerModel *M, size_t d_output_offset, size_t fc1_input_copy_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset) |
| void | backward_gelu_fast (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset) |
| void | backward_layernorm (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t gamma_copy_offset, size_t beta_copy_offset, size_t mean_copy_offset, size_t rstd_copy_offset, size_t d_input_offset, size_t d_gamma_offset, size_t d_beta_offset) |
| void | add_gradient (TransformerModel *M, size_t source_offset, size_t dest_offset) |
| void | backward_attention_projection (TransformerModel *M, size_t d_output_offset, size_t attention_output_copy_offset, size_t proj_weight_offset, size_t proj_bias_offset, size_t d_attention_offset, size_t d_weight_offset, size_t d_bias_offset) |
| void | backward_attention_weighted_values (TransformerModel *M, size_t d_output_offset, size_t attention_weights_offset, size_t v_output_offset, size_t d_weights_offset, size_t d_v_offset) |
| void | backward_causal_softmax (TransformerModel *M, size_t d_scores_offset, size_t weights_copy_offset, size_t scores_copy_offset) |
| void | backward_qk_matmul (TransformerModel *M, size_t d_scores_offset, size_t q_copy_offset, size_t k_copy_offset, size_t d_q_offset, size_t d_k_offset) |
| void | backward_linear (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t weight_offset, size_t bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset) |
| void | backward_lm_head (TransformerModel *M) |
| void | backward_transformer_layer (TransformerModel *M, int layer_idx) |
| void | compute_cross_entropy_loss (TransformerModel *M, int32_t *target_tokens, float *loss_out) |
| Compute cross-entropy loss and gradients w.r.t logits. | |
| void | training_step (TransformerModel *M, int32_t *input_tokens, int32_t *target_tokens, float learning_rate) |
| void | update_all_weights_sgd (TransformerModel *M, float learning_rate) |
| int | main (int argc, char **argv) |
Matrix multiplication implementations with different optimization strategies.
| #define ATTN_SCORES_ACCESS | ( | scores_ptr, | |
| h, | |||
| i, | |||
| j, | |||
| aligned_context_window | |||
| ) | scores_ptr[((h) * (aligned_context_window) * (aligned_context_window)) + ((i) * (aligned_context_window)) + (j)] |
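A minimal usage sketch, assuming an illustrative wrapper function and parameter names (the scores pointer would come from the layer's attention_scores_offset):

```c
/* Sketch only: reading one attention score from the flattened
 * [num_heads][rows][cols] buffer via the macro. All names here are illustrative. */
static float read_score(const float *scores, int h, int i, int j,
                        int aligned_context_window)
{
    /* Expands to scores[h*acw*acw + i*acw + j] */
    return ATTN_SCORES_ACCESS(scores, h, i, j, aligned_context_window);
}
```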
| void add_gpt2_token_and_positional_embeddings | ( | TransformerModel * | M, |
| size_t | token_ids_offset, | ||
| size_t | output_offset | ||
| ) |
| void add_gradient | ( | TransformerModel * | M, |
| size_t | source_offset, | ||
| size_t | dest_offset | ||
| ) |
| static void * aligned_alloc_64 | ( | size_t | size | ) |
inline static
| void apply_causal_softmax_head_major | ( | TransformerModel * | M, |
| int | layer_idx | ||
| ) |
| void attention_head_major_complete | ( | TransformerModel * | M, |
| int | layer_idx | ||
| ) |
Complete multi-head attention with head-major layout (self-attention)
Computes scaled dot-product attention using head-major memory layout for optimal cache locality. Each attention head operates independently, enabling head-level parallelism and cache-efficient processing.
| M | Transformer model with memory layout and configuration |
| layer_idx | Layer index for accessing Q/K/V tensors |
Head-Level Parallelism Strategy: Unlike token-parallel operations (LayerNorm, GELU), attention parallelizes across HEADS because each head's computation is independent.
Memory Layout (Head-Major):
Why Head-Major Layout?:
Three-Phase Attention Algorithm:
Phase 1: Compute Attention Scores (Q·K^T / √d_k)
Phase 2: Causal Softmax
Phase 3: Weighted Sum of Values (Softmax·V)
Stride Pattern Access: Access to Q[h][t][d]:
aligned_head_dim ensures 64-byte alignment (prevents false sharing).
Performance Characteristics:
Comparison: Token-Parallel vs Head-Parallel:
| Operation | Parallelism | Memory Pattern | Cache Footprint |
|---|---|---|---|
| LayerNorm | Token | Sequential | 3KB per token |
| Attention | Head | Strided | 512KB per head |
| MLP | Token | Sequential | 3KB per token |
Why NOT Token-Parallel for Attention?:
@performance Achieves 100-200 GFLOPS on attention computation (Xeon Gold 6248)
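A minimal sketch of the three phases for a single head, using plain scalar loops and illustrative pointers and strides (q, k, v, scores, out); the real kernels use the aligned head-major strides, the ATTN_SCORES_ACCESS macro, and AVX-512:

```c
#include <math.h>

/* Sketch only: scaled dot-product attention for one head.
 * q, k, v: [T x head_dim] slices for this head; scores: [T x T] scratch;
 * out: [T x head_dim]. Strides and alignment are simplified. */
static void attention_one_head_sketch(const float *q, const float *k, const float *v,
                                      float *scores, float *out,
                                      int T, int head_dim)
{
    const float scale = 1.0f / sqrtf((float)head_dim);

    /* Phase 1: scores[i][j] = (q_i . k_j) * scale, causal (j <= i) */
    for (int i = 0; i < T; i++)
        for (int j = 0; j <= i; j++) {
            float dot = 0.0f;
            for (int d = 0; d < head_dim; d++)
                dot += q[i * head_dim + d] * k[j * head_dim + d];
            scores[i * T + j] = dot * scale;
        }

    /* Phase 2: causal softmax over each row (only j <= i participate) */
    for (int i = 0; i < T; i++) {
        float maxv = scores[i * T];
        for (int j = 1; j <= i; j++)
            if (scores[i * T + j] > maxv) maxv = scores[i * T + j];
        float sum = 0.0f;
        for (int j = 0; j <= i; j++) {
            scores[i * T + j] = expf(scores[i * T + j] - maxv);
            sum += scores[i * T + j];
        }
        for (int j = 0; j <= i; j++)
            scores[i * T + j] /= sum;
    }

    /* Phase 3: out[i] = sum_j scores[i][j] * v[j] */
    for (int i = 0; i < T; i++)
        for (int d = 0; d < head_dim; d++) {
            float acc = 0.0f;
            for (int j = 0; j <= i; j++)
                acc += scores[i * T + j] * v[j * head_dim + d];
            out[i * head_dim + d] = acc;
        }
}
```

Head-level parallelism then amounts to running this kernel independently per head (e.g. one OpenMP unit of work per head).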
| void attention_projection_with_concat | ( | TransformerModel * | M, |
| int | layer_idx | ||
| ) |
Production attention projection with concat: Head-major → Token-major → GEMM.
This function implements the concat strategy that proved 3.2x faster than direct head-major projection on hyperthreaded systems.
| M | Transformer model |
| layer_idx | Layer index to process |
Memory flow: head-major attention output → concatenated token-major buffer [T × D] → single GEMM with W_proj (see the sketch below).
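A minimal sketch of the concat strategy, assuming illustrative buffer names and assuming the projection weights are stored in the transposed [N x K] layout expected by the gemm_* kernels:

```c
/* Sketch only: pack head-major attention output into a token-major buffer,
 * then project it with one large GEMM. Buffer names and strides are
 * illustrative, not the model's real bump-allocated offsets. */
static void attention_projection_concat_sketch(const float *head_major, /* [H][T][head_dim] */
                                               const float *W_proj,     /* [D x D], layout assumed */
                                               const float *b_proj,     /* [D] */
                                               float *concat,           /* [T x D] scratch */
                                               float *out,              /* [T x D] */
                                               int H, int T, int head_dim)
{
    int D = H * head_dim;

    /* Concat: head-major [h][t][d] -> token-major [t][h*head_dim + d] */
    for (int h = 0; h < H; h++)
        for (int t = 0; t < T; t++)
            for (int d = 0; d < head_dim; d++)
                concat[t * D + h * head_dim + d] = head_major[(h * T + t) * head_dim + d];

    /* One big token-major GEMM: out[T x D] = concat @ W_proj + b_proj */
    gemm_avx512_parallel(concat, W_proj, b_proj, out, T, D, D);
}
```

The payoff is that the projection becomes one dense GEMM over contiguous token-major rows instead of many strided head-major multiplies.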
| void backward_attention_projection | ( | TransformerModel * | M, |
| size_t | d_output_offset, | ||
| size_t | attention_output_copy_offset, | ||
| size_t | proj_weight_offset, | ||
| size_t | proj_bias_offset, | ||
| size_t | d_attention_offset, | ||
| size_t | d_weight_offset, | ||
| size_t | d_bias_offset | ||
| ) |
BACKWARD THROUGH ATTENTION OUTPUT PROJECTION
Forward: output[T×D] = attention[T×D] @ W_proj[D×D] + b_proj[D]
This is after concatenating all heads back to [T×D] format
Backward computes:
d_attention = d_output @ W_proj^T
d_W_proj    = attention^T @ d_output
d_b_proj    = sum over tokens of d_output
| void backward_attention_weighted_values | ( | TransformerModel * | M, |
| size_t | d_output_offset, | ||
| size_t | attention_weights_offset, | ||
| size_t | v_output_offset, | ||
| size_t | d_weights_offset, | ||
| size_t | d_v_offset | ||
| ) |
BACKWARD THROUGH ATTENTION WEIGHTED VALUES
Forward: attention_output[h,t,d] = sum_over_s(attention_weights[h,t,s] * V[h,s,d])
This operates in HEAD-MAJOR layout
Backward computes:
d_weights[h,t,s] = sum_d(d_output[h,t,d] * V[h,s,d])
d_V[h,s,d]       = sum_t(attention_weights[h,t,s] * d_output[h,t,d])
| void backward_causal_softmax | ( | TransformerModel * | M, |
| size_t | d_scores_offset, | ||
| size_t | weights_copy_offset, | ||
| size_t | scores_copy_offset | ||
| ) |
BACKWARD THROUGH CAUSAL SOFTMAX
Forward: attention_weights[h,i,j] = softmax(scores[h,i,:]) with causal mask
Softmax Jacobian for row i: ∂softmax[i,j]/∂score[i,k] = softmax[i,j] * (δ[j,k] - softmax[i,k]) where δ[j,k] is Kronecker delta (1 if j==k, 0 otherwise)
For a row, if y = softmax(x), then: dx = y * (dy - dot(y, dy))
Causal mask means we only compute for j <= i
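A minimal sketch of the per-row softmax backward, assuming illustrative pointers for one row of one head:

```c
/* Sketch only: backward through one causal-softmax row i of one head.
 * y  = forward attention weights for row i
 * dy = incoming gradient for that row
 * dx = gradient w.r.t. the pre-softmax scores
 * Only positions j <= i participate because of the causal mask. */
static void softmax_row_backward_sketch(const float *y, const float *dy,
                                        float *dx, int i)
{
    /* dot(y, dy) over the unmasked prefix */
    float dot = 0.0f;
    for (int j = 0; j <= i; j++)
        dot += y[j] * dy[j];

    /* dx[j] = y[j] * (dy[j] - dot); masked positions receive no gradient */
    for (int j = 0; j <= i; j++)
        dx[j] = y[j] * (dy[j] - dot);
}
```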
| void backward_embedding_layer | ( | TransformerModel * | M | ) |
| void backward_fc1 | ( | TransformerModel * | M, |
| size_t | d_output_offset, | ||
| size_t | fc1_input_copy_offset, | ||
| size_t | fc1_weight_offset, | ||
| size_t | fc1_bias_offset, | ||
| size_t | d_input_offset, | ||
| size_t | d_weight_offset, | ||
| size_t | d_bias_offset | ||
| ) |
FORWARD PASS (for reference):
Input:  fc1_input  [T × D]   (output from LayerNorm2)
Weight: W_fc1      [D × 4D]  (projects from D to 4D)
Bias:   b_fc1      [4D]
Output: fc1_output [T × 4D] = fc1_input @ W_fc1 + b_fc1
BACKWARD PASS:
Given: d_output [T × 4D] (gradient from GELU backward)
Need to compute:
d_input [T × D]  = d_output @ W_fc1^T
d_W_fc1 [D × 4D] = fc1_input^T @ d_output
d_b_fc1 [4D]     = sum over tokens of d_output
DIMENSION FLOW:
d_output[T×4D] ──> @ W_fc1^T[4D×D]    ──> d_input[T×D]
d_output[T×4D] ──> fc1_input^T[D×T] @ ──> d_W_fc1[D×4D]
d_output[T×4D] ──> sum_over_T         ──> d_b_fc1[4D]
HPC CONSIDERATIONS:
| void backward_fc2 | ( | TransformerModel * | M, |
| size_t | d_output_offset, | ||
| size_t | fc2_input_copy_offset, | ||
| size_t | fc2_weight_offset, | ||
| size_t | fc2_bias_offset, | ||
| size_t | d_input_offset, | ||
| size_t | d_weight_offset, | ||
| size_t | d_bias_offset | ||
| ) |
FORWARD PASS (for reference):
Input:  fc2_input  [T × 4D]  (after GELU activation)
Weight: W_fc2      [4D × D]  (projects from 4D back to D)
Bias:   b_fc2      [D]
Output: fc2_output [T × D] = fc2_input @ W_fc2 + b_fc2
BACKWARD PASS:
Given: d_output [T × D] (gradient from residual connection)
Need to compute:
d_input [T × 4D] = d_output @ W_fc2^T
d_W_fc2 [4D × D] = fc2_input^T @ d_output
d_b_fc2 [D]      = sum over tokens of d_output
DIMENSION FLOW:
d_output[T×D] ──> @ W_fc2^T[D×4D]     ──> d_input[T×4D]
d_output[T×D] ──> fc2_input^T[4D×T] @ ──> d_W_fc2[4D×D]
d_output[T×D] ──> sum_over_T          ──> d_b_fc2[D]
HPC CONSIDERATIONS:
| void backward_final_layernorm | ( | TransformerModel * | M | ) |
| void backward_gelu | ( | TransformerModel * | M, |
| size_t | d_output_offset, | ||
| size_t | input_copy_offset, | ||
| size_t | d_input_offset | ||
| ) |
FORWARD PASS (for reference):
GELU(x) = 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))
Approximation used in practice: GELU(x) ≈ 0.5 * x * (1 + tanh(0.7978845608 * (x + 0.044715 * x³)))
BACKWARD PASS (derivative):
d/dx[GELU(x)] = 0.5 + 0.5 * tanh(g(x)) + 0.5 * x * sech²(g(x)) * g'(x)
where: g(x)  = 0.7978845608 * (x + 0.044715 * x³)
       g'(x) = 0.7978845608 * (1 + 3 * 0.044715 * x²)
Simplified form:
GELU'(x) = 0.5 * (1 + tanh(g(x))) + 0.5 * x * sech²(g(x)) * g'(x)
         = 0.5 * (1 + tanh(g(x))) + 0.5 * x * (1 - tanh²(g(x))) * g'(x)
DIMENSION FLOW: d_output [T × 4D] ──> × GELU'(input) ──> d_input [T × 4D]
HPC CONSIDERATIONS:
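A minimal elementwise sketch of the derivative above, assuming flat illustrative buffers rather than the model's gradient offsets:

```c
#include <math.h>
#include <stddef.h>

/* Sketch only: GELU backward over n elements of a [T x 4D] activation. */
static void gelu_backward_sketch(const float *d_output, const float *input,
                                 float *d_input, size_t n)
{
    const float k = 0.7978845608f;   /* sqrt(2/pi) */
    const float c = 0.044715f;

    for (size_t idx = 0; idx < n; idx++) {
        float x  = input[idx];
        float g  = k * (x + c * x * x * x);
        float gp = k * (1.0f + 3.0f * c * x * x);
        float t  = tanhf(g);
        /* GELU'(x) = 0.5*(1 + tanh(g)) + 0.5*x*(1 - tanh^2(g))*g'(x) */
        float dgelu = 0.5f * (1.0f + t) + 0.5f * x * (1.0f - t * t) * gp;
        d_input[idx] = d_output[idx] * dgelu;
    }
}
```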
| void backward_gelu_fast | ( | TransformerModel * | M, |
| size_t | d_output_offset, | ||
| size_t | input_copy_offset, | ||
| size_t | d_input_offset | ||
| ) |
Alternative: fast approximation using a precomputed GELU derivative. This version trades accuracy for speed by using a simpler approximation.
| void backward_layernorm | ( | TransformerModel * | M, |
| size_t | d_output_offset, | ||
| size_t | input_copy_offset, | ||
| size_t | gamma_copy_offset, | ||
| size_t | beta_copy_offset, | ||
| size_t | mean_copy_offset, | ||
| size_t | rstd_copy_offset, | ||
| size_t | d_input_offset, | ||
| size_t | d_gamma_offset, | ||
| size_t | d_beta_offset | ||
| ) |
FORWARD (for reference):
x_hat = (x - mean) * rstd        (rstd = 1 / sqrt(variance + eps))
y     = gamma * x_hat + beta
BACKWARD: Given d_y, compute d_x, d_gamma, d_beta
The math (per token):
d_x     = (rstd / D) * [D * d_y * gamma - sum(d_y * gamma) - x_hat * sum(d_y * gamma * x_hat)]
d_gamma = sum_over_tokens(d_y * x_hat)
d_beta  = sum_over_tokens(d_y)
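A minimal per-token sketch of these formulas, assuming illustrative raw pointers (the real routine reads x, dy, mean and rstd from the cached forward buffers and parallelizes over tokens):

```c
/* Sketch only: LayerNorm backward for one token of width D.
 * d_gamma and d_beta accumulate across tokens, so the caller zeroes them once. */
static void layernorm_backward_token_sketch(const float *x, const float *dy,
                                            const float *gamma,
                                            float mean, float rstd,
                                            float *dx, float *d_gamma,
                                            float *d_beta, int D)
{
    /* Two row-wise reductions required by the d_x formula */
    float sum_dyg = 0.0f, sum_dyg_xhat = 0.0f;
    for (int i = 0; i < D; i++) {
        float xhat = (x[i] - mean) * rstd;
        float dyg  = dy[i] * gamma[i];
        sum_dyg      += dyg;
        sum_dyg_xhat += dyg * xhat;
    }

    for (int i = 0; i < D; i++) {
        float xhat = (x[i] - mean) * rstd;
        float dyg  = dy[i] * gamma[i];
        /* d_x = (rstd/D) * [D*dy*gamma - sum(dy*gamma) - xhat*sum(dy*gamma*xhat)] */
        dx[i] = (rstd / (float)D) * ((float)D * dyg - sum_dyg - xhat * sum_dyg_xhat);
        d_gamma[i] += dy[i] * xhat;   /* summed over tokens */
        d_beta[i]  += dy[i];          /* summed over tokens */
    }
}
```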
| void backward_linear | ( | TransformerModel * | M, |
| size_t | d_output_offset, | ||
| size_t | input_copy_offset, | ||
| size_t | weight_offset, | ||
| size_t | bias_offset, | ||
| size_t | d_input_offset, | ||
| size_t | d_weight_offset, | ||
| size_t | d_bias_offset | ||
| ) |
BACKWARD THROUGH LINEAR LAYER (GENERIC)
Forward: output = input @ W + bias
This handles the Q, K, and V projections, which are all linear layers. Note: for QKV, the output is in head-major format while the input is token-major.
Backward:
d_input += d_output @ W^T   (accumulate because Q, K, and V all contribute)
d_W     += input^T @ d_output
d_bias  += sum(d_output)
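A naive reference sketch of these three gradient products, assuming token-major [T x IN] input and a [IN x OUT] weight layout (the production kernels use blocked, vectorized GEMMs and head-major outputs for QKV):

```c
/* Sketch only: generic linear-layer backward for output = input @ W + bias.
 * Gradients are accumulated (+=), matching the convention above. */
static void linear_backward_sketch(const float *input, const float *W,
                                   const float *d_output,
                                   float *d_input, float *d_W, float *d_bias,
                                   int T, int IN, int OUT)
{
    for (int t = 0; t < T; t++) {
        for (int o = 0; o < OUT; o++) {
            float go = d_output[t * OUT + o];
            d_bias[o] += go;                                   /* sum over tokens    */
            for (int i = 0; i < IN; i++) {
                d_input[t * IN + i] += go * W[i * OUT + o];    /* d_output @ W^T     */
                d_W[i * OUT + o]    += input[t * IN + i] * go; /* input^T @ d_output */
            }
        }
    }
}
```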
| void backward_lm_head | ( | TransformerModel * | M | ) |
| void backward_qk_matmul | ( | TransformerModel * | M, |
| size_t | d_scores_offset, | ||
| size_t | q_copy_offset, | ||
| size_t | k_copy_offset, | ||
| size_t | d_q_offset, | ||
| size_t | d_k_offset | ||
| ) |
BACKWARD THROUGH Q @ K^T
Forward: scores[h,i,j] = sum_d(Q[h,i,d] * K[h,j,d]) / sqrt(head_dim)
Backward:
d_Q[h,i,d] = sum_j(d_scores[h,i,j] * K[h,j,d]) / sqrt(head_dim)
d_K[h,j,d] = sum_i(d_scores[h,i,j] * Q[h,i,d]) / sqrt(head_dim)
Note: Causal mask means d_scores[h,i,j] = 0 for j > i
| void backward_residual_connection | ( | TransformerModel * | M, |
| size_t | d_output_offset, | ||
| size_t | d_input_offset, | ||
| size_t | d_transform_offset | ||
| ) |
CONCEPT: A residual connection (skip connection) allows gradients to flow directly through the network by adding the input to the output of a transformation. This addresses the vanishing gradient problem in deep networks.
FORWARD PASS: the input x follows two paths: an identity (skip) path that carries x unchanged, and a transform path that computes F(x). The two paths are summed: output = F(x) + x.
Mathematical form: output = input + transform(input)
In transformers specifically:
output = input + MultiHeadAttention(LayerNorm(input))
output = input + FFN(LayerNorm(input))
BACKWARD PASS:
Given: d_output = ∂L/∂output (gradient from the layer above)
Need:  d_input = ∂L/∂input and d_transform = ∂L/∂transform
Since output = input + transform, by the chain rule:
∂output/∂input = 1 and ∂output/∂transform = 1
Therefore:
d_input     = d_output × 1 = d_output
d_transform = d_output × 1 = d_output
BACKWARD FLOW: d_output splits into two identical copies: d_transform (the gradient flowing into the transform) and d_input (the gradient flowing directly through the skip path).
KEY INSIGHT: The gradient d_output flows EQUALLY through both paths: the identity (skip) path and the transform path.
This is why residual connections help with vanishing gradients: even if the transformation has small gradients, the skip path ensures gradients can flow directly to earlier layers.
IMPLEMENTATION NOTES:
| M | Transformer model |
| d_output_offset | Gradient from the layer above |
| d_input_offset | Where to accumulate gradient for input path |
| d_transform_offset | Where to accumulate gradient for transform path |
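A minimal sketch of the accumulation described above, assuming flat illustrative buffers resolved from the gradient offsets:

```c
#include <stddef.h>

/* Sketch only: backward through a residual connection over n elements.
 * The same incoming gradient is added to both paths. */
static void residual_backward_sketch(const float *d_output,
                                     float *d_input, float *d_transform,
                                     size_t n)
{
    for (size_t i = 0; i < n; i++) {
        d_input[i]     += d_output[i];  /* identity / skip path */
        d_transform[i] += d_output[i];  /* transform path       */
    }
}
```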
| void backward_transformer_layer | ( | TransformerModel * | M, |
| int | layer_idx | ||
| ) |
| void benchmark_attention_projection_complete | ( | TransformerModel * | M | ) |
| void benchmark_qkv_dual_comparison | ( | TransformerModel * | M | ) |
| void cache_forward_activations | ( | TransformerModel * | M | ) |
Copy forward-pass activations to gradient storage for the backward pass. This preserves the forward computations needed for gradient calculation.
| double compare_arrays | ( | const float * | a, |
| const float * | b, | ||
| size_t | size, | ||
| const char * | name | ||
| ) |
| void compute_attention_output_head_major | ( | TransformerModel * | M, |
| int | layer_idx | ||
| ) |
| void compute_attention_scores_head_major | ( | TransformerModel * | M, |
| int | layer_idx | ||
| ) |
| void compute_cross_entropy_loss | ( | TransformerModel * | M, |
| int32_t * | target_tokens, | ||
| float * | loss_out | ||
| ) |
Compute cross-entropy loss and gradients w.r.t logits.
Loss = -sum(log(p[correct])) / context_length
Gradient:
dL/dlogit[i] = p[i] - 1   (for the correct token)
dL/dlogit[i] = p[i]       (for all other tokens)
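A minimal single-position sketch of this loss and gradient, assuming illustrative pointers (the real routine iterates over the context, averages the loss, and writes into the gradient buffers):

```c
#include <math.h>

/* Sketch only: softmax cross-entropy for one position.
 * logits/d_logits are [vocab_size]; target is the correct token id.
 * Returns -log(p[target]) and fills d_logits with p[i] (minus 1 at target). */
static float cross_entropy_position_sketch(const float *logits, float *d_logits,
                                           int vocab_size, int target)
{
    /* Numerically stable softmax */
    float maxv = logits[0];
    for (int i = 1; i < vocab_size; i++)
        if (logits[i] > maxv) maxv = logits[i];

    float sum = 0.0f;
    for (int i = 0; i < vocab_size; i++)
        sum += expf(logits[i] - maxv);

    float loss = 0.0f;
    for (int i = 0; i < vocab_size; i++) {
        float p = expf(logits[i] - maxv) / sum;
        d_logits[i] = p;                 /* dL/dlogit[i] = p[i]           */
        if (i == target) {
            d_logits[i] -= 1.0f;         /* minus 1 for the correct token */
            loss = -logf(p);
        }
    }
    return loss;
}
```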
| void compute_logits_last_token_optimized | ( | TransformerModel * | M, |
| int | position | ||
| ) |
| void convert_token_major_to_head_major_layer | ( | const float * | token_major_base, |
| float * | head_major_base, | ||
| TransformerModel * | M | ||
| ) |
| void debug_math_comparison | ( | TransformerModel * | M | ) |
| void embed_tokens | ( | TransformerModel * | M, |
| int32_t * | token_ids, | ||
| int | num_tokens | ||
| ) |
| void gelu_activation_token_parallel | ( | TransformerModel * | M, |
| size_t | data_offset | ||
| ) |
| void gemm_avx512_parallel | ( | const float * | A, |
| const float * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
AVX-512 optimized GEMM with vectorized inner loops.
| A | Input matrix A [M x K] |
| B | Input matrix B [N x K] (transposed) |
| bias | Bias vector [N] |
| C | Output matrix C [M x N] |
| M | Number of rows in A and C |
| N | Number of columns in B and C |
| K | Inner dimension |
@performance
@optimization_details
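A minimal sketch of the vectorized inner pattern for this signature (B stored row-major as [N x K]), assuming K is a multiple of 16 and 64-byte-aligned rows; the production kernel adds blocking, OpenMP parallelism, and remainder handling:

```c
#include <immintrin.h>

/* Sketch only: C[M x N] = A[M x K] @ B[N x K]^T + bias, 16 floats per FMA. */
void gemm_avx512_sketch(const float *A, const float *B, const float *bias,
                        float *C, int M, int N, int K)
{
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
            __m512 acc = _mm512_setzero_ps();
            /* Dot product of row A[m,:] with row B[n,:] */
            for (int k = 0; k < K; k += 16) {
                __m512 a = _mm512_load_ps(&A[m * K + k]);
                __m512 b = _mm512_load_ps(&B[n * K + k]);
                acc = _mm512_fmadd_ps(a, b, acc);
            }
            C[m * N + n] = _mm512_reduce_add_ps(acc) + bias[n];
        }
    }
}
```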
| void gemm_blocked_serial | ( | const float * | A, |
| const float * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
| void gemm_fine_grained_parallel | ( | const float * | A, |
| const float * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Cache-blocked GEMM with fine-grained parallelism.
| A | Input matrix A [M x K] |
| B | Input matrix B [N x K] (transposed) |
| bias | Bias vector [N] |
| C | Output matrix C [M x N] |
| M | Number of rows in A and C |
| N | Number of columns in B and C |
| K | Inner dimension |
@performance
@implementation_notes
@benchmark_results Tested on 8192x8192 matrices:
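A minimal sketch of the cache-blocked loop structure with fine-grained (collapsed) OpenMP parallelism over output tiles; the tile size and scheduling policy are illustrative, and the inner dot product is left scalar for clarity:

```c
#include <omp.h>

#define TILE 64  /* illustrative tile size */

/* Sketch only: blocked C[M x N] = A[M x K] @ B[N x K]^T + bias.
 * Each (row-tile, column-tile) pair is an independent unit of work. */
void gemm_blocked_parallel_sketch(const float *A, const float *B,
                                  const float *bias, float *C,
                                  int M, int N, int K)
{
    #pragma omp parallel for collapse(2) schedule(dynamic)
    for (int mb = 0; mb < M; mb += TILE) {
        for (int nb = 0; nb < N; nb += TILE) {
            int m_end = (mb + TILE < M) ? mb + TILE : M;
            int n_end = (nb + TILE < N) ? nb + TILE : N;
            for (int kb = 0; kb < K; kb += TILE) {
                int k_end = (kb + TILE < K) ? kb + TILE : K;
                for (int m = mb; m < m_end; m++)
                    for (int n = nb; n < n_end; n++) {
                        float acc = (kb == 0) ? bias[n] : C[m * N + n];
                        for (int k = kb; k < k_end; k++)
                            acc += A[m * K + k] * B[n * K + k];
                        C[m * N + n] = acc;
                    }
            }
        }
    }
}
```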
| void gemm_naive_parallel | ( | const float * | A, |
| const float * | B, | ||
| const float * | bias, | ||
| float * | C, | ||
| int | M, | ||
| int | N, | ||
| int | K | ||
| ) |
Naive parallel GEMM implementation (reference baseline)
| A | Input matrix A [M x K] |
| B | Input matrix B [N x K] (transposed) |
| bias | Bias vector [N] |
| C | Output matrix C [M x N] |
| M | Number of rows in A and C |
| N | Number of columns in B and C |
| K | Inner dimension (columns of A, rows of B) |
@performance
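A minimal reference sketch of the baseline implied by this signature (note that B is stored transposed, [N x K], so both operands are read row-wise):

```c
/* Sketch only: naive parallel C[M x N] = A[M x K] @ B[N x K]^T + bias. */
void gemm_naive_parallel_sketch(const float *A, const float *B,
                                const float *bias, float *C,
                                int M, int N, int K)
{
    #pragma omp parallel for
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
            float acc = bias[n];
            for (int k = 0; k < K; k++)
                acc += A[m * K + k] * B[n * K + k];
            C[m * N + n] = acc;
        }
    }
}
```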
| void generate | ( | TransformerModel * | M, |
| int * | prompt, | ||
| int | prompt_len, | ||
| int | max_tokens | ||
| ) |
| void layernorm_forward_rolled_slice | ( | const float *__restrict | input_slice_base, |
| const float *__restrict | gamma, | ||
| const float *__restrict | beta, | ||
| float *__restrict | output_slice_base, | ||
| float *__restrict | mean_cache_slice, | ||
| float *__restrict | rstd_cache_slice, | ||
| int | num_tokens_in_slice, | ||
| int | d_model, | ||
| int | aligned_embed_dim, | ||
| float | eps | ||
| ) |
| void layernorm_forward_unrolled_slice | ( | const float *__restrict | input_slice_base, |
| const float *__restrict | gamma, | ||
| const float *__restrict | beta, | ||
| float *__restrict | output_slice_base, | ||
| float *__restrict | mean_cache_slice, | ||
| float *__restrict | rstd_cache_slice, | ||
| int | num_tokens_in_slice, | ||
| int | d_model, | ||
| float | eps | ||
| ) |
| void layernorm_naive_serial | ( | const float * | input, |
| const float * | gamma, | ||
| const float * | beta, | ||
| float * | output, | ||
| float * | mean_cache, | ||
| float * | rstd_cache, | ||
| int | tokens, | ||
| int | d_model, | ||
| int | aligned_embed_dim, | ||
| float | eps | ||
| ) |
| void layernorm_naive_serial_matched_precision | ( | const float * | input, |
| const float * | gamma, | ||
| const float * | beta, | ||
| float * | output, | ||
| float * | mean_cache, | ||
| float * | rstd_cache, | ||
| int | tokens, | ||
| int | d_model, | ||
| float | eps | ||
| ) |
| void layernorm_token_parallel | ( | TransformerModel * | M, |
| size_t | input_offset, | ||
| size_t | weight_offset, | ||
| size_t | bias_offset, | ||
| size_t | mean_cache_offset, | ||
| size_t | rstd_cache_offset, | ||
| size_t | output_offset, | ||
| float | eps | ||
| ) |
Token-parallel Layer Normalization with AVX-512 optimization.
Performs Layer Normalization across tokens using token-level parallelism. Each CPU core processes a contiguous slice of tokens independently, achieving perfect cache locality and zero synchronization overhead.
| M | Transformer model containing memory layout and parallelism config |
| input_offset | Offset to input tensor [context_window × aligned_embed_dim] |
| weight_offset | Offset to gamma weights [aligned_embed_dim] |
| bias_offset | Offset to beta biases [aligned_embed_dim] |
| mean_cache_offset | Offset to mean cache [context_window] (for backward pass) |
| rstd_cache_offset | Offset to rstd cache [context_window] (for backward pass) |
| output_offset | Offset to output tensor [context_window × aligned_embed_dim] |
| eps | Epsilon for numerical stability (typically 1e-5) |
Token-Level Parallelism Strategy:
Why Token-Parallel?:
Memory Access Pattern (per core):
Algorithm (per token): compute the mean over d_model, compute the variance, form rstd = 1/sqrt(variance + eps), write output = gamma * (x - mean) * rstd + beta, and cache mean and rstd for the backward pass.
AVX-512 Optimization:
FMA accumulation: acc = diff * diff + acc (2 FLOPs per cycle)
Aligned loads via _mm512_load_ps (requires 64-byte alignment)
Performance Characteristics:
Cache Behavior:
Why Aligned Embed Dim?: Padding to 64-byte boundaries ensures:
@performance Measured 7.8x speedup on 8-core Xeon vs serial baseline
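A minimal sketch of the token-slicing dispatch, assuming raw illustrative pointers in place of the model's offset arithmetic and assuming the last core absorbs any remainder tokens; each slice is handed to the existing layernorm_forward_rolled_slice kernel:

```c
#include <omp.h>
#include <stddef.h>

/* Sketch only: split the token range into contiguous per-core slices and
 * normalize each slice independently (no synchronization needed). */
static void layernorm_token_parallel_sketch(const float *input, const float *gamma,
                                            const float *beta, float *output,
                                            float *mean_cache, float *rstd_cache,
                                            int context_window, int d_model,
                                            int aligned_embed_dim, int num_cores,
                                            float eps)
{
    int tokens_per_core = context_window / num_cores;

    #pragma omp parallel num_threads(num_cores)
    {
        int core  = omp_get_thread_num();
        int start = core * tokens_per_core;
        int count = (core == num_cores - 1) ? context_window - start
                                            : tokens_per_core;

        layernorm_forward_rolled_slice(input  + (size_t)start * aligned_embed_dim,
                                       gamma, beta,
                                       output + (size_t)start * aligned_embed_dim,
                                       mean_cache + start, rstd_cache + start,
                                       count, d_model, aligned_embed_dim, eps);
    }
}
```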
| int load_model_weights | ( | TransformerModel * | M, |
| const char * | weight_file | ||
| ) |
Load weights into already-allocated TransformerModel.
This assumes: read_model_metadata() has already populated the model dimensions, and the model's contiguous memory block has already been allocated.
| M | Pointer to initialized and allocated TransformerModel |
| weight_file | Path to the .weights file |
| int main | ( | int | argc, |
| char ** | argv | ||
| ) |
| void mlp_token_parallel | ( | TransformerModel * | M, |
| size_t | input_offset, | ||
| size_t | fc1_weight_offset, | ||
| size_t | fc1_bias_offset, | ||
| size_t | fc1_output_offset, | ||
| size_t | fc2_weight_offset, | ||
| size_t | fc2_bias_offset, | ||
| size_t | output_offset | ||
| ) |
| static void qkv_micro_kernel_blocked_4x16_polished | ( | const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, float *__restrict Q_output_4, float *__restrict K_output_4, float *__restrict V_output_4, int embed_dim | ) |
inline static
| static void qkv_micro_kernel_head_major_4x16 | ( | const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx, int output_start_dim | ) |
inline static
| void qkv_projection | ( | TransformerModel * | M, |
| size_t | layer_idx | ||
| ) |
| void qkv_projection_head_major | ( | TransformerModel * | M, |
| int | layer_idx | ||
| ) |
| static void qkv_token_kernel_4x16_blocked_polished | ( | const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, float *__restrict Q_output, float *__restrict K_output, float *__restrict V_output, int embed_dim | ) |
static
| static void qkv_token_kernel_head_major_4x16 | ( | const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx | ) |
static
| int read_model_metadata | ( | TransformerModel * | M, |
| const char * | weight_file | ||
| ) |
Read model metadata from weight file header.
This function ONLY reads the header and populates model dimensions. It does NOT allocate memory - that's your decision to make separately.
| M | Pointer to zero-initialized TransformerModel struct |
| weight_file | Path to the .weights file |
| void residual_add_token_parallel | ( | TransformerModel * | M, |
| size_t | input_offset, | ||
| size_t | residual_offset, | ||
| size_t | output_offset | ||
| ) |
| void run_comprehensive_benchmark | ( | TransformerModel * | M | ) |
| void run_layernorm_benchmark_performance | ( | TransformerModel * | M | ) |
| void run_layernorm_benchmark_precision_matched | ( | TransformerModel * | M | ) |
| int sample_token | ( | float * | logits, |
| int | vocab_size, | ||
| float | temperature | ||
| ) |
| void test_attention_head_major_after_qkv | ( | TransformerModel * | M | ) |
| void training_step | ( | TransformerModel * | M, |
| int32_t * | input_tokens, | ||
| int32_t * | target_tokens, | ||
| float | learning_rate | ||
| ) |
| void transformer_layer_forward | ( | TransformerModel * | M, |
| int | layer_idx, | ||
| size_t | layer_input_offset | ||
| ) |
| void update_all_weights_sgd | ( | TransformerModel * | M, |
| float | learning_rate | ||
| ) |
| void zero_gradients | ( | TransformerModel * | M | ) |
| size_t GradientStorage::actual_tokens_offset |
| size_t TransformerModel::aligned_attn_context_window |
context_window padded to prevent false sharing
| size_t TransformerModel::aligned_embed_dim |
embed_dim rounded up to 64-byte alignment (in floats)
| size_t TransformerModel::aligned_head_dim |
head_dim rounded up to 64-byte alignment (in floats)
| size_t LayerGradients::attention_output_copy_offset |
| size_t TrulyOptimalLayer::attention_output_offset |
| size_t LayerGradients::attention_scores_copy_offset |
| size_t TrulyOptimalLayer::attention_scores_offset |
| size_t LayerGradients::attention_weights_copy_offset |
| size_t GradientStorage::backprop_base |
| uint8_t TransformerModel::checksum[32] |
SHA256 checksum of weight file.
| int TransformerModel::context_window |
Maximum sequence length (e.g., 1024)
| size_t LayerGradients::d_attention_output_offset |
| size_t LayerGradients::d_attention_scores_offset |
| size_t LayerGradients::d_attention_weights_offset |
| size_t GradientStorage::d_embed_weights_offset |
| size_t LayerGradients::d_fc1_bias_offset |
| size_t LayerGradients::d_fc1_output_offset |
| size_t LayerGradients::d_fc1_weights_offset |
| size_t LayerGradients::d_fc2_bias_offset |
| size_t LayerGradients::d_fc2_input_offset |
| size_t LayerGradients::d_fc2_weights_offset |
| size_t GradientStorage::d_final_ln_beta_offset |
| size_t GradientStorage::d_final_ln_gamma_offset |
| size_t GradientStorage::d_final_ln_input_offset |
| size_t GradientStorage::d_final_output_offset |
| size_t LayerGradients::d_k_bias_offset |
| size_t LayerGradients::d_k_output_offset |
| size_t LayerGradients::d_k_weights_offset |
| size_t LayerGradients::d_ln1_beta_offset |
| size_t LayerGradients::d_ln1_gamma_offset |
| size_t LayerGradients::d_ln1_input_offset |
| size_t LayerGradients::d_ln1_output_offset |
| size_t LayerGradients::d_ln2_beta_offset |
| size_t LayerGradients::d_ln2_gamma_offset |
| size_t LayerGradients::d_ln2_input_offset |
| size_t LayerGradients::d_ln2_output_offset |
| size_t GradientStorage::d_logits_offset |
| size_t LayerGradients::d_mlp_output_offset |
| size_t GradientStorage::d_pos_embed_offset |
| size_t LayerGradients::d_proj_bias_offset |
| size_t LayerGradients::d_proj_weights_offset |
| size_t LayerGradients::d_q_bias_offset |
| size_t LayerGradients::d_q_output_offset |
| size_t LayerGradients::d_q_weights_offset |
| size_t LayerGradients::d_residual1_offset |
| size_t LayerGradients::d_residual2_offset |
| size_t LayerGradients::d_v_bias_offset |
| size_t LayerGradients::d_v_output_offset |
| size_t LayerGradients::d_v_weights_offset |
| int TransformerModel::embed_dim |
Embedding dimension (e.g., 768 for GPT-2 small)
| size_t TransformerModel::embedded_input_offset |
Combined token+pos embeddings [context_window × aligned_embed_dim].
| size_t LayerGradients::fc1_bias_copy_offset |
| size_t TrulyOptimalLayer::fc1_bias_offset |
| size_t LayerGradients::fc1_output_copy_offset |
| size_t TrulyOptimalLayer::fc1_output_offset |
| size_t TrulyOptimalLayer::fc1_weight_offset |
| size_t LayerGradients::fc1_weights_copy_offset |
| size_t LayerGradients::fc2_bias_copy_offset |
| size_t TrulyOptimalLayer::fc2_bias_offset |
| size_t LayerGradients::fc2_input_copy_offset |
| size_t TrulyOptimalLayer::fc2_weight_offset |
| size_t LayerGradients::fc2_weights_copy_offset |
| size_t GradientStorage::final_ln_beta_copy_offset |
| size_t TransformerModel::final_ln_bias_offset |
Final LayerNorm beta [aligned_embed_dim].
| size_t GradientStorage::final_ln_gamma_copy_offset |
| size_t GradientStorage::final_ln_input_copy_offset |
| size_t GradientStorage::final_ln_mean_copy_offset |
| size_t TransformerModel::final_ln_mean_offset |
Final LayerNorm mean [context_window].
| size_t GradientStorage::final_ln_rstd_copy_offset |
| size_t TransformerModel::final_ln_rstd_offset |
Final LayerNorm rstd [context_window].
| size_t TransformerModel::final_ln_weight_offset |
Final LayerNorm gamma [aligned_embed_dim].
| size_t GradientStorage::final_output_copy_offset |
| size_t TransformerModel::final_output_offset |
Final normalized output [context_window × aligned_embed_dim].
| GradientStorage TransformerModel::gradients |
Gradient and activation cache memory (training only)
| int TransformerModel::head_dim |
Dimension per head: embed_dim / num_attention_heads.
| size_t LayerGradients::k_bias_copy_offset |
| size_t TrulyOptimalLayer::k_bias_offset |
| size_t LayerGradients::k_output_copy_offset |
| size_t TrulyOptimalLayer::k_output_offset |
| size_t TrulyOptimalLayer::k_weight_offset |
| size_t LayerGradients::k_weights_copy_offset |
| size_t GradientStorage::layer_backprop_stride |
| size_t TrulyOptimalLayer::layer_end_canary_offset |
| size_t TrulyOptimalLayer::layer_input_offset |
| size_t TrulyOptimalLayer::layer_start_canary_offset |
| size_t TransformerModel::layer_stride |
Byte offset between consecutive layer memory blocks.
| LayerGradients* GradientStorage::layers |
| TrulyOptimalLayer* TransformerModel::layers |
Array of per-layer offset structures.
| size_t TransformerModel::layers_start_offset |
Start of first transformer layer memory.
| float TransformerModel::learning_rate |
SGD learning rate for weight updates.
| size_t TransformerModel::lm_head_weight_offset |
Language model head (weight-tied to token_emb_offset)
| size_t LayerGradients::ln1_beta_copy_offset |
| size_t TrulyOptimalLayer::ln1_bias_offset |
| size_t LayerGradients::ln1_gamma_copy_offset |
| size_t LayerGradients::ln1_input_copy_offset |
| size_t LayerGradients::ln1_mean_copy_offset |
| size_t TrulyOptimalLayer::ln1_mean_offset |
| size_t LayerGradients::ln1_output_copy_offset |
| size_t TrulyOptimalLayer::ln1_output_offset |
| size_t LayerGradients::ln1_rstd_copy_offset |
| size_t TrulyOptimalLayer::ln1_rstd_offset |
| size_t TrulyOptimalLayer::ln1_weight_offset |
| size_t LayerGradients::ln2_beta_copy_offset |
| size_t TrulyOptimalLayer::ln2_bias_offset |
| size_t LayerGradients::ln2_gamma_copy_offset |
| size_t LayerGradients::ln2_input_copy_offset |
| size_t LayerGradients::ln2_mean_copy_offset |
| size_t TrulyOptimalLayer::ln2_mean_offset |
| size_t LayerGradients::ln2_output_copy_offset |
| size_t TrulyOptimalLayer::ln2_output_offset |
| size_t LayerGradients::ln2_rstd_copy_offset |
| size_t TrulyOptimalLayer::ln2_rstd_offset |
| size_t TrulyOptimalLayer::ln2_weight_offset |
| size_t GradientStorage::logits_copy_offset |
| size_t TransformerModel::logits_offset |
Output logits [context_window × vocab_size].
| char TransformerModel::magic[8] |
Magic string "BUMPWGT2" for file validation.
| float* TransformerModel::memory_base |
Base pointer to single contiguous memory block.
| size_t LayerGradients::mlp_output_copy_offset |
| size_t TrulyOptimalLayer::mlp_output_offset |
| uint32_t TransformerModel::model_type |
Model architecture: 0=GPT2, 1=LLAMA, etc.
| int TransformerModel::num_attention_heads |
Number of attention heads (e.g., 12 for GPT-2)
| int TransformerModel::num_cores |
Number of CPU cores to use (OpenMP threads)
| int TransformerModel::num_layers |
Number of transformer layers (e.g., 12 for GPT-2)
| size_t TransformerModel::pos_emb_offset |
Positional embedding table [context_window × aligned_embed_dim].
| size_t LayerGradients::proj_bias_copy_offset |
| size_t TrulyOptimalLayer::proj_bias_offset |
| size_t TrulyOptimalLayer::proj_weight_offset |
| size_t LayerGradients::proj_weights_copy_offset |
| size_t LayerGradients::q_bias_copy_offset |
| size_t TrulyOptimalLayer::q_bias_offset |
| size_t LayerGradients::q_output_copy_offset |
| size_t TrulyOptimalLayer::q_output_offset |
| size_t TrulyOptimalLayer::q_weight_offset |
| size_t LayerGradients::q_weights_copy_offset |
| uint8_t TransformerModel::reserved[32] |
Reserved for future extensions.
| size_t LayerGradients::residual1_copy_offset |
| size_t TrulyOptimalLayer::residual1_output_offset |
| size_t LayerGradients::residual2_copy_offset |
| size_t TrulyOptimalLayer::residual2_output_offset |
| size_t TransformerModel::token_emb_offset |
Token embedding table [vocab_size × aligned_embed_dim].
| int TransformerModel::tokens_per_core |
Tokens assigned per core: context_window / num_cores.
| size_t TransformerModel::total_floats |
Total size of memory block in float elements.
| size_t GradientStorage::total_gradient_floats |
| bool TransformerModel::training_enabled |
Whether gradient storage is allocated.
| size_t LayerGradients::v_bias_copy_offset |
| size_t TrulyOptimalLayer::v_bias_offset |
| size_t LayerGradients::v_output_copy_offset |
| size_t TrulyOptimalLayer::v_output_offset |
| size_t TrulyOptimalLayer::v_weight_offset |
| size_t LayerGradients::v_weights_copy_offset |
| uint32_t TransformerModel::version |
Weight file format version.
| int TransformerModel::vocab_size |
Vocabulary size (e.g., 50257 for GPT-2)