C-Transformer
Cache-Optimized Transformers in C (x86)
main.c File Reference

CPU-Optimized Large Language Model Runtime (x86-64) More...

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <errno.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <getopt.h>
#include <immintrin.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <omp.h>
#include <stdbool.h>
#include <assert.h>

Classes

struct  TrulyOptimalLayer
 
struct  LayerGradients
 
struct  GradientStorage
 
struct  TransformerModel
 Main transformer model structure with unified memory layout. More...
 

Macros

#define _GNU_SOURCE
 
#define ALIGN_UP(n, a)   (((n) + (a) - 1) & ~((a) - 1))
 
#define min(a, b)   ((a) < (b) ? (a) : (b))
 
#define CACHE_ALIGN   64ULL
 
#define HUGE_ALIGN   (2ULL * 1024 * 1024) /* 2 MB huge page */
 
#define CANARY_SIZE_FLOATS   16
 
#define FINAL_CANARY_ZONE_FLOATS   1024
 
#define Q_ACCESS(q_ptr, h, t, d, context_window, aligned_head_dim)    q_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]
 
#define K_ACCESS(k_ptr, h, t, d, context_window, aligned_head_dim)    k_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]
 
#define V_ACCESS(v_ptr, h, t, d, context_window, aligned_head_dim)    v_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]
 
#define ATTN_ACCESS(attn_ptr, head_idx, query_token, key_token, context_window)    attn_ptr[((head_idx) * (context_window) + (query_token)) * (context_window) + (key_token)]
 
#define ATTN_SCORES_ACCESS(scores_ptr, h, i, j, aligned_context_window)    scores_ptr[((h) * (aligned_context_window) * (aligned_context_window)) + ((i) * (aligned_context_window)) + (j)]
 
#define READ_ALIGNED_TENSOR(ptr, expected_floats, name)
 

Functions

static size_t align_up (size_t n, size_t a)
 
static double get_time_sec ()
 
static void * huge_alloc (size_t bytes)
 Allocate memory using 2MB hugepages (with fallback to THP)
 
static size_t bump (size_t *off, size_t count, size_t alignB)
 Bump allocator for sequential memory layout.
 
void layout_gradients (TransformerModel *M, size_t *offset)
 Lays out the memory for the backward pass.
 
void layout_transformer (TransformerModel *M, bool training_mode)
 Plans and allocates a single contiguous memory block for the entire Transformer model.
 
void destroy_transformer (TransformerModel *M)
 
static size_t bytes_needed (int layers, int vocab, int d_model, int ctx)
 
float compute_max_diff (const float *ref, const float *test, size_t count)
 
float compute_rmse (const float *ref, const float *test, size_t count)
 
void gemm_naive_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 Naive parallel GEMM implementation (reference baseline)
 
void gemm_avx512_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 AVX-512 optimized GEMM with vectorized inner loops.
 
void gemm_fine_grained_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 Cache-blocked GEMM with fine-grained parallelism.
 
void gemm_blocked_serial (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void layernorm_naive_serial (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_forward_rolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_forward_unrolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)
 
void layernorm_token_parallel (TransformerModel *M, size_t input_offset, size_t weight_offset, size_t bias_offset, size_t mean_cache_offset, size_t rstd_cache_offset, size_t output_offset, float eps)
 Token-parallel Layer Normalization with AVX-512 optimization.
 
void layernorm_naive_serial_matched_precision (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, float eps)
 
static void * aligned_alloc_64 (size_t size)
 
void debug_math_comparison (TransformerModel *M)
 
void run_layernorm_benchmark_precision_matched (TransformerModel *M)
 
void run_layernorm_benchmark_performance (TransformerModel *M)
 
static void qkv_micro_kernel_blocked_4x16_polished (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, float *__restrict Q_output_4, float *__restrict K_output_4, float *__restrict V_output_4, int embed_dim)
 
static void qkv_token_kernel_4x16_blocked_polished (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, float *__restrict Q_output, float *__restrict K_output, float *__restrict V_output, int embed_dim)
 
void qkv_projection (TransformerModel *M, size_t layer_idx)
 
static void qkv_micro_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx, int output_start_dim)
 
static void qkv_token_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx)
 
void qkv_projection_head_major (TransformerModel *M, int layer_idx)
 
double compare_arrays (const float *a, const float *b, size_t size, const char *name)
 
void convert_token_major_to_head_major_layer (const float *token_major_base, float *head_major_base, TransformerModel *M)
 
void benchmark_qkv_dual_comparison (TransformerModel *M)
 
void compute_attention_scores_head_major (TransformerModel *M, int layer_idx)
 
void apply_causal_softmax_head_major (TransformerModel *M, int layer_idx)
 
void compute_attention_output_head_major (TransformerModel *M, int layer_idx)
 
void attention_head_major_complete (TransformerModel *M, int layer_idx)
 Complete multi-head attention with head-major layout (self-attention)
 
void test_attention_head_major_after_qkv (TransformerModel *M)
 
void attention_projection_with_concat (TransformerModel *M, int layer_idx)
 Production attention projection with concat: Head-major → Token-major → GEMM.
 
void benchmark_attention_projection_complete (TransformerModel *M)
 
void add_gpt2_token_and_positional_embeddings (TransformerModel *M, size_t token_ids_offset, size_t output_offset)
 
void residual_add_token_parallel (TransformerModel *M, size_t input_offset, size_t residual_offset, size_t output_offset)
 
void gelu_activation_token_parallel (TransformerModel *M, size_t data_offset)
 
void mlp_token_parallel (TransformerModel *M, size_t input_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t fc1_output_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t output_offset)
 
void embed_tokens (TransformerModel *M, int32_t *token_ids, int num_tokens)
 
void compute_logits_last_token_optimized (TransformerModel *M, int position)
 
void transformer_layer_forward (TransformerModel *M, int layer_idx, size_t layer_input_offset)
 
void run_comprehensive_benchmark (TransformerModel *M)
 
int load_model_weights (TransformerModel *M, const char *weight_file)
 Load weights into already-allocated TransformerModel.
 
int read_model_metadata (TransformerModel *M, const char *weight_file)
 Read model metadata from weight file header.
 
int sample_token (float *logits, int vocab_size, float temperature)
 
void generate (TransformerModel *M, int *prompt, int prompt_len, int max_tokens)
 
void zero_gradients (TransformerModel *M)
 
void cache_forward_activations (TransformerModel *M)
 Copy forward-pass activations to gradient storage for the backward pass. This preserves the forward computations needed for gradient calculation.
 
void backward_residual_connection (TransformerModel *M, size_t d_output_offset, size_t d_input_offset, size_t d_transform_offset)
 
void backward_embedding_layer (TransformerModel *M)
 
void backward_final_layernorm (TransformerModel *M)
 
void backward_fc2 (TransformerModel *M, size_t d_output_offset, size_t fc2_input_copy_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_gelu (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset)
 
void backward_fc1 (TransformerModel *M, size_t d_output_offset, size_t fc1_input_copy_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_gelu_fast (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset)
 
void backward_layernorm (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t gamma_copy_offset, size_t beta_copy_offset, size_t mean_copy_offset, size_t rstd_copy_offset, size_t d_input_offset, size_t d_gamma_offset, size_t d_beta_offset)
 
void add_gradient (TransformerModel *M, size_t source_offset, size_t dest_offset)
 
void backward_attention_projection (TransformerModel *M, size_t d_output_offset, size_t attention_output_copy_offset, size_t proj_weight_offset, size_t proj_bias_offset, size_t d_attention_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_attention_weighted_values (TransformerModel *M, size_t d_output_offset, size_t attention_weights_offset, size_t v_output_offset, size_t d_weights_offset, size_t d_v_offset)
 
void backward_causal_softmax (TransformerModel *M, size_t d_scores_offset, size_t weights_copy_offset, size_t scores_copy_offset)
 
void backward_qk_matmul (TransformerModel *M, size_t d_scores_offset, size_t q_copy_offset, size_t k_copy_offset, size_t d_q_offset, size_t d_k_offset)
 
void backward_linear (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t weight_offset, size_t bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_lm_head (TransformerModel *M)
 
void backward_transformer_layer (TransformerModel *M, int layer_idx)
 
void compute_cross_entropy_loss (TransformerModel *M, int32_t *target_tokens, float *loss_out)
 Compute cross-entropy loss and gradients w.r.t. the logits.
 
void training_step (TransformerModel *M, int32_t *input_tokens, int32_t *target_tokens, float learning_rate)
 
void update_all_weights_sgd (TransformerModel *M, float learning_rate)
 
int main (int argc, char **argv)
 

Detailed Description

CPU-Optimized Large Language Model Runtime (x86-64)

Author
ANTSHIV ROBOTICS
Version
1.0
Date
2025

Macro Definition Documentation

◆ _GNU_SOURCE

#define _GNU_SOURCE

◆ ALIGN_UP

#define ALIGN_UP (   n,
  a 
)    (((n) + (a) - 1) & ~((a) - 1))
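
For example, ALIGN_UP(100, 64) yields 128 and ALIGN_UP(64, 64) stays 64. The bit-mask form requires a to be a power of two, which holds for both CACHE_ALIGN (64) and HUGE_ALIGN (2 MB).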

◆ ATTN_ACCESS

#define ATTN_ACCESS (   attn_ptr,
  head_idx,
  query_token,
  key_token,
  context_window 
)     attn_ptr[((head_idx) * (context_window) + (query_token)) * (context_window) + (key_token)]

◆ CACHE_ALIGN

#define CACHE_ALIGN   64ULL

◆ CANARY_SIZE_FLOATS

#define CANARY_SIZE_FLOATS   16

◆ FINAL_CANARY_ZONE_FLOATS

#define FINAL_CANARY_ZONE_FLOATS   1024

◆ HUGE_ALIGN

#define HUGE_ALIGN   (2ULL * 1024 * 1024) /* 2 MB huge page */

◆ K_ACCESS

#define K_ACCESS (   k_ptr,
  h,
  t,
  d,
  context_window,
  aligned_head_dim 
)     k_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]

◆ min

#define min (   a,
  b 
)    ((a) < (b) ? (a) : (b))

◆ Q_ACCESS

#define Q_ACCESS (   q_ptr,
  h,
  t,
  d,
  context_window,
  aligned_head_dim 
)     q_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]

◆ READ_ALIGNED_TENSOR

#define READ_ALIGNED_TENSOR (   ptr,
  expected_floats,
  name 
)
Value:
do { \
    size_t bytes_to_read = (expected_floats) * sizeof(float); \
    size_t bytes_read = fread(ptr, 1, bytes_to_read, fp); \
    if (bytes_read != bytes_to_read) { \
        fprintf(stderr, "❌ Failed to read %s: expected %zu bytes, got %zu\n", \
                name, bytes_to_read, bytes_read); \
        fclose(fp); \
        return -1; \
    } \
} while(0)
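
The macro assumes an open FILE *fp in scope and an enclosing function that returns int. A hypothetical call site for illustration (the file name, tensor size, and label below are assumptions, not the project's actual loader):

/* Illustrative only: reads one 768x768 weight matrix into dst.
 * On a short read the macro itself does fclose(fp) and returns -1. */
static int load_one_tensor(float *dst) {
    FILE *fp = fopen("model.weights", "rb");
    if (!fp) return -1;
    READ_ALIGNED_TENSOR(dst, 768 * 768, "W_q");
    fclose(fp);
    return 0;
}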

◆ V_ACCESS

#define V_ACCESS (   v_ptr,
  h,
  t,
  d,
  context_window,
  aligned_head_dim 
)     v_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]
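
Taken together, Q_ACCESS, K_ACCESS, V_ACCESS, and ATTN_ACCESS imply a head-major layout, [head][token][aligned_head_dim], with the head dimension padded for SIMD. A worked example with hypothetical sizes:

/* Assumed shape: context_window = 1024, aligned_head_dim = 64. */
float x = Q_ACCESS(q_ptr, 2, 5, 7, 1024, 64);
/* Expands to q_ptr[(2*1024 + 5)*64 + 7] = q_ptr[131399]:
 * all tokens of head 2 are contiguous, so per-head attention
 * (Q·K^T over one head) streams memory sequentially. */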

Function Documentation

◆ align_up()

static size_t align_up ( size_t  n,
size_t  a 
)
inline static

◆ bump()

static size_t bump ( size_t *  off,
size_t  count,
size_t  alignB 
)
inline static

Bump allocator for sequential memory layout.

Core primitive for building the contiguous memory layout. This function:

  1. Aligns the current offset to the requested boundary
  2. Returns the aligned offset for tensor placement
  3. Advances the offset by the tensor size
Parameters
off      Pointer to current offset cursor (in float elements)
count    Number of float elements to allocate
alignB   Alignment requirement in bytes (e.g., 64 for cache lines)
Returns
Aligned offset where tensor should be placed

Why Bump Allocation?

  • Zero Fragmentation: Sequential layout, no holes
  • Predictable Addresses: Enables memory-mapped file loading
  • Cache Locality: Related tensors are spatially close
  • Dry-Run Mode: Can compute total size before allocation

Example Usage:

size_t offset = 0;
size_t q_weight_off = bump(&offset, 768*768, CACHE_ALIGN); // Aligned to 64B
size_t k_weight_off = bump(&offset, 768*768, CACHE_ALIGN); // Sequential
size_t v_weight_off = bump(&offset, 768*768, CACHE_ALIGN);
// offset now contains total size needed

Alignment Rationale:

  • 64-byte alignment matches cache line size
  • Enables use of aligned SIMD loads (_mm512_load_ps vs _mm512_loadu_ps)
  • Prevents false sharing (each tensor starts on new cache line)
Note
This function does NOT allocate memory, only tracks offsets
See also
layout_transformer Full memory layout using bump allocation
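
A minimal sketch of a bump allocator matching the three steps above (offsets in float elements, alignment in bytes); the project's actual definition may differ:

static size_t bump(size_t *off, size_t count, size_t alignB) {
    size_t alignF = alignB / sizeof(float);               /* bytes -> floats   */
    size_t aligned = (*off + alignF - 1) & ~(alignF - 1); /* 1. align cursor   */
    *off = aligned + count;                               /* 3. advance cursor */
    return aligned;                                       /* 2. placement      */
}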

◆ bytes_needed()

static size_t bytes_needed ( int  layers,
int  vocab,
int  d_model,
int  ctx 
)
static

◆ compute_max_diff()

float compute_max_diff ( const float *  ref,
const float *  test,
size_t  count 
)

◆ compute_rmse()

float compute_rmse ( const float *  ref,
const float *  test,
size_t  count 
)

◆ destroy_transformer()

void destroy_transformer ( TransformerModel *  M)

◆ get_time_sec()

static double get_time_sec ( )
inline static

◆ huge_alloc()

static void * huge_alloc ( size_t  bytes)
static

Allocate memory using 2MB hugepages (with fallback to THP)

Attempts to allocate memory backed by explicit 2MB hugepages via mmap. If that fails (e.g., insufficient hugepages configured), falls back to aligned_alloc + madvise(MADV_HUGEPAGE) for transparent hugepage (THP) support.

Parameters
bytes    Number of bytes to allocate
Returns
Pointer to allocated memory (2MB-aligned)

Why Hugepages Matter:

  • TLB Efficiency: 2MB pages reduce TLB entries by 512x vs 4KB pages
  • Page Fault Reduction: Fewer page faults during model initialization
  • Memory Bandwidth: Better DRAM page locality
  • Latency: Reduced virtual-to-physical address translation overhead

Performance Impact: A 4GB model spans ~1M 4KB pages but only ~2K 2MB hugepages:

  • TLB misses: ~1M potential misses without hugepages
  • TLB misses: ~2K potential misses with 2MB hugepages
  • Result: ~512x reduction in TLB pressure (typically a ~5-10% speedup)

System Configuration (Linux):

# Check available hugepages
cat /proc/meminfo | grep Huge
# Allocate 2048 × 2MB = 4GB of hugepages
echo 2048 | sudo tee /proc/sys/vm/nr_hugepages
# Enable THP (fallback)
echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled

Allocation Strategy:

  1. Try explicit hugepages via mmap + MAP_HUGETLB (best performance)
  2. Fall back to aligned_alloc + MADV_HUGEPAGE (THP, still good)
  3. Kernel promotes pages to 2MB when possible
Warning
Requires root or CAP_SYS_RESOURCE for explicit hugepages
Note
Falls back gracefully to THP if explicit allocation fails
See also
https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
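
A sketch of the two-tier strategy described above (Linux-specific; the real function may handle rounding and error reporting differently, hence the _sketch suffix):

#include <stdlib.h>
#include <sys/mman.h>

static void *huge_alloc_sketch(size_t bytes) {
    /* Round the request up to a whole number of 2 MB pages. */
    size_t len = (bytes + HUGE_ALIGN - 1) & ~(HUGE_ALIGN - 1);

    /* 1. Explicit hugepages: fails fast if none are reserved. */
    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (p != MAP_FAILED)
        return p;

    /* 2. THP fallback: 2 MB-aligned allocation + advisory promotion. */
    p = aligned_alloc(HUGE_ALIGN, len);
    if (p)
        madvise(p, len, MADV_HUGEPAGE);
    return p;
}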

◆ layout_gradients()

void layout_gradients ( TransformerModel *  M,
size_t *  offset 
)

Lays out the memory for the backward pass.

Parameters
M        Pointer to the TransformerModel struct.
offset   Pointer to the current memory offset, which will be updated.

This function allocates a dedicated, contiguous memory arena for all data required during backpropagation. This includes:

  • GRADS: Buffers to accumulate gradients for every weight and bias.
  • ACTS: Cached copies of activations from the forward pass needed by the backward pass.
  • dACTS: Buffers to hold the gradients of activations as they flow backward.
┌────────────────────────────────────────────────────────────┐
│ Global GRADS (Embeddings, Final LN)                        │
├────────────────────────────────────────────────────────────┤
│ Global ACTS (Logits, Final LN inputs...)                   │
├────────────────────────────────────────────────────────────┤
│ Global dACTS (dLogits, dFinal_output...)                   │
├────────────────────────────────────────────────────────────┤
│ Per-Layer Arena (Layer 0)                                  │
│ ┌────────────────────────────────────────────────────────┐ │
│ │ Layer 0 ACTS (ln_mean, attn_probs, fc1_preact...)      │ │
│ ├────────────────────────────────────────────────────────┤ │
│ │ Layer 0 dACTS (dL/d_mlp_output, dL/d_attn_output...)   │ │
│ ├────────────────────────────────────────────────────────┤ │
│ │ Layer 0 GRADS (dL/dW_fc1, dL/dW_q...)                  │ │
│ └────────────────────────────────────────────────────────┘ │
├────────────────────────────────────────────────────────────┤
│ ... repeated for each layer ...                            │
└────────────────────────────────────────────────────────────┘
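
For illustration, each arena above can be carved from the same bump cursor. A hypothetical fragment (the field names d_logits_offset, layers[], fc1_input_copy_offset, etc. are assumptions, not the actual struct members):

size_t T = M->context_window, D = M->aligned_embed_dim;
M->d_logits_offset = bump(offset, T * M->vocab_size, CACHE_ALIGN);   /* global dACTS */
for (int l = 0; l < M->num_layers; l++) {
    M->layers[l].fc1_input_copy_offset = bump(offset, T * D, CACHE_ALIGN);     /* ACTS  */
    M->layers[l].d_mlp_output_offset   = bump(offset, T * D, CACHE_ALIGN);     /* dACTS */
    M->layers[l].d_fc1_weight_offset   = bump(offset, D * 4 * D, CACHE_ALIGN); /* GRADS */
    /* ... one bump() per tensor shown in the diagram ... */
}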

◆ layout_transformer()

void layout_transformer ( TransformerModel *  M,
bool  training_mode 
)

Plans and allocates a single contiguous memory block for the entire Transformer model.

Parameters
MPointer to the TransformerModel struct to be populated.

This function orchestrates the memory layout for the model, aligned with HPC best practices:

  • Uses a single allocation to minimize OS overhead and TLB misses.
  • Aligns blocks to cache lines (CACHE_ALIGN) to improve memory performance.
  • Supports memory-mapped file layout for fast startup.
  • Inserts debug-friendly CANARY markers to detect buffer overflows.

Memory Layout Overview:

Let:

  • D = aligned embedding dimension
  • H = aligned head dimension
  • T = context window
  • V = vocabulary size
┌────────────────────────────────────────────────────────────┐
│ Token & Positional Embeddings Tables                       │ ← Shared [V * D, T * D]
├────────────────────────────────────────────────────────────┤
│ Layer 0                                                    │
│ ┌────────────────────────────────────────────────────────┐ │
│ │ START CANARY                                           │ │
│ ├────────────────────────────────────────────────────────┤ │
│ │ LN1 Weights & Biases                                   │ ← [~D]
│ ├────────────────────────────────────────────────────────┤ │
│ │ Attention Weights (Wq, Wk, Wv, W_proj)                 │ ← [~D * D]
│ ├────────────────────────────────────────────────────────┤ │
│ │ QKV Activation Buffers                                 │ ← [T * num_heads * H]
│ ├────────────────────────────────────────────────────────┤ │
│ │ LN2 Weights & Biases                                   │ ← [~D]
│ ├────────────────────────────────────────────────────────┤ │
│ │ MLP Weights (W_fc1, W_fc2)                             │ ← [~D * 4D, 4D * D]
│ ├────────────────────────────────────────────────────────┤ │
│ │ MLP Activations & Residual Buffers                     │ ← [T * 4D, T * D]
│ ├────────────────────────────────────────────────────────┤ │
│ │ END CANARY                                             │ │
│ └────────────────────────────────────────────────────────┘ │
├────────────────────────────────────────────────────────────┤
│ ... repeated for each layer ...                            │
├────────────────────────────────────────────────────────────┤
│ Final LayerNorm + Final CANARY                             │
└────────────────────────────────────────────────────────────┘

This memory map ensures high locality for token-wise, head-wise, and GEMM-parallel computations.
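
A hedged usage sketch, written as it might appear inside main, built from the functions documented above (the weight-file name and prompt token ids are placeholders):

/* Hypothetical driver: plan, allocate, load, run, free. */
TransformerModel M = {0};
if (read_model_metadata(&M, "model.weights") != 0) return 1;  /* fills dims     */
layout_transformer(&M, /*training_mode=*/false);              /* one contiguous block */
if (load_model_weights(&M, "model.weights") != 0) return 1;   /* loads weights  */
int prompt[] = { 15496, 995 };                                /* example ids    */
generate(&M, prompt, 2, /*max_tokens=*/32);
destroy_transformer(&M);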