C-Transformer
Cache-Optimized Transformers in C (x86)
main.c File Reference

CPU-Optimized Large Language Model Runtime (x86-64) More...

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <errno.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>
#include <getopt.h>
#include <immintrin.h>
#include <math.h>
#include <time.h>
#include <sys/time.h>
#include <omp.h>
#include <stdbool.h>
#include <assert.h>

Classes

struct  TrulyOptimalLayer
 
struct  LayerGradients
 
struct  GradientStorage
 
struct  TransformerModel
 Main transformer model structure with unified memory layout. More...
 

Macros

#define _GNU_SOURCE
 
#define ALIGN_UP(n, a)   (((n) + (a) - 1) & ~((a) - 1))
 
#define min(a, b)   ((a) < (b) ? (a) : (b))
 
#define CACHE_ALIGN   64ULL
 
#define HUGE_ALIGN   (2ULL * 1024 * 1024) /* 2 MB huge page */
 
#define CANARY_SIZE_FLOATS   16
 
#define FINAL_CANARY_ZONE_FLOATS   1024
 
#define Q_ACCESS(q_ptr, h, t, d, context_window, aligned_head_dim)    q_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]
 
#define K_ACCESS(k_ptr, h, t, d, context_window, aligned_head_dim)    k_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]
 
#define V_ACCESS(v_ptr, h, t, d, context_window, aligned_head_dim)    v_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]
 
#define ATTN_ACCESS(attn_ptr, head_idx, query_token, key_token, context_window)    attn_ptr[((head_idx) * (context_window) + (query_token)) * (context_window) + (key_token)]
 
#define ATTN_SCORES_ACCESS(scores_ptr, h, i, j, aligned_context_window)    scores_ptr[((h) * (aligned_context_window) * (aligned_context_window)) + ((i) * (aligned_context_window)) + (j)]
 
#define READ_ALIGNED_TENSOR(ptr, expected_floats, name)
 

Functions

static size_t align_up (size_t n, size_t a)
 
static double get_time_sec ()
 
static void * huge_alloc (size_t bytes)
 Allocate memory using 2MB hugepages (with fallback to THP)
 
static size_t bump (size_t *off, size_t count, size_t alignB)
 Bump allocator for sequential memory layout.
 
void layout_gradients (TransformerModel *M, size_t *offset)
 Lays out the memory for the backward pass.
 
void layout_transformer (TransformerModel *M, bool training_mode)
 Plans and allocates a single contiguous memory block for the entire Transformer model.
 
void destroy_transformer (TransformerModel *M)
 
static size_t bytes_needed (int layers, int vocab, int d_model, int ctx)
 
float compute_max_diff (const float *ref, const float *test, size_t count)
 
float compute_rmse (const float *ref, const float *test, size_t count)
 
void gemm_naive_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 Naive parallel GEMM implementation (reference baseline)
 
void gemm_avx512_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 AVX-512 optimized GEMM with vectorized inner loops.
 
void gemm_fine_grained_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 Cache-blocked GEMM with fine-grained parallelism.
 
void gemm_blocked_serial (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)
 
void layernorm_naive_serial (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_forward_rolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps)
 
void layernorm_forward_unrolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)
 
void layernorm_token_parallel (TransformerModel *M, size_t input_offset, size_t weight_offset, size_t bias_offset, size_t mean_cache_offset, size_t rstd_cache_offset, size_t output_offset, float eps)
 Token-parallel Layer Normalization with AVX-512 optimization.
 
void layernorm_naive_serial_matched_precision (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, float eps)
 
static void * aligned_alloc_64 (size_t size)
 
void debug_math_comparison (TransformerModel *M)
 
void run_layernorm_benchmark_precision_matched (TransformerModel *M)
 
void run_layernorm_benchmark_performance (TransformerModel *M)
 
static void qkv_micro_kernel_blocked_4x16_polished (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, float *__restrict Q_output_4, float *__restrict K_output_4, float *__restrict V_output_4, int embed_dim)
 
static void qkv_token_kernel_4x16_blocked_polished (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, float *__restrict Q_output, float *__restrict K_output, float *__restrict V_output, int embed_dim)
 
void qkv_projection (TransformerModel *M, size_t layer_idx)
 
static void qkv_micro_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx, int output_start_dim)
 
static void qkv_token_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx)
 
void qkv_projection_head_major (TransformerModel *M, int layer_idx)
 
double compare_arrays (const float *a, const float *b, size_t size, const char *name)
 
void convert_token_major_to_head_major_layer (const float *token_major_base, float *head_major_base, TransformerModel *M)
 
void benchmark_qkv_dual_comparison (TransformerModel *M)
 
void compute_attention_scores_head_major (TransformerModel *M, int layer_idx)
 
void apply_causal_softmax_head_major (TransformerModel *M, int layer_idx)
 
void compute_attention_output_head_major (TransformerModel *M, int layer_idx)
 
void attention_head_major_complete (TransformerModel *M, int layer_idx)
 Complete multi-head attention with head-major layout (self-attention)
 
void test_attention_head_major_after_qkv (TransformerModel *M)
 
void attention_projection_with_concat (TransformerModel *M, int layer_idx)
 Production attention projection with concat: Head-major → Token-major → GEMM.
 
void benchmark_attention_projection_complete (TransformerModel *M)
 
void add_gpt2_token_and_positional_embeddings (TransformerModel *M, size_t token_ids_offset, size_t output_offset)
 
void residual_add_token_parallel (TransformerModel *M, size_t input_offset, size_t residual_offset, size_t output_offset)
 
void gelu_activation_token_parallel (TransformerModel *M, size_t data_offset)
 
void mlp_token_parallel (TransformerModel *M, size_t input_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t fc1_output_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t output_offset)
 
void embed_tokens (TransformerModel *M, int32_t *token_ids, int num_tokens)
 
void compute_logits_last_token_optimized (TransformerModel *M, int position)
 
void transformer_layer_forward (TransformerModel *M, int layer_idx, size_t layer_input_offset)
 
void run_comprehensive_benchmark (TransformerModel *M)
 
int load_model_weights (TransformerModel *M, const char *weight_file)
 Load weights into already-allocated TransformerModel.
 
int read_model_metadata (TransformerModel *M, const char *weight_file)
 Read model metadata from weight file header.
 
int sample_token (float *logits, int vocab_size, float temperature)
 
void generate (TransformerModel *M, int *prompt, int prompt_len, int max_tokens)
 
void zero_gradients (TransformerModel *M)
 
void cache_forward_activations (TransformerModel *M)
 Copy forward-pass activations to gradient storage for the backward pass. This preserves the forward computations needed for gradient calculation.
 
void backward_residual_connection (TransformerModel *M, size_t d_output_offset, size_t d_input_offset, size_t d_transform_offset)
 
void backward_embedding_layer (TransformerModel *M)
 
void backward_final_layernorm (TransformerModel *M)
 
void backward_fc2 (TransformerModel *M, size_t d_output_offset, size_t fc2_input_copy_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_gelu (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset)
 
void backward_fc1 (TransformerModel *M, size_t d_output_offset, size_t fc1_input_copy_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_gelu_fast (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset)
 
void backward_layernorm (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t gamma_copy_offset, size_t beta_copy_offset, size_t mean_copy_offset, size_t rstd_copy_offset, size_t d_input_offset, size_t d_gamma_offset, size_t d_beta_offset)
 
void add_gradient (TransformerModel *M, size_t source_offset, size_t dest_offset)
 
void backward_attention_projection (TransformerModel *M, size_t d_output_offset, size_t attention_output_copy_offset, size_t proj_weight_offset, size_t proj_bias_offset, size_t d_attention_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_attention_weighted_values (TransformerModel *M, size_t d_output_offset, size_t attention_weights_offset, size_t v_output_offset, size_t d_weights_offset, size_t d_v_offset)
 
void backward_causal_softmax (TransformerModel *M, size_t d_scores_offset, size_t weights_copy_offset, size_t scores_copy_offset)
 
void backward_qk_matmul (TransformerModel *M, size_t d_scores_offset, size_t q_copy_offset, size_t k_copy_offset, size_t d_q_offset, size_t d_k_offset)
 
void backward_linear (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t weight_offset, size_t bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)
 
void backward_lm_head (TransformerModel *M)
 
void backward_transformer_layer (TransformerModel *M, int layer_idx)
 
void compute_cross_entropy_loss (TransformerModel *M, int32_t *target_tokens, float *loss_out)
 Compute cross-entropy loss and gradients w.r.t. the logits.
 
void training_step (TransformerModel *M, int32_t *input_tokens, int32_t *target_tokens, float learning_rate)
 
void update_all_weights_sgd (TransformerModel *M, float learning_rate)
 
int main (int argc, char **argv)
 

Detailed Description

CPU-Optimized Large Language Model Runtime (x86-64)

Author
ANTSHIV ROBOTICS
Version
1.0
Date
2025

Macro Definition Documentation

◆ _GNU_SOURCE

#define _GNU_SOURCE

◆ ALIGN_UP

#define ALIGN_UP (   n,
  a 
)    (((n) + (a) - 1) & ~((a) - 1))
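
For example, ALIGN_UP(100, 64) yields 128 and ALIGN_UP(64, 64) stays 64. The bit-mask form requires a to be a power of two, which holds for both CACHE_ALIGN (64) and HUGE_ALIGN (2 MB).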

◆ ATTN_ACCESS

#define ATTN_ACCESS (   attn_ptr,
  head_idx,
  query_token,
  key_token,
  context_window 
)     attn_ptr[((head_idx) * (context_window) + (query_token)) * (context_window) + (key_token)]

◆ CACHE_ALIGN

#define CACHE_ALIGN   64ULL

◆ CANARY_SIZE_FLOATS

#define CANARY_SIZE_FLOATS   16

◆ FINAL_CANARY_ZONE_FLOATS

#define FINAL_CANARY_ZONE_FLOATS   1024

◆ HUGE_ALIGN

#define HUGE_ALIGN   (2ULL * 1024 * 1024) /* 2 MB huge page */

◆ K_ACCESS

#define K_ACCESS (   k_ptr,
  h,
  t,
  d,
  context_window,
  aligned_head_dim 
)     k_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]

◆ min

#define min (   a,
  b 
)    ((a) < (b) ? (a) : (b))

◆ Q_ACCESS

#define Q_ACCESS (   q_ptr,
  h,
  t,
  d,
  context_window,
  aligned_head_dim 
)     q_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]

◆ READ_ALIGNED_TENSOR

#define READ_ALIGNED_TENSOR (   ptr,
  expected_floats,
  name 
)
Value:
do { \
    size_t bytes_to_read = (expected_floats) * sizeof(float); \
    size_t bytes_read = fread(ptr, 1, bytes_to_read, fp); \
    if (bytes_read != bytes_to_read) { \
        fprintf(stderr, "❌ Failed to read %s: expected %zu bytes, got %zu\n", \
                name, bytes_to_read, bytes_read); \
        fclose(fp); \
        return -1; \
    } \
} while(0)
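
The macro assumes an open FILE *fp in scope and an enclosing function that returns int. A hypothetical call site for illustration (the file name, tensor size, and label below are assumptions, not the project's actual loader):

/* Illustrative only: reads one 768x768 weight matrix into dst.
 * On a short read the macro itself does fclose(fp) and returns -1. */
static int load_one_tensor(float *dst) {
    FILE *fp = fopen("model.weights", "rb");
    if (!fp) return -1;
    READ_ALIGNED_TENSOR(dst, 768 * 768, "W_q");
    fclose(fp);
    return 0;
}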

◆ V_ACCESS

#define V_ACCESS (   v_ptr,
  h,
  t,
  d,
  context_window,
  aligned_head_dim 
)     v_ptr[((h) * (context_window) + (t)) * (aligned_head_dim) + (d)]
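
Taken together, Q_ACCESS, K_ACCESS, V_ACCESS, and ATTN_ACCESS imply a head-major layout, [head][token][aligned_head_dim], with the head dimension padded for SIMD. A worked example with hypothetical sizes:

/* Assumed shape: context_window = 1024, aligned_head_dim = 64. */
float x = Q_ACCESS(q_ptr, 2, 5, 7, 1024, 64);
/* Expands to q_ptr[(2*1024 + 5)*64 + 7] = q_ptr[131399]:
 * all tokens of head 2 are contiguous, so per-head attention
 * (Q·K^T over one head) streams memory sequentially. */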

Function Documentation

◆ align_up()

static size_t align_up ( size_t  n,
size_t  a 
)
inline static

◆ bump()

static size_t bump ( size_t *  off,
size_t  count,
size_t  alignB 
)
inline static

Bump allocator for sequential memory layout.

Core primitive for building the contiguous memory layout. This function:

  1. Aligns the current offset to the requested boundary
  2. Returns the aligned offset for tensor placement
  3. Advances the offset by the tensor size
Parameters
off      Pointer to current offset cursor (in float elements)
count    Number of float elements to allocate
alignB   Alignment requirement in bytes (e.g., 64 for cache lines)
Returns
Aligned offset where tensor should be placed

Why Bump Allocation?

  • Zero Fragmentation: Sequential layout, no holes
  • Predictable Addresses: Enables memory-mapped file loading
  • Cache Locality: Related tensors are spatially close
  • Dry-Run Mode: Can compute total size before allocation

Example Usage:

size_t offset = 0;
size_t q_weight_off = bump(&offset, 768*768, CACHE_ALIGN); // Aligned to 64B
size_t k_weight_off = bump(&offset, 768*768, CACHE_ALIGN); // Sequential
size_t v_weight_off = bump(&offset, 768*768, CACHE_ALIGN);
// offset now contains total size needed

Alignment Rationale:

  • 64-byte alignment matches cache line size
  • Enables use of aligned SIMD loads (_mm512_load_ps vs _mm512_loadu_ps)
  • Prevents false sharing (each tensor starts on new cache line)
Note
This function does NOT allocate memory, only tracks offsets
See also
layout_transformer Full memory layout using bump allocation
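
A minimal sketch of a bump allocator matching the three steps above (offsets in float elements, alignment in bytes); the project's actual definition may differ:

static size_t bump(size_t *off, size_t count, size_t alignB) {
    size_t alignF = alignB / sizeof(float);               /* bytes -> floats   */
    size_t aligned = (*off + alignF - 1) & ~(alignF - 1); /* 1. align cursor   */
    *off = aligned + count;                               /* 3. advance cursor */
    return aligned;                                       /* 2. placement      */
}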

◆ bytes_needed()

static size_t bytes_needed ( int  layers,
int  vocab,
int  d_model,
int  ctx 
)
static

◆ compute_max_diff()

float compute_max_diff ( const float *  ref,
const float *  test,
size_t  count 
)

◆ compute_rmse()

float compute_rmse ( const float *  ref,
const float *  test,
size_t  count 
)

◆ destroy_transformer()

void destroy_transformer ( TransformerModel *  M)

◆ get_time_sec()

static double get_time_sec ( )
inline static

◆ huge_alloc()

static void * huge_alloc ( size_t  bytes)
static

Allocate memory using 2MB hugepages (with fallback to THP)

Attempts to allocate memory backed by explicit 2MB hugepages via mmap. If that fails (e.g., insufficient hugepages configured), falls back to aligned_alloc + madvise(MADV_HUGEPAGE) for transparent hugepage (THP) support.

Parameters
bytes    Number of bytes to allocate
Returns
Pointer to allocated memory (2MB-aligned)

Why Hugepages Matter:

  • TLB Efficiency: 2MB pages reduce TLB entries by 512x vs 4KB pages
  • Page Fault Reduction: Fewer page faults during model initialization
  • Memory Bandwidth: Better DRAM page locality
  • Latency: Reduced virtual-to-physical address translation overhead

Performance Impact: A 4GB model spans ~1M 4KB pages but only ~2K 2MB hugepages:

  • TLB misses: ~1M potential misses without hugepages
  • TLB misses: ~2K potential misses with 2MB hugepages
  • Result: ~512x reduction in TLB pressure (typically a ~5-10% speedup)

System Configuration (Linux):

# Check available hugepages
cat /proc/meminfo | grep Huge
# Allocate 2048 × 2MB = 4GB of hugepages
echo 2048 | sudo tee /proc/sys/vm/nr_hugepages
# Enable THP (fallback)
echo madvise | sudo tee /sys/kernel/mm/transparent_hugepage/enabled

Allocation Strategy:

  1. Try explicit hugepages via mmap + MAP_HUGETLB (best performance)
  2. Fall back to aligned_alloc + MADV_HUGEPAGE (THP, still good)
  3. Kernel promotes pages to 2MB when possible
Warning
Requires root or CAP_SYS_RESOURCE for explicit hugepages
Note
Falls back gracefully to THP if explicit allocation fails
See also
https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
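
A sketch of the two-tier strategy described above (Linux-specific; the real function may handle rounding and error reporting differently, hence the _sketch suffix):

#include <stdlib.h>
#include <sys/mman.h>

static void *huge_alloc_sketch(size_t bytes) {
    /* Round the request up to a whole number of 2 MB pages. */
    size_t len = (bytes + HUGE_ALIGN - 1) & ~(HUGE_ALIGN - 1);

    /* 1. Explicit hugepages: fails fast if none are reserved. */
    void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (p != MAP_FAILED)
        return p;

    /* 2. THP fallback: 2 MB-aligned allocation + advisory promotion. */
    p = aligned_alloc(HUGE_ALIGN, len);
    if (p)
        madvise(p, len, MADV_HUGEPAGE);
    return p;
}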

◆ layout_gradients()

void layout_gradients ( TransformerModel *  M,
size_t *  offset 
)

Lays out the memory for the backward pass.

Parameters
M        Pointer to the TransformerModel struct.
offset   Pointer to the current memory offset, which will be updated.

This function allocates a dedicated, contiguous memory arena for all data required during backpropagation. This includes:

  • GRADS: Buffers to accumulate gradients for every weight and bias.
  • ACTS: Cached copies of activations from the forward pass needed by the backward pass.
  • dACTS: Buffers to hold the gradients of activations as they flow backward.
┌────────────────────────────────────────────────────────────┐
│ Global GRADS (Embeddings, Final LN)                        │
├────────────────────────────────────────────────────────────┤
│ Global ACTS (Logits, Final LN inputs...)                   │
├────────────────────────────────────────────────────────────┤
│ Global dACTS (dLogits, dFinal_output...)                   │
├────────────────────────────────────────────────────────────┤
│ Per-Layer Arena (Layer 0)                                  │
│ ┌────────────────────────────────────────────────────────┐ │
│ │ Layer 0 ACTS (ln_mean, attn_probs, fc1_preact...)      │ │
│ ├────────────────────────────────────────────────────────┤ │
│ │ Layer 0 dACTS (dL/d_mlp_output, dL/d_attn_output...)   │ │
│ ├────────────────────────────────────────────────────────┤ │
│ │ Layer 0 GRADS (dL/dW_fc1, dL/dW_q...)                  │ │
│ └────────────────────────────────────────────────────────┘ │
├────────────────────────────────────────────────────────────┤
│ ... repeated for each layer ...                            │
└────────────────────────────────────────────────────────────┘
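
For illustration, each arena above can be carved from the same bump cursor. A hypothetical fragment (the field names d_logits_offset, layers[], fc1_input_copy_offset, etc. are assumptions, not the actual struct members):

size_t T = M->context_window, D = M->aligned_embed_dim;
M->d_logits_offset = bump(offset, T * M->vocab_size, CACHE_ALIGN);   /* global dACTS */
for (int l = 0; l < M->num_layers; l++) {
    M->layers[l].fc1_input_copy_offset = bump(offset, T * D, CACHE_ALIGN);     /* ACTS  */
    M->layers[l].d_mlp_output_offset   = bump(offset, T * D, CACHE_ALIGN);     /* dACTS */
    M->layers[l].d_fc1_weight_offset   = bump(offset, D * 4 * D, CACHE_ALIGN); /* GRADS */
    /* ... one bump() per tensor shown in the diagram ... */
}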

◆ layout_transformer()

void layout_transformer ( TransformerModel *  M,
bool  training_mode 
)

Plans and allocates a single contiguous memory block for the entire Transformer model.

Parameters
MPointer to the TransformerModel struct to be populated.

This function orchestrates the memory layout for the model, aligned with HPC best practices:

  • Uses a single allocation to minimize OS overhead and TLB misses.
  • Aligns blocks to cache lines (CACHE_ALIGN) to improve memory performance.
  • Supports memory-mapped file layout for fast startup.
  • Inserts debug-friendly CANARY markers to detect buffer overflows.

Memory Layout Overview:

Let:

  • D = aligned embedding dimension
  • H = aligned head dimension
  • T = context window
  • V = vocabulary size
┌────────────────────────────────────────────────────────────┐
│ Token & Positional Embeddings Tables                       │ ← Shared [V * D, T * D]
├────────────────────────────────────────────────────────────┤
│ Layer 0                                                    │
│ ┌────────────────────────────────────────────────────────┐ │
│ │ START CANARY                                           │ │
│ ├────────────────────────────────────────────────────────┤ │
│ │ LN1 Weights & Biases                                   │ ← [~D]
│ ├────────────────────────────────────────────────────────┤ │
│ │ Attention Weights (Wq, Wk, Wv, W_proj)                 │ ← [~D * D]
│ ├────────────────────────────────────────────────────────┤ │
│ │ QKV Activation Buffers                                 │ ← [T * num_heads * H]
│ ├────────────────────────────────────────────────────────┤ │
│ │ LN2 Weights & Biases                                   │ ← [~D]
│ ├────────────────────────────────────────────────────────┤ │
│ │ MLP Weights (W_fc1, W_fc2)                             │ ← [~D * 4D, 4D * D]
│ ├────────────────────────────────────────────────────────┤ │
│ │ MLP Activations & Residual Buffers                     │ ← [T * 4D, T * D]
│ ├────────────────────────────────────────────────────────┤ │
│ │ END CANARY                                             │ │
│ └────────────────────────────────────────────────────────┘ │
├────────────────────────────────────────────────────────────┤
│ ... repeated for each layer ...                            │
├────────────────────────────────────────────────────────────┤
│ Final LayerNorm + Final CANARY                             │
└────────────────────────────────────────────────────────────┘

This memory map ensures high locality for token-wise, head-wise, and GEMM-parallel computations.
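
A hedged usage sketch, written as it might appear inside main, built from the functions documented above (the weight-file name and prompt token ids are placeholders):

/* Hypothetical driver: plan, allocate, load, run, free. */
TransformerModel M = {0};
if (read_model_metadata(&M, "model.weights") != 0) return 1;  /* fills dims     */
layout_transformer(&M, /*training_mode=*/false);              /* one contiguous block */
if (load_model_weights(&M, "model.weights") != 0) return 1;   /* loads weights  */
int prompt[] = { 15496, 995 };                                /* example ids    */
generate(&M, prompt, 2, /*max_tokens=*/32);
destroy_transformer(&M);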