- `static size_t align_up (size_t n, size_t a)`
- `static double get_time_sec ()`
- `static void * huge_alloc (size_t bytes)`: Allocate memory using 2 MB hugepages, with fallback to THP; see the sketch below.
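For readers unfamiliar with explicit hugepages, the usual pattern is: try an explicit `MAP_HUGETLB` mapping first, and if no hugepages are reserved, fall back to a normal anonymous mapping plus `madvise(MADV_HUGEPAGE)` so the kernel can promote it via transparent hugepages (THP). The following is a minimal sketch of that pattern, not the file's actual implementation; `huge_alloc_sketch` and its rounding constant are illustrative.

```c
#define _GNU_SOURCE
#include <stddef.h>
#include <sys/mman.h>

#define HUGE_2MB (2UL * 1024 * 1024)

static void *huge_alloc_sketch(size_t bytes)
{
    /* Round the request up to a whole number of 2 MB pages. */
    size_t sz = (bytes + HUGE_2MB - 1) & ~(HUGE_2MB - 1);

    /* First try an explicit 2 MB hugepage mapping. */
    void *p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
    if (p != MAP_FAILED)
        return p;

    /* Fallback: normal mapping, then ask the kernel for transparent hugepages. */
    p = mmap(NULL, sz, PROT_READ | PROT_WRITE,
             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED)
        return NULL;
    madvise(p, sz, MADV_HUGEPAGE);
    return p;
}
```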
- `static size_t bump (size_t *off, size_t count, size_t alignB)`: Bump allocator for sequential memory layout; see the sketch below.
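A bump allocator carves sub-allocations out of one pre-planned arena by aligning a running offset and advancing it; nothing is freed individually. A minimal sketch, assuming `count` is a byte count and `alignB` a power-of-two alignment (both assumptions about the real signature):

```c
#include <stddef.h>

/* Round n up to a multiple of a (a must be a power of two). */
static size_t align_up_sketch(size_t n, size_t a)
{
    return (n + a - 1) & ~(a - 1);
}

/* Reserve `count` bytes at alignment `alignB` inside one big arena.
 * Returns the offset of the reservation and advances *off past it. */
static size_t bump_sketch(size_t *off, size_t count, size_t alignB)
{
    size_t start = align_up_sketch(*off, alignB);
    *off = start + count;
    return start;
}
```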
- `void layout_gradients (TransformerModel *M, size_t *offset)`: Lays out the memory for the backward pass.
- `void layout_transformer (TransformerModel *M, bool training_mode)`: Plans and allocates a single contiguous memory block for the entire Transformer model; see the sketch below.
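The single-block layout typically works in two passes: walk every tensor once with the bump allocator to record its offset and learn the total footprint, then back the whole plan with one hugepage allocation. The struct and field names below are hypothetical; only the pattern is intended to match `layout_transformer`:

```c
#include <stddef.h>

/* Defined in the sketches above. */
static size_t bump_sketch(size_t *off, size_t count, size_t alignB);
static void  *huge_alloc_sketch(size_t bytes);

/* Hypothetical model fields; offsets are counted in floats here. */
typedef struct {
    float *base;          /* single contiguous arena                  */
    size_t total_floats;  /* arena size, in floats                    */
    size_t wte_offset;    /* token embedding table                    */
    size_t wpe_offset;    /* positional embedding table               */
    /* ... offsets for every layer's weights and activations ...      */
} ModelSketch;

static void layout_sketch(ModelSketch *m, int vocab, int ctx, int d_model)
{
    size_t off = 0;

    /* Pass 1: record offsets with the bump allocator (16 floats = 64 bytes). */
    m->wte_offset = bump_sketch(&off, (size_t)vocab * d_model, 16);
    m->wpe_offset = bump_sketch(&off, (size_t)ctx * d_model, 16);
    /* ... every other tensor is reserved the same way ...             */
    m->total_floats = off;

    /* Pass 2: one allocation backs everything; offsets index into it. */
    m->base = huge_alloc_sketch(m->total_floats * sizeof(float));
}
```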
- `void destroy_transformer (TransformerModel *M)`
- `static size_t bytes_needed (int layers, int vocab, int d_model, int ctx)`
- `float compute_max_diff (const float *ref, const float *test, size_t count)`
- `float compute_rmse (const float *ref, const float *test, size_t count)`
- `void gemm_naive_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)`: Naive parallel GEMM implementation (reference baseline); see the sketch below.
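As a baseline this is almost certainly the textbook triple loop with the row loop parallelized under OpenMP; a sketch, assuming row-major `A` (M x K), `B` (K x N), and `C` (M x N):

```c
#include <stddef.h>

/* Reference GEMM: C = A * B + bias (bias broadcast across rows). */
void gemm_naive_parallel_sketch(const float *A, const float *B,
                                const float *bias, float *C,
                                int M, int N, int K)
{
    #pragma omp parallel for
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n++) {
            float acc = bias ? bias[n] : 0.0f;
            for (int k = 0; k < K; k++)
                acc += A[(size_t)m * K + k] * B[(size_t)k * N + n];
            C[(size_t)m * N + n] = acc;
        }
    }
}
```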
- `void gemm_avx512_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)`: AVX-512 optimized GEMM with vectorized inner loops; see the sketch below.
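The AVX-512 variant keeps the same loop nest but produces 16 output columns at a time: broadcast one element of `A`, load 16 contiguous elements of `B`, and accumulate with fused multiply-adds. A sketch under the same layout assumptions, additionally assuming `N` is a multiple of 16:

```c
#include <immintrin.h>
#include <stddef.h>

void gemm_avx512_sketch(const float *A, const float *B, const float *bias,
                        float *C, int M, int N, int K)
{
    #pragma omp parallel for
    for (int m = 0; m < M; m++) {
        for (int n = 0; n < N; n += 16) {
            __m512 acc = bias ? _mm512_loadu_ps(&bias[n]) : _mm512_setzero_ps();
            for (int k = 0; k < K; k++) {
                __m512 a = _mm512_set1_ps(A[(size_t)m * K + k]); /* broadcast A[m][k] */
                __m512 b = _mm512_loadu_ps(&B[(size_t)k * N + n]);
                acc = _mm512_fmadd_ps(a, b, acc);
            }
            _mm512_storeu_ps(&C[(size_t)m * N + n], acc);
        }
    }
}
```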
- `void gemm_fine_grained_parallel (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)`: Cache-blocked GEMM with fine-grained parallelism; see the sketch below.
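Cache blocking reorders the loops so that a tile of `A` and a tile of `B` stay resident in cache for the whole k-sweep, and the (row, column) tiles become the unit of parallel work. The tile sizes below are placeholders rather than the kernel's real blocking factors:

```c
#include <stddef.h>

#define MB 64   /* row tile    */
#define NB 64   /* column tile */
#define KB 256  /* depth tile  */

void gemm_blocked_sketch(const float *A, const float *B, const float *bias,
                         float *C, int M, int N, int K)
{
    /* Start every C element at its bias value. */
    #pragma omp parallel for
    for (int m = 0; m < M; m++)
        for (int n = 0; n < N; n++)
            C[(size_t)m * N + n] = bias ? bias[n] : 0.0f;

    /* Each thread owns whole (m0, n0) tiles, so accumulation is race-free. */
    #pragma omp parallel for collapse(2)
    for (int m0 = 0; m0 < M; m0 += MB)
        for (int n0 = 0; n0 < N; n0 += NB)
            for (int k0 = 0; k0 < K; k0 += KB) {
                int mmax = m0 + MB < M ? m0 + MB : M;
                int nmax = n0 + NB < N ? n0 + NB : N;
                int kmax = k0 + KB < K ? k0 + KB : K;
                for (int m = m0; m < mmax; m++)
                    for (int k = k0; k < kmax; k++) {
                        float a = A[(size_t)m * K + k];
                        for (int n = n0; n < nmax; n++)
                            C[(size_t)m * N + n] += a * B[(size_t)k * N + n];
                    }
            }
}
```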
- `void gemm_blocked_serial (const float *A, const float *B, const float *bias, float *C, int M, int N, int K)`
- `void layernorm_naive_serial (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, int aligned_embed_dim, float eps)`
- `void layernorm_forward_rolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, int aligned_embed_dim, float eps)`
- `void layernorm_forward_unrolled_slice (const float *__restrict input_slice_base, const float *__restrict gamma, const float *__restrict beta, float *__restrict output_slice_base, float *__restrict mean_cache_slice, float *__restrict rstd_cache_slice, int num_tokens_in_slice, int d_model, float eps)`
- `void layernorm_token_parallel (TransformerModel *M, size_t input_offset, size_t weight_offset, size_t bias_offset, size_t mean_cache_offset, size_t rstd_cache_offset, size_t output_offset, float eps)`: Token-parallel Layer Normalization with AVX-512 optimization; see the sketch below.
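LayerNorm parallelizes naturally over tokens: each row gets an independent mean, variance, and scale/shift pass, with the per-token mean and reciprocal standard deviation cached for the backward pass. The real function addresses tensors by offsets into the model's memory block; the plain-pointer signature below is a simplification, and it assumes `d_model` is a multiple of 16:

```c
#include <immintrin.h>
#include <math.h>
#include <stddef.h>

void layernorm_token_parallel_sketch(const float *input, const float *gamma,
                                     const float *beta, float *output,
                                     float *mean_cache, float *rstd_cache,
                                     int tokens, int d_model,
                                     int aligned_embed_dim, float eps)
{
    #pragma omp parallel for
    for (int t = 0; t < tokens; t++) {
        const float *x = input  + (size_t)t * aligned_embed_dim;
        float       *y = output + (size_t)t * aligned_embed_dim;

        /* Vectorized mean. */
        __m512 vsum = _mm512_setzero_ps();
        for (int i = 0; i < d_model; i += 16)
            vsum = _mm512_add_ps(vsum, _mm512_loadu_ps(&x[i]));
        float mean = _mm512_reduce_add_ps(vsum) / d_model;

        /* Vectorized variance. */
        __m512 vmean = _mm512_set1_ps(mean), vvar = _mm512_setzero_ps();
        for (int i = 0; i < d_model; i += 16) {
            __m512 d = _mm512_sub_ps(_mm512_loadu_ps(&x[i]), vmean);
            vvar = _mm512_fmadd_ps(d, d, vvar);
        }
        float rstd = 1.0f / sqrtf(_mm512_reduce_add_ps(vvar) / d_model + eps);

        mean_cache[t] = mean;    /* cached for the backward pass */
        rstd_cache[t] = rstd;

        /* Normalize, scale, shift. */
        __m512 vrstd = _mm512_set1_ps(rstd);
        for (int i = 0; i < d_model; i += 16) {
            __m512 d = _mm512_sub_ps(_mm512_loadu_ps(&x[i]), vmean);
            __m512 n = _mm512_mul_ps(d, vrstd);
            _mm512_storeu_ps(&y[i], _mm512_fmadd_ps(n, _mm512_loadu_ps(&gamma[i]),
                                                    _mm512_loadu_ps(&beta[i])));
        }
    }
}
```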
- `void layernorm_naive_serial_matched_precision (const float *input, const float *gamma, const float *beta, float *output, float *mean_cache, float *rstd_cache, int tokens, int d_model, float eps)`
- `static void * aligned_alloc_64 (size_t size)`
- `void debug_math_comparison (TransformerModel *M)`
- `void run_layernorm_benchmark_precision_matched (TransformerModel *M)`
- `void run_layernorm_benchmark_performance (TransformerModel *M)`
- `static void qkv_micro_kernel_blocked_4x16_polished (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, float *__restrict Q_output_4, float *__restrict K_output_4, float *__restrict V_output_4, int embed_dim)`
- `static void qkv_token_kernel_4x16_blocked_polished (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, float *__restrict Q_output, float *__restrict K_output, float *__restrict V_output, int embed_dim)`
- `void qkv_projection (TransformerModel *M, size_t layer_idx)`
- `static void qkv_micro_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights_block, const float *__restrict K_weights_block, const float *__restrict V_weights_block, const float *__restrict Q_bias_4, const float *__restrict K_bias_4, const float *__restrict V_bias_4, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx, int output_start_dim)`
- `static void qkv_token_kernel_head_major_4x16 (const float *__restrict input_token, const float *__restrict Q_weights, const float *__restrict K_weights, const float *__restrict V_weights, const float *__restrict Q_bias, const float *__restrict K_bias, const float *__restrict V_bias, TransformerModel *M, float *__restrict q_output_base, float *__restrict k_output_base, float *__restrict v_output_base, int embed_dim, int token_idx)`
- `void qkv_projection_head_major (TransformerModel *M, int layer_idx)`
- `double compare_arrays (const float *a, const float *b, size_t size, const char *name)`
- `void convert_token_major_to_head_major_layer (const float *token_major_base, float *head_major_base, TransformerModel *M)`
- `void benchmark_qkv_dual_comparison (TransformerModel *M)`
- `void compute_attention_scores_head_major (TransformerModel *M, int layer_idx)`
- `void apply_causal_softmax_head_major (TransformerModel *M, int layer_idx)`
- `void compute_attention_output_head_major (TransformerModel *M, int layer_idx)`
- `void attention_head_major_complete (TransformerModel *M, int layer_idx)`: Complete multi-head attention with head-major layout (self-attention); see the sketch below.
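"Head-major" here means Q, K, and V are stored as `[head][token][head_dim]`, so each head's score computation, causal softmax, and weighted sum of values touch contiguous memory and parallelize cleanly across heads. The scalar sketch below rolls the three head-major stages into one function under that layout assumption; the real kernels are split across `compute_attention_scores_head_major`, `apply_causal_softmax_head_major`, and `compute_attention_output_head_major` and are vectorized:

```c
#include <math.h>
#include <stddef.h>

void attention_head_major_sketch(const float *Q, const float *K, const float *V,
                                 float *scores, float *out,
                                 int n_heads, int tokens, int head_dim)
{
    float scale = 1.0f / sqrtf((float)head_dim);

    #pragma omp parallel for
    for (int h = 0; h < n_heads; h++) {
        const float *Qh = Q + (size_t)h * tokens * head_dim;
        const float *Kh = K + (size_t)h * tokens * head_dim;
        const float *Vh = V + (size_t)h * tokens * head_dim;
        float *Sh = scores + (size_t)h * tokens * tokens;   /* [query][key] */
        float *Oh = out    + (size_t)h * tokens * head_dim;

        for (int q = 0; q < tokens; q++) {
            /* Scaled dot-product scores for keys 0..q (causal mask). */
            float maxv = -INFINITY;
            for (int k = 0; k <= q; k++) {
                float s = 0.0f;
                for (int d = 0; d < head_dim; d++)
                    s += Qh[(size_t)q * head_dim + d] * Kh[(size_t)k * head_dim + d];
                s *= scale;
                Sh[(size_t)q * tokens + k] = s;
                if (s > maxv) maxv = s;
            }
            /* Numerically stable softmax over the unmasked keys. */
            float denom = 0.0f;
            for (int k = 0; k <= q; k++) {
                float e = expf(Sh[(size_t)q * tokens + k] - maxv);
                Sh[(size_t)q * tokens + k] = e;
                denom += e;
            }
            /* Weighted sum of value vectors. */
            for (int d = 0; d < head_dim; d++) {
                float acc = 0.0f;
                for (int k = 0; k <= q; k++)
                    acc += (Sh[(size_t)q * tokens + k] / denom)
                         * Vh[(size_t)k * head_dim + d];
                Oh[(size_t)q * head_dim + d] = acc;
            }
        }
    }
}
```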
- `void test_attention_head_major_after_qkv (TransformerModel *M)`
- `void attention_projection_with_concat (TransformerModel *M, int layer_idx)`: Production attention projection with concat: Head-major → Token-major → GEMM; see the sketch below.
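Before the output projection, the head-major result has to be re-concatenated into token-major rows of width `d_model = n_heads * head_dim`; the projection itself is then an ordinary GEMM. A sketch under the same layout assumptions as above, reusing the naive GEMM sketch for the projection step:

```c
#include <stddef.h>

/* Defined in the GEMM sketch above. */
void gemm_naive_parallel_sketch(const float *A, const float *B, const float *bias,
                                float *C, int M, int N, int K);

void attention_projection_concat_sketch(const float *head_major, float *token_major,
                                        const float *W_proj, const float *b_proj,
                                        float *proj_out,
                                        int n_heads, int tokens, int head_dim)
{
    int d_model = n_heads * head_dim;

    /* Concat: token t's row is [head 0 dims | head 1 dims | ...]. */
    #pragma omp parallel for
    for (int t = 0; t < tokens; t++)
        for (int h = 0; h < n_heads; h++)
            for (int d = 0; d < head_dim; d++)
                token_major[(size_t)t * d_model + h * head_dim + d] =
                    head_major[((size_t)h * tokens + t) * head_dim + d];

    /* Output projection: [tokens x d_model] * [d_model x d_model] + bias. */
    gemm_naive_parallel_sketch(token_major, W_proj, b_proj, proj_out,
                               tokens, d_model, d_model);
}
```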
- `void benchmark_attention_projection_complete (TransformerModel *M)`
- `void add_gpt2_token_and_positional_embeddings (TransformerModel *M, size_t token_ids_offset, size_t output_offset)`
- `void residual_add_token_parallel (TransformerModel *M, size_t input_offset, size_t residual_offset, size_t output_offset)`
- `void gelu_activation_token_parallel (TransformerModel *M, size_t data_offset)`
- `void mlp_token_parallel (TransformerModel *M, size_t input_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t fc1_output_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t output_offset)`
- `void embed_tokens (TransformerModel *M, int32_t *token_ids, int num_tokens)`
- `void compute_logits_last_token_optimized (TransformerModel *M, int position)`
- `void transformer_layer_forward (TransformerModel *M, int layer_idx, size_t layer_input_offset)`
- `void run_comprehensive_benchmark (TransformerModel *M)`
- `int load_model_weights (TransformerModel *M, const char *weight_file)`: Load weights into an already-allocated TransformerModel.
- `int read_model_metadata (TransformerModel *M, const char *weight_file)`: Read model metadata from the weight file header.
- `int sample_token (float *logits, int vocab_size, float temperature)`
- `void generate (TransformerModel *M, int *prompt, int prompt_len, int max_tokens)`
- `void zero_gradients (TransformerModel *M)`
- `void cache_forward_activations (TransformerModel *M)`: Copy forward-pass activations into gradient storage for the backward pass, preserving the forward results needed for gradient calculation; see the sketch below.
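The buffer names below are hypothetical, but the idea is just a bulk copy: forward activation buffers get reused as scratch later in the step, so the values the backward kernels need are snapshotted into gradient-side storage first.

```c
#include <stddef.h>
#include <string.h>

/* Hypothetical fields; the real model addresses one arena via offsets. */
typedef struct {
    float *memory_base;      /* the single contiguous arena            */
    size_t act_offset;       /* live forward activations               */
    size_t act_copy_offset;  /* stable copy read by backward kernels   */
    size_t act_floats;       /* number of floats to preserve           */
} ActCacheSketch;

static void cache_forward_activations_sketch(ActCacheSketch *m)
{
    /* Snapshot before the backward pass starts reading. */
    memcpy(m->memory_base + m->act_copy_offset,
           m->memory_base + m->act_offset,
           m->act_floats * sizeof(float));
}
```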
- `void backward_residual_connection (TransformerModel *M, size_t d_output_offset, size_t d_input_offset, size_t d_transform_offset)`
- `void backward_embedding_layer (TransformerModel *M)`
- `void backward_final_layernorm (TransformerModel *M)`
- `void backward_fc2 (TransformerModel *M, size_t d_output_offset, size_t fc2_input_copy_offset, size_t fc2_weight_offset, size_t fc2_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)`
- `void backward_gelu (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset)`
- `void backward_fc1 (TransformerModel *M, size_t d_output_offset, size_t fc1_input_copy_offset, size_t fc1_weight_offset, size_t fc1_bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)`
- `void backward_gelu_fast (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t d_input_offset)`
- `void backward_layernorm (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t gamma_copy_offset, size_t beta_copy_offset, size_t mean_copy_offset, size_t rstd_copy_offset, size_t d_input_offset, size_t d_gamma_offset, size_t d_beta_offset)`
- `void add_gradient (TransformerModel *M, size_t source_offset, size_t dest_offset)`
- `void backward_attention_projection (TransformerModel *M, size_t d_output_offset, size_t attention_output_copy_offset, size_t proj_weight_offset, size_t proj_bias_offset, size_t d_attention_offset, size_t d_weight_offset, size_t d_bias_offset)`
- `void backward_attention_weighted_values (TransformerModel *M, size_t d_output_offset, size_t attention_weights_offset, size_t v_output_offset, size_t d_weights_offset, size_t d_v_offset)`
- `void backward_causal_softmax (TransformerModel *M, size_t d_scores_offset, size_t weights_copy_offset, size_t scores_copy_offset)`
- `void backward_qk_matmul (TransformerModel *M, size_t d_scores_offset, size_t q_copy_offset, size_t k_copy_offset, size_t d_q_offset, size_t d_k_offset)`
- `void backward_linear (TransformerModel *M, size_t d_output_offset, size_t input_copy_offset, size_t weight_offset, size_t bias_offset, size_t d_input_offset, size_t d_weight_offset, size_t d_bias_offset)`
- `void backward_lm_head (TransformerModel *M)`
- `void backward_transformer_layer (TransformerModel *M, int layer_idx)`
- `void compute_cross_entropy_loss (TransformerModel *M, int32_t *target_tokens, float *loss_out)`: Compute cross-entropy loss and gradients w.r.t. the logits; see the sketch below.
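Combining softmax with cross-entropy gives the well-known logits gradient `softmax(logits) - one_hot(target)` at each position. A scalar sketch of that computation; the real function works on logits stored inside the model, and averaging over the sequence is an assumption here:

```c
#include <math.h>
#include <stddef.h>
#include <stdint.h>

/* Returns the mean negative log-likelihood over `tokens` positions and
 * writes d_logits = (softmax(logits) - one_hot(target)) / tokens. */
float cross_entropy_sketch(const float *logits, const int32_t *targets,
                           float *d_logits, int tokens, int vocab)
{
    float loss = 0.0f;

    for (int t = 0; t < tokens; t++) {
        const float *z  = logits   + (size_t)t * vocab;
        float       *dz = d_logits + (size_t)t * vocab;

        /* Stable softmax: subtract the row max before exponentiating. */
        float maxv = z[0];
        for (int v = 1; v < vocab; v++) if (z[v] > maxv) maxv = z[v];
        float denom = 0.0f;
        for (int v = 0; v < vocab; v++) denom += expf(z[v] - maxv);

        for (int v = 0; v < vocab; v++) {
            float p = expf(z[v] - maxv) / denom;
            dz[v] = p / tokens;                 /* gradient of the mean loss */
            if (v == targets[t]) {
                dz[v] -= 1.0f / tokens;
                loss  -= logf(p > 1e-30f ? p : 1e-30f);
            }
        }
    }
    return loss / tokens;
}
```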
- `void training_step (TransformerModel *M, int32_t *input_tokens, int32_t *target_tokens, float learning_rate)`
- `void update_all_weights_sgd (TransformerModel *M, float learning_rate)`
- `int main (int argc, char **argv)`