{
  "version": "https://jsonfeed.org/version/1.1",
  "title": "Frontier Checkpoint",
  "home_page_url": "https://frontiercheckpoint.com/",
  "feed_url": "https://frontiercheckpoint.com/feed.json",
  "description": "A practitioner-only technical publication for working ML and agent engineers. We verify, reproduce, and recreate the work — exploring the open frontier of what actually holds up.",
  "language": "en-US",
  "icon": "https://frontiercheckpoint.com/favicon-32.png",
  "authors": [
    {
      "name": "Frontier Checkpoint",
      "url": "https://frontiercheckpoint.com"
    }
  ],
  "items": [
    {
      "id": "https://frontiercheckpoint.com/essays/what-a-4b-model-can-do/",
      "url": "https://frontiercheckpoint.com/essays/what-a-4b-model-can-do/",
      "title": "What a 4B Model Can Actually Do: Field Notes from 155 Experiments",
      "summary": "Across 155 small-model experiments centered on Qwen 3.5 4B, the same thing kept working: give the model something executable it can check against the evidence it has, and it punches far above its benchmark weight. Here is the field guide — the levers that worked, how I know they're real, and the frontier they opened up.",
      "content_text": "Across 155 small-model experiments centered on Qwen 3.5 4B, the same thing kept working: give the model something executable it can check against the evidence it has, and it punches far above its benchmark weight. Here is the field guide — the levers that worked, how I know they're real, and the frontier they opened up.",
      "date_published": "2026-06-28T00:00:00.000Z",
      "date_modified": "2026-06-28T00:00:00.000Z",
      "tags": [
        "Essays",
        "reproducibility",
        "evaluation",
        "fine-tuning",
        "methodology",
        "llm",
        "agents"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/essays/agent-harness-evals-moat/",
      "url": "https://frontiercheckpoint.com/essays/agent-harness-evals-moat/",
      "title": "The Harness Is the Product: Why Agent Evals Are the Real Moat",
      "summary": "Swapping the frontier model rarely moves your agent's success rate as much as fixing retries and context management — and the one thing competitors can't clone is your evaluation environment. A thesis on why agent evals, not weights, are where reproducible capability accrues.",
      "content_text": "Swapping the frontier model rarely moves your agent's success rate as much as fixing retries and context management — and the one thing competitors can't clone is your evaluation environment. A thesis on why agent evals, not weights, are where reproducible capability accrues.",
      "date_published": "2026-06-27T00:00:00.000Z",
      "date_modified": "2026-06-27T00:00:00.000Z",
      "tags": [
        "Essays",
        "agents",
        "agent-harness",
        "tool-use",
        "evaluation"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/signals/vllm-paged-attention-serving-standard/",
      "url": "https://frontiercheckpoint.com/signals/vllm-paged-attention-serving-standard/",
      "title": "vLLM and the new default shape of LLM serving",
      "summary": "If you are still serving with naive static batching, the gap is not marginal — paged KV-cache and continuous batching change the throughput-per-GPU math, and most other stacks have copied the idea.",
      "content_text": "If you are still serving with naive static batching, the gap is not marginal — paged KV-cache and continuous batching change the throughput-per-GPU math, and most other stacks have copied the idea.",
      "date_published": "2026-06-26T00:00:00.000Z",
      "date_modified": "2026-06-26T00:00:00.000Z",
      "tags": [
        "Signals",
        "serving",
        "paged-attention",
        "kv-cache"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/signals/flashattention-3-hopper/",
      "url": "https://frontiercheckpoint.com/signals/flashattention-3-hopper/",
      "title": "FlashAttention-3: async, low-precision, Hopper-native",
      "summary": "The headline is hardware-specific: FA3 is a Hopper story (async copy/MMA overlap, FP8 paths). The portable lesson from the FA line is still the one that matters — attention is bandwidth-bound, and the win is in HBM traffic, not FLOPs.",
      "content_text": "The headline is hardware-specific: FA3 is a Hopper story (async copy/MMA overlap, FP8 paths). The portable lesson from the FA line is still the one that matters — attention is bandwidth-bound, and the win is in HBM traffic, not FLOPs.",
      "date_published": "2026-06-24T00:00:00.000Z",
      "date_modified": "2026-06-24T00:00:00.000Z",
      "tags": [
        "Signals",
        "flash-attention",
        "kernels",
        "gpu-memory"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/essays/economics-of-test-time-compute/",
      "url": "https://frontiercheckpoint.com/essays/economics-of-test-time-compute/",
      "title": "The Economics of Thinking: Test-Time Compute as a Scaling Axis",
      "summary": "Reasoning models turned inference into a per-request dial. This is an economic read on when spending FLOPs at test time actually buys accuracy, why it only pays where answers are cheap to verify, and what variable-cost inference does to latency budgets and capacity planning.",
      "content_text": "Reasoning models turned inference into a per-request dial. This is an economic read on when spending FLOPs at test time actually buys accuracy, why it only pays where answers are cheap to verify, and what variable-cost inference does to latency budgets and capacity planning.",
      "date_published": "2026-06-23T00:00:00.000Z",
      "date_modified": "2026-06-23T00:00:00.000Z",
      "tags": [
        "Essays",
        "test-time-compute",
        "reasoning",
        "serving",
        "industry"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/signals/mamba-ssm-attention-alternative/",
      "url": "https://frontiercheckpoint.com/signals/mamba-ssm-attention-alternative/",
      "title": "Mamba and the selective-state-space line",
      "summary": "Worth understanding even if you ship transformers: SSMs change the asymptotics (linear in sequence length, constant state at inference) and the failure modes. The interesting deployments are hybrids, not pure-SSM.",
      "content_text": "Worth understanding even if you ship transformers: SSMs change the asymptotics (linear in sequence length, constant state at inference) and the failure modes. The interesting deployments are hybrids, not pure-SSM.",
      "date_published": "2026-06-20T00:00:00.000Z",
      "date_modified": "2026-06-20T00:00:00.000Z",
      "tags": [
        "Signals",
        "state-space-models",
        "attention",
        "long-context"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/explainers/reading-a-model-release/",
      "url": "https://frontiercheckpoint.com/explainers/reading-a-model-release/",
      "title": "Reading a Model Release Like an Engineer: Weights, Licenses, System Cards, and Evals",
      "summary": "The headline benchmark is the least durable thing in a model release. Here is how to read access, licenses, cards, eval protocols, and serving facts before you commit engineering to a number you cannot reproduce.",
      "content_text": "The headline benchmark is the least durable thing in a model release. Here is how to read access, licenses, cards, eval protocols, and serving facts before you commit engineering to a number you cannot reproduce.",
      "date_published": "2026-06-19T00:00:00.000Z",
      "date_modified": "2026-06-19T00:00:00.000Z",
      "tags": [
        "Explainers",
        "evaluation",
        "reproducibility",
        "llm",
        "industry"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/signals/sglang-radixattention-prefix-cache/",
      "url": "https://frontiercheckpoint.com/signals/sglang-radixattention-prefix-cache/",
      "title": "SGLang and RadixAttention for prefix reuse",
      "summary": "If your workload has heavy shared prefixes — system prompts, few-shot exemplars, agent scaffolds — automatic prefix caching is close to free latency. This is where serving for agents diverges from serving for chat.",
      "content_text": "If your workload has heavy shared prefixes — system prompts, few-shot exemplars, agent scaffolds — automatic prefix caching is close to free latency. This is where serving for agents diverges from serving for chat.",
      "date_published": "2026-06-17T00:00:00.000Z",
      "date_modified": "2026-06-17T00:00:00.000Z",
      "tags": [
        "Signals",
        "serving",
        "kv-cache",
        "agents"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/reproductions/reproducing-nanogpt-speedrun/",
      "url": "https://frontiercheckpoint.com/reproductions/reproducing-nanogpt-speedrun/",
      "title": "Reproducing the nanoGPT Speedrun: What Actually Moves the Loss Curve",
      "summary": "The nanoGPT speedrun is a rare, fully open optimization target: hit 3.28 FineWeb validation loss on a GPT-2 (124M)-class model in minimum wall-clock on 8×H100. We reproduce the pipeline, isolate what the Muon optimizer and the architecture changes actually buy, and flag what will not transfer off the bench.",
      "content_text": "The nanoGPT speedrun is a rare, fully open optimization target: hit 3.28 FineWeb validation loss on a GPT-2 (124M)-class model in minimum wall-clock on 8×H100. We reproduce the pipeline, isolate what the Muon optimizer and the architecture changes actually buy, and flag what will not transfer off the bench.",
      "date_published": "2026-06-15T00:00:00.000Z",
      "date_modified": "2026-06-15T00:00:00.000Z",
      "tags": [
        "Reproductions",
        "reproducibility",
        "pretraining",
        "optimization",
        "distributed-training"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/signals/deepseek-r1-rl-reasoning-open-weights/",
      "url": "https://frontiercheckpoint.com/signals/deepseek-r1-rl-reasoning-open-weights/",
      "title": "DeepSeek-R1: RL-trained reasoning with open weights",
      "summary": "The reproducible part is the method, not a leaderboard cell: group-relative RL on verifiable rewards, with open weights to probe. It is the cleanest public artifact for understanding the reasoning-model training loop.",
      "content_text": "The reproducible part is the method, not a leaderboard cell: group-relative RL on verifiable rewards, with open weights to probe. It is the cleanest public artifact for understanding the reasoning-model training loop.",
      "date_published": "2026-06-13T00:00:00.000Z",
      "date_modified": "2026-06-13T00:00:00.000Z",
      "tags": [
        "Signals",
        "reasoning",
        "grpo",
        "rlhf"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/signals/modded-nanogpt-muon-speedrun/",
      "url": "https://frontiercheckpoint.com/signals/modded-nanogpt-muon-speedrun/",
      "title": "The modded-nanogpt speedrun and the Muon optimizer",
      "summary": "A rare fully-public optimization target with a reproducible harness — exactly the kind of artifact we like. The Muon optimizer it popularized is the most interesting practical idea to come out of it.",
      "content_text": "A rare fully-public optimization target with a reproducible harness — exactly the kind of artifact we like. The Muon optimizer it popularized is the most interesting practical idea to come out of it.",
      "date_published": "2026-06-11T00:00:00.000Z",
      "date_modified": "2026-06-11T00:00:00.000Z",
      "tags": [
        "Signals",
        "pretraining",
        "optimization",
        "reproducibility"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/libraries/trl-sft-dpo-grpo-library/",
      "url": "https://frontiercheckpoint.com/libraries/trl-sft-dpo-grpo-library/",
      "title": "TRL in Anger: SFT, DPO, and GRPO Without Rewriting Your Training Loop",
      "summary": "TRL turns SFT, DPO, and GRPO into Trainer subclasses that inherit the entire Hugging Face stack — accelerate, peft, DeepSpeed. The convenience is real; the cost is that you're debugging someone else's training loop the moment your problem stops looking like the quickstart.",
      "content_text": "TRL turns SFT, DPO, and GRPO into Trainer subclasses that inherit the entire Hugging Face stack — accelerate, peft, DeepSpeed. The convenience is real; the cost is that you're debugging someone else's training loop the moment your problem stops looking like the quickstart.",
      "date_published": "2026-06-11T00:00:00.000Z",
      "date_modified": "2026-06-11T00:00:00.000Z",
      "tags": [
        "Libraries",
        "dpo",
        "grpo",
        "peft",
        "fine-tuning",
        "rlhf"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/explainers/post-training-quantization-gptq-awq-fp8/",
      "url": "https://frontiercheckpoint.com/explainers/post-training-quantization-gptq-awq-fp8/",
      "title": "Post-Training Quantization in Practice: GPTQ, AWQ, and FP8",
      "summary": "Post-training quantization is the cheapest inference lever and the easiest to pull wrong. The right method is set by your serving regime — bandwidth-bound decode wants weight-only INT4, compute-bound prefill wants FP8 — and the win is real only if a fast kernel accelerates your exact config.",
      "content_text": "Post-training quantization is the cheapest inference lever and the easiest to pull wrong. The right method is set by your serving regime — bandwidth-bound decode wants weight-only INT4, compute-bound prefill wants FP8 — and the win is real only if a fast kernel accelerates your exact config.",
      "date_published": "2026-06-08T00:00:00.000Z",
      "date_modified": "2026-06-08T00:00:00.000Z",
      "tags": [
        "Explainers",
        "quantization",
        "serving",
        "kv-cache"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/explainers/grpo-group-relative-policy-optimization/",
      "url": "https://frontiercheckpoint.com/explainers/grpo-group-relative-policy-optimization/",
      "title": "GRPO, Demystified: Group-Relative Policy Optimization for Reasoning Models",
      "summary": "GRPO swaps PPO's learned critic for a Monte-Carlo baseline — the mean reward over a group of sampled completions — trading rollout compute and per-token credit assignment for a simpler, more stable RL loop on verifiable-reward tasks.",
      "content_text": "GRPO swaps PPO's learned critic for a Monte-Carlo baseline — the mean reward over a group of sampled completions — trading rollout compute and per-token credit assignment for a simpler, more stable RL loop on verifiable-reward tasks.",
      "date_published": "2026-06-04T00:00:00.000Z",
      "date_modified": "2026-06-04T00:00:00.000Z",
      "tags": [
        "Explainers",
        "grpo",
        "rlhf",
        "ppo",
        "reasoning",
        "fine-tuning"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/explainers/moe-routing-practitioners-guide/",
      "url": "https://frontiercheckpoint.com/explainers/moe-routing-practitioners-guide/",
      "title": "Routing Is the Hard Part: A Practitioner's Guide to Mixture-of-Experts",
      "summary": "MoE decouples parameter count from per-token FLOPs, but every hard problem — instability, dropped tokens, load imbalance, all-to-all traffic, a footprint set by total not active params — lives in the router. A structural tour from Switch/GShard to fine-grained and aux-loss-free designs, and the systems bill you actually pay.",
      "content_text": "MoE decouples parameter count from per-token FLOPs, but every hard problem — instability, dropped tokens, load imbalance, all-to-all traffic, a footprint set by total not active params — lives in the router. A structural tour from Switch/GShard to fine-grained and aux-loss-free designs, and the systems bill you actually pay.",
      "date_published": "2026-06-01T00:00:00.000Z",
      "date_modified": "2026-06-01T00:00:00.000Z",
      "tags": [
        "Explainers",
        "mixture-of-experts",
        "transformers",
        "tensor-parallelism"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/recreations/recreating-flashattention-tiled-kernel/",
      "url": "https://frontiercheckpoint.com/recreations/recreating-flashattention-tiled-kernel/",
      "title": "Recreating FlashAttention: A Tiled, IO-Aware Attention Kernel from Scratch",
      "summary": "FlashAttention is exact attention restructured for the memory hierarchy, not an approximation. We implement the tiled forward and recompute backward in Triton, validate exactness against a reference, and separate what a tutorial actually reproduces from what needs CUTLASS-grade engineering.",
      "content_text": "FlashAttention is exact attention restructured for the memory hierarchy, not an approximation. We implement the tiled forward and recompute backward in Triton, validate exactness against a reference, and separate what a tutorial actually reproduces from what needs CUTLASS-grade engineering.",
      "date_published": "2026-05-28T00:00:00.000Z",
      "date_modified": "2026-05-28T00:00:00.000Z",
      "tags": [
        "Recreations",
        "flash-attention",
        "kernels",
        "attention",
        "gpu-memory"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/explainers/rope-long-context-stack/",
      "url": "https://frontiercheckpoint.com/explainers/rope-long-context-stack/",
      "title": "RoPE and the Long-Context Stack: Rotation, Interpolation, and What Breaks at 128k",
      "summary": "RoPE turns position into a per-dimension rotation — and that same rotation is why PI, NTK-aware scaling, and YaRN exist, and why a 128k window rarely means 128k of usable context. The math, the methods, and the serving bill.",
      "content_text": "RoPE turns position into a per-dimension rotation — and that same rotation is why PI, NTK-aware scaling, and YaRN exist, and why a 128k window rarely means 128k of usable context. The math, the methods, and the serving bill.",
      "date_published": "2026-05-24T00:00:00.000Z",
      "date_modified": "2026-05-24T00:00:00.000Z",
      "tags": [
        "Explainers",
        "rope",
        "long-context",
        "attention",
        "transformers"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/libraries/vllm-paged-attention-serving/",
      "url": "https://frontiercheckpoint.com/libraries/vllm-paged-attention-serving/",
      "title": "vLLM, Explained: PagedAttention, Continuous Batching, and the Serving Stack",
      "summary": "vLLM treats the KV cache like OS virtual memory — non-contiguous paged blocks — and schedules work at the token, not the request. You get high aggregate throughput; the cost is that per-request latency becomes something you tune rather than something you get for free.",
      "content_text": "vLLM treats the KV cache like OS virtual memory — non-contiguous paged blocks — and schedules work at the token, not the request. You get high aggregate throughput; the cost is that per-request latency becomes something you tune rather than something you get for free.",
      "date_published": "2026-05-20T00:00:00.000Z",
      "date_modified": "2026-05-20T00:00:00.000Z",
      "tags": [
        "Libraries",
        "paged-attention",
        "kv-cache",
        "serving",
        "llm"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/explainers/distributed-training-fsdp-zero-parallelism/",
      "url": "https://frontiercheckpoint.com/explainers/distributed-training-fsdp-zero-parallelism/",
      "title": "Sharding the Model: FSDP, ZeRO, and Tensor/Pipeline Parallelism",
      "summary": "Past one GPU you stop training a model and start operating a distributed system. Here is what each parallelism axis actually shards, what it costs on the wire, and how practitioners stack them into 3D/4D layouts.",
      "content_text": "Past one GPU you stop training a model and start operating a distributed system. Here is what each parallelism axis actually shards, what it costs on the wire, and how practitioners stack them into 3D/4D layouts.",
      "date_published": "2026-05-15T00:00:00.000Z",
      "date_modified": "2026-05-15T00:00:00.000Z",
      "tags": [
        "Explainers",
        "distributed-training",
        "fsdp",
        "tensor-parallelism",
        "gpu-memory"
      ]
    },
    {
      "id": "https://frontiercheckpoint.com/essays/the-checkpoint-signal-vs-noise/",
      "url": "https://frontiercheckpoint.com/essays/the-checkpoint-signal-vs-noise/",
      "title": "How We Separate Signal From Noise: Frontier Checkpoint's Verification Rubric",
      "summary": "The standard behind everything we publish: the filters that decide what earns your attention, the reproduced-to-unverified ladder we grade claims on, how we handle benchmarks and weightless releases, and why every correction is dated and logged rather than silently edited.",
      "content_text": "The standard behind everything we publish: the filters that decide what earns your attention, the reproduced-to-unverified ladder we grade claims on, how we handle benchmarks and weightless releases, and why every correction is dated and logged rather than silently edited.",
      "date_published": "2026-05-12T00:00:00.000Z",
      "date_modified": "2026-05-12T00:00:00.000Z",
      "tags": [
        "Essays",
        "methodology",
        "reproducibility",
        "evaluation",
        "industry"
      ]
    }
  ]
}