This page contains the architecture diagrams for the VIN NBV system.

1 Target Selection Contract

Code

---
config:
  layout: elk
  htmlLabels: true
  themeVariables:
    fontSize: "18px"
  flowchart:
    htmlLabels: true
    nodeSpacing: 16
    rankSpacing: 26
  themeCSS: |
    .nodeLabel { font-size: 18px; }
    .node span.nodeLabel { color: #1F2937 !important; fill: #1F2937 !important; stroke: none !important; }
    .edgeLabel { font-size: 16px; }
    .cluster-label { font-size: 19px; font-weight: 700; }
---
flowchart TB
  %% Target selection contract.
  %% The Actor cluster turns observed OBBs into selected target rows z_e;
  %% the Oracle cluster compares those rows against the GT OBB table afterwards.

  %% Actor side: observation -> rows -> hard mask -> score -> policy -> selection.
  subgraph Actor["Actor-visible selection"]
    direction TB
    Obs["detected OBBs<br/>else predicted OBBs"]
    Rows["target rows<br/>confidence + support + projection"]
    Mask["hard mask m_e<br/>finite, visible, supported"]
    Score["score S_e<br/>p_e s_vis s_sup s_def"]
    Policy["greedy top-K<br/>or target softmax"]
    Selected["selected target rows z_e"]
  end

  %% Oracle side: GT table is labelled "evaluation only".
  subgraph Oracle["Oracle audit after selection"]
    direction TB
    Gt["GT OBB table<br/>evaluation only"]
    Match["semantic IoU match<br/>G(e,g) threshold"]
    Audit["gt_label_valid<br/>match status + reason bits"]
  end

  %% Selection pipeline, one edge per statement.
  Obs --> Rows
  Rows --> Mask
  Mask --> Score
  Score --> Policy
  Policy --> Selected

  %% The only edge crossing from Actor to Oracle.
  Selected --> Match

  %% Audit path.
  Gt --> Match
  Match --> Audit

  %% Node palette: input / output / compute / data.
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;
  classDef data fill:#F5F5F5,stroke:#9E9E9E,stroke-width:1.2px,rx:0,ry:0;

  class Obs input;
  class Rows,Mask,Score,Policy,Match compute;
  class Selected output;
  class Gt,Audit data;

  %% Cluster backgrounds.
  style Actor fill:#f1fff6,stroke:#97ddb5,stroke-width:2px,rx:12,ry:12
  style Oracle fill:#fff8e8,stroke:#e0b66a,stroke-width:2px,rx:12,ry:12

2 Candidate Sampling Pipeline

Code

---
config:
  layout: elk
  htmlLabels: true
  themeVariables:
    fontSize: "18px"
  flowchart:
    htmlLabels: true
    nodeSpacing: 16
    rankSpacing: 26
  themeCSS: |
    .nodeLabel { font-size: 18px; }
    .node span.nodeLabel { color: #1F2937 !important; fill: #1F2937 !important; stroke: none !important; }
    .edgeLabel { font-size: 16px; }
    .cluster-label { font-size: 19px; font-weight: 700; }
---
flowchart LR
  %% Candidate sampling pipeline: a single linear chain from the reference pose
  %% to the compact table of valid candidates.

  %% Stages, in pipeline order.
  Ref["reference pose<br/>world <- rig"]
  Grav["gravity-aligned<br/>sampling frame"]
  Dir["direction draw<br/>uniform or PowerSpherical"]
  Caps["azimuth/elevation caps<br/>radius sample"]
  Pos["position family<br/>forward, target, bypass"]
  Orient["orientation builder<br/>rig, radial, target look-at"]
  Prune["pruning rules<br/>bounds, mesh, path, motion"]
  Shell["full shell<br/>masks + provenance"]
  Valid["compact valid table<br/>finite actions"]

  %% Pruning feeds the full shell, which is then compacted into the valid table.
  Ref --> Grav --> Dir --> Caps --> Pos --> Orient --> Prune --> Shell --> Valid

  %% Node palette: input / output / compute / data.
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;
  classDef data fill:#F5F5F5,stroke:#9E9E9E,stroke-width:1.2px,rx:0,ry:0;

  class Ref input;
  class Grav,Dir,Caps,Pos,Orient,Prune compute;
  class Shell data;
  class Valid output;

3 Candidate Mixture Families

Code

---
config:
  layout: elk
  htmlLabels: true
  themeVariables:
    fontSize: "18px"
  flowchart:
    htmlLabels: true
    nodeSpacing: 16
    rankSpacing: 24
  themeCSS: |
    .nodeLabel { font-size: 18px; }
    .node span.nodeLabel { color: #1F2937 !important; fill: #1F2937 !important; stroke: none !important; }
    .edgeLabel { font-size: 16px; }
    .cluster-label { font-size: 19px; font-weight: 700; }
---
flowchart TB
  %% Candidate mixture families: how the selected target and the reference pose
  %% feed five sampling families whose rows are merged into one provenance-tagged shell.

  %% Inputs.
  Target["selected actor-visible target"]
  Ref["current reference pose"]

  %% Family row counts add up to the cluster title: 18 + 18 + 12 + 6 + 6 = 60.
  subgraph Mix["60-row default candidate mixture"]
    direction LR
    Targ["target_bearing_local<br/>18 rows"]
    Fwd["forward_local<br/>18 rows"]
    Lat["lateral_target_bypass<br/>12 rows"]
    Refine["local_refinement<br/>6 rows"]
    Back["revisit_backtrack<br/>6 rows"]
  end

  %% Merge stage and output table.
  Prov["stable provenance<br/>position_id + strategy_id + mixture_id"]
  Table["full shell<br/>sampler_probability = 1/N"]

  %% Only the two target-aware families receive the target.
  Target --> Targ & Lat
  %% Every family receives the reference pose.
  Ref --> Targ & Fwd & Lat & Refine & Back
  %% Every family contributes rows to the provenance stage.
  Targ & Fwd & Lat & Refine & Back --> Prov
  Prov --> Table

  %% Node palette: input / output / compute / data.
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;
  classDef data fill:#F5F5F5,stroke:#9E9E9E,stroke-width:1.2px,rx:0,ry:0;

  class Target,Ref input;
  class Targ,Fwd,Lat,Refine,Back compute;
  class Prov data;
  class Table output;

  %% Cluster background.
  style Mix fill:#f0fbff,stroke:#8fd0ff,stroke-width:2px,rx:12,ry:12

4 Rollout Branch Selection

Code

---
config:
  layout: elk
  htmlLabels: true
  themeVariables:
    fontSize: "18px"
  flowchart:
    htmlLabels: true
    nodeSpacing: 16
    rankSpacing: 26
  themeCSS: |
    .nodeLabel { font-size: 18px; }
    .node span.nodeLabel { color: #1F2937 !important; fill: #1F2937 !important; stroke: none !important; }
    .edgeLabel { font-size: 16px; }
    .cluster-label { font-size: 19px; font-weight: 700; }
---
flowchart LR
  %% Rollout branch selection: candidates are scored, filtered by a selection
  %% policy and diversity guards, and the surviving sibling branches are both
  %% recorded in the rollout store and carried into the next state.

  %% Stages.
  Cand["candidate table<br/>valid mask m_i"]
  Scores["policy scores<br/>heuristic or oracle evaluator"]
  Policy["selection policy<br/>greedy, random, softmax"]
  Diversity["diversity guards<br/>distance, yaw, target bearing, strategy"]
  Branches["selected sibling branches"]
  Store["rollout / Q_H store<br/>action lineage + rewards"]
  Next["next counterfactual state<br/>selected history updated"]

  %% Scoring and selection chain, one edge per statement.
  Cand --> Scores
  Scores --> Policy
  Policy --> Diversity
  Diversity --> Branches

  %% Selected branches go to both sinks.
  Branches --> Store & Next

  %% Node palette: input / output / compute / data.
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;
  classDef data fill:#F5F5F5,stroke:#9E9E9E,stroke-width:1.2px,rx:0,ry:0;

  class Cand input;
  class Scores,Policy,Diversity compute;
  class Branches,Next output;
  class Store data;

5 Overall Architecture

Code

---
config:
  htmlLabels: true
  flowchart:
    htmlLabels: true
    nodeSpacing: 16
    rankSpacing: 30
  layout: elk
  themeVariables:
    fontSize: "18px"
  themeCSS: |
    .nodeLabel { font-size: 18px; }
    .edgeLabel { font-size: 16px; }
---
flowchart LR
  %% VINv3 overview (compact). Edges = shapes only.
  %% Reading guide:
  %%   - Two inputs (Poses + Cameras, Snippet + EVL) feed four per-candidate branches.
  %%   - Head concatenates the pose, global and semidense features and emits VinPrediction.
  %%   - Mask combines the voxel coverage proxy with semidense statistics into Candidate Valid.
  %% All node and edge labels are KaTeX math wrapped in $$ ... $$.

  %% Inputs
  Poses["$$\\textbf{Poses + Cameras}$$"]
  Snip["$$\\textbf{Snippet + EVL}$$"]

  %% Feature branches computed per candidate; parameter counts are quoted in the labels.
  subgraph Branches["$$\\textbf{Per-Candidate Feature Branches}$$"]
    direction LR

    PoseBranch["$$\\begin{array}{c}\\textbf{Pose Encoding}\\\\(12.7K\\ \\text{params})\\\\\\mathbf{e}_{\\mathrm{pose}}=\\texttt{R6dLffPoseEncoder}(\\bullet)\\end{array}$$"]

    FieldBranch["$$\\begin{array}{c}\\textbf{Scene Field}\\\\(192\\ \\text{params})\\\\\\mathbf{F}=\\texttt{field\\_proj}(\\bullet)\\end{array}$$"]

    GlobalBranch["$$\\begin{array}{c}\\textbf{Global Feature}\\\\(7.3K+336\\ \\text{params})\\\\\\tilde{\\mathbf{g}}=\\texttt{FiLM}(\\texttt{PoseConditionedGlobalPool}(\\mathbf{F},\\mathbf{e}_{\\mathrm{pose}}),\\mathbf{s}_{\\mathrm{vox}})\\end{array}$$"]

    %% The s_grid line is marked "(opt.)" in the label.
    SemBranch["$$\\begin{array}{c}\\textbf{Semidense Evidence}\\\\\\mathbf{s}_{\\mathrm{proj}}=\\texttt{proj\\_stats}(\\Pi(\\mathbf{P}_{\\mathrm{sem}}))\\\\\\mathbf{s}_{\\mathrm{grid}}=\\texttt{TinyCNN}(\\texttt{scatter\\_add}(\\Pi(\\mathbf{P}_{\\mathrm{sem}})))\\ (\\text{opt.})\\end{array}$$"]
  end

  VoxelValid["$$\\begin{array}{c}\\textbf{Voxel Coverage Proxy}\\\\\\boldsymbol{\\nu}_{\\mathrm{vox}}=\\texttt{sample\\_center}(\\mathbf{c}_{\\mathrm{norm}})\\end{array}$$"]

  %% NOTE(review): the Head formula lists z_traj in parentheses, but no node in this
  %% diagram produces it — presumably the optional trajectory context; confirm.
  Head["$$\\begin{array}{c}\\textbf{Head + CORAL}\\\\(58.2K+221\\ \\text{params})\\\\\\boldsymbol{\\ell}=\\texttt{CoralLayer}(\\texttt{MLP}(\\oplus\\{\\mathbf{e}_{\\mathrm{pose}},\\tilde{\\mathbf{g}},\\mathbf{s}_{\\mathrm{proj}},\\mathbf{s}_{\\mathrm{grid}},(\\mathbf{z}_{\\mathrm{traj}})\\}))\\\\\\hat{\\mathbf{r}}=\\sum_{k=0}^{K-2}\\sigma(\\boldsymbol{\\ell}_k)\\end{array}$$"]

  %% Validity mask: finite AND voxel coverage > 0 AND semidense coverage > 0.
  %% NOTE(review): this compute node and the CandValid output node below both display
  %% the caption "Candidate Valid"; consider distinct captions.
  Mask["$$\\begin{array}{c}\\textbf{Candidate Valid}\\\\\\mathbf{m}=\\mathbb{1}[\\mathrm{finite}]\\wedge\\mathbb{1}[\\boldsymbol{\\nu}_{\\mathrm{vox}}>0]\\\\\\wedge\\mathbb{1}[\\boldsymbol{\\nu}_{\\mathrm{sem}}>0]\\end{array}$$"]

  %% Outputs
  Pred["$$\\textbf{VinPrediction}$$"]
  CandValid["$$\\textbf{Candidate Valid}$$"]

  %% Wiring (shapes only)
  Poses -->|"$$\\begin{array}{c}\\texttt{PoseTW}[B,N_q,12]\\\\\\texttt{PoseTW}[B,12]\\end{array}$$"| PoseBranch

  %% Scene field feeds the global branch together with the pose embedding and cameras.
  Snip -->|"$$\\begin{array}{c}\\texttt{FloatTensor}[B,1,V,V,V]\\times 4\\\\\\texttt{IntTensor}[B,V,V,V]\\end{array}$$"| FieldBranch
  FieldBranch -->|"$$\\texttt{FloatTensor}[B,F_g,V,V,V]$$"| GlobalBranch
  PoseBranch -->|"$$\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{pose}}]$$"| GlobalBranch
  Poses -->|"$$\\texttt{PerspectiveCameras}[B\\cdot N_q]$$"| GlobalBranch

  %% Semidense branch inputs.
  Snip -->|"$$\\texttt{FloatTensor}[B,P,C_{\\mathrm{sem}}]$$"| SemBranch
  Poses -->|"$$\\texttt{PerspectiveCameras}[B\\cdot N_q]$$"| SemBranch

  %% Head inputs.
  %% NOTE(review): the second SemBranch shape uses F_fr while the node names the
  %% feature s_grid — confirm the intended subscript.
  PoseBranch -->|"$$\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{pose}}]$$"| Head
  GlobalBranch -->|"$$\\texttt{FloatTensor}[B,N_q,F_g]$$"| Head
  SemBranch -->|"$$\\begin{array}{c}\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{proj}}]\\\\\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{fr}}]\\end{array}$$"| Head

  %% Voxel coverage proxy inputs.
  Snip -->|"$$\\texttt{IntTensor}[B,V,V,V]$$"| VoxelValid
  Poses -->|"$$\\texttt{PoseTW}[B,N_q,12]$$"| VoxelValid

  %% Mask inputs.
  VoxelValid -->|"$$\\texttt{FloatTensor}[B,N_q]$$"| Mask
  SemBranch -->|"$$\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{proj}}]$$"| Mask

  %% Outputs (the Head -> Pred edge carries no shape label).
  Head --> Pred
  Mask -->|"$$\\texttt{BoolTensor}[B,N_q]$$"| CandValid

  %% Node palette: input / output / compute.
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;

  class Poses,Snip input;
  class PoseBranch,FieldBranch,GlobalBranch,SemBranch,VoxelValid,Head,Mask compute;
  class Pred,CandValid output;

6 Scene Field

Code

---
config:
  htmlLabels: true
  flowchart:
    htmlLabels: true
    nodeSpacing: 20
    rankSpacing: 32
  layout: elk
  themeVariables:
    fontSize: "18px"
  themeCSS: |
    .nodeLabel { font-size: 18px; }
    .edgeLabel { font-size: 16px; }
---
flowchart LR
  %% Scene field (compact): derive uncertainty/coverage channels, then project to a small field tensor.
  %% Linear chain: VoxelHeads -> Norm -> Prior -> FieldIn -> FieldProj -> Field.

  %% Input: five EVL voxel-head quantities listed in the label (o_pr, o_in, c_pr, f_in, n).
  VoxelHeads["$$\\begin{array}{c}\\textbf{EVL Voxel Heads}\\\\\\{\\mathbf{o}_{\\mathrm{pr}},\\ \\mathbf{o}_{\\mathrm{in}},\\ \\mathbf{c}_{\\mathrm{pr}},\\ \\mathbf{f}_{\\mathrm{in}},\\ \\mathbf{n}\\}\\end{array}$$"]

  %% c = log(1 + n) / log(1 + max(n)): log-scaled counts divided by the log-scaled maximum.
  Norm["$$\\begin{array}{c}\\textbf{Counts Normalization}\\\\\\mathbf{c}=\\log(1+\\mathbf{n})/\\log(1+\\max(\\mathbf{n}))\\end{array}$$"]
  %% u = 1 - c, then pi_new = u (element-wise product) o_pr.
  Prior["$$\\begin{array}{c}\\textbf{New Surface Prior}\\\\\\mathbf{u}=1-\\mathbf{c}\\\\\\boldsymbol{\\pi}_{\\mathrm{new}}=\\mathbf{u}\\odot\\mathbf{o}_{\\mathrm{pr}}\\end{array}$$"]

  %% Channel concatenation along dim=1; the label references scene_field_channels.
  FieldIn["$$\\begin{array}{c}\\textbf{Field Input}\\\\\\mathbf{F}_{\\mathrm{in}}=\\texttt{cat}(\\bullet,\\ \\text{dim}=1)\\\\\\texttt{scene\\_field\\_channels}\\end{array}$$"]

  %% field_proj is labelled as Conv3d -> GroupNorm -> GELU with 192 parameters.
  FieldProj["$$\\begin{array}{c}\\textbf{Scene Field Projection}\\\\(192\\ \\text{params})\\\\\\texttt{field\\_proj}(\\bullet)\\\\\\texttt{nn.Conv3d}\\rightarrow\\texttt{nn.GroupNorm}\\rightarrow\\texttt{nn.GELU}\\end{array}$$"]

  %% Output.
  Field["$$\\begin{array}{c}\\textbf{Scene Field}\\\\\\mathbf{F}\\end{array}$$"]

  %% Wiring (shapes only)
  %% NOTE(review): the first edge lists a single FloatTensor[B,1,V,V,V]; the overview
  %% diagram writes the same input with a "x 4" multiplicity — confirm which is intended.
  VoxelHeads -->|"$$\\begin{array}{c}\\texttt{FloatTensor}[B,1,V,V,V]\\\\\\texttt{IntTensor}[B,V,V,V]\\end{array}$$"| Norm
  Norm -->|"$$\\texttt{FloatTensor}[B,1,V,V,V]$$"| Prior
  Prior -->|"$$\\texttt{FloatTensor}[B,1,V,V,V]$$"| FieldIn
  FieldIn -->|"$$\\texttt{FloatTensor}[B,F_{\\mathrm{in}},V,V,V]$$"| FieldProj
  FieldProj -->|"$$\\texttt{FloatTensor}[B,F_g,V,V,V]$$"| Field

  %% Node palette: input / output / compute / data (data is defined but not applied here).
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;
  classDef data fill:#F5F5F5,stroke:#9E9E9E,stroke-width:1.2px,rx:0,ry:0;

  class VoxelHeads input;
  class Norm,Prior,FieldIn,FieldProj compute;
  class Field output;

7 Pose Encoder

Code

---
config:
  htmlLabels: true
  flowchart:
    htmlLabels: true
    nodeSpacing: 20
    rankSpacing: 32
  layout: elk
  themeVariables:
    fontSize: "18px"
  themeCSS: |
    .nodeLabel { font-size: 18px; }
    .edgeLabel { font-size: 16px; }
    .cluster-label { font-size: 18px; font-weight: 700; }
---
flowchart TB
  %% Pose encoding branch (compact)
  %% Linear chain: Inputs -> Rel -> PoseVec -> Encoder -> PoseEmb.
  %% Edge labels carry the tensor type/shape handed from one stage to the next.

  %% Input: candidate poses plus the reference rig pose (see the two shapes on the first edge).
  Inputs["$$\\begin{array}{c}\\textbf{Poses}\\end{array}$$"]

  %% Express each candidate camera pose in the rig frame: T^r_cq = (T^w_r)^-1 o T^w_cq.
  Rel["$$\\begin{array}{c}\\textbf{Rig-Relative Pose}\\\\\\mathbf{T}^{r}_{c_q}=(\\mathbf{T}^{w}_{r})^{-1}\\circ\\mathbf{T}^{w}_{c_q}\\end{array}$$"]

  %% Flatten the relative pose into v = [translation, R6D(rotation)].
  PoseVec["$$\\begin{array}{c}\\textbf{Pose Vector}\\\\\\mathbf{v}=[\\mathbf{t}_{r c_q},\\ \\mathrm{R6D}(\\mathbf{R}_{r c_q})]\\end{array}$$"]

  %% Encoder label names R6dLffPoseEncoder and LearnableFourierFeatures (12.7K params).
  Encoder["$$\\begin{array}{c}\\textbf{Pose Encoder}\\\\(12.7K\\ \\text{params})\\\\\\texttt{R6dLffPoseEncoder}(\\bullet)\\\\\\texttt{LearnableFourierFeatures}(\\bullet)\\end{array}$$"]

  %% Output.
  PoseEmb["$$\\begin{array}{c}\\textbf{Pose Embedding}\\\\\\mathbf{e}_{\\mathrm{pose}}\\end{array}$$"]

  %% Wiring (shapes only)
  Inputs -->|"$$\\begin{array}{c}\\texttt{PoseTW}[B,N_q,12]\\\\\\texttt{PoseTW}[B,12]\\end{array}$$"| Rel
  %% Rel still emits a rigid transform, so this edge is a PoseTW; the 9-dim vector
  %% only exists after PoseVec builds [t, R6D(R)].
  Rel -->|"$$\\texttt{PoseTW}[B,N_q,12]$$"| PoseVec
  PoseVec -->|"$$\\texttt{FloatTensor}[B,N_q,9]$$"| Encoder
  Encoder -->|"$$\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{pose}}]$$"| PoseEmb

  %% Node palette: input / output / compute / data (data is defined but not applied here).
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;
  classDef data fill:#F5F5F5,stroke:#9E9E9E,stroke-width:1.2px,rx:0,ry:0;

  class Inputs input;
  class Rel,PoseVec,Encoder compute;
  class PoseEmb output;

8 Global Pool

Code

---
config:
  htmlLabels: true
  flowchart:
    htmlLabels: true
    nodeSpacing: 18
    rankSpacing: 30
  layout: elk
  themeVariables:
    fontSize: "19px"
  themeCSS: |
    .nodeLabel { font-size: 19px; }
    .edgeLabel { font-size: 15px; opacity: 0.85; }
---
flowchart TB
  %% Global pooling + voxel FiLM (compact)
  %% Two paths meet at FiLM:
  %%   1. Field + PoseEnc + VoxCenters -> GlobalPool -> g
  %%   2. VoxCenters -> VoxPool -> VoxProj (with Cam) -> s_vox
  %% FiLM modulates g with s_vox and emits the global feature g-tilde.

  %% Inputs.
  Field["$$\\begin{array}{c}\\textbf{Scene Field}\\\\\\mathbf{F}\\end{array}$$"]
  PoseEnc["$$\\begin{array}{c}\\textbf{Pose Embedding}\\\\\\mathbf{e}_{\\mathrm{pose}}\\end{array}$$"]
  VoxCenters["$$\\begin{array}{c}\\textbf{Voxel Centers}\\\\\\mathbf{x}_w\\end{array}$$"]
  Cam["$$\\begin{array}{c}\\textbf{Candidate Cameras}\\\\\\mathbf{C}_q\\end{array}$$"]

  %% Path 1: pose-conditioned pooling of the scene field.
  GlobalPool["$$\\begin{array}{c}\\textbf{PoseConditionedGlobalPool}\\\\\\mathbf{g}=\\texttt{Pool}(\\mathbf{F},\\mathbf{e}_{\\mathrm{pose}},\\mathbf{x}_w)\\end{array}$$"]

  %% Path 2: pool the voxel centres, project them with the candidate cameras,
  %% and summarise the projection as s_vox (label: coverage/validity + depth).
  VoxPool["$$\\begin{array}{c}\\textbf{Voxel Pooling}\\\\\\mathbf{p}=\\texttt{adaptive\\_avg\\_pool3d}(\\mathbf{x}_w)\\end{array}$$"]
  VoxProj["$$\\begin{array}{c}\\textbf{Voxel Projection Stats}\\\\\\Pi(\\mathbf{p})=\\texttt{transform\\_points\\_screen}(\\mathbf{C}_q,\\mathbf{p})\\\\\\mathbf{s}_{\\mathrm{vox}}=\\texttt{stats}(\\Pi(\\mathbf{p}))\\ \\ \\text{(coverage/validity + depth)}\\end{array}$$"]

  %% (gamma, beta) = Linear(s_vox); g-tilde = (1 + gamma) * g + beta; GroupNorm is listed last in the label.
  FiLM["$$\\begin{array}{c}\\textbf{FiLM Modulation}\\\\(\\boldsymbol{\\gamma},\\boldsymbol{\\beta})=\\texttt{Linear}(\\mathbf{s}_{\\mathrm{vox}})\\\\\\tilde{\\mathbf{g}}=(1+\\boldsymbol{\\gamma})\\odot\\mathbf{g}+\\boldsymbol{\\beta}\\\\\\texttt{GroupNorm}\\end{array}$$"]
  %% Output.
  GOut["$$\\begin{array}{c}\\textbf{Global Feature}\\\\\\tilde{\\mathbf{g}}\\end{array}$$"]

  %% Path 1 wiring (input edges are unlabelled).
  Field --> GlobalPool
  PoseEnc --> GlobalPool
  VoxCenters --> GlobalPool
  GlobalPool -->|"$$\\mathbf{g}:\\ [B,N_q,F_g]$$"| FiLM

  %% Path 2 wiring; VoxCenters is shared with path 1.
  VoxCenters --> VoxPool
  VoxPool --> VoxProj
  Cam --> VoxProj
  VoxProj -->|"$$\\mathbf{s}_{\\mathrm{vox}}:\\ [B,N_q,F_{\\mathrm{vox}}]$$"| FiLM
  FiLM -->|"$$\\texttt{FloatTensor}[B,N_q,F_g]$$"| GOut

  %% Node palette: input / output / compute / data (data is defined but not applied here).
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;
  classDef data fill:#F5F5F5,stroke:#9E9E9E,stroke-width:1.2px,rx:0,ry:0;

  class Field,PoseEnc,VoxCenters,Cam input;
  class GlobalPool,VoxPool,VoxProj,FiLM compute;
  class GOut output;

10 Trajectory

Code

---
config:
  htmlLabels: true
  flowchart:
    htmlLabels: true
    nodeSpacing: 20
    rankSpacing: 32
  layout: elk
  themeVariables:
    fontSize: "18px"
  themeCSS: |
    .nodeLabel { font-size: 18px; }
    .edgeLabel { font-size: 16px; }
---
flowchart TB
  %% Optional trajectory context (compact)
  %% Trajectory poses are made rig-relative and encoded; the pose embedding then
  %% attends over the encoded trajectory to produce the context z_traj.

  %% Inputs.
  Traj["$$\\begin{array}{c}\\textbf{Trajectory Poses}\\\\\\mathbf{T}^{w}_{r}(t)\\end{array}$$"]
  PoseEmb["$$\\begin{array}{c}\\textbf{Pose Embedding}\\\\\\mathbf{e}_{\\mathrm{pose}}\\end{array}$$"]

  %% Express every trajectory pose relative to the reference rig pose.
  Rel["$$\\begin{array}{c}\\textbf{Rig-Relative Trajectory}\\\\\\mathbf{T}^{r}_{r}(t)=(\\mathbf{T}^{w}_{r})^{-1}\\circ\\mathbf{T}^{w}_{r}(t)\\end{array}$$"]
  Enc["$$\\begin{array}{c}\\textbf{Trajectory Encoder}\\\\(12.7K\\ \\text{params})\\\\\\mathbf{t}=\\texttt{TrajectoryEncoder}(\\bullet)\\end{array}$$"]
  %% Words inside $$ ... $$ must be wrapped in \text{}; bare letters are typeset as
  %% a product of italic math variables. Applies to "optional" in the next two nodes.
  %% Cross-attention: queries are the pose embedding, keys/values the encoded trajectory.
  Attn["$$\\begin{array}{c}\\textbf{Cross-Attention}\\\\(\\text{optional})\\ (4.2K\\ \\text{params})\\\\\\texttt{nn.MultiheadAttention}(q=\\mathbf{e}_{\\mathrm{pose}},\\ kv=\\mathbf{t})\\end{array}$$"]
  Norm["$$\\begin{array}{c}\\textbf{Normalization}\\\\(\\text{optional})\\\\\\texttt{nn.GroupNorm}(\\bullet)\\end{array}$$"]

  %% Output.
  Out["$$\\begin{array}{c}\\textbf{Trajectory Context}\\\\\\mathbf{z}_{\\mathrm{traj}}\\end{array}$$"]

  %% Wiring (shapes only). T indexes trajectory steps, N_q indexes candidates.
  Traj -->|"$$\\texttt{PoseTW}[B,T,12]$$"| Rel
  Rel -->|"$$\\texttt{PoseTW}[B,T,12]$$"| Enc
  Enc -->|"$$\\texttt{FloatTensor}[B,T,F_{\\mathrm{traj}}]$$"| Attn
  PoseEmb -->|"$$\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{pose}}]$$"| Attn
  Attn -->|"$$\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{pose}}]$$"| Norm
  Norm -->|"$$\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{pose}}]$$"| Out

  %% Node palette: input / output / compute / data (data is defined but not applied here).
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;
  classDef data fill:#F5F5F5,stroke:#9E9E9E,stroke-width:1.2px,rx:0,ry:0;

  class Traj,PoseEmb input;
  class Rel,Enc,Attn,Norm compute;
  class Out output;

11 Semidense Projection

Code

---
config:
  htmlLabels: true
  flowchart:
    htmlLabels: true
    nodeSpacing: 20
    rankSpacing: 32
  layout: elk
  themeVariables:
    fontSize: "18px"
  themeCSS: |
    .nodeLabel { font-size: 18px; }
    .edgeLabel { font-size: 16px; }
---
flowchart TB
  %% Semidense projection stats (compact)
  %% Semidense points are projected into each candidate camera's screen space and
  %% summarised into the per-candidate feature s_proj.

  %% Inputs.
  SemPts["$$\\begin{array}{c}\\textbf{Semidense Points}\\\\\\mathbf{P}_{\\mathrm{sem}}\\end{array}$$"]
  Cam["$$\\begin{array}{c}\\textbf{Candidate Cameras}\\\\\\mathbf{C}_q\\end{array}$$"]
  %% Projection is labelled as transform_points_screen applied with the candidate cameras.
  Proj["$$\\begin{array}{c}\\textbf{Screen-Space Projection}\\\\\\Pi(\\bullet)=\\texttt{transform\\_points\\_screen}(\\mathbf{C}_q,\\bullet)\\end{array}$$"]
  %% s_proj components listed in the label: nu_cov, nu_empty, nu_sem, mu_z, sigma_z;
  %% the label also names weights (n_obs, 1/sigma_d).
  Stats["$$\\begin{array}{c}\\textbf{Projection Statistics}\\\\\\mathbf{s}_{\\mathrm{proj}}=\\texttt{proj\\_stats}(\\bullet)\\\\\\left[\\boldsymbol{\\nu}_{\\mathrm{cov}},\\ \\boldsymbol{\\nu}_{\\mathrm{empty}},\\ \\boldsymbol{\\nu}_{\\mathrm{sem}},\\ \\boldsymbol{\\mu}_z,\\ \\boldsymbol{\\sigma}_z\\right]\\\\\\textit{weights: }(\\mathbf{n}_{\\mathrm{obs}},\\ 1/\\boldsymbol{\\sigma}_d)\\end{array}$$"]
  %% Output.
  Out["$$\\begin{array}{c}\\textbf{Semidense Proj Feature}\\\\\\mathbf{s}_{\\mathrm{proj}}\\end{array}$$"]

  %% Wiring (shapes only). The projection edge carries a float tensor and a bool
  %% tensor of the same shape — presumably projected values plus a validity mask; confirm.
  SemPts -->|"$$\\texttt{FloatTensor}[B,P,C_{\\mathrm{sem}}]$$"| Proj
  Cam -->|"$$\\texttt{PerspectiveCameras}[B\\cdot N_q]$$"| Proj
  Proj -->|"$$\\begin{array}{c}\\texttt{FloatTensor}[B\\cdot N_q,P]\\\\\\texttt{BoolTensor}[B\\cdot N_q,P]\\end{array}$$"| Stats
  Stats -->|"$$\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{proj}}]$$"| Out

  %% Node palette: input / output / compute / data (data is defined but not applied here).
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;
  classDef data fill:#F5F5F5,stroke:#9E9E9E,stroke-width:1.2px,rx:0,ry:0;

  class SemPts,Cam input;
  class Proj,Stats compute;
  class Out output;

12 Semidense Frustum

Code

---
config:
  htmlLabels: true
  flowchart:
    htmlLabels: true
    nodeSpacing: 20
    rankSpacing: 32
  layout: elk
  themeVariables:
    fontSize: "18px"
  themeCSS: |
    .nodeLabel { font-size: 18px; }
    .edgeLabel { font-size: 16px; }
---
flowchart TB
  %% Semidense grid CNN (compact)
  %% Projection data is scattered onto a G x G screen-space grid, passed through a
  %% small CNN trunk and head, and emitted as the per-candidate feature s_grid.

  %% Input.
  ProjData["$$\\begin{array}{c}\\textbf{Projection Data}\\end{array}$$"]
  %% Grid channels listed in the label: occ, mu_z, sigma_z, accumulated via scatter_add.
  Grid["$$\\begin{array}{c}\\textbf{Screen-Space Grid}\\\\\\mathbf{H}=\\texttt{scatter\\_add}(\\bullet)\\\\\\left[\\mathbf{occ},\\ \\boldsymbol{\\mu}_z,\\ \\boldsymbol{\\sigma}_z\\right]\\end{array}$$"]
  %% Trunk: Conv2d -> GELU -> Conv2d -> GELU (1.3K params per the label).
  CNNTrunk["$$\\begin{array}{c}\\textbf{CNN Backbone}\\\\(1.3K\\ \\text{params})\\\\\\texttt{nn.Conv2d}\\rightarrow\\texttt{nn.GELU}\\\\\\rightarrow\\texttt{nn.Conv2d}\\rightarrow\\texttt{nn.GELU}\\end{array}$$"]
  %% Head: AdaptiveAvgPool2d -> Flatten -> Linear.
  CNNHead["$$\\begin{array}{c}\\textbf{CNN Head}\\\\\\texttt{nn.AdaptiveAvgPool2d}\\\\\\rightarrow\\texttt{nn.Flatten}\\rightarrow\\texttt{nn.Linear}\\end{array}$$"]
  %% Output.
  Out["$$\\begin{array}{c}\\textbf{Semidense Grid Feature}\\\\\\mathbf{s}_{\\mathrm{grid}}\\end{array}$$"]

  %% Wiring (shapes only). The batch axis is flattened to B*N_q until the head
  %% output, which is labelled [B, N_q, F_cnn].
  %% NOTE(review): the output feature is named s_grid but its width is written F_cnn —
  %% confirm the subscript against the other diagrams.
  ProjData -->|"$$\\begin{array}{c}\\texttt{FloatTensor}[B\\cdot N_q,P]\\\\\\texttt{BoolTensor}[B\\cdot N_q,P]\\end{array}$$"| Grid
  Grid -->|"$$\\texttt{FloatTensor}[B\\cdot N_q,C_{\\mathrm{grid}},G,G]$$"| CNNTrunk
  CNNTrunk -->|"$$\\texttt{FloatTensor}[B\\cdot N_q,C_{\\mathrm{mid}},G,G]$$"| CNNHead
  CNNHead -->|"$$\\texttt{FloatTensor}[B,N_q,F_{\\mathrm{cnn}}]$$"| Out

  %% Node palette: input / output / compute / data (data is defined but not applied here).
  classDef input fill:#D5E8D4,stroke:#82B366,stroke-width:1.5px,rx:0,ry:0;
  classDef output fill:#F8CECC,stroke:#B85450,stroke-width:1.5px,rx:0,ry:0;
  classDef compute fill:#E1D5E7,stroke:#9673A6,stroke-width:1.5px,rx:8,ry:8;
  classDef data fill:#F5F5F5,stroke:#9E9E9E,stroke-width:1.2px,rx:0,ry:0;

  class ProjData input;
  class Grid,CNNTrunk,CNNHead compute;
  class Out output;