diff --git a/crates/plotnik-lib/src/bytecode/constants.rs b/crates/plotnik-lib/src/bytecode/constants.rs index 3a61776..9a4088f 100644 --- a/crates/plotnik-lib/src/bytecode/constants.rs +++ b/crates/plotnik-lib/src/bytecode/constants.rs @@ -4,7 +4,8 @@ pub const MAGIC: [u8; 4] = *b"PTKQ"; /// Current bytecode format version. -pub const VERSION: u32 = 1; +/// v2: Removed explicit offsets (computed from counts), added regex section. +pub const VERSION: u32 = 2; /// Section alignment in bytes. pub const SECTION_ALIGN: usize = 64; diff --git a/crates/plotnik-lib/src/bytecode/header.rs b/crates/plotnik-lib/src/bytecode/header.rs index ccdd891..fbb1d3a 100644 --- a/crates/plotnik-lib/src/bytecode/header.rs +++ b/crates/plotnik-lib/src/bytecode/header.rs @@ -1,6 +1,11 @@ //! Bytecode file header (64 bytes). +//! +//! v2 layout: Offsets are computed from counts + SECTION_ALIGN (64 bytes). +//! Section order: Header → StringBlob → RegexBlob → StringTable → RegexTable → +//! NodeTypes → NodeFields → Trivia → TypeDefs → TypeMembers → TypeNames → +//! Entrypoints → Transitions -use super::{MAGIC, VERSION}; +use super::{MAGIC, SECTION_ALIGN, VERSION}; /// Header flags (bit field). pub mod flags { @@ -11,41 +16,43 @@ pub mod flags { /// File header - first 64 bytes of the bytecode file. /// -/// Note: TypeMeta sub-section counts are stored in the TypeMetaHeader, -/// not in the main header. See type_meta.rs for details. 
+/// v2 layout (offsets computed from counts): +/// - 0-23: identity and sizes (magic, version, checksum, total_size, str_blob_size, regex_blob_size) +/// - 24-45: counts (11 × u16) — order matches section order +/// - 46-63: reserved #[derive(Clone, Copy, Debug, PartialEq, Eq)] #[repr(C, align(64))] pub struct Header { + // Bytes 0-23: Identity and sizes (6 × u32) /// Magic bytes: b"PTKQ" pub magic: [u8; 4], - /// Format version (currently 1) + /// Format version (currently 2) pub version: u32, /// CRC32 checksum of everything after the header pub checksum: u32, /// Total file size in bytes pub total_size: u32, + /// Size of the string blob in bytes. + pub str_blob_size: u32, + /// Size of the regex blob in bytes. + pub regex_blob_size: u32, - // Section offsets (absolute byte offsets) - pub str_blob_offset: u32, - pub str_table_offset: u32, - pub node_types_offset: u32, - pub node_fields_offset: u32, - pub trivia_offset: u32, - pub type_meta_offset: u32, - pub entrypoints_offset: u32, - pub transitions_offset: u32, - - // Element counts (type counts are in TypeMetaHeader at type_meta_offset) + // Bytes 24-45: Element counts (11 × u16) — order matches section order pub str_table_count: u16, + pub regex_table_count: u16, pub node_types_count: u16, pub node_fields_count: u16, pub trivia_count: u16, + pub type_defs_count: u16, + pub type_members_count: u16, + pub type_names_count: u16, pub entrypoints_count: u16, pub transitions_count: u16, /// Header flags (see `flags` module for bit definitions). pub flags: u16, - /// Padding to maintain 64-byte size. - pub(crate) _pad: u16, + + // Bytes 46-63: Reserved + pub(crate) _reserved: [u8; 18], } const _: () = assert!(std::mem::size_of::
<Header>() == 64); @@ -57,52 +64,71 @@ impl Default for Header { version: VERSION, checksum: 0, total_size: 0, - str_blob_offset: 0, - str_table_offset: 0, - node_types_offset: 0, - node_fields_offset: 0, - trivia_offset: 0, - type_meta_offset: 0, - entrypoints_offset: 0, - transitions_offset: 0, + str_blob_size: 0, + regex_blob_size: 0, str_table_count: 0, + regex_table_count: 0, node_types_count: 0, node_fields_count: 0, trivia_count: 0, + type_defs_count: 0, + type_members_count: 0, + type_names_count: 0, entrypoints_count: 0, transitions_count: 0, flags: 0, - _pad: 0, + _reserved: [0; 18], } } } +/// Computed section offsets derived from header counts. +/// +/// Order: StringBlob → RegexBlob → StringTable → RegexTable → NodeTypes → +/// NodeFields → Trivia → TypeDefs → TypeMembers → TypeNames → Entrypoints → Transitions +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub struct SectionOffsets { + pub str_blob: u32, + pub regex_blob: u32, + pub str_table: u32, + pub regex_table: u32, + pub node_types: u32, + pub node_fields: u32, + pub trivia: u32, + pub type_defs: u32, + pub type_members: u32, + pub type_names: u32, + pub entrypoints: u32, + pub transitions: u32, +} + impl Header { /// Decode header from 64 bytes. 
pub fn from_bytes(bytes: &[u8]) -> Self { assert!(bytes.len() >= 64, "header too short"); + let mut reserved = [0u8; 18]; + reserved.copy_from_slice(&bytes[46..64]); + Self { magic: [bytes[0], bytes[1], bytes[2], bytes[3]], version: u32::from_le_bytes([bytes[4], bytes[5], bytes[6], bytes[7]]), checksum: u32::from_le_bytes([bytes[8], bytes[9], bytes[10], bytes[11]]), total_size: u32::from_le_bytes([bytes[12], bytes[13], bytes[14], bytes[15]]), - str_blob_offset: u32::from_le_bytes([bytes[16], bytes[17], bytes[18], bytes[19]]), - str_table_offset: u32::from_le_bytes([bytes[20], bytes[21], bytes[22], bytes[23]]), - node_types_offset: u32::from_le_bytes([bytes[24], bytes[25], bytes[26], bytes[27]]), - node_fields_offset: u32::from_le_bytes([bytes[28], bytes[29], bytes[30], bytes[31]]), - trivia_offset: u32::from_le_bytes([bytes[32], bytes[33], bytes[34], bytes[35]]), - type_meta_offset: u32::from_le_bytes([bytes[36], bytes[37], bytes[38], bytes[39]]), - entrypoints_offset: u32::from_le_bytes([bytes[40], bytes[41], bytes[42], bytes[43]]), - transitions_offset: u32::from_le_bytes([bytes[44], bytes[45], bytes[46], bytes[47]]), - str_table_count: u16::from_le_bytes([bytes[48], bytes[49]]), - node_types_count: u16::from_le_bytes([bytes[50], bytes[51]]), - node_fields_count: u16::from_le_bytes([bytes[52], bytes[53]]), - trivia_count: u16::from_le_bytes([bytes[54], bytes[55]]), - entrypoints_count: u16::from_le_bytes([bytes[56], bytes[57]]), - transitions_count: u16::from_le_bytes([bytes[58], bytes[59]]), - flags: u16::from_le_bytes([bytes[60], bytes[61]]), - _pad: u16::from_le_bytes([bytes[62], bytes[63]]), + str_blob_size: u32::from_le_bytes([bytes[16], bytes[17], bytes[18], bytes[19]]), + regex_blob_size: u32::from_le_bytes([bytes[20], bytes[21], bytes[22], bytes[23]]), + str_table_count: u16::from_le_bytes([bytes[24], bytes[25]]), + regex_table_count: u16::from_le_bytes([bytes[26], bytes[27]]), + node_types_count: u16::from_le_bytes([bytes[28], bytes[29]]), + 
node_fields_count: u16::from_le_bytes([bytes[30], bytes[31]]), + trivia_count: u16::from_le_bytes([bytes[32], bytes[33]]), + type_defs_count: u16::from_le_bytes([bytes[34], bytes[35]]), + type_members_count: u16::from_le_bytes([bytes[36], bytes[37]]), + type_names_count: u16::from_le_bytes([bytes[38], bytes[39]]), + entrypoints_count: u16::from_le_bytes([bytes[40], bytes[41]]), + transitions_count: u16::from_le_bytes([bytes[42], bytes[43]]), + flags: u16::from_le_bytes([bytes[44], bytes[45]]), + _reserved: reserved, } } @@ -113,22 +139,20 @@ impl Header { bytes[4..8].copy_from_slice(&self.version.to_le_bytes()); bytes[8..12].copy_from_slice(&self.checksum.to_le_bytes()); bytes[12..16].copy_from_slice(&self.total_size.to_le_bytes()); - bytes[16..20].copy_from_slice(&self.str_blob_offset.to_le_bytes()); - bytes[20..24].copy_from_slice(&self.str_table_offset.to_le_bytes()); - bytes[24..28].copy_from_slice(&self.node_types_offset.to_le_bytes()); - bytes[28..32].copy_from_slice(&self.node_fields_offset.to_le_bytes()); - bytes[32..36].copy_from_slice(&self.trivia_offset.to_le_bytes()); - bytes[36..40].copy_from_slice(&self.type_meta_offset.to_le_bytes()); - bytes[40..44].copy_from_slice(&self.entrypoints_offset.to_le_bytes()); - bytes[44..48].copy_from_slice(&self.transitions_offset.to_le_bytes()); - bytes[48..50].copy_from_slice(&self.str_table_count.to_le_bytes()); - bytes[50..52].copy_from_slice(&self.node_types_count.to_le_bytes()); - bytes[52..54].copy_from_slice(&self.node_fields_count.to_le_bytes()); - bytes[54..56].copy_from_slice(&self.trivia_count.to_le_bytes()); - bytes[56..58].copy_from_slice(&self.entrypoints_count.to_le_bytes()); - bytes[58..60].copy_from_slice(&self.transitions_count.to_le_bytes()); - bytes[60..62].copy_from_slice(&self.flags.to_le_bytes()); - bytes[62..64].copy_from_slice(&self._pad.to_le_bytes()); + bytes[16..20].copy_from_slice(&self.str_blob_size.to_le_bytes()); + bytes[20..24].copy_from_slice(&self.regex_blob_size.to_le_bytes()); + 
bytes[24..26].copy_from_slice(&self.str_table_count.to_le_bytes()); + bytes[26..28].copy_from_slice(&self.regex_table_count.to_le_bytes()); + bytes[28..30].copy_from_slice(&self.node_types_count.to_le_bytes()); + bytes[30..32].copy_from_slice(&self.node_fields_count.to_le_bytes()); + bytes[32..34].copy_from_slice(&self.trivia_count.to_le_bytes()); + bytes[34..36].copy_from_slice(&self.type_defs_count.to_le_bytes()); + bytes[36..38].copy_from_slice(&self.type_members_count.to_le_bytes()); + bytes[38..40].copy_from_slice(&self.type_names_count.to_le_bytes()); + bytes[40..42].copy_from_slice(&self.entrypoints_count.to_le_bytes()); + bytes[42..44].copy_from_slice(&self.transitions_count.to_le_bytes()); + bytes[44..46].copy_from_slice(&self.flags.to_le_bytes()); + bytes[46..64].copy_from_slice(&self._reserved); bytes } @@ -153,4 +177,71 @@ impl Header { self.flags &= !flags::LINKED; } } + + /// Compute section offsets from counts and blob sizes. + /// + /// Section order (all 64-byte aligned): + /// Header → StringBlob → RegexBlob → StringTable → RegexTable → + /// NodeTypes → NodeFields → Trivia → TypeDefs → TypeMembers → + /// TypeNames → Entrypoints → Transitions + pub fn compute_offsets(&self) -> SectionOffsets { + let align = SECTION_ALIGN as u32; + + // Blobs first (right after header) + let str_blob = align; // 64 + let regex_blob = align_up(str_blob + self.str_blob_size, align); + + // Tables after blobs + let str_table = align_up(regex_blob + self.regex_blob_size, align); + let str_table_size = (self.str_table_count as u32 + 1) * 4; + + let regex_table = align_up(str_table + str_table_size, align); + let regex_table_size = (self.regex_table_count as u32 + 1) * 4; + + // Symbol sections + let node_types = align_up(regex_table + regex_table_size, align); + let node_types_size = self.node_types_count as u32 * 4; + + let node_fields = align_up(node_types + node_types_size, align); + let node_fields_size = self.node_fields_count as u32 * 4; + + let trivia = 
align_up(node_fields + node_fields_size, align); + let trivia_size = self.trivia_count as u32 * 2; + + // Type metadata + let type_defs = align_up(trivia + trivia_size, align); + let type_defs_size = self.type_defs_count as u32 * 4; + + let type_members = align_up(type_defs + type_defs_size, align); + let type_members_size = self.type_members_count as u32 * 4; + + let type_names = align_up(type_members + type_members_size, align); + let type_names_size = self.type_names_count as u32 * 4; + + // Entry points and instructions + let entrypoints = align_up(type_names + type_names_size, align); + let entrypoints_size = self.entrypoints_count as u32 * 8; + + let transitions = align_up(entrypoints + entrypoints_size, align); + + SectionOffsets { + str_blob, + regex_blob, + str_table, + regex_table, + node_types, + node_fields, + trivia, + type_defs, + type_members, + type_names, + entrypoints, + transitions, + } + } +} + +/// Round up to the next multiple of `align`. +fn align_up(value: u32, align: u32) -> u32 { + (value + align - 1) & !(align - 1) } diff --git a/crates/plotnik-lib/src/bytecode/header_tests.rs b/crates/plotnik-lib/src/bytecode/header_tests.rs index 3e65675..a2f90b3 100644 --- a/crates/plotnik-lib/src/bytecode/header_tests.rs +++ b/crates/plotnik-lib/src/bytecode/header_tests.rs @@ -20,21 +20,20 @@ fn header_roundtrip() { version: VERSION, checksum: 0x12345678, total_size: 1024, - str_blob_offset: 64, - str_table_offset: 128, - node_types_offset: 192, - node_fields_offset: 256, - trivia_offset: 320, - type_meta_offset: 384, - entrypoints_offset: 448, - transitions_offset: 512, + str_blob_size: 100, + regex_blob_size: 256, str_table_count: 10, + regex_table_count: 3, node_types_count: 20, node_fields_count: 5, trivia_count: 2, + type_defs_count: 8, + type_members_count: 12, + type_names_count: 4, entrypoints_count: 1, transitions_count: 15, - ..Default::default() + flags: 0, + _reserved: [0; 18], }; let bytes = h.to_bytes(); @@ -69,3 +68,59 @@ fn 
header_flags_roundtrip() { assert!(decoded.is_linked()); assert_eq!(decoded.flags, flags::LINKED); } + +#[test] +fn compute_offsets_empty() { + let h = Header::default(); + let offsets = h.compute_offsets(); + + // New order: blobs first, then tables + // All sections 64-byte aligned. With 0 counts, each table still has 1 sentinel entry (4 bytes) + assert_eq!(offsets.str_blob, 64); // after header + assert_eq!(offsets.regex_blob, 64); // 64 + align(0) = 64 + assert_eq!(offsets.str_table, 64); // 64 + align(0) = 64 + assert_eq!(offsets.regex_table, 128); // 64 + align(4) = 128 + assert_eq!(offsets.node_types, 192); // 128 + align(4) = 192 + assert_eq!(offsets.node_fields, 192); // 192 + align(0) = 192 + assert_eq!(offsets.trivia, 192); + assert_eq!(offsets.type_defs, 192); + assert_eq!(offsets.type_members, 192); + assert_eq!(offsets.type_names, 192); + assert_eq!(offsets.entrypoints, 192); + assert_eq!(offsets.transitions, 192); +} + +#[test] +fn compute_offsets_with_data() { + let h = Header { + str_table_count: 5, // (5+1)*4 = 24 bytes + regex_table_count: 2, // (2+1)*4 = 12 bytes + node_types_count: 10, // 10*4 = 40 bytes + node_fields_count: 5, // 5*4 = 20 bytes + trivia_count: 3, // 3*2 = 6 bytes + type_defs_count: 8, // 8*4 = 32 bytes + type_members_count: 12, // 12*4 = 48 bytes + type_names_count: 4, // 4*4 = 16 bytes + entrypoints_count: 2, // 2*8 = 16 bytes + transitions_count: 20, // 20*8 = 160 bytes + str_blob_size: 100, + regex_blob_size: 128, + ..Default::default() + }; + + let offsets = h.compute_offsets(); + + // New order: blobs first, then tables. All offsets 64-byte aligned. 
+ assert_eq!(offsets.str_blob, 64); // header end + assert_eq!(offsets.regex_blob, 192); // 64 + 100 = 164 → 192 + assert_eq!(offsets.str_table, 320); // 192 + 128 = 320 (aligned) + assert_eq!(offsets.regex_table, 384); // 320 + 24 = 344 → 384 + assert_eq!(offsets.node_types, 448); // 384 + 12 = 396 → 448 + assert_eq!(offsets.node_fields, 512); // 448 + 40 = 488 → 512 + assert_eq!(offsets.trivia, 576); // 512 + 20 = 532 → 576 + assert_eq!(offsets.type_defs, 640); // 576 + 6 = 582 → 640 + assert_eq!(offsets.type_members, 704); // 640 + 32 = 672 → 704 + assert_eq!(offsets.type_names, 768); // 704 + 48 = 752 → 768 + assert_eq!(offsets.entrypoints, 832); // 768 + 16 = 784 → 832 + assert_eq!(offsets.transitions, 896); // 832 + 16 = 848 → 896 +} diff --git a/crates/plotnik-lib/src/bytecode/instructions.rs b/crates/plotnik-lib/src/bytecode/instructions.rs index 1bd85a0..2925bb2 100644 --- a/crates/plotnik-lib/src/bytecode/instructions.rs +++ b/crates/plotnik-lib/src/bytecode/instructions.rs @@ -147,6 +147,8 @@ pub struct Match<'a> { neg_count: u8, post_count: u8, succ_count: u8, + /// Whether this instruction has a predicate (4-byte payload). 
+ has_predicate: bool, } impl<'a> Match<'a> { @@ -173,11 +175,18 @@ impl<'a> Match<'a> { let node_type = NodeTypeIR::from_bytes(node_kind, node_type_val); let node_field = NonZeroU16::new(u16::from_le_bytes([bytes[4], bytes[5]])); - let (is_match8, match8_next, pre_count, neg_count, post_count, succ_count) = + let (is_match8, match8_next, pre_count, neg_count, post_count, succ_count, has_predicate) = if opcode == Opcode::Match8 { let next = u16::from_le_bytes([bytes[6], bytes[7]]); - (true, next, 0, 0, 0, if next == 0 { 0 } else { 1 }) + (true, next, 0, 0, 0, if next == 0 { 0 } else { 1 }, false) } else { + // counts field layout (16 bits): + // bits 15-13: pre_count (3) + // bits 12-10: neg_count (3) + // bits 9-7: post_count (3) + // bits 6-2: succ_count (5, max 31) + // bit 1: has_predicate + // bit 0: reserved let counts = u16::from_le_bytes([bytes[6], bytes[7]]); ( false, @@ -185,7 +194,8 @@ impl<'a> Match<'a> { ((counts >> 13) & 0x7) as u8, ((counts >> 10) & 0x7) as u8, ((counts >> 7) & 0x7) as u8, - ((counts >> 1) & 0x3F) as u8, + ((counts >> 2) & 0x1F) as u8, + (counts >> 1) & 0x1 != 0, ) }; @@ -201,6 +211,7 @@ impl<'a> Match<'a> { neg_count, post_count, succ_count, + has_predicate, } } @@ -284,10 +295,20 @@ impl<'a> Match<'a> { (0..self.succ_count as usize).map(move |i| self.successor(i)) } + /// Whether this instruction has a predicate (text filter). + #[inline] + pub fn has_predicate(&self) -> bool { + self.has_predicate + } + /// Byte offset where successors start in the payload. + /// Accounts for predicate (4 bytes) if present. 
#[inline] fn succ_offset(&self) -> usize { - 8 + (self.pre_count as usize + self.neg_count as usize + self.post_count as usize) * 2 + let effects_size = + (self.pre_count as usize + self.neg_count as usize + self.post_count as usize) * 2; + let predicate_size = if self.has_predicate { 4 } else { 0 }; + 8 + effects_size + predicate_size } } diff --git a/crates/plotnik-lib/src/bytecode/ir.rs b/crates/plotnik-lib/src/bytecode/ir.rs index e9cc34a..55bce7b 100644 --- a/crates/plotnik-lib/src/bytecode/ir.rs +++ b/crates/plotnik-lib/src/bytecode/ir.rs @@ -530,20 +530,23 @@ impl MatchIR { let neg_count = self.neg_fields.len(); let post_count = self.post_effects.len(); let succ_count = self.successors.len(); + let has_predicate = false; // TODO: predicates not yet implemented - // Validate bit-packed field limits (3 bits for counts, 6 bits for successors) + // Validate bit-packed field limits + // counts layout: pre(3) | neg(3) | post(3) | succ(5) | has_pred(1) | reserved(1) assert!( pre_count <= 7, "pre_effects overflow: {pre_count} > 7 (use emit_match_with_cascade)" ); assert!(neg_count <= 7, "neg_fields overflow: {neg_count} > 7"); assert!(post_count <= 7, "post_effects overflow: {post_count} > 7"); - assert!(succ_count <= 63, "successors overflow: {succ_count} > 63"); + assert!(succ_count <= 31, "successors overflow: {succ_count} > 31"); let counts = ((pre_count as u16) << 13) | ((neg_count as u16) << 10) | ((post_count as u16) << 7) - | ((succ_count as u16) << 1); + | ((succ_count as u16) << 2) + | ((has_predicate as u16) << 1); bytes[6..8].copy_from_slice(&counts.to_le_bytes()); let mut offset = 8; diff --git a/crates/plotnik-lib/src/bytecode/mod.rs b/crates/plotnik-lib/src/bytecode/mod.rs index 0eca50e..1d989e4 100644 --- a/crates/plotnik-lib/src/bytecode/mod.rs +++ b/crates/plotnik-lib/src/bytecode/mod.rs @@ -22,13 +22,13 @@ pub use constants::{ pub use ids::{StringId, TypeId}; -pub use header::{Header, flags}; +pub use header::{flags, Header, SectionOffsets}; pub 
use sections::{FieldSymbol, NodeSymbol, Slice, TriviaEntry}; pub use entrypoint::Entrypoint; -pub use type_meta::{TypeData, TypeDef, TypeKind, TypeMember, TypeMetaHeader, TypeName}; +pub use type_meta::{TypeData, TypeDef, TypeKind, TypeMember, TypeName}; pub use nav::Nav; @@ -40,8 +40,8 @@ pub use instructions::{ }; pub use module::{ - ByteStorage, EntrypointsView, Instruction, Module, ModuleError, StringsView, SymbolsView, - TriviaView, TypesView, + ByteStorage, EntrypointsView, Instruction, Module, ModuleError, RegexView, StringsView, + SymbolsView, TriviaView, TypesView, }; pub use dump::dump; diff --git a/crates/plotnik-lib/src/bytecode/module.rs b/crates/plotnik-lib/src/bytecode/module.rs index 79c538a..ebfc3d3 100644 --- a/crates/plotnik-lib/src/bytecode/module.rs +++ b/crates/plotnik-lib/src/bytecode/module.rs @@ -7,12 +7,12 @@ use std::io; use std::ops::Deref; use std::path::Path; -use super::header::Header; +use super::header::{Header, SectionOffsets}; use super::ids::{StringId, TypeId}; use super::instructions::{Call, Match, Opcode, Return, Trampoline}; use super::sections::{FieldSymbol, NodeSymbol, TriviaEntry}; -use super::type_meta::{TypeData, TypeDef, TypeKind, TypeMember, TypeMetaHeader, TypeName}; -use super::{Entrypoint, SECTION_ALIGN, STEP_SIZE, VERSION}; +use super::type_meta::{TypeData, TypeDef, TypeKind, TypeMember, TypeName}; +use super::{Entrypoint, STEP_SIZE, VERSION}; /// Read a little-endian u16 from bytes at the given offset. #[inline] @@ -113,6 +113,8 @@ pub enum ModuleError { pub struct Module { storage: ByteStorage, header: Header, + /// Cached section offsets (computed from header counts). + offsets: SectionOffsets, } impl Module { @@ -148,7 +150,14 @@ impl Module { }); } - Ok(Self { storage, header }) + // Compute all section offsets from header counts and blob sizes + let offsets = header.compute_offsets(); + + Ok(Self { + storage, + header, + offsets, + }) } /// Get the parsed header. 
@@ -156,6 +165,11 @@ impl Module { &self.header } + /// Get the computed section offsets. + pub fn offsets(&self) -> &SectionOffsets { + &self.offsets + } + /// Get the raw bytes. pub fn bytes(&self) -> &[u8] { &self.storage @@ -164,21 +178,21 @@ impl Module { /// Decode an instruction at the given step index. #[inline] pub fn decode_step(&self, step: u16) -> Instruction<'_> { - let offset = self.header.transitions_offset as usize + (step as usize) * STEP_SIZE; + let offset = self.offsets.transitions as usize + (step as usize) * STEP_SIZE; Instruction::from_bytes(&self.storage[offset..]) } /// Get a view into the string table. pub fn strings(&self) -> StringsView<'_> { StringsView { - blob: &self.storage[self.header.str_blob_offset as usize..], + blob: &self.storage[self.offsets.str_blob as usize..], table: self.string_table_slice(), } } /// Get a view into the node type symbols. pub fn node_types(&self) -> SymbolsView<'_, NodeSymbol> { - let offset = self.header.node_types_offset as usize; + let offset = self.offsets.node_types as usize; let count = self.header.node_types_count as usize; SymbolsView { bytes: &self.storage[offset..offset + count * 4], @@ -189,7 +203,7 @@ impl Module { /// Get a view into the node field symbols. pub fn node_fields(&self) -> SymbolsView<'_, FieldSymbol> { - let offset = self.header.node_fields_offset as usize; + let offset = self.offsets.node_fields as usize; let count = self.header.node_fields_count as usize; SymbolsView { bytes: &self.storage[offset..offset + count * 4], @@ -200,7 +214,7 @@ impl Module { /// Get a view into the trivia entries. pub fn trivia(&self) -> TriviaView<'_> { - let offset = self.header.trivia_offset as usize; + let offset = self.offsets.trivia as usize; let count = self.header.trivia_count as usize; TriviaView { bytes: &self.storage[offset..offset + count * 2], @@ -208,18 +222,22 @@ impl Module { } } + /// Get a view into the regex table. 
+ pub fn regexes(&self) -> RegexView<'_> { + RegexView { + blob: &self.storage[self.offsets.regex_blob as usize..], + table: self.regex_table_slice(), + } + } + /// Get a view into the type metadata. pub fn types(&self) -> TypesView<'_> { - let meta_offset = self.header.type_meta_offset as usize; - let meta_header = TypeMetaHeader::from_bytes(&self.storage[meta_offset..]); - - // Sub-section offsets (each aligned to 64-byte boundary) - let defs_offset = align64(meta_offset + 8); - let defs_count = meta_header.type_defs_count as usize; - let members_offset = align64(defs_offset + defs_count * 4); - let members_count = meta_header.type_members_count as usize; - let names_offset = align64(members_offset + members_count * 4); - let names_count = meta_header.type_names_count as usize; + let defs_offset = self.offsets.type_defs as usize; + let defs_count = self.header.type_defs_count as usize; + let members_offset = self.offsets.type_members as usize; + let members_count = self.header.type_members_count as usize; + let names_offset = self.offsets.type_names as usize; + let names_count = self.header.type_names_count as usize; TypesView { defs_bytes: &self.storage[defs_offset..defs_offset + defs_count * 4], @@ -233,7 +251,7 @@ impl Module { /// Get a view into the entrypoints. pub fn entrypoints(&self) -> EntrypointsView<'_> { - let offset = self.header.entrypoints_offset as usize; + let offset = self.offsets.entrypoints as usize; let count = self.header.entrypoints_count as usize; EntrypointsView { bytes: &self.storage[offset..offset + count * 8], @@ -241,22 +259,20 @@ impl Module { } } - // Helper to get string table as bytes - // The table has count+1 entries (includes sentinel for length calculation) + /// Helper to get string table as bytes. + /// The table has count+1 entries (includes sentinel for length calculation). 
fn string_table_slice(&self) -> &[u8] { - let offset = self.header.str_table_offset as usize; + let offset = self.offsets.str_table as usize; let count = self.header.str_table_count as usize; &self.storage[offset..offset + (count + 1) * 4] } -} -/// Align offset to 64-byte boundary. -fn align64(offset: usize) -> usize { - let rem = offset % SECTION_ALIGN; - if rem == 0 { - offset - } else { - offset + SECTION_ALIGN - rem + /// Helper to get regex table as bytes. + /// The table has count+1 entries (includes sentinel for length calculation). + fn regex_table_slice(&self) -> &[u8] { + let offset = self.offsets.regex_table as usize; + let count = self.header.regex_table_count as usize; + &self.storage[offset..offset + (count + 1) * 4] } } @@ -363,9 +379,27 @@ impl<'a> TriviaView<'a> { } } +/// View into the regex table for lazy DFA lookup. +pub struct RegexView<'a> { + blob: &'a [u8], + table: &'a [u8], +} + +impl<'a> RegexView<'a> { + /// Get regex blob bytes by index. + /// + /// Returns the raw DFA bytes for the regex at the given index. + /// Use `regex-automata` to deserialize: `DFA::from_bytes(&bytes)`. + pub fn get_by_index(&self, idx: usize) -> &'a [u8] { + let start = read_u32_le(self.table, idx * 4) as usize; + let end = read_u32_le(self.table, (idx + 1) * 4) as usize; + &self.blob[start..end] + } +} + /// View into type metadata. /// -/// The TypeMeta section contains three sub-sections: +/// Types are stored in three sub-sections: /// - TypeDefs: structural topology (4 bytes each) /// - TypeMembers: fields and variants (4 bytes each) /// - TypeNames: name → TypeId mapping (4 bytes each) diff --git a/crates/plotnik-lib/src/bytecode/type_meta.rs b/crates/plotnik-lib/src/bytecode/type_meta.rs index 25b090b..81aa893 100644 --- a/crates/plotnik-lib/src/bytecode/type_meta.rs +++ b/crates/plotnik-lib/src/bytecode/type_meta.rs @@ -13,67 +13,6 @@ impl TypeKind { pub const ARRAY_PLUS: Self = Self::ArrayOneOrMore; } -/// TypeMeta section header (8 bytes). 
-/// -/// Contains counts for the three sub-sections. Located at `type_meta_offset`. -#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] -#[repr(C)] -pub struct TypeMetaHeader { - /// Number of TypeDef entries. - pub(crate) type_defs_count: u16, - /// Number of TypeMember entries. - pub(crate) type_members_count: u16, - /// Number of TypeName entries. - pub(crate) type_names_count: u16, - /// Padding for alignment. - pub(crate) _pad: u16, -} - -const _: () = assert!(std::mem::size_of::<TypeMetaHeader>() == 8); - -impl TypeMetaHeader { - /// Create a new header. - pub fn new(type_defs_count: u16, type_members_count: u16, type_names_count: u16) -> Self { - Self { - type_defs_count, - type_members_count, - type_names_count, - _pad: 0, - } - } - - /// Decode from 8 bytes. - pub fn from_bytes(bytes: &[u8]) -> Self { - assert!(bytes.len() >= 8, "TypeMetaHeader too short"); - Self { - type_defs_count: u16::from_le_bytes([bytes[0], bytes[1]]), - type_members_count: u16::from_le_bytes([bytes[2], bytes[3]]), - type_names_count: u16::from_le_bytes([bytes[4], bytes[5]]), - _pad: 0, - } - } - - /// Encode to 8 bytes. - pub fn to_bytes(&self) -> [u8; 8] { - let mut bytes = [0u8; 8]; - bytes[0..2].copy_from_slice(&self.type_defs_count.to_le_bytes()); - bytes[2..4].copy_from_slice(&self.type_members_count.to_le_bytes()); - bytes[4..6].copy_from_slice(&self.type_names_count.to_le_bytes()); - // _pad is always 0 - bytes - } - - pub fn type_defs_count(&self) -> u16 { - self.type_defs_count - } - pub fn type_members_count(&self) -> u16 { - self.type_members_count - } - pub fn type_names_count(&self) -> u16 { - self.type_names_count - } -} - /// Type definition entry (4 bytes). 
/// /// Semantics of `data` and `count` depend on `kind`: diff --git a/crates/plotnik-lib/src/bytecode/type_meta_tests.rs b/crates/plotnik-lib/src/bytecode/type_meta_tests.rs index ce7ee68..97219b2 100644 --- a/crates/plotnik-lib/src/bytecode/type_meta_tests.rs +++ b/crates/plotnik-lib/src/bytecode/type_meta_tests.rs @@ -1,18 +1,5 @@ use super::*; -#[test] -fn type_meta_header_size() { - assert_eq!(std::mem::size_of::(), 8); -} - -#[test] -fn type_meta_header_roundtrip() { - let header = TypeMetaHeader::new(42, 100, 5); - let bytes = header.to_bytes(); - let decoded = TypeMetaHeader::from_bytes(&bytes); - assert_eq!(decoded, header); -} - #[test] fn type_def_size() { assert_eq!(std::mem::size_of::(), 4); diff --git a/crates/plotnik-lib/src/emit/emitter.rs b/crates/plotnik-lib/src/emit/emitter.rs index 7157790..22af8f0 100644 --- a/crates/plotnik-lib/src/emit/emitter.rs +++ b/crates/plotnik-lib/src/emit/emitter.rs @@ -8,9 +8,7 @@ use plotnik_core::{Interner, NodeFieldId, NodeTypeId, Symbol}; use crate::analyze::symbol_table::SymbolTable; use crate::analyze::type_check::{TypeContext, TypeId}; use crate::bytecode::Label; -use crate::bytecode::{ - Entrypoint, FieldSymbol, Header, NodeSymbol, SECTION_ALIGN, TriviaEntry, TypeMetaHeader, -}; +use crate::bytecode::{Entrypoint, FieldSymbol, Header, NodeSymbol, SECTION_ALIGN, TriviaEntry}; use crate::compile::Compiler; use crate::query::LinkedQuery; @@ -120,6 +118,10 @@ fn emit_inner( // Trivia (empty for now) let trivia_entries: Vec = Vec::new(); + // Regex (empty for now - predicates not yet implemented) + let regex_table: Vec = vec![0, 0, 0, 0]; // sentinel: single u32 = 0 + let regex_blob: Vec = Vec::new(); + // Resolve and serialize transitions let transitions_bytes = emit_transitions(&compile_result.instructions, &layout, &types, &strings); @@ -133,51 +135,42 @@ fn emit_inner( let trivia_bytes = emit_trivia(&trivia_entries); let entrypoints_bytes = emit_entrypoints(&entrypoints); - // Build output with sections + // 
Build output with sections in v2 order: + // Header → StringBlob → RegexBlob → StringTable → RegexTable → + // NodeTypes → NodeFields → Trivia → TypeDefs → TypeMembers → + // TypeNames → Entrypoints → Transitions let mut output = vec![0u8; 64]; // Reserve header space - let str_blob_offset = emit_section(&mut output, &str_blob); - let str_table_offset = emit_section(&mut output, &str_table); - let node_types_offset = emit_section(&mut output, &node_types_bytes); - let node_fields_offset = emit_section(&mut output, &node_fields_bytes); - let trivia_offset = emit_section(&mut output, &trivia_bytes); - - // Type metadata section (header + 3 aligned sub-sections) - let type_meta_offset = emit_section( - &mut output, - &TypeMetaHeader::new( - types.type_defs_count() as u16, - types.type_members_count() as u16, - types.type_names_count() as u16, - ) - .to_bytes(), - ); + emit_section(&mut output, &str_blob); + emit_section(&mut output, &regex_blob); + emit_section(&mut output, &str_table); + emit_section(&mut output, &regex_table); + emit_section(&mut output, &node_types_bytes); + emit_section(&mut output, &node_fields_bytes); + emit_section(&mut output, &trivia_bytes); emit_section(&mut output, &type_defs_bytes); emit_section(&mut output, &type_members_bytes); emit_section(&mut output, &type_names_bytes); - - let entrypoints_offset = emit_section(&mut output, &entrypoints_bytes); - let transitions_offset = emit_section(&mut output, &transitions_bytes); + emit_section(&mut output, &entrypoints_bytes); + emit_section(&mut output, &transitions_bytes); pad_to_section(&mut output); let total_size = output.len() as u32; - // Build and write header + // Build header (offsets computed from counts and blob sizes) let mut header = Header { - str_blob_offset, - str_table_offset, - node_types_offset, - node_fields_offset, - trivia_offset, - type_meta_offset, - entrypoints_offset, - transitions_offset, str_table_count: strings.len() as u16, node_types_count: node_symbols.len() as u16, 
node_fields_count: field_symbols.len() as u16, trivia_count: trivia_entries.len() as u16, + regex_table_count: 0, // no regexes yet + type_defs_count: types.type_defs_count() as u16, + type_members_count: types.type_members_count() as u16, + type_names_count: types.type_names_count() as u16, entrypoints_count: entrypoints.len() as u16, transitions_count: layout.total_steps, + str_blob_size: str_blob.len() as u32, + regex_blob_size: regex_blob.len() as u32, total_size, ..Default::default() }; @@ -234,11 +227,9 @@ fn emit_transitions( bytes } -fn emit_section(output: &mut Vec, data: &[u8]) -> u32 { +fn emit_section(output: &mut Vec, data: &[u8]) { pad_to_section(output); - let offset = output.len() as u32; output.extend_from_slice(data); - offset } fn emit_node_symbols(symbols: &[NodeSymbol]) -> Vec { diff --git a/docs/binary-format/01-overview.md b/docs/binary-format/01-overview.md index 8c1d1ba..4658b41 100644 --- a/docs/binary-format/01-overview.md +++ b/docs/binary-format/01-overview.md @@ -1,12 +1,12 @@ # Binary Format: Overview -64-byte Header + 8 aligned Sections. +64-byte Header + 13 Sections. All sections 64-byte aligned. Offsets computed from counts. ## Architecture - **Alignment**: Sections start on 64-byte boundaries; internal structures align to natural size (2/4/8 bytes) - **Sequential**: Fixed order for single-pass writing -- **Endianness**: Little Endian +- **Endianness**: Little-endian throughout - **Limits**: All indices u16 (max 65,535). Transitions: 512 KB max. Use `Call` to share patterns. ### Addressing @@ -18,83 +18,93 @@ | `TypeId` (u16) | Type Definition index | | `NodeTypeId` (u16) | Tree-sitter node type ID | | `NodeFieldId` (u16) | Tree-sitter field ID | - -## Memory Layout - -Section offsets defined in Header for robust parsing. 
- -| Section | Content | Record Size | -| ------------- | ------------------------ | ----------- | -| Header | Meta | 64 | -| [StringBlob] | UTF-8 | 1 | -| [StringTable] | StringId → Offset+Length | 4 | -| [NodeTypes] | NodeTypeId → StringId | 4 | -| [NodeFields] | NodeFieldId → StringId | 4 | -| [Trivia] | List of NodeTypeId | 2 | -| [TypeMeta] | Types (3 sub-sections) | 4 | -| [Entrypoints] | Definitions | 8 | -| [Transitions] | Tree walking graph | 8 | - -**TypeMeta sub-sections** (contiguous, offsets computed from counts): - -- **TypeDefs**: Structural topology -- **TypeMembers**: Fields and variants -- **TypeNames**: Name → TypeId mapping +| `RegexId` (u16) | Regex Table index | + +## Section Layout + +Sections appear in fixed order, each starting on a 64-byte boundary: + +| # | Section | Record Size | Count Source | +| -- | ------------- | ----------- | --------------------- | +| 0 | Header | 64 bytes | (fixed) | +| 1 | [StringBlob] | 1 | `str_blob_size` | +| 2 | [RegexBlob] | 1 | `regex_blob_size` | +| 3 | [StringTable] | 4 | `str_table_count + 1` | +| 4 | [RegexTable] | 4 | `regex_table_count + 1` | +| 5 | [NodeTypes] | 4 | `node_types_count` | +| 6 | [NodeFields] | 4 | `node_fields_count` | +| 7 | [Trivia] | 2 | `trivia_count` | +| 8 | [TypeDefs] | 4 | `type_defs_count` | +| 9 | [TypeMembers] | 4 | `type_members_count` | +| 10 | [TypeNames] | 4 | `type_names_count` | +| 11 | [Entrypoints] | 8 | `entrypoints_count` | +| 12 | [Transitions] | 8 | `transitions_count` | [StringBlob]: 02-strings.md [StringTable]: 02-strings.md +[RegexBlob]: 03-symbols.md#1-regex +[RegexTable]: 03-symbols.md#1-regex [NodeTypes]: 03-symbols.md [NodeFields]: 03-symbols.md [Trivia]: 03-symbols.md -[TypeMeta]: 04-types.md +[TypeDefs]: 04-types.md +[TypeMembers]: 04-types.md +[TypeNames]: 04-types.md [Entrypoints]: 05-entrypoints.md [Transitions]: 06-transitions.md -## Header +### Sentinel Pattern + +StringTable and RegexTable use `count + 1` entries. 
The final entry stores the blob size, enabling O(1) length calculation: `length[i] = table[i+1] - table[i]`. -First 64 bytes: magic (`PTKQ`), version (1), CRC32 checksum, section offsets. +### Offset Computation + +Section offsets are not stored in the header. Loaders compute them by: + +1. Start after header (offset 64) +2. For each section in order: + - Current offset = previous section end, rounded up to 64-byte boundary + - Section size = count × record size (or explicit size for blobs) +3. Blob sizes come from header: `str_blob_size` and `regex_blob_size` + +## Header (v2) ```rust #[repr(C, align(64))] struct Header { + // Bytes 0-23: Identity and sizes (6 × u32) magic: [u8; 4], // b"PTKQ" - version: u32, // 1 - checksum: u32, // CRC32 - total_size: u32, // Total file size in bytes - - // Section Offsets (Absolute byte offsets) - str_blob_offset: u32, - str_table_offset: u32, - node_types_offset: u32, - node_fields_offset: u32, - trivia_offset: u32, - type_meta_offset: u32, // Points to TypeMeta header (see 04-types.md) - entrypoints_offset: u32, - transitions_offset: u32, - - // Element Counts + version: u32, // 2 + checksum: u32, // CRC32 of everything after header + total_size: u32, + str_blob_size: u32, + regex_blob_size: u32, + + // Bytes 24-45: Element counts (11 × u16) — order matches section order str_table_count: u16, + regex_table_count: u16, node_types_count: u16, node_fields_count: u16, trivia_count: u16, + type_defs_count: u16, + type_members_count: u16, + type_names_count: u16, entrypoints_count: u16, transitions_count: u16, - flags: u16, // Bit 0: linked flag - _pad: u16, + flags: u16, + + // Bytes 46-63: Reserved + _reserved: [u8; 18], } -// Size: 16 + 32 + 16 = 64 bytes -// -// Note: TypeMeta sub-section counts are stored in the TypeMeta header, -// not in the main header. See 04-types.md for details. 
``` -### Flags Field +### Flags | Bit | Name | Description | | --- | ------ | -------------------------------------------------------- | -| 0 | LINKED | If set, bytecode contains grammar NodeTypeId/NodeFieldId | +| 0 | LINKED | If set, bytecode contains resolved NodeTypeId/NodeFieldId | -**Linked vs Unlinked Bytecode**: +**Linked vs Unlinked**: -- **Linked** (`flags & 0x01 != 0`): Match instructions store tree-sitter `NodeTypeId` and `NodeFieldId` in bytes 2-5. Executable directly. NodeTypes and NodeFields sections contain symbol tables for verification. -- **Unlinked** (`flags & 0x01 == 0`): Match instructions store `StringId` references in bytes 2-5 pointing to type/field names in the string table. Requires linking against a grammar before execution. NodeTypes and NodeFields sections are empty. +- **Linked**: Match instructions store tree-sitter `NodeTypeId` and `NodeFieldId` directly. Executable immediately. +- **Unlinked**: Match instructions store `StringId` references. Requires linking against a grammar before execution. diff --git a/docs/binary-format/02-strings.md b/docs/binary-format/02-strings.md index 0cfdc69..fdd9b44 100644 --- a/docs/binary-format/02-strings.md +++ b/docs/binary-format/02-strings.md @@ -16,7 +16,8 @@ This reservation has a practical purpose: since Match instructions use `0` to in Contains the raw UTF-8 bytes for all strings concatenated together. -- **Section Offset**: `header.str_blob_offset` +- **Section Offset**: Computed (first section after header, at offset 64) +- **Size**: `header.str_blob_size` - **Content**: Raw bytes. Strings are **not** null-terminated. - **Padding**: The section is padded to a 64-byte boundary at the end. @@ -24,7 +25,7 @@ Contains the raw UTF-8 bytes for all strings concatenated together. Lookup table mapping `StringId` to byte offsets within the String Blob. 
-- **Section Offset**: `header.str_table_offset` +- **Section Offset**: Computed (follows RegexBlob, 64-byte aligned) - **Record Size**: 4 bytes (`u32`). - **Capacity**: `header.str_table_count + 1` entries. - The table contains one extra entry at the end representing the total size of the unpadded blob. diff --git a/docs/binary-format/03-symbols.md b/docs/binary-format/03-symbols.md index 1521e25..029da3d 100644 --- a/docs/binary-format/03-symbols.md +++ b/docs/binary-format/03-symbols.md @@ -1,52 +1,78 @@ # Binary Format: Symbols -This section defines the symbol tables used to map external Tree-sitter IDs to internal string representations, and to define trivia kinds. +Symbol tables map external tree-sitter IDs to internal string names. -## 1. Node Types +## 1. Regex -A mapping from Tree-sitter's internal `u16` node type ID to a `StringId` in the query's string table. This allows the runtime to verify node kinds by name or display them for debugging. +Precompiled DFA patterns for predicate matching. Uses the sentinel pattern like StringTable. -- **Section Offset**: `header.node_types_offset` +### RegexBlob + +- **Section Offset**: Computed (follows StringBlob) +- **Size**: `header.regex_blob_size` + +Contains concatenated serialized DFAs (from `regex-automata`). Each DFA is deserialized via `DFA::from_bytes()` for O(1) loading. + +### RegexTable + +- **Section Offset**: Computed (follows StringTable) +- **Record Size**: 4 bytes (`u32`) +- **Count**: `header.regex_table_count + 1` + +Each entry is a byte offset into RegexBlob. The final entry is the blob size. + +To retrieve regex `i`: +1. `start = table[i]` +2. `end = table[i+1]` +3. `bytes = blob[start..end]` + +## 2. Node Types + +Maps tree-sitter node type IDs to their string names. 
+ +- **Section Offset**: Computed (follows RegexTable) - **Record Size**: 4 bytes - **Count**: `header.node_types_count` ```rust #[repr(C)] struct NodeSymbol { - id: u16, // Tree-sitter Node Type ID + id: u16, // Tree-sitter node type ID name: u16, // StringId } ``` -## 2. Node Fields +In **linked** bytecode, this table enables name lookup for debugging and error messages. In **unlinked** bytecode, this section is empty. + +## 3. Node Fields -A mapping from Tree-sitter's internal `u16` field ID to a `StringId`. Used for field verification during matching. +Maps tree-sitter field IDs to their string names. -- **Section Offset**: `header.node_fields_offset` +- **Section Offset**: Computed (follows NodeTypes) - **Record Size**: 4 bytes - **Count**: `header.node_fields_count` ```rust #[repr(C)] struct FieldSymbol { - id: u16, // Tree-sitter Field ID + id: u16, // Tree-sitter field ID name: u16, // StringId } ``` -## 3. Trivia +## 4. Trivia -A list of node type IDs that are considered "trivia" (e.g., whitespace, comments). The runtime uses this list when executing navigation commands like `NextSkipTrivia` or `DownSkipTrivia`. +Node types considered "trivia" (whitespace, comments). The runtime skips these during navigation with `NextSkip`, `DownSkip`, etc. -- **Section Offset**: `header.trivia_offset` +- **Section Offset**: Computed (follows NodeFields) - **Record Size**: 2 bytes - **Count**: `header.trivia_count` ```rust #[repr(C)] struct TriviaEntry { - node_type: u16, // Tree-sitter Node Type ID + node_type: u16, // Tree-sitter node type ID } ``` -The list is not required to be sorted. Runtimes should build a lookup structure (e.g., bitset indexed by node type) on load for O(1) trivia checks. +Unsorted. Loaders should build a lookup structure (e.g., bitset indexed by node type) for O(1) trivia checks. 
diff --git a/docs/binary-format/04-types.md b/docs/binary-format/04-types.md index da14241..a177d16 100644 --- a/docs/binary-format/04-types.md +++ b/docs/binary-format/04-types.md @@ -1,87 +1,44 @@ # Binary Format: Type Metadata -This section defines the type system metadata used for code generation and runtime validation. It allows consumers to understand the shape of the data extracted by the query. +Type system metadata for code generation and runtime validation. Describes the shape of data extracted by queries. -## 1. Primitives +## Primitives -**TypeId (u16)**: Zero-based index into the TypeDefs array. All types, including primitives, are stored as TypeDef entries. +**TypeId (u16)**: Zero-based index into TypeDefs. All types, including primitives, are stored as TypeDef entries. + +**TypeKind (u8)**: Discriminator for TypeDef. + +| Value | Kind | Description | +| ----- | --------------- | ------------------------------ | +| 0 | `Void` | Unit type, captures nothing | +| 1 | `Node` | AST node reference | +| 2 | `String` | Source text | +| 3 | `Optional` | Wraps another type | +| 4 | `ArrayZeroOrMore` | Zero or more (T*) | +| 5 | `ArrayOneOrMore` | One or more (T+) | +| 6 | `Struct` | Record with named fields | +| 7 | `Enum` | Discriminated union | +| 8 | `Alias` | Named reference to another type | ### Node Semantics -The `Node` type (`TypeKind = 1`) represents a platform-dependent handle to a tree-sitter AST node: +The `Node` type represents a platform-dependent handle to a tree-sitter AST node: | Context | Representation | | :--------- | :--------------------------------------------------------- | | Rust | `tree_sitter::Node<'tree>` (lifetime-bound reference) | | TypeScript | Binding-provided object with `startPosition`, `text`, etc. | -| JSON | Unique node identifier (e.g., `"node:42"` or path-based) | +| JSON | Unique node identifier (e.g., `"node:42"`) | -The handle provides access to node metadata (kind, span, text) without copying the source. 
Lifetime management is platform-specific — Rust enforces it statically, bindings may use reference counting or arena allocation. +## Sections -**TypeKind (u8)**: Discriminator for `TypeDef`. +Three separate sections store type metadata. Counts are in the main header. -- `0`: `Void` (Unit type, captures nothing) -- `1`: `Node` (AST node reference) -- `2`: `String` (Source text) -- `3`: `Optional` (Wraps another type) -- `4`: `ArrayZeroOrMore` (Zero or more, aka ArrayStar) -- `5`: `ArrayOneOrMore` (One or more, aka ArrayPlus) -- `6`: `Struct` (Record with named fields) -- `7`: `Enum` (Discriminated union) -- `8`: `Alias` (Named reference to another type, e.g., `@x :: Identifier`) +### TypeDefs -## 2. Layout - -The TypeMeta section begins with an 8-byte header containing sub-section counts, followed by three 64-byte aligned sub-sections: - -``` -type_meta_offset -│ -├─ TypeMetaHeader (8 bytes) -│ type_defs_count: u16 -│ type_members_count: u16 -│ type_names_count: u16 -│ _pad: u16 -│ -├─ [padding to 64-byte boundary] -│ -├─ TypeDefs[type_defs_count] (4 bytes each) -│ -├─ [padding to 64-byte boundary] -│ -├─ TypeMembers[type_members_count] (4 bytes each) -│ -├─ [padding to 64-byte boundary] -│ -└─ TypeNames[type_names_count] (4 bytes each) -``` - -```rust -#[repr(C)] -struct TypeMetaHeader { - type_defs_count: u16, - type_members_count: u16, - type_names_count: u16, - _pad: u16, -} -``` - -**Sub-section offsets** (each aligned to 64-byte boundary): - -- TypeDefs: `align64(type_meta_offset + 8)` -- TypeMembers: `align64(TypeDefs_offset + type_defs_count * 4)` -- TypeNames: `align64(TypeMembers_offset + type_members_count * 4)` - -This separation ensures: - -- No wasted space (anonymous types don't need name storage) -- Clean concerns (structure vs. naming) -- Uniform 4-byte records within each sub-section -- 64-byte alignment for cache-friendly access - -### 2.1. TypeDef (4 bytes) - -Describes the structure of a single type. 
+- **Section Offset**: Computed (follows Trivia) +- **Record Size**: 4 bytes +- **Count**: `header.type_defs_count` ```rust #[repr(C)] @@ -92,27 +49,27 @@ struct TypeDef { } ``` -**Semantics of `data` and `count` fields**: - -| Kind | `data` (u16) | `count` (u8) | Interpretation | -| :---------------- | :------------- | :------------- | :-------------------- | -| `Void` | 0 | 0 | Unit type | -| `Node` | 0 | 0 | AST node reference | -| `String` | 0 | 0 | Source text | -| `Optional` | `InnerTypeId` | 0 | Wrapper `T?` | -| `ArrayZeroOrMore` | `InnerTypeId` | 0 | Wrapper `T[]` | -| `ArrayOneOrMore` | `InnerTypeId` | 0 | Wrapper `[T, ...T[]]` | -| `Struct` | `MemberIndex` | `FieldCount` | Record with fields | -| `Enum` | `MemberIndex` | `VariantCount` | Discriminated union | -| `Alias` | `TargetTypeId` | 0 | Named type reference | +**Field semantics by kind**: -> **Note**: For primitives (`Void`, `Node`, `String`), `data` and `count` are unused. For wrappers and `Alias`, `data` is a `TypeId`. For `Struct` and `Enum`, `data` is an index into the TypeMembers section. Parsers must dispatch on `kind` first. +| Kind | `data` | `count` | +| :---------------- | :------------- | :------------- | +| `Void` | 0 | 0 | +| `Node` | 0 | 0 | +| `String` | 0 | 0 | +| `Optional` | Inner TypeId | 0 | +| `ArrayZeroOrMore` | Inner TypeId | 0 | +| `ArrayOneOrMore` | Inner TypeId | 0 | +| `Alias` | Target TypeId | 0 | +| `Struct` | MemberIndex | FieldCount | +| `Enum` | MemberIndex | VariantCount | -> **Limit**: `count` is u8, so structs/enums are limited to 255 members. +> **Limit**: `count` is u8, so composites are limited to 255 members. -### 2.2. TypeMember (4 bytes) +### TypeMembers -Describes a field in a struct or a variant in an enum. +- **Section Offset**: Computed (follows TypeDefs) +- **Record Size**: 4 bytes +- **Count**: `header.type_members_count` ```rust #[repr(C)] @@ -125,9 +82,11 @@ struct TypeMember { For struct fields: `name` is the field name, `ty` is the field's type. 
For enum variants: `name` is the variant tag, `ty` is the payload type (use `Void` for unit variants). -### 2.3. TypeName (4 bytes) +### TypeNames -Maps a name to a type. Only types that have names appear here. +- **Section Offset**: Computed (follows TypeMembers) +- **Record Size**: 4 bytes +- **Count**: `header.type_names_count` ```rust #[repr(C)] @@ -137,26 +96,21 @@ struct TypeName { } ``` -**Ordering**: Entries are sorted lexicographically by name (resolved via String Table) for binary search. +Sorted lexicographically by name (resolved via String Table) for binary search. **Usage**: +- Named definitions (`List = [...]`) get an entry +- Custom type annotations (`@x :: Identifier`) create an Alias TypeDef with an entry +- Anonymous types have no entry -- Named definitions (`List = [...]`) get an entry mapping "List" to their TypeId -- Custom type annotations (`@x :: Identifier`) create an Alias TypeDef, with an entry here -- Anonymous types (inline structs, wrappers) have no entry +## Examples -For code generation, build a reverse map (`TypeId → Option`) to look up names when emitting types. +> **Note**: Only **used** primitives are emitted to TypeDefs. The emitter writes them first in order (Void, Node, String), then composite types. -## 3. Examples - -> **Note**: In bytecode, only **used** primitives are emitted to TypeDefs. The emitter writes them first in order (Void, Node, String), then composite types. TypeId values depend on which primitives the query actually uses. - -### 3.1. Simple Struct +### Simple Struct Query: `Q = (function name: (identifier) @name)` -Run `plotnik dump -q ''` to see: - ``` [type_defs] T0 = @@ -169,14 +123,9 @@ M0: S1 → T0 ; name: N0: S2 → T1 ; Q ``` -- `T0` is the `Node` primitive (only used primitive is emitted) -- `T1` is a `Struct` with 1 member starting at `M0` -- `M0` maps "name" to type `T0` (Node) - -### 3.2. 
Recursive Enum +### Recursive Enum Query: - ``` List = [ Nil: (nil) @@ -184,8 +133,6 @@ List = [ ] ``` -Run `plotnik dump -q ''` to see: - ``` [type_defs] T0 = @@ -203,17 +150,10 @@ M3: S4 → T2 ; Cons: T2 N0: S5 → T3 ; List ``` -- `T0` (Void) and `T1` (Node) are primitives used by the query -- `T2` is the Cons payload struct with `head` and `tail` fields -- `T3` is the `List` enum with `Nil` and `Cons` variants -- `M1` shows `tail: List` — self-reference to `T3` - -### 3.3. Custom Type Annotation +### Custom Type Annotation Query: `Q = (identifier) @name :: Identifier` -Run `plotnik dump -q ''` to see: - ``` [type_defs] T0 = @@ -228,11 +168,7 @@ N0: S1 → T1 ; Identifier N1: S3 → T2 ; Q ``` -- `T0` is the underlying `Node` primitive -- `T1` is an `Alias` pointing to `T0`, named "Identifier" -- The `name` field has type `T1` (the alias), so code generators emit `Identifier` instead of `Node` - -## 4. Validation +## Validation Loaders must verify for `Struct`/`Enum` kinds: @@ -240,7 +176,7 @@ Loaders must verify for `Struct`/`Enum` kinds: This prevents out-of-bounds reads from malformed binaries. -## 5. Code Generation +## Code Generation To emit types (TypeScript, Rust, etc.): diff --git a/docs/binary-format/05-entrypoints.md b/docs/binary-format/05-entrypoints.md index e87796a..1ea92c9 100644 --- a/docs/binary-format/05-entrypoints.md +++ b/docs/binary-format/05-entrypoints.md @@ -4,7 +4,7 @@ This section defines the named entry points for the query. Every named definitio ## Layout -- **Section Offset**: `header.entrypoints_offset` +- **Section Offset**: Computed (follows TypeNames) - **Record Size**: 8 bytes - **Count**: `header.entrypoints_count` - **Ordering**: Entries **must** be sorted lexicographically by the UTF-8 content of their `name` (resolved via String Table). This enables binary search at runtime. 
diff --git a/docs/binary-format/06-transitions.md b/docs/binary-format/06-transitions.md index 29f1ab7..ab405b5 100644 --- a/docs/binary-format/06-transitions.md +++ b/docs/binary-format/06-transitions.md @@ -4,7 +4,7 @@ This section contains the Virtual Machine (VM) instructions. It is a heap of 8-b ## 1. Addressing -**StepId (u16)**: Zero-based index into this section. Byte offset = `header.transitions_offset + (StepId * 8)`. +**StepId (u16)**: Zero-based index into this section. Byte offset = `transitions_offset + (StepId * 8)` where `transitions_offset` is computed (follows Entrypoints). - **StepId 0 is reserved as the Terminal Sentinel.** Jumping to StepId 0 means the match is complete (Accept). - Limit: 65,536 steps (512 KB section size). @@ -187,37 +187,50 @@ struct MatchHeader { ```text counts (u16) -┌─────────┬─────────┬──────────┬──────────┬───┐ -│ pre (3) │ neg (3) │ post (3) │ succ (6) │ 0 │ -└─────────┴─────────┴──────────┴──────────┴───┘ - bits bits bits bits bit - 15-13 12-10 9-7 6-1 0 +┌─────────┬─────────┬──────────┬──────────┬───────┬───┐ +│ pre (3) │ neg (3) │ post (3) │ succ (5) │ pred │ 0 │ +└─────────┴─────────┴──────────┴──────────┴───────┴───┘ + bits bits bits bits bit bit + 15-13 12-10 9-7 6-2 1 0 ``` - **Bits 15-13**: `pre_count` (0-7) - **Bits 12-10**: `neg_count` (0-7) - **Bits 9-7**: `post_count` (0-7) -- **Bits 6-1**: `succ_count` (0-63) +- **Bits 6-2**: `succ_count` (0-31) +- **Bit 1**: `has_predicate` (if set, payload includes 4-byte predicate before successors) - **Bit 0**: Reserved (must be 0) -Extraction: +**Payload** (immediately follows header): + +| Order | Content | Type | Condition | +| :---- | :--------------- | :----------------------- | :---------------- | +| 1 | `pre_effects` | `[EffectOp; pre_count]` | always | +| 2 | `negated_fields` | `[u16; neg_count]` | always | +| 3 | `post_effects` | `[EffectOp; post_count]` | always | +| 4 | `predicate` | `Predicate` (4 bytes) | if `has_predicate` | +| 5 | `successors` | `[u16; 
succ_count]` | always | +| 6 | Padding | Zero bytes to step size | always | + +**Predicate** (4 bytes, when `has_predicate` is set): ```rust -let pre_count = (counts >> 13) & 0x7; -let neg_count = (counts >> 10) & 0x7; -let post_count = (counts >> 7) & 0x7; -let succ_count = (counts >> 1) & 0x3F; +#[repr(C)] +struct Predicate { + op: u16, // Bits 0-3: operator (1-7), rest reserved + value_ref: u16, // StringId (string ops) or RegexId (regex ops) +} ``` -**Payload** (immediately follows header): - -| Order | Content | Type | -| :---- | :--------------- | :----------------------- | -| 1 | `pre_effects` | `[EffectOp; pre_count]` | -| 2 | `negated_fields` | `[u16; neg_count]` | -| 3 | `post_effects` | `[EffectOp; post_count]` | -| 4 | `successors` | `[u16; succ_count]` | -| 5 | Padding | Zero bytes to step size | +| Op | Name | Meaning | +| -- | ------ | -------------------------------- | +| 1 | `==` | Exact string match | +| 2 | `!=` | Not equal | +| 3 | `^=` | Starts with | +| 4 | `$=` | Ends with | +| 5 | `*=` | Contains | +| 6 | `=~` | Regex match (value_ref = RegexId)| +| 7 | `!~` | Regex non-match | **Payload Capacity**: @@ -229,7 +242,7 @@ let succ_count = (counts >> 1) & 0x3F; | Match48 | 48 | 40 | 20 | | Match64 | 64 | 56 | 28 | -The compiler selects the smallest step size that fits the payload. If the total exceeds 28 slots, the transition must be split into a chain. +The compiler selects the smallest step size that fits the payload. If the total exceeds 28 slots, the transition must be split into a chain. With predicates (4 bytes = 2 slots), available slots for other payload items are reduced. **Continuation Logic**: diff --git a/docs/runtime-engine.md b/docs/runtime-engine.md index e6ed310..83b8747 100644 --- a/docs/runtime-engine.md +++ b/docs/runtime-engine.md @@ -6,7 +6,7 @@ Executes compiled query graphs against Tree-sitter syntax trees. 
See [06-transit ```rust struct VM<'t> { - cursor: TreeCursor<'t>, // Never reset — preserves descendant_index for O(1) backtrack + cursor: TreeCursor<'t>, // Never reset — preserves descendant_index for checkpointing ip: StepId, // Current step index frames: Vec, // Call stack effects: EffectLog<'t>, // Side-effect log