diff --git a/crates/codegraph-core/src/build_pipeline.rs b/crates/codegraph-core/src/build_pipeline.rs index aad6f30c..8821de71 100644 --- a/crates/codegraph-core/src/build_pipeline.rs +++ b/crates/codegraph-core/src/build_pipeline.rs @@ -11,10 +11,10 @@ //! 4. Parse files in parallel (existing `parallel::parse_files_parallel`) //! 5. Insert nodes (existing `insert_nodes::do_insert_nodes`) //! 6. Resolve imports (existing `import_resolution::resolve_imports_batch`) -//! 7. Build import edges + barrel resolution -//! 8. Build call edges (existing `edge_builder::build_call_edges`) -//! 9. Structure metrics + role classification -//! 10. Finalize (metadata, journal) +//! 6b. Re-parse barrel candidates (incremental only) +//! 7. Build import edges + call edges + barrel resolution +//! 8. Structure metrics + role classification +//! 9. Finalize (metadata, journal) use crate::change_detection; use crate::config::{BuildConfig, BuildOpts, BuildPathAliases}; @@ -125,58 +125,7 @@ pub fn run_pipeline( .map(|f| normalize_path(f)) .collect() }); - let collect_result = if let Some(ref scope) = opts.scope { - // Scoped rebuild — only collect files that exist on disk - let files: Vec = scope - .iter() - .map(|f| { - let abs = Path::new(root_dir).join(normalize_path(f)); - abs.to_str().unwrap_or("").to_string() - }) - .filter(|f| Path::new(f).exists()) - .collect(); - file_collector::CollectResult { - directories: files - .iter() - .filter_map(|f| { - Path::new(f) - .parent() - .map(|p| p.to_str().unwrap_or("").to_string()) - }) - .collect(), - files, - } - } else if incremental && !force_full_rebuild { - // Try fast collect from DB + journal - let journal = journal::read_journal(root_dir); - let has_entries = - journal.valid && (!journal.changed.is_empty() || !journal.removed.is_empty()); - - if has_entries { - let db_files: Vec = conn - .prepare("SELECT file FROM file_hashes") - .and_then(|mut stmt| { - stmt.query_map([], |row| row.get::<_, String>(0)) - .map(|rows| rows.filter_map(|r| r.ok()).collect()) - }) - .unwrap_or_default(); - - if !db_files.is_empty() { - file_collector::try_fast_collect( - root_dir, - &db_files, - &journal.changed, - &journal.removed, - ) - } else { - file_collector::collect_files(root_dir, &config.ignore_dirs) - } - } else { - file_collector::collect_files(root_dir, &config.ignore_dirs) - } - } else { - file_collector::collect_files(root_dir, &config.ignore_dirs) - }; + let collect_result = collect_source_files(conn, root_dir, &config, &opts, incremental, force_full_rebuild); timing.collect_ms = t0.elapsed().as_secs_f64() * 1000.0; // ── Stage 3: Detect changes ──────────────────────────────────────── @@ -285,7 +234,7 @@ pub fn run_pipeline( // ── Stage 5: Insert nodes ────────────────────────────────────────── let t0 = Instant::now(); let insert_batches = build_insert_batches(&file_symbols); - let file_hashes = build_file_hash_entries(&parse_changes, root_dir); + let file_hashes = build_file_hash_entries(&parse_changes); let _ = crate::insert_nodes::do_insert_nodes( conn, &insert_batches, @@ -333,121 +282,11 @@ pub fn run_pipeline( timing.resolve_ms = t0.elapsed().as_secs_f64() * 1000.0; // ── Stage 6b: Re-parse barrel candidates (incremental only) ───────── - // Mirrors JS pipeline's findBarrelCandidates + reparseBarrelFiles. - // For incremental builds, barrel files (re-export-only index files) may - // not be in file_symbols because they weren't changed or reverse-deps. - // Without their symbols, barrel resolution in Stage 7 can't create the - // transitive import edges (e.g. app.js -> math.js through index.js). if !change_result.is_full_build { - // Find all barrel files from DB (files that have 'reexports' edges) - let barrel_files_in_db: HashSet = { - let rows: Vec = match conn.prepare( - "SELECT DISTINCT n1.file FROM edges e \ - JOIN nodes n1 ON e.source_id = n1.id \ - WHERE e.kind = 'reexports' AND n1.kind = 'file'", - ) { - Ok(mut stmt) => match stmt.query_map([], |row| row.get::<_, String>(0)) { - Ok(mapped) => mapped.filter_map(|r| r.ok()).collect(), - Err(_) => Vec::new(), - }, - Err(_) => Vec::new(), - }; - rows.into_iter().collect() - }; - - // Check which barrels are imported by parsed files but not in file_symbols - let mut barrel_paths_to_parse: Vec = Vec::new(); - for (_rel_path, symbols) in &file_symbols { - for imp in &symbols.imports { - // Look up resolved path from batch_resolved - let abs_file = Path::new(root_dir).join(_rel_path); - let fwd = abs_file.to_str().unwrap_or("").replace('\\', "/"); - let key = format!("{}|{}", fwd, imp.source); - if let Some(resolved) = batch_resolved.get(&key) { - if barrel_files_in_db.contains(resolved) && !file_symbols.contains_key(resolved) - { - let abs = Path::new(root_dir).join(resolved); - if abs.exists() { - barrel_paths_to_parse - .push(abs.to_str().unwrap_or("").to_string()); - } - } - } - } - } - - // Also find barrels that re-export FROM changed files - { - let changed_rel: Vec<&str> = file_symbols.keys().map(|s| s.as_str()).collect(); - if let Ok(mut stmt) = conn.prepare( - "SELECT DISTINCT n1.file FROM edges e \ - JOIN nodes n1 ON e.source_id = n1.id \ - JOIN nodes n2 ON e.target_id = n2.id \ - WHERE e.kind = 'reexports' AND n1.kind = 'file' AND n2.file = ?1", - ) { - for changed in &changed_rel { - if let Ok(rows) = stmt.query_map(rusqlite::params![changed], |row| { - row.get::<_, String>(0) - }) { - for row in rows.flatten() { - if !file_symbols.contains_key(&row) { - let abs = Path::new(root_dir).join(&row); - if abs.exists() { - barrel_paths_to_parse - .push(abs.to_str().unwrap_or("").to_string()); - } - } - } - } - } - } - } - - // Re-parse barrel files and merge into file_symbols - if !barrel_paths_to_parse.is_empty() { - barrel_paths_to_parse.sort(); - barrel_paths_to_parse.dedup(); - // Barrel files are re-export-only — no function bodies or dataflow, - // so skip dataflow/AST analysis to avoid unnecessary overhead. - let barrel_parsed = parallel::parse_files_parallel( - &barrel_paths_to_parse, - root_dir, - false, - false, - ); - for mut sym in barrel_parsed { - let rel = relative_path(root_dir, &sym.file); - sym.file = rel.clone(); - // Delete outgoing import/reexport edges for barrel files being re-parsed - // (scoped to import-related kinds to avoid dropping calls edges) - let _ = conn.execute( - "DELETE FROM edges WHERE source_id IN (SELECT id FROM nodes WHERE file = ?1) \ - AND kind IN ('imports', 'reexports')", - rusqlite::params![&rel], - ); - // Re-resolve imports for the barrel file - // Normalize to forward slashes so batch_resolved keys match get_resolved lookups on Windows. - let abs_str = - Path::new(root_dir).join(&rel).to_str().unwrap_or("").replace('\\', "/"); - for imp in &sym.imports { - let input = ImportResolutionInput { - from_file: abs_str.clone(), - import_source: imp.source.clone(), - }; - let resolved_batch = import_resolution::resolve_imports_batch( - &[input], - root_dir, - &napi_aliases, - Some(&known_files), - ); - for r in &resolved_batch { - let key = format!("{}|{}", r.from_file, r.import_source); - batch_resolved.insert(key, r.resolved_path.clone()); - } - } - file_symbols.insert(rel, sym); - } - } + reparse_barrel_candidates( + conn, root_dir, &napi_aliases, &known_files, + &mut file_symbols, &mut batch_resolved, + ); } // ── Stage 7: Build edges ─────────────────────────────────────────── @@ -551,27 +390,7 @@ pub fn run_pipeline( // ── Stage 9: Finalize ────────────────────────────────────────────── let t0 = Instant::now(); - let node_count = conn - .query_row("SELECT COUNT(*) FROM nodes", [], |row| row.get::<_, i64>(0)) - .unwrap_or(0); - let edge_count = conn - .query_row("SELECT COUNT(*) FROM edges", [], |row| row.get::<_, i64>(0)) - .unwrap_or(0); - - // Persist build metadata - let version = env!("CARGO_PKG_VERSION"); - let meta_sql = "INSERT OR REPLACE INTO build_meta (key, value) VALUES (?, ?)"; - if let Ok(mut stmt) = conn.prepare(meta_sql) { - let _ = stmt.execute(["engine", "native"]); - let _ = stmt.execute(["engine_version", version]); - let _ = stmt.execute(["codegraph_version", version]); - let _ = stmt.execute(["node_count", &node_count.to_string()]); - let _ = stmt.execute(["edge_count", &edge_count.to_string()]); - let _ = stmt.execute(["last_build", &now_ms().to_string()]); - } - - // Write journal header - journal::write_journal_header(root_dir, now_ms()); + let (node_count, edge_count) = finalize_build(conn, root_dir); timing.finalize_ms = t0.elapsed().as_secs_f64() * 1000.0; // Include total time in setup for overhead accounting. @@ -601,6 +420,218 @@ pub fn run_pipeline( }) } +/// Stage 2: Collect source files with strategy selection (scoped, journal-fast, or full). +fn collect_source_files( + conn: &Connection, + root_dir: &str, + config: &BuildConfig, + opts: &BuildOpts, + incremental: bool, + force_full_rebuild: bool, +) -> file_collector::CollectResult { + if let Some(ref scope) = opts.scope { + // Scoped rebuild + let files: Vec = scope + .iter() + .map(|f| { + let abs = Path::new(root_dir).join(normalize_path(f)); + abs.to_str().unwrap_or("").to_string() + }) + .filter(|f| Path::new(f).exists()) + .collect(); + file_collector::CollectResult { + directories: files + .iter() + .filter_map(|f| { + Path::new(f) + .parent() + .map(|p| p.to_str().unwrap_or("").to_string()) + }) + .collect(), + files, + } + } else if incremental && !force_full_rebuild { + // Try fast collect from DB + journal + let journal = journal::read_journal(root_dir); + let has_entries = + journal.valid && (!journal.changed.is_empty() || !journal.removed.is_empty()); + + if has_entries { + let db_files: Vec = conn + .prepare("SELECT file FROM file_hashes") + .and_then(|mut stmt| { + stmt.query_map([], |row| row.get::<_, String>(0)) + .map(|rows| rows.filter_map(|r| r.ok()).collect()) + }) + .unwrap_or_default(); + + if !db_files.is_empty() { + file_collector::try_fast_collect( + root_dir, + &db_files, + &journal.changed, + &journal.removed, + ) + } else { + file_collector::collect_files(root_dir, &config.ignore_dirs) + } + } else { + file_collector::collect_files(root_dir, &config.ignore_dirs) + } + } else { + file_collector::collect_files(root_dir, &config.ignore_dirs) + } +} + +/// Stage 6b: Re-parse barrel candidates for incremental builds. +/// +/// Barrel files (re-export-only index files) may not be in file_symbols because +/// they weren't changed or reverse-deps. Without their symbols, barrel resolution +/// in Stage 7 can't create transitive import edges. +fn reparse_barrel_candidates( + conn: &Connection, + root_dir: &str, + napi_aliases: &crate::types::PathAliases, + known_files: &HashSet, + file_symbols: &mut HashMap, + batch_resolved: &mut HashMap, +) { + // Find all barrel files from DB (files that have 'reexports' edges) + let barrel_files_in_db: HashSet = { + let rows: Vec = match conn.prepare( + "SELECT DISTINCT n1.file FROM edges e \ + JOIN nodes n1 ON e.source_id = n1.id \ + WHERE e.kind = 'reexports' AND n1.kind = 'file'", + ) { + Ok(mut stmt) => match stmt.query_map([], |row| row.get::<_, String>(0)) { + Ok(mapped) => mapped.filter_map(|r| r.ok()).collect(), + Err(_) => Vec::new(), + }, + Err(_) => Vec::new(), + }; + rows.into_iter().collect() + }; + + // Check which barrels are imported by parsed files but not in file_symbols + let mut barrel_paths_to_parse: Vec = Vec::new(); + for (rel_path, symbols) in file_symbols.iter() { + for imp in &symbols.imports { + let abs_file = Path::new(root_dir).join(rel_path); + let fwd = abs_file.to_str().unwrap_or("").replace('\\', "/"); + let key = format!("{}|{}", fwd, imp.source); + if let Some(resolved) = batch_resolved.get(&key) { + if barrel_files_in_db.contains(resolved) && !file_symbols.contains_key(resolved) + { + let abs = Path::new(root_dir).join(resolved); + if abs.exists() { + barrel_paths_to_parse + .push(abs.to_str().unwrap_or("").to_string()); + } + } + } + } + } + + // Also find barrels that re-export FROM changed files + { + let changed_rel: Vec<&str> = file_symbols.keys().map(|s| s.as_str()).collect(); + if let Ok(mut stmt) = conn.prepare( + "SELECT DISTINCT n1.file FROM edges e \ + JOIN nodes n1 ON e.source_id = n1.id \ + JOIN nodes n2 ON e.target_id = n2.id \ + WHERE e.kind = 'reexports' AND n1.kind = 'file' AND n2.file = ?1", + ) { + for changed in &changed_rel { + if let Ok(rows) = stmt.query_map(rusqlite::params![changed], |row| { + row.get::<_, String>(0) + }) { + for row in rows.flatten() { + if !file_symbols.contains_key(&row) { + let abs = Path::new(root_dir).join(&row); + if abs.exists() { + barrel_paths_to_parse + .push(abs.to_str().unwrap_or("").to_string()); + } + } + } + } + } + } + } + + // Re-parse barrel files and merge into file_symbols + if !barrel_paths_to_parse.is_empty() { + barrel_paths_to_parse.sort(); + barrel_paths_to_parse.dedup(); + // Barrel files are re-export-only — no function bodies or dataflow, + // so skip dataflow/AST analysis to avoid unnecessary overhead. + let barrel_parsed = parallel::parse_files_parallel( + &barrel_paths_to_parse, + root_dir, + false, + false, + ); + for mut sym in barrel_parsed { + let rel = relative_path(root_dir, &sym.file); + sym.file = rel.clone(); + // Delete outgoing import/reexport edges for barrel files being re-parsed + // (scoped to import-related kinds to avoid dropping calls edges) + let _ = conn.execute( + "DELETE FROM edges WHERE source_id IN (SELECT id FROM nodes WHERE file = ?1) \ + AND kind IN ('imports', 'reexports')", + rusqlite::params![&rel], + ); + // Re-resolve imports for the barrel file + // Normalize to forward slashes so batch_resolved keys match get_resolved lookups on Windows. + let abs_str = + Path::new(root_dir).join(&rel).to_str().unwrap_or("").replace('\\', "/"); + for imp in &sym.imports { + let input = ImportResolutionInput { + from_file: abs_str.clone(), + import_source: imp.source.clone(), + }; + let resolved_batch = import_resolution::resolve_imports_batch( + &[input], + root_dir, + napi_aliases, + Some(known_files), + ); + for r in &resolved_batch { + let key = format!("{}|{}", r.from_file, r.import_source); + batch_resolved.insert(key, r.resolved_path.clone()); + } + } + file_symbols.insert(rel, sym); + } + } +} + +/// Stage 9: Finalize build — persist metadata, write journal, return counts. +fn finalize_build(conn: &Connection, root_dir: &str) -> (i64, i64) { + let node_count = conn + .query_row("SELECT COUNT(*) FROM nodes", [], |row| row.get::<_, i64>(0)) + .unwrap_or(0); + let edge_count = conn + .query_row("SELECT COUNT(*) FROM edges", [], |row| row.get::<_, i64>(0)) + .unwrap_or(0); + + // Persist build metadata + let version = env!("CARGO_PKG_VERSION"); + let meta_sql = "INSERT OR REPLACE INTO build_meta (key, value) VALUES (?, ?)"; + if let Ok(mut stmt) = conn.prepare(meta_sql) { + let _ = stmt.execute(["engine", "native"]); + let _ = stmt.execute(["engine_version", version]); + let _ = stmt.execute(["codegraph_version", version]); + let _ = stmt.execute(["node_count", &node_count.to_string()]); + let _ = stmt.execute(["edge_count", &edge_count.to_string()]); + let _ = stmt.execute(["last_build", &now_ms().to_string()]); + } + + // Write journal header + journal::write_journal_header(root_dir, now_ms()); + (node_count, edge_count) +} + /// Check if engine/schema/version changed since last build (forces full rebuild). fn check_version_mismatch(conn: &Connection) -> bool { let get_meta = |key: &str| -> Option { @@ -681,7 +712,6 @@ fn build_insert_batches( /// that `file_hashes` is populated for subsequent incremental builds. fn build_file_hash_entries( changed: &[&change_detection::ChangedFile], - _root_dir: &str, ) -> Vec { changed .iter() diff --git a/crates/codegraph-core/src/dataflow.rs b/crates/codegraph-core/src/dataflow.rs index f22313ac..26ea2d21 100644 --- a/crates/codegraph-core/src/dataflow.rs +++ b/crates/codegraph-core/src/dataflow.rs @@ -579,127 +579,143 @@ fn function_name<'a>(fn_node: &Node<'a>, rules: &DataflowRules, source: &[u8]) - None } -/// Extract parameter names using per-language strategy. -fn extract_param_names_strategy(node: &Node, strategy: ParamStrategy, source: &[u8]) -> Option> { - match strategy { - ParamStrategy::Default => None, - ParamStrategy::Python => { - let t = node.kind(); - if t == "typed_parameter" || t == "typed_default_parameter" { - let cursor = &mut node.walk(); - for c in node.named_children(cursor) { - if c.kind() == "identifier" { - return Some(vec![node_text(&c, source).to_string()]); - } - } - return Some(vec![]); - } - if t == "default_parameter" { - if let Some(name_node) = node.child_by_field_name("name") { - return Some(vec![node_text(&name_node, source).to_string()]); - } - return Some(vec![]); - } - if t == "list_splat_pattern" || t == "dictionary_splat_pattern" { - let cursor = &mut node.walk(); - for c in node.named_children(cursor) { - if c.kind() == "identifier" { - return Some(vec![node_text(&c, source).to_string()]); - } - } - return Some(vec![]); +// ── Per-language parameter extraction handlers ───────────────────────────── + +fn extract_params_python(node: &Node, source: &[u8]) -> Option> { + let t = node.kind(); + if t == "typed_parameter" || t == "typed_default_parameter" { + let cursor = &mut node.walk(); + for c in node.named_children(cursor) { + if c.kind() == "identifier" { + return Some(vec![node_text(&c, source).to_string()]); } - None } - ParamStrategy::Go => { - let t = node.kind(); - if t == "parameter_declaration" { - let mut names = Vec::new(); - let cursor = &mut node.walk(); - for c in node.named_children(cursor) { - if c.kind() == "identifier" { - names.push(node_text(&c, source).to_string()); - } - } - if !names.is_empty() { Some(names) } else { None } - } else if t == "variadic_parameter_declaration" { - node.child_by_field_name("name") - .map(|n| vec![node_text(&n, source).to_string()]) - } else { - None - } + return Some(vec![]); + } + if t == "default_parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); } - ParamStrategy::Rust => { - let t = node.kind(); - if t == "parameter" { - if let Some(pat) = node.child_by_field_name("pattern") { - if pat.kind() == "identifier" { - return Some(vec![node_text(&pat, source).to_string()]); - } - } - return Some(vec![]); - } - if t == "identifier" { - return Some(vec![node_text(node, source).to_string()]); + return Some(vec![]); + } + if t == "list_splat_pattern" || t == "dictionary_splat_pattern" { + let cursor = &mut node.walk(); + for c in node.named_children(cursor) { + if c.kind() == "identifier" { + return Some(vec![node_text(&c, source).to_string()]); } - None } - ParamStrategy::Java => { - let t = node.kind(); - if t == "formal_parameter" || t == "spread_parameter" { - if let Some(name_node) = node.child_by_field_name("name") { - return Some(vec![node_text(&name_node, source).to_string()]); - } - return Some(vec![]); - } - if t == "identifier" { - return Some(vec![node_text(node, source).to_string()]); + return Some(vec![]); + } + None +} + +fn extract_params_go(node: &Node, source: &[u8]) -> Option> { + let t = node.kind(); + if t == "parameter_declaration" { + let mut names = Vec::new(); + let cursor = &mut node.walk(); + for c in node.named_children(cursor) { + if c.kind() == "identifier" { + names.push(node_text(&c, source).to_string()); } - None } - ParamStrategy::CSharp => { - let t = node.kind(); - if t == "parameter" { - if let Some(name_node) = node.child_by_field_name("name") { - return Some(vec![node_text(&name_node, source).to_string()]); - } - return Some(vec![]); - } - if t == "identifier" { - return Some(vec![node_text(node, source).to_string()]); + if !names.is_empty() { Some(names) } else { None } + } else if t == "variadic_parameter_declaration" { + node.child_by_field_name("name") + .map(|n| vec![node_text(&n, source).to_string()]) + } else { + None + } +} + +fn extract_params_rust(node: &Node, source: &[u8]) -> Option> { + let t = node.kind(); + if t == "parameter" { + if let Some(pat) = node.child_by_field_name("pattern") { + if pat.kind() == "identifier" { + return Some(vec![node_text(&pat, source).to_string()]); } - None } - ParamStrategy::Php => { - let t = node.kind(); - if t == "simple_parameter" || t == "variadic_parameter" { - if let Some(name_node) = node.child_by_field_name("name") { - return Some(vec![node_text(&name_node, source).to_string()]); - } - return Some(vec![]); - } - if t == "variable_name" { - return Some(vec![node_text(node, source).to_string()]); - } - None + return Some(vec![]); + } + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + None +} + +fn extract_params_java(node: &Node, source: &[u8]) -> Option> { + let t = node.kind(); + if t == "formal_parameter" || t == "spread_parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); } - ParamStrategy::Ruby => { - let t = node.kind(); - if t == "identifier" { - return Some(vec![node_text(node, source).to_string()]); - } - if t == "optional_parameter" - || t == "keyword_parameter" - || t == "splat_parameter" - || t == "hash_splat_parameter" - { - if let Some(name_node) = node.child_by_field_name("name") { - return Some(vec![node_text(&name_node, source).to_string()]); - } - return Some(vec![]); - } - None + return Some(vec![]); + } + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + None +} + +fn extract_params_csharp(node: &Node, source: &[u8]) -> Option> { + let t = node.kind(); + if t == "parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + None +} + +fn extract_params_php(node: &Node, source: &[u8]) -> Option> { + let t = node.kind(); + if t == "simple_parameter" || t == "variadic_parameter" { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); } + return Some(vec![]); + } + if t == "variable_name" { + return Some(vec![node_text(node, source).to_string()]); + } + None +} + +fn extract_params_ruby(node: &Node, source: &[u8]) -> Option> { + let t = node.kind(); + if t == "identifier" { + return Some(vec![node_text(node, source).to_string()]); + } + if t == "optional_parameter" + || t == "keyword_parameter" + || t == "splat_parameter" + || t == "hash_splat_parameter" + { + if let Some(name_node) = node.child_by_field_name("name") { + return Some(vec![node_text(&name_node, source).to_string()]); + } + return Some(vec![]); + } + None +} + +/// Extract parameter names using per-language strategy. +fn extract_param_names_strategy(node: &Node, strategy: ParamStrategy, source: &[u8]) -> Option> { + match strategy { + ParamStrategy::Default => None, + ParamStrategy::Python => extract_params_python(node, source), + ParamStrategy::Go => extract_params_go(node, source), + ParamStrategy::Rust => extract_params_rust(node, source), + ParamStrategy::Java => extract_params_java(node, source), + ParamStrategy::CSharp => extract_params_csharp(node, source), + ParamStrategy::Php => extract_params_php(node, source), + ParamStrategy::Ruby => extract_params_ruby(node, source), } } @@ -1052,7 +1068,7 @@ fn handle_return_stmt( func_name: func_name.clone(), expression: truncate( expr.map(|e| node_text(&e, source)).unwrap_or(""), - 120, + DATAFLOW_TRUNCATION_LIMIT, ), referenced_names, line: node_line(node), diff --git a/crates/codegraph-core/src/graph_algorithms.rs b/crates/codegraph-core/src/graph_algorithms.rs index f2dc9889..a30c269f 100644 --- a/crates/codegraph-core/src/graph_algorithms.rs +++ b/crates/codegraph-core/src/graph_algorithms.rs @@ -247,12 +247,21 @@ pub fn louvain_communities( ) } -fn louvain_impl( +/// Internal state for the Louvain multi-level loop. +struct LouvainState { + cur_n: usize, + cur_edges: HashMap<(usize, usize), f64>, + cur_degree: Vec, + original_community: Vec, + rng_state: u32, +} + +/// Build the initial index-based edge map and degree vector from raw edges. +fn louvain_init( edges: &[GraphEdge], node_ids: &[String], - resolution: f64, seed: u32, -) -> LouvainResult { +) -> (HashMap<(usize, usize), f64>, f64, LouvainState) { let n = node_ids.len(); let mut id_to_idx: HashMap<&str, usize> = HashMap::with_capacity(n); for (i, id) in node_ids.iter().enumerate() { @@ -275,168 +284,174 @@ fn louvain_impl( } let total_weight: f64 = edge_map.values().sum(); - if total_weight == 0.0 { - return LouvainResult { - assignments: node_ids - .iter() - .enumerate() - .map(|(i, id)| CommunityAssignment { - node: id.clone(), - community: i as i32, - }) - .collect(), - modularity: 0.0, - }; - } - // original_community[i] tracks each original node's final community - let mut original_community: Vec = (0..n).collect(); - - // Current level's graph - let mut cur_n = n; - let mut cur_edges = edge_map.clone(); - let mut cur_degree: Vec = vec![0.0; cur_n]; - for (&(src, tgt), &w) in &cur_edges { + let mut cur_degree: Vec = vec![0.0; n]; + for (&(src, tgt), &w) in &edge_map { cur_degree[src] += w; cur_degree[tgt] += w; } - // Seeded xorshift32 RNG - let mut rng_state: u32 = if seed == 0 { 1 } else { seed }; - let mut next_rand = || -> u32 { - rng_state ^= rng_state << 13; - rng_state ^= rng_state >> 17; - rng_state ^= rng_state << 5; - rng_state + let rng_state = if seed == 0 { 1 } else { seed }; + + let state = LouvainState { + cur_n: n, + cur_edges: edge_map.clone(), + cur_degree, + original_community: (0..n).collect(), + rng_state, }; - // m2 = 2 × total edge weight of the ORIGINAL graph — a constant across all levels. - // Recalculating from cur_edges would undercount because coarsening strips intra-community - // edges, inflating the penalty term and causing under-merging at coarser levels. - let total_m2: f64 = 2.0 * total_weight; + (edge_map, total_weight, state) +} - for _level in 0..LOUVAIN_MAX_LEVELS { - if cur_edges.is_empty() { - break; - } +/// Xorshift32 PRNG step. +fn xorshift32(state: &mut u32) -> u32 { + *state ^= *state << 13; + *state ^= *state >> 17; + *state ^= *state << 5; + *state +} - // Build adjacency list - let mut adj: Vec> = vec![vec![]; cur_n]; - for (&(src, tgt), &w) in &cur_edges { - adj[src].push((tgt, w)); - adj[tgt].push((src, w)); - } +/// Local move phase: greedily reassign nodes to communities to maximize modularity. +/// Returns true if any node moved. +fn local_move_phase( + state: &mut LouvainState, + resolution: f64, + total_m2: f64, +) -> (Vec, bool) { + let cur_n = state.cur_n; + + // Build adjacency list + let mut adj: Vec> = vec![vec![]; cur_n]; + for (&(src, tgt), &w) in &state.cur_edges { + adj[src].push((tgt, w)); + adj[tgt].push((src, w)); + } - // Local phase: greedy modularity optimization - let mut level_comm: Vec = (0..cur_n).collect(); - let mut comm_total: Vec = cur_degree.clone(); + let mut level_comm: Vec = (0..cur_n).collect(); + let mut comm_total: Vec = state.cur_degree.clone(); - let mut order: Vec = (0..cur_n).collect(); - for i in (1..order.len()).rev() { - let j = next_rand() as usize % (i + 1); - order.swap(i, j); - } + // Shuffle visit order with seeded RNG + let mut order: Vec = (0..cur_n).collect(); + for i in (1..order.len()).rev() { + let j = xorshift32(&mut state.rng_state) as usize % (i + 1); + order.swap(i, j); + } - let mut any_moved = false; - for _pass in 0..LOUVAIN_MAX_PASSES { - let mut pass_moved = false; - for &node in &order { - let node_comm = level_comm[node]; - let node_deg = cur_degree[node]; + let mut any_moved = false; + for _pass in 0..LOUVAIN_MAX_PASSES { + let mut pass_moved = false; + for &node in &order { + let node_comm = level_comm[node]; + let node_deg = state.cur_degree[node]; - let mut comm_w: HashMap = HashMap::new(); - for &(neighbor, w) in &adj[node] { - *comm_w.entry(level_comm[neighbor]).or_insert(0.0) += w; - } + let mut comm_w: HashMap = HashMap::new(); + for &(neighbor, w) in &adj[node] { + *comm_w.entry(level_comm[neighbor]).or_insert(0.0) += w; + } - let w_own = *comm_w.get(&node_comm).unwrap_or(&0.0); - let remove_cost = - w_own - resolution * node_deg * (comm_total[node_comm] - node_deg) / total_m2; + let w_own = *comm_w.get(&node_comm).unwrap_or(&0.0); + let remove_cost = + w_own - resolution * node_deg * (comm_total[node_comm] - node_deg) / total_m2; - let mut best_comm = node_comm; - let mut best_gain: f64 = 0.0; + let mut best_comm = node_comm; + let mut best_gain: f64 = 0.0; - for (&target_comm, &w_target) in &comm_w { - if target_comm == node_comm { - continue; - } - let gain = w_target - - resolution * node_deg * comm_total[target_comm] / total_m2 - - remove_cost; - if gain > best_gain { - best_gain = gain; - best_comm = target_comm; - } + for (&target_comm, &w_target) in &comm_w { + if target_comm == node_comm { + continue; } - - if best_comm != node_comm && best_gain > LOUVAIN_MIN_GAIN { - comm_total[node_comm] -= node_deg; - comm_total[best_comm] += node_deg; - level_comm[node] = best_comm; - pass_moved = true; - any_moved = true; + let gain = w_target + - resolution * node_deg * comm_total[target_comm] / total_m2 + - remove_cost; + if gain > best_gain { + best_gain = gain; + best_comm = target_comm; } } - if !pass_moved { - break; + + if best_comm != node_comm && best_gain > LOUVAIN_MIN_GAIN { + comm_total[node_comm] -= node_deg; + comm_total[best_comm] += node_deg; + level_comm[node] = best_comm; + pass_moved = true; + any_moved = true; } } - - if !any_moved { + if !pass_moved { break; } + } - // Renumber communities contiguously - let mut comm_remap: HashMap = HashMap::new(); - let mut next_id: usize = 0; - for &c in &level_comm { - if !comm_remap.contains_key(&c) { - comm_remap.insert(c, next_id); - next_id += 1; - } - } - for c in level_comm.iter_mut() { - *c = comm_remap[c]; - } - let coarse_n = next_id; + (level_comm, any_moved) +} - if coarse_n == cur_n { - break; +/// Aggregation phase: renumber communities, compose original mapping, build coarse graph. +/// Returns false if no further coarsening is possible (convergence). +fn aggregation_phase( + state: &mut LouvainState, + level_comm: &mut Vec, +) -> bool { + // Renumber communities contiguously + let mut comm_remap: HashMap = HashMap::new(); + let mut next_id: usize = 0; + for &c in level_comm.iter() { + if !comm_remap.contains_key(&c) { + comm_remap.insert(c, next_id); + next_id += 1; } + } + for c in level_comm.iter_mut() { + *c = comm_remap[c]; + } + let coarse_n = next_id; - // Compose: update original_community through this level's assignments - for oc in original_community.iter_mut() { - *oc = level_comm[*oc]; - } + if coarse_n == state.cur_n { + return false; + } - // Build coarse graph for next level - let mut coarse_edge_map: HashMap<(usize, usize), f64> = HashMap::new(); - for (&(src, tgt), &w) in &cur_edges { - let cu = level_comm[src]; - let cv = level_comm[tgt]; - if cu == cv { - continue; - } - let key = if cu < cv { (cu, cv) } else { (cv, cu) }; - *coarse_edge_map.entry(key).or_insert(0.0) += w; - } + // Compose: update original_community through this level's assignments + for oc in state.original_community.iter_mut() { + *oc = level_comm[*oc]; + } - let mut coarse_degree: Vec = vec![0.0; coarse_n]; - for (i, °) in cur_degree.iter().enumerate() { - coarse_degree[level_comm[i]] += deg; + // Build coarse graph for next level + let mut coarse_edge_map: HashMap<(usize, usize), f64> = HashMap::new(); + for (&(src, tgt), &w) in &state.cur_edges { + let cu = level_comm[src]; + let cv = level_comm[tgt]; + if cu == cv { + continue; } + let key = if cu < cv { (cu, cv) } else { (cv, cu) }; + *coarse_edge_map.entry(key).or_insert(0.0) += w; + } - cur_n = coarse_n; - cur_edges = coarse_edge_map; - cur_degree = coarse_degree; + let mut coarse_degree: Vec = vec![0.0; coarse_n]; + for (i, °) in state.cur_degree.iter().enumerate() { + coarse_degree[level_comm[i]] += deg; } - // Compute modularity: Q = sum_c [ L_c / m - gamma * (k_c / 2m)^2 ] + state.cur_n = coarse_n; + state.cur_edges = coarse_edge_map; + state.cur_degree = coarse_degree; + + true +} + +/// Compute final modularity score: Q = sum_c [ L_c / m - gamma * (k_c / 2m)^2 ] +fn compute_modularity( + edge_map: &HashMap<(usize, usize), f64>, + original_community: &[usize], + total_weight: f64, + resolution: f64, + n: usize, +) -> f64 { let m = total_weight; let m2 = 2.0 * m; let mut orig_degree: Vec = vec![0.0; n]; - for (&(src, tgt), &w) in &edge_map { + for (&(src, tgt), &w) in edge_map { orig_degree[src] += w; orig_degree[tgt] += w; } @@ -448,7 +463,7 @@ fn louvain_impl( for (i, °) in orig_degree.iter().enumerate() { kc[original_community[i]] += deg; } - for (&(src, tgt), &w) in &edge_map { + for (&(src, tgt), &w) in edge_map { if original_community[src] == original_community[tgt] { lc[original_community[src]] += w; } @@ -460,13 +475,60 @@ fn louvain_impl( modularity += lc[c] / m - resolution * (kc[c] / m2).powi(2); } } + modularity +} + +fn louvain_impl( + edges: &[GraphEdge], + node_ids: &[String], + resolution: f64, + seed: u32, +) -> LouvainResult { + let n = node_ids.len(); + let (edge_map, total_weight, mut state) = louvain_init(edges, node_ids, seed); + + if total_weight == 0.0 { + return LouvainResult { + assignments: node_ids + .iter() + .enumerate() + .map(|(i, id)| CommunityAssignment { + node: id.clone(), + community: i as i32, + }) + .collect(), + modularity: 0.0, + }; + } + + // m2 = 2 x total edge weight of the ORIGINAL graph -- a constant across all levels. + // Recalculating from cur_edges would undercount because coarsening strips intra-community + // edges, inflating the penalty term and causing under-merging at coarser levels. + let total_m2: f64 = 2.0 * total_weight; + + for _level in 0..LOUVAIN_MAX_LEVELS { + if state.cur_edges.is_empty() { + break; + } + + let (mut level_comm, any_moved) = local_move_phase(&mut state, resolution, total_m2); + if !any_moved { + break; + } + + if !aggregation_phase(&mut state, &mut level_comm) { + break; + } + } + + let modularity = compute_modularity(&edge_map, &state.original_community, total_weight, resolution, n); let assignments = node_ids .iter() .enumerate() .map(|(i, id)| CommunityAssignment { node: id.clone(), - community: original_community[i] as i32, + community: state.original_community[i] as i32, }) .collect();