sync from monorepo @ 2452e92e

2026-05-08 01:59:04 +02:00
commit b03dc15371
459 changed files with 129586 additions and 0 deletions
@@ -0,0 +1,192 @@
+//! Glob-based file search with pattern matching and result limits.
+//!
+//! **Status**: Implemented (TOOLS-SEARCH-02)
+//!
+//! This module implements:
+//! - Glob pattern matching
+//! - Recursive directory traversal
+//! - Result count and byte limits
+//! - Exclude pattern filtering
+
+use crate::config::SearchConfig;
+use crate::error::{ToolError, ToolResult};
+use serde::{Deserialize, Serialize};
+use std::path::PathBuf;
+
+/// Request to search for files matching glob patterns (the input of `glob_search`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct GlobRequest {
+    /// Base path to search within; `glob_search` canonicalizes it and walks it recursively.
+    pub path: String,
+
+    /// Glob pattern (e.g., "**/*.rs", "src/**/*.toml"), tried against base-relative and absolute paths.
+    pub pattern: String,
+
+    /// Optional exclude patterns, appended to the config defaults; not serialized when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub exclude: Option<Vec<String>>,
+
+    /// Optional maximum results (used instead of the config default); not serialized when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_results: Option<u32>,
+}
+
+/// Response from glob search (the output of `glob_search`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct GlobResponse {
+    /// Paths matching the glob pattern, in traversal order, rooted at the canonicalized base path.
+    pub matches: Vec<PathBuf>,
+
+    /// `true` when the search stopped early because of the result-count or byte limit.
+    pub truncated: bool,
+}
+
+/// Search for files matching a glob pattern.
+///
+/// This implementation:
+/// 1. Canonicalizes the base path
+/// 2. Compiles the glob pattern using `globset`
+/// 3. Traverses the directory tree recursively using `walkdir` (symlinks are not followed)
+/// 4. Matches every non-directory entry against the pattern, trying the path relative
+///    to the base first and the absolute path second
+/// 5. Filters against:
+///    - `default_exclude_globs` from config
+///    - Request-specific exclude patterns
+/// 6. Enforces result limits:
+///    - `max_results` count limit
+///    - `max_bytes` total payload size (approximated by the summed path lengths)
+/// 7. Sets `truncated` when a matching entry had to be dropped because of a limit
+///
+/// NOTE(review): this function performs no allowed-roots or sandbox-blocklist check of
+/// its own; presumably the caller validates `request.path` first — confirm.
+///
+/// ## Pattern Syntax
+///
+/// Standard glob patterns, compiled with `literal_separator(false)`:
+/// - `*` - Match any sequence, including path separators
+/// - `**` - Match any sequence including path separators (recursive)
+/// - `?` - Match single character (a path separator counts as one)
+/// - `[abc]` - Match character class
+///
+/// Examples:
+/// - `**/*.rs` - All Rust files recursively
+/// - `src/**/*.toml` - TOML files under src/
+/// - `test_*.py` - Python files whose base-relative path starts with `test_`
+///
+/// ## Error Cases
+///
+/// - Base path does not exist → `ToolError::NotFound`
+/// - Any other canonicalization failure → `ToolError::Io`
+/// - Invalid glob pattern → `ToolError::InvalidInput`
+/// - Exclude patterns that fail to compile → the error returned by `compile_blocklist`
+/// - Entries that cannot be read during traversal are skipped, not reported
+///
+/// ## Performance
+///
+/// - Stops at the first matching entry that no longer fits within the limits
+/// - Entries rejected by the exclude set are pruned in `filter_entry`, so an excluded
+///   directory is never descended into
+/// - NOTE(review): traversal uses blocking filesystem calls inside an `async fn`;
+///   consider `spawn_blocking` if this runs on a latency-sensitive executor.
+///
+/// ## See Also
+///
+/// - Task spec: `docs/building/04_acp_client/04_tasks_02_tools_and_sandboxing.md#TOOLS-SEARCH-02`
+pub async fn glob_search(
+    request: GlobRequest,
+    config: &SearchConfig,
+) -> ToolResult<GlobResponse> {
+    use crate::path::blocklist::compile_blocklist;
+    use globset::GlobBuilder;
+    use std::path::Path;
+    use walkdir::WalkDir;
+
+    // Canonicalize the base path
+    let base_path = dunce::canonicalize(Path::new(&request.path)).map_err(|e| {
+        if e.kind() == std::io::ErrorKind::NotFound {
+            ToolError::NotFound {
+                path: request.path.clone(),
+            }
+        } else {
+            ToolError::Io(e)
+        }
+    })?;
+
+    // Compile the glob pattern. `literal_separator(false)` lets `*` and `?` match
+    // path separators as well (`**` always does).
+    let glob_matcher = GlobBuilder::new(&request.pattern)
+        .literal_separator(false)
+        .build()
+        .map_err(|e| ToolError::InvalidInput(format!("Invalid glob pattern: {}", e)))?
+        .compile_matcher();
+
+    // Config defaults first, then request-specific excludes
+    let mut exclude_patterns = config.default_exclude_globs.clone();
+    if let Some(extra_excludes) = &request.exclude {
+        exclude_patterns.extend_from_slice(extra_excludes);
+    }
+
+    let exclude_compiled = if exclude_patterns.is_empty() {
+        None
+    } else {
+        Some(compile_blocklist(&exclude_patterns)?)
+    };
+
+    // Determine limits
+    let max_results = request.max_results.unwrap_or(config.max_results) as usize;
+    let max_bytes = config.max_bytes;
+
+    let mut matches = Vec::new();
+    let mut total_bytes = 0u64;
+    let mut truncated = false;
+
+    // `filter_entry` runs for every entry (files and directories alike), so anything
+    // yielded below has already passed the exclude set; a rejected directory is pruned
+    // without being traversed.
+    let walker = WalkDir::new(&base_path)
+        .follow_links(false)
+        .into_iter()
+        .filter_entry(|e| {
+            exclude_compiled
+                .as_ref()
+                .map_or(true, |exclude| !exclude.glob_set().is_match(e.path()))
+        });
+
+    for entry in walker {
+        let entry = match entry {
+            Ok(e) => e,
+            Err(_) => continue, // Skip entries we can't read
+        };
+
+        // Skip directories (we only want files)
+        if entry.file_type().is_dir() {
+            continue;
+        }
+
+        let entry_path = entry.path();
+
+        // Check against glob pattern: relative path from base first, absolute path second
+        let relative_path = entry_path.strip_prefix(&base_path).unwrap_or(entry_path);
+        if !glob_matcher.is_match(relative_path) && !glob_matcher.is_match(entry_path) {
+            continue;
+        }
+
+        // Count limit: checked only once a further match is in hand, so `truncated`
+        // is reported only when a result is actually being dropped.
+        if matches.len() >= max_results {
+            truncated = true;
+            break;
+        }
+
+        // Byte limit (approximate - using path length as proxy)
+        let path_bytes = entry_path.to_string_lossy().len() as u64;
+        if total_bytes.saturating_add(path_bytes) > max_bytes {
+            truncated = true;
+            break;
+        }
+
+        total_bytes += path_bytes;
+        matches.push(entry_path.to_path_buf());
+    }
+
+    Ok(GlobResponse { matches, truncated })
+}
@@ -0,0 +1,359 @@
+//! Content search (grep) with regex and context lines.
+//!
+//! This module implements:
+//! - Regex-based content search
+//! - Context line extraction (before/after)
+//! - Result count and byte limits
+//! - Binary file detection and skip
+//! - Case-insensitive matching
+
+use crate::config::SearchConfig;
+use crate::error::{ToolError, ToolResult};
+use regex::RegexBuilder;
+use serde::{Deserialize, Serialize};
+use std::collections::VecDeque;
+use std::fs::File;
+use std::io::{BufRead, BufReader};
+use std::path::PathBuf;
+use walkdir::WalkDir;
+
+/// Request to search file contents with regex (the input of `grep_search`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct GrepRequest {
+    /// Base path to search within; `grep_search` canonicalizes it and walks it recursively.
+    pub path: String,
+
+    /// Regex pattern (syntax of the `regex` crate), tested against each line of each file.
+    pub pattern: String,
+
+    /// Optional glob pattern to filter files; every file is searched when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub file_pattern: Option<String>,
+
+    /// Case-insensitive matching; `false` when the field is absent from the input.
+    #[serde(default)]
+    pub ignore_case: bool,
+
+    /// Number of context lines before match; 0 when the field is absent from the input.
+    #[serde(default)]
+    pub context_before: u32,
+
+    /// Number of context lines after match; 0 when the field is absent from the input.
+    #[serde(default)]
+    pub context_after: u32,
+
+    /// Optional maximum results (used instead of the config default); not serialized when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub max_results: Option<u32>,
+}
+
+/// Response from grep search (the output of `grep_search`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct GrepResponse {
+    /// Matches found in files, in the order files were visited and lines were read.
+    pub matches: Vec<GrepMatch>,
+
+    /// `true` when the search stopped early because the match-count or byte limit was reached.
+    pub truncated: bool,
+}
+
+/// A single grep match: one matching line plus its optional surrounding context.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct GrepMatch {
+    /// Path to the file containing the match, as yielded by the directory walk.
+    pub path: PathBuf,
+
+    /// Line number of the match (1-indexed).
+    pub line_number: usize,
+
+    /// The matching line content, without its line terminator.
+    pub line: String,
+
+    /// Context lines before the match (never reaching back past the previous match); omitted if empty.
+    #[serde(skip_serializing_if = "Vec::is_empty")]
+    pub context_before: Vec<String>,
+
+    /// Context lines after the match; omitted from serialized output if empty.
+    #[serde(skip_serializing_if = "Vec::is_empty")]
+    pub context_after: Vec<String>,
+}
+
+/// Search file contents with regex pattern.
+///
+/// This implementation:
+/// 1. Canonicalizes the base path
+/// 2. Compiles regex pattern
+/// 3. Traverses directory tree (optionally filtered by file_pattern), skipping entries
+///    that match `default_exclude_globs` and not following symlinks
+/// 4. For each file, delegates to `search_file`, which:
+///    - Skips binary files (detect via null bytes)
+///    - Reads line-by-line for memory efficiency
+///    - Matches lines against regex
+///    - Extracts context lines (before/after)
+/// 5. Enforces result limits:
+///    - `max_results` match count
+///    - `max_bytes` total payload size
+/// 6. Sets `truncated` flag when either limit is reached
+///
+/// NOTE(review): this function performs no allowed-roots or sandbox check of its own;
+/// presumably the caller validates `request.path` first — confirm.
+///
+/// ## Pattern Syntax
+///
+/// Standard regex syntax (via `regex` crate):
+/// - `.` - Any character (except newline by default)
+/// - `.*` - Any sequence
+/// - `\d`, `\w`, `\s` - Character classes
+/// - `[abc]` - Character set
+/// - `(foo|bar)` - Alternation
+/// - Capture groups (look-around and backreferences are not supported by the `regex` crate)
+///
+/// ## File Pattern
+///
+/// `file_pattern` is a glob compiled with `literal_separator(false)`. It is tried against
+/// the path relative to the base first and the absolute path second, so both
+/// `src/**/*.rs` and `**/*.rs` behave as they do in `glob_search`.
+///
+/// ## Context Lines
+///
+/// - `context_before: N` - Include N lines before each match
+/// - `context_after: N` - Include N lines after each match
+/// - Useful for understanding match context
+///
+/// ## Binary File Handling
+///
+/// - Detects binary files by null byte presence
+/// - Skips binary files silently
+///
+/// ## Error Cases
+///
+/// - Base path does not exist → `ToolError::NotFound`
+/// - Any other canonicalization failure → `ToolError::Io`
+/// - Invalid regex pattern or file pattern → `ToolError::InvalidInput`
+/// - Entries and files that cannot be read are skipped, not reported
+///
+/// ## Performance
+///
+/// - Line-by-line reading for large files
+/// - Stops early when limits reached
+/// - Skips excluded directories
+/// - Skips binary files
+/// - NOTE(review): traversal and file reads are blocking calls inside an `async fn`;
+///   consider `spawn_blocking` if this runs on a latency-sensitive executor.
+///
+/// ## See Also
+///
+/// - Task spec: `docs/building/04_acp_client/04_tasks_02_tools_and_sandboxing.md#TOOLS-SEARCH-03`
+pub async fn grep_search(
+    request: GrepRequest,
+    config: &SearchConfig,
+) -> ToolResult<GrepResponse> {
+    use crate::path::blocklist::compile_blocklist;
+    use globset::GlobBuilder;
+    use std::path::Path;
+
+    // Canonicalize the base path
+    let base_path = dunce::canonicalize(Path::new(&request.path)).map_err(|e| {
+        if e.kind() == std::io::ErrorKind::NotFound {
+            ToolError::NotFound {
+                path: request.path.clone(),
+            }
+        } else {
+            ToolError::Io(e)
+        }
+    })?;
+
+    // Compile regex pattern
+    let regex = RegexBuilder::new(&request.pattern)
+        .case_insensitive(request.ignore_case)
+        .build()
+        .map_err(|e| ToolError::InvalidInput(format!("Invalid regex pattern: {}", e)))?;
+
+    // Compile file pattern if provided
+    let file_matcher = if let Some(pattern) = &request.file_pattern {
+        let glob = GlobBuilder::new(pattern)
+            .literal_separator(false)
+            .build()
+            .map_err(|e| ToolError::InvalidInput(format!("Invalid file pattern: {}", e)))?;
+        Some(glob.compile_matcher())
+    } else {
+        None
+    };
+
+    // Compile exclude patterns
+    let exclude_compiled = if config.default_exclude_globs.is_empty() {
+        None
+    } else {
+        Some(compile_blocklist(&config.default_exclude_globs)?)
+    };
+
+    // Determine limits
+    let max_results = request.max_results.unwrap_or(config.max_results);
+    let max_bytes = config.max_bytes;
+
+    let mut matches = Vec::new();
+    let mut total_bytes = 0u64;
+    let mut truncated = false;
+
+    for entry in WalkDir::new(&base_path)
+        .follow_links(false)
+        .into_iter()
+        .filter_entry(|e| {
+            // Prune excluded entries early; a rejected directory is never descended into
+            exclude_compiled
+                .as_ref()
+                .map_or(true, |exclude| !exclude.glob_set().is_match(e.path()))
+        })
+    {
+        // Check if we've hit the result limit
+        if matches.len() >= max_results as usize {
+            truncated = true;
+            break;
+        }
+
+        let entry = match entry {
+            Ok(e) => e,
+            Err(_) => continue,
+        };
+
+        // Skip directories
+        let file_type = entry.file_type();
+        if file_type.is_dir() {
+            continue;
+        }
+
+        let entry_path = entry.path();
+
+        // Links are not followed by the walk, so a symlink that resolves to a directory
+        // is not reported as a directory above; it cannot be read as text, so skip it.
+        if file_type.is_symlink() && entry_path.is_dir() {
+            continue;
+        }
+
+        // Check file pattern if specified: relative path from base first, absolute second
+        if let Some(matcher) = &file_matcher {
+            let relative_path = entry_path.strip_prefix(&base_path).unwrap_or(entry_path);
+            if !matcher.is_match(relative_path) && !matcher.is_match(entry_path) {
+                continue;
+            }
+        }
+
+        // Remaining budgets. The guard at the top of the loop and the check after each
+        // file keep both positive; saturating arithmetic makes that explicit.
+        let found_so_far = u32::try_from(matches.len()).unwrap_or(u32::MAX);
+        let remaining_matches = max_results.saturating_sub(found_so_far);
+        let remaining_bytes = max_bytes.saturating_sub(total_bytes);
+
+        // Search this file
+        match search_file(
+            entry_path,
+            &regex,
+            request.context_before as usize,
+            request.context_after as usize,
+            remaining_matches,
+            remaining_bytes,
+        ) {
+            Ok((file_matches, file_bytes)) => {
+                total_bytes = total_bytes.saturating_add(file_bytes);
+                matches.extend(file_matches);
+
+                // Check limits
+                if matches.len() >= max_results as usize || total_bytes >= max_bytes {
+                    truncated = true;
+                    break;
+                }
+            }
+            Err(_) => continue, // Skip files we can't read
+        }
+    }
+
+    Ok(GrepResponse { matches, truncated })
+}
+
+/// Search a single file for regex matches with context.
+///
+/// Reads `path` line by line and records every line matching `regex`, together with up
+/// to `context_before` preceding lines (never reaching back past the previous match)
+/// and up to `context_after` following lines.
+///
+/// Limits:
+/// - At most `max_matches` matches are recorded. Once that many are found, reading
+///   continues only long enough to finish the after-context of the last match.
+/// - Each match is charged its line length, its before-context length and an estimate
+///   of 50 bytes per requested after-context line. A match that would push the total
+///   above `max_bytes` is dropped and the scan stops.
+///
+/// Returns the matches and the number of bytes charged against `max_bytes`. When the
+/// scan stopped because of the byte budget, the whole budget is reported as consumed
+/// so the caller can tell that results were cut short.
+///
+/// A file containing a NUL byte on any line read is treated as binary and yields
+/// `(vec![], 0)`. Lines that are not valid UTF-8 are skipped.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be opened, or if reading fails for any reason
+/// other than invalid UTF-8 (such errors would otherwise repeat forever).
+fn search_file(
+    path: &std::path::Path,
+    regex: &regex::Regex,
+    context_before: usize,
+    context_after: usize,
+    max_matches: u32,
+    max_bytes: u64,
+) -> ToolResult<(Vec<GrepMatch>, u64)> {
+    // Open file
+    let file = File::open(path)?;
+    let reader = BufReader::new(file);
+
+    let match_limit = max_matches as usize;
+    let mut matches: Vec<GrepMatch> = Vec::new();
+    let mut total_bytes = 0u64;
+
+    // Ring buffer for context_before lines
+    let mut before_buffer: VecDeque<(usize, String)> = VecDeque::new();
+    // After-context still owed to the most recent match
+    let mut after_countdown = 0usize;
+    let mut after_lines: Vec<String> = Vec::new();
+    let mut last_match_line = 0usize;
+
+    for (line_num, line_result) in reader.lines().enumerate() {
+        // Once the match limit is reached, keep reading only while the last match is
+        // still owed after-context lines.
+        let limit_reached = matches.len() >= match_limit;
+        if limit_reached && after_countdown == 0 {
+            break;
+        }
+
+        let line = match line_result {
+            Ok(l) => l,
+            // The offending bytes have been consumed; the next line can still be read
+            Err(e) if e.kind() == std::io::ErrorKind::InvalidData => continue,
+            // Any other failure (e.g. the handle is a directory) would be returned
+            // again on every call, so give up on this file instead of looping
+            Err(e) => return Err(ToolError::Io(e)),
+        };
+
+        // Check for binary file (null bytes)
+        if line.contains('\0') {
+            return Ok((Vec::new(), 0)); // Skip binary file
+        }
+
+        let line_number = line_num + 1; // 1-indexed
+
+        // If we're collecting after-context lines
+        if after_countdown > 0 {
+            after_lines.push(line.clone());
+            after_countdown -= 1;
+
+            // All requested after lines collected: attach them to the last match
+            if after_countdown == 0 {
+                if let Some(last) = matches.last_mut() {
+                    last.context_after = std::mem::take(&mut after_lines);
+                }
+            }
+        }
+
+        // Check if this line matches
+        if !limit_reached && regex.is_match(&line) {
+            // A new match interrupts the previous match's after-context: hand over
+            // what has been collected so far before starting the new match
+            if !after_lines.is_empty() {
+                if let Some(last) = matches.last_mut() {
+                    last.context_after = std::mem::take(&mut after_lines);
+                }
+            }
+
+            // Collect before-context from the ring buffer
+            let before_lines: Vec<String> = before_buffer
+                .iter()
+                .filter(|(ln, _)| *ln > last_match_line && *ln < line_number)
+                .map(|(_, l)| l.clone())
+                .collect();
+
+            // Approximate payload size; after-context is estimated at 50 bytes per line
+            let before_bytes: usize = before_lines.iter().map(String::len).sum();
+            let match_bytes = (line.len() as u64)
+                .saturating_add(before_bytes as u64)
+                .saturating_add((context_after as u64).saturating_mul(50));
+
+            if total_bytes.saturating_add(match_bytes) > max_bytes {
+                // Budget exhausted: report it as fully consumed so the caller stops
+                // and flags the response as truncated
+                total_bytes = max_bytes;
+                break;
+            }
+
+            total_bytes += match_bytes;
+
+            matches.push(GrepMatch {
+                path: path.to_path_buf(),
+                line_number,
+                line: line.clone(),
+                context_before: before_lines,
+                context_after: Vec::new(), // Will be filled later
+            });
+
+            last_match_line = line_number;
+
+            // Start collecting after-context (stays 0 when none was requested)
+            after_countdown = context_after;
+        }
+
+        // Update before-context ring buffer
+        if context_before > 0 {
+            if before_buffer.len() >= context_before {
+                before_buffer.pop_front();
+            }
+            before_buffer.push_back((line_number, line));
+        }
+    }
+
+    // The scan ended (end of file or byte budget) before the last match received all
+    // of its after-context: keep the lines that were collected
+    if !after_lines.is_empty() {
+        if let Some(last) = matches.last_mut() {
+            last.context_after = after_lines;
+        }
+    }
+
+    Ok((matches, total_bytes))
+}
@@ -0,0 +1,180 @@
+//! Directory listing with sandboxing and exclude globs.
+//!
+//! **Status**: Implemented (TOOLS-SEARCH-01), except for sandbox containment
+//!
+//! This module implements:
+//! - Directory entry listing
+//! - Sandbox containment checks (not yet enforced here; `ls` only canonicalizes the path)
+//! - Exclude glob filtering
+//! - File kind and size metadata
+
+use crate::config::SearchConfig;
+use crate::error::{ToolError, ToolResult};
+use serde::{Deserialize, Serialize};
+use std::path::PathBuf;
+
+/// Request to list directory contents (the input of `ls`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LsRequest {
+    /// Absolute path to the directory to list (`ls` canonicalizes it before reading).
+    pub path: String,
+}
+
+/// Response from listing directory contents (the output of `ls`).
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LsResponse {
+    /// Directory entries that passed exclude filtering, in directory-iteration order.
+    pub entries: Vec<LsEntry>,
+}
+
+/// A single directory entry as reported by `ls`.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct LsEntry {
+    /// Path to the entry; `ls` builds it from the canonicalized directory path.
+    pub path: PathBuf,
+
+    /// File kind (symlink is checked first, then directory; anything else is a file).
+    pub kind: FileKind,
+
+    /// File size in bytes (`None` for directories/symlinks).
+    pub size: Option<u64>,
+}
+
+/// File kind classification; variants serialize in snake_case (`file`, `dir`, `symlink`).
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "snake_case")]
+pub enum FileKind {
+    /// Regular file (in `ls`, any entry that is neither a symlink nor a directory).
+    File,
+    /// Directory.
+    Dir,
+    /// Symbolic link.
+    Symlink,
+}
+
+/// List the immediate children of a directory, applying exclude filtering.
+///
+/// Steps:
+/// 1. Canonicalizes `request.path`
+/// 2. Compiles `default_exclude_globs` from config (when non-empty)
+/// 3. Reads directory entries asynchronously (`tokio::fs::read_dir`)
+/// 4. Drops entries that match the exclude set, plus a fixed list of well-known
+///    directory names (see below)
+/// 5. Returns each remaining entry with its kind and, for regular files, its size
+///
+/// NOTE(review): no allowed-roots or blocklist check is performed here; presumably the
+/// caller validates `request.path` first — confirm.
+///
+/// ## Filtering
+///
+/// When at least one exclude glob is configured, an entry is dropped if:
+/// - its full path matches the compiled exclude set, or
+/// - its name is `target`, `.git`, `node_modules`, `__pycache__` or `.venv`
+///
+/// With an empty `default_exclude_globs` nothing is filtered, not even those names.
+///
+/// ## Path Format
+///
+/// Returns absolute paths (children of the canonicalized directory).
+///
+/// ## Error Cases
+///
+/// - Directory not found → `ToolError::NotFound`
+/// - Directory not readable → `ToolError::permission_denied(..)`
+/// - Other I/O errors, including a failure while iterating entries → `ToolError::Io`
+/// - Entries whose metadata cannot be read are skipped, not reported
+///
+/// ## See Also
+///
+/// - Task spec: `docs/building/04_acp_client/04_tasks_02_tools_and_sandboxing.md#TOOLS-SEARCH-01`
+pub async fn ls(request: LsRequest, config: &SearchConfig) -> ToolResult<LsResponse> {
+    use crate::path::blocklist::compile_blocklist;
+    use std::path::Path;
+    use tokio::fs;
+
+    // Names hidden in addition to the configured globs; patterns such as
+    // "**/target/**" match the children of "target" but not the directory itself.
+    // NOTE(review): only consulted when an exclude set exists — confirm that is intended.
+    const SKIPPED_NAMES: [&str; 5] = ["target", ".git", "node_modules", "__pycache__", ".venv"];
+
+    // Resolve the requested directory
+    let canonical_path = match dunce::canonicalize(Path::new(&request.path)) {
+        Ok(resolved) => resolved,
+        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+            return Err(ToolError::NotFound {
+                path: request.path.clone(),
+            });
+        }
+        Err(e) => return Err(ToolError::Io(e)),
+    };
+
+    // Exclude set, built only when there is something to exclude
+    let exclude_compiled = if config.default_exclude_globs.is_empty() {
+        None
+    } else {
+        Some(compile_blocklist(&config.default_exclude_globs)?)
+    };
+
+    // Open the directory stream, translating the common failure kinds
+    let mut dir_entries = fs::read_dir(&canonical_path)
+        .await
+        .map_err(|e| match e.kind() {
+            std::io::ErrorKind::NotFound => ToolError::NotFound {
+                path: request.path.clone(),
+            },
+            std::io::ErrorKind::PermissionDenied => {
+                ToolError::permission_denied(format!("Cannot read directory: {}", request.path))
+            }
+            _ => ToolError::Io(e),
+        })?;
+
+    let mut entries = Vec::new();
+
+    while let Some(entry) = dir_entries.next_entry().await? {
+        let entry_path = entry.path();
+
+        if let Some(exclude) = exclude_compiled.as_ref() {
+            // Full-path match against the configured globs
+            let hidden_by_glob = exclude.glob_set().is_match(&entry_path);
+
+            // Name match against the fixed list
+            let hidden_by_name = || {
+                entry_path.file_name().map_or(false, |name| {
+                    let name = name.to_string_lossy();
+                    SKIPPED_NAMES.iter().any(|skipped| name == *skipped)
+                })
+            };
+
+            if hidden_by_glob || hidden_by_name() {
+                continue;
+            }
+        }
+
+        // Entries whose metadata cannot be read are left out of the listing
+        let metadata = if let Ok(m) = entry.metadata().await {
+            m
+        } else {
+            continue;
+        };
+
+        // Classify the entry; only regular files carry a size
+        let (kind, size) = if metadata.is_symlink() {
+            (FileKind::Symlink, None)
+        } else if metadata.is_dir() {
+            (FileKind::Dir, None)
+        } else {
+            (FileKind::File, Some(metadata.len()))
+        };
+
+        entries.push(LsEntry {
+            path: entry_path,
+            kind,
+            size,
+        });
+    }
+
+    Ok(LsResponse { entries })
+}