sync from monorepo @ ffee08f2

This commit is contained in:
2026-05-09 19:52:44 +02:00
parent e20542a40e
commit bf5a79d931
177 changed files with 242 additions and 37736 deletions
+2 -18
View File
@@ -4,19 +4,11 @@ members = [
"crates/dirigent_protocol",
"crates/dirigent_core",
"crates/dirigent_tools",
"crates/dirigent_fermata",
"crates/dirigent_auth",
"crates/dirigent_config",
"crates/dirigent_acp_api",
"crates/dirigent_archivist",
"crates/dirigent_process",
"crates/dirigent_taskrunner",
"crates/dirigent_anth",
"crates/dirigent_inspector",
"crates/dirigent_matrix",
"crates/dirigent_zed",
"crates/dirigent_chatgpt",
"crates/dirigent_codex",
"crates/dirigent_process",
"crates/opencode_client",
]
@@ -31,17 +23,9 @@ unused_assignments = "allow"
dirigent_protocol = { path = "crates/dirigent_protocol" }
dirigent_core = { path = "crates/dirigent_core" }
dirigent_tools = { path = "crates/dirigent_tools" }
dirigent_fermata = { path = "crates/dirigent_fermata" }
dirigent_auth = { path = "crates/dirigent_auth" }
dirigent_config = { path = "crates/dirigent_config" }
dirigent_acp_api = { path = "crates/dirigent_acp_api" }
dirigent_archivist = { path = "crates/dirigent_archivist" }
dirigent_process = { path = "crates/dirigent_process" }
dirigent_taskrunner = { path = "crates/dirigent_taskrunner" }
dirigent_anth = { path = "crates/dirigent_anth" }
dirigent_inspector = { path = "crates/dirigent_inspector" }
dirigent_matrix = { path = "crates/dirigent_matrix", default-features = true }
dirigent_zed = { path = "crates/dirigent_zed" }
dirigent_chatgpt = { path = "crates/dirigent_chatgpt" }
dirigent_codex = { path = "crates/dirigent_codex" }
dirigent_process = { path = "crates/dirigent_process" }
opencode_client = { path = "crates/opencode_client" }
+9 -18
View File
@@ -29,11 +29,10 @@ These tools are developed in this monorepo but distributed as independent reposi
</p>
**Layers top-to-bottom:**
- **Standalone Tools** — installable from their own repositories; depend on foundation crates
- **Orchestration** — multi-connector runtime, ACP server, task management, archival
- **Foundation** — protocol types, tool sandbox, configuration, auth
- **Integrations** — Matrix, Zed, and other external system connectors
- **Parsers** — readers for third-party session formats (OpenCode, ChatGPT, Codex)
- **Consumers** *(shadow)* — server assembly, web app, integrations — not in this repo
- **Standalone Tools** — installable from their own repositories; depend on these crates
- **Orchestration** — connector runtime, ACP server, introspection
- **Foundation** — protocol types, tool sandbox, configuration, auth, process management
---
@@ -41,22 +40,14 @@ These tools are developed in this monorepo but distributed as independent reposi
| Crate | Maturity | Description |
|-------|----------|-------------|
| `dirigent_protocol` | beta | ACP protocol types — messages, events, and RPC definitions |
| `dirigent_core` | beta | Multi-connector orchestration runtime |
| `dirigent_tools` | concept | Tool sandbox and execution abstractions |
| `dirigent_fermata` | production | Policy gate for AI coding agents (`.botignore` / `botignore.toml`) |
| `dirigent_auth` | concept | User authorization model |
| `dirigent_config` | beta | Configuration management |
| `dirigent_protocol` | beta | ACP protocol types — messages, events, and RPC definitions |
| `dirigent_acp_api` | beta | ACP server for incoming agent connections |
| `dirigent_archivist` | production | Event-driven session archival |
| `dirigent_inspector` | concept | Runtime introspection tree |
| `dirigent_config` | beta | Configuration management |
| `dirigent_auth` | concept | User authorization model |
| `dirigent_process` | beta | Child process management |
| `dirigent_taskrunner` | beta | Background task runner |
| `dirigent_anth` | production | Claude Code JSONL session parser |
| `dirigent_inspector` | concept | Session inspection tools |
| `dirigent_matrix` | concept | Matrix integration for session sharing |
| `dirigent_zed` | concept | Zed editor integration |
| `dirigent_chatgpt` | beta | ChatGPT `conversations.json` parser |
| `dirigent_codex` | beta | OpenAI Codex session parser |
| `dirigent_tools` | concept | Tool sandbox and execution abstractions |
| `opencode_client` | beta | OpenCode.ai HTTP client |
---
+68 -72
View File
@@ -1,86 +1,82 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 720 520" width="720" height="520" font-family="system-ui, sans-serif" font-size="11">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 720 440" width="720" height="440" font-family="system-ui, sans-serif" font-size="11">
<defs>
<marker id="arr" markerWidth="8" markerHeight="6" refX="8" refY="3" orient="auto">
<path d="M0,0 L8,3 L0,6" fill="none" stroke="#666" stroke-width="1"/>
</marker>
</defs>
<rect width="720" height="520" rx="8" fill="#f8f9fa"/>
<rect width="720" height="440" rx="8" fill="#f8f9fa"/>
<text x="360" y="24" text-anchor="middle" font-size="14" font-weight="bold" fill="#1a1a2e">Dirigent package architecture</text>
<!-- Layer: External Tools (top) -->
<rect x="20" y="40" width="680" height="70" rx="8" fill="#e8f0fe" stroke="#4285f4" stroke-width="1.5"/>
<text x="30" y="58" font-size="10" font-weight="600" fill="#4285f4">STANDALONE TOOLS</text>
<rect x="40" y="66" width="130" height="32" rx="6" fill="#fff" stroke="#4285f4"/>
<text x="105" y="86" text-anchor="middle" fill="#333" font-weight="600">fermata</text>
<rect x="200" y="66" width="130" height="32" rx="6" fill="#fff" stroke="#4285f4"/>
<text x="265" y="86" text-anchor="middle" fill="#333" font-weight="600">dirigate</text>
<rect x="360" y="66" width="130" height="32" rx="6" fill="#fff" stroke="#4285f4"/>
<text x="425" y="86" text-anchor="middle" fill="#333" font-weight="600">anth</text>
<text x="530" y="80" fill="#4285f4" font-size="10" font-style="italic">← own repos, installable</text>
<!-- Layer: Consumers (shadow — not in this repo) -->
<rect x="20" y="40" width="680" height="55" rx="8" fill="#f0f0f0" stroke="#999" stroke-width="1" stroke-dasharray="4,3"/>
<text x="30" y="56" font-size="10" font-weight="600" fill="#999">CONSUMERS (not in this repo)</text>
<rect x="40" y="62" width="110" height="24" rx="5" fill="#f8f8f8" stroke="#bbb" stroke-dasharray="3,2"/>
<text x="95" y="78" text-anchor="middle" fill="#999" font-size="9">server assembly</text>
<rect x="165" y="62" width="100" height="24" rx="5" fill="#f8f8f8" stroke="#bbb" stroke-dasharray="3,2"/>
<text x="215" y="78" text-anchor="middle" fill="#999" font-size="9">web application</text>
<rect x="280" y="62" width="85" height="24" rx="5" fill="#f8f8f8" stroke="#bbb" stroke-dasharray="3,2"/>
<text x="322" y="78" text-anchor="middle" fill="#999" font-size="9">API layer</text>
<rect x="380" y="62" width="85" height="24" rx="5" fill="#f8f8f8" stroke="#bbb" stroke-dasharray="3,2"/>
<text x="422" y="78" text-anchor="middle" fill="#999" font-size="9">integrations</text>
<rect x="480" y="62" width="70" height="24" rx="5" fill="#f8f8f8" stroke="#bbb" stroke-dasharray="3,2"/>
<text x="515" y="78" text-anchor="middle" fill="#999" font-size="9">archival</text>
<rect x="565" y="62" width="120" height="24" rx="5" fill="#f8f8f8" stroke="#bbb" stroke-dasharray="3,2"/>
<text x="625" y="78" text-anchor="middle" fill="#999" font-size="9">parsers + importers</text>
<!-- Layer: External Tools -->
<rect x="20" y="110" width="680" height="70" rx="8" fill="#e8f0fe" stroke="#4285f4" stroke-width="1.5"/>
<text x="30" y="128" font-size="10" font-weight="600" fill="#4285f4">STANDALONE TOOLS</text>
<rect x="40" y="136" width="130" height="32" rx="6" fill="#fff" stroke="#4285f4"/>
<text x="105" y="156" text-anchor="middle" fill="#333" font-weight="600">fermata</text>
<rect x="200" y="136" width="130" height="32" rx="6" fill="#fff" stroke="#4285f4"/>
<text x="265" y="156" text-anchor="middle" fill="#333" font-weight="600">dirigate</text>
<rect x="360" y="136" width="130" height="32" rx="6" fill="#fff" stroke="#4285f4"/>
<text x="425" y="156" text-anchor="middle" fill="#333" font-weight="600">anth</text>
<text x="530" y="150" fill="#4285f4" font-size="10" font-style="italic">&#x2190; own repos, installable</text>
<!-- Layer: Orchestration -->
<rect x="20" y="130" width="680" height="80" rx="8" fill="#fef9e7" stroke="#f0ad4e" stroke-width="1.5"/>
<text x="30" y="148" font-size="10" font-weight="600" fill="#b9770e">ORCHESTRATION</text>
<rect x="40" y="158" width="140" height="36" rx="6" fill="#fff" stroke="#f0ad4e"/>
<text x="110" y="172" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_core</text>
<text x="110" y="185" text-anchor="middle" fill="#888" font-size="9">multi-connector runtime</text>
<rect x="210" y="158" width="140" height="36" rx="6" fill="#fff" stroke="#f0ad4e"/>
<text x="280" y="172" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_acp_api</text>
<text x="280" y="185" text-anchor="middle" fill="#888" font-size="9">ACP server</text>
<rect x="380" y="158" width="140" height="36" rx="6" fill="#fff" stroke="#f0ad4e"/>
<text x="450" y="172" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_taskrunner</text>
<text x="450" y="185" text-anchor="middle" fill="#888" font-size="9">background tasks</text>
<rect x="550" y="158" width="140" height="36" rx="6" fill="#fff" stroke="#f0ad4e"/>
<text x="620" y="172" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_archivist</text>
<text x="620" y="185" text-anchor="middle" fill="#888" font-size="9">session archival</text>
<rect x="20" y="200" width="680" height="80" rx="8" fill="#fef9e7" stroke="#f0ad4e" stroke-width="1.5"/>
<text x="30" y="218" font-size="10" font-weight="600" fill="#b9770e">ORCHESTRATION</text>
<rect x="40" y="228" width="200" height="36" rx="6" fill="#fff" stroke="#f0ad4e"/>
<text x="140" y="242" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_core</text>
<text x="140" y="255" text-anchor="middle" fill="#888" font-size="9">connector runtime</text>
<rect x="270" y="228" width="170" height="36" rx="6" fill="#fff" stroke="#f0ad4e"/>
<text x="355" y="242" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_acp_api</text>
<text x="355" y="255" text-anchor="middle" fill="#888" font-size="9">ACP server</text>
<rect x="470" y="228" width="220" height="36" rx="6" fill="#fff" stroke="#f0ad4e"/>
<text x="580" y="242" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_inspector</text>
<text x="580" y="255" text-anchor="middle" fill="#888" font-size="9">introspection tree</text>
<!-- Layer: Foundation -->
<rect x="20" y="230" width="680" height="80" rx="8" fill="#e8f8f0" stroke="#1e8449" stroke-width="1.5"/>
<text x="30" y="248" font-size="10" font-weight="600" fill="#1e8449">FOUNDATION</text>
<rect x="40" y="258" width="140" height="36" rx="6" fill="#fff" stroke="#1e8449"/>
<text x="110" y="272" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_protocol</text>
<text x="110" y="285" text-anchor="middle" fill="#888" font-size="9">ACP types + messages</text>
<rect x="210" y="258" width="140" height="36" rx="6" fill="#fff" stroke="#1e8449"/>
<text x="280" y="272" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_tools</text>
<text x="280" y="285" text-anchor="middle" fill="#888" font-size="9">tool sandbox</text>
<rect x="380" y="258" width="140" height="36" rx="6" fill="#fff" stroke="#1e8449"/>
<text x="450" y="272" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_config</text>
<text x="450" y="285" text-anchor="middle" fill="#888" font-size="9">configuration</text>
<rect x="550" y="258" width="140" height="36" rx="6" fill="#fff" stroke="#1e8449"/>
<text x="620" y="272" text-anchor="middle" fill="#333" font-size="10" font-weight="600">dirigent_auth</text>
<text x="620" y="285" text-anchor="middle" fill="#888" font-size="9">authorization</text>
<!-- Layer: Integrations -->
<rect x="20" y="330" width="680" height="80" rx="8" fill="#f3e8fd" stroke="#8e44ad" stroke-width="1.5"/>
<text x="30" y="348" font-size="10" font-weight="600" fill="#8e44ad">INTEGRATIONS</text>
<rect x="40" y="358" width="100" height="36" rx="6" fill="#fff" stroke="#8e44ad"/>
<text x="90" y="372" text-anchor="middle" fill="#333" font-size="10" font-weight="600">matrix</text>
<text x="90" y="385" text-anchor="middle" fill="#888" font-size="9">session sharing</text>
<rect x="160" y="358" width="100" height="36" rx="6" fill="#fff" stroke="#8e44ad"/>
<text x="210" y="372" text-anchor="middle" fill="#333" font-size="10" font-weight="600">langfuse</text>
<text x="210" y="385" text-anchor="middle" fill="#888" font-size="9">observability</text>
<rect x="280" y="358" width="100" height="36" rx="6" fill="#fff" stroke="#8e44ad"/>
<text x="330" y="372" text-anchor="middle" fill="#333" font-size="10" font-weight="600">zed</text>
<text x="330" y="385" text-anchor="middle" fill="#888" font-size="9">editor</text>
<!-- Layer: Parsers -->
<rect x="20" y="430" width="680" height="70" rx="8" fill="#fdecea" stroke="#c0392b" stroke-width="1.5" stroke-dasharray="6,3"/>
<text x="30" y="448" font-size="10" font-weight="600" fill="#c0392b">PARSERS (third-party format readers)</text>
<rect x="40" y="458" width="120" height="30" rx="6" fill="#fff" stroke="#c0392b" stroke-dasharray="4,2"/>
<text x="100" y="477" text-anchor="middle" fill="#333" font-size="10">opencode_client</text>
<rect x="180" y="458" width="120" height="30" rx="6" fill="#fff" stroke="#c0392b" stroke-dasharray="4,2"/>
<text x="240" y="477" text-anchor="middle" fill="#333" font-size="10">dirigent_chatgpt</text>
<rect x="320" y="458" width="120" height="30" rx="6" fill="#fff" stroke="#c0392b" stroke-dasharray="4,2"/>
<text x="380" y="477" text-anchor="middle" fill="#333" font-size="10">dirigent_codex</text>
<rect x="460" y="458" width="120" height="30" rx="6" fill="#fff" stroke="#c0392b" stroke-dasharray="4,2"/>
<text x="520" y="477" text-anchor="middle" fill="#333" font-size="10">dirigent_inspector</text>
<rect x="20" y="300" width="680" height="80" rx="8" fill="#e8f8f0" stroke="#1e8449" stroke-width="1.5"/>
<text x="30" y="318" font-size="10" font-weight="600" fill="#1e8449">FOUNDATION</text>
<rect x="40" y="328" width="120" height="36" rx="6" fill="#fff" stroke="#1e8449"/>
<text x="100" y="342" text-anchor="middle" fill="#333" font-size="10" font-weight="600">protocol</text>
<text x="100" y="355" text-anchor="middle" fill="#888" font-size="9">ACP types</text>
<rect x="180" y="328" width="90" height="36" rx="6" fill="#fff" stroke="#1e8449"/>
<text x="225" y="342" text-anchor="middle" fill="#333" font-size="10" font-weight="600">tools</text>
<text x="225" y="355" text-anchor="middle" fill="#888" font-size="9">sandbox</text>
<rect x="290" y="328" width="90" height="36" rx="6" fill="#fff" stroke="#1e8449"/>
<text x="335" y="342" text-anchor="middle" fill="#333" font-size="10" font-weight="600">config</text>
<text x="335" y="355" text-anchor="middle" fill="#888" font-size="9">paths + toml</text>
<rect x="400" y="328" width="80" height="36" rx="6" fill="#fff" stroke="#1e8449"/>
<text x="440" y="342" text-anchor="middle" fill="#333" font-size="10" font-weight="600">auth</text>
<text x="440" y="355" text-anchor="middle" fill="#888" font-size="9">accounts</text>
<rect x="500" y="328" width="85" height="36" rx="6" fill="#fff" stroke="#1e8449"/>
<text x="542" y="342" text-anchor="middle" fill="#333" font-size="10" font-weight="600">process</text>
<text x="542" y="355" text-anchor="middle" fill="#888" font-size="9">lifecycle</text>
<rect x="605" y="328" width="85" height="36" rx="6" fill="#fff" stroke="#1e8449"/>
<text x="647" y="342" text-anchor="middle" fill="#333" font-size="10" font-weight="600">opencode</text>
<text x="647" y="355" text-anchor="middle" fill="#888" font-size="9">HTTP client</text>
<!-- Dependency arrows -->
<line x1="105" y1="98" x2="110" y2="158" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="265" y1="98" x2="110" y2="158" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="265" y1="98" x2="280" y2="158" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="110" y1="194" x2="110" y2="258" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="110" y1="194" x2="280" y2="258" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="280" y1="194" x2="110" y2="258" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="450" y1="194" x2="450" y2="258" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="265" y1="168" x2="140" y2="228" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="265" y1="168" x2="355" y2="228" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="140" y1="264" x2="100" y2="328" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="140" y1="264" x2="225" y2="328" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<line x1="355" y1="264" x2="100" y2="328" stroke="#666" stroke-width="1" marker-end="url(#arr)"/>
<!-- Footer -->
<text x="360" y="408" text-anchor="middle" font-size="9" fill="#999">Shadow boxes = downstream consumers not included in this repository</text>
<text x="360" y="422" text-anchor="middle" font-size="9" fill="#999">9 crates &#x2014; minimal set for dirigate and standalone tool dependencies</text>
</svg>

Before

Width:  |  Height:  |  Size: 6.9 KiB

After

Width:  |  Height:  |  Size: 6.6 KiB

-148
View File
@@ -1,148 +0,0 @@
# Package: dirigent_anth
Claude Code JSONL session parser and toolkit.
## Quick Facts
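- **Type**: Library with two binaries (`anth_bear`, `anth_usage`)
- **Main Entry**: src/lib.rs
- **Dependencies**: serde, serde_json, chrono, chrono-tz, uuid, camino, dirs, thiserror, tracing, regex, portable-pty, vt100 (optional: dirigent_config, enabled by the `dirigent-paths` feature)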
- **Status**: Core parsing complete — ready for downstream consumers
## Purpose
Reads Claude Code's local JSONL session storage (`~/.claude/projects/`) and produces typed, deduplicated, correlated Rust data structures. The types are the product — downstream consumers (archivist import, shell usage analyzers, session browsers) depend on these structs.
## Key Features
- **Session Discovery**: Scan `~/.claude/projects/` for all Claude Code projects and sessions
- **JSONL Parsing**: Lenient line-by-line parser that handles unknown fields and message types
- **Streaming Dedup**: Collapse streamed assistant messages to their final version
- **Tool Correlation**: ID-based pairing of tool_use → tool_result across parallel calls
- **Conversation Tree**: Reconstruct uuid/parentUuid threading with branch detection
- **Noise Classification**: Identify meta messages, warmup, interruptions, API errors
- **Sub-Agent Loading**: Recursive parsing of sub-agent JSONL with metadata
- **Timestamp Parsing**: Handle ISO 8601, Unix seconds, and Unix milliseconds
## Architecture
### Design Principles
1. **Types are the product** — Well-typed Rust structs that downstream consumers import
2. **Lenient parsing** — Unknown fields ignored, unknown message types logged and skipped
3. **Stream-oriented** — Line-by-line BufReader parsing, never loads entire files
4. **Sync-first** — File parsing is CPU-bound; no async overhead
5. **Cross-platform** — camino::Utf8PathBuf throughout for Windows/Unix compatibility
### Module Organization
- **`types.rs`** — All public data types (Content, ContentBlock, RawMessage variants, ToolCall, etc.)
- **`error.rs`** — AntError enum with I/O, JSON parse, home-not-found, invalid-path variants
- **`parser.rs`** — JSONL line parser and file parser with lenient error handling
- **`dedup.rs`** — Streaming deduplication of assistant messages by uuid
- **`correlation.rs`** — Tool call ↔ result pairing by tool_use_id
- **`tree.rs`** — Conversation tree from uuid/parentUuid relationships
- **`noise.rs`** — Noise pattern classification (meta, warmup, interruptions, etc.)
- **`discovery.rs`** — Filesystem scanning for Claude projects and sessions
- **`subagent.rs`** — Sub-agent JSONL and metadata loading
- **`util.rs`** — Timestamp parsing utilities
## Public API
### Quick Start
```rust
use dirigent_anth::{discover_claude_home, discover_projects, load_session};
// Discover all projects
let home = discover_claude_home()?;
let projects = discover_projects(&home)?;
// Load a session with full parsing
for project in &projects {
for session_ref in &project.sessions {
let session = load_session(session_ref)?;
println!("Messages: {}, Tools: {}, Subagents: {}",
session.messages.len(),
session.tool_exchanges.len(),
session.subagents.len());
}
}
```
### Key Functions
| Function | Purpose |
|----------|---------|
| `discover_claude_home()` | Find `~/.claude/` directory |
| `discover_projects(home)` | Scan for all project directories |
| `parse_session(path)` | Parse a JSONL file into messages |
| `parse_session_deduped(path)` | Parse with streaming dedup applied |
| `dedup_messages(msgs)` | Deduplicate streamed assistant messages |
| `correlate_tools(msgs)` | Pair tool calls with results by ID |
| `ConversationTree::build(msgs)` | Build conversation tree |
| `classify_noise(msg)` | Classify a message as noise |
| `load_subagents(dir)` | Load sub-agent sessions from artifacts |
| `load_session(ref)` | Full parse: dedup + correlate + tree + subagents |
| `parse_timestamp(value)` | Parse ISO/Unix timestamps |
## Data Model
### Claude Code JSONL Format
Each line in `~/.claude/projects/<encoded-path>/<session-uuid>.jsonl` is a JSON object with a `type` field discriminator. Five types: `user`, `assistant`, `progress`, `system`, `queue-operation`.
- **Outer wrapper**: camelCase fields (sessionId, parentUuid, isSidechain, gitBranch)
- **Inner message body**: snake_case fields (stop_reason, tool_use_id, is_error)
- **Content**: Either a plain string or array of typed content blocks
### Content Blocks
| Type | Fields |
|------|--------|
| text | `text` |
| tool_use | `id`, `name`, `input` |
| tool_result | `tool_use_id`, `content`, `is_error` |
| thinking | `thinking` |
| image | `source` |
Unknown content block types are silently dropped (lenient deserialization).
## Testing
```bash
cargo test --package dirigent_anth
```
Tests use synthetic JSONL fixtures in `tests/fixtures/`:
- `minimal_session.jsonl` — Basic session with all message types
- `streaming_dedup.jsonl` — Streaming dedup scenario
- `tool_correlation.jsonl` — Parallel and sequential tool calls
- `branching_tree.jsonl` — Conversation with branches
- `noise_patterns.jsonl` — All noise pattern types
- `subagent/` — Sub-agent session with parent and metadata
## Error Handling
- Individual unparseable JSONL lines are logged and skipped (lenient)
- I/O errors and missing directories are propagated as AntError
- Unknown message types are skipped via serde
- Unknown content blocks are silently filtered
## Related Packages
- **dirigent_archivist** — Future consumer for session import
- No required dependencies on other dirigent packages (standalone); `dirigent_config` is an optional dependency behind the `dirigent-paths` feature
## Future Enhancements
- Bash command analysis module (shell usage analytics)
- Archivist event transform/import
- CLI tool with scan/analyze/import subcommands
- SQLite caching layer
- Watch mode for new session monitoring
## Documentation
- **Package README**: `./README.md` - User-facing overview
- **API Docs**: Run `cargo doc --package dirigent_anth --open`
- **Design Plan**: `docs/superpowers/plans/2026-03-23-dirigent-ant-design.md`
-37
View File
@@ -1,37 +0,0 @@
[package]
name = "dirigent_anth"
version = "0.1.0"
edition = "2021"
[lib]
path = "src/lib.rs"
[[bin]]
name = "anth_bear"
path = "src/bin/anth.rs"
[[bin]]
name = "anth_usage"
path = "src/bin/anth_usage.rs"
[features]
default = []
dirigent-paths = ["dep:dirigent_config"]
[dependencies]
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
chrono = { version = "0.4", features = ["serde"] }
chrono-tz = "0.10"
uuid = { version = "1.11", features = ["serde"] }
camino = { version = "1.1", features = ["serde1"] }
dirs = "6.0"
thiserror = "2.0"
tracing = "0.1"
regex = "1"
portable-pty = "0.8"
vt100 = "0.15"
dirigent_config = { path = "../dirigent_config", optional = true }
[dev-dependencies]
tempfile = "3.0"
-331
View File
@@ -1,331 +0,0 @@
use chrono::{Datelike, NaiveDate, NaiveTime, Utc};
use chrono_tz::Tz;
use serde::Serialize;
/// Structured data extracted from a usage screen.
#[derive(Debug, Serialize, Default)]
pub struct UsageData {
/// Gauges in the order their header lines appear on the screen.
pub gauges: Vec<UsageGauge>,
/// Contribution breakdown; left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub contributions: Option<ContributionInfo>,
}
/// One usage gauge: a header line plus the "N% used" value found below it.
#[derive(Debug, Serialize)]
pub struct UsageGauge {
/// Trimmed header line, e.g. a line starting with "Current session" or "Current week".
pub name: String,
/// Number parsed from the line ending in "% used".
pub percent_used: u32,
/// Raw text following "Resets " on the gauge's reset line, if one was found.
#[serde(skip_serializing_if = "Option::is_none")]
pub resets: Option<String>,
/// `resets` converted to an RFC 3339 UTC timestamp, when it could be parsed.
#[serde(skip_serializing_if = "Option::is_none")]
pub resets_iso: Option<String>,
}
/// Breakdown of what contributed to the usage; empty lists are not serialized.
#[derive(Debug, Serialize, Default)]
pub struct ContributionInfo {
/// Lines of the form "N% of your usage ...", in screen order.
#[serde(skip_serializing_if = "Vec::is_empty")]
pub factors: Vec<ContributionFactor>,
/// Rows of the table that follows a line starting with "Subagents".
#[serde(skip_serializing_if = "Vec::is_empty")]
pub subagents: Vec<SubagentUsage>,
}
/// A single "N% of your usage ..." statement.
#[derive(Debug, Serialize)]
pub struct ContributionFactor {
/// The full line the factor was parsed from.
pub description: String,
/// The number in front of the first `%` on that line.
pub percent: u32,
}
/// One row of the subagent table.
#[derive(Debug, Serialize)]
pub struct SubagentUsage {
/// Row text in front of the percentage, trimmed.
pub name: String,
/// The number directly in front of the last `%` in the row.
pub percent: u32,
}
/// Result of [`process_usage_screen`]: the cropped screen text and the data parsed from it.
pub struct ProcessedOutput {
/// The cropped screen lines joined with `\n`.
pub raw_screen: String,
/// Structured data extracted from the same cropped lines.
pub data: UsageData,
}
/// Crops a captured usage screen to its meaningful region and parses it.
///
/// The region starts at the first line whose trimmed text begins with `─` and
/// contains at least six `─` characters (or at the first line when there is no
/// such rule), and ends right after the last non-blank line.
pub fn process_usage_screen(raw: &str) -> ProcessedOutput {
    let all_lines: Vec<&str> = raw.lines().collect();

    let is_rule = |line: &&str| {
        let text = line.trim();
        text.starts_with('─') && text.matches('─').count() >= 6
    };
    let first = all_lines.iter().position(is_rule).unwrap_or(0);

    let past_last = match all_lines.iter().rposition(|line| !line.trim().is_empty()) {
        Some(idx) => idx + 1,
        None => all_lines.len(),
    };

    let cropped = &all_lines[first..past_last];
    ProcessedOutput {
        raw_screen: cropped.join("\n"),
        data: extract_usage_data(cropped),
    }
}
/// Walks the cropped screen once and collects gauges, contribution factors and
/// the subagent table.
///
/// Every line is checked against all three patterns independently, so a single
/// line may contribute to more than one of them.
fn extract_usage_data(lines: &[&str]) -> UsageData {
    let mut usage = UsageData::default();

    for (idx, line) in lines.iter().enumerate() {
        let text = line.trim();

        // A gauge header names the window and carries no percentage itself;
        // the value is looked up on the lines that follow it.
        let is_gauge_header = !text.contains('%')
            && (text.starts_with("Current session") || text.starts_with("Current week"));
        if is_gauge_header {
            if let Some(gauge) = find_gauge(&lines[idx..], text) {
                usage.gauges.push(gauge);
            }
        }

        if let Some(factor) = parse_contribution_factor(text) {
            let info = usage
                .contributions
                .get_or_insert_with(ContributionInfo::default);
            info.factors.push(factor);
        }

        if text.starts_with("Subagents") {
            // The table rows start on the line after the "Subagents" heading.
            let table = parse_subagent_table(&lines[idx + 1..]);
            if !table.is_empty() {
                let info = usage
                    .contributions
                    .get_or_insert_with(ContributionInfo::default);
                info.subagents = table;
            }
        }
    }

    usage
}
/// Builds the gauge for the header at `lines[0]`.
///
/// Scans up to four lines after the header for a "N% used" value and a
/// "Resets ..." line; when several lines match, the last one wins. Returns
/// `None` when no percentage was found.
fn find_gauge(lines: &[&str], name: &str) -> Option<UsageGauge> {
    let mut percent_used: Option<u32> = None;
    let mut resets: Option<String> = None;

    for text in lines.iter().skip(1).take(4).map(|line| line.trim()) {
        percent_used = extract_percent_used(text).or(percent_used);
        if text.starts_with("Resets ") {
            resets = Some(text.trim_start_matches("Resets ").to_string());
        }
    }

    let percent_used = percent_used?;
    let resets_iso = resets.as_deref().and_then(parse_reset_to_iso);
    Some(UsageGauge {
        name: name.to_string(),
        percent_used,
        resets,
        resets_iso,
    })
}
/// Parse reset strings like:
/// "12:30pm (Europe/Vienna)" → today at 12:30 in that tz
/// "May 12, 9am (Europe/Vienna)" → May 12 at 09:00
/// "May 12, 9:30am (Europe/Vienna)" → May 12 at 09:30
/// "Jun 1, 12pm (America/New_York)" → Jun 1 at 12:00
///
/// Claude Code uses JS `Intl.DateTimeFormat` style output.
///
/// Returns the instant as an RFC 3339 string in UTC, or `None` when the
/// string has no `(timezone)` suffix, the timezone is unknown, or the date or
/// time cannot be parsed. Malformed input never panics.
///
/// NOTE(review): a month/day without a year is resolved against the current
/// year in the target timezone, so a reset that falls in the next calendar
/// year would be dated one year early — confirm whether that matters.
fn parse_reset_to_iso(s: &str) -> Option<String> {
    // Split off the timezone from the trailing parentheses.
    let open = s.rfind('(')?;
    let close = s.rfind(')')?;
    // Checked slicing: if the last `)` sits before the last `(` the range is
    // inverted, and plain indexing would panic instead of rejecting the input.
    let tz_str = s.get(open + 1..close)?.trim();
    let datetime_part = s[..open].trim();

    let tz: Tz = tz_str.parse().ok()?;
    let now = Utc::now().with_timezone(&tz);

    let (date, time_str) = match datetime_part.split_once(',') {
        // "May 12, 9am" or "May 12, 9:30am"
        Some((date_part, time_part)) => {
            let date = parse_month_day(date_part.trim(), now.year())?;
            (date, time_part.trim())
        }
        // "12:30pm" — today in the given timezone
        None => (now.date_naive(), datetime_part),
    };

    let time = parse_12h_time(time_str)?;
    // `earliest()` picks the first candidate for an ambiguous local time and
    // yields `None` for a local time that does not exist in that zone.
    let local = date.and_time(time).and_local_timezone(tz).earliest()?;
    Some(local.with_timezone(&Utc).to_rfc3339())
}
/// Turns a "<month> <day>" string such as "May 12", "Jun 1" or "December 25"
/// into a date in `year`.
///
/// The month may be the three-letter abbreviation or the full English name,
/// in any letter case. Returns `None` unless the input is exactly two
/// whitespace-separated words forming a valid calendar date.
fn parse_month_day(s: &str, year: i32) -> Option<NaiveDate> {
    // (abbreviation, full name), indexed by month number - 1.
    const MONTHS: [(&str, &str); 12] = [
        ("jan", "january"),
        ("feb", "february"),
        ("mar", "march"),
        ("apr", "april"),
        ("may", "may"),
        ("jun", "june"),
        ("jul", "july"),
        ("aug", "august"),
        ("sep", "september"),
        ("oct", "october"),
        ("nov", "november"),
        ("dec", "december"),
    ];

    let mut words = s.split_whitespace();
    let (month_word, day_word) = match (words.next(), words.next(), words.next()) {
        (Some(month), Some(day), None) => (month, day),
        _ => return None,
    };

    let wanted = month_word.to_lowercase();
    let index = MONTHS
        .iter()
        .position(|(short, long)| wanted == *short || wanted == *long)?;
    let month = index as u32 + 1;

    let day = day_word.parse::<u32>().ok()?;
    NaiveDate::from_ymd_opt(year, month, day)
}
/// Parse "9am", "12pm", "9:30am", "12:30pm"
///
/// Matching is case-insensitive and surrounding whitespace is ignored.
/// "12am" maps to 00:00 and "12pm" to 12:00; every other pm hour is shifted
/// by twelve. Returns `None` when the am/pm suffix is missing, the numbers do
/// not parse, or the resulting hour/minute is not a valid time of day.
fn parse_12h_time(s: &str) -> Option<NaiveTime> {
    let s = s.trim().to_lowercase();
    let (num_part, is_pm) = if let Some(rest) = s.strip_suffix("pm") {
        (rest, true)
    } else if let Some(rest) = s.strip_suffix("am") {
        (rest, false)
    } else {
        return None;
    };

    let (hour, minute) = match num_part.split_once(':') {
        Some((h, m)) => (h.parse::<u32>().ok()?, m.parse::<u32>().ok()?),
        None => (num_part.parse::<u32>().ok()?, 0),
    };

    let hour_24 = match (hour, is_pm) {
        (12, true) => 12,
        (12, false) => 0,
        // Checked add: a huge hour must be rejected, not overflow (a panic in
        // debug builds, a wrapped and seemingly valid hour in release builds).
        (h, true) => h.checked_add(12)?,
        (h, false) => h,
    };
    NaiveTime::from_hms_opt(hour_24, minute, 0)
}
/// Extracts `N` from a line that ends in "N% used".
///
/// Anything before the number (for example a progress bar) is ignored: only
/// the last whitespace-separated token in front of the suffix is parsed.
/// Returns `None` when the suffix is missing or that token is not a `u32`.
fn extract_percent_used(line: &str) -> Option<u32> {
    const SUFFIX: &str = "% used";

    let text = line.trim();
    if !text.ends_with(SUFFIX) {
        return None;
    }

    let head = text.trim_end_matches(SUFFIX).trim();
    let number = head.split_whitespace().next_back().unwrap_or(head);
    number.parse().ok()
}
/// Recognises a "N% of your usage ..." line.
///
/// The text in front of the first `%` must be a bare `u32` (surrounding
/// whitespace allowed); otherwise the line is rejected. The whole line is
/// kept as the factor's description.
fn parse_contribution_factor(line: &str) -> Option<ContributionFactor> {
    if !line.contains("% of your usage") {
        return None;
    }

    let (leading, _) = line.split_once('%')?;
    let percent = leading.trim().parse::<u32>().ok()?;

    Some(ContributionFactor {
        description: line.to_string(),
        percent,
    })
}
/// Parses the rows that follow a "Subagents" heading.
///
/// Reading stops at the first blank line, rule line (starting with `─`), or
/// line containing "to day" / "to cancel". A row is accepted when the token
/// directly in front of its last `%` is a `u32` and there is a non-empty name
/// before that token; rows that do not fit are skipped.
fn parse_subagent_table(lines: &[&str]) -> Vec<SubagentUsage> {
    let mut subs = Vec::new();

    for line in lines {
        let row = line.trim();
        if row.is_empty()
            || row.starts_with('─')
            || row.contains("to day")
            || row.contains("to cancel")
        {
            break;
        }

        if let Some(pct_pos) = row.rfind('%') {
            let before_pct = &row[..pct_pos];
            // Split on the last whitespace *character*. Stepping one byte past
            // its index would land inside a multi-byte whitespace char (e.g. a
            // no-break space) and make the following slice panic.
            let (name, digits) = match before_pct.rsplit_once(char::is_whitespace) {
                Some((name, digits)) => (name.trim(), digits),
                None => ("", before_pct),
            };

            if let Ok(percent) = digits.parse::<u32>() {
                if !name.is_empty() && !name.contains("% of") {
                    subs.push(SubagentUsage {
                        name: name.to_string(),
                        percent,
                    });
                }
            }
        }
    }

    subs
}
// Unit tests for the time, date and reset-string parsing helpers.
#[cfg(test)]
mod tests {
use super::*;
// 12pm stays in the 12 o'clock hour instead of being shifted by twelve.
#[test]
fn parse_time_only() {
let t = parse_12h_time("12:30pm").unwrap();
assert_eq!(t, NaiveTime::from_hms_opt(12, 30, 0).unwrap());
}
// An hour without minutes defaults to :00.
#[test]
fn parse_time_am() {
let t = parse_12h_time("9am").unwrap();
assert_eq!(t, NaiveTime::from_hms_opt(9, 0, 0).unwrap());
}
// 12am is midnight.
#[test]
fn parse_time_12am() {
let t = parse_12h_time("12am").unwrap();
assert_eq!(t, NaiveTime::from_hms_opt(0, 0, 0).unwrap());
}
// Hour and minute separated by a colon.
#[test]
fn parse_time_with_minutes() {
let t = parse_12h_time("9:30am").unwrap();
assert_eq!(t, NaiveTime::from_hms_opt(9, 30, 0).unwrap());
}
// A time-only reset string is resolved against today's date in the given
// timezone, so only the shape of the result is checked here.
#[test]
fn parse_reset_time_only() {
let iso = parse_reset_to_iso("12:30pm (Europe/Vienna)");
assert!(iso.is_some());
let iso = iso.unwrap();
assert!(iso.contains("T"));
// Should end in +00:00 (UTC via rfc3339)
assert!(iso.ends_with("+00:00"));
}
// A month/day reset string uses the current year, hence the two accepted
// UTC hours below.
#[test]
fn parse_reset_date_and_time() {
let iso = parse_reset_to_iso("May 12, 9am (Europe/Vienna)").unwrap();
assert!(iso.contains("T07:00:00") || iso.contains("T08:00:00"));
// CEST is UTC+2, CET is UTC+1 — depends on whether May 12 is summer time
}
// Abbreviated month name plus day, with an explicit year.
#[test]
fn parse_month_day_basic() {
let d = parse_month_day("May 12", 2026).unwrap();
assert_eq!(d, NaiveDate::from_ymd_opt(2026, 5, 12).unwrap());
}
}
-252
View File
@@ -1,252 +0,0 @@
//! Minimal CLI for dirigent_anth — validate parsing and search sessions.
//!
//! Usage:
//! cargo run --package dirigent_anth --bin ant # validate all sessions
//! cargo run --package dirigent_anth --bin ant -- search "query" # search user messages
//! cargo run --package dirigent_anth --bin ant -- stats # show statistics
use dirigent_anth::*;
use std::io::BufRead;
/// Entry point: locate the Claude home, discover its projects, then dispatch
/// to the requested subcommand (`validate` when none is given).
///
/// Exits with status 1 when discovery fails, when `search` has no query, or
/// when the subcommand is unknown.
fn main() {
    let cli: Vec<String> = std::env::args().skip(1).collect();

    let home = discover_claude_home().unwrap_or_else(|e| {
        eprintln!("Could not find Claude home: {e}");
        std::process::exit(1)
    });
    let projects = discover_projects(&home).unwrap_or_else(|e| {
        eprintln!("Could not discover projects: {e}");
        std::process::exit(1)
    });

    let command = cli.first().map(String::as_str);
    match command {
        None | Some("validate") => cmd_validate(&projects),
        Some("stats") => cmd_stats(&projects),
        Some("search") => match cli.get(1).map(String::as_str) {
            Some(query) if !query.is_empty() => cmd_search(&projects, query),
            _ => {
                eprintln!("Usage: ant search <query>");
                std::process::exit(1);
            }
        },
        Some(other) => {
            eprintln!("Unknown command: {other}");
            eprintln!("Commands: validate (default), search <query>, stats");
            std::process::exit(1);
        }
    }
}
/// Validate that the parser can handle all sessions without errors.
///
/// For every session this counts the raw JSONL lines that fail to parse and
/// runs the full `load_session` pipeline. A per-session detail line is printed
/// only when that session had skipped lines; load failures are reported on
/// stderr. After printing the summary, the process exits with status 1 if any
/// session failed to load.
fn cmd_validate(projects: &[ClaudeProject]) {
    let mut total_sessions = 0;
    let mut total_ok = 0;
    let mut total_messages = 0;
    let mut total_skipped_lines = 0;
    let mut errors: Vec<(String, String)> = Vec::new();

    for project in projects {
        println!(
            "Project: {} ({} sessions)",
            project.original_path,
            project.sessions.len()
        );
        for session in &project.sessions {
            total_sessions += 1;

            // First 8 bytes of the id; falls back to the whole id when it is
            // shorter or when byte 8 is not a char boundary (slicing would panic).
            let short_id = session.id.get(..8).unwrap_or(session.id.as_str());

            // Raw line-level validation: count how many lines parse vs skip.
            let (_raw_ok, raw_skip) = validate_lines(&session.jsonl_path);
            total_skipped_lines += raw_skip;

            // Full pipeline validation.
            match load_session(session) {
                Ok(parsed) => {
                    total_ok += 1;
                    total_messages += parsed.messages.len();
                    if raw_skip > 0 {
                        let branches = if parsed.tree.is_linear() {
                            "linear"
                        } else {
                            "branched"
                        };
                        // Separator between id and message count matches the
                        // error line below; previously the two were fused.
                        println!(
                            " {} — {} msgs, {} tools, {} subagents, {} | {raw_skip} lines skipped",
                            short_id,
                            parsed.messages.len(),
                            parsed.tool_exchanges.len(),
                            parsed.subagents.len(),
                            branches,
                        );
                    }
                }
                Err(e) => {
                    errors.push((session.id.clone(), e.to_string()));
                    eprintln!(" {} — ERROR: {e}", short_id);
                }
            }
        }
    }

    println!("\n--- Validation Summary ---");
    println!("Projects: {}", projects.len());
    println!("Sessions: {total_sessions} ({total_ok} ok, {} errors)", errors.len());
    println!("Messages: {total_messages}");
    if total_skipped_lines > 0 {
        println!("Skipped: {total_skipped_lines} unparseable lines");
    }
    if !errors.is_empty() {
        println!("\nErrors:");
        for (id, err) in &errors {
            println!(" {id}: {err}");
        }
        std::process::exit(1);
    }
}
/// Tally the lines of a JSONL file as `(parsed, skipped)`.
///
/// Blank lines count toward neither total. A line that cannot be read or that
/// `parse_line` rejects counts as skipped. If the file cannot be opened the
/// result is `(0, 0)`.
fn validate_lines(path: &camino::Utf8Path) -> (usize, usize) {
    let file = match std::fs::File::open(path.as_std_path()) {
        Ok(handle) => handle,
        Err(_) => return (0, 0),
    };

    let mut parsed = 0;
    let mut skipped = 0;
    for (idx, entry) in std::io::BufReader::new(file).lines().enumerate() {
        match entry {
            Err(_) => skipped += 1,
            Ok(text) if text.trim().is_empty() => {}
            // Line numbers handed to the parser are 1-based.
            Ok(text) => match parse_line(&text, idx + 1) {
                Some(_) => parsed += 1,
                None => skipped += 1,
            },
        }
    }
    (parsed, skipped)
}
/// Search message text for a query string (case-insensitive) and print matches.
///
/// Two kinds of text are searched: user messages whose content is plain text,
/// and the text blocks of assistant messages (joined with a space). User
/// messages made of content blocks and all other message kinds are ignored.
/// Sessions that fail to parse are skipped silently. A final line reports the
/// number of matches.
fn cmd_search(projects: &[ClaudeProject], query: &str) {
    let query_lower = query.to_lowercase();
    let mut hits = 0;

    for project in projects {
        for session in &project.sessions {
            let messages = match parse_session_deduped(&session.jsonl_path) {
                Ok(m) => m,
                Err(_) => continue,
            };
            // First 8 bytes of the id; falls back to the whole id when it is
            // shorter or when byte 8 is not a char boundary (slicing would panic).
            let short_id = session.id.get(..8).unwrap_or(session.id.as_str());

            for msg in &messages {
                // Role and text are determined together so the two can never
                // disagree; user text is borrowed rather than cloned.
                let (role, text): (&str, std::borrow::Cow<'_, str>) = match msg {
                    types::RawMessage::User(u) => match &u.message.content {
                        types::Content::Text(s) => ("user", std::borrow::Cow::Borrowed(s.as_str())),
                        types::Content::Blocks(_) => continue,
                    },
                    types::RawMessage::Assistant(a) => {
                        let mut parts = Vec::new();
                        for block in &a.message.content {
                            if let types::ContentBlock::Text { text } = block {
                                parts.push(text.as_str());
                            }
                        }
                        ("assistant", std::borrow::Cow::Owned(parts.join(" ")))
                    }
                    _ => continue,
                };

                if text.to_lowercase().contains(&query_lower) {
                    let preview = truncate(&text, 120);
                    println!(
                        "[{}] {} {} | {}",
                        &project.original_path,
                        short_id,
                        role,
                        preview
                    );
                    hits += 1;
                }
            }
        }
    }
    println!("\n{hits} matches for \"{query}\"");
}
/// Show aggregate statistics across all sessions.
///
/// Every session is counted, but only sessions that load successfully
/// contribute to the message, tool-call and sub-agent totals. The 15 most used
/// tools are listed by descending count; ties are ordered by tool name so the
/// output is stable between runs.
fn cmd_stats(projects: &[ClaudeProject]) {
    let mut total_sessions = 0;
    let mut total_messages = 0;
    let mut total_tools = 0;
    let mut total_subagents = 0;
    let mut tool_counts: std::collections::HashMap<String, usize> =
        std::collections::HashMap::new();

    for project in projects {
        for session in &project.sessions {
            total_sessions += 1;
            if let Ok(parsed) = load_session(session) {
                total_messages += parsed.messages.len();
                total_tools += parsed.tool_exchanges.len();
                total_subagents += parsed.subagents.len();
                for ex in &parsed.tool_exchanges {
                    // Tools are keyed by the Debug rendering of their name.
                    let name = format!("{:?}", ex.call.name);
                    *tool_counts.entry(name).or_default() += 1;
                }
            }
        }
    }

    println!("--- Statistics ---");
    println!("Projects: {}", projects.len());
    println!("Sessions: {total_sessions}");
    println!("Messages: {total_messages}");
    println!("Tool calls: {total_tools}");
    println!("Sub-agents: {total_subagents}");

    if !tool_counts.is_empty() {
        println!("\nTool usage:");
        let mut sorted: Vec<_> = tool_counts.into_iter().collect();
        // Count descending, then name ascending: without the tie-break, equal
        // counts would appear in arbitrary hash-map order.
        sorted.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
        for (name, count) in sorted.iter().take(15) {
            println!(" {name:20} {count}");
        }
    }
}
/// Flatten `s` to a single line and cap it at roughly `max` bytes.
///
/// Newlines become spaces and carriage returns are removed. If the flattened
/// text is longer than `max` bytes it is cut at the last character boundary
/// at or before `max` and `...` is appended; otherwise it is returned as is.
fn truncate(s: &str, max: usize) -> String {
    let s = s.replace('\n', " ").replace('\r', "");
    if s.len() <= max {
        return s;
    }
    // Slicing at an arbitrary byte offset panics inside a multi-byte
    // character, so back up to the nearest boundary (offset 0 always is one).
    let mut cut = max;
    while !s.is_char_boundary(cut) {
        cut -= 1;
    }
    format!("{}...", &s[..cut])
}
-192
View File
@@ -1,192 +0,0 @@
use portable_pty::{CommandBuilder, NativePtySystem, PtySize, PtySystem};
use std::io::{Read, Write};
use std::path::PathBuf;
use std::time::Duration;
/// Number of rows used for both the PTY size and the vt100 parser.
const ROWS: u16 = 80;
/// Number of columns used for the PTY size, the vt100 parser, and as the row
/// width when the screen is rendered to text.
const COLS: u16 = 120;
/// Command-line options, as filled in by `parse_args`.
struct Args {
// `--debug`: enables the `debug!` diagnostics on stderr.
debug: bool,
// `--raw`: print the processed raw screen instead of pretty-printed JSON.
raw: bool,
// `--no-trust`: exit with an error instead of confirming the trust prompt.
no_trust: bool,
// `--workdir <path>`: explicit working directory; takes priority over `--cwd`.
workdir: Option<PathBuf>,
// `--cwd`: use the current directory as the working directory.
use_cwd: bool,
}
/// Parse the process arguments into [`Args`].
///
/// Recognised flags are `--debug`, `--raw`, `--no-trust`, `--cwd` and
/// `--workdir <path>`. An unknown argument, or `--workdir` without a value,
/// prints a message plus the usage line to stderr and exits with status 2.
fn parse_args() -> Args {
    const USAGE: &str =
        "Usage: anth_usage [--debug] [--raw] [--no-trust] [--workdir <path>] [--cwd]";

    let mut args = Args {
        debug: false,
        raw: false,
        no_trust: false,
        workdir: None,
        use_cwd: false,
    };

    let mut iter = std::env::args().skip(1);
    // `while let` rather than `for`: `--workdir` pulls its value from the same iterator.
    while let Some(arg) = iter.next() {
        match arg.as_str() {
            "--debug" => args.debug = true,
            "--raw" => args.raw = true,
            "--no-trust" => args.no_trust = true,
            "--cwd" => args.use_cwd = true,
            "--workdir" => match iter.next() {
                Some(path) => args.workdir = Some(PathBuf::from(path)),
                None => {
                    // A missing value is a user error, not a bug: report it
                    // like any other bad argument instead of panicking.
                    eprintln!("--workdir requires a path argument");
                    eprintln!("{USAGE}");
                    std::process::exit(2);
                }
            },
            other => {
                eprintln!("Unknown argument: {other}");
                eprintln!("{USAGE}");
                std::process::exit(2);
            }
        }
    }
    args
}
fn resolve_workdir(args: &Args) -> PathBuf {
if let Some(ref dir) = args.workdir {
return dir.clone();
}
if args.use_cwd {
return std::env::current_dir().expect("failed to get current directory");
}
#[cfg(feature = "dirigent-paths")]
{
if let Ok(paths) = dirigent_config::DirigentPaths::resolve() {
let noproject = paths.noproject_home_dir();
if noproject.exists() {
return noproject;
}
}
}
dirs::home_dir().expect("failed to resolve home directory")
}
fn grab_screen(parser: &vt100::Parser) -> String {
let screen = parser.screen();
let mut output = String::new();
for line in screen.rows(0, COLS) {
output.push_str(&line);
output.push('\n');
}
output
}
/// Print to stderr via `eprintln!`, but only when `$args.debug` is true.
///
/// The first argument is an expression with a `debug` field; all remaining
/// tokens are forwarded unchanged to `eprintln!`.
macro_rules! debug {
($args:expr, $($tt:tt)*) => {
if $args.debug {
eprintln!($($tt)*);
}
};
}
/// Run `claude` inside a PTY, send it `/usage`, and print the resulting screen.
///
/// Steps: resolve the working directory, spawn `claude` in a ROWS x COLS PTY,
/// collect its output on a background thread, confirm the folder-trust prompt
/// if one is shown (or exit with status 1 under `--no-trust`), send `/usage`,
/// then print either the processed raw screen (`--raw`) or the parsed data as
/// pretty JSON. The child is killed before returning.
///
/// NOTE(review): synchronisation is by fixed sleeps (5s, 3s, 3s); output that
/// arrives after a sleep ends is not included in the screen that is inspected.
fn main() {
let args = parse_args();
let workdir = resolve_workdir(&args);
debug!(args, "Working directory: {}", workdir.display());
let pty_system = NativePtySystem::default();
let pair = pty_system
.openpty(PtySize {
rows: ROWS,
cols: COLS,
pixel_width: 0,
pixel_height: 0,
})
.expect("failed to open pty");
let mut cmd = CommandBuilder::new("claude");
cmd.cwd(&workdir);
let mut child = pair.slave.spawn_command(cmd).expect("failed to spawn claude");
// The slave side is dropped right after spawning; only the master is used from here on.
drop(pair.slave);
let mut writer = pair.master.take_writer().expect("failed to get writer");
let reader = pair.master.try_clone_reader().expect("failed to get reader");
// Background thread: forward PTY output chunks over the channel until EOF or a read error.
let (tx, rx) = std::sync::mpsc::channel();
std::thread::spawn(move || {
let mut reader = reader;
let mut buf = [0u8; 4096];
loop {
match reader.read(&mut buf) {
Ok(0) => break,
Ok(n) => {
let _ = tx.send(buf[..n].to_vec());
}
Err(_) => break,
}
}
});
// Wait for claude to render
std::thread::sleep(Duration::from_secs(5));
debug!(
args,
"Child alive: {}",
matches!(child.try_wait(), Ok(None))
);
// Grab screen
let mut parser = vt100::Parser::new(ROWS, COLS, 0);
// Feed everything received so far into the terminal emulator (non-blocking drain).
while let Ok(data) = rx.try_recv() {
parser.process(&data);
}
let output = grab_screen(&parser);
debug!(args, "=== SCREEN ===\n{output}=== END ===");
// Handle trust prompt
if output.contains("Yes, I trust this folder") {
if args.no_trust {
eprintln!("Folder is not trusted: {}", workdir.display());
eprintln!("Run claude in this folder manually to trust it, or omit --no-trust.");
let _ = child.kill();
std::process::exit(1);
}
debug!(args, "Sending enter for trust...");
// Carriage return is sent to confirm the prompt.
writer.write_all(b"\r").expect("failed to confirm trust");
std::thread::sleep(Duration::from_secs(3));
while let Ok(data) = rx.try_recv() {
parser.process(&data);
}
debug!(
args,
"=== AFTER TRUST ===\n{}=== END ===",
grab_screen(&parser)
);
}
// Send /usage
debug!(args, "Sending /usage...");
writer
.write_all(b"/usage\r")
.expect("failed to send /usage");
std::thread::sleep(Duration::from_secs(3));
while let Ok(data) = rx.try_recv() {
parser.process(&data);
}
let raw_output = grab_screen(&parser);
let processed = dirigent_anth::anth_usage::process_usage_screen(&raw_output);
// `--raw` prints the processed screen text; otherwise the parsed data is printed as JSON.
if args.raw {
println!("{}", processed.raw_screen);
} else {
println!(
"{}",
serde_json::to_string_pretty(&processed.data).expect("failed to serialize usage data")
);
}
// Best-effort cleanup; a failure to kill the child is ignored.
let _ = child.kill();
}
-157
View File
@@ -1,157 +0,0 @@
use portable_pty::{Child, CommandBuilder, NativePtySystem, PtySize, PtySystem};
use std::io::{Read, Write};
use std::sync::mpsc::{self, Receiver};
use std::time::Duration;
use vt100::Parser;
/// Terminal height used by `PtySession::spawn_claude`.
const DEFAULT_ROWS: u16 = 80;
/// Terminal width used by `PtySession::spawn_claude`.
const DEFAULT_COLS: u16 = 120;
/// A `claude` process running inside a pseudo-terminal, together with a vt100
/// parser that tracks what its screen currently shows.
pub struct PtySession {
// Terminal emulator state; fed with the bytes received over `rx`.
parser: Parser,
// Write half of the PTY master; `send` panics and `try_send` errors when it is `None`.
writer: Option<Box<dyn Write + Send>>,
// Output chunks forwarded by the background reader thread.
rx: Receiver<Vec<u8>>,
// Terminal width, passed as the row width when the screen is rendered to text.
cols: u16,
#[allow(dead_code)]
// Handle to the spawned child process; polled by `is_alive`.
child: Box<dyn Child + Send + Sync>,
}
impl PtySession {
    /// Spawn `claude` with `args` in a PTY of the default size.
    ///
    /// # Panics
    /// Same conditions as [`PtySession::spawn_claude_with_size`].
    pub fn spawn_claude(args: &[&str]) -> Self {
        Self::spawn_claude_with_size(args, DEFAULT_ROWS, DEFAULT_COLS)
    }

    /// Spawn `claude` with `args` in a PTY of `rows` x `cols`.
    ///
    /// The child's working directory is set to the user's home directory when
    /// that can be resolved. A background thread forwards everything the child
    /// writes to an internal channel until EOF or a read error.
    ///
    /// # Panics
    /// Panics if the PTY cannot be opened, the command cannot be spawned, or
    /// the PTY reader/writer cannot be obtained.
    pub fn spawn_claude_with_size(args: &[&str], rows: u16, cols: u16) -> Self {
        let pty_system = NativePtySystem::default();
        let pair = pty_system
            .openpty(PtySize {
                rows,
                cols,
                pixel_width: 0,
                pixel_height: 0,
            })
            .expect("failed to open pty");

        let mut cmd = CommandBuilder::new("claude");
        for arg in args {
            cmd.arg(*arg);
        }
        if let Some(home) = dirs::home_dir() {
            cmd.cwd(home);
        }

        let child = pair
            .slave
            .spawn_command(cmd)
            .expect("failed to spawn claude");
        // Only the master side is used from here on.
        drop(pair.slave);

        let writer = pair.master.take_writer().expect("failed to get writer");
        let reader = pair
            .master
            .try_clone_reader()
            .expect("failed to get reader");

        let (tx, rx) = mpsc::channel::<Vec<u8>>();
        std::thread::spawn(move || {
            let mut reader = reader;
            let mut chunk = [0u8; 4096];
            loop {
                match reader.read(&mut chunk) {
                    Ok(0) => break,
                    Ok(n) => {
                        // Stop forwarding once the session (receiver) is gone.
                        if tx.send(chunk[..n].to_vec()).is_err() {
                            break;
                        }
                    }
                    Err(_) => break,
                }
            }
        });

        Self {
            parser: Parser::new(rows, cols, 0),
            writer: Some(writer),
            rx,
            cols,
            child,
        }
    }

    /// Render the parser's current screen as text, one `\n`-terminated line per row.
    fn render_screen(&self) -> String {
        let mut output = String::new();
        for line in self.parser.screen().rows(0, self.cols) {
            output.push_str(&line);
            output.push('\n');
        }
        output
    }

    /// Process all pending output, allow up to 200 ms for more to arrive, and
    /// return the screen contents as text.
    pub fn grab_screen(&mut self) -> String {
        while let Ok(data) = self.rx.try_recv() {
            self.parser.process(&data);
        }
        let deadline = std::time::Instant::now() + Duration::from_millis(200);
        while std::time::Instant::now() < deadline {
            match self.rx.recv_timeout(Duration::from_millis(50)) {
                Ok(data) => self.parser.process(&data),
                Err(mpsc::RecvTimeoutError::Timeout) => {}
                // The reader thread has finished and the channel is drained:
                // nothing more can arrive. Without this, `recv_timeout` returns
                // immediately and the loop spins until the deadline.
                Err(mpsc::RecvTimeoutError::Disconnected) => break,
            }
        }
        self.render_screen()
    }

    /// Wait until `needle` appears on screen; see [`PtySession::wait_for_any`].
    pub fn wait_for(&mut self, needle: &str, timeout: Duration) -> bool {
        self.wait_for_any(&[needle], timeout)
    }

    /// Wait until any of `needles` appears on screen or `timeout` elapses.
    ///
    /// Returns `true` as soon as a needle is found. Returns `false` on timeout,
    /// or early once the output channel is closed and the final screen still
    /// contains no needle.
    pub fn wait_for_any(&mut self, needles: &[&str], timeout: Duration) -> bool {
        let deadline = std::time::Instant::now() + timeout;
        while std::time::Instant::now() < deadline {
            let disconnected = match self.rx.recv_timeout(Duration::from_millis(100)) {
                Ok(data) => {
                    self.parser.process(&data);
                    false
                }
                Err(mpsc::RecvTimeoutError::Timeout) => false,
                Err(mpsc::RecvTimeoutError::Disconnected) => true,
            };
            let content = self.render_screen();
            if needles.iter().any(|needle| content.contains(*needle)) {
                return true;
            }
            if disconnected {
                // The screen can no longer change, so waiting longer would only
                // busy-loop until the deadline.
                return false;
            }
        }
        false
    }

    /// Report whether the child process has not exited yet.
    pub fn is_alive(&mut self) -> bool {
        matches!(self.child.try_wait(), Ok(None))
    }

    /// Write `input` to the child.
    ///
    /// # Panics
    /// Panics if the writer is gone or the write fails; use
    /// [`PtySession::try_send`] for a fallible variant.
    pub fn send(&mut self, input: &[u8]) {
        self.writer
            .as_mut()
            .expect("writer gone")
            .write_all(input)
            .expect("failed to write to pty");
    }

    /// Write `input` to the child, returning an error instead of panicking.
    ///
    /// # Errors
    /// Returns `BrokenPipe` if the writer is gone, or the underlying write error.
    pub fn try_send(&mut self, input: &[u8]) -> std::io::Result<()> {
        match self.writer.as_mut() {
            Some(w) => w.write_all(input),
            None => Err(std::io::Error::new(
                std::io::ErrorKind::BrokenPipe,
                "writer gone",
            )),
        }
    }

    /// Fallible variant of [`PtySession::send_line`]: writes `text` followed by
    /// a carriage return.
    pub fn try_send_line(&mut self, text: &str) -> std::io::Result<()> {
        self.try_send(text.as_bytes())?;
        self.try_send(b"\r")
    }

    /// Send a carriage return. Panics like [`PtySession::send`].
    pub fn send_enter(&mut self) {
        self.send(b"\r");
    }

    /// Send `text` followed by a carriage return. Panics like [`PtySession::send`].
    pub fn send_line(&mut self, text: &str) {
        self.send(text.as_bytes());
        self.send_enter();
    }
}
-107
View File
@@ -1,107 +0,0 @@
//! Tool call correlation — matches assistant ToolUse blocks with their
//! corresponding user ToolResult blocks by ID across a message sequence.
use std::collections::HashMap;
use crate::types::{
Content, ContentBlock, RawAssistantMessage, RawMessage, RawUserMessage, ToolCall,
ToolExchange, ToolName, ToolResultData,
};
/// Collect every `ToolUse` block of an assistant message as a [`ToolCall`].
///
/// Each call records the uuid of the message it came from (empty if the
/// message has none).
fn extract_tool_calls(msg: &RawAssistantMessage) -> Vec<ToolCall> {
    let origin = msg.uuid.clone().unwrap_or_default();
    let mut calls = Vec::new();
    for block in &msg.message.content {
        if let ContentBlock::ToolUse { id, name, input, .. } = block {
            calls.push(ToolCall {
                id: id.clone(),
                name: ToolName::from(name.clone()),
                input: input.clone(),
                source_message_uuid: origin.clone(),
            });
        }
    }
    calls
}
/// Collect every `ToolResult` block of a user message as [`ToolResultData`].
///
/// Plain-text user messages yield nothing. A result's content is its text, or
/// for block content the text blocks joined with `\n` (`None` when there are
/// no text blocks or no content at all).
fn extract_tool_results(msg: &RawUserMessage) -> Vec<ToolResultData> {
    let origin = msg.uuid.clone().unwrap_or_default();
    let blocks = match &msg.message.content {
        Content::Blocks(blocks) => blocks,
        Content::Text(_) => return Vec::new(),
    };

    let mut results = Vec::new();
    for block in blocks {
        if let ContentBlock::ToolResult { tool_use_id, content, is_error } = block {
            let text = match content {
                None => None,
                Some(Content::Text(plain)) => Some(plain.clone()),
                Some(Content::Blocks(inner)) => {
                    // Join the text blocks with newlines; stays `None` if there are none.
                    let mut joined: Option<String> = None;
                    for piece in inner {
                        if let ContentBlock::Text { text } = piece {
                            match joined.as_mut() {
                                Some(acc) => {
                                    acc.push('\n');
                                    acc.push_str(text);
                                }
                                None => joined = Some(text.clone()),
                            }
                        }
                    }
                    joined
                }
            };
            results.push(ToolResultData {
                tool_use_id: tool_use_id.clone(),
                content: text,
                is_error: *is_error,
                source_message_uuid: origin.clone(),
            });
        }
    }
    results
}
/// Correlate tool calls with their results across a message sequence.
///
/// Iterates messages in order, collecting ToolUse blocks from assistant
/// messages and matching them by ID to ToolResult blocks in subsequent user
/// messages. Matched exchanges are emitted in the order their results appear.
/// Tool calls that never received a result follow, with `result: None`, in
/// the order the calls were made. A result whose ID matches no pending call
/// is dropped. If two calls share an ID, the later one replaces the earlier.
pub fn correlate_tools(messages: &[RawMessage]) -> Vec<ToolExchange> {
    // Pending calls keyed by ID; the sequence number remembers call order so
    // unmatched calls can be emitted deterministically rather than in
    // hash-map iteration order.
    let mut pending: HashMap<String, (usize, ToolCall)> = HashMap::new();
    let mut next_seq: usize = 0;
    let mut exchanges: Vec<ToolExchange> = Vec::new();

    for msg in messages {
        match msg {
            RawMessage::Assistant(asst) => {
                for call in extract_tool_calls(asst) {
                    pending.insert(call.id.clone(), (next_seq, call));
                    next_seq += 1;
                }
            }
            RawMessage::User(user) => {
                for result in extract_tool_results(user) {
                    if let Some((_, call)) = pending.remove(&result.tool_use_id) {
                        exchanges.push(ToolExchange { call, result: Some(result) });
                    }
                }
            }
            _ => {}
        }
    }

    // Emit unmatched calls (no result found), oldest first.
    let mut unmatched: Vec<(usize, ToolCall)> = pending.into_values().collect();
    unmatched.sort_unstable_by_key(|(seq, _)| *seq);
    exchanges.extend(
        unmatched
            .into_iter()
            .map(|(_, call)| ToolExchange { call, result: None }),
    );
    exchanges
}
-116
View File
@@ -1,116 +0,0 @@
//! Streaming deduplication for assistant messages.
use crate::types::{RawAssistantMessage, RawMessage};
/// Deduplicate streamed assistant messages.
///
/// Claude Code writes multiple JSONL lines for the same assistant message
/// as it streams. Each shares the same `uuid` with progressively more
/// content blocks. Within every run of *consecutive* assistant entries that
/// share a uuid, only the last entry is kept. Entries with the same uuid that
/// are separated by another message are not merged.
///
/// Non-assistant messages pass through unchanged, and relative order is
/// preserved.
pub fn dedup_messages(messages: Vec<RawMessage>) -> Vec<RawMessage> {
    let mut result: Vec<RawMessage> = Vec::new();
    // Most recent assistant entry, held back until we know it is the last of its run.
    let mut buffered_assistant: Option<RawAssistantMessage> = None;

    for msg in messages {
        match msg {
            RawMessage::Assistant(asst) => {
                let same_run = buffered_assistant
                    .as_ref()
                    .map_or(false, |prev| prev.uuid == asst.uuid);
                // The input is owned, so the newer entry is moved into the
                // buffer instead of being cloned.
                if let Some(prev) = buffered_assistant.replace(asst) {
                    if !same_run {
                        // Different uuid: the previous run is complete.
                        result.push(RawMessage::Assistant(prev));
                    }
                    // Same uuid: `prev` is superseded and simply dropped.
                }
            }
            other => {
                // Non-assistant: flush any buffered assistant first, then push this.
                if let Some(prev) = buffered_assistant.take() {
                    result.push(RawMessage::Assistant(prev));
                }
                result.push(other);
            }
        }
    }

    // Flush remaining buffer.
    if let Some(prev) = buffered_assistant {
        result.push(RawMessage::Assistant(prev));
    }
    result
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{AssistantInner, ContentBlock};

    /// Build an assistant message with a single text block; all other fields are empty.
    fn make_assistant(uuid: &str, stop_reason: Option<&str>, text: &str) -> RawMessage {
        RawMessage::Assistant(RawAssistantMessage {
            uuid: Some(uuid.to_string()),
            parent_uuid: None,
            timestamp: None,
            session_id: None,
            cwd: None,
            version: None,
            git_branch: None,
            is_sidechain: false,
            request_id: None,
            message: AssistantInner {
                model: None,
                id: None,
                message_type: None,
                role: None,
                content: vec![ContentBlock::Text { text: text.to_string() }],
                stop_reason: stop_reason.map(str::to_string),
                stop_sequence: None,
                usage: None,
            },
        })
    }

    #[test]
    fn dedup_single_streamed_message() {
        let msgs = vec![
            make_assistant("a-1", None, "Part 1"),
            make_assistant("a-1", None, "Part 1 more"),
            make_assistant("a-1", Some("end_turn"), "Part 1 final"),
        ];
        let deduped = dedup_messages(msgs);
        assert_eq!(deduped.len(), 1);
        // A `match` with a panicking fallback: an `if let` here would let the
        // test pass without checking anything if the variant were wrong.
        match &deduped[0] {
            RawMessage::Assistant(a) => {
                assert_eq!(a.message.stop_reason.as_deref(), Some("end_turn"));
                match &a.message.content[0] {
                    ContentBlock::Text { text } => assert_eq!(text, "Part 1 final"),
                    _ => panic!("Expected text block"),
                }
            }
            _ => panic!("Expected assistant message"),
        }
    }

    #[test]
    fn dedup_two_distinct_assistants() {
        let msgs = vec![
            make_assistant("a-1", Some("end_turn"), "First"),
            make_assistant("a-2", Some("end_turn"), "Second"),
        ];
        let deduped = dedup_messages(msgs);
        assert_eq!(deduped.len(), 2);
    }

    #[test]
    fn dedup_empty_input() {
        let deduped = dedup_messages(vec![]);
        assert!(deduped.is_empty());
    }
}
-342
View File
@@ -1,342 +0,0 @@
use std::collections::HashMap;
use camino::{Utf8Path, Utf8PathBuf};
use crate::types::*;
use crate::error::{AntError, Result};
/// Locate the Claude Code home directory (`~/.claude`).
///
/// # Errors
/// `AntError::HomeNotFound` if the user's home directory cannot be resolved or
/// `~/.claude` does not exist; `AntError::InvalidPath` if the path is not
/// valid UTF-8.
pub fn discover_claude_home() -> Result<Utf8PathBuf> {
    let claude_dir = match dirs::home_dir() {
        Some(home) => home.join(".claude"),
        None => return Err(AntError::HomeNotFound),
    };
    if claude_dir.exists() {
        Utf8PathBuf::try_from(claude_dir).map_err(|err| AntError::InvalidPath(err.to_string()))
    } else {
        Err(AntError::HomeNotFound)
    }
}
/// Rewrite every backslash in `path` as a forward slash, for consistent storage.
fn normalize_to_forward_slashes(path: &str) -> String {
    path.chars()
        .map(|ch| if ch == '\\' { '/' } else { ch })
        .collect()
}
/// Resolve the original filesystem path for a Claude project directory.
///
/// Priority:
/// 1. `projectPath` from `sessions-index.json` (authoritative, cheap)
/// 2. `cwd` from the first user message in any session JSONL (authoritative, costs one file parse)
/// 3. `decode_project_path` (lossy fallback for empty project directories)
pub fn resolve_original_path(dir_name: &str, sessions: &[SessionRef]) -> String {
// 1. Try sessions-index.json projectPath
for session in sessions {
if let Some(ref idx) = session.index_entry {
if let Some(ref path) = idx.project_path {
if !path.is_empty() {
return normalize_to_forward_slashes(path);
}
}
}
}
// 2. Try cwd from first user message in any session
for session in sessions {
if let Ok(msgs) = crate::parser::parse_session(&session.jsonl_path) {
for msg in &msgs {
if let crate::types::RawMessage::User(user) = msg {
if let Some(ref cwd) = user.cwd {
if !cwd.is_empty() {
return normalize_to_forward_slashes(cwd);
}
}
}
}
}
}
// 3. Lossy fallback
decode_project_path(dir_name)
}
/// List every project directory under `<home>/projects`.
///
/// Returns an empty list when the `projects` directory does not exist.
/// Entries that are not directories, or whose path is not valid UTF-8, are
/// skipped.
///
/// # Errors
/// Propagates I/O errors from reading the directory and errors from
/// [`discover_sessions`].
pub fn discover_projects(home: &Utf8Path) -> Result<Vec<ClaudeProject>> {
    let root = home.join("projects");
    if !root.as_std_path().exists() {
        return Ok(Vec::new());
    }

    let mut found = Vec::new();
    for dir_entry in std::fs::read_dir(root.as_std_path())? {
        let fs_path = dir_entry?.path();
        if !fs_path.is_dir() {
            continue;
        }
        let utf8_path = match Utf8PathBuf::try_from(fs_path) {
            Ok(converted) => converted,
            Err(_) => continue,
        };
        let dir_name = match utf8_path.file_name() {
            Some(name) => name.to_owned(),
            None => continue,
        };

        let sessions = discover_sessions(&utf8_path)?;
        let original_path = resolve_original_path(&dir_name, &sessions);
        found.push(ClaudeProject {
            path: utf8_path,
            original_path,
            sessions,
        });
    }
    Ok(found)
}
/// Best-effort decoding of an encoded project folder name into a path.
///
/// **Warning**: the encoding is lossy. Claude Code replaces `\`, `/`, AND `_`
/// all with `-`, so `G--dev-projects-adk-rust` could be
/// `G:/dev/projects/adk-rust` or `G:/dev/projects/adk/rust`. Prefer
/// [`resolve_original_path`], which reads the real path from
/// `sessions-index.json` or session JSONL files. This function is the
/// last-resort fallback for project directories with no sessions or index.
///
/// Rules applied: the name is split on `--`. A first segment that is a single
/// uppercase ASCII letter becomes a drive (`G` → `G:`). A first segment that is
/// empty or starts with `-` denotes an absolute path: leading dashes are
/// dropped and the remaining dashes become `/`. Any other first segment is
/// kept verbatim. Every later segment is appended after a `/` with its dashes
/// turned into `/`.
pub fn decode_project_path(encoded: &str) -> String {
    let mut segments = encoded.split("--");
    // `split` always yields at least one item, so the fallback is never used.
    let head = segments.next().unwrap_or("");

    let is_drive_letter =
        head.len() == 1 && head.starts_with(|c: char| c.is_ascii_uppercase());

    let mut decoded = if is_drive_letter {
        // Windows drive letter: "G" → "G:".
        format!("{head}:")
    } else if head.is_empty() || head.starts_with('-') {
        // Absolute Unix-style path: the leading "-" encoded the root "/".
        format!("/{}", head.trim_start_matches('-').replace('-', "/"))
    } else {
        head.to_string()
    };

    for segment in segments {
        decoded.push('/');
        decoded.push_str(&segment.replace('-', "/"));
    }
    decoded
}
/// List the session JSONL files directly inside a project directory.
///
/// Only entries with a `.jsonl` extension and a UTF-8 path are returned. The
/// file stem is the session id. A sibling directory named after the stem is
/// recorded as the artifacts directory, and the matching entry from
/// `sessions-index.json` (if any) is attached.
///
/// # Errors
/// Propagates I/O errors from reading the directory.
pub fn discover_sessions(project_dir: &Utf8Path) -> Result<Vec<SessionRef>> {
    let index = load_session_index(project_dir);

    let mut found = Vec::new();
    for dir_entry in std::fs::read_dir(project_dir.as_std_path())? {
        let fs_path = dir_entry?.path();

        // Only .jsonl files.
        if fs_path.extension().and_then(|ext| ext.to_str()) != Some("jsonl") {
            continue;
        }
        let id = match fs_path.file_stem().and_then(|stem| stem.to_str()) {
            Some(stem) => stem.to_owned(),
            None => continue,
        };
        let jsonl_path = match Utf8PathBuf::try_from(fs_path) {
            Ok(converted) => converted,
            Err(_) => continue,
        };

        // Artifacts live in a directory with the same name as the session stem.
        let candidate = project_dir.join(&id);
        let artifacts_dir = if candidate.as_std_path().is_dir() {
            Some(candidate)
        } else {
            None
        };

        let index_entry = index.as_ref().and_then(|map| map.get(&id)).cloned();

        found.push(SessionRef {
            id,
            jsonl_path,
            artifacts_dir,
            index_entry,
        });
    }
    Ok(found)
}
/// Read `sessions-index.json` from a project directory, if present.
///
/// Returns `None` when the file is missing, unreadable, or not a JSON object
/// mapping session ids to index entries.
fn load_session_index(project_dir: &Utf8Path) -> Option<HashMap<String, SessionIndexEntry>> {
    let index_path = project_dir.join("sessions-index.json");
    if index_path.as_std_path().exists() {
        std::fs::read_to_string(index_path.as_std_path())
            .ok()
            .and_then(|raw| serde_json::from_str(&raw).ok())
    } else {
        None
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// View a temporary directory as a UTF-8 path.
    fn utf8_dir(tmp: &TempDir) -> &Utf8Path {
        Utf8Path::from_path(tmp.path()).unwrap()
    }

    /// Create `name` inside `dir` with the given contents.
    fn write_file(dir: &Utf8Path, name: &str, body: &str) {
        std::fs::write(dir.join(name).as_std_path(), body).unwrap();
    }

    /// Create the sub-directory `name` inside `dir`.
    fn make_dir(dir: &Utf8Path, name: &str) {
        std::fs::create_dir(dir.join(name).as_std_path()).unwrap();
    }

    #[test]
    fn decode_project_path_windows() {
        let decoded = decode_project_path("G--dev-projects-dirigent");
        assert_eq!(decoded, "G:/dev/projects/dirigent");
    }

    #[test]
    fn decode_project_path_windows_users() {
        let decoded = decode_project_path("C--Users-g4b-tmp");
        assert_eq!(decoded, "C:/Users/g4b/tmp");
    }

    #[test]
    fn decode_project_path_unix() {
        let decoded = decode_project_path("-home-user-projects-foo");
        assert_eq!(decoded, "/home/user/projects/foo");
    }

    #[test]
    fn discover_sessions_in_temp_dir() {
        let tmp = TempDir::new().unwrap();
        let project_dir = utf8_dir(&tmp);

        // Two fake sessions, only one of which has an artifacts directory.
        write_file(project_dir, "abc-def-123.jsonl", "{}\n");
        write_file(project_dir, "xyz-456-789.jsonl", "{}\n");
        make_dir(project_dir, "abc-def-123");

        let sessions = discover_sessions(project_dir).unwrap();
        assert_eq!(sessions.len(), 2);

        let by_id = |wanted: &str| sessions.iter().find(|s| s.id == wanted).unwrap();
        assert!(by_id("abc-def-123").artifacts_dir.is_some());
        assert!(by_id("xyz-456-789").artifacts_dir.is_none());
    }

    #[test]
    fn discover_sessions_ignores_non_jsonl() {
        let tmp = TempDir::new().unwrap();
        let project_dir = utf8_dir(&tmp);

        write_file(project_dir, "session.jsonl", "{}\n");
        write_file(project_dir, "sessions-index.json", "{}");
        make_dir(project_dir, "some-dir");

        let sessions = discover_sessions(project_dir).unwrap();
        assert_eq!(sessions.len(), 1);
        assert_eq!(sessions[0].id, "session");
    }

    #[test]
    fn discover_sessions_loads_index_entry() {
        let tmp = TempDir::new().unwrap();
        let project_dir = utf8_dir(&tmp);

        write_file(project_dir, "abc-123.jsonl", "{}\n");
        let index_json = r#"{
"abc-123": {
"sessionId": "abc-123",
"firstPrompt": "Hello",
"summary": "A test session",
"messageCount": 5
}
}"#;
        write_file(project_dir, "sessions-index.json", index_json);

        let sessions = discover_sessions(project_dir).unwrap();
        assert_eq!(sessions.len(), 1);

        let entry = sessions[0].index_entry.as_ref().unwrap();
        assert_eq!(entry.session_id.as_deref(), Some("abc-123"));
        assert_eq!(entry.first_prompt.as_deref(), Some("Hello"));
        assert_eq!(entry.message_count, Some(5));
    }

    #[test]
    fn resolve_original_path_prefers_index_project_path() {
        // The index carries the real path (with an underscore the folder-name
        // encoding cannot represent), so it must win over decoding.
        let index_entry = SessionIndexEntry {
            session_id: Some("test-session".to_string()),
            first_prompt: None,
            summary: None,
            message_count: None,
            created: None,
            modified: None,
            git_branch: None,
            project_path: Some(r"G:\dev\projects\bevy_sprite3d".to_string()),
        };
        let sessions = vec![SessionRef {
            id: "test-session".to_string(),
            jsonl_path: Utf8PathBuf::from("/tmp/fake.jsonl"),
            artifacts_dir: None,
            index_entry: Some(index_entry),
        }];

        let resolved = resolve_original_path("G--dev-projects-bevy-sprite3d", &sessions);
        assert_eq!(resolved, "G:/dev/projects/bevy_sprite3d");
    }

    #[test]
    fn resolve_original_path_falls_back_to_decode() {
        let no_sessions: Vec<SessionRef> = Vec::new();
        let resolved = resolve_original_path("G--dev-projects-dirigent", &no_sessions);
        assert_eq!(resolved, "G:/dev/projects/dirigent");
    }

    #[test]
    fn discover_projects_empty_when_no_projects_dir() {
        let tmp = TempDir::new().unwrap();
        // No "projects" subdirectory: the result is an empty list, not an error.
        let projects = discover_projects(utf8_dir(&tmp)).unwrap();
        assert!(projects.is_empty());
    }
}
-19
View File
@@ -1,19 +0,0 @@
/// Errors reported by this crate's discovery and parsing functions.
#[derive(Debug, thiserror::Error)]
pub enum AntError {
/// An I/O operation failed; `#[from]` lets `?` convert a `std::io::Error` into this variant.
#[error("I/O error: {0}")]
Io(#[from] std::io::Error),
/// A line could not be parsed as JSON; `line` is the line number shown in the message.
#[error("JSON parse error at line {line}: {source}")]
JsonParse {
line: usize,
source: serde_json::Error,
},
/// The user's home directory or the Claude directory inside it could not be found.
#[error("Claude home directory not found")]
HomeNotFound,
/// A path could not be used; the payload is a description of the problem.
#[error("Invalid path: {0}")]
InvalidPath(String),
}
/// Result alias whose error type is fixed to [`AntError`].
pub type Result<T> = std::result::Result<T, AntError>;
-52
View File
@@ -1,52 +0,0 @@
//! dirigent_anth — Claude Code Session Parser & Toolkit
//!
//! Reads Claude Code's local JSONL session storage and produces typed,
//! deduplicated, correlated Rust data structures.
//!
//! # Design
//!
//! See `docs/superpowers/plans/2026-03-23-dirigent-ant-design.md`
pub mod claude_grab;
pub mod anth_usage;
pub mod correlation;
pub mod dedup;
pub mod discovery;
pub mod error;
pub mod noise;
pub mod parser;
pub mod subagent;
pub mod tree;
pub mod types;
pub mod util;
/// Run the whole pipeline for one session.
///
/// Parses and deduplicates the JSONL file, builds the conversation tree,
/// correlates tool calls with results, loads sub-agents from the artifacts
/// directory (if the session has one) and links them to the tool exchanges.
///
/// # Errors
/// Propagates errors from parsing the session file and from loading sub-agents.
pub fn load_session(session_ref: &types::SessionRef) -> error::Result<types::ParsedSession> {
    let messages = parser::parse_session_deduped(&session_ref.jsonl_path)?;
    let tree = tree::ConversationTree::build(&messages);
    let tool_exchanges = correlation::correlate_tools(&messages);

    let mut subagents = match session_ref.artifacts_dir.as_ref() {
        Some(dir) => subagent::load_subagents(dir)?,
        None => Vec::new(),
    };
    subagent::link_subagents_to_calls(&mut subagents, &tool_exchanges);

    let parsed = types::ParsedSession {
        messages,
        tree,
        tool_exchanges,
        subagents,
    };
    Ok(parsed)
}
pub use correlation::correlate_tools;
pub use dedup::dedup_messages;
pub use discovery::{decode_project_path, discover_claude_home, discover_projects, discover_sessions, resolve_original_path};
pub use error::{AntError, Result};
pub use noise::{classify_noise, NoiseKind};
pub use parser::{parse_line, parse_session, parse_session_deduped};
pub use subagent::{link_subagents_to_calls, load_subagents};
pub use tree::{message_parent_uuid, message_uuid, ConversationNode, ConversationTree};
pub use types::*;
pub use util::parse_timestamp;
-72
View File
@@ -1,72 +0,0 @@
use crate::types::*;
/// Classification of noise patterns in Claude Code JSONL.
///
/// The per-variant notes describe the conditions `classify_noise` checks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum NoiseKind {
/// User message whose `is_meta` flag is set.
Meta,
/// User message whose text is exactly `Warmup`.
Warmup,
/// User message whose text starts with `[Request interrupted`.
Interrupted,
/// User message whose text starts with `This session is being continued`.
Continuation,
/// User message whose text starts with `API Error`.
ApiError,
/// User message whose text starts with `Caveat: The messages below`.
SystemCaveat,
/// Any queue-operation record.
QueueOp,
}
/// Classify a message as noise, if applicable.
/// Returns None for normal messages.
pub fn classify_noise(message: &RawMessage) -> Option<NoiseKind> {
match message {
RawMessage::QueueOperation(_) => Some(NoiseKind::QueueOp),
RawMessage::User(user) => {
if user.is_meta.unwrap_or(false) {
return Some(NoiseKind::Meta);
}
if let Some(text) = extract_user_text(user) {
if text == "Warmup" {
return Some(NoiseKind::Warmup);
}
if text.starts_with("[Request interrupted") {
return Some(NoiseKind::Interrupted);
}
if text.starts_with("This session is being continued") {
return Some(NoiseKind::Continuation);
}
if text.starts_with("API Error") {
return Some(NoiseKind::ApiError);
}
if text.starts_with("Caveat: The messages below") {
return Some(NoiseKind::SystemCaveat);
}
}
None
}
_ => None,
}
}
/// Borrow the plain text of a user message, or `None` when its content is a
/// list of blocks (e.g. tool results) rather than text.
fn extract_user_text(user: &RawUserMessage) -> Option<&str> {
    if let Content::Text(text) = &user.message.content {
        Some(text.as_str())
    } else {
        None
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Deserialize one JSONL record into a message.
    fn parse(json: &str) -> RawMessage {
        serde_json::from_str(json).unwrap()
    }

    #[test]
    fn normal_assistant_is_not_noise() {
        let record = parse(
            r#"{"type":"assistant","uuid":"x","timestamp":"2026-01-01T00:00:00Z","sessionId":"s","message":{"id":"m","role":"assistant","content":[{"type":"text","text":"Hello"}],"stop_reason":"end_turn"}}"#,
        );
        assert_eq!(classify_noise(&record), None);
    }

    #[test]
    fn queue_op_is_noise() {
        let record = parse(
            r#"{"type":"queue-operation","operation":"enqueue","timestamp":"2026-01-01T00:00:00Z","sessionId":"s"}"#,
        );
        assert_eq!(classify_noise(&record), Some(NoiseKind::QueueOp));
    }
}
-50
View File
@@ -1,50 +0,0 @@
//! JSONL line parser for Claude Code session files.
use std::io::BufRead;
use camino::Utf8Path;
use crate::error::Result;
use crate::types::RawMessage;
/// Deserialize one JSONL line into a [`RawMessage`].
///
/// `line_number` is used only for the warning logged via `tracing` when the
/// line cannot be deserialized; in that case `None` is returned.
pub fn parse_line(line: &str, line_number: usize) -> Option<RawMessage> {
    let parsed = serde_json::from_str::<RawMessage>(line);
    if let Err(e) = &parsed {
        tracing::warn!(line = line_number, error = %e, "Skipping unparseable JSONL line");
    }
    parsed.ok()
}
/// Read every message from the JSONL file at `path`.
///
/// Blank lines are ignored and lines that fail to deserialize are dropped
/// (lenient parsing). Failing to open or read the file is returned as an
/// error.
pub fn parse_session(path: &Utf8Path) -> Result<Vec<RawMessage>> {
    let reader = std::io::BufReader::new(std::fs::File::open(path.as_std_path())?);

    let mut parsed = Vec::new();
    // 1-based line counter; advanced before the read result is inspected so
    // it always matches the physical line in the file.
    let mut line_number = 0usize;
    for raw in reader.lines() {
        line_number += 1;
        let raw = raw?;
        if !raw.trim().is_empty() {
            parsed.extend(parse_line(&raw, line_number));
        }
    }
    Ok(parsed)
}
/// Read a session JSONL file and collapse streaming duplicates.
///
/// Claude Code may write several JSONL lines for the same assistant message
/// while it streams. The parsed messages are passed through
/// [`crate::dedup::dedup_messages`], which keeps a single final version per
/// uuid. I/O errors from [`parse_session`] are returned unchanged.
pub fn parse_session_deduped(path: &Utf8Path) -> Result<Vec<RawMessage>> {
    parse_session(path).map(crate::dedup::dedup_messages)
}
-215
View File
@@ -1,215 +0,0 @@
//! Sub-agent session loading.
//!
//! Claude Code spawns sub-agents for Agent tool calls and stores their
//! conversations under `<session-artifacts-dir>/subagents/`. Each sub-agent
//! has a JSONL file and an optional `.meta.json` with metadata such as the
//! agent type.
use camino::Utf8Path;
use crate::error::Result;
use crate::parser::parse_session;
use crate::types::{SubAgentMeta, SubAgentSession, ToolExchange};
/// Load all sub-agent sessions from a session's artifacts directory.
///
/// Expects files at: `<session_artifacts_dir>/subagents/agent-<id>.jsonl`
/// with optional companion: `<session_artifacts_dir>/subagents/agent-<id>.meta.json`
///
/// Returns an empty `Vec` if the `subagents/` subdirectory does not exist.
pub fn load_subagents(session_artifacts_dir: &Utf8Path) -> Result<Vec<SubAgentSession>> {
let subagents_dir = session_artifacts_dir.join("subagents");
if !subagents_dir.as_std_path().exists() {
return Ok(Vec::new());
}
let mut subagents = Vec::new();
for entry in std::fs::read_dir(subagents_dir.as_std_path())? {
let entry = entry?;
let path = entry.path();
// Only process agent-*.jsonl files
let file_name = match path.file_name().and_then(|n| n.to_str()) {
Some(name) => name.to_string(),
None => continue,
};
if !file_name.starts_with("agent-") || !file_name.ends_with(".jsonl") {
continue;
}
// Extract agent ID: "agent-abc123.jsonl" → "abc123"
let agent_id = file_name
.strip_prefix("agent-")
.and_then(|s| s.strip_suffix(".jsonl"))
.unwrap_or(&file_name)
.to_string();
let jsonl_path = match camino::Utf8PathBuf::try_from(path.clone()) {
Ok(p) => p,
Err(_) => continue,
};
// Parse the JSONL session
let messages = parse_session(&jsonl_path)?;
// Try to load companion metadata file
let meta_path = path.with_file_name(format!("agent-{}.meta.json", agent_id));
let meta = if meta_path.exists() {
let content = std::fs::read_to_string(&meta_path)?;
serde_json::from_str::<SubAgentMeta>(&content)
.unwrap_or(SubAgentMeta { agent_type: None })
} else {
SubAgentMeta { agent_type: None }
};
subagents.push(SubAgentSession {
agent_id,
meta,
messages,
parent_tool_call_id: None,
});
}
Ok(subagents)
}
/// Try to link sub-agent sessions to their parent Agent tool calls.
///
/// For each Agent tool call in `tool_exchanges`, parses the tool result text
/// for `agentId: <id>` and matches it against sub-agent sessions. On match,
/// sets `SubAgentSession.parent_tool_call_id` to the tool call's ID. If
/// several exchanges name the same agent, the last one in `tool_exchanges`
/// wins.
///
/// This is best-effort: if the agentId text format changes or a result is
/// missing, the sub-agent is still usable but without tool_use linkage.
pub fn link_subagents_to_calls(
    subagents: &mut [SubAgentSession],
    tool_exchanges: &[ToolExchange],
) {
    use regex::Regex;
    use std::sync::OnceLock;

    // Compiled on first use and shared by all later calls, so the pattern is
    // not rebuilt for every session.
    static AGENT_ID_RE: OnceLock<Regex> = OnceLock::new();

    if subagents.is_empty() || tool_exchanges.is_empty() {
        return;
    }

    let re = AGENT_ID_RE
        .get_or_init(|| Regex::new(r"agentId:\s*(\S+)").expect("valid regex"));

    for exchange in tool_exchanges {
        // Only look at Agent tool calls.
        if exchange.call.name != crate::types::ToolName::Agent {
            continue;
        }

        // Extract agentId from the tool result text.
        let agent_id = exchange
            .result
            .as_ref()
            .and_then(|r| r.content.as_deref())
            .and_then(|text| re.captures(text))
            .and_then(|caps| caps.get(1))
            .map(|m| m.as_str());
        let agent_id = match agent_id {
            Some(id) => id,
            None => continue,
        };

        // Find the matching sub-agent and set the linkage.
        if let Some(subagent) = subagents.iter_mut().find(|s| s.agent_id == agent_id) {
            subagent.parent_tool_call_id = Some(exchange.call.id.clone());
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::{ToolCall, ToolName, ToolResultData};

    /// Build an unlinked sub-agent session with no messages.
    fn subagent(agent_id: &str, agent_type: Option<&str>) -> SubAgentSession {
        SubAgentSession {
            agent_id: agent_id.to_string(),
            meta: SubAgentMeta {
                agent_type: agent_type.map(|t| t.to_string()),
            },
            messages: vec![],
            parent_tool_call_id: None,
        }
    }

    /// Build a tool call with the given id, name, input and source message.
    fn call(id: &str, name: ToolName, input: serde_json::Value, source: &str) -> ToolCall {
        ToolCall {
            id: id.to_string(),
            name,
            input,
            source_message_uuid: source.to_string(),
        }
    }

    /// Build a successful tool result carrying `content`.
    fn ok_result(tool_use_id: &str, content: &str, source: &str) -> ToolResultData {
        ToolResultData {
            tool_use_id: tool_use_id.to_string(),
            content: Some(content.to_string()),
            is_error: false,
            source_message_uuid: source.to_string(),
        }
    }

    #[test]
    fn test_link_subagents_to_calls_matches_agent_id() {
        let mut subagents = vec![
            subagent("abc123def", Some("Explore")),
            subagent("xyz789", None),
        ];
        let exchanges = vec![
            ToolExchange {
                call: call(
                    "toolu_01ABC",
                    ToolName::Agent,
                    serde_json::json!({"description": "test"}),
                    "msg-1",
                ),
                result: Some(ok_result(
                    "toolu_01ABC",
                    "agentId: abc123def (use SendMessage with to: 'abc123def' to continue)\n<usage>total_tokens: 1000</usage>",
                    "msg-2",
                )),
            },
            ToolExchange {
                call: call("toolu_02DEF", ToolName::Read, serde_json::json!({}), "msg-3"),
                result: None,
            },
        ];

        link_subagents_to_calls(&mut subagents, &exchanges);

        assert_eq!(subagents[0].parent_tool_call_id, Some("toolu_01ABC".to_string()));
        assert_eq!(subagents[1].parent_tool_call_id, None);
    }

    #[test]
    fn test_link_subagents_empty_inputs() {
        let mut empty_subagents: Vec<SubAgentSession> = vec![];
        let empty_exchanges: Vec<ToolExchange> = vec![];
        // Must return without panicking.
        link_subagents_to_calls(&mut empty_subagents, &empty_exchanges);
    }

    #[test]
    fn test_link_subagents_no_match() {
        let mut subagents = vec![subagent("no_match", None)];
        let exchanges = vec![ToolExchange {
            call: call("toolu_99", ToolName::Agent, serde_json::json!({}), "msg-1"),
            result: Some(ok_result(
                "toolu_99",
                "agentId: different_id\n<usage>tokens: 500</usage>",
                "msg-2",
            )),
        }];

        link_subagents_to_calls(&mut subagents, &exchanges);

        assert_eq!(subagents[0].parent_tool_call_id, None);
    }
}
-171
View File
@@ -1,171 +0,0 @@
//! Conversation tree module — builds a parent/child tree from `RawMessage`s.
//!
//! Claude Code sessions are not purely linear: the user can edit earlier
//! messages, producing branches. Each message carries a `uuid` and a
//! `parentUuid` that describe the relationship. This module reconstructs
//! the tree so callers can walk threads, detect branches, and select the
//! main thread.
use std::collections::HashMap;
use crate::types::RawMessage;
// ---------------------------------------------------------------------------
// Node & tree types
// ---------------------------------------------------------------------------
/// A single node in the conversation tree.
///
/// Nodes are created by `ConversationTree::build`, one per message that
/// carries a UUID, and are stored in `ConversationTree::nodes`.
#[derive(Debug)]
pub struct ConversationNode {
/// The UUID of this message (the same string used as this node's key in
/// `ConversationTree::nodes`).
pub uuid: String,
/// The raw message stored at this node (an owned clone of the input message).
pub message: RawMessage,
/// UUIDs of direct children, in insertion order. Each entry is a key into
/// `ConversationTree::nodes`, not an owned child node.
pub children: Vec<String>,
}
/// The full conversation tree for a session.
///
/// A message becomes a root when it has no `parentUuid`, or when it refers
/// to a parent that is not present in the slice provided to
/// [`ConversationTree::build`]. A session can therefore have more than one
/// root.
///
/// Edges are stored as UUID strings: look a UUID up in `nodes` to reach the
/// corresponding [`ConversationNode`].
#[derive(Debug)]
pub struct ConversationTree {
/// Root node UUIDs (messages with no parent or with an unknown parent),
/// in the order they were encountered while building.
pub roots: Vec<String>,
/// All nodes indexed by UUID.
pub nodes: HashMap<String, ConversationNode>,
}
// ---------------------------------------------------------------------------
// UUID / parent-UUID helpers
// ---------------------------------------------------------------------------
/// Borrow the `uuid` of a message, whatever its variant.
///
/// Yields `None` for variants that have no UUID field (`QueueOperation`,
/// `FileHistorySnapshot`, `LastPrompt`) and for messages whose UUID is absent.
pub fn message_uuid(msg: &RawMessage) -> Option<&str> {
    let uuid = match msg {
        RawMessage::User(user) => &user.uuid,
        RawMessage::Assistant(assistant) => &assistant.uuid,
        RawMessage::Progress(progress) => &progress.uuid,
        RawMessage::System(system) => &system.uuid,
        RawMessage::QueueOperation(_)
        | RawMessage::FileHistorySnapshot(_)
        | RawMessage::LastPrompt(_) => return None,
    };
    uuid.as_deref()
}
/// Borrow the `parent_uuid` of a message, whatever its variant.
///
/// Yields `None` for variants that have no parent field (`QueueOperation`,
/// `FileHistorySnapshot`, `LastPrompt`) and for messages without a parent.
pub fn message_parent_uuid(msg: &RawMessage) -> Option<&str> {
    let parent = match msg {
        RawMessage::User(user) => &user.parent_uuid,
        RawMessage::Assistant(assistant) => &assistant.parent_uuid,
        RawMessage::Progress(progress) => &progress.parent_uuid,
        RawMessage::System(system) => &system.parent_uuid,
        RawMessage::QueueOperation(_)
        | RawMessage::FileHistorySnapshot(_)
        | RawMessage::LastPrompt(_) => return None,
    };
    parent.as_deref()
}
// ---------------------------------------------------------------------------
// ConversationTree impl
// ---------------------------------------------------------------------------
impl ConversationTree {
    /// Build a conversation tree from a sequence of messages.
    ///
    /// Messages without a UUID (e.g. `QueueOperation`) are silently skipped.
    /// If a message's `parentUuid` is present but not found in the set,
    /// or names the message itself, that message is treated as a root.
    ///
    /// When the same UUID occurs more than once (Claude Code can write
    /// several lines for one streamed message), the node stores the *last*
    /// occurrence, keeps the position of the *first* occurrence, and is
    /// linked exactly once. Every node therefore has at most one parent, so
    /// duplicates cannot create spurious branches or cycles.
    pub fn build(messages: &[RawMessage]) -> Self {
        let mut nodes: HashMap<String, ConversationNode> =
            HashMap::with_capacity(messages.len());
        // Distinct UUIDs in first-seen order; drives the linking pass.
        let mut order: Vec<String> = Vec::with_capacity(messages.len());

        // First pass: insert every addressable message as a node.
        for msg in messages {
            if let Some(uuid) = message_uuid(msg) {
                let node = ConversationNode {
                    uuid: uuid.to_string(),
                    message: msg.clone(),
                    children: Vec::new(),
                };
                // `insert` returns `None` only the first time a UUID is seen.
                if nodes.insert(uuid.to_string(), node).is_none() {
                    order.push(uuid.to_string());
                }
            }
        }

        // Second pass: wire each distinct node to its parent (or to roots).
        let mut roots: Vec<String> = Vec::new();
        for uuid in order {
            // Copy the parent id out first so the shared borrow of `nodes`
            // ends before the mutable lookup below.
            let parent_id: Option<String> = nodes
                .get(&uuid)
                .and_then(|node| message_parent_uuid(&node.message))
                // A self-reference is malformed data; never link a node to itself.
                .filter(|parent| *parent != uuid.as_str())
                .map(|parent| parent.to_string());

            let parent_node = match parent_id {
                Some(ref id) => nodes.get_mut(id),
                None => None,
            };
            match parent_node {
                Some(parent) => parent.children.push(uuid),
                // No parent, or parent not in the provided slice — treat as root.
                None => roots.push(uuid),
            }
        }

        ConversationTree { roots, nodes }
    }

    /// Walk the *main thread*: start from the first root and always follow
    /// the first child at each step.
    ///
    /// In a linear session this is the complete conversation. In a branching
    /// session this is the path taken before any edits.
    ///
    /// The walk stops at a node without children, at a UUID missing from
    /// `nodes`, or when a UUID repeats. The repeat check guarantees
    /// termination even for a hand-built tree that contains a cycle.
    pub fn main_thread(&self) -> Vec<&ConversationNode> {
        let mut thread = Vec::new();
        let mut visited: std::collections::HashSet<&str> = std::collections::HashSet::new();
        let mut next = self.roots.first().map(String::as_str);

        while let Some(id) = next {
            if !visited.insert(id) {
                break;
            }
            let node = match self.nodes.get(id) {
                Some(node) => node,
                None => break,
            };
            thread.push(node);
            next = node.children.first().map(String::as_str);
        }
        thread
    }

    /// Returns `true` when every node has at most one child (no branches).
    pub fn is_linear(&self) -> bool {
        self.nodes.values().all(|n| n.children.len() <= 1)
    }

    /// Returns all nodes that have more than one child (branch points).
    ///
    /// The order of the returned nodes follows `HashMap` iteration and is
    /// therefore unspecified.
    pub fn branch_points(&self) -> Vec<&ConversationNode> {
        self.nodes
            .values()
            .filter(|n| n.children.len() > 1)
            .collect()
    }
}
-847
View File
@@ -1,847 +0,0 @@
//! Core types for parsing Claude Code JSONL session data.
use camino::Utf8PathBuf;
use serde::{Deserialize, Serialize};
// ---------------------------------------------------------------------------
// Content types
// ---------------------------------------------------------------------------
/// Content is either a plain string or an array of content blocks.
///
/// Uses a custom deserializer so that the `Blocks` variant applies lenient
/// deserialization — unknown content block types (e.g. `tool_reference`)
/// are silently skipped instead of failing the entire message.
///
/// Serialization is derived and untagged: `Text` is written as a bare JSON
/// string and `Blocks` as a JSON array.
#[derive(Debug, Clone, Serialize)]
#[serde(untagged)]
pub enum Content {
/// The content was a single JSON string.
Text(String),
/// The content was a JSON array; holds the blocks that deserialized
/// successfully.
Blocks(Vec<ContentBlock>),
}
impl<'de> serde::Deserialize<'de> for Content {
fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let value = serde_json::Value::deserialize(deserializer)?;
match value {
serde_json::Value::String(s) => Ok(Content::Text(s)),
serde_json::Value::Array(arr) => {
let blocks = arr
.into_iter()
.filter_map(|v| {
serde_json::from_value::<ContentBlock>(v.clone())
.ok()
.or_else(|| {
tracing::debug!(
"Skipping unknown content block: {:?}",
v.get("type")
);
None
})
})
.collect();
Ok(Content::Blocks(blocks))
}
other => Err(serde::de::Error::custom(format!(
"expected string or array for Content, got {}",
match &other {
serde_json::Value::Null => "null",
serde_json::Value::Bool(_) => "bool",
serde_json::Value::Number(_) => "number",
serde_json::Value::Object(_) => "object",
_ => "unknown",
}
))),
}
}
}
/// Typed content block inside messages.
///
/// Internally tagged on the JSON `type` field, with variant names in
/// snake_case (`text`, `tool_use`, `tool_result`, `thinking`, `image`).
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ContentBlock {
/// `"type": "text"` — a block holding a text string.
Text {
text: String,
},
/// `"type": "tool_use"` — a tool invocation.
ToolUse {
// Identifier of this tool call.
id: String,
// Tool name as written in the JSON (not yet mapped to `ToolName`).
name: String,
// Tool arguments, kept as untyped JSON.
input: serde_json::Value,
// Optional; absent in the JSON deserializes to `None`. Kept as untyped JSON.
#[serde(default)]
caller: Option<serde_json::Value>,
},
/// `"type": "tool_result"` — the result for a tool call.
ToolResult {
// Identifier of the tool call this result belongs to.
tool_use_id: String,
// Optional result payload; may itself be text or nested blocks.
#[serde(default)]
content: Option<Content>,
// Defaults to `false` when the field is missing.
#[serde(default)]
is_error: bool,
},
/// `"type": "thinking"` — a block holding a `thinking` string.
Thinking {
thinking: String,
},
/// `"type": "image"` — image source kept as untyped JSON.
Image {
source: serde_json::Value,
},
}
// ---------------------------------------------------------------------------
// Lenient content block deserialization
// ---------------------------------------------------------------------------
/// Deserializes a `Vec<ContentBlock>` leniently — unknown block types are
/// silently skipped instead of failing the entire message.
fn deserialize_content_blocks<'de, D>(
deserializer: D,
) -> std::result::Result<Vec<ContentBlock>, D::Error>
where
D: serde::Deserializer<'de>,
{
use serde::Deserialize as _;
let raw: Vec<serde_json::Value> = Vec::deserialize(deserializer)?;
Ok(raw
.into_iter()
.filter_map(|v| {
serde_json::from_value::<ContentBlock>(v.clone()).ok().or_else(|| {
tracing::debug!("Skipping unknown content block: {:?}", v.get("type"));
None
})
})
.collect())
}
// ---------------------------------------------------------------------------
// Top-level JSONL line discriminator
// ---------------------------------------------------------------------------
/// Top-level JSONL line discriminator.
///
/// Internally tagged on the JSON `type` field, with variant names in
/// kebab-case: `user`, `assistant`, `progress`, `system`, `queue-operation`,
/// `file-history-snapshot`, `last-prompt`. A line with any other `type`
/// fails to deserialize.
#[derive(Debug, Clone, Deserialize)]
#[serde(tag = "type", rename_all = "kebab-case")]
pub enum RawMessage {
/// `"type": "user"`.
User(RawUserMessage),
/// `"type": "assistant"`.
Assistant(RawAssistantMessage),
/// `"type": "progress"`.
Progress(RawProgressMessage),
/// `"type": "system"`.
System(RawSystemMessage),
/// `"type": "queue-operation"`.
QueueOperation(RawQueueOperation),
/// `"type": "file-history-snapshot"`.
FileHistorySnapshot(RawFileHistorySnapshot),
/// `"type": "last-prompt"`.
LastPrompt(RawLastPrompt),
}
// ---------------------------------------------------------------------------
// User message
// ---------------------------------------------------------------------------
/// A `"type": "user"` JSONL line.
///
/// JSON keys are camelCase (`parentUuid`, `sessionId`, `isMeta`, …). Every
/// field except `message` is optional: a missing key deserializes to `None`
/// (or `false` for `is_sidechain`).
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RawUserMessage {
/// Identifier of this message.
#[serde(default)]
pub uuid: Option<String>,
/// Identifier of the message this one follows; `None` when absent or null.
#[serde(default)]
pub parent_uuid: Option<String>,
/// Timestamp string, stored unparsed.
#[serde(default)]
pub timestamp: Option<String>,
#[serde(default)]
pub session_id: Option<String>,
#[serde(default)]
pub cwd: Option<String>,
#[serde(default)]
pub version: Option<String>,
#[serde(default)]
pub git_branch: Option<String>,
#[serde(default)]
pub is_sidechain: bool,
/// Tri-state: the key may be absent (`None`), `false`, or `true`.
#[serde(default)]
pub is_meta: Option<bool>,
#[serde(default)]
pub user_type: Option<String>,
/// The message payload (role and content). Required.
pub message: UserMessageInner,
}
/// The `message` object nested inside a user JSONL line.
///
/// Both fields are required when deserializing.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct UserMessageInner {
/// Role string as written in the JSON.
pub role: String,
/// Either plain text or a list of content blocks (see [`Content`]).
pub content: Content,
}
// ---------------------------------------------------------------------------
// Assistant message
// ---------------------------------------------------------------------------
/// A `"type": "assistant"` JSONL line.
///
/// JSON keys are camelCase (`parentUuid`, `sessionId`, `requestId`, …). Every
/// field except `message` is optional: a missing key deserializes to `None`
/// (or `false` for `is_sidechain`).
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RawAssistantMessage {
/// Identifier of this message.
#[serde(default)]
pub uuid: Option<String>,
/// Identifier of the message this one follows; `None` when absent or null.
#[serde(default)]
pub parent_uuid: Option<String>,
/// Timestamp string, stored unparsed.
#[serde(default)]
pub timestamp: Option<String>,
#[serde(default)]
pub session_id: Option<String>,
#[serde(default)]
pub cwd: Option<String>,
#[serde(default)]
pub version: Option<String>,
#[serde(default)]
pub git_branch: Option<String>,
#[serde(default)]
pub is_sidechain: bool,
#[serde(default)]
pub request_id: Option<String>,
/// The nested API response object (see [`AssistantInner`]). Required.
pub message: AssistantInner,
}
// NOTE: AssistantInner is the Anthropic API response object nested inside
// the Claude Code JSONL wrapper. The API uses snake_case (stop_reason, etc.)
// unlike the outer JSONL wrapper which uses camelCase.
/// The `message` object nested inside an assistant JSONL line.
///
/// Field names are used as-is (snake_case); there is no `rename_all`. Every
/// field is optional and falls back to its default when missing.
#[derive(Debug, Clone, Deserialize)]
pub struct AssistantInner {
#[serde(default)]
pub model: Option<String>,
#[serde(default)]
pub id: Option<String>,
/// Read from the JSON key `type`.
#[serde(default, rename = "type")]
pub message_type: Option<String>,
#[serde(default)]
pub role: Option<String>,
/// Content blocks, deserialized leniently: blocks of unknown type are
/// skipped, and a missing `content` key yields an empty vector.
#[serde(default, deserialize_with = "deserialize_content_blocks")]
pub content: Vec<ContentBlock>,
#[serde(default)]
pub stop_reason: Option<String>,
#[serde(default)]
pub stop_sequence: Option<String>,
/// Usage information, kept as untyped JSON.
#[serde(default)]
pub usage: Option<serde_json::Value>,
}
// ---------------------------------------------------------------------------
// Progress message
// ---------------------------------------------------------------------------
/// A `"type": "progress"` JSONL line.
///
/// JSON keys are camelCase. Every field is optional: a missing key
/// deserializes to `None` (or `false` for `is_sidechain`).
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RawProgressMessage {
/// Identifier of this message.
#[serde(default)]
pub uuid: Option<String>,
/// Identifier of the message this one follows; `None` when absent or null.
#[serde(default)]
pub parent_uuid: Option<String>,
/// Timestamp string, stored unparsed.
#[serde(default)]
pub timestamp: Option<String>,
#[serde(default)]
pub session_id: Option<String>,
#[serde(default)]
pub cwd: Option<String>,
#[serde(default)]
pub version: Option<String>,
#[serde(default)]
pub git_branch: Option<String>,
#[serde(default)]
pub is_sidechain: bool,
/// Payload kept as untyped JSON; its schema is not modelled here.
#[serde(default)]
pub data: Option<serde_json::Value>,
}
// ---------------------------------------------------------------------------
// System message
// ---------------------------------------------------------------------------
/// A `"type": "system"` JSONL line.
///
/// JSON keys are camelCase. Every field is optional: a missing key
/// deserializes to `None` (or `false` for `is_sidechain`).
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RawSystemMessage {
/// Identifier of this message.
#[serde(default)]
pub uuid: Option<String>,
/// Identifier of the message this one follows; `None` when absent or null.
#[serde(default)]
pub parent_uuid: Option<String>,
/// Timestamp string, stored unparsed.
#[serde(default)]
pub timestamp: Option<String>,
#[serde(default)]
pub session_id: Option<String>,
#[serde(default)]
pub cwd: Option<String>,
#[serde(default)]
pub version: Option<String>,
#[serde(default)]
pub git_branch: Option<String>,
#[serde(default)]
pub is_sidechain: bool,
/// Payload kept as untyped JSON; its schema is not modelled here.
#[serde(default)]
pub data: Option<serde_json::Value>,
}
// ---------------------------------------------------------------------------
// Queue operation
// ---------------------------------------------------------------------------
/// A `"type": "queue-operation"` JSONL line.
///
/// JSON keys are camelCase. This type has no `uuid` or `parentUuid` field.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RawQueueOperation {
/// Operation name as written in the JSON (e.g. `"enqueue"`). Required.
pub operation: String,
/// Timestamp string, stored unparsed.
#[serde(default)]
pub timestamp: Option<String>,
#[serde(default)]
pub session_id: Option<String>,
}
// ---------------------------------------------------------------------------
// File history snapshot
// ---------------------------------------------------------------------------
/// A `"type": "file-history-snapshot"` JSONL line.
///
/// JSON keys are camelCase (`messageId`, `isSnapshotUpdate`, `snapshot`).
/// All fields fall back to their default when missing.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RawFileHistorySnapshot {
#[serde(default)]
pub message_id: Option<String>,
/// Defaults to `false` when the key is missing.
#[serde(default)]
pub is_snapshot_update: bool,
/// Snapshot payload kept as untyped JSON; its schema is not modelled here.
#[serde(default)]
pub snapshot: Option<serde_json::Value>,
}
// ---------------------------------------------------------------------------
// Last prompt
// ---------------------------------------------------------------------------
/// A `"type": "last-prompt"` JSONL line.
///
/// JSON keys are camelCase (`lastPrompt`, `sessionId`); both are optional.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RawLastPrompt {
/// Prompt text as written in the JSON.
#[serde(default)]
pub last_prompt: Option<String>,
#[serde(default)]
pub session_id: Option<String>,
}
// ---------------------------------------------------------------------------
// Tool types (for correlation module later)
// ---------------------------------------------------------------------------
/// Known tool names used by Claude Code.
///
/// Names are matched exactly and case-sensitively by the `From<String>`
/// conversion; anything unrecognised is preserved in [`ToolName::Other`].
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ToolName {
    Bash,
    Read,
    Write,
    Edit,
    Grep,
    Glob,
    Agent,
    Skill,
    WebSearch,
    WebFetch,
    TodoWrite,
    NotebookEdit,
    /// Any tool name without a dedicated variant; holds the original name.
    Other(String),
}

/// Maps a tool name string onto its [`ToolName`] variant.
impl From<String> for ToolName {
    fn from(s: String) -> Self {
        match s.as_str() {
            "Bash" => ToolName::Bash,
            "Read" => ToolName::Read,
            "Write" => ToolName::Write,
            "Edit" => ToolName::Edit,
            "Grep" => ToolName::Grep,
            "Glob" => ToolName::Glob,
            "Agent" => ToolName::Agent,
            "Skill" => ToolName::Skill,
            "WebSearch" => ToolName::WebSearch,
            "WebFetch" => ToolName::WebFetch,
            "TodoWrite" => ToolName::TodoWrite,
            "NotebookEdit" => ToolName::NotebookEdit,
            // Move the owned string instead of allocating a second copy.
            _ => ToolName::Other(s),
        }
    }
}
/// A tool call extracted from an assistant message.
#[derive(Debug, Clone)]
pub struct ToolCall {
/// Identifier of the tool call (e.g. `toolu_…`).
pub id: String,
/// Which tool was invoked.
pub name: ToolName,
/// Tool arguments, kept as untyped JSON.
pub input: serde_json::Value,
/// UUID of the message this call was extracted from.
pub source_message_uuid: String,
}
/// A tool result extracted from a user message.
#[derive(Debug, Clone)]
pub struct ToolResultData {
/// Identifier of the tool call this result answers.
pub tool_use_id: String,
/// Result text, if any.
pub content: Option<String>,
/// Whether the result was flagged as an error.
pub is_error: bool,
/// UUID of the message this result was extracted from.
pub source_message_uuid: String,
}
/// A correlated tool call + result pair.
#[derive(Debug, Clone)]
pub struct ToolExchange {
/// The tool invocation.
pub call: ToolCall,
/// The matching result; `None` when no result is paired with the call.
pub result: Option<ToolResultData>,
}
// ---------------------------------------------------------------------------
// Discovery types (for discovery module later)
// ---------------------------------------------------------------------------
/// A discovered Claude Code project directory.
#[derive(Debug, Clone)]
pub struct ClaudeProject {
// NOTE(review): presumably the project's directory on disk — confirm
// against the discovery module.
pub path: Utf8PathBuf,
// NOTE(review): presumably the decoded original project path — confirm
// against the discovery module.
pub original_path: String,
/// Sessions found for this project (not yet parsed).
pub sessions: Vec<SessionRef>,
}
/// Reference to a session (not yet parsed).
#[derive(Debug, Clone)]
pub struct SessionRef {
/// Session identifier.
pub id: String,
/// Path of the session's JSONL file.
pub jsonl_path: Utf8PathBuf,
/// Artifacts directory for the session, when one is known.
pub artifacts_dir: Option<Utf8PathBuf>,
/// Matching entry from `sessions-index.json`, when available.
pub index_entry: Option<SessionIndexEntry>,
}
/// From sessions-index.json (when available).
///
/// JSON keys are camelCase (`sessionId`, `firstPrompt`, `messageCount`, …).
/// Every field is optional and deserializes to `None` when missing.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SessionIndexEntry {
#[serde(default)]
pub session_id: Option<String>,
#[serde(default)]
pub first_prompt: Option<String>,
#[serde(default)]
pub summary: Option<String>,
#[serde(default)]
pub message_count: Option<u32>,
/// Kept as untyped JSON; the value's format is not interpreted here.
#[serde(default)]
pub created: Option<serde_json::Value>,
/// Kept as untyped JSON; the value's format is not interpreted here.
#[serde(default)]
pub modified: Option<serde_json::Value>,
#[serde(default)]
pub git_branch: Option<String>,
#[serde(default)]
pub project_path: Option<String>,
}
// ---------------------------------------------------------------------------
// Sub-agent types
// ---------------------------------------------------------------------------
/// Sub-agent metadata from .meta.json.
///
/// JSON keys are camelCase, so `agent_type` is read from `agentType`.
#[derive(Debug, Clone, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SubAgentMeta {
/// Agent type string; `None` when the key is missing.
#[serde(default)]
pub agent_type: Option<String>,
}
/// A parsed sub-agent session.
#[derive(Debug, Clone)]
pub struct SubAgentSession {
/// Agent identifier, taken from the `agent-<id>.jsonl` file name.
pub agent_id: String,
/// Metadata from the companion `.meta.json`, or an empty default.
pub meta: SubAgentMeta,
/// Messages parsed from the sub-agent's JSONL file.
pub messages: Vec<RawMessage>,
/// ID of the parent Agent tool call; `None` until a link has been made.
pub parent_tool_call_id: Option<String>,
}
// ---------------------------------------------------------------------------
// MessageMeta (convenience, future use)
// ---------------------------------------------------------------------------
/// Common metadata extracted from any message. Defined for future consumers.
///
/// Unlike the raw message structs, `uuid` and `session_id` are not optional
/// here.
#[derive(Debug, Clone)]
pub struct MessageMeta {
pub uuid: String,
pub parent_uuid: Option<String>,
/// Timestamp string, stored unparsed.
pub timestamp: Option<String>,
pub session_id: String,
pub cwd: Option<String>,
pub version: Option<String>,
pub git_branch: Option<String>,
pub is_sidechain: bool,
}
// ---------------------------------------------------------------------------
// ParsedSession
// ---------------------------------------------------------------------------
/// A fully parsed session with all correlations built.
#[derive(Debug)]
pub struct ParsedSession {
/// The session's messages.
pub messages: Vec<RawMessage>,
/// Parent/child tree over the messages.
pub tree: crate::tree::ConversationTree,
/// Tool calls paired with their results.
pub tool_exchanges: Vec<ToolExchange>,
/// Sub-agent sessions belonging to this session.
pub subagents: Vec<SubAgentSession>,
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn parse_content_text_string() {
let json = r#""Hello world""#;
let content: Content = serde_json::from_str(json).unwrap();
match content {
Content::Text(s) => assert_eq!(s, "Hello world"),
_ => panic!("Expected Content::Text"),
}
}
#[test]
fn parse_content_blocks() {
let json = r#"[{"type": "text", "text": "Hello"}]"#;
let content: Content = serde_json::from_str(json).unwrap();
match content {
Content::Blocks(blocks) => {
assert_eq!(blocks.len(), 1);
match &blocks[0] {
ContentBlock::Text { text } => assert_eq!(text, "Hello"),
_ => panic!("Expected ContentBlock::Text"),
}
}
_ => panic!("Expected Content::Blocks"),
}
}
#[test]
fn parse_tool_use_block() {
let json = r#"{"type": "tool_use", "id": "toolu_123", "name": "Bash", "input": {"command": "ls"}}"#;
let block: ContentBlock = serde_json::from_str(json).unwrap();
match block {
ContentBlock::ToolUse { id, name, .. } => {
assert_eq!(id, "toolu_123");
assert_eq!(name, "Bash");
}
_ => panic!("Expected ContentBlock::ToolUse"),
}
}
#[test]
fn parse_tool_result_block() {
let json = r#"{"type": "tool_result", "tool_use_id": "toolu_123", "content": "output text", "is_error": false}"#;
let block: ContentBlock = serde_json::from_str(json).unwrap();
match block {
ContentBlock::ToolResult {
tool_use_id,
is_error,
..
} => {
assert_eq!(tool_use_id, "toolu_123");
assert!(!is_error);
}
_ => panic!("Expected ContentBlock::ToolResult"),
}
}
#[test]
fn parse_thinking_block() {
let json = r#"{"type": "thinking", "thinking": "Let me consider..."}"#;
let block: ContentBlock = serde_json::from_str(json).unwrap();
match block {
ContentBlock::Thinking { thinking } => {
assert_eq!(thinking, "Let me consider...");
}
_ => panic!("Expected ContentBlock::Thinking"),
}
}
#[test]
fn parse_queue_operation() {
let json = r#"{"type": "queue-operation", "operation": "enqueue", "timestamp": "2026-03-14T21:15:17.531Z", "sessionId": "00f72d8d-fc54-485c-a082-310ffcabdb73"}"#;
let msg: RawMessage = serde_json::from_str(json).unwrap();
match msg {
RawMessage::QueueOperation(op) => {
assert_eq!(op.operation, "enqueue");
assert_eq!(
op.session_id.as_deref(),
Some("00f72d8d-fc54-485c-a082-310ffcabdb73")
);
}
_ => panic!("Expected RawMessage::QueueOperation"),
}
}
#[test]
fn parse_user_message_with_string_content() {
let json = r#"{
"parentUuid": "b1ab1ac7-fdb6-4e25-bc17-4c060b470b4a",
"isSidechain": false,
"userType": "external",
"cwd": "G:\\dev\\projects\\dirigent",
"sessionId": "00f72d8d-fc54-485c-a082-310ffcabdb73",
"version": "2.1.71",
"gitBranch": "main",
"type": "user",
"message": {
"role": "user",
"content": "Hello world"
},
"isMeta": false,
"uuid": "1d843a4a-b99d-4c02-91a3-7cfe3dcac9f0",
"timestamp": "2026-03-14T21:08:58.586Z"
}"#;
let msg: RawMessage = serde_json::from_str(json).unwrap();
match msg {
RawMessage::User(u) => {
assert_eq!(u.uuid.as_deref(), Some("1d843a4a-b99d-4c02-91a3-7cfe3dcac9f0"));
assert_eq!(u.session_id.as_deref(), Some("00f72d8d-fc54-485c-a082-310ffcabdb73"));
assert_eq!(u.is_meta, Some(false));
match &u.message.content {
Content::Text(s) => assert_eq!(s, "Hello world"),
_ => panic!("Expected Content::Text"),
}
}
_ => panic!("Expected RawMessage::User"),
}
}
#[test]
fn parse_assistant_message_with_tool_use() {
let json = r#"{
"parentUuid": "77793647-f957-4aec-8b04-a9c07e01e37b",
"isSidechain": false,
"userType": "external",
"cwd": "G:\\dev\\projects\\dirigent",
"sessionId": "00f72d8d-fc54-485c-a082-310ffcabdb73",
"version": "2.1.71",
"gitBranch": "main",
"message": {
"model": "claude-opus-4-6",
"id": "msg_01NcwYjEydGEyZCSCgwmcnYd",
"type": "message",
"role": "assistant",
"content": [
{
"type": "tool_use",
"id": "toolu_01DP5mkAQnAi2o54idq24cPn",
"name": "Agent",
"input": {
"description": "Investigate config sources of truth",
"subagent_type": "Explore",
"prompt": "test prompt"
},
"caller": { "type": "direct" }
}
],
"stop_reason": null,
"stop_sequence": null,
"usage": {
"input_tokens": 3,
"cache_creation_input_tokens": 20147,
"cache_read_input_tokens": 0,
"output_tokens": 9,
"service_tier": "standard"
}
},
"requestId": "req_011CZ3fYWGjcQCgh5d58d2k8",
"type": "assistant",
"uuid": "6cad0d13-d0ae-47fa-a6b1-b7b45a2b5e0b",
"timestamp": "2026-03-14T21:15:27.916Z"
}"#;
let msg: RawMessage = serde_json::from_str(json).unwrap();
match msg {
RawMessage::Assistant(a) => {
assert_eq!(a.uuid.as_deref(), Some("6cad0d13-d0ae-47fa-a6b1-b7b45a2b5e0b"));
assert_eq!(a.message.model.as_deref(), Some("claude-opus-4-6"));
assert_eq!(a.message.content.len(), 1);
match &a.message.content[0] {
ContentBlock::ToolUse { name, id, .. } => {
assert_eq!(name, "Agent");
assert_eq!(id, "toolu_01DP5mkAQnAi2o54idq24cPn");
}
_ => panic!("Expected ContentBlock::ToolUse"),
}
assert!(a.message.stop_reason.is_none());
assert!(a.message.usage.is_some());
}
_ => panic!("Expected RawMessage::Assistant"),
}
}
#[test]
fn unknown_content_block_type_skipped_in_assistant() {
let json = r#"{
"parentUuid": null,
"isSidechain": false,
"sessionId": "test",
"message": {
"role": "assistant",
"content": [
{"type": "text", "text": "known"},
{"type": "future_type", "data": "something"}
]
},
"type": "assistant",
"uuid": "test-uuid",
"timestamp": "2026-01-01T00:00:00Z"
}"#;
let msg: RawMessage = serde_json::from_str(json).unwrap();
match msg {
RawMessage::Assistant(a) => {
assert_eq!(a.message.content.len(), 1);
match &a.message.content[0] {
ContentBlock::Text { text } => assert_eq!(text, "known"),
_ => panic!("Expected ContentBlock::Text"),
}
}
_ => panic!("Expected RawMessage::Assistant"),
}
}
// -----------------------------------------------------------------------
// Regression tests for parse failure audit (2026-04-04)
// -----------------------------------------------------------------------
/// Regression (audit suggestions 1 & 3): a `tool_reference` block nested in
/// `tool_result.content` is dropped silently instead of failing the whole
/// message.
#[test]
fn tool_reference_in_tool_result_content_does_not_fail() {
    let json = r#"{
"type": "user",
"uuid": "test-uuid",
"parentUuid": null,
"isSidechain": false,
"sessionId": "test-session",
"message": {
"role": "user",
"content": [
{
"type": "tool_result",
"tool_use_id": "toolu_abc123",
"content": [
{"type": "text", "text": "File contents here"},
{"type": "tool_reference", "tool_name": "TodoWrite"}
],
"is_error": false
}
]
}
}"#;
    let parsed: RawMessage = serde_json::from_str(json).unwrap();

    let RawMessage::User(user) = parsed else {
        panic!("Expected RawMessage::User");
    };
    let Content::Blocks(outer) = &user.message.content else {
        panic!("Expected Content::Blocks");
    };
    assert_eq!(outer.len(), 1);

    let ContentBlock::ToolResult { tool_use_id, content, .. } = &outer[0] else {
        panic!("Expected ContentBlock::ToolResult");
    };
    assert_eq!(tool_use_id, "toolu_abc123");

    // The nested content keeps only the text block; tool_reference is skipped.
    let Content::Blocks(inner) = content.as_ref().unwrap() else {
        panic!("Expected inner Content::Blocks");
    };
    assert_eq!(inner.len(), 1);

    let ContentBlock::Text { text } = &inner[0] else {
        panic!("Expected inner ContentBlock::Text");
    };
    assert_eq!(text, "File contents here");
}
/// Regression (audit suggestion 2): a `file-history-snapshot` line
/// deserializes into `RawMessage::FileHistorySnapshot` rather than erroring.
#[test]
fn file_history_snapshot_parses() {
    let json = r#"{
"type": "file-history-snapshot",
"messageId": "abc-123",
"isSnapshotUpdate": false,
"snapshot": {
"messageId": "abc-123",
"trackedFileBackups": {
"src/main.rs": {"backupFileName": "main.rs.bak", "backupTime": "2026-01-01T00:00:00Z", "version": "1"}
},
"timestamp": "2026-01-01T00:00:00Z"
}
}"#;
    let parsed: RawMessage = serde_json::from_str(json).unwrap();

    let RawMessage::FileHistorySnapshot(snapshot_msg) = parsed else {
        panic!("Expected RawMessage::FileHistorySnapshot");
    };
    assert_eq!(snapshot_msg.message_id.as_deref(), Some("abc-123"));
    assert!(!snapshot_msg.is_snapshot_update);
    assert!(snapshot_msg.snapshot.is_some());
}
/// Regression (audit suggestion 2): a `last-prompt` line deserializes into
/// `RawMessage::LastPrompt` with both of its fields populated.
#[test]
fn last_prompt_parses() {
    let json = r#"{
"type": "last-prompt",
"lastPrompt": "Fix the bug in auth middleware",
"sessionId": "session-uuid-123"
}"#;
    let parsed: RawMessage = serde_json::from_str(json).unwrap();

    let RawMessage::LastPrompt(prompt) = parsed else {
        panic!("Expected RawMessage::LastPrompt");
    };
    assert_eq!(
        prompt.last_prompt.as_deref(),
        Some("Fix the bug in auth middleware")
    );
    assert_eq!(prompt.session_id.as_deref(), Some("session-uuid-123"));
}
/// Regression (audit suggestion 3): unknown block types inside a *user*
/// message are skipped as well, so leniency applies to every message kind.
#[test]
fn unknown_content_block_in_user_message_skipped() {
    let json = r#"{
"type": "user",
"uuid": "test-uuid",
"isSidechain": false,
"sessionId": "test",
"message": {
"role": "user",
"content": [
{"type": "text", "text": "known"},
{"type": "future_unknown_type", "data": "something"}
]
}
}"#;
    let parsed: RawMessage = serde_json::from_str(json).unwrap();

    let RawMessage::User(user) = parsed else {
        panic!("Expected RawMessage::User");
    };
    let Content::Blocks(blocks) = &user.message.content else {
        panic!("Expected Content::Blocks");
    };
    assert_eq!(blocks.len(), 1);

    let ContentBlock::Text { text } = &blocks[0] else {
        panic!("Expected ContentBlock::Text");
    };
    assert_eq!(text, "known");
}
/// `ToolName::from(String)` maps known tool names onto their dedicated
/// variants and wraps anything else in `ToolName::Other`.
#[test]
fn tool_name_from_string() {
    let known = [
        ("Bash", ToolName::Bash),
        ("Read", ToolName::Read),
        ("Agent", ToolName::Agent),
        ("WebSearch", ToolName::WebSearch),
    ];
    for (raw, expected) in known {
        assert_eq!(ToolName::from(raw.to_string()), expected);
    }

    // Unrecognised names are preserved verbatim inside `Other`.
    assert_eq!(
        ToolName::from("CustomTool".to_string()),
        ToolName::Other("CustomTool".to_string())
    );
}
}
-70
View File
@@ -1,70 +0,0 @@
use chrono::{DateTime, Utc};
/// Convert a JSON value found in Claude Code data into a UTC timestamp.
///
/// Accepted inputs:
/// - a string in RFC 3339 / ISO 8601 form, e.g. `"2026-03-22T17:00:13.192Z"`
/// - an integer greater than `1_000_000_000_000`, read as Unix milliseconds
///   (e.g. `1769461914249`)
/// - any other integer, read as Unix seconds (e.g. `1769461914`)
///
/// Returns `None` for an unparseable string, for a number that `as_i64`
/// cannot represent, for an integer the `chrono` constructors reject, and
/// for every other JSON value kind.
pub fn parse_timestamp(value: &serde_json::Value) -> Option<DateTime<Utc>> {
    match value {
        serde_json::Value::String(text) => DateTime::parse_from_rfc3339(text)
            .map(|parsed| parsed.with_timezone(&Utc))
            .ok(),
        serde_json::Value::Number(number) => {
            let raw = number.as_i64()?;
            // Magnitude decides the unit: above 1e12 the value is milliseconds.
            if raw > 1_000_000_000_000 {
                DateTime::from_timestamp_millis(raw)
            } else {
                DateTime::from_timestamp(raw, 0)
            }
        }
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::Datelike;

    /// An RFC 3339 string yields the calendar date it spells out.
    #[test]
    fn parse_timestamp_iso8601() {
        let parsed = parse_timestamp(&serde_json::json!("2026-03-22T17:00:13.192Z")).unwrap();
        assert_eq!(parsed.year(), 2026);
        assert_eq!(parsed.month(), 3);
        assert_eq!(parsed.day(), 22);
    }

    /// A 13-digit integer is read as Unix milliseconds.
    #[test]
    fn parse_timestamp_unix_millis() {
        let parsed = parse_timestamp(&serde_json::json!(1769461914249_i64)).unwrap();
        assert!(parsed.year() >= 2025);
    }

    /// A 10-digit integer is read as Unix seconds.
    #[test]
    fn parse_timestamp_unix_seconds() {
        let parsed = parse_timestamp(&serde_json::json!(1769461914_i64)).unwrap();
        assert!(parsed.year() >= 2025);
    }

    /// JSON `null` is not a timestamp.
    #[test]
    fn parse_timestamp_null_returns_none() {
        assert!(parse_timestamp(&serde_json::json!(null)).is_none());
    }

    /// A string that is not RFC 3339 is rejected rather than panicking.
    #[test]
    fn parse_timestamp_invalid_string_returns_none() {
        assert!(parse_timestamp(&serde_json::json!("not a date")).is_none());
    }
}
@@ -1,6 +0,0 @@
{"type":"user","uuid":"r-001","parentUuid":null,"timestamp":"2026-03-23T10:00:00.000Z","sessionId":"test-session-tree","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"userType":"external","message":{"role":"user","content":"Help me"}}
{"type":"assistant","uuid":"a-001","parentUuid":"r-001","timestamp":"2026-03-23T10:00:01.000Z","sessionId":"test-session-tree","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-001","message":{"model":"claude-opus-4-6","id":"msg-001","type":"message","role":"assistant","content":[{"type":"text","text":"Sure"}],"stop_reason":"end_turn","usage":{"input_tokens":10,"output_tokens":5}}}
{"type":"user","uuid":"u-002","parentUuid":"a-001","timestamp":"2026-03-23T10:00:02.000Z","sessionId":"test-session-tree","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"userType":"external","message":{"role":"user","content":"Do option A"}}
{"type":"assistant","uuid":"a-003","parentUuid":"u-002","timestamp":"2026-03-23T10:00:03.000Z","sessionId":"test-session-tree","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-002","message":{"model":"claude-opus-4-6","id":"msg-003","type":"message","role":"assistant","content":[{"type":"text","text":"Doing A"}],"stop_reason":"end_turn","usage":{"input_tokens":15,"output_tokens":5}}}
{"type":"user","uuid":"u-002b","parentUuid":"a-001","timestamp":"2026-03-23T10:00:04.000Z","sessionId":"test-session-tree","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"userType":"external","message":{"role":"user","content":"Actually, do option B"}}
{"type":"assistant","uuid":"a-003b","parentUuid":"u-002b","timestamp":"2026-03-23T10:00:05.000Z","sessionId":"test-session-tree","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-003","message":{"model":"claude-opus-4-6","id":"msg-003b","type":"message","role":"assistant","content":[{"type":"text","text":"Doing B"}],"stop_reason":"end_turn","usage":{"input_tokens":15,"output_tokens":5}}}
@@ -1,6 +0,0 @@
{"type":"queue-operation","operation":"enqueue","timestamp":"2026-03-14T21:00:00.000Z","sessionId":"test-session-001"}
{"type":"queue-operation","operation":"dequeue","timestamp":"2026-03-14T21:00:00.001Z","sessionId":"test-session-001"}
{"type":"user","uuid":"u-001","parentUuid":null,"timestamp":"2026-03-14T21:00:01.000Z","sessionId":"test-session-001","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"userType":"external","message":{"role":"user","content":"Hello, help me with this project"}}
{"type":"assistant","uuid":"a-001","parentUuid":"u-001","timestamp":"2026-03-14T21:00:02.000Z","sessionId":"test-session-001","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-001","message":{"model":"claude-opus-4-6","id":"msg-001","type":"message","role":"assistant","content":[{"type":"text","text":"I'll help you."},{"type":"tool_use","id":"toolu_01","name":"Bash","input":{"command":"ls","description":"List files"}}],"stop_reason":"tool_use","usage":{"input_tokens":100,"output_tokens":50}}}
{"type":"user","uuid":"u-002","parentUuid":"a-001","timestamp":"2026-03-14T21:00:03.000Z","sessionId":"test-session-001","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":true,"userType":"external","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"toolu_01","content":"file1.rs\nfile2.rs","is_error":false}]}}
{"type":"assistant","uuid":"a-002","parentUuid":"u-002","timestamp":"2026-03-14T21:00:04.000Z","sessionId":"test-session-001","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-002","message":{"model":"claude-opus-4-6","id":"msg-002","type":"message","role":"assistant","content":[{"type":"text","text":"I can see two Rust files in the directory."}],"stop_reason":"end_turn","usage":{"input_tokens":200,"output_tokens":30}}}
@@ -1,9 +0,0 @@
{"type":"queue-operation","operation":"enqueue","timestamp":"2026-03-14T21:00:00.000Z","sessionId":"test-session-noise"}
{"type":"user","uuid":"u-n-001","parentUuid":null,"timestamp":"2026-03-14T21:00:01.000Z","sessionId":"test-session-noise","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":true,"message":{"role":"user","content":"system injected stuff"}}
{"type":"user","uuid":"u-n-002","parentUuid":"u-n-001","timestamp":"2026-03-14T21:00:02.000Z","sessionId":"test-session-noise","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"message":{"role":"user","content":"Warmup"}}
{"type":"user","uuid":"u-n-003","parentUuid":"u-n-002","timestamp":"2026-03-14T21:00:03.000Z","sessionId":"test-session-noise","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"message":{"role":"user","content":"[Request interrupted by user"}}
{"type":"user","uuid":"u-n-004","parentUuid":"u-n-003","timestamp":"2026-03-14T21:00:04.000Z","sessionId":"test-session-noise","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"message":{"role":"user","content":"This session is being continued from a previous conversation"}}
{"type":"user","uuid":"u-n-005","parentUuid":"u-n-004","timestamp":"2026-03-14T21:00:05.000Z","sessionId":"test-session-noise","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"message":{"role":"user","content":"API Error: rate limit exceeded"}}
{"type":"user","uuid":"u-n-006","parentUuid":"u-n-005","timestamp":"2026-03-14T21:00:06.000Z","sessionId":"test-session-noise","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"message":{"role":"user","content":"Caveat: The messages below were generated by the user"}}
{"type":"user","uuid":"u-n-007","parentUuid":"u-n-006","timestamp":"2026-03-14T21:00:07.000Z","sessionId":"test-session-noise","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"message":{"role":"user","content":"Please help me fix this bug"}}
{"type":"assistant","uuid":"a-n-001","parentUuid":"u-n-007","timestamp":"2026-03-14T21:00:08.000Z","sessionId":"test-session-noise","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"message":{"id":"msg-n-001","role":"assistant","content":[{"type":"text","text":"Sure, let me help."}],"stop_reason":"end_turn"}}
@@ -1,6 +0,0 @@
{"type":"user","uuid":"u-100","parentUuid":null,"timestamp":"2026-03-23T10:00:00.000Z","sessionId":"test-session-dedup","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"userType":"external","message":{"role":"user","content":"What files are here?"}}
{"type":"assistant","uuid":"a-100","parentUuid":"u-100","timestamp":"2026-03-23T10:00:01.000Z","sessionId":"test-session-dedup","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-100","message":{"model":"claude-opus-4-6","id":"msg-100","type":"message","role":"assistant","content":[{"type":"text","text":"Let me"}],"stop_reason":null,"usage":{"input_tokens":50,"output_tokens":3}}}
{"type":"assistant","uuid":"a-100","parentUuid":"u-100","timestamp":"2026-03-23T10:00:01.100Z","sessionId":"test-session-dedup","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-100","message":{"model":"claude-opus-4-6","id":"msg-100","type":"message","role":"assistant","content":[{"type":"text","text":"Let me look"},{"type":"tool_use","id":"toolu_100","name":"Bash","input":{"command":""}}],"stop_reason":null,"usage":{"input_tokens":50,"output_tokens":12}}}
{"type":"assistant","uuid":"a-100","parentUuid":"u-100","timestamp":"2026-03-23T10:00:01.200Z","sessionId":"test-session-dedup","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-100","message":{"model":"claude-opus-4-6","id":"msg-100","type":"message","role":"assistant","content":[{"type":"text","text":"Let me look at this."},{"type":"tool_use","id":"toolu_100","name":"Bash","input":{"command":"ls"}}],"stop_reason":"tool_use","usage":{"input_tokens":50,"output_tokens":20}}}
{"type":"user","uuid":"u-101","parentUuid":"a-100","timestamp":"2026-03-23T10:00:02.000Z","sessionId":"test-session-dedup","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":true,"userType":"external","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"toolu_100","content":"main.rs\nlib.rs","is_error":false}]}}
{"type":"assistant","uuid":"a-101","parentUuid":"u-101","timestamp":"2026-03-23T10:00:03.000Z","sessionId":"test-session-dedup","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-101","message":{"model":"claude-opus-4-6","id":"msg-101","type":"message","role":"assistant","content":[{"type":"text","text":"Done."}],"stop_reason":"end_turn","usage":{"input_tokens":100,"output_tokens":5}}}
@@ -1,4 +0,0 @@
{"type":"user","uuid":"u-300","parentUuid":null,"timestamp":"2026-03-23T12:00:00.000Z","sessionId":"test-session-sub","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"message":{"role":"user","content":"Search the codebase"}}
{"type":"assistant","uuid":"a-300","parentUuid":"u-300","timestamp":"2026-03-23T12:00:01.000Z","sessionId":"test-session-sub","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-300","message":{"model":"claude-opus-4-6","id":"msg-300","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_300","name":"Agent","input":{"description":"Search codebase","subagent_type":"Explore","prompt":"Find all config files"}}],"stop_reason":"tool_use","usage":{"input_tokens":100,"output_tokens":20}}}
{"type":"user","uuid":"u-301","parentUuid":"a-300","timestamp":"2026-03-23T12:00:30.000Z","sessionId":"test-session-sub","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":true,"message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"toolu_300","content":"Found 3 config files","is_error":false}]}}
{"type":"assistant","uuid":"a-301","parentUuid":"u-301","timestamp":"2026-03-23T12:00:31.000Z","sessionId":"test-session-sub","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-301","message":{"model":"claude-opus-4-6","id":"msg-301","type":"message","role":"assistant","content":[{"type":"text","text":"I found the config files."}],"stop_reason":"end_turn","usage":{"input_tokens":200,"output_tokens":10}}}
@@ -1,2 +0,0 @@
{"type":"user","uuid":"sa-u1","parentUuid":null,"timestamp":"2026-03-23T12:00:02.000Z","sessionId":"agent-abc123","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":true,"isMeta":false,"message":{"role":"user","content":"Find all config files"}}
{"type":"assistant","uuid":"sa-a1","parentUuid":"sa-u1","timestamp":"2026-03-23T12:00:03.000Z","sessionId":"agent-abc123","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":true,"requestId":"req-sa1","message":{"model":"claude-opus-4-6","id":"msg-sa1","type":"message","role":"assistant","content":[{"type":"text","text":"Found config.toml, settings.json, .env"}],"stop_reason":"end_turn","usage":{"input_tokens":50,"output_tokens":15}}}
@@ -1 +0,0 @@
{"agentType": "Explore"}
@@ -1,6 +0,0 @@
{"type":"user","uuid":"u-200","parentUuid":null,"timestamp":"2026-03-23T10:00:00.000Z","sessionId":"test-session-corr","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"userType":"external","message":{"role":"user","content":"Fix the bug"}}
{"type":"assistant","uuid":"a-200","parentUuid":"u-200","timestamp":"2026-03-23T10:00:01.000Z","sessionId":"test-session-corr","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-200","message":{"model":"claude-opus-4-6","id":"msg-200","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_200","name":"Bash","input":{"command":"cargo test"}},{"type":"tool_use","id":"toolu_201","name":"Read","input":{"file_path":"src/main.rs"}}],"stop_reason":"tool_use","usage":{"input_tokens":100,"output_tokens":50}}}
{"type":"user","uuid":"u-201","parentUuid":"a-200","timestamp":"2026-03-23T10:00:02.000Z","sessionId":"test-session-corr","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":true,"userType":"external","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"toolu_200","content":"test result output","is_error":false},{"type":"tool_result","tool_use_id":"toolu_201","content":"fn main() {}","is_error":false}]}}
{"type":"assistant","uuid":"a-201","parentUuid":"u-201","timestamp":"2026-03-23T10:00:03.000Z","sessionId":"test-session-corr","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-201","message":{"model":"claude-opus-4-6","id":"msg-201","type":"message","role":"assistant","content":[{"type":"tool_use","id":"toolu_202","name":"Write","input":{"file_path":"src/fix.rs","content":"fixed"}}],"stop_reason":"tool_use","usage":{"input_tokens":150,"output_tokens":30}}}
{"type":"user","uuid":"u-202","parentUuid":"a-201","timestamp":"2026-03-23T10:00:04.000Z","sessionId":"test-session-corr","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":true,"userType":"external","message":{"role":"user","content":[{"type":"tool_result","tool_use_id":"toolu_202","content":"File written successfully","is_error":false}]}}
{"type":"assistant","uuid":"a-202","parentUuid":"u-202","timestamp":"2026-03-23T10:00:05.000Z","sessionId":"test-session-corr","cwd":"G:\\dev\\test","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-202","message":{"model":"claude-opus-4-6","id":"msg-202","type":"message","role":"assistant","content":[{"type":"text","text":"Bug is fixed."}],"stop_reason":"end_turn","usage":{"input_tokens":200,"output_tokens":20}}}
@@ -1,294 +0,0 @@
use camino::{Utf8Path, Utf8PathBuf};
use chrono::Datelike;
use dirigent_anth::{
correlation::correlate_tools,
dedup::dedup_messages,
noise::{classify_noise, NoiseKind},
parse_session,
tree::ConversationTree,
types::{ContentBlock, RawMessage},
util::parse_timestamp,
};
/// The minimal fixture parses into six messages: two queue operations, two
/// user messages and two assistant messages.
#[test]
fn parse_minimal_session() {
    /// Wire-format `type` tag for each message variant.
    fn kind_label(message: &RawMessage) -> &'static str {
        match message {
            RawMessage::User(_) => "user",
            RawMessage::Assistant(_) => "assistant",
            RawMessage::Progress(_) => "progress",
            RawMessage::System(_) => "system",
            RawMessage::QueueOperation(_) => "queue-operation",
            RawMessage::FileHistorySnapshot(_) => "file-history-snapshot",
            RawMessage::LastPrompt(_) => "last-prompt",
        }
    }

    let fixture = Utf8Path::new("tests/fixtures/minimal_session.jsonl");
    let messages = parse_session(fixture).unwrap();
    assert_eq!(messages.len(), 6, "Expected 6 messages, got {}", messages.len());

    let count_of = |wanted: &str| {
        messages
            .iter()
            .map(kind_label)
            .filter(|label| *label == wanted)
            .count()
    };

    assert_eq!(count_of("queue-operation"), 2);
    assert_eq!(count_of("user"), 2);
    assert_eq!(count_of("assistant"), 2);
}
/// `parse_line` yields `None` both for text that is not JSON and for a JSON
/// object (`{}`) that carries no message fields.
#[test]
fn parse_line_returns_none_for_invalid_json() {
    for line in ["not valid json", "{}"] {
        assert!(dirigent_anth::parse_line(line, 1).is_none());
    }
}
/// Three streamed versions of one assistant message collapse into the final
/// version, shrinking the fixture from six raw messages to four.
#[test]
fn dedup_streaming_session() {
    let fixture = Utf8Path::new("tests/fixtures/streaming_dedup.jsonl");
    let raw = parse_session(fixture).unwrap();
    // The raw file holds 6 lines, 3 of them versions of the same assistant message.
    assert_eq!(raw.len(), 6, "Raw messages: expected 6, got {}", raw.len());

    let deduped = dedup_messages(raw);
    // Remaining sequence: U1, A1 (final), U2, A2.
    assert_eq!(deduped.len(), 4, "Deduped messages: expected 4, got {}", deduped.len());

    // The surviving first assistant message must be the completed one.
    let assistant = deduped
        .iter()
        .find_map(|message| match message {
            RawMessage::Assistant(inner) => Some(inner),
            _ => None,
        })
        .unwrap();

    let stop_reason = &assistant.message.stop_reason;
    assert!(stop_reason.is_some(), "Deduped assistant should have stop_reason set");
    assert_eq!(stop_reason.as_deref(), Some("tool_use"));
    assert_eq!(
        assistant.message.content.len(),
        2,
        "Final version should have 2 content blocks"
    );
}
/// Deduplication leaves the message count unchanged when the session
/// contains no streamed duplicates.
#[test]
fn dedup_preserves_non_streamed_messages() {
    let raw = parse_session(Utf8Path::new("tests/fixtures/minimal_session.jsonl")).unwrap();
    let expected_len = raw.len();

    // minimal_session has no streaming, so nothing should be removed.
    assert_eq!(dedup_messages(raw).len(), expected_len);
}
/// Every tool call in the fixture is paired with the result that carries the
/// same tool-use id, including the two calls issued in parallel.
#[test]
fn correlate_parallel_tools() {
    let fixture = Utf8Path::new("tests/fixtures/tool_correlation.jsonl");
    let messages = dirigent_anth::parse_session_deduped(fixture).unwrap();
    let exchanges = correlate_tools(&messages);

    // Two parallel calls (Bash + Read) followed by one sequential call (Write).
    assert_eq!(exchanges.len(), 3);

    // No call may be left without a result.
    assert!(exchanges.iter().all(|exchange| exchange.result.is_some()));

    // Each result must belong to the call it was paired with.
    for exchange in &exchanges {
        let result = exchange.result.as_ref().unwrap();
        assert_eq!(exchange.call.id, result.tool_use_id);
    }
}
/// A conversation without any tool call or tool result produces no exchanges.
#[test]
fn correlate_no_tools_returns_empty() {
    // A single plain-text user message.
    let plain_user: RawMessage = serde_json::from_str(
        r#"{"type":"user","uuid":"x","timestamp":"2026-01-01T00:00:00Z","sessionId":"s","message":{"role":"user","content":"hello"}}"#,
    )
    .unwrap();
    let messages = vec![plain_user];

    assert!(correlate_tools(&messages).is_empty());
}
/// The branching fixture builds a single-root tree with one branch point, and
/// the main thread follows the first branch.
#[test]
fn build_branching_tree() {
    let fixture = Utf8Path::new("tests/fixtures/branching_tree.jsonl");
    let messages = dirigent_anth::parse_session(fixture).unwrap();
    let tree = ConversationTree::build(&messages);

    assert_eq!(tree.roots.len(), 1);
    assert!(!tree.is_linear());

    // A1 is the only node with two children.
    let branch_points = tree.branch_points();
    assert_eq!(branch_points.len(), 1);

    // Main thread: R → A1 → U2 → A3 (first branch).
    let main_thread = tree.main_thread();
    assert_eq!(main_thread.len(), 4);
}
/// A session without branches is reported as linear by the tree.
#[test]
fn linear_conversation_is_linear() {
    let messages =
        dirigent_anth::parse_session(Utf8Path::new("tests/fixtures/minimal_session.jsonl"))
            .unwrap();
    assert!(ConversationTree::build(&messages).is_linear());
}
/// Each line of the noise fixture is classified as the expected noise kind;
/// the two ordinary messages at the end are not noise.
#[test]
fn classify_noise_from_fixture() {
    let fixture = Utf8Path::new("tests/fixtures/noise_patterns.jsonl");
    let messages = dirigent_anth::parse_session(fixture).unwrap();
    assert_eq!(messages.len(), 9, "Expected 9 messages in noise fixture");

    // Expected classification, in fixture line order.
    let expected = [
        Some(NoiseKind::QueueOp),
        Some(NoiseKind::Meta),
        Some(NoiseKind::Warmup),
        Some(NoiseKind::Interrupted),
        Some(NoiseKind::Continuation),
        Some(NoiseKind::ApiError),
        Some(NoiseKind::SystemCaveat),
        None, // normal user
        None, // normal assistant
    ];

    let actual: Vec<Option<NoiseKind>> = messages.iter().map(classify_noise).collect();
    assert_eq!(actual, expected);
}
/// Loading the fixture's artifacts directory yields exactly one subagent with
/// the expected id, agent type and message count.
#[test]
fn load_subagent_from_fixture() {
    let subagents =
        dirigent_anth::load_subagents(Utf8Path::new("tests/fixtures/subagent/parent")).unwrap();
    assert_eq!(subagents.len(), 1);

    let only = &subagents[0];
    assert_eq!(only.agent_id, "abc123");
    assert_eq!(only.meta.agent_type.as_deref(), Some("Explore"));
    assert_eq!(only.messages.len(), 2);
}
/// A missing artifacts directory is not an error: the loader returns an
/// empty list.
#[test]
fn load_subagents_empty_dir() {
    let missing_dir = Utf8Path::new("tests/fixtures/nonexistent");
    let loaded = dirigent_anth::load_subagents(missing_dir).unwrap();
    assert!(loaded.is_empty());
}
/// Loading a session that has an artifacts directory populates messages,
/// subagents, the conversation tree and the tool exchanges.
#[test]
fn load_full_session_with_subagents() {
    use dirigent_anth::types::SessionRef;

    let jsonl_path = Utf8PathBuf::from("tests/fixtures/subagent/parent.jsonl");
    let artifacts_dir = Some(Utf8PathBuf::from("tests/fixtures/subagent/parent"));
    let session_ref = SessionRef {
        id: String::from("parent"),
        jsonl_path,
        artifacts_dir,
        index_entry: None,
    };

    let session = dirigent_anth::load_session(&session_ref).unwrap();
    assert!(!session.messages.is_empty());
    assert!(!session.subagents.is_empty());
    assert!(!session.tree.roots.is_empty());
    assert!(!session.tool_exchanges.is_empty());
}
/// A session reference without an artifacts directory still loads; it has no
/// subagents and, for the minimal fixture, a linear tree.
#[test]
fn load_session_without_artifacts() {
    use dirigent_anth::types::SessionRef;

    let session_ref = SessionRef {
        id: String::from("minimal"),
        jsonl_path: Utf8PathBuf::from("tests/fixtures/minimal_session.jsonl"),
        artifacts_dir: None,
        index_entry: None,
    };

    let session = dirigent_anth::load_session(&session_ref).unwrap();
    // 2 queue-ops + 2 users + 2 assistants
    assert_eq!(session.messages.len(), 6);
    assert!(session.subagents.is_empty());
    assert!(session.tree.is_linear());
}
/// `Content` deserializes from a bare JSON string, from an array of blocks,
/// and from an empty array.
#[test]
fn content_as_string_or_blocks() {
    use dirigent_anth::types::Content;

    // A bare string becomes the text variant.
    let from_string: Content = serde_json::from_str(r#""hello""#).unwrap();
    assert!(matches!(from_string, Content::Text(_)));

    // An array of blocks becomes the blocks variant.
    let from_blocks: Content = serde_json::from_str(r#"[{"type":"text","text":"hi"}]"#).unwrap();
    assert!(matches!(from_blocks, Content::Blocks(_)));

    // An empty array is still the blocks variant, just with nothing in it.
    let from_empty: Content = serde_json::from_str(r#"[]"#).unwrap();
    assert!(matches!(&from_empty, Content::Blocks(blocks) if blocks.is_empty()));
}
/// An assistant message carrying nothing but `type` and `message.content`
/// still deserializes.
#[test]
fn missing_optional_fields_dont_crash() {
    // No uuid, timestamp, sessionId, role, model or usage is supplied.
    let json = r#"{
"type": "assistant",
"message": {
"content": [{"type": "text", "text": "hi"}]
}
}"#;
    let parsed: RawMessage = serde_json::from_str(json).unwrap();
    assert!(matches!(parsed, RawMessage::Assistant(_)));
}
/// A `tool_result` block parses both with string content and with the
/// `content` / `is_error` fields absent (the latter then reads as `false`).
#[test]
fn tool_result_content_string_and_blocks() {
    // String content with an explicit `is_error`.
    let with_content: ContentBlock = serde_json::from_str(
        r#"{"type":"tool_result","tool_use_id":"t1","content":"output text","is_error":false}"#,
    )
    .unwrap();
    let ContentBlock::ToolResult { content, is_error, .. } = with_content else {
        panic!("Expected ToolResult");
    };
    assert!(!is_error);
    assert!(content.is_some());

    // Neither `content` nor `is_error` present.
    let without_content: ContentBlock =
        serde_json::from_str(r#"{"type":"tool_result","tool_use_id":"t2"}"#).unwrap();
    let ContentBlock::ToolResult { content, is_error, .. } = without_content else {
        panic!("Expected ToolResult");
    };
    assert!(!is_error);
    assert!(content.is_none());
}
/// Top-level fields that the message types do not declare are ignored during
/// deserialization instead of causing an error.
#[test]
fn extra_unknown_fields_are_ignored() {
    // `unknownField` and `anotherExtra` have no counterpart in the structs.
    let json = r#"{
"type": "user",
"uuid": "x",
"timestamp": "2026-01-01T00:00:00Z",
"sessionId": "s",
"unknownField": "should be ignored",
"anotherExtra": 42,
"message": {"role": "user", "content": "hello"}
}"#;
    let parsed: RawMessage = serde_json::from_str(json).unwrap();
    assert!(matches!(parsed, RawMessage::User(_)));
}
/// `parse_timestamp` accepts an ISO 8601 string, Unix milliseconds and Unix
/// seconds through the crate's public `util` module.
#[test]
fn timestamp_parsing_all_formats() {
    // ISO 8601
    let from_iso = parse_timestamp(&serde_json::json!("2026-03-22T17:00:13.192Z")).unwrap();
    assert_eq!(from_iso.year(), 2026);

    // Unix millis
    let from_millis = parse_timestamp(&serde_json::json!(1769461914249_i64)).unwrap();
    assert!(from_millis.year() >= 2025);

    // Unix seconds
    let from_seconds = parse_timestamp(&serde_json::json!(1769461914_i64)).unwrap();
    assert!(from_seconds.year() >= 2025);
}
-101
View File
@@ -1,101 +0,0 @@
use dirigent_anth::anth_usage::process_usage_screen;
// Captured text of a usage screen, used as input by every test in this file.
// It contains a "Session" summary, three usage gauges (current session,
// current week for all models, current week Sonnet only), each with a
// "Resets" line, and a contributions section listing two usage factors and
// two subagents.
// NOTE(review): the literal begins with a newline before the horizontal rule,
// while `raw_screen_starts_with_rule` expects the rule first — presumably
// `process_usage_screen` trims leading whitespace; confirm.
const SAMPLE: &str = r#"
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Status Config Usage Stats
Session
Total cost: $0.0000
Total duration (API): 0s
Total duration (wall): 4s
Total code changes: 0 lines added, 0 lines removed
Usage: 0 input, 0 output, 0 cache read, 0 cache write
Current session
███████ 14% used
Resets 12:30pm (Europe/Vienna)
Current week (all models)
██████ 12% used
Resets May 12, 9am (Europe/Vienna)
Current week (Sonnet only)
0% used
Resets May 12, 9am (Europe/Vienna)
What's contributing to your limits usage?
Approximate, based on local sessions on this machine — does not include other devices or claude.ai
Last 24h · these are independent characteristics of your usage, not a breakdown
97% of your usage came from subagent-heavy sessions
Each subagent runs its own requests. Be deliberate about spawning them — and
consider configuring a cheaper model for simpler subagents.
16% of your usage was at >150k context
Longer sessions are more expensive even when cached. /compact mid-task, /clear
when switches to new tasks.
Subagents % of usage
Explore 3%
claude-code-guide 2%
d to day · w to week
Esc to cancel
"#;
/// The three gauges on the sample screen are extracted with their names,
/// percentages, raw reset text and a derived `resets_iso` value.
#[test]
fn parses_gauges() {
    let result = process_usage_screen(SAMPLE);
    let gauges = &result.data.gauges;
    assert_eq!(gauges.len(), 3);

    let session = &gauges[0];
    assert_eq!(session.name, "Current session");
    assert_eq!(session.percent_used, 14);
    assert_eq!(session.resets.as_deref(), Some("12:30pm (Europe/Vienna)"));

    let week_all = &gauges[1];
    assert_eq!(week_all.name, "Current week (all models)");
    assert_eq!(week_all.percent_used, 12);
    assert_eq!(
        week_all.resets.as_deref(),
        Some("May 12, 9am (Europe/Vienna)")
    );

    let week_sonnet = &gauges[2];
    assert_eq!(week_sonnet.name, "Current week (Sonnet only)");
    assert_eq!(week_sonnet.percent_used, 0);

    // resets_iso should be present for all gauges with reset info
    assert!(session.resets_iso.is_some());
    assert!(week_all.resets_iso.is_some());
    assert!(week_sonnet.resets_iso.is_some());

    // The weekly reset must land on May 12. The second check alone decides
    // the outcome, since any value starting with "2026-05-12" contains "05-12".
    let week_iso = week_all.resets_iso.as_ref().unwrap();
    assert!(week_iso.starts_with("2026-05-12") || week_iso.contains("05-12"));
}
/// The contributions section yields two usage factors and two subagent rows
/// with the percentages shown on the sample screen.
#[test]
fn parses_contributions() {
    let result = process_usage_screen(SAMPLE);
    let contributions = result.data.contributions.as_ref().unwrap();

    let factors = &contributions.factors;
    assert_eq!(factors.len(), 2);
    assert_eq!(factors[0].percent, 97);
    assert!(factors[0].description.contains("subagent-heavy"));
    assert_eq!(factors[1].percent, 16);

    let subagents = &contributions.subagents;
    assert_eq!(subagents.len(), 2);
    assert_eq!(subagents[0].name, "Explore");
    assert_eq!(subagents[0].percent, 3);
    assert_eq!(subagents[1].name, "claude-code-guide");
    assert_eq!(subagents[1].percent, 2);
}
/// The `raw_screen` returned for the sample begins with the horizontal rule
/// character.
#[test]
fn raw_screen_starts_with_rule() {
    let raw_screen = process_usage_screen(SAMPLE).raw_screen;
    assert!(raw_screen.starts_with('─'));
}
-761
View File
@@ -1,761 +0,0 @@
# Package: dirigent_archivist
Persistent storage for all agentic interactions in Dirigent.
## Quick Facts
- **Type**: Library
- **Main Entry**: src/lib.rs
- **Dependencies**: dirigent_protocol, uuid, chrono, serde, tokio, tracing, thiserror, sha2, hex, async-trait
- **Status**: Complete - Production ready with comprehensive tests
## Purpose
The Archivist provides file-based archival storage for all session data, messages, and attachments in Dirigent. It implements an archive-first architecture with connector API fallback, using NDJSON, JSON, and TSV formats for durability and human-readability.
## Key Features
- **File-based Storage**: NDJSON for messages, JSON for metadata, TSV for indices
- **Content-Addressable Files**: SHA-256 based storage for attachments with automatic deduplication
- **Session Lineage**: Track splits, continuations, and mutations with parent references
- **Connector Registry**: Coordinate UID assignment across connectors with collision detection
- **Event Streaming**: Real-time updates via EventHandler subscribing to dirigent_protocol events
- **Archive-First Design**: Read from archive first, fall back to connector API when needed
- **Caching**: In-memory caching of connector and session mappings for performance
## Architecture
The Archivist is built on three core architectural principles:
### 1. Archive-First Read Strategy
The Archivist is the primary source of truth for historical data:
- UI and APIs query the archive first
- Only fall back to connector APIs if data is not in archive
- This enables offline access and consistent history across restarts
### 2. Write-Through Event Capture (Append-Only)
The EventHandler subscribes to the global event stream from dirigent_core:
- Captures session creation, message streaming, and tool calls in real-time
- Uses MessageAccumulator to assemble streaming chunks into complete messages
- Writes complete messages to archive immediately upon finalization
- No polling required - fully event-driven
- **Append-only writes**: Messages are appended as events arrive, NOT in chronological order
- File order reflects event timing, not message timestamps
### 3. File-Based Storage with Sort-on-Read
All data is stored in human-readable, grep-able formats:
- **NDJSON** (Newline-Delimited JSON): Incremental append-only logs for messages and mappings
- **JSON**: Structured metadata for sessions and connectors
- **TSV** (Tab-Separated Values): Fast indices for cross-references
- **Content-Addressed Files**: Binary attachments stored by SHA-256 hash for deduplication
- **Sort-on-Read**: `get_messages()` sorts by timestamp and message_id to ensure chronological order despite append-only writes
## Backend Trait Layer (Phase 2)
The archivist uses a trait-based backend abstraction. `ArchiveBackend`
defines the mandatory session and message primitives every backend must
provide, plus `as_xxx()` accessors returning optional sub-traits:
- `SearchBackend` — reserved for Phase 3+ indexed backends (not wired)
- `DagBackend` — session lineage DAG edges
- `MetaEventsBackend` — ACP connection lifecycle events
- `ConnectorRegistryBackend` — per-archive connector metadata
- `SessionMappingBackend` — native↔scroll session ID mapping
`JsonlBackend` is the Phase 2 concrete implementation (file-based
NDJSON/JSON/TSV) and opts into every sub-trait except `SearchBackend`
(content search continues to be served by ripgrep via
`crates/api/src/archivist/search_task.rs`).
The `Archivist` struct (in `src/coordinator/`) owns a registry of backends
keyed by archive name and performs orchestration (alias detection, session
lineage, move/copy, DAG walks, archive lifecycle). Consumers hold
`Arc<Archivist>` directly — the coordinator is concrete, not a trait.
See `docs/plans/2026-04-18-archivist-phase2-design.md` for design rationale.
## Multi-Backend Registry (Phase 3)
The coordinator (`Archivist`) holds `Vec<Arc<ArchiveRegistration>>` sorted
by `read_priority` instead of a flat `HashMap<name, Arc<dyn ArchiveBackend>>`.
Each registration carries:
- `backend: Arc<dyn ArchiveBackend>` + its declared capabilities
- `failure_mode`: `Required` (must succeed) | `BestEffort` (errors log + drift health)
- `read_priority`: lower = tried first for reads; also selects the default
write target when no archive is named
- `write_active`: participates in fanout writes
- `enabled`: kill-switch without removing config
- `write_policy`: `Inline` (default; `await` per call) or `Queued`
(mpsc + batch_window + overflow policy)
- Runtime state: `last_health`, `last_error`, `consecutive_failures`
(all `Arc<RwLock<_>>`, shared with the writer task when queued)
- Optional `writer: Option<WriterHandle>` (Some iff `write_policy = Queued`)
Backends are declared in `dirigent.toml` under `[[archives]]` and
constructed at boot via `Archivist::from_config(cfg, &BackendRegistry)`.
Add a new backend type by implementing `BackendFactory` and registering
it on the `BackendRegistry` before `from_config`.
### Reads
`get_session`, `get_messages_paged`, `count_messages`, `get_meta_events`,
`get_children`, etc. walk the registry in priority order via
`read_walk_per_session(scroll_id, predicate, op)`. The predicate
capability-filters; `Unavailable` backends are skipped. The first backend
that returns `Some(value)` wins and its name is cached against the
`scroll_id` in a positive LRU (capacity 10_000). Subsequent reads for the
same `scroll_id` short-circuit to the cached backend before falling back
to the full priority walk.
Collection-shape reads (`list_sessions_paged`, `list_connectors`,
`list_meta_sessions`, `find_meta_session_by_client`) use
`read_walk_collection` — first enabled backend that can answer wins, no
cache, no aggregation across backends. Phase 3 explicitly defers
cross-backend merge/dedup to a later phase.
### Writes
Mutating methods (`append_messages`, `register_session`, `update_session_*`,
`append_meta_events`, `append_dag_edge`, `clear_session_messages`,
`update_connector_fingerprint`) resolve a primary (per-call `archive:
Some(name)` override or the default-write target) and fan out to every
other `enabled && write_active` backend that has the required capability.
Capability-mismatched backends are skipped with a debug `capability_skip`
log (never an error). `Required` failures propagate to the caller;
`BestEffort` failures log + drift health.
`register_connector` currently does NOT fan out — alias detection + the
tri-state `Accepted`/`Aliased`/`Rejected` return shape make replication
non-trivial. Fanout for connectors is deferred; single-backend setups are
unaffected.
For `write_policy = Queued` backends, the primary/secondary write paths
enqueue a `WriteOp` into the backend's writer task instead of awaiting.
Errors drift the backend's health but do not propagate to the caller.
Coalescing merges consecutive `AppendMessages`/`AppendMetaEvents` for the
same `scroll_id` within `batch_window_ms`.
### Cross-backend operations
- `delete_session(scroll_id, _)` fans out to every enabled backend that has
the session. Copies in `write_active=false` backends produce
`ArchivistError::DeleteOnReadOnlyBackend` (write-active copies are still
deleted); cache invalidated regardless of outcome.
- `copy_session(scroll_id, from, to)` reads from `from`, writes to `to`,
including DAG and meta-events when both sides have the capability. The
source remains canonical (the cache is NOT rewritten).
- `move_session(scroll_id, from, to)` is `copy + delete-from-source`. If
the source-side delete fails after the copy succeeded,
`ArchivistError::PartialMove { copied_to, delete_error }` is returned so
the caller knows the session now lives in both places.
The Phase 2 connector-aware `move_session(scroll_id, target_connector_uid, _)`
and `copy_session(scroll_id, target_connector_uid, _)` were renamed in Phase 3
to `move_session_to_connector` / `copy_session_to_connector`.
Their bulk variant is `move_sessions_to_connector`.
### Health
`HealthStatus` drifts on every coordinator call that observes a backend:
- Successful write → `Healthy`; `consecutive_failures` reset to 0.
- Successful read → `Healthy` (only rescues `Degraded`; does not reset the counter).
- Write failure → `Degraded { reason }`; `consecutive_failures += 1`; after
K = 5 consecutive failures drifts to `Unavailable { reason }`. Reads skip
`Unavailable` backends; writes against an `Unavailable` `Required`
backend fail, while writes against an `Unavailable` `BestEffort` backend
are still attempted.
- Read failure alone never drifts past `Degraded`; writes are the
authoritative health signal.
`list_archives_with_health()` returns a `Vec<ArchiveStatus>` snapshot of
every registration: name, type, capabilities, health, last_error, and
queue_depth (for queued backends).
### Lifecycle
Phase 3 is **startup-only**. `add_archive` / `remove_archive` /
`set_default_archive` on the coordinator return
`ArchivistError::DynamicRegistryUnsupported`. To change the registry,
edit `dirigent.toml` and restart the server. `Archivist::shutdown()`
drains queued writer tasks (sends `WriteOp::Shutdown` on each writer's
mpsc and awaits ack); call it before process exit.
Test-only constructors `Archivist::from_registrations(regs)` and
`SessionMetadata::stub(scroll_id)` live under `#[cfg(any(test, feature =
"test-utils"))]` for integration tests that bypass the factory.
See `docs/plans/2026-04-19-archivist-phase3-design.md` for the full
design rationale, and `examples/multi_backend.rs` for a runnable
end-to-end example.
## Module Organization
### Core Modules
- **`lib.rs`**: Public API surface and re-exports
- **`types.rs`**: Core data structures (session metadata, message records, connector info, API types)
- **`error.rs`**: Error types and Result alias for archivist operations
### Backend Layer (`backend/`)
- **`traits.rs`**: `ArchiveBackend` trait + 5 optional sub-traits
- **`capability.rs`**: `ArchiveCapability` enum + `CapabilitySet` type
- **`health.rs`**: `HealthStatus` enum returned by `health_check`
- **`contract.rs`**: Reusable behavioral tests for any `&dyn ArchiveBackend` (cfg-gated)
- **`mock.rs`**: In-memory `MockBackend` for coordinator unit tests (cfg-gated)
### Concrete Backends (`backends/`)
- **`jsonl/`**: The file-based `JsonlBackend` — the only Phase 2 backend.
Reuses `storage/` primitives for NDJSON/JSON/TSV operations.
### Coordinator (`coordinator/`)
- **`mod.rs`**: The `Archivist` struct + constructors
- **`archives.rs`**: Archive lifecycle (add/remove/list/default)
- **`connectors.rs`**: Connector registration + alias detection
- **`sessions.rs`**: Session registration, metadata updates, move/copy
- **`meta.rs`**: Meta events, DAG walks, cleanup
### Storage Layer (`storage/`)
Low-level file I/O primitives used by `JsonlBackend`. All storage operations are async and use tokio.
- **`paths.rs`**: ArchivePaths utility for consistent directory structure and path resolution
- **`ndjson.rs`**: Newline-delimited JSON operations (read_ndjson, append_ndjson)
- **`json.rs`**: JSON operations (read_json, write_json)
- **`tsv.rs`**: Tab-separated value operations for connector index
- **`files.rs`**: Content-addressable file storage with SHA-256 hashing and deduplication
### Supporting Modules
- **`registry.rs`**: Archive registry persistence (multi-archive metadata)
- **`migration.rs`**: Single-archive → multi-archive migration path
- **`session.rs`**: Session lineage types shared across layers
- **`accumulator.rs`**: MessageAccumulator for assembling streaming message chunks
- **`backfill.rs`**: Backfill helpers for importing historical sessions
- **`import/`**: External conversation importers (e.g. Claude export)
### Events
- **`events.rs`**: EventHandler for subscribing to dirigent_protocol events and archiving them
## Configuration
The Archivist archive root is determined by `DirigentPaths` resolution:
- Set `DIRIGENT_DATA_DIR` to override the data directory; archives will be stored at `<data_dir>/archives/`
- Defaults to `~/.local/share/dirigent/archives/` (or platform equivalent)
```bash
DIRIGENT_DATA_DIR=/path/to/data dx serve
```
## Archive Structure
```
dirigent_archive/
├── .contexts/
│ └── {scroll_id:uuidv7}/ # One directory per session
│ ├── session.json # Session metadata
│ ├── messages.jsonl # Incremental message log (.ndjson also supported)
│ └── lineage.json # Session lineage info (optional)
├── .db/
│ └── connectors/
│ ├── index.tsv # Fast connector lookup (TSV)
│ └── {connector_uid}/
│ ├── connector.json # Connector metadata
│ └── sessions.jsonl # Session mappings (.ndjson also supported)
└── .files/
└── {sha256-hash} # Content-addressable file storage
```
### Why Hidden Directories?
The `.contexts`, `.db`, and `.files` directories are hidden (prefixed with `.`) to keep the archive root clean for future rendered outputs (like `chat.md` exports). This is similar to how `.git` hides implementation details in a codebase.
## File Formats
### Session Metadata (`session.json`)
```json
{
"version": 1,
"scroll_id": "01936e8f-e5a7-7000-8000-000000000001",
"created_at": "2025-01-01T12:00:00Z",
"updated_at": "2025-01-01T12:30:00Z",
"title": "Implement user authentication",
"connector_uid": "01936e8f-e5a7-7000-8000-000000000002",
"native_session_id": "abc123",
"agent_id": null,
"parent_scroll_id": null,
"continuation": null,
"tags": ["backend", "auth"],
"metadata": {
"source": "OpenCode",
"model": "claude-3-5-sonnet"
}
}
```
### Messages Log (`messages.jsonl`)
One JSON object per line, **append-only**:
```jsonl
{"version":1,"message_id":"01936e8f-e5a7-7000-8000-000000000003","session":"01936e8f-e5a7-7000-8000-000000000001","parent_id":null,"ts":"2025-01-01T12:01:00Z","role":"user","author":"alice","content_md":"How do I implement JWT auth?","attachments":[],"metadata":{}}
{"version":1,"message_id":"01936e8f-e5a7-7000-8000-000000000004","session":"01936e8f-e5a7-7000-8000-000000000001","parent_id":"01936e8f-e5a7-7000-8000-000000000003","ts":"2025-01-01T12:01:10Z","role":"assistant","author":"claude","content_md":"Here's how to implement JWT authentication...","attachments":[],"metadata":{"model":"claude-3-5-sonnet"}}
```
**IMPORTANT - Ordering**: The order of lines in the message log file (`messages.jsonl` or `messages.ndjson`) reflects **event arrival order**, NOT chronological order. Assistant replies often arrive after subsequent user messages due to streaming latency, resulting in non-chronological file order. Always use the `Archivist::get_messages()` API to retrieve messages, which sorts by `ts` (timestamp) and `message_id` (UUIDv7) to guarantee chronological order.
**File Format Compatibility**: The archivist supports both `.ndjson` and `.jsonl` file extensions for newline-delimited JSON files. When reading, `.jsonl` is preferred if present, with automatic fallback to `.ndjson` for backward compatibility. Write operations use `.jsonl` (canonical format). Both formats are identical in content - the difference is purely the file extension.
### Connector Index (`index.tsv`)
Tab-separated values with header row:
```tsv
connector_uid type title client_native_id alias_of created_at
01936e8f-e5a7-7000-8000-000000000002 OpenCode OpenCode Local opencode@http://localhost:12225 2025-01-01T12:00:00Z
```
### Session Mappings (`sessions.jsonl`)
Maps native session IDs from connectors to scroll IDs in the archive:
```jsonl
{"version":1,"connector_uid":"01936e8f-e5a7-7000-8000-000000000002","native_session_id":"abc123","scroll_id":"01936e8f-e5a7-7000-8000-000000000001","created_at":"2025-01-01T12:00:00Z","alias_of":null}
```
## Message Ordering Guarantees
### The Problem: Append Order ≠ Chronological Order
In the event-driven architecture, messages are written to the message log file (`messages.jsonl`) as completion events arrive. Due to streaming latency:
- User messages complete nearly instantly and are written immediately
- Assistant messages stream over time and complete later
- A second user message can be written before the first assistant reply completes
Example scenario:
```
T0: User sends "tell me a joke about snakes" (ts=18:23:36.947)
T1: Assistant starts streaming reply (ts=18:23:36.969)
T2: User sends "now one about tigers" (ts=18:23:49.429) <- completes and writes BEFORE assistant finishes
T3: Assistant finishes "snakes" reply <- writes AFTER "tigers" user message
```
File order in the message log file:
```
1. user "snakes" (18:23:36.947)
2. user "tigers" (18:23:49.429) <- written second
3. assistant "snakes" (18:23:36.969) <- written third, but timestamp is earlier!
```
### The Solution: Sort-on-Read
The `Archivist::get_messages()` implementation sorts messages before returning:
1. **Primary sort**: `ts` (timestamp) ascending
2. **Secondary sort**: `message_id` (UUIDv7) ascending for stable tie-breaking
This guarantees chronological order regardless of NDJSON append order:
```
1. user "snakes" (18:23:36.947)
2. assistant "snakes" (18:23:36.969)
3. user "tigers" (18:23:49.429)
```
### Why This Approach?
- **Maintains durability**: Append-only writes preserve crash safety
- **No migration needed**: Existing archives work without rewrites
- **Simple implementation**: No buffered writes or complex write-time ordering
- **Performance trade-off**: Small CPU cost on read (sorting) vs. complex write-time coordination
### Consumer Guidance
- **DO**: Use `Archivist::get_messages()` to retrieve messages
- **DON'T**: Read the message log file directly and assume file order = chronological order
- **UI/API**: Always sort by `ts` then `message_id` for defense in depth
- **Tie-breaking**: Use `message_id` (UUIDv7) as secondary sort for stable ordering when timestamps match
## Key Types
### SessionMetadata
Stores all metadata about a session including:
- **scroll_id**: UUIDv7 identifier for the session
- **connector_uid**: Which connector owns this session
- **native_session_id**: Original session ID from the connector (optional)
- **title**: Optional human-readable session title (see Title Management below)
- **parent_scroll_id**: For session lineage (splits, continuations)
- **continuation**: Type of continuation (SPLIT, COMPACT, REFERENCE, EDIT)
- **tags**: User-defined categorization
- **metadata**: Free-form JSON for connector-specific fields
#### Title Management
Session titles are fully supported and persist across restarts. Titles are stored in the `SessionMetadata` struct and saved to the `session.json` file.
**Setting Titles:**
```rust
// Update title for an existing session
archivist.update_session_metadata(
scroll_id,
Some("My Custom Session Title".to_string()),
None, // model
None // archive
).await?;
```
**Default Behavior:**
- New sessions can specify an initial title during registration
- If no title is provided, sessions default to `None`
- The UI typically displays "Untitled" for sessions without titles
**Title Loading:**
- Titles are automatically loaded when retrieving session metadata via `get_session_metadata()`
- Session lists include titles via `list_sessions()` and `list_sessions_all()`
- Titles are part of the `SessionMetadata` struct returned by all session queries
**UI Integration:**
- The web UI displays session titles in the session list and sidebar
- Users can rename sessions via the "Rename" button in the session list view
- Renaming calls `api::archivist::rename_session()` which uses `update_session_metadata()`
- Title changes are persisted immediately and survive application restarts
### MessageRecord
Represents a single message in the archive:
- **message_id**: UUIDv7 identifier
- **session**: scroll_id this message belongs to
- **role**: "user", "assistant", or "system"
- **content_md**: Message content in Markdown format
- **attachments**: References to attached files
- **metadata**: Free-form JSON for connector-specific fields
### ConnectorRecord
Metadata about a connector:
- **connector_uid**: UUIDv7 identifier
- **type**: "OpenCode", "ACP", or custom
- **client_native_id**: Unique identifier from client (e.g., "opencode@http://localhost:12225")
- **alias_of**: If this connector is an alias of another (for deduplication)
## Archivist Public API
The `Archivist` struct (in `coordinator/`) is the main public entry point
for archival operations. Consumers hold `Arc<Archivist>` and call inherent
methods — there is no `Archivist` trait anymore. The coordinator resolves
the target backend per call (via `archive: Option<String>`) and delegates
to `ArchiveBackend` methods.
Key method families (see `coordinator/*.rs` for full signatures):
- **Archive lifecycle** (`archives.rs`): `add_archive`, `remove_archive`,
`list_archives`, `set_default_archive`
- **Connectors** (`connectors.rs`): `register_connector` with tri-state
result (Accepted / Aliased / Rejected), `list_connectors`
- **Sessions** (`sessions.rs`): `register_session`, `get_session_metadata`,
`update_session_metadata`, `list_sessions_paged`, `move_session`,
`copy_session`, `resolve_session`
- **Messages**: `append_messages`, `get_messages` (sorts by `ts` then
`message_id` for stable chronological order)
- **Meta / DAG** (`meta.rs`): meta-event recording, session lineage DAG
walks, cleanup routines
## List Filter vs. Full-Text Search
Two distinct query paths exist — do not conflate them.
**List filter** — `Archivist::list_sessions_paged(SessionListQuery)` returns a
cursor-paged list of sessions, AND-filtered by `title_query` (substring on
title), `tags`, `model_filter` (substring on `metadata.model`), `project_id`,
`connector_uid`, and `include_hidden`. This is the right tool for "narrow the
list of visible sessions."
**Full-text search** — `api::search_sessions` (in the `api` package, backed by
`api::archivist::search_task::SearchTask`) spawns `rg --json` over the
archive's `.contexts/` tree to find messages containing text. It streams
`SearchExcerpt`s with parsed NDJSON content and supports cancellation via
`CancellationToken`. This is the right tool for "find messages containing
text."
**Do not extend `list_sessions_paged` to do content search.** Content search
belongs in the ripgrep pipeline. Future improvements to content search
(indexed backends, relevance scoring) are Phase 2d / Phase 3 concerns.
## JsonlBackend Implementation
The Phase 2 production backend — an implementation of `ArchiveBackend` plus
every sub-trait except `SearchBackend`:
- **Thread-safe**: Uses RwLock for in-memory caches
- **Async**: All operations use tokio for non-blocking I/O
- **Caching**: In-memory caches for connector and session mappings
- **Collision Detection**: Tri-state registration for connectors and sessions
Located under `src/backends/jsonl/` and split by concern (`backend.rs`,
`connectors.rs`, `dag.rs`, `mapping.rs`, `meta.rs`).
### Caching Strategy
`JsonlBackend` maintains two in-memory caches:
1. **connector_cache**: HashMap<Uuid, ConnectorRecord>
- Populated on registration
- Read from TSV index on startup (future enhancement)
2. **session_cache**: HashMap<(Uuid, String), Uuid>
- Maps (connector_uid, native_session_id) to scroll_id
- Populated on registration and session resolution
- Enables fast session lookups without disk I/O
## Event Handling
The EventHandler subscribes to dirigent_protocol events and archives them in real-time:
```rust
// Create archivist and event handler
let archivist = Archivist::new_with_single_archive(archive_path).await?;
let handler = EventHandler::new(Arc::new(archivist));
// Subscribe to event stream from dirigent_core
let events = event_stream.subscribe();
// Run event loop (blocking)
handler.run(events).await;
```
### Supported Events
- **SessionCreated**: Registers new sessions with the archivist
- **MessageCompleted**: Writes finalized messages to the archive
- **SessionUpdate**: Accumulates streaming message chunks
- AgentMessageChunk
- UserMessageChunk
- AgentThoughtChunk
- ToolCall
### MessageAccumulator
Assembles streaming message chunks into complete messages:
- Accumulates text chunks by message_id
- Tracks thinking blocks separately
- Stores tool calls with input/output
- Finalizes messages on MessageCompleted event
- Converts to MessageRecord for archival
## Integration with dirigent_core
The Archivist integrates with dirigent_core via the global event stream:
1. **CoreRuntime** emits events for all connector operations
2. **EventHandler** subscribes to event stream
3. **MessageAccumulator** assembles streaming chunks
4. **Archivist** writes complete messages to archive
This enables:
- Automatic archival of all sessions and messages
- No polling required - fully event-driven
- Consistent history across restarts
- Offline access to historical data
## Testing
The package has comprehensive test coverage across multiple dimensions:
### Unit Tests
Located in each module (`src/*.rs`, `src/storage/*.rs`):
- Type serialization/deserialization
- UUIDv7 generation and ordering
- Timestamp formatting (RFC 3339)
- Storage operations (NDJSON, JSON, TSV, files)
- Connector registration tri-state logic
- Session registration and alias detection
### Integration Tests
Located in `tests/`:
- `integration_tests.rs`: Full `Archivist` + `JsonlBackend` lifecycle, event
handler integration, multi-connector scenarios, session lineage, message
accumulation
- `list_sessions_paged_test.rs`, `pagination_test.rs`: List filter + cursor
pagination coverage
- `import_claude_idempotency_test.rs`: Claude export re-import idempotency
### Backend Contract Tests
`src/backend/contract.rs` holds reusable async assertions that any
`&dyn ArchiveBackend` must pass. `JsonlBackend` and `MockBackend` both
run the contract suite; new backends added in Phase 3+ should do the same.
### Examples
Located in `examples/`:
- `basic_usage.rs`: Core archivist operations
- `event_handling.rs`: EventHandler and MessageAccumulator
- `file_storage.rs`: Content-addressable file storage
Run tests:
```bash
cargo test --package dirigent_archivist
```
Run examples:
```bash
cargo run --package dirigent_archivist --example basic_usage
cargo run --package dirigent_archivist --example event_handling
cargo run --package dirigent_archivist --example file_storage
```
## Performance Characteristics
- **Append Operations**: O(1) with sequential file writes
- **Session Lookup**: O(1) with in-memory cache, O(n) on a cache miss
- **Message Retrieval**: O(n) where n = number of messages (NDJSON parsing)
- **File Storage**: O(1) content-addressable lookup with SHA-256 hashing
- **Connector Index**: O(n) TSV scan, suitable for hundreds of connectors
### Scalability Considerations
- **Large Sessions**: NDJSON is append-only, so reading large sessions requires parsing all lines
- **Many Sessions**: TSV indices are suitable for thousands of sessions per connector
- **File Deduplication**: SHA-256 hashing provides automatic deduplication across sessions
- **Concurrent Access**: RwLock allows multiple concurrent readers, single writer
## Error Handling
The Archivist uses thiserror for rich error types:
```rust
pub enum ArchivistError {
IoError(std::io::Error),
SerdeError(serde_json::Error),
SessionUnknown(Uuid),
CollisionInconsistent(Uuid),
// ... etc
}
```
All public APIs return `Result<T, ArchivistError>` for explicit error handling.
## Development Notes
- All storage operations are async (using tokio)
- Content-addressable storage uses SHA-256 hashes (hex-encoded)
- Archive directory structure mirrors session/message hierarchy
- UUIDv7 provides time-ordered, sortable identifiers
- RFC 3339 UTC timestamps for all time-based fields
- Schema versioning via `version` field in all records
## Related Packages
- **dirigent_protocol**: Shared types and protocol definitions (dependency)
- **dirigent_core**: Runtime integration for SSE event capture (integration point)
- **api**: Server functions for archive queries (future)
- **web**: UI for archive browsing and search (future)
## Phase 4: `ArchiveFilter` (2026-04-21)
Every `ArchiveRegistration` carries a `filter: ArchiveFilter`. The filter
describes which sessions/writes the backend wants to receive. Fields:
- `include_connectors: Option<HashSet<Uuid>>` — if Some, only these
connector UIDs pass. `None` means no connector gate.
- `exclude_connectors: HashSet<Uuid>` — always rejected.
- `include_tags: HashSet<String>` — if non-empty, the session must carry
at least one matching tag.
- `exclude_tags: HashSet<String>` — any matching tag rejects.
- `include_hidden: bool` — default `true`. If `false`, sessions whose
metadata has `"hidden": true` are skipped.
### Primary-always-writes invariant
The per-call primary (either the `archive: Some(name)` argument or the
default write-target) is **never** filtered. If a caller explicitly asks
to write to archive X, the filter on X is not consulted. Filters only
gate secondary fanout.
### Boot validator
At boot (`coordinator/boot.rs`), the validator rejects configurations
where:
- No write-active + enabled registration has an **unrestricted** filter
(`ArchiveFilter::default()` is unrestricted). Prevents configurations
that silently drop all writes.
- An archive's filter has `include_connectors = Some(empty set)`,
which is equivalent to "reject everything" and is almost certainly a config
bug.
See `docs/plans/2026-04-21-archivist-phase4-design.md` §4 for the full
design rationale.
## Phase 5: Importers (2026-04-21)
The `import::` module centres on an `Importer` trait with per-source
implementations under `import::sources::*`. Each source produces a
`ParsedConversation` (ChatGPT) / `ParsedSession` (Codex) / session
directory walk (Claude) and feeds the results through the common
`import_sessions` orchestrator, which fires `ImportProgressEvent`s on a
bounded `ImportProgressSink`.
### `Importer` trait
Every importer declares a `config_shape()` so UIs can render a dynamic
form; a `discover()` that returns an `ImportDiscovery` preview; and an
`import()` that does the actual work. All three methods are async.
The trait lives in `import::trait_def`. Shape types (`ImportConfig`,
`ImportTarget`, `ConfigField`, `ConfigFieldKind`, `ImportError`) are
serialisable and safe to cross the WASM boundary.
### Registry
`ImporterRegistry::with_defaults()` registers every enabled
`importer-*` feature. Currently: `claude`, `chatgpt`, `codex`. The
registry is constructed at boot and stored on `AppState`.
### Progress sink
`ImportProgressSink::channel()` returns a bounded mpsc pair.
Non-terminal events use `try_send` (dropped on full); terminal events
use `send().await` so consumers always see the final state.
### Source crates
- `dirigent_chatgpt` — parses `conversations.json` from the OpenAI data
export.
- `dirigent_codex` — parses `*.jsonl` session files under
`~/.codex/sessions`.
Both crates hold pure parser types with zero dirigent-specific types.
See `docs/plans/2026-04-21-archivist-phase5-design.md`.
## Future Enhancements
- Indexed `SearchBackend` implementations (tantivy/sqlite) — currently
content search is ripgrep-based in the `api` package
- Session splitting and lineage management (mutations.ndjson)
- Knowledge overview generation (chat.md exports)
- Embedding storage and search (embeds/)
- Network RPC interface for remote archivist
- Compaction and pruning policies
- Additional concrete backends (e.g. SQLite, remote)
## Documentation
- **Package README**: `./README.md` - User-facing overview
- **Architecture Docs**: `../../docs/building/05_archivist/` - Design and planning
- **API Docs**: Run `cargo doc --package dirigent_archivist --open`
- **Examples**: See `examples/` directory for working code samples
-69
View File
@@ -1,69 +0,0 @@
[package]
name = "dirigent_archivist"
version = "0.1.0"
edition = "2021"
[lib]
path = "src/lib.rs"
[features]
# All built-in importers are enabled by default. To ship a slimmer build,
# disable default features and enable only the `importer-*` flags you need.
default = ["importer-claude", "importer-chatgpt", "importer-codex"]
# Exposes the sub-trait contract test harness (`backend::contract`) to
# downstream crates so new backends can reuse the same behavioral checks.
test-utils = []
# Per-source importer feature gates. Each flag guards the corresponding
# `ImporterRegistry::with_defaults` registration and (where relevant) the
# source module itself.
importer-claude = []
importer-chatgpt = ["dep:dirigent_chatgpt"]
importer-codex = ["dep:dirigent_codex"]
[dependencies]
# Core dependencies
dirigent_protocol = { path = "../dirigent_protocol" }
dirigent_anth = { path = "../dirigent_anth" }
dirigent_chatgpt = { path = "../dirigent_chatgpt", optional = true }
dirigent_codex = { path = "../dirigent_codex", optional = true }
camino = "1.1"
# UUID support with v7 and serde
uuid = { version = "1.11", features = ["v5", "v7", "serde"] }
# Date/time handling
chrono = { version = "0.4", features = ["serde"] }
# Serialization
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
toml = "0.8"
# Async runtime and file operations
tokio = { version = "1.42", features = ["fs", "sync", "time", "io-util", "macros", "rt-multi-thread"] }
# Logging
tracing = "0.1"
# Error handling
thiserror = "2.0"
anyhow = "1"
# Hashing for content-addressable storage
sha2 = "0.10"
hex = "0.4"
# LRU read cache for registry backends
lru = "0.12"
# Async traits
async-trait = "0.1"
# Async futures
futures = "0.3"
[dev-dependencies]
tempfile = "3.0"
walkdir = "2"
-338
View File
@@ -1,338 +0,0 @@
# Dirigent Archivist
Persistent storage for all agentic interactions in Dirigent.
## Overview
The Archivist automatically archives every conversation, message, and file from your AI sessions into a local, grep-able, human-readable archive. No cloud required - your data stays on your machine in formats you can read and search manually.
## Why Archivist?
- **Offline Access**: All conversations are saved locally, accessible even when connectors are offline
- **Manual Curation**: Files are in plain JSON/NDJSON/TSV - grep, edit, or analyze them with any tool
- **Knowledge Base**: Build a searchable archive of all your AI interactions across projects
- **Session Lineage**: Track conversation branches, splits, and continuations
- **File Deduplication**: Attachments are stored once, referenced multiple times (content-addressable)
- **Archive-First**: UI reads from local archive first, only falls back to remote connectors when needed
## Quick Start
The Archivist runs automatically when you start Dirigent. The archive location is determined by the `DIRIGENT_DATA_DIR` environment variable (archives are stored at `<data_dir>/archives/`):
```bash
# Override data directory (archives at /path/to/data/archives/)
DIRIGENT_DATA_DIR=/path/to/data dx serve
```
That's it! Every session and message will be automatically archived.
## Archive Structure
Your archive is organized like this:
```
dirigent_archive/
├── .contexts/ # Session data
│ └── 01936e8f-e5a7-7000-8000.../
│ ├── session.json # Session metadata
│ └── messages.ndjson # All messages (one JSON per line)
├── .db/
│ └── connectors/ # Connector registry
│ ├── index.tsv # Fast lookup table
│ └── 01936e8f-e5a7.../
│ ├── connector.json # Connector info
│ └── sessions.ndjson # Session ID mappings
└── .files/ # Attachments (by SHA-256)
└── a1b2c3d4... # Content-addressable storage
```
### Why Hidden Directories?
The `.contexts`, `.db`, and `.files` directories start with `.` to keep them internal (like `.git`). In the future, you'll be able to export rendered markdown files into the archive root for easy reading.
## File Formats
### Session Metadata (`.contexts/{id}/session.json`)
```json
{
"version": 1,
"scroll_id": "01936e8f-e5a7-7000-8000-000000000001",
"created_at": "2025-01-01T12:00:00Z",
"updated_at": "2025-01-01T12:30:00Z",
"title": "Implement user authentication",
"connector_uid": "01936e8f-e5a7-7000-8000-000000000002",
"tags": ["backend", "auth"],
"metadata": {
"source": "OpenCode",
"model": "claude-3-5-sonnet"
}
}
```
### Messages (`.contexts/{id}/messages.ndjson`)
Newline-delimited JSON - one message per line, **append-only**:
```jsonl
{"version":1,"message_id":"...","session":"...","role":"user","ts":"2025-01-01T12:01:00Z","content_md":"How do I implement JWT auth?","attachments":[],"metadata":{}}
{"version":1,"message_id":"...","session":"...","role":"assistant","ts":"2025-01-01T12:01:10Z","content_md":"Here's how to implement JWT authentication...","attachments":[],"metadata":{"model":"claude-3-5-sonnet"}}
```
**IMPORTANT**: Messages are written as events arrive, NOT in chronological order. Assistant replies often appear after subsequent user messages due to streaming latency. When reading programmatically, use the Archivist API which sorts by timestamp (`ts`) to ensure correct order. For manual inspection, sort by the `ts` field.
### Connector Index (`.db/connectors/index.tsv`)
Tab-separated values for fast scanning:
```tsv
connector_uid type title client_native_id alias_of created_at
01936e8f... OpenCode OpenCode Local opencode@http://localhost:12225 2025-01-01T12:00:00Z
```
## Searching Your Archive
Since everything is plain text, you can use standard Unix tools:
```bash
# Find all sessions about "authentication"
grep -r "authentication" dirigent_archive/.contexts/*/session.json
# Find messages mentioning a specific error
grep "ECONNREFUSED" dirigent_archive/.contexts/*/messages.ndjson
# List all sessions for a connector
cat dirigent_archive/.db/connectors/*/sessions.ndjson | jq .
# Get all user messages from a session (sorted by timestamp)
cat dirigent_archive/.contexts/01936e8f.../messages.ndjson | jq -s 'sort_by(.ts) | .[] | select(.role=="user")'
# View messages in chronological order
cat dirigent_archive/.contexts/01936e8f.../messages.ndjson | jq -s 'sort_by(.ts)'
```
**Note on ordering**: The messages file is append-only, so lines appear in event-arrival order rather than chronological order. Always sort by `ts` (timestamp) when reading manually to see messages in chronological order.
## Integration with Dirigent
The Archivist integrates seamlessly with Dirigent's core runtime:
1. **Automatic Archiving**: Every session and message is archived in real-time as events arrive
2. **Event-Driven**: No polling - listens to dirigent_core's event stream
3. **Append-Only Writes**: Messages written as completion events arrive (preserves durability)
4. **Sort-on-Read**: API returns messages in chronological order despite append-only file order
5. **UI Integration**: Web UI reads from archive first, shows data even when connectors are offline
6. **Connector Coordination**: Assigns stable UUIDs to connectors with collision detection
## Key Concepts
### Scroll IDs
Every session gets a unique `scroll_id` (UUIDv7) that's independent of the connector's native session ID. This allows:
- Sessions to move between connectors
- Stable references even if connector data is deleted
- Time-ordered sorting (UUIDv7 encodes timestamp)
### Session Lineage
Sessions can have parent sessions, creating a tree of related conversations:
- **Split**: Fork conversation at a specific message
- **Compact**: Summarized version of parent
- **Reference**: Points to parent without duplication
- **Edit**: Modified version of parent
### Content-Addressable Storage
Files are stored by their SHA-256 hash, so:
- Same file uploaded twice uses same storage
- Files can be shared across sessions without duplication
- You can verify file integrity by hash
## Configuration
### Environment Variables
- `DIRIGENT_DATA_DIR`: Override data directory; archives are stored at `<data_dir>/archives/`
### Example Configurations
```bash
# Use custom data directory (archives at /home/user/mydata/archives/)
DIRIGENT_DATA_DIR=/home/user/mydata dx serve
# Use global data directory
DIRIGENT_DATA_DIR=/home/user/.dirigent dx serve
# Use temporary data directory (testing)
DIRIGENT_DATA_DIR=/tmp/dirigent_test dx serve
```
## Programmatic Access
While the Archivist runs automatically, you can also use it programmatically:
```rust
use dirigent_archivist::Archivist;
use std::path::PathBuf;
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Create an archivist over a single archive directory.
// Internally this wires up a `JsonlBackend` for the archive.
let archivist = Archivist::new_with_single_archive(
PathBuf::from("./dirigent_archive")
).await?;
// List sessions for a connector
let sessions = archivist.list_sessions(connector_uid).await?;
for session in sessions {
println!("{}: {}", session.scroll_id, session.title.unwrap_or_default());
}
Ok(())
}
```
`Archivist` is a concrete struct that owns a registry of `ArchiveBackend`
implementations keyed by archive name. In Phase 2 the only backend is
`JsonlBackend` (file-based NDJSON/JSON/TSV). See `examples/` for more
detailed usage.
## Performance
The Archivist is designed for human-scale workloads (thousands of sessions, millions of messages):
- **Fast Writes**: Append-only NDJSON is O(1)
- **Cached Reads**: Common lookups cached in memory
- **Grep-able**: TSV indices can be scanned in milliseconds
- **Incremental**: Only new messages are written, no full re-writes
### Scalability Notes
- Large sessions (1000+ messages) may take a few seconds to load
- TSV indices are suitable for 100-1000 connectors
- File deduplication saves space for repeated attachments
## Querying and Curation
### Future: Knowledge Overviews
The Archivist is designed to support knowledge curation workflows:
- Export sessions as clean markdown files
- Create summaries and overviews across sessions
- Tag and categorize conversations
- Build a personal knowledge base
These features are planned for future releases.
### Current: Manual Curation
For now, you can manually curate your archive:
- Edit `session.json` to add tags
- Grep through messages for specific topics
- Copy/organize sessions into project folders
- Use jq/awk/sed to extract insights
## Advanced Features
### Session Splitting
Create a new conversation branch from any point in history:
```rust
// Future API (not yet implemented)
let new_session = archivist.split_session(
session_id,
at_message_id,
Continuation::Split
).await?;
```
### Attachment Storage
Files are automatically deduplicated using SHA-256:
```rust
// Store file (content-addressable)
let file_id = archivist.store_file(
&file_data,
"spec.pdf",
Some("application/pdf")
).await?;
// Reference in message
let attachment = AttachmentRef {
file_id, // "sha256:abc123..."
name: "spec.pdf".to_string(),
mime_type: Some("application/pdf".to_string()),
};
```
### Multi-Archive Support
`Archivist` natively manages multiple named archives via an on-disk
registry. Each archive is backed by its own `ArchiveBackend` (currently
`JsonlBackend`) and selected per call via an optional `archive` argument.
This enables:
- Separate archives per project
- A default archive plus specialized side archives
- Moving or copying sessions between archives
Future backends (e.g. SQLite, indexed, remote) will plug into the same
trait layer without changing the coordinator API.
## Troubleshooting
### Archive Not Created
If the archive directory doesn't appear:
1. Check `DIRIGENT_DATA_DIR` is set correctly (or that the default data directory is writable)
2. Ensure write permissions on parent directory
3. Check logs for I/O errors
### Missing Sessions
If sessions don't appear in archive:
1. Verify EventHandler is running
2. Check for event subscription errors in logs
3. Ensure connector emits `SessionCreated` events
### Large Archive Size
If archive grows too large:
1. Check for duplicate files in `.files/`
2. Consider archiving old sessions separately
3. Future: Use compaction features (not yet implemented)
## Development Status
**Current** (Phase 2 complete):
- Automatic archival of sessions and messages
- Event-driven integration with dirigent_core
- File-based storage with NDJSON/JSON/TSV (`JsonlBackend`)
- Content-addressable file storage
- Multi-archive coordinator with per-archive backends
- Trait-based backend abstraction (`ArchiveBackend` + sub-traits)
**Future**:
- Indexed `SearchBackend` implementations (full-text search)
- Additional concrete backends (SQLite, remote)
- Session splitting and lineage management
- Knowledge overview generation
- Network RPC interface
## Documentation
- **Developer Guide**: `CLAUDE.md` - Package architecture and implementation details
- **Architecture**: `docs/building/05_archivist/vision.md` - Design rationale
- **API Docs**: `cargo doc --package dirigent_archivist --open`
- **Examples**: See `examples/` for working code
## Contributing
The Archivist is part of the Dirigent project. See the main repository for contribution guidelines.
## License
Part of the Dirigent project.
@@ -1,198 +0,0 @@
//! Basic usage example for dirigent_archivist
//!
//! This example demonstrates:
//! - Creating an Archivist
//! - Registering a connector
//! - Registering a session
//! - Appending messages to a session
//! - Listing sessions for a connector
//! - Retrieving messages for a session
use chrono::Utc;
use dirigent_archivist::{
Archivist, MessageRecord, RegisterConnectorRequest, RegisterSessionRequest,
Result,
};
use std::path::PathBuf;
use uuid::Uuid;
/// Runs the basic archivist walkthrough against a throwaway archive directory.
///
/// Creates an archive under the system temp directory, registers a connector
/// and a session, appends two messages, reads everything back, checks that the
/// native session id resolves to the generated scroll id, prints the on-disk
/// layout and finally deletes the directory.
///
/// # Errors
///
/// Returns the first error reported by an archivist call or by the filesystem.
/// The temporary directory is only removed when every step succeeded.
#[tokio::main]
async fn main() -> Result<()> {
    // Create a temporary archive directory for this example
    let temp_dir = std::env::temp_dir().join(format!("dirigent_example_{}", Uuid::now_v7()));
    println!("Creating archive at: {}", temp_dir.display());

    // Step 1: Create an Archivist
    let archivist = Archivist::new_with_single_archive(temp_dir.clone()).await?;
    println!("Archivist created successfully");

    // Step 2: Register a connector
    println!("\n--- Registering Connector ---");
    let connector_req = RegisterConnectorRequest {
        r#type: "OpenCode".to_string(),
        title: "OpenCode Local".to_string(),
        client_native_id: "opencode@http://localhost:12225".to_string(),
        custom_uid: None, // Let archivist generate a UID
        metadata: serde_json::json!({
            "version": "0.1.0",
            "protocol": "OpenCode HTTP API"
        }),
        fingerprint: None,
    };
    let connector_resp = archivist.register_connector(connector_req, None).await?;
    println!("Connector registered: {:?}", connector_resp);
    let connector_uid = connector_resp.connector_uid;

    // Step 3: Register a session
    println!("\n--- Registering Session ---");
    let session_req = RegisterSessionRequest {
        connector_uid,
        native_session_id: "session-abc123".to_string(),
        title: Some("Example chat session".to_string()),
        custom_scroll_id: None, // Let archivist generate a scroll ID
        metadata: serde_json::json!({
            "project_path": "/home/user/projects/example",
            "model": "claude-3-5-sonnet"
        }),
        completeness: Default::default(),
        parent_scroll_id: None,
        is_subagent: false,
        continuation: None,
        agent_id: None,
        subagent_type: None,
        spawning_tool_use_id: None,
    };
    let session_resp = archivist.register_session(session_req, None).await?;
    println!("Session registered: {:?}", session_resp);
    let scroll_id = session_resp.scroll_id;

    // Step 4: Append messages to the session
    println!("\n--- Appending Messages ---");

    // User message
    let user_msg = MessageRecord {
        version: 1,
        message_id: Uuid::now_v7(),
        session: scroll_id,
        parent_id: None,
        ts: Utc::now(),
        role: "user".to_string(),
        author: Some("alice".to_string()),
        content_md: "Hello! Can you help me write a function to calculate fibonacci numbers?"
            .to_string(),
        content_parts: None,
        attachments: vec![],
        metadata: serde_json::json!({}),
    };

    // Assistant message, linked to the user message through `parent_id`
    let assistant_msg = MessageRecord {
        version: 1,
        message_id: Uuid::now_v7(),
        session: scroll_id,
        parent_id: Some(user_msg.message_id),
        ts: Utc::now(),
        role: "assistant".to_string(),
        author: Some("claude".to_string()),
        content_md: r#"Sure! Here's a recursive fibonacci function in Rust:
```rust
fn fibonacci(n: u32) -> u64 {
match n {
0 => 0,
1 => 1,
_ => fibonacci(n - 1) + fibonacci(n - 2),
}
}
```
This is the classic recursive implementation, though it's not the most efficient for large values of n."#
            .to_string(),
        content_parts: None,
        attachments: vec![],
        metadata: serde_json::json!({
            "model": "claude-3-5-sonnet",
            "latency_ms": 1245
        }),
    };

    // Neither record is used again below, so both are moved rather than cloned.
    archivist
        .append_messages(scroll_id, vec![user_msg, assistant_msg], None)
        .await?;
    println!("Appended 2 messages to session");

    // Step 5: List all sessions for the connector
    println!("\n--- Listing Sessions ---");
    let page = archivist
        .list_sessions_paged(
            dirigent_archivist::SessionListQuery::default()
                .with_connector(connector_uid)
                .with_limit(100),
        )
        .await?;
    let sessions = page.items;
    println!("Found {} session(s) for connector:", sessions.len());
    for session in &sessions {
        println!(
            " - {} ({}): {:?}",
            session.scroll_id,
            session.created_at.format("%Y-%m-%d %H:%M:%S"),
            session.title
        );
    }

    // Step 6: Retrieve all messages for the session
    println!("\n--- Retrieving Messages ---");
    let messages = archivist.get_messages(scroll_id, None).await?;
    println!("Retrieved {} message(s):", messages.len());
    for msg in &messages {
        println!("\n[{}] {}", msg.role, msg.ts.format("%Y-%m-%d %H:%M:%S"));
        println!("{}", msg.content_md);
    }

    // Step 7: Demonstrate session resolution
    println!("\n--- Resolving Session ---");
    let resolved_scroll_id = archivist
        .resolve_session(connector_uid, "session-abc123", None)
        .await?;
    println!(
        "Resolved native session 'session-abc123' to scroll_id: {}",
        resolved_scroll_id
    );
    // The native id registered in step 3 must map back to the same scroll id.
    assert_eq!(resolved_scroll_id, scroll_id);

    // Step 8: Show archive structure
    println!("\n--- Archive Structure ---");
    println!("Archive root: {}", temp_dir.display());
    println!("\nDirectory structure:");
    show_directory_tree(&temp_dir, 0)?;

    // Cleanup
    println!("\n--- Cleanup ---");
    std::fs::remove_dir_all(&temp_dir)?;
    println!("Removed temporary archive");

    Ok(())
}
/// Prints the tree rooted at `path` to stdout, one entry per line.
///
/// Every entry is prefixed with `depth` copies of the indent unit, directories
/// are suffixed with `/`, and the children of a directory are visited in
/// ascending path order.
///
/// Accepts any `&Path`; call sites that pass `&PathBuf` keep working through
/// deref coercion.
///
/// # Errors
///
/// Returns an error when a directory cannot be opened for listing. Individual
/// entries that fail to be read are skipped (best-effort listing).
fn show_directory_tree(path: &std::path::Path, depth: usize) -> Result<()> {
    let indent = " ".repeat(depth);
    // `file_name()` is `None` for paths such as `/` or ones ending in `..`;
    // fall back to the whole path instead of panicking.
    let label = path
        .file_name()
        .unwrap_or(path.as_os_str())
        .to_string_lossy();

    if !path.is_dir() {
        println!("{}{}", indent, label);
        return Ok(());
    }

    println!("{}{}/", indent, label);

    // Resolve every child's path once, then sort, instead of rebuilding the
    // path inside the sort key on each comparison.
    let mut children: Vec<PathBuf> = std::fs::read_dir(path)?
        .filter_map(|entry| entry.ok())
        .map(|entry| entry.path())
        .collect();
    children.sort();

    for child in &children {
        show_directory_tree(child, depth + 1)?;
    }

    Ok(())
}
@@ -1,156 +0,0 @@
// Demonstration of archivist types serialization
// Run with: cargo run --package dirigent_archivist --example demo_types
use chrono::Utc;
use dirigent_archivist::*;
use uuid::Uuid;
/// Prints one JSON-serialized sample of every archivist record and API type.
///
/// Each value is built in place and written to stdout, either pretty-printed
/// or as a single compact line. A serialization failure panics.
fn main() {
    println!("=== ARCHIVIST TYPES DEMONSTRATION ===\n");

    // Demo 1: SessionMetadata (matches session.json format)
    println!("1. SessionMetadata (session.json):");
    let session_meta = SessionMetadata {
        version: 1,
        scroll_id: Uuid::now_v7(),
        created_at: Utc::now(),
        updated_at: Utc::now(),
        title: Some(String::from("Example Session")),
        connector_uid: Uuid::now_v7(),
        native_session_id: Some(String::from("abc123")),
        agent_id: Some(String::from("claude-3-5")),
        parent_scroll_id: None,
        continuation: Some(Continuation::Split),
        tags: vec![String::from("example"), String::from("test")],
        metadata: serde_json::json!({
            "source": "OpenCode",
            "project": "dirigent"
        }),
        no_update: false,
        kind: SessionKind::Chat,
        acp_client_id: None,
        is_connected: None,
        current_session_id: None,
        models: None,
        modes: None,
        config_options: None,
        completeness: SessionCompleteness::default(),
        matrix_room_id: None,
        matrix_sharing_active: false,
        matrix_shared_at: None,
        is_subagent: false,
        subagent_type: None,
        spawning_tool_use_id: None,
    };
    let session_json = serde_json::to_string_pretty(&session_meta).unwrap();
    println!("{}\n", session_json);

    // Demo 2: MessageRecord (matches messages.ndjson format)
    println!("2. MessageRecord (messages.ndjson line):");
    let spec_attachment = AttachmentRef {
        file_id: String::from("sha256:abc123"),
        name: String::from("spec.pdf"),
        mime_type: Some(String::from("application/pdf")),
    };
    let msg_record = MessageRecord {
        version: 1,
        message_id: Uuid::now_v7(),
        session: session_meta.scroll_id,
        parent_id: None,
        ts: Utc::now(),
        role: String::from("user"),
        author: Some(String::from("alice")),
        content_md: String::from("How do I implement archivist types?"),
        content_parts: None,
        attachments: vec![spec_attachment],
        metadata: serde_json::json!({
            "connector_msg_id": "msg-456"
        }),
    };
    // NDJSON format (one line)
    let msg_line = serde_json::to_string(&msg_record).unwrap();
    println!("{}\n", msg_line);

    // Demo 3: ConnectorRecord (matches connector.json format)
    println!("3. ConnectorRecord (connector.json):");
    let connector_record = ConnectorRecord {
        version: 1,
        connector_uid: session_meta.connector_uid,
        r#type: String::from("OpenCode"),
        title: String::from("OpenCode Local"),
        client_native_id: String::from("opencode@http://localhost:12225"),
        alias_of: None,
        created_at: Utc::now(),
        metadata: serde_json::json!({}),
        fingerprint: None,
    };
    let connector_json = serde_json::to_string_pretty(&connector_record).unwrap();
    println!("{}\n", connector_json);

    // Demo 4: SessionMapping (matches sessions.ndjson format)
    println!("4. SessionMapping (sessions.ndjson line):");
    let session_mapping = SessionMapping {
        version: 1,
        connector_uid: connector_record.connector_uid,
        native_session_id: String::from("abc123"),
        scroll_id: session_meta.scroll_id,
        created_at: Utc::now(),
        alias_of: None,
    };
    let mapping_line = serde_json::to_string(&session_mapping).unwrap();
    println!("{}\n", mapping_line);

    // Demo 5: FileRecord (matches file_index.jsonl format)
    println!("5. FileRecord (file_index.jsonl line):");
    let stored_file = FileRecord {
        version: 1,
        file_id: String::from("sha256:abc123def456"),
        path: String::from(".files/ab/cd/abc123def456"),
        size: 123456,
        mime: Some(String::from("application/pdf")),
        original_name: String::from("spec.pdf"),
        sessions: vec![session_meta.scroll_id],
        metadata: serde_json::json!({
            "source": "upload"
        }),
    };
    let file_line = serde_json::to_string(&stored_file).unwrap();
    println!("{}\n", file_line);

    // Demo 6: Enum serialization (each variant is serialized right before it
    // is printed, one after the other)
    println!("6. Enum Serialization:");
    let split_json = serde_json::to_string(&Continuation::Split).unwrap();
    println!(" Continuation::Split: {}", split_json);
    let compact_json = serde_json::to_string(&Continuation::Compact).unwrap();
    println!(" Continuation::Compact: {}", compact_json);
    let accepted_json = serde_json::to_string(&RegisterStatus::Accepted).unwrap();
    println!(" RegisterStatus::Accepted: {}", accepted_json);
    let aliased_json = serde_json::to_string(&RegisterStatus::Aliased).unwrap();
    println!(" RegisterStatus::Aliased: {}", aliased_json);
    println!();

    // Demo 7: API types
    println!("7. RegisterConnectorResponse:");
    let connector_response = RegisterConnectorResponse {
        status: RegisterStatus::Accepted,
        connector_uid: Uuid::now_v7(),
        alias_of: None,
        note: Some(String::from("Successfully registered")),
    };
    let connector_response_json = serde_json::to_string_pretty(&connector_response).unwrap();
    println!("{}\n", connector_response_json);

    println!("8. RegisterSessionResponse:");
    let session_response = RegisterSessionResponse {
        status: RegisterStatus::Aliased,
        scroll_id: Uuid::now_v7(),
        alias_of: Some(Uuid::now_v7()),
    };
    let session_response_json = serde_json::to_string_pretty(&session_response).unwrap();
    println!("{}\n", session_response_json);

    println!("=== ALL TYPES MATCH VISION.MD SPECIFICATION ===");
}
@@ -1,277 +0,0 @@
//! Event handling example for dirigent_archivist
//!
//! This example demonstrates:
//! - Creating an EventHandler
//! - Subscribing to dirigent_protocol events
//! - Accumulating streaming message chunks
//! - Finalizing complete messages
//! - Automatic archival via event stream
use chrono::Utc;
use dirigent_archivist::{Archivist, EventHandler, Result};
use dirigent_protocol::streaming::{BusEvent, BusReceiver, EventOrigin, EventRouting};
use dirigent_protocol::{
ContentBlock, Event, Message, MessageMetadata, MessagePart, MessageRole, MessageStatus,
Session, SessionMetadata, SessionUpdate, ToolCall, ToolCallStatus,
};
use std::sync::Arc;
use std::sync::atomic::AtomicU64;
use tokio::sync::mpsc;
use uuid::Uuid;
/// Builds a `BusEvent` around `event`, using the default routing and a
/// `Runtime` origin; the event itself is stored behind an `Arc`.
fn wrap(event: Event) -> BusEvent {
    let routing = EventRouting::default();
    let origin = EventOrigin::Runtime;
    let event = Arc::new(event);
    BusEvent {
        routing,
        origin,
        event,
    }
}
/// Drives an `EventHandler` with a hand-built event stream and prints what
/// ended up in the archive.
///
/// The example creates a throwaway archive, spawns the handler on a mock
/// `BusReceiver`, sends a session-created event followed by one streamed
/// assistant message and one streamed user message, then lists the archived
/// sessions and messages before removing the archive directory.
///
/// # Errors
///
/// Returns the first error reported by an archivist call or by the final
/// directory removal.
///
/// # Panics
///
/// Panics if an event cannot be sent, if `connector_id` is not a valid UUID,
/// or if the handler task itself panicked.
#[tokio::main]
async fn main() -> Result<()> {
    // Create a temporary archive directory for this example
    let temp_dir = std::env::temp_dir().join(format!("dirigent_event_example_{}", Uuid::now_v7()));
    println!("Creating archive at: {}", temp_dir.display());

    // Step 1: Create archivist and event handler
    let archivist = Archivist::new_with_single_archive(temp_dir.clone()).await?;
    let archivist = Arc::new(archivist);
    let handler = EventHandler::new(archivist.clone());
    println!("EventHandler created successfully");

    // Step 2: Create a mock event stream. In production this is built
    // by `SharingBus::subscribe_all()`; here we fabricate a `BusReceiver`
    // directly so the example stays self-contained.
    let (tx, rx) = mpsc::channel::<BusEvent>(100);
    let bus_rx = BusReceiver {
        id: 0,
        rx,
        lagged: Arc::new(AtomicU64::new(0)),
    };

    // Step 3: Spawn event handler task
    let handler_task = tokio::spawn(async move {
        handler.run(bus_rx).await;
    });

    // Step 4: Simulate event flow
    // NOTE(review): the short sleeps after each send presumably give the
    // handler task time to process the event; they are not a synchronization
    // guarantee.
    println!("\n--- Simulating Event Stream ---");

    // Generate connector and session IDs
    let connector_id = Uuid::now_v7().to_string();
    let session_id = Uuid::now_v7().to_string();
    let message_id = Uuid::now_v7().to_string();

    // Event 1: SessionCreated
    println!("\n1. Sending SessionCreated event...");
    let session_created = Event::SessionCreated {
        connector_id: connector_id.clone(),
        session: Session {
            id: session_id.clone(),
            title: "Example streaming session".to_string(),
            created_at: Utc::now(),
            updated_at: Utc::now(),
            metadata: SessionMetadata {
                project_path: "/home/user/project".to_string(),
                model: Some("claude-3-5-sonnet".to_string()),
                total_messages: 0,
                system_message: None,
                current_mode_id: None,
                _meta: None,
                project_id: None,
            },
            cwd: None,
            models: None,
            modes: None,
            config_options: None,
            acp_client_id: None,
        },
    };
    tx.send(wrap(session_created)).await.unwrap();
    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

    // Event 2-5: Streaming message chunks (AgentMessageChunk)
    println!("2. Sending streaming message chunks...");
    // A fixed-size array is enough here: the chunks are only iterated and
    // concatenated, so no heap-allocated `Vec` is needed.
    let chunks = ["Hello! ", "I'm here to ", "help you with ", "your code."];
    for (i, chunk) in chunks.iter().enumerate() {
        let chunk_event = Event::SessionUpdate {
            connector_id: connector_id.clone(),
            session_id: session_id.clone(),
            update: SessionUpdate::AgentMessageChunk {
                message_id: message_id.clone(),
                content: ContentBlock::Text {
                    text: chunk.to_string(),
                },
                _meta: None,
            },
        };
        tx.send(wrap(chunk_event)).await.unwrap();
        println!(" Chunk {}: {:?}", i + 1, chunk);
        tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
    }

    // Event 6: Thinking chunk
    println!("3. Sending thinking chunk...");
    let thinking_event = Event::SessionUpdate {
        connector_id: connector_id.clone(),
        session_id: session_id.clone(),
        update: SessionUpdate::AgentThoughtChunk {
            message_id: message_id.clone(),
            content: ContentBlock::Text {
                text: "Let me consider the best approach...".to_string(),
            },
            _meta: None,
        },
    };
    tx.send(wrap(thinking_event)).await.unwrap();
    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

    // Event 7: Tool call
    println!("4. Sending tool call event...");
    let tool_call_event = Event::SessionUpdate {
        connector_id: connector_id.clone(),
        session_id: session_id.clone(),
        update: SessionUpdate::ToolCall {
            message_id: message_id.clone(),
            tool_call: ToolCall {
                id: "tool_call_123".to_string(),
                tool_name: "read_file".to_string(),
                status: ToolCallStatus::Completed,
                content: vec![],
                raw_input: Some(serde_json::json!({
                    "path": "/home/user/project/main.rs"
                })),
                raw_output: Some(serde_json::json!({
                    "content": "fn main() { println!(\"Hello\"); }"
                })),
                title: None,
                error: None,
                metadata: None,
                origin: None,
            },
            _meta: None,
        },
    };
    tx.send(wrap(tool_call_event)).await.unwrap();
    tokio::time::sleep(tokio::time::Duration::from_millis(100)).await;

    // Event 8: MessageCompleted (triggers finalization)
    println!("5. Sending MessageCompleted event...");
    let message_completed = Event::MessageCompleted {
        connector_id: connector_id.clone(),
        message: Message {
            id: message_id.clone(),
            session_id: session_id.clone(),
            role: MessageRole::Assistant,
            created_at: Utc::now(),
            content: vec![MessagePart::Text {
                text: chunks.concat(),
            }],
            status: MessageStatus::Completed,
            metadata: Some(MessageMetadata {
                cost: None,
                tokens_input: None,
                tokens_output: None,
                response_time_ms: None,
                latency_ms: Some(1500),
                model: Some("claude-3-5-sonnet".to_string()),
                other: None,
            }),
        },
    };
    tx.send(wrap(message_completed)).await.unwrap();
    tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;

    // Event 9: Second message (user response)
    println!("6. Sending user message...");
    let user_message_id = Uuid::now_v7().to_string();
    let user_chunks = ["Thanks! ", "Can you explain ", "the code?"];
    for (i, chunk) in user_chunks.iter().enumerate() {
        let chunk_event = Event::SessionUpdate {
            connector_id: connector_id.clone(),
            session_id: session_id.clone(),
            update: SessionUpdate::UserMessageChunk {
                message_id: user_message_id.clone(),
                content: ContentBlock::Text {
                    text: chunk.to_string(),
                },
                _meta: None,
            },
        };
        tx.send(wrap(chunk_event)).await.unwrap();
        println!(" User chunk {}: {:?}", i + 1, chunk);
        tokio::time::sleep(tokio::time::Duration::from_millis(50)).await;
    }

    let user_message_completed = Event::MessageCompleted {
        connector_id: connector_id.clone(),
        message: Message {
            id: user_message_id.clone(),
            session_id: session_id.clone(),
            role: MessageRole::User,
            created_at: Utc::now(),
            content: vec![MessagePart::Text {
                text: user_chunks.concat(),
            }],
            status: MessageStatus::Completed,
            metadata: None,
        },
    };
    tx.send(wrap(user_message_completed)).await.unwrap();
    tokio::time::sleep(tokio::time::Duration::from_millis(200)).await;

    // Step 5: Verify archived data
    println!("\n--- Verifying Archived Data ---");

    // Parse connector_uid from connector_id string
    let connector_uid =
        Uuid::parse_str(&connector_id).expect("connector_id should be a valid UUID");

    // List sessions
    let page = archivist
        .list_sessions_paged(
            dirigent_archivist::SessionListQuery::default()
                .with_connector(connector_uid)
                .with_limit(100),
        )
        .await?;
    let sessions = page.items;
    println!("Found {} session(s) in archive", sessions.len());
    for session in &sessions {
        println!(" Session: {} - {:?}", session.scroll_id, session.title);
    }

    // Get messages
    if let Some(session) = sessions.first() {
        let messages = archivist.get_messages(session.scroll_id, None).await?;
        println!("\nFound {} message(s):", messages.len());
        for msg in &messages {
            println!("\n[{}] {} chars", msg.role, msg.content_md.len());
            // Truncate by characters, not bytes, so multi-byte text is never
            // split in the middle of a code point.
            println!(
                "Content preview: {}",
                msg.content_md.chars().take(100).collect::<String>()
            );
        }
    }

    // Step 6: Cleanup
    println!("\n--- Cleanup ---");

    // Drop the event sender to close the channel
    drop(tx);

    // Wait for handler to finish
    handler_task.await.expect("Handler task failed");

    // Remove temporary archive
    std::fs::remove_dir_all(&temp_dir)?;
    println!("Removed temporary archive");
    println!("\nExample completed successfully!");

    Ok(())
}
@@ -1,214 +0,0 @@
//! File storage example for dirigent_archivist
//!
//! This example demonstrates:
//! - Storing files with content-addressing
//! - Retrieving files by file_id
//! - Automatic deduplication of identical content
//! - Session tracking for file references
use dirigent_archivist::storage::{files, ndjson, paths::ArchivePaths};
use dirigent_archivist::types::FileRecord;
use dirigent_archivist::Result;
use uuid::Uuid;
/// Runs the file-storage walkthrough against a throwaway archive directory.
///
/// Steps: store a text file, read it back, store identical bytes again from a
/// second session and print whether both calls produced the same `file_id`,
/// dump the file index, store distinct text and binary payloads, print the
/// `.files` layout and aggregate statistics, then delete the temporary
/// directory.
///
/// # Errors
///
/// Propagates any error from the storage helpers or the filesystem via `?`.
/// An early error return skips the cleanup step, so the temporary directory
/// is left behind in that case.
#[tokio::main]
async fn main() -> Result<()> {
    // Create a temporary archive directory for this example
    let temp_dir = std::env::temp_dir().join(format!("dirigent_files_example_{}", Uuid::now_v7()));
    println!("Creating archive at: {}", temp_dir.display());
    let paths = ArchivePaths::new(temp_dir.clone());

    // Example 1: Store a file
    println!("\n--- Example 1: Store a File ---");
    let content1 = b"This is a sample document with some text content.";
    let session1 = Uuid::now_v7();
    let file_id1 = files::store_file(
        &paths,
        content1,
        "document.txt".to_string(),
        Some("text/plain".to_string()),
        session1,
    )
    .await?;
    println!("Stored file with ID: {}", file_id1);
    println!("Session: {}", session1);

    // Example 2: Retrieve the file
    println!("\n--- Example 2: Retrieve the File ---");
    let retrieved1 = files::get_file(&paths, &file_id1).await?;
    println!("Retrieved {} bytes", retrieved1.len());
    println!("Content: {}", String::from_utf8_lossy(&retrieved1));

    // Example 3: Store the same content from a different session (deduplication)
    println!("\n--- Example 3: Deduplication Demo ---");
    let session2 = Uuid::now_v7();
    let file_id2 = files::store_file(
        &paths,
        content1,                    // Same content as before
        "duplicate.txt".to_string(), // Different name
        Some("text/plain".to_string()),
        session2,
    )
    .await?;
    println!("Stored same content with different name");
    println!("File ID 1: {}", file_id1);
    println!("File ID 2: {}", file_id2);
    println!("Same file_id? {}", file_id1 == file_id2);
    println!("\nDeduplication: Same content produces same file_id, stored only once!");

    // Example 4: Check the file index
    println!("\n--- Example 4: File Index ---");
    let index_path = paths.root().join(".files").join("file_index.jsonl");
    let records: Vec<FileRecord> = ndjson::read_ndjson(&index_path).await?;
    println!("File index contains {} record(s)", records.len());
    for record in &records {
        println!("\nFile: {}", record.file_id);
        println!(" Original name: {}", record.original_name);
        println!(" MIME type: {:?}", record.mime);
        println!(" Size: {} bytes", record.size);
        println!(" Referenced by {} session(s):", record.sessions.len());
        for session_id in &record.sessions {
            println!(" - {}", session_id);
        }
    }

    // Example 5: Store different content
    println!("\n--- Example 5: Store Different Content ---");
    let content2 = b"This is completely different content with more data!";
    let session3 = Uuid::now_v7();
    let file_id3 = files::store_file(
        &paths,
        content2,
        "different.txt".to_string(),
        Some("text/plain".to_string()),
        session3,
    )
    .await?;
    println!("Stored different content");
    println!("File ID 3: {}", file_id3);
    println!("Different from file_id1? {}", file_id1 != file_id3);

    // Example 6: Store binary content (every byte value 0..=255 once)
    println!("\n--- Example 6: Binary Content ---");
    let binary_content: Vec<u8> = (0..256).map(|i| i as u8).collect();
    let session4 = Uuid::now_v7();
    let file_id4 = files::store_file(
        &paths,
        &binary_content,
        "binary.dat".to_string(),
        Some("application/octet-stream".to_string()),
        session4,
    )
    .await?;
    println!("Stored binary content (256 bytes)");
    println!("File ID: {}", file_id4);
    // Retrieve and verify
    let retrieved_binary = files::get_file(&paths, &file_id4).await?;
    println!("Retrieved {} bytes", retrieved_binary.len());
    println!(
        "Binary content verified: {}",
        retrieved_binary == binary_content
    );

    // Example 7: Show final archive structure
    println!("\n--- Example 7: Archive Structure ---");
    println!("Archive root: {}", temp_dir.display());
    show_files_directory(&paths)?;

    // Example 8: Final statistics (index is re-read so it includes examples 5 and 6)
    println!("\n--- Final Statistics ---");
    let final_records: Vec<FileRecord> = ndjson::read_ndjson(&index_path).await?;
    println!("Total unique files stored: {}", final_records.len());
    let total_sessions: usize = final_records.iter().map(|r| r.sessions.len()).sum();
    println!("Total session references: {}", total_sessions);
    let total_size: u64 = final_records.iter().map(|r| r.size).sum();
    println!("Total storage used: {} bytes", total_size);

    // Content-addressing means if we had stored content1 1000 times,
    // we'd still only use storage for it once!
    println!("\nContent-addressing benefit:");
    // Read the reference count for the first file from the index rather than
    // hard-coding it, so the summary reports what was actually recorded.
    // Ids are compared through their `Display` form, which both values
    // already support (they are printed with `{}` above).
    let first_file_key = file_id1.to_string();
    let first_file_refs: usize = final_records
        .iter()
        .filter(|record| record.file_id.to_string() == first_file_key)
        .map(|record| record.sessions.len())
        .sum();
    println!(
        " File '{}' is referenced by {} sessions",
        file_id1, first_file_refs
    );
    println!(" But stored only once on disk!");

    // Cleanup
    println!("\n--- Cleanup ---");
    std::fs::remove_dir_all(&temp_dir)?;
    println!("Removed temporary archive");
    println!("\nExample completed successfully!");
    Ok(())
}
/// Prints the layout of the archive's `.files` directory to stdout.
///
/// Output consists of the size of `file_index.jsonl` (when present), followed
/// by every sub-directory of `.files`, the entries inside each of them, and —
/// for entries that are themselves directories — the files one level further
/// down, each with its size in bytes. Non-directory entries directly under
/// `.files` are not listed by the loop; the index file is reported separately.
///
/// Prints a short notice and returns `Ok(())` when `.files` does not exist.
///
/// # Errors
///
/// Returns an error if a directory cannot be read or a file's metadata cannot
/// be queried.
fn show_files_directory(paths: &ArchivePaths) -> Result<()> {
    let files_dir = paths.root().join(".files");
    if !files_dir.exists() {
        println!("No files directory found");
        return Ok(());
    }
    println!("\n.files/ directory:");

    // Show index file
    let index_path = files_dir.join("file_index.jsonl");
    if index_path.exists() {
        let index_size = std::fs::metadata(&index_path)?.len();
        println!(" file_index.jsonl ({} bytes)", index_size);
    }

    // Show shard directories. Names come from `DirEntry::file_name()`, which
    // always yields the final path component, so no `unwrap()` is needed.
    for shard in std::fs::read_dir(&files_dir)? {
        let shard = shard?;
        let shard_path = shard.path();
        if !shard_path.is_dir() {
            continue;
        }
        println!(" {}/", shard.file_name().to_string_lossy());

        // Show files in shard
        for child in std::fs::read_dir(&shard_path)? {
            let child = child?;
            let child_path = child.path();
            if child_path.is_dir() {
                println!(" {}/", child.file_name().to_string_lossy());
                // Show files in sub-shard
                for leaf in std::fs::read_dir(&child_path)? {
                    let leaf = leaf?;
                    // `std::fs::metadata` (not `DirEntry::metadata`) is kept
                    // so symlinks are still followed when reporting sizes.
                    let leaf_size = std::fs::metadata(leaf.path())?.len();
                    println!(
                        " {} ({} bytes)",
                        leaf.file_name().to_string_lossy(),
                        leaf_size
                    );
                }
            } else {
                let child_size = std::fs::metadata(&child_path)?.len();
                println!(
                    " {} ({} bytes)",
                    child.file_name().to_string_lossy(),
                    child_size
                );
            }
        }
    }
    Ok(())
}
@@ -1,199 +0,0 @@
//! Example: two `JsonlBackend`s side by side, demonstrating boot-from-config,
//! priority-ordered read routing, write fanout, and a health snapshot.
//!
//! Layout:
//! - `primary` → `read_priority = 0`, `failure_mode = required` (default)
//! - `mirror` → `read_priority = 10`, `failure_mode = best_effort`
//!
//! The primary is the default write target (lowest priority among
//! Required+write-active backends). `append_messages` fans out inline to the
//! mirror too. Reads walk the registrations in priority order, so the primary
//! answers first; if it is missing a session, the walk falls through to the
//! mirror.
//!
//! Run with:
//!
//! cargo run --package dirigent_archivist --example multi_backend
use std::sync::Arc;
use chrono::Utc;
use dirigent_archivist::coordinator::Archivist;
use dirigent_archivist::registry::{ArchivesConfig, BackendRegistry};
use dirigent_archivist::types::{
MessageRecord, RegisterConnectorRequest, RegisterSessionRequest,
};
use uuid::Uuid;
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let dir_a = tempfile::tempdir()?;
let dir_b = tempfile::tempdir()?;
// Build a two-archive config entirely from TOML so the example doubles as
// a faithful demonstration of the config surface.
let cfg_src = format!(
r#"
[[archives]]
name = "primary"
type = "jsonl"
read_priority = 0
[archives.params]
path = "{a}"
[[archives]]
name = "mirror"
type = "jsonl"
failure_mode = "best_effort"
read_priority = 10
[archives.params]
path = "{b}"
"#,
a = dir_a.path().to_string_lossy().replace('\\', "/"),
b = dir_b.path().to_string_lossy().replace('\\', "/"),
);
let cfg: ArchivesConfig = toml::from_str(&cfg_src)?;
let registry = BackendRegistry::with_jsonl();
let archivist = Arc::new(Archivist::from_config(cfg, &registry, None).await?);
println!("\n=== Multi-backend Archivist example ===\n");
println!("Boot complete. Archives (ordered by read_priority):");
for s in archivist.list_archives_with_health().await {
println!(
" - name={:<8} type={:<6} priority={:<3} enabled={} write_active={} failure_mode={:?} health={:?}",
s.name,
s.type_name,
s.read_priority,
s.enabled,
s.write_active,
s.failure_mode,
s.health,
);
}
// ------------------------------------------------------------------
// Register a connector. The primary owns the canonical record; fanout
// mirrors it to the secondary.
// ------------------------------------------------------------------
let connector_resp = archivist
.register_connector(
RegisterConnectorRequest {
r#type: "Example".into(),
title: "multi-backend demo".into(),
client_native_id: "example://multi_backend".into(),
custom_uid: None,
metadata: serde_json::json!({ "demo": true }),
fingerprint: None,
},
None,
)
.await?;
let connector_uid = connector_resp.connector_uid;
println!(
"\nRegistered connector: uid={} status={:?}",
connector_uid, connector_resp.status
);
// ------------------------------------------------------------------
// Register a session under that connector. `register_session` writes
// the mapping and `session.json` on the primary first, then fans out
// to any enabled secondaries.
// ------------------------------------------------------------------
let session_resp = archivist
.register_session(
RegisterSessionRequest {
connector_uid,
native_session_id: "demo-session-1".into(),
title: Some("multi-backend demo session".into()),
custom_scroll_id: None,
metadata: serde_json::json!({ "model": "demo" }),
completeness: Default::default(),
parent_scroll_id: None,
is_subagent: false,
continuation: None,
agent_id: None,
subagent_type: None,
spawning_tool_use_id: None,
},
None,
)
.await?;
let scroll_id = session_resp.scroll_id;
println!(
"Registered session: scroll_id={} status={:?}",
scroll_id, session_resp.status
);
// ------------------------------------------------------------------
// Append a couple of messages. `append_messages` writes to the primary
// and then fans out inline to the mirror.
// ------------------------------------------------------------------
let user_msg = MessageRecord {
version: 1,
message_id: Uuid::now_v7(),
session: scroll_id,
parent_id: None,
ts: Utc::now(),
role: "user".into(),
author: Some("alice".into()),
content_md: "Hello from the multi-backend example!".into(),
content_parts: None,
attachments: vec![],
metadata: serde_json::json!({}),
};
let asst_msg = MessageRecord {
version: 1,
message_id: Uuid::now_v7(),
session: scroll_id,
parent_id: Some(user_msg.message_id),
ts: Utc::now(),
role: "assistant".into(),
author: Some("claude".into()),
content_md: "Greetings. I have been written to two archives.".into(),
content_parts: None,
attachments: vec![],
metadata: serde_json::json!({}),
};
archivist
.append_messages(scroll_id, vec![user_msg, asst_msg], None)
.await?;
println!("\nAppended 2 messages — fanned out to primary + mirror.");
// ------------------------------------------------------------------
// Read path: the priority walk tries the primary first (priority=0).
// It finds the session there and never consults the mirror.
// ------------------------------------------------------------------
let meta = archivist.get_session_metadata(scroll_id, None).await?;
println!(
"\nRead session via priority walk: title={:?} completeness={:?}",
meta.title, meta.completeness
);
println!(
"Read cache size after read: {}",
archivist.read_cache_size().await
);
let messages = archivist.get_messages(scroll_id, None).await?;
println!("Read {} message(s) from the archive:", messages.len());
for m in &messages {
println!(" - [{}] {}", m.role, m.content_md);
}
// ------------------------------------------------------------------
// Final health snapshot. Both backends should still be Available and
// have no queued writes (both run Inline write policies by default).
// ------------------------------------------------------------------
println!("\nFinal health snapshot:");
for s in archivist.list_archives_with_health().await {
println!(
" - {:<8} health={:?} queue_depth={:?} last_error={:?}",
s.name, s.health, s.queue_depth, s.last_error
);
}
// Clean shutdown drains any queued writer tasks. Both backends here run
// Inline, so this is effectively a no-op but remains the correct API.
archivist.shutdown().await?;
println!("\nShutdown complete.");
Ok(())
}
@@ -1,923 +0,0 @@
//! Message accumulator for incremental message assembly.
//!
//! This is a thin wrapper around [`dirigent_protocol::accumulator::MessageAccumulator`]
//! that delegates chunk/tool/thinking operations to the protocol accumulator and
//! converts [`AccumulatedMessage`] to [`MessageRecord`] on `finalize()`.
//!
//! The accumulator preserves the order of content parts (text, thinking, tool calls)
//! as they arrive in the event stream, enabling inline tool rendering in the UI.
use chrono::{DateTime, Utc};
use dirigent_protocol::accumulator::{
AccumulatedMessage, AccumulatedPart,
MessageAccumulator as ProtocolAccumulator,
};
#[cfg(test)]
use dirigent_protocol::MessagePart;
use dirigent_protocol::ContentBlock;
use serde_json::Value;
use std::collections::HashMap;
use uuid::Uuid;
use crate::error::Result;
use crate::types::MessageRecord;
// Re-export ToolCallData from the protocol for backward compatibility.
pub use dirigent_protocol::accumulator::ToolCallData;
/// Accumulator for assembling streaming message deltas into [`MessageRecord`]s.
///
/// Wraps the protocol-level [`ProtocolAccumulator`] and adds archivist-specific
/// concerns: per-message metadata, UUID parsing, and markdown generation.
///
/// Chunk, thinking and tool-call operations are forwarded to `inner`; the
/// conversion to a [`MessageRecord`] happens when a message is finalized.
#[derive(Debug, Default)]
pub struct MessageAccumulator {
/// Protocol-level accumulator that owns the per-message buffers.
inner: ProtocolAccumulator,
/// Per-message metadata not tracked by the protocol accumulator.
///
/// Keyed by the raw message id string; `finalize` removes the entry for the
/// message it finalizes and falls back to `Value::Null` when none is stored.
metadata: HashMap<String, Value>,
}
impl MessageAccumulator {
    /// Build an accumulator with no buffered messages and no stored metadata.
    ///
    /// The body cannot fail; the `Result` return type is part of the existing
    /// signature and always carries `Ok`.
    pub fn new() -> Result<Self> {
        let inner = ProtocolAccumulator::new();
        let metadata = HashMap::new();
        Ok(Self { inner, metadata })
    }

    /// Append one content block to the buffer of `message_id`.
    ///
    /// All identifiers are forwarded by reference to the protocol accumulator,
    /// which owns the buffering logic.
    pub fn add_chunk(
        &mut self,
        message_id: String,
        session_id: String,
        connector_id: String,
        role: String,
        content: ContentBlock,
    ) {
        let (message, session, connector) = (&message_id, &session_id, &connector_id);
        self.inner
            .add_chunk(message, session, connector, &role, content);
    }

    /// Append thinking text to the buffer of `message_id`.
    pub fn add_thinking(
        &mut self,
        message_id: String,
        session_id: String,
        connector_id: String,
        content: String,
    ) {
        let (message, session, connector) = (&message_id, &session_id, &connector_id);
        self.inner.add_thinking(message, session, connector, &content);
    }

    /// Record a tool call for `message_id`, or refresh one that already exists.
    ///
    /// Serves both the initial ToolCall event and later ToolCallUpdate events:
    /// the protocol accumulator is asked to add-or-update by tool call id, so
    /// that each id ends up once in the finalized message carrying the latest
    /// input/output.
    pub fn add_or_update_tool_call(&mut self, message_id: String, tool_call: ToolCallData) {
        self.inner.add_or_update_tool_call(&message_id, tool_call);
    }

    /// Legacy entry point; identical in effect to
    /// [`add_or_update_tool_call`](Self::add_or_update_tool_call).
    #[deprecated(note = "Use add_or_update_tool_call instead to avoid duplicates")]
    pub fn add_tool_call(&mut self, message_id: String, tool_call: ToolCallData) {
        self.inner.add_or_update_tool_call(&message_id, tool_call);
    }

    /// Push new input/output for an existing tool call.
    ///
    /// Does nothing when no buffer exists for `message_id`. Otherwise a
    /// [`ToolCallData`] carrying `tool_call_id`, an empty tool name, the given
    /// `input` (or `Value::Null` when absent) and `output` is handed to the
    /// protocol accumulator's add-or-update operation.
    ///
    /// NOTE(review): only buffer existence is checked here, not whether a tool
    /// call with this id is already present. If the protocol accumulator adds
    /// unknown ids, an update arriving before its ToolCall would create an
    /// entry with an empty tool name — confirm against the protocol crate.
    pub fn update_tool_call(
        &mut self,
        message_id: String,
        tool_call_id: &str,
        input: Option<Value>,
        output: Option<Value>,
    ) {
        // Guard first: without a buffer there is nothing to update.
        if !self.inner.has_buffer(&message_id) {
            return;
        }
        let patch = ToolCallData {
            id: tool_call_id.to_owned(),
            // The caller does not know the tool name at this point.
            tool_name: String::new(),
            input: match input {
                Some(value) => value,
                None => Value::Null,
            },
            output,
        };
        self.inner.add_or_update_tool_call(&message_id, patch);
    }

    /// Message ids with an active buffer that belong to `session_id`.
    pub fn get_message_ids_for_session(&self, session_id: &str) -> Vec<String> {
        self.inner.message_ids_for_session(session_id)
    }

    /// Message ids whose buffers the protocol accumulator reports as stale for
    /// the given inactivity `threshold`.
    ///
    /// `_now` is accepted for signature compatibility but is not used; only
    /// `threshold` is forwarded.
    pub fn get_stale_message_ids(
        &self,
        _now: DateTime<Utc>,
        threshold: std::time::Duration,
    ) -> Vec<String> {
        self.inner.stale_message_ids(threshold)
    }

    /// Every message id that currently has an active buffer.
    pub fn get_all_message_ids(&self) -> Vec<String> {
        self.inner.active_message_ids()
    }

    /// Finalize `message_id` into `(MessageRecord, connector_id, native_session_id)`.
    ///
    /// Returns `None` when the protocol accumulator has no buffer for the id;
    /// stored metadata is left untouched in that case. The two strings in the
    /// tuple are the raw connector and session ids carried by the accumulated
    /// message, returned alongside the record so callers can resolve the
    /// canonical session themselves.
    pub fn finalize(&mut self, message_id: &str) -> Option<(MessageRecord, String, String)> {
        let accumulated = self.inner.finalize(message_id)?;

        // Metadata stored for this message is consumed; absent means Null.
        let metadata = match self.metadata.remove(message_id) {
            Some(stored) => stored,
            None => Value::Null,
        };

        // Copy the raw ids out before the accumulated message is consumed.
        let native_session_id = accumulated.session_id.clone();
        let connector_id = accumulated.connector_id.clone();

        Some((
            accumulated_to_record(accumulated, metadata),
            connector_id,
            native_session_id,
        ))
    }
}
// ---------------------------------------------------------------------------
// Conversion helpers
// ---------------------------------------------------------------------------
/// Convert an [`AccumulatedMessage`] into a [`MessageRecord`] for archival.
fn accumulated_to_record(accumulated: AccumulatedMessage, metadata: Value) -> MessageRecord {
// Build content_md by iterating parts in order
let mut content_md = String::new();
for part in &accumulated.parts {
match part {
AccumulatedPart::Text { text } => {
content_md.push_str(text);
}
AccumulatedPart::Thinking { text } => {
content_md.push_str("\n\n<thinking>\n");
content_md.push_str(text);
content_md.push_str("\n</thinking>");
}
AccumulatedPart::Tool { data } => {
content_md.push_str(&format!(
"\n\n**Tool**: {}\n```json\n{}\n```",
data.tool_name,
serde_json::to_string_pretty(&data.input)
.unwrap_or_else(|_| "{}".to_string())
));
}
}
}
// Convert accumulated parts to protocol MessageParts for rich rendering
let message_parts = accumulated.to_message_parts();
// Serialize content_parts for storage (None if empty to save space)
let content_parts = if message_parts.is_empty() {
None
} else {
serde_json::to_value(&message_parts).ok()
};
// Parse UUIDs from strings
// Strip "msg-" prefix if present (ACP connectors use this format)
let message_id_str = accumulated
.message_id
.strip_prefix("msg-")
.unwrap_or(&accumulated.message_id);
if message_id_str != accumulated.message_id.as_str() {
tracing::debug!(
"Stripped 'msg-' prefix from message_id: {} -> {}",
accumulated.message_id,
message_id_str
);
}
let message_uuid = match Uuid::parse_str(message_id_str) {
Ok(uuid) => uuid,
Err(_) => {
tracing::warn!(
"Failed to parse message_id as UUID: {}",
accumulated.message_id
);
Uuid::now_v7()
}
};
// Strip "msg-" prefix from session_id if present (for consistency)
let session_id_str = accumulated
.session_id
.strip_prefix("msg-")
.unwrap_or(&accumulated.session_id);
if session_id_str != accumulated.session_id.as_str() {
tracing::debug!(
"Stripped 'msg-' prefix from session_id: {} -> {}",
accumulated.session_id,
session_id_str
);
}
let session_uuid = match Uuid::parse_str(session_id_str) {
Ok(uuid) => uuid,
Err(_) => {
tracing::warn!(
"Failed to parse session_id as UUID: {}",
accumulated.session_id
);
Uuid::now_v7()
}
};
MessageRecord {
version: 1,
message_id: message_uuid,
session: session_uuid,
parent_id: None,
ts: accumulated.created_at.unwrap_or_else(Utc::now),
role: accumulated.role,
author: None,
content_md,
content_parts,
attachments: Vec::new(),
metadata,
}
}
#[cfg(test)]
mod tests {
use super::*;
/// A freshly constructed accumulator reports no buffered messages.
#[test]
fn test_accumulator_creation() {
    let acc = MessageAccumulator::new().unwrap();
    let buffered = acc.get_all_message_ids();
    assert_eq!(buffered.len(), 0);
}
/// Two back-to-back text chunks for one message end up as a single string.
#[test]
fn test_add_text_chunk() {
    let mut acc = MessageAccumulator::new().unwrap();
    for fragment in ["Hello, ", "world!"] {
        acc.add_chunk(
            "msg_1".to_string(),
            "session_1".to_string(),
            "connector_1".to_string(),
            "user".to_string(),
            ContentBlock::Text {
                text: fragment.to_string(),
            },
        );
    }
    // Finalizing yields the coalesced text.
    let (record, _, _) = acc.finalize("msg_1").unwrap();
    assert_eq!(record.content_md, "Hello, world!");
}
/// Consecutive thinking fragments are joined, and a message that only ever
/// received thinking content is finalized with the "assistant" role.
#[test]
fn test_add_thinking_chunk() {
    let mut acc = MessageAccumulator::new().unwrap();
    for thought in ["Let me think... ", "I need to analyze this."] {
        acc.add_thinking(
            "msg_2".to_string(),
            "session_2".to_string(),
            "connector_1".to_string(),
            thought.to_string(),
        );
    }
    let (record, _, _) = acc.finalize("msg_2").unwrap();
    assert!(record.content_md.contains("Let me think... I need to analyze this."));
    assert_eq!(record.role, "assistant");
}
/// Finalizing a text-only message with UUID-shaped ids keeps the text, the
/// role, and a timestamp that is not in the future.
#[test]
fn test_finalize_text_only() {
    let message_id = "01936e8f-e5a7-7000-8000-000000000001";
    let session_id = "01936e8f-e5a7-7000-8000-000000000002";
    let mut acc = MessageAccumulator::new().unwrap();
    for fragment in ["Hello, ", "world!"] {
        acc.add_chunk(
            message_id.to_string(),
            session_id.to_string(),
            "connector_1".to_string(),
            "user".to_string(),
            ContentBlock::Text {
                text: fragment.to_string(),
            },
        );
    }
    let (record, _, _) = acc.finalize(message_id).unwrap();
    assert_eq!(record.content_md, "Hello, world!");
    assert_eq!(record.role, "user");
    assert!(record.ts <= Utc::now());
}
/// A message containing text followed by thinking renders both, with the
/// thinking wrapped in `<thinking>` tags.
#[test]
fn test_finalize_with_thinking() {
    let message_id = "01936e8f-e5a7-7000-8000-000000000003".to_string();
    let session_id = "01936e8f-e5a7-7000-8000-000000000004".to_string();
    let connector_id = "connector_1".to_string();

    let mut acc = MessageAccumulator::new().unwrap();
    acc.add_chunk(
        message_id.clone(),
        session_id.clone(),
        connector_id.clone(),
        "assistant".to_string(),
        ContentBlock::Text {
            text: "Here's my response.".to_string(),
        },
    );
    acc.add_thinking(
        message_id.clone(),
        session_id,
        connector_id,
        "Let me analyze this carefully.".to_string(),
    );

    let (record, _, _) = acc.finalize(&message_id).unwrap();
    assert!(record.content_md.contains("Here's my response."));
    assert!(record.content_md.contains("<thinking>"));
    assert!(record.content_md.contains("Let me analyze this carefully."));
    assert!(record.content_md.contains("</thinking>"));
}
/// Finalizing an id that was never buffered yields `None`.
#[test]
fn test_finalize_nonexistent_message() {
    let result = MessageAccumulator::new().unwrap().finalize("nonexistent");
    assert!(result.is_none());
}
/// The deprecated `add_tool_call` still records a tool part after the text
/// part, carrying the tool's name.
#[test]
fn test_add_tool_call() {
    let mut acc = MessageAccumulator::new().unwrap();
    let message_id = String::from("msg_tool");

    // A text chunk comes first so that a buffer exists for the message.
    acc.add_chunk(
        message_id.clone(),
        "session_tool".to_string(),
        "connector_1".to_string(),
        "assistant".to_string(),
        ContentBlock::Text {
            text: "I'll use a tool.".to_string(),
        },
    );

    #[allow(deprecated)]
    acc.add_tool_call(
        message_id.clone(),
        ToolCallData {
            id: "call_123".to_string(),
            tool_name: "search".to_string(),
            input: serde_json::json!({"query": "test"}),
            output: Some(serde_json::json!({"results": ["a", "b"]})),
        },
    );

    let (record, _, _) = acc.finalize(&message_id).unwrap();
    let stored_parts = record.content_parts.unwrap();
    let parts = serde_json::from_value::<Vec<MessagePart>>(stored_parts).unwrap();
    assert_eq!(parts.len(), 2); // One Text, one Tool
    assert!(matches!(parts[1], MessagePart::Tool { .. }));
    if let MessagePart::Tool { tool, .. } = &parts[1] {
        assert_eq!(tool, "search");
    }
}
/// The markdown body of a finalized message includes the text, a
/// `**Tool**: <name>` heading, and the tool input.
#[test]
fn test_finalize_with_tool_calls() {
    let message_id = "01936e8f-e5a7-7000-8000-000000000005";
    let session_id = "01936e8f-e5a7-7000-8000-000000000006";
    let mut acc = MessageAccumulator::new().unwrap();

    acc.add_chunk(
        message_id.to_string(),
        session_id.to_string(),
        "connector_1".to_string(),
        "assistant".to_string(),
        ContentBlock::Text {
            text: "Let me search for that.".to_string(),
        },
    );

    // Tool call without output yet.
    #[allow(deprecated)]
    acc.add_tool_call(
        message_id.to_string(),
        ToolCallData {
            id: "call_456".to_string(),
            tool_name: "web_search".to_string(),
            input: serde_json::json!({"query": "Rust async"}),
            output: None,
        },
    );

    let (record, _, _) = acc.finalize(message_id).unwrap();
    assert!(record.content_md.contains("Let me search for that."));
    assert!(record.content_md.contains("**Tool**: web_search"));
    assert!(record.content_md.contains("Rust async"));
}
/// Chunks of two different messages may arrive interleaved; each message
/// keeps its own buffer and finalizes to its own text.
#[test]
fn test_concurrent_messages() {
    let mut acc = MessageAccumulator::new().unwrap();

    // (message id, role, text) in arrival order: A, B, then A again.
    let arrivals = [
        ("msg_a", "user", "Message A"),
        ("msg_b", "assistant", "Message B"),
        ("msg_a", "user", " continued"),
    ];
    for (message, role, text) in arrivals {
        acc.add_chunk(
            message.to_string(),
            "session_1".to_string(),
            "connector_1".to_string(),
            role.to_string(),
            ContentBlock::Text {
                text: text.to_string(),
            },
        );
    }

    // Both messages are buffered side by side.
    assert_eq!(acc.get_all_message_ids().len(), 2);

    let (record_a, _, _) = acc.finalize("msg_a").unwrap();
    assert_eq!(record_a.content_md, "Message A continued");
    let (record_b, _, _) = acc.finalize("msg_b").unwrap();
    assert_eq!(record_b.content_md, "Message B");
}
/// Message ids are reported per session; an unknown session yields nothing.
#[test]
fn test_get_message_ids_for_session() {
    let mut acc = MessageAccumulator::new().unwrap();

    // (message id, session id, role, text): two messages in session_a and
    // one in session_b.
    let fixtures = [
        ("msg_1", "session_a", "user", "Message 1"),
        ("msg_2", "session_a", "assistant", "Message 2"),
        ("msg_3", "session_b", "user", "Message 3"),
    ];
    for (message, session, role, text) in fixtures {
        acc.add_chunk(
            message.to_string(),
            session.to_string(),
            "connector_1".to_string(),
            role.to_string(),
            ContentBlock::Text {
                text: text.to_string(),
            },
        );
    }

    // session_a: order is not guaranteed, so sort before comparing.
    let mut session_a_ids = acc.get_message_ids_for_session("session_a");
    session_a_ids.sort();
    assert_eq!(session_a_ids, vec!["msg_1", "msg_2"]);

    // session_b
    let session_b_ids = acc.get_message_ids_for_session("session_b");
    assert_eq!(session_b_ids, vec!["msg_3"]);

    // Unknown session
    let empty_ids = acc.get_message_ids_for_session("session_c");
    assert!(empty_ids.is_empty());
}
/// Ids carrying the ACP-style "msg-" prefix are parsed to the UUID that
/// follows the prefix instead of being regenerated.
#[test]
fn test_finalize_with_msg_prefix() {
    let uuid_str = "01936e8f-e5a7-7000-8000-000000000007";
    let session_uuid_str = "01936e8f-e5a7-7000-8000-000000000008";
    let prefixed_message_id = format!("msg-{}", uuid_str);
    let prefixed_session_id = format!("msg-{}", session_uuid_str);

    let mut acc = MessageAccumulator::new().unwrap();
    acc.add_chunk(
        prefixed_message_id.clone(),
        prefixed_session_id,
        "connector_1".to_string(),
        "assistant".to_string(),
        ContentBlock::Text {
            text: "Testing msg- prefix handling.".to_string(),
        },
    );

    let (record, _, _) = acc.finalize(&prefixed_message_id).unwrap();
    assert_eq!(record.message_id.to_string(), uuid_str);
    assert_eq!(record.session.to_string(), session_uuid_str);
    assert_eq!(record.content_md, "Testing msg- prefix handling.");
}
/// Plain UUID strings (no "msg-" prefix) are parsed as-is.
#[test]
fn test_finalize_without_msg_prefix() {
    let uuid_str = "01936e8f-e5a7-7000-8000-000000000009";
    let session_uuid_str = "01936e8f-e5a7-7000-8000-00000000000a";

    let mut acc = MessageAccumulator::new().unwrap();
    let chunk = ContentBlock::Text {
        text: "Testing without prefix.".to_string(),
    };
    acc.add_chunk(
        uuid_str.to_string(),
        session_uuid_str.to_string(),
        "connector_1".to_string(),
        "user".to_string(),
        chunk,
    );

    let (record, _, _) = acc.finalize(uuid_str).unwrap();
    assert_eq!(record.message_id.to_string(), uuid_str);
    assert_eq!(record.session.to_string(), session_uuid_str);
    assert_eq!(record.content_md, "Testing without prefix.");
}
/// Text and tool calls that arrive interleaved keep their arrival order both
/// in the markdown body and in the structured parts.
#[test]
fn test_interleaved_tool_calls() {
    let mut acc = MessageAccumulator::new().unwrap();
    let msg_id = "01936e8f-e5a7-7000-8000-000000000010";
    let session_id = "01936e8f-e5a7-7000-8000-000000000011";

    // Local shorthands for the two kinds of events fed into the accumulator.
    let say = |acc: &mut MessageAccumulator, text: &str| {
        acc.add_chunk(
            msg_id.to_string(),
            session_id.to_string(),
            "connector_1".to_string(),
            "assistant".to_string(),
            ContentBlock::Text {
                text: text.to_string(),
            },
        );
    };
    let call = |acc: &mut MessageAccumulator, id: &str, name: &str, input: serde_json::Value| {
        acc.add_or_update_tool_call(
            msg_id.to_string(),
            ToolCallData {
                id: id.to_string(),
                tool_name: name.to_string(),
                input,
                output: None,
            },
        );
    };

    // Arrival order: text1, tool1, text2, tool2, text3.
    say(&mut acc, "Let me search for that. ");
    call(&mut acc, "call_1", "search", serde_json::json!({"query": "rust"}));
    say(&mut acc, "Now let me check the documentation. ");
    call(
        &mut acc,
        "call_2",
        "read_docs",
        serde_json::json!({"path": "README.md"}),
    );
    say(&mut acc, "Based on my research...");

    let (record, _, _) = acc.finalize(msg_id).unwrap();

    // Locate each fragment in the markdown body.
    let content = &record.content_md;
    let search_pos = content.find("**Tool**: search").expect("search tool not found");
    let docs_pos = content
        .find("**Tool**: read_docs")
        .expect("read_docs tool not found");
    let text1_pos = content.find("Let me search").expect("text1 not found");
    let text2_pos = content.find("Now let me check").expect("text2 not found");
    let text3_pos = content.find("Based on my research").expect("text3 not found");

    // Expected order: text1 < search < text2 < read_docs < text3.
    let expected_order = [
        (text1_pos, search_pos, "text1 should come before search tool"),
        (search_pos, text2_pos, "search tool should come before text2"),
        (text2_pos, docs_pos, "text2 should come before read_docs tool"),
        (docs_pos, text3_pos, "read_docs tool should come before text3"),
    ];
    for (earlier, later, why) in expected_order {
        assert!(earlier < later, "{}", why);
    }

    // The structured parts mirror the same sequence.
    let parts =
        serde_json::from_value::<Vec<MessagePart>>(record.content_parts.unwrap()).unwrap();
    assert_eq!(
        parts.len(),
        5,
        "Should have 5 parts: text, tool, text, tool, text"
    );
    assert!(matches!(parts[0], MessagePart::Text { .. }));
    assert!(matches!(parts[1], MessagePart::Tool { .. }));
    assert!(matches!(parts[2], MessagePart::Text { .. }));
    assert!(matches!(parts[3], MessagePart::Tool { .. }));
    assert!(matches!(parts[4], MessagePart::Text { .. }));
}
/// Adjacent text chunks merge into one part, but a tool call between two
/// runs of text keeps those runs as separate parts.
#[test]
fn test_text_coalescing_with_tool_separation() {
    let mut acc = MessageAccumulator::new().unwrap();
    let msg_id = "msg1";
    let session_id = "session1";

    let say = |acc: &mut MessageAccumulator, text: &str| {
        acc.add_chunk(
            msg_id.to_string(),
            session_id.to_string(),
            "connector_1".to_string(),
            "assistant".to_string(),
            ContentBlock::Text {
                text: text.to_string(),
            },
        );
    };

    // First run of text (expected to coalesce).
    for fragment in ["Hello ", "world. "] {
        say(&mut acc, fragment);
    }
    // Tool call splitting the two runs.
    acc.add_or_update_tool_call(
        msg_id.to_string(),
        ToolCallData {
            id: "call_1".to_string(),
            tool_name: "search".to_string(),
            input: serde_json::json!({}),
            output: None,
        },
    );
    // Second run of text (expected to coalesce on its own).
    for fragment in ["More ", "text."] {
        say(&mut acc, fragment);
    }

    let (record, _, _) = acc.finalize(msg_id).unwrap();

    // Expect exactly: coalesced text, tool, coalesced text.
    let parts =
        serde_json::from_value::<Vec<MessagePart>>(record.content_parts.unwrap()).unwrap();
    assert_eq!(parts.len(), 3);

    match &parts[0] {
        MessagePart::Text { text } => assert_eq!(text, "Hello world. "),
        _ => panic!("Expected Text part"),
    }
    assert!(matches!(parts[1], MessagePart::Tool { .. }));
    match &parts[2] {
        MessagePart::Text { text } => assert_eq!(text, "More text."),
        _ => panic!("Expected Text part"),
    }
}
/// Repeated updates for the same tool-call id must merge into one `Tool` part:
/// a later empty input must not clobber an earlier non-empty one, and the
/// output is attached once it arrives.
#[test]
fn test_tool_call_progressive_updates() {
    let mut acc = MessageAccumulator::new().unwrap();
    let msg_id = "msg1";
    let session_id = "session1";

    // A leading text chunk creates the message buffer.
    acc.add_chunk(
        msg_id.to_string(),
        session_id.to_string(),
        "connector_1".to_string(),
        "assistant".to_string(),
        ContentBlock::Text {
            text: "Using grep... ".to_string(),
        },
    );

    // Three successive states of the same call, in arrival order:
    //   1. announced (empty input, no output)
    //   2. real input known
    //   3. output delivered together with an empty input
    let updates = [
        (serde_json::json!({}), None),
        (serde_json::json!({"pattern": "rust"}), None),
        (
            serde_json::json!({}),
            Some(serde_json::json!({"results": ["match1", "match2"]})),
        ),
    ];
    for (input, output) in updates {
        acc.add_or_update_tool_call(
            msg_id.to_string(),
            ToolCallData {
                id: "call_1".to_string(),
                tool_name: "grep".to_string(),
                input,
                output,
            },
        );
    }

    let (record, _, _) = acc.finalize(msg_id).unwrap();
    let raw_parts = record.content_parts.unwrap();
    let parts = serde_json::from_value::<Vec<MessagePart>>(raw_parts).unwrap();

    // One text part plus one tool part merged from the three updates.
    assert_eq!(parts.len(), 2);
    assert!(matches!(parts[0], MessagePart::Text { .. }));

    match &parts[1] {
        MessagePart::Tool {
            tool, input, output, ..
        } => {
            assert_eq!(tool, "grep");
            // The non-empty input from the second update survives the third.
            assert_eq!(input, &serde_json::json!({"pattern": "rust"}));
            assert!(output.is_some());
        }
        _ => panic!("Expected Tool part"),
    }
}
/// Back-to-back thinking chunks must be merged into one `Thinking` part.
#[test]
fn test_thinking_coalescing() {
    let mut acc = MessageAccumulator::new().unwrap();
    let msg_id = "msg1";
    let session_id = "session1";

    for thought in ["First thought. ", "Second thought."] {
        acc.add_thinking(
            msg_id.to_string(),
            session_id.to_string(),
            "connector_1".to_string(),
            thought.to_string(),
        );
    }

    let (record, _, _) = acc.finalize(msg_id).unwrap();
    let raw_parts = record.content_parts.unwrap();
    let parts = serde_json::from_value::<Vec<MessagePart>>(raw_parts).unwrap();

    // Both chunks end up in a single part.
    assert_eq!(parts.len(), 1);
    match &parts[0] {
        MessagePart::Thinking { text } => {
            assert_eq!(text, "First thought. Second thought.")
        }
        _ => panic!("Expected Thinking part"),
    }
}
}
@@ -1,18 +0,0 @@
//! Archive backend capability enumeration.
//!
//! Mandatory session + message primitives are NOT listed here — every
//! backend has them. This enum represents the *optional* sub-traits a
//! backend opts into, surfaced through `ArchiveBackend::as_xxx()` accessors.
use serde::{Deserialize, Serialize};
/// One optional feature an archive backend may advertise.
///
/// Each variant lines up by name with one optional sub-trait of the backend
/// layer; a backend lists the variants it supports in its `CapabilitySet`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
pub enum ArchiveCapability {
/// Content search (the `SearchBackend` sub-trait).
Search,
/// Session parent/child graph storage (the `DagBackend` sub-trait).
Dag,
/// Meta-event storage (the `MetaEventsBackend` sub-trait).
MetaEvents,
/// Connector record storage (the `ConnectorRegistryBackend` sub-trait).
ConnectorRegistry,
/// Native-session-id to scroll-id mapping (the `SessionMappingBackend` sub-trait).
SessionMapping,
}
/// The set of optional capabilities a backend advertises.
pub type CapabilitySet = std::collections::HashSet<ArchiveCapability>;
@@ -1,108 +0,0 @@
//! Reusable sub-trait contract tests.
//!
//! Pass any `&dyn ArchiveBackend` to verify it honors the behavioral
//! contract of each sub-trait it exposes. Phase 2 runs this against
//! `JsonlBackend`; Phase 3+ reuses it for every new backend.
#![cfg(any(test, feature = "test-utils"))]
use uuid::Uuid;
use crate::backend::ArchiveBackend;
/// Checks the empty-state behavior of `ConnectorRegistryBackend`, reached
/// through `as_connector_registry()`.
///
/// Returns without asserting anything when the backend does not expose the
/// sub-trait. The backend must be fresh: the first assertion fails if any
/// connector has already been stored.
pub async fn verify_connector_registry_contract(backend: &dyn ArchiveBackend) {
    let registry = match backend.as_connector_registry() {
        Some(registry) => registry,
        None => return,
    };

    // Listing an empty registry yields an empty Vec rather than an error.
    let connectors = registry.list_connectors().await.expect("list_connectors");
    assert!(
        connectors.is_empty(),
        "fresh backend should have no connectors"
    );

    // Looking up a UID that was never stored is Ok(None).
    let unknown_uid = Uuid::new_v4();
    let lookup = registry
        .get_connector(unknown_uid)
        .await
        .expect("get_connector");
    assert!(lookup.is_none());

    // Resolving an unknown client-native id is Ok(None) as well.
    let resolved = registry
        .resolve_connector_uid("nonexistent@host")
        .await
        .expect("resolve_connector_uid");
    assert!(resolved.is_none());
}
/// Checks the miss behavior of `SessionMappingBackend`, reached through
/// `as_session_mapping()`.
///
/// Returns without asserting anything when the backend does not expose the
/// sub-trait. Both lookups use keys that were never stored and must come back
/// as `Ok(None)`.
pub async fn verify_session_mapping_contract(backend: &dyn ArchiveBackend) {
    let mapping = match backend.as_session_mapping() {
        Some(mapping) => mapping,
        None => return,
    };

    let unknown_connector = Uuid::new_v4();
    let by_key = mapping
        .get_mapping(unknown_connector, "absent")
        .await
        .expect("get_mapping");
    assert!(by_key.is_none());

    let by_native_id = mapping.find_owner("absent").await.expect("find_owner");
    assert!(by_native_id.is_none());
}
/// Checks the miss behavior of `DagBackend`, reached through `as_dag()`.
///
/// Returns without asserting anything when the backend does not expose the
/// sub-trait. Queries for random, never-stored ids must succeed with empty
/// results.
pub async fn verify_dag_contract(backend: &dyn ArchiveBackend) {
    let dag = match backend.as_dag() {
        Some(dag) => dag,
        None => return,
    };

    let unknown_parent = Uuid::new_v4();
    let children = dag
        .get_children(unknown_parent)
        .await
        .expect("get_children");
    assert!(children.is_empty());

    let unknown_root = Uuid::new_v4();
    let edges = dag
        .get_dag_edges(unknown_root)
        .await
        .expect("get_dag_edges");
    assert!(edges.is_empty());
}
/// Checks the empty-state behavior of `MetaEventsBackend`, reached through
/// `as_meta_events()`.
///
/// Returns without asserting anything when the backend does not expose the
/// sub-trait. The backend must not contain any meta sessions yet: the final
/// assertion requires `list_meta_sessions` to be empty.
pub async fn verify_meta_events_contract(backend: &dyn ArchiveBackend) {
    let meta = match backend.as_meta_events() {
        Some(meta) => meta,
        None => return,
    };

    // Events for a never-seen session: empty, not an error.
    let unknown_session = Uuid::new_v4();
    let events = meta
        .get_meta_events(unknown_session)
        .await
        .expect("get_meta_events");
    assert!(events.is_empty());

    // Lookup by an unknown client id: Ok(None).
    let by_client = meta
        .find_meta_session_by_client("absent")
        .await
        .expect("find_meta_session_by_client");
    assert!(by_client.is_none());

    // No meta sessions exist at all.
    let all_sessions = meta
        .list_meta_sessions()
        .await
        .expect("list_meta_sessions");
    assert!(all_sessions.is_empty());
}
/// One-shot helper: runs every sub-trait contract whose capability is present.
///
/// The individual checks run sequentially in the order below; each one returns
/// early on its own when the backend does not expose the matching sub-trait.
/// A violated contract panics through the failing assertion or `expect`.
/// Several checks assert empty state, so pass a freshly created backend.
pub async fn verify_all_contracts(backend: &dyn ArchiveBackend) {
verify_connector_registry_contract(backend).await;
verify_session_mapping_contract(backend).await;
verify_dag_contract(backend).await;
verify_meta_events_contract(backend).await;
}
@@ -1,10 +0,0 @@
//! Health status reported by `ArchiveBackend::health_check`.
use serde::{Deserialize, Serialize};
/// Outcome of a backend health probe.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub enum HealthStatus {
/// The backend reported no problem.
Healthy,
/// The backend reported a problem, described by `reason`.
/// NOTE(review): presumably the backend is still usable in this state; confirm with callers.
Degraded { reason: String },
/// The backend cannot be used; `reason` says why (for example, `JsonlBackend`
/// reports this when its archive root is missing or is not a directory).
Unavailable { reason: String },
}
@@ -1,574 +0,0 @@
//! In-memory `ArchiveBackend` for coordinator unit tests.
//!
//! Fully supports every sub-trait. State lives in `Mutex<HashMap<…>>`.
#![cfg(any(test, feature = "test-utils"))]
use std::collections::HashMap;
use std::sync::Mutex;
use async_trait::async_trait;
use uuid::Uuid;
use crate::backend::{
ArchiveBackend, ArchiveCapability, CapabilitySet, ConnectorRegistryBackend,
DagBackend, HealthStatus, MetaEventsBackend, SessionMappingBackend,
};
use crate::error::{ArchivistError, Result};
use crate::types::{
ConnectorRecord, DagEdge, MessageCursor, MessagePage, MessageRecord,
MetaEventRecord, SessionListQuery, SessionMapping, SessionMetadata, SessionPage,
};
/// In-memory archive backend used by tests.
///
/// All state sits behind `std::sync::Mutex`es so the backend can be driven
/// through `&self`. The trailing fields are test knobs for failure injection,
/// call counting, and artificial latency.
pub struct MockBackend {
    /// Capabilities reported by `capabilities()`; also gates the `as_xxx()` accessors.
    capabilities: CapabilitySet,
    /// Session metadata keyed by scroll id.
    sessions: Mutex<HashMap<Uuid, SessionMetadata>>,
    /// Messages per scroll id, in append order.
    messages: Mutex<HashMap<Uuid, Vec<MessageRecord>>>,
    /// Connector records keyed by connector uid.
    connectors: Mutex<HashMap<Uuid, ConnectorRecord>>,
    /// `(connector_uid, native_session_id)` -> scroll id.
    mappings: Mutex<HashMap<(Uuid, String), Uuid>>,
    /// Meta events per scroll id, in append order.
    meta_events: Mutex<HashMap<Uuid, Vec<MetaEventRecord>>>,
    /// Every DAG edge ever appended, in append order.
    dag_edges: Mutex<Vec<DagEdge>>,
    /// Number of upcoming write operations that should fail.
    fail_next_writes: std::sync::atomic::AtomicUsize,
    /// Number of upcoming read operations that should fail.
    fail_next_reads: std::sync::atomic::AtomicUsize,
    /// When set, every checked read and write fails with this reason.
    permanent_error: Mutex<Option<String>>,
    /// How many times `append_messages` was invoked per scroll id.
    append_calls: Mutex<HashMap<Uuid, usize>>,
    /// Sleep applied before each mutating operation touches state.
    per_op_delay: Mutex<std::time::Duration>,
}
impl MockBackend {
pub fn new() -> Self {
let mut capabilities = CapabilitySet::new();
capabilities.insert(ArchiveCapability::Dag);
capabilities.insert(ArchiveCapability::MetaEvents);
capabilities.insert(ArchiveCapability::ConnectorRegistry);
capabilities.insert(ArchiveCapability::SessionMapping);
Self {
capabilities,
sessions: Mutex::new(HashMap::new()),
messages: Mutex::new(HashMap::new()),
connectors: Mutex::new(HashMap::new()),
mappings: Mutex::new(HashMap::new()),
meta_events: Mutex::new(HashMap::new()),
dag_edges: Mutex::new(Vec::new()),
fail_next_writes: std::sync::atomic::AtomicUsize::new(0),
fail_next_reads: std::sync::atomic::AtomicUsize::new(0),
permanent_error: std::sync::Mutex::new(None),
append_calls: std::sync::Mutex::new(std::collections::HashMap::new()),
per_op_delay: std::sync::Mutex::new(std::time::Duration::ZERO),
}
}
}
impl MockBackend {
    /// Build a mock with the exact capability set provided. All other state
    /// starts empty (same as `new()`).
    pub fn with_capabilities(capabilities: CapabilitySet) -> Self {
        Self {
            capabilities,
            ..Self::new()
        }
    }

    /// Test helper: does this mock have any meta events for the given session?
    pub fn has_meta_events(&self, scroll_id: uuid::Uuid) -> bool {
        self.meta_events
            .lock()
            .unwrap()
            .get(&scroll_id)
            .map_or(false, |events| !events.is_empty())
    }

    /// Queue up `count` injected write failures. The next `count` calls to
    /// any mutating API return `ArchivistError::Other("injected write failure")`
    /// before touching state. Replaces any previously queued count.
    pub fn inject_write_failures(&self, count: usize) {
        self.fail_next_writes
            .store(count, std::sync::atomic::Ordering::SeqCst);
    }

    /// Queue up `count` injected read failures for per-scroll_id reads.
    /// Replaces any previously queued count.
    pub fn inject_read_failures(&self, count: usize) {
        self.fail_next_reads
            .store(count, std::sync::atomic::Ordering::SeqCst);
    }

    /// Simulate a permanently broken backend: every checked read and write
    /// fails with `reason` until `clear_failures` is called.
    pub fn break_permanently(&self, reason: impl Into<String>) {
        *self.permanent_error.lock().unwrap() = Some(reason.into());
    }

    /// Drop all queued read/write failures and lift a permanent error.
    pub fn clear_failures(&self) {
        self.fail_next_writes
            .store(0, std::sync::atomic::Ordering::SeqCst);
        self.fail_next_reads
            .store(0, std::sync::atomic::Ordering::SeqCst);
        *self.permanent_error.lock().unwrap() = None;
    }

    /// Test helper: how many `MessageRecord`s this mock has for the given session.
    pub fn appended_count(&self, scroll_id: uuid::Uuid) -> usize {
        self.messages
            .lock()
            .unwrap()
            .get(&scroll_id)
            .map_or(0, Vec::len)
    }

    /// Test helper: how many times `append_messages` was invoked for the
    /// given session (regardless of message count per invocation).
    pub fn append_call_count(&self, scroll_id: uuid::Uuid) -> usize {
        self.append_calls
            .lock()
            .unwrap()
            .get(&scroll_id)
            .copied()
            .unwrap_or(0)
    }

    /// Test helper: artificially slow every mutating backend operation by
    /// sleeping `d` before it touches state. Used to simulate a slow backend
    /// for backpressure tests.
    pub fn set_per_op_delay(&self, d: std::time::Duration) {
        *self.per_op_delay.lock().unwrap() = d;
    }

    /// Sleeps for the configured per-operation delay, if it is non-zero.
    async fn maybe_delay(&self) {
        // Copy the duration out so the mutex guard is released before awaiting.
        let d = *self.per_op_delay.lock().unwrap();
        if !d.is_zero() {
            tokio::time::sleep(d).await;
        }
    }

    /// Shared failure-injection check behind `check_write_failure` and
    /// `check_read_failure`.
    ///
    /// A permanent error always wins and does not consume a queued failure.
    /// Otherwise `remaining` is atomically decremented when positive, and the
    /// call fails with `message`; when it is already zero the call succeeds.
    fn consume_injected_failure(
        &self,
        remaining: &std::sync::atomic::AtomicUsize,
        message: &str,
    ) -> Result<()> {
        use std::sync::atomic::Ordering::SeqCst;

        if let Some(reason) = self.permanent_error.lock().unwrap().clone() {
            return Err(ArchivistError::Other(reason));
        }
        // `checked_sub` yields `None` at zero, which makes `fetch_update`
        // leave the counter untouched and report `Err`.
        let consumed = remaining
            .fetch_update(SeqCst, SeqCst, |n| n.checked_sub(1))
            .is_ok();
        if consumed {
            return Err(ArchivistError::Other(message.into()));
        }
        Ok(())
    }

    /// Fails if the backend is permanently broken or a write failure is queued
    /// (consuming one queued failure); otherwise returns `Ok(())`.
    pub(crate) fn check_write_failure(&self) -> Result<()> {
        self.consume_injected_failure(&self.fail_next_writes, "injected write failure")
    }

    /// Fails if the backend is permanently broken or a read failure is queued
    /// (consuming one queued failure); otherwise returns `Ok(())`.
    pub(crate) fn check_read_failure(&self) -> Result<()> {
        self.consume_injected_failure(&self.fail_next_reads, "injected read failure")
    }
}
/// `MockBackend::default()` is identical to `MockBackend::new()`.
impl Default for MockBackend {
fn default() -> Self {
Self::new()
}
}
/// Mandatory backend surface, served from the in-memory maps.
///
/// Mutating operations first run the write-failure check and then the
/// configured per-operation delay. Per-scroll reads run the read-failure
/// check; `list_sessions_paged` runs neither.
#[async_trait]
impl ArchiveBackend for MockBackend {
    fn capabilities(&self) -> &CapabilitySet {
        &self.capabilities
    }

    /// The mock always reports itself healthy, even after `break_permanently`.
    async fn health_check(&self) -> HealthStatus {
        HealthStatus::Healthy
    }

    /// Inserts or replaces the session stored under `meta.scroll_id`.
    async fn put_session(&self, meta: SessionMetadata) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        let key = meta.scroll_id;
        self.sessions.lock().unwrap().insert(key, meta);
        Ok(())
    }

    async fn get_session(&self, scroll_id: Uuid) -> Result<Option<SessionMetadata>> {
        self.check_read_failure()?;
        let sessions = self.sessions.lock().unwrap();
        Ok(sessions.get(&scroll_id).cloned())
    }

    /// Returns the first page only: sessions filtered by connector, newest
    /// first (ties broken by descending scroll id), truncated to the clamped
    /// limit. `next_cursor` is always `None`; `total_count` is the number of
    /// matches before truncation.
    async fn list_sessions_paged(&self, query: SessionListQuery) -> Result<SessionPage> {
        let mut matching: Vec<SessionMetadata> = self
            .sessions
            .lock()
            .unwrap()
            .values()
            .filter(|s| {
                query.connector_uids.is_empty()
                    || query.connector_uids.contains(&s.connector_uid)
            })
            .cloned()
            .collect();
        matching.sort_by(|a, b| {
            b.updated_at
                .cmp(&a.updated_at)
                .then_with(|| b.scroll_id.cmp(&a.scroll_id))
        });

        let total_count = matching.len();
        let limit = std::cmp::max(
            1,
            std::cmp::min(query.limit, crate::types::MAX_PAGE_LIMIT),
        );
        matching.truncate(limit);

        Ok(SessionPage {
            items: matching,
            next_cursor: None,
            total_count: Some(total_count),
        })
    }

    /// Removes the session and its messages; errors with `SessionUnknown`
    /// when the session does not exist.
    async fn delete_session(&self, scroll_id: Uuid) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        let removed = self.sessions.lock().unwrap().remove(&scroll_id);
        match removed {
            None => Err(ArchivistError::SessionUnknown(scroll_id)),
            Some(_) => {
                self.messages.lock().unwrap().remove(&scroll_id);
                Ok(())
            }
        }
    }

    /// Appends `msgs` to the session's message list and bumps its call counter.
    async fn append_messages(
        &self,
        scroll_id: Uuid,
        msgs: Vec<MessageRecord>,
    ) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        {
            let mut calls = self.append_calls.lock().unwrap();
            *calls.entry(scroll_id).or_insert(0) += 1;
        }
        let mut store = self.messages.lock().unwrap();
        store.entry(scroll_id).or_default().extend(msgs);
        Ok(())
    }

    /// Returns up to `max(limit, 1)` messages ordered by `(ts, message_id)`,
    /// strictly after `cursor` when one is given. `next_cursor` points at the
    /// last returned message when more messages remain, else it is `None`.
    async fn get_messages_paged(
        &self,
        scroll_id: Uuid,
        cursor: Option<MessageCursor>,
        limit: usize,
    ) -> Result<MessagePage> {
        self.check_read_failure()?;
        let mut page: Vec<MessageRecord> = self
            .messages
            .lock()
            .unwrap()
            .get(&scroll_id)
            .cloned()
            .unwrap_or_default();
        page.sort_by(|a, b| {
            a.ts.cmp(&b.ts)
                .then_with(|| a.message_id.cmp(&b.message_id))
        });
        if let Some(after) = &cursor {
            page.retain(|m| (m.ts, m.message_id) > (after.ts, after.message_id));
        }

        let available = page.len();
        page.truncate(limit.max(1));
        let has_more = available > page.len();
        let next_cursor = page.last().filter(|_| has_more).map(|m| MessageCursor {
            ts: m.ts,
            message_id: m.message_id,
        });

        Ok(MessagePage {
            items: page,
            next_cursor,
        })
    }

    async fn count_messages(&self, scroll_id: Uuid) -> Result<usize> {
        self.check_read_failure()?;
        let messages = self.messages.lock().unwrap();
        Ok(messages.get(&scroll_id).map_or(0, Vec::len))
    }

    /// Drops every message of the session; the session itself is kept.
    async fn clear_session_messages(&self, scroll_id: Uuid) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        self.messages.lock().unwrap().remove(&scroll_id);
        Ok(())
    }

    // The accessors below hand out `self` only when the matching capability
    // is part of the configured capability set.

    fn as_dag(&self) -> Option<&dyn DagBackend> {
        if !self.capabilities.contains(&ArchiveCapability::Dag) {
            return None;
        }
        Some(self)
    }

    fn as_meta_events(&self) -> Option<&dyn MetaEventsBackend> {
        if !self.capabilities.contains(&ArchiveCapability::MetaEvents) {
            return None;
        }
        Some(self)
    }

    fn as_connector_registry(&self) -> Option<&dyn ConnectorRegistryBackend> {
        if !self
            .capabilities
            .contains(&ArchiveCapability::ConnectorRegistry)
        {
            return None;
        }
        Some(self)
    }

    fn as_session_mapping(&self) -> Option<&dyn SessionMappingBackend> {
        if !self
            .capabilities
            .contains(&ArchiveCapability::SessionMapping)
        {
            return None;
        }
        Some(self)
    }
}
/// Connector registry served from the in-memory `connectors` map.
///
/// Only the two mutating methods consult the failure-injection and delay
/// knobs; the lookups always succeed.
#[async_trait]
impl ConnectorRegistryBackend for MockBackend {
    /// Inserts or replaces the record stored under `record.connector_uid`.
    async fn put_connector(&self, record: ConnectorRecord) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        let uid = record.connector_uid;
        self.connectors.lock().unwrap().insert(uid, record);
        Ok(())
    }

    async fn get_connector(&self, connector_uid: Uuid) -> Result<Option<ConnectorRecord>> {
        let connectors = self.connectors.lock().unwrap();
        Ok(connectors.get(&connector_uid).cloned())
    }

    /// Returns every stored record, in the map's (unspecified) iteration order.
    async fn list_connectors(&self) -> Result<Vec<ConnectorRecord>> {
        let connectors = self.connectors.lock().unwrap();
        Ok(connectors.values().cloned().collect())
    }

    /// Finds the uid of a connector whose `client_native_id` equals the argument.
    async fn resolve_connector_uid(&self, client_native_id: &str) -> Result<Option<Uuid>> {
        let connectors = self.connectors.lock().unwrap();
        let uid = connectors
            .values()
            .find_map(|c| (c.client_native_id == client_native_id).then(|| c.connector_uid));
        Ok(uid)
    }

    /// Overwrites the fingerprint of an existing connector; errors with
    /// `ConnectorUnknown` when the uid is not registered.
    async fn update_connector_fingerprint(
        &self,
        connector_uid: Uuid,
        fingerprint: String,
    ) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        let mut connectors = self.connectors.lock().unwrap();
        match connectors.get_mut(&connector_uid) {
            Some(record) => {
                record.fingerprint = Some(fingerprint);
                Ok(())
            }
            None => Err(ArchivistError::ConnectorUnknown(connector_uid)),
        }
    }
}
/// Session mapping served from the in-memory `mappings` table, keyed by
/// `(connector_uid, native_session_id)`.
///
/// Only the mutating methods consult the failure-injection and delay knobs.
#[async_trait]
impl SessionMappingBackend for MockBackend {
    /// Inserts or replaces the mapping for the given key.
    async fn put_mapping(
        &self,
        connector_uid: Uuid,
        native_session_id: &str,
        scroll_id: Uuid,
    ) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        let key = (connector_uid, native_session_id.to_string());
        self.mappings.lock().unwrap().insert(key, scroll_id);
        Ok(())
    }

    async fn get_mapping(
        &self,
        connector_uid: Uuid,
        native_session_id: &str,
    ) -> Result<Option<Uuid>> {
        let key = (connector_uid, native_session_id.to_string());
        let table = self.mappings.lock().unwrap();
        Ok(table.get(&key).copied())
    }

    /// Materializes one `SessionMapping` per stored row of the connector.
    ///
    /// The mock does not store creation times, so `created_at` is the time of
    /// this call and `alias_of` is always `None`.
    async fn list_mappings_for_connector(
        &self,
        connector_uid: Uuid,
    ) -> Result<Vec<SessionMapping>> {
        let table = self.mappings.lock().unwrap();
        let mut rows = Vec::new();
        for ((owner, native_id), scroll_id) in table.iter() {
            if *owner != connector_uid {
                continue;
            }
            rows.push(SessionMapping {
                version: 1,
                connector_uid: *owner,
                native_session_id: native_id.clone(),
                scroll_id: *scroll_id,
                created_at: chrono::Utc::now(),
                alias_of: None,
            });
        }
        Ok(rows)
    }

    /// Returns `(connector_uid, scroll_id)` of a row with the given native
    /// session id, searching across all connectors.
    async fn find_owner(&self, native_session_id: &str) -> Result<Option<(Uuid, Uuid)>> {
        let table = self.mappings.lock().unwrap();
        let owner = table.iter().find_map(|((connector, native_id), scroll_id)| {
            (native_id == native_session_id).then(|| (*connector, *scroll_id))
        });
        Ok(owner)
    }

    /// Drops every row of `connector_uid`, then stores `mappings` under that
    /// connector (the `connector_uid` field inside each mapping is ignored).
    async fn rewrite_connector_mappings(
        &self,
        connector_uid: Uuid,
        mappings: Vec<SessionMapping>,
    ) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        let mut table = self.mappings.lock().unwrap();
        table.retain(|(owner, _), _| *owner != connector_uid);
        table.extend(
            mappings
                .into_iter()
                .map(|m| ((connector_uid, m.native_session_id), m.scroll_id)),
        );
        Ok(())
    }
}
/// DAG storage served from the in-memory edge list.
#[async_trait]
impl DagBackend for MockBackend {
    /// Appends the edge; duplicates are not detected.
    async fn append_dag_edge(&self, edge: DagEdge) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        let mut edges = self.dag_edges.lock().unwrap();
        edges.push(edge);
        Ok(())
    }

    /// Returns the metadata of every direct child of `parent`, in edge order.
    /// Children without a stored session are skipped.
    async fn get_children(&self, parent: Uuid) -> Result<Vec<SessionMetadata>> {
        self.check_read_failure()?;
        let edges = self.dag_edges.lock().unwrap();
        let sessions = self.sessions.lock().unwrap();
        let mut children = Vec::new();
        for edge in edges.iter() {
            if edge.parent != parent {
                continue;
            }
            if let Some(child) = sessions.get(&edge.child) {
                children.push(child.clone());
            }
        }
        Ok(children)
    }

    /// Returns the edges whose parent is `root` (direct edges only).
    async fn get_dag_edges(&self, root: Uuid) -> Result<Vec<DagEdge>> {
        self.check_read_failure()?;
        let edges = self.dag_edges.lock().unwrap();
        let mut outgoing = Vec::new();
        for edge in edges.iter() {
            if edge.parent == root {
                outgoing.push(edge.clone());
            }
        }
        Ok(outgoing)
    }
}
/// Meta-event storage served from the in-memory maps.
///
/// Session status lives in the shared `sessions` map; events are kept per
/// scroll id in `meta_events`.
#[async_trait]
impl MetaEventsBackend for MockBackend {
    /// Appends `events` to the list kept for `scroll_id`.
    async fn append_meta_events(
        &self,
        scroll_id: Uuid,
        events: Vec<MetaEventRecord>,
    ) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        let mut store = self.meta_events.lock().unwrap();
        store.entry(scroll_id).or_default().extend(events);
        Ok(())
    }

    /// Returns the events of `scroll_id` in append order (empty when unknown).
    async fn get_meta_events(&self, scroll_id: Uuid) -> Result<Vec<MetaEventRecord>> {
        self.check_read_failure()?;
        let store = self.meta_events.lock().unwrap();
        Ok(store.get(&scroll_id).cloned().unwrap_or_default())
    }

    /// Updates the connection status fields of an existing session; errors
    /// with `SessionUnknown` when the session does not exist.
    async fn update_meta_session_status(
        &self,
        scroll_id: Uuid,
        is_connected: bool,
        current_session_id: Option<Uuid>,
    ) -> Result<()> {
        self.check_write_failure()?;
        self.maybe_delay().await;
        let mut sessions = self.sessions.lock().unwrap();
        match sessions.get_mut(&scroll_id) {
            Some(session) => {
                session.is_connected = Some(is_connected);
                session.current_session_id = current_session_id;
                Ok(())
            }
            None => Err(ArchivistError::SessionUnknown(scroll_id)),
        }
    }

    /// Returns every stored session whose kind is `AcpConnection`.
    async fn list_meta_sessions(&self) -> Result<Vec<SessionMetadata>> {
        let sessions = self.sessions.lock().unwrap();
        let mut meta_sessions = Vec::new();
        for session in sessions.values() {
            if matches!(session.kind, crate::types::SessionKind::AcpConnection) {
                meta_sessions.push(session.clone());
            }
        }
        Ok(meta_sessions)
    }

    /// Returns a session whose `acp_client_id` equals `client_id`, if any.
    async fn find_meta_session_by_client(
        &self,
        client_id: &str,
    ) -> Result<Option<SessionMetadata>> {
        let sessions = self.sessions.lock().unwrap();
        let hit = sessions
            .values()
            .find(|session| session.acp_client_id.as_deref() == Some(client_id));
        Ok(hit.cloned())
    }
}
#[cfg(test)]
mod failure_injection_tests {
    use super::*;

    /// Two queued write failures make exactly the next two writes fail; the
    /// third write succeeds again.
    #[tokio::test]
    async fn injected_write_failure_returns_error_then_recovers() {
        let backend = MockBackend::new();
        let scroll = uuid::Uuid::nil();
        backend.inject_write_failures(2);

        for _ in 0..2 {
            assert!(backend.append_messages(scroll, vec![]).await.is_err());
        }
        // The queue is drained, so the backend is back to normal.
        assert!(backend.append_messages(scroll, vec![]).await.is_ok());
    }
}
@@ -1,20 +0,0 @@
//! Archive backend trait layer.
//!
//! See `docs/plans/2026-04-18-archivist-phase2-design.md` for the design.
pub mod capability;
pub mod health;
pub mod traits;
#[cfg(any(test, feature = "test-utils"))]
pub mod contract;
#[cfg(any(test, feature = "test-utils"))]
pub mod mock;
pub use capability::{ArchiveCapability, CapabilitySet};
pub use health::HealthStatus;
pub use traits::{
ArchiveBackend, ConnectorRegistryBackend, DagBackend, MetaEventsBackend,
SearchBackend, SessionMappingBackend,
};
@@ -1,167 +0,0 @@
//! Archive backend trait definitions.
//!
//! `ArchiveBackend` is mandatory for every backend: session + message
//! primitives plus self-description (capabilities, health). Optional
//! sub-traits (`SearchBackend`, `DagBackend`, `MetaEventsBackend`,
//! `ConnectorRegistryBackend`, `SessionMappingBackend`) are surfaced
//! via `as_xxx() -> Option<&dyn SubTrait>` accessors returning a
//! borrow from `self`.
//!
//! See `docs/plans/2026-04-18-archivist-phase2-design.md` §Trait Definitions.
use async_trait::async_trait;
use uuid::Uuid;
use crate::backend::capability::CapabilitySet;
use crate::backend::health::HealthStatus;
use crate::error::Result;
use crate::types::{
ConnectorRecord, DagEdge, MessageCursor, MessagePage, MessageRecord,
MetaEventRecord, SessionListQuery, SessionMapping, SessionMetadata,
SessionPage,
};
// ---------------------------------------------------------------------------
// Mandatory backend surface
// ---------------------------------------------------------------------------
/// An archive storage backend.
///
/// All backends must implement session metadata and message primitives;
/// optional capabilities are exposed through `as_xxx()` accessors that
/// return `None` when unsupported. `JsonlBackend` implements every
/// sub-trait except `SearchBackend`.
///
/// Sessions are addressed by their `scroll_id` throughout this trait.
#[async_trait]
pub trait ArchiveBackend: Send + Sync {
// --- Self-description ---
/// The optional capabilities this backend advertises.
fn capabilities(&self) -> &CapabilitySet;
/// Probes the backend and reports its current health. Infallible by
/// signature: problems are expressed through the returned status.
async fn health_check(&self) -> HealthStatus;
// --- Session metadata ---
/// Stores `meta` under `meta.scroll_id`.
/// NOTE(review): presumably an upsert (the in-memory mock replaces an existing entry) — confirm per backend.
async fn put_session(&self, meta: SessionMetadata) -> Result<()>;
/// Fetches the metadata of one session; `Ok(None)` when it is not stored.
async fn get_session(&self, scroll_id: Uuid) -> Result<Option<SessionMetadata>>;
/// Lists sessions matching `query`, one page at a time.
async fn list_sessions_paged(&self, query: SessionListQuery) -> Result<SessionPage>;
/// Deletes one session.
/// NOTE(review): the in-memory mock also drops the session's messages and
/// errors with `SessionUnknown` for a missing session — confirm this is the
/// intended contract for all backends.
async fn delete_session(&self, scroll_id: Uuid) -> Result<()>;
// --- Messages ---
/// Appends `messages` to the session identified by `scroll_id`.
async fn append_messages(
&self,
scroll_id: Uuid,
messages: Vec<MessageRecord>,
) -> Result<()>;
/// Reads up to `limit` messages of a session, continuing after `cursor`
/// when one is given. The returned page carries the cursor for the next
/// call, if any.
async fn get_messages_paged(
&self,
scroll_id: Uuid,
cursor: Option<MessageCursor>,
limit: usize,
) -> Result<MessagePage>;
/// Number of messages stored for the session.
async fn count_messages(&self, scroll_id: Uuid) -> Result<usize>;
/// Removes all messages of the session.
async fn clear_session_messages(&self, scroll_id: Uuid) -> Result<()>;
// --- Optional capability accessors ---
// Each accessor defaults to `None`; a backend overrides the ones it
// supports and returns a borrow of itself as the sub-trait object.
/// Content search sub-trait, if supported.
fn as_search(&self) -> Option<&dyn SearchBackend> {
None
}
/// Session DAG sub-trait, if supported.
fn as_dag(&self) -> Option<&dyn DagBackend> {
None
}
/// Meta-events sub-trait, if supported.
fn as_meta_events(&self) -> Option<&dyn MetaEventsBackend> {
None
}
/// Connector registry sub-trait, if supported.
fn as_connector_registry(&self) -> Option<&dyn ConnectorRegistryBackend> {
None
}
/// Session mapping sub-trait, if supported.
fn as_session_mapping(&self) -> Option<&dyn SessionMappingBackend> {
None
}
}
// ---------------------------------------------------------------------------
// Optional sub-traits
// ---------------------------------------------------------------------------
/// Content search. Reserved in Phase 2; not wired to `JsonlBackend`.
///
/// `packages/api/src/archivist/search_task.rs` continues to serve content
/// search via ripgrep — this trait exists as a forward-compatible hook for
/// indexed backends (ChromaDB, tantivy, …) arriving in Phase 3+.
///
/// Backends expose it through `ArchiveBackend::as_search`, which returns
/// `None` unless overridden.
#[async_trait]
pub trait SearchBackend: Send + Sync {
// Deliberately left without methods; Phase 3 adds the concrete
// query/result shapes when a real indexed backend lands.
}
/// Optional storage for parent/child relationships between sessions.
///
/// Exposed through `ArchiveBackend::as_dag`.
#[async_trait]
pub trait DagBackend: Send + Sync {
/// Records one edge of the session graph.
async fn append_dag_edge(&self, edge: DagEdge) -> Result<()>;
/// Metadata of the children of `parent`. An unknown parent yields an empty
/// list, not an error (enforced by the contract tests).
async fn get_children(&self, parent: Uuid) -> Result<Vec<SessionMetadata>>;
/// Edges belonging to `root`. An unknown root yields an empty list, not an
/// error (enforced by the contract tests).
/// NOTE(review): the in-memory mock returns only edges whose parent is
/// `root`; whether real backends should return the whole subtree is not
/// visible here — confirm.
async fn get_dag_edges(&self, root: Uuid) -> Result<Vec<DagEdge>>;
}
/// Optional storage for meta events and meta-session status.
///
/// Exposed through `ArchiveBackend::as_meta_events`.
#[async_trait]
pub trait MetaEventsBackend: Send + Sync {
/// Appends `events` to the session identified by `scroll_id`.
async fn append_meta_events(
&self,
scroll_id: Uuid,
events: Vec<MetaEventRecord>,
) -> Result<()>;
/// Events stored for the session. An unknown session yields an empty list,
/// not an error (enforced by the contract tests).
async fn get_meta_events(&self, scroll_id: Uuid) -> Result<Vec<MetaEventRecord>>;
/// Updates the connection status and current session id of a meta session.
async fn update_meta_session_status(
&self,
scroll_id: Uuid,
is_connected: bool,
current_session_id: Option<Uuid>,
) -> Result<()>;
/// All meta sessions; empty on a fresh backend (enforced by the contract tests).
async fn list_meta_sessions(&self) -> Result<Vec<SessionMetadata>>;
/// Meta session belonging to `client_id`; `Ok(None)` when there is none
/// (enforced by the contract tests).
async fn find_meta_session_by_client(
&self,
client_id: &str,
) -> Result<Option<SessionMetadata>>;
}
/// Optional storage for connector records.
///
/// Exposed through `ArchiveBackend::as_connector_registry`.
#[async_trait]
pub trait ConnectorRegistryBackend: Send + Sync {
/// Stores a connector record.
async fn put_connector(&self, record: ConnectorRecord) -> Result<()>;
/// Looks up a connector by uid; `Ok(None)` when it is not registered
/// (enforced by the contract tests).
async fn get_connector(&self, connector_uid: Uuid) -> Result<Option<ConnectorRecord>>;
/// All registered connectors; an empty registry yields an empty list, not
/// an error (enforced by the contract tests).
async fn list_connectors(&self) -> Result<Vec<ConnectorRecord>>;
/// Maps a client-native connector id to its uid; `Ok(None)` when unknown
/// (enforced by the contract tests).
async fn resolve_connector_uid(&self, client_native_id: &str) -> Result<Option<Uuid>>;
/// Replaces the fingerprint stored for a connector.
async fn update_connector_fingerprint(
&self,
connector_uid: Uuid,
fingerprint: String,
) -> Result<()>;
}
/// Optional storage for the mapping from a connector's native session id to
/// the archive's scroll id.
///
/// Exposed through `ArchiveBackend::as_session_mapping`.
#[async_trait]
pub trait SessionMappingBackend: Send + Sync {
/// Stores the mapping `(connector_uid, native_session_id) -> scroll_id`.
async fn put_mapping(
&self,
connector_uid: Uuid,
native_session_id: &str,
scroll_id: Uuid,
) -> Result<()>;
/// Looks up the scroll id for a native session of one connector; `Ok(None)`
/// when no mapping exists (enforced by the contract tests).
async fn get_mapping(
&self,
connector_uid: Uuid,
native_session_id: &str,
) -> Result<Option<Uuid>>;
/// All mappings stored for one connector.
async fn list_mappings_for_connector(
&self,
connector_uid: Uuid,
) -> Result<Vec<SessionMapping>>;
/// Finds which connector owns `native_session_id`, across all connectors;
/// `Ok(None)` when there is no owner (enforced by the contract tests).
/// NOTE(review): the in-memory mock returns `(connector_uid, scroll_id)` —
/// confirm the tuple order for other backends.
async fn find_owner(&self, native_session_id: &str) -> Result<Option<(Uuid, Uuid)>>;
/// Replace the entire mapping table for `connector_uid` with `mappings`.
///
/// Phase 2 uses this to remove an individual mapping — callers read the
/// current table via `list_mappings_for_connector`, filter out the
/// unwanted row, and call this method with the remainder. Implementations
/// must also invalidate any in-memory cache entries that reference the
/// removed rows so subsequent `get_mapping` / `find_owner` calls don't
/// return stale hits.
async fn rewrite_connector_mappings(
&self,
connector_uid: Uuid,
mappings: Vec<SessionMapping>,
) -> Result<()>;
}
@@ -1,624 +0,0 @@
//! `JsonlBackend` — the Phase 2 concrete backend.
use std::collections::{HashMap, HashSet};
use std::path::PathBuf;
use async_trait::async_trait;
use chrono::Utc;
use tokio::sync::RwLock;
use uuid::Uuid;
use crate::backend::{
ArchiveBackend, ArchiveCapability, CapabilitySet, ConnectorRegistryBackend,
DagBackend, HealthStatus, MetaEventsBackend, SessionMappingBackend,
};
use crate::error::{ArchivistError, Result};
use crate::storage::{
append_ndjson, read_connector_index, read_json, read_ndjson, write_json, ArchivePaths,
};
use crate::types::{
ConnectorRecord, MessageCursor, MessagePage, MessageRecord, SessionCompleteness,
SessionKind, SessionListQuery, SessionMapping, SessionMetadata, SessionPage,
};
/// NDJSON/JSON/TSV file-based `ArchiveBackend`.
pub struct JsonlBackend {
/// Path helper rooted at the archive directory.
pub(crate) paths: ArchivePaths,
/// In-memory cache of connector records keyed by connector uid.
pub(crate) connector_cache: RwLock<HashMap<Uuid, ConnectorRecord>>,
/// In-memory cache of `(connector_uid, native_session_id)` -> scroll id.
pub(crate) session_cache: RwLock<HashMap<(Uuid, String), Uuid>>,
/// Capabilities advertised by this backend (everything except `Search`).
pub(crate) capabilities: CapabilitySet,
}
impl JsonlBackend {
    /// Create a new backend rooted at `archive_root`.
    ///
    /// Ensures the `.contexts`, `.db/connectors` and `.files` directories
    /// exist under the root and starts with empty caches. Matches
    /// `FileBasedArchivist::new`.
    ///
    /// # Errors
    /// Fails when any of the directories cannot be created.
    pub async fn new(archive_root: PathBuf) -> Result<Self> {
        let paths = ArchivePaths::new(archive_root);

        let required_dirs = [
            paths.root().join(".contexts"),
            paths.root().join(".db").join("connectors"),
            paths.root().join(".files"),
        ];
        for dir in &required_dirs {
            tokio::fs::create_dir_all(dir).await?;
        }

        // Everything except `Search`.
        let capabilities = CapabilitySet::from([
            ArchiveCapability::Dag,
            ArchiveCapability::MetaEvents,
            ArchiveCapability::ConnectorRegistry,
            ArchiveCapability::SessionMapping,
        ]);

        Ok(Self {
            paths,
            connector_cache: RwLock::new(HashMap::new()),
            session_cache: RwLock::new(HashMap::new()),
            capabilities,
        })
    }

    /// Filesystem path utilities for this backend.
    pub fn paths(&self) -> &ArchivePaths {
        &self.paths
    }

    /// Read all messages of a session and order them by `(ts, message_id)`.
    ///
    /// NOTE(review): any error from reading the message file is swallowed and
    /// treated as "no messages", so this function currently never returns
    /// `Err` — confirm that hiding non-NotFound errors is intended.
    pub(crate) async fn read_messages_sorted(
        &self,
        scroll_id: Uuid,
    ) -> Result<Vec<MessageRecord>> {
        let path = self.paths.messages_path_for_read(scroll_id);
        let mut records: Vec<MessageRecord> = read_ndjson(&path).await.unwrap_or_default();
        records.sort_by_key(|m| (m.ts, m.message_id));
        Ok(records)
    }

    /// Locate the `(connector_uid, native_session_id)` owning `scroll_id`.
    ///
    /// The session cache is consulted first. On a miss, every connector listed
    /// in the connector index has its session mapping file scanned. An
    /// unreadable index yields `None`; an unreadable mapping file is skipped.
    async fn find_mapping_for_scroll_id(&self, scroll_id: Uuid) -> Option<(Uuid, String)> {
        // The read guard is a temporary of this statement, so it is released
        // before any file is touched.
        let cached = self
            .session_cache
            .read()
            .await
            .iter()
            .find(|(_, cached_scroll_id)| **cached_scroll_id == scroll_id)
            .map(|((connector_uid, native_id), _)| (*connector_uid, native_id.clone()));
        if cached.is_some() {
            return cached;
        }

        // Cache miss: walk the connector index and each connector's sessions file.
        let index_path = self.paths.connector_index_tsv();
        let rows = read_connector_index(&index_path).await.ok()?;
        for row in &rows {
            let sessions_path = self.paths.sessions_path_for_read(row.connector_uid);
            let loaded: Option<Vec<SessionMapping>> = read_ndjson(&sessions_path).await.ok();
            let Some(mappings) = loaded else {
                continue;
            };
            let hit = mappings
                .into_iter()
                .find(|mapping| mapping.scroll_id == scroll_id);
            if let Some(mapping) = hit {
                return Some((row.connector_uid, mapping.native_session_id));
            }
        }
        None
    }

    /// Build the placeholder metadata used for a mapping whose `session.json`
    /// does not exist: timestamps come from the mapping, completeness is
    /// `Discovered`, and every optional field is left empty.
    fn discovered_stub(connector_uid: Uuid, mapping: &SessionMapping) -> SessionMetadata {
        SessionMetadata {
            version: 1,
            scroll_id: mapping.scroll_id,
            created_at: mapping.created_at,
            updated_at: mapping.created_at,
            title: None,
            connector_uid,
            native_session_id: Some(mapping.native_session_id.clone()),
            agent_id: None,
            parent_scroll_id: None,
            continuation: None,
            tags: Vec::new(),
            metadata: serde_json::json!({}),
            no_update: false,
            kind: SessionKind::Chat,
            acp_client_id: None,
            is_connected: None,
            current_session_id: None,
            models: None,
            modes: None,
            config_options: None,
            completeness: SessionCompleteness::Discovered,
            matrix_room_id: None,
            matrix_sharing_active: false,
            matrix_shared_at: None,
            is_subagent: false,
            subagent_type: None,
            spawning_tool_use_id: None,
        }
    }

    /// Load every session for a connector, including hidden ones. Used by
    /// `list_sessions_paged` — it applies visibility filters itself.
    ///
    /// A mapping without a `session.json` is surfaced as a `Discovered` stub;
    /// any other read error aborts the whole load.
    async fn load_sessions_for_connector(
        &self,
        connector_uid: Uuid,
    ) -> Result<Vec<SessionMetadata>> {
        let sessions_path = self.paths.sessions_path_for_read(connector_uid);
        let mappings: Vec<SessionMapping> = read_ndjson(&sessions_path).await?;

        let mut sessions = Vec::new();
        for mapping in mappings {
            let session_json_path = self.paths.session_json(mapping.scroll_id);
            let loaded = read_json::<SessionMetadata>(&session_json_path).await;
            let metadata = match loaded {
                Ok(metadata) => metadata,
                Err(err) if err.kind() == std::io::ErrorKind::NotFound => {
                    tracing::debug!(
                        scroll_id = %mapping.scroll_id,
                        "session.json missing, surfacing as Discovered stub"
                    );
                    Self::discovered_stub(connector_uid, &mapping)
                }
                Err(err) => return Err(err.into()),
            };
            sessions.push(metadata);
        }
        Ok(sessions)
    }
}
/// Returns true if `session` satisfies every filter in `query`.
///
/// `connector_uid` is already honored by the caller (it picks which connector
/// directories to scan), so we do not re-check it here.
fn matches_query(
session: &SessionMetadata,
query: &crate::types::SessionListQuery,
) -> bool {
// Visibility
if !query.include_hidden && (session.no_update || session.is_subagent) {
return false;
}
// Project scope — project_ids lives in metadata.project_id
if !query.project_ids.is_empty() {
let session_project_id = session
.metadata
.get("project_id")
.and_then(|v| v.as_str());
match session_project_id {
Some(pid) => {
if !query.project_ids.iter().any(|q| q.as_str() == pid) {
return false;
}
}
None => return false,
}
}
// Project path filter — exact match on metadata.project_path
if let Some(ref path) = query.project_path {
let session_path = session
.metadata
.get("project_path")
.and_then(|v| v.as_str());
if session_path != Some(path.as_str()) {
return false;
}
}
// Title filter — case-insensitive substring.
if let Some(q) = query.title_query.as_ref() {
let needle = q.to_lowercase();
let haystack = match session.title.as_ref() {
Some(t) => t.to_lowercase(),
None => return false,
};
if !haystack.contains(&needle) {
return false;
}
}
// Tag filter — all requested tags must be present on the session.
if !query.tags.is_empty() {
for required in &query.tags {
if !session.tags.iter().any(|t| t == required) {
return false;
}
}
}
// Model filter — case-insensitive substring on metadata.model.
if let Some(q) = query.model_filter.as_ref() {
let needle = q.to_lowercase();
let haystack = session
.metadata
.get("model")
.and_then(|v| v.as_str())
.map(|s| s.to_lowercase());
match haystack {
Some(h) if h.contains(&needle) => {}
_ => return false,
}
}
true
}
#[async_trait]
impl ArchiveBackend for JsonlBackend {
/// Returns a reference to the capability set stored on this backend.
fn capabilities(&self) -> &CapabilitySet {
&self.capabilities
}
/// Reports whether the archive root is usable.
///
/// The backend is `Healthy` when the root path can be stat'ed and is a
/// directory; otherwise it is `Unavailable` with a reason describing either
/// the wrong file type or the stat failure.
async fn health_check(&self) -> HealthStatus {
    let stat = tokio::fs::metadata(self.paths.root()).await;
    let reason = match stat {
        Ok(info) if info.is_dir() => return HealthStatus::Healthy,
        Ok(_) => String::from("archive root is not a directory"),
        Err(e) => format!("stat archive root failed: {e}"),
    };
    HealthStatus::Unavailable { reason }
}
/// Persists `meta` as the session's `session.json`, creating the session
/// directory first if necessary.
async fn put_session(&self, meta: SessionMetadata) -> Result<()> {
    let scroll_id = meta.scroll_id;
    let session_dir = self.paths.session_dir(scroll_id);
    tokio::fs::create_dir_all(&session_dir).await?;

    let json_path = self.paths.session_json(scroll_id);
    write_json(&json_path, &meta).await?;
    Ok(())
}
/// Loads the metadata stored in the session's `session.json`.
///
/// Returns `Ok(None)` when the file does not exist; any other read failure
/// is returned as an error.
async fn get_session(&self, scroll_id: Uuid) -> Result<Option<SessionMetadata>> {
    let json_path = self.paths.session_json(scroll_id);
    let failure = match read_json::<SessionMetadata>(&json_path).await {
        Ok(found) => return Ok(Some(found)),
        Err(err) => err,
    };
    if failure.kind() == std::io::ErrorKind::NotFound {
        Ok(None)
    } else {
        Err(failure.into())
    }
}
/// Returns one page of sessions matching `query`, newest first.
///
/// Connectors are taken from `query.connector_uids` when given, otherwise
/// every non-alias connector in the index is scanned. A connector whose
/// sessions cannot be loaded is logged and skipped rather than failing the
/// whole listing. Results are ordered by `(updated_at, scroll_id)`
/// descending; the cursor, when present, keeps only entries strictly after
/// it in that order. The page size is `query.limit` capped at
/// `MAX_PAGE_LIMIT` and raised to at least 1.
///
/// NOTE(review): `total_count` is measured after the cursor filter, so it
/// is the number of entries remaining from the cursor onwards rather than
/// the number of all matches — confirm this is what callers expect.
async fn list_sessions_paged(&self, query: SessionListQuery) -> Result<SessionPage> {
    use crate::types::{SessionCursor, SessionPage, MAX_PAGE_LIMIT};

    // Pick the connectors to scan.
    let targets: Vec<Uuid> = if query.connector_uids.is_empty() {
        // No explicit selection: use every primary (non-alias) connector.
        let index_path = self.paths.connector_index_tsv();
        read_connector_index(&index_path)
            .await?
            .into_iter()
            .filter(|row| row.alias_of.is_none())
            .map(|row| row.connector_uid)
            .collect()
    } else {
        query.connector_uids.clone()
    };

    // Gather matching sessions connector by connector.
    let mut hits: Vec<SessionMetadata> = Vec::new();
    for connector_uid in targets {
        match self.load_sessions_for_connector(connector_uid).await {
            Ok(found) => {
                hits.extend(found.into_iter().filter(|s| matches_query(s, &query)));
            }
            Err(e) => {
                tracing::warn!(
                    connector_uid = %connector_uid,
                    error = %e,
                    "Failed to list sessions for connector during paged scan, skipping"
                );
            }
        }
    }

    // Order by (updated_at DESC, scroll_id DESC).
    hits.sort_by(|a, b| (b.updated_at, b.scroll_id).cmp(&(a.updated_at, a.scroll_id)));

    // Drop everything at or before the cursor position.
    if let Some(cursor) = &query.cursor {
        let boundary = (cursor.updated_at, cursor.scroll_id);
        hits.retain(|s| (s.updated_at, s.scroll_id) < boundary);
    }

    let total_count = hits.len();

    // Clamp the page size, then cut the page.
    let page_size = query.limit.min(MAX_PAGE_LIMIT).max(1);
    let has_more = total_count > page_size;
    hits.truncate(page_size);

    let next_cursor = match hits.last() {
        Some(last) if has_more => Some(SessionCursor {
            updated_at: last.updated_at,
            scroll_id: last.scroll_id,
        }),
        _ => None,
    };

    Ok(SessionPage {
        items: hits,
        next_cursor,
        total_count: Some(total_count),
    })
}
async fn delete_session(&self, scroll_id: Uuid) -> Result<()> {
// FileBasedArchivist ignores archive parameter (single-archive only)
// First, read session metadata to get connector_uid and native_session_id
let session_dir = self.paths.session_dir(scroll_id);
let session_json_path = self.paths.session_json(scroll_id);
if !session_dir.exists() {
return Err(ArchivistError::SessionUnknown(scroll_id));
}
// Read session metadata to get connector info
let metadata: SessionMetadata = read_json(&session_json_path).await?;
let connector_uid = metadata.connector_uid;
let native_session_id = metadata.native_session_id.clone();
// Delete the session directory and all its contents
tokio::fs::remove_dir_all(&session_dir).await.map_err(|e| {
tracing::error!("Failed to delete session directory {:?}: {}", session_dir, e);
ArchivistError::Io(e)
})?;
tracing::info!(
"Deleted session directory for scroll_id: {}",
scroll_id
);
// Remove from session cache
if let Some(native_id) = &native_session_id {
let mut cache = self.session_cache.write().await;
cache.remove(&(connector_uid, native_id.clone()));
}
// Note: We're not removing from sessions.ndjson because it's append-only.
// The session simply won't have a directory anymore, so list_sessions will skip it.
// A future enhancement could add a "deleted" flag or periodic compaction.
tracing::info!(
"Successfully deleted session {} (connector: {})",
scroll_id,
connector_uid
);
Ok(())
}
/// Appends `messages` to the session's message log and bumps the session's
/// `updated_at` timestamp.
///
/// The session directory is created if needed. When `session.json` does not
/// exist, minimal metadata is reconstructed from the connector's session
/// mappings; if no mapping is found, the messages stay written and the call
/// still succeeds, leaving the metadata missing.
///
/// # Errors
///
/// Fails if the directory cannot be created, a message cannot be appended,
/// `session.json` exists but cannot be read, or the metadata cannot be
/// written. An unreadable `session.json` is reported instead of being
/// overwritten with minimal metadata, so existing fields such as the title
/// are not silently lost.
async fn append_messages(
    &self,
    scroll_id: Uuid,
    messages: Vec<MessageRecord>,
) -> Result<()> {
    // Ensure session directory exists (handles resync case where directory was deleted)
    self.paths.ensure_dirs(scroll_id).await?;

    // Append each message to messages.jsonl
    let messages_path = self.paths.messages_path_for_write(scroll_id);
    for message in &messages {
        append_ndjson(&messages_path, message).await?;
    }

    // Update session.json timestamp (or create it if missing)
    let session_json_path = self.paths.session_json(scroll_id);
    let now = Utc::now();
    let session_metadata = match read_json::<SessionMetadata>(&session_json_path).await {
        Ok(mut metadata) => {
            metadata.updated_at = now;
            metadata
        }
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            // session.json doesn't exist: create minimal metadata.
            // This handles the resync case where the directory was deleted
            // but the mapping still exists.
            tracing::info!(
                scroll_id = %scroll_id,
                "Creating minimal session.json during append (was missing)"
            );

            // Look up the connector_uid and native_session_id via session mappings
            let (connector_uid, native_session_id) =
                match self.find_mapping_for_scroll_id(scroll_id).await {
                    Some(mapping) => mapping,
                    None => {
                        tracing::error!(
                            scroll_id = %scroll_id,
                            "Cannot reconstruct session.json: no connector mapping found. \
                             Messages written but session metadata will remain missing."
                        );
                        return Ok(());
                    }
                };

            SessionMetadata {
                version: 1,
                scroll_id,
                created_at: now,
                updated_at: now,
                title: None,
                connector_uid,
                native_session_id: Some(native_session_id),
                agent_id: None,
                parent_scroll_id: None,
                continuation: None,
                tags: Vec::new(),
                metadata: serde_json::json!({}),
                no_update: false,
                kind: SessionKind::Chat,
                acp_client_id: None,
                is_connected: None,
                current_session_id: None,
                models: None,
                modes: None,
                config_options: None,
                completeness: SessionCompleteness::default(),
                matrix_room_id: None,
                matrix_sharing_active: false,
                matrix_shared_at: None,
                is_subagent: false,
                subagent_type: None,
                spawning_tool_use_id: None,
            }
        }
        // Any other failure (corrupt JSON, permissions, ...) must not be
        // mistaken for "missing": overwriting would destroy real metadata.
        Err(e) => return Err(e.into()),
    };

    write_json(&session_json_path, &session_metadata).await?;
    Ok(())
}
/// Returns one page of the session's messages in `(ts, message_id)` order.
///
/// `limit` is capped at `MAX_PAGE_LIMIT` and raised to at least 1. When a
/// cursor is supplied only messages strictly after it are considered.
/// `next_cursor` points at the last returned message whenever more messages
/// remain beyond the page.
async fn get_messages_paged(
    &self,
    scroll_id: Uuid,
    cursor: Option<MessageCursor>,
    limit: usize,
) -> Result<MessagePage> {
    use crate::types::MAX_PAGE_LIMIT;

    // Same hard clamp as the session listing.
    let page_size = limit.min(MAX_PAGE_LIMIT).max(1);

    let mut remaining = self.read_messages_sorted(scroll_id).await?;
    if let Some(after) = &cursor {
        let boundary = (after.ts, after.message_id);
        remaining.retain(|m| (m.ts, m.message_id) > boundary);
    }

    let has_more = remaining.len() > page_size;
    let page: Vec<_> = remaining.into_iter().take(page_size).collect();

    let next_cursor = match page.last() {
        Some(last) if has_more => Some(MessageCursor {
            ts: last.ts,
            message_id: last.message_id,
        }),
        _ => None,
    };

    Ok(MessagePage {
        items: page,
        next_cursor,
    })
}
/// Counts the messages stored for a session.
///
/// Every line of the message file that is not blank after trimming counts
/// as one message. A missing file means an empty session and yields 0.
async fn count_messages(&self, scroll_id: Uuid) -> Result<usize> {
    let messages_path = self.paths.messages_path_for_read(scroll_id);
    let content = match tokio::fs::read_to_string(&messages_path).await {
        Ok(text) => text,
        // No file yet: the session simply has no messages.
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(0),
        Err(e) => return Err(e.into()),
    };
    let non_blank = content
        .lines()
        .filter(|line| !line.trim().is_empty())
        .count();
    Ok(non_blank)
}
/// Empties the session's message files and bumps `updated_at`.
///
/// Both the current `.jsonl` file and the legacy `.ndjson` file are
/// truncated when they exist. The session must already have a
/// `session.json`; otherwise `ArchivistError::SessionUnknown` is returned.
async fn clear_session_messages(&self, scroll_id: Uuid) -> Result<()> {
    // Reading the metadata doubles as the existence check for the session.
    let session_json_path = self.paths.session_json(scroll_id);
    let mut session_metadata = match read_json::<SessionMetadata>(&session_json_path).await {
        Ok(found) => found,
        Err(e) => {
            return Err(if e.kind() == std::io::ErrorKind::NotFound {
                ArchivistError::SessionUnknown(scroll_id)
            } else {
                e.into()
            });
        }
    };

    // New-format file first, then the legacy one (both may be present).
    let jsonl_path = self.paths.messages_path_for_write(scroll_id);
    #[allow(deprecated)]
    let ndjson_path = self.paths.messages_ndjson(scroll_id);

    let mut cleared = false;
    for message_file in [&jsonl_path, &ndjson_path] {
        if message_file.exists() {
            tokio::fs::write(message_file, "").await?;
            cleared = true;
        }
    }
    if cleared {
        tracing::info!(
            scroll_id = %scroll_id,
            "Cleared all messages from session"
        );
    }

    // Record the change on the session itself.
    session_metadata.updated_at = Utc::now();
    write_json(&session_json_path, &session_metadata).await?;
    tracing::info!(
        scroll_id = %scroll_id,
        "Updated session metadata after clearing messages"
    );
    Ok(())
}
/// Exposes this backend as a `DagBackend`; always `Some`.
fn as_dag(&self) -> Option<&dyn DagBackend> {
Some(self)
}
/// Exposes this backend as a `MetaEventsBackend`; always `Some`.
fn as_meta_events(&self) -> Option<&dyn MetaEventsBackend> {
Some(self)
}
/// Exposes this backend as a `ConnectorRegistryBackend`; always `Some`.
fn as_connector_registry(&self) -> Option<&dyn ConnectorRegistryBackend> {
Some(self)
}
/// Exposes this backend as a `SessionMappingBackend`; always `Some`.
fn as_session_mapping(&self) -> Option<&dyn SessionMappingBackend> {
Some(self)
}
}
#[cfg(test)]
mod contract_tests {
    use super::*;
    use tempfile::tempdir;

    /// Runs the shared backend contract suite against a `JsonlBackend`
    /// rooted in a fresh temporary directory.
    #[tokio::test]
    async fn jsonl_backend_honors_all_contracts() {
        let scratch = tempdir().expect("tempdir");
        let root = scratch.path().to_path_buf();
        let backend = JsonlBackend::new(root).await.expect("new");
        crate::backend::contract::verify_all_contracts(&backend).await;
    }
}
@@ -1,161 +0,0 @@
//! `ConnectorRegistryBackend` impl for `JsonlBackend`.
use async_trait::async_trait;
use uuid::Uuid;
use crate::backend::ConnectorRegistryBackend;
use crate::backends::jsonl::backend::JsonlBackend;
use crate::error::{ArchivistError, Result};
use crate::storage::{
read_connector_index, read_json, write_connector_index, write_json,
};
use crate::types::{ConnectorIndexRow, ConnectorRecord};
#[async_trait]
impl ConnectorRegistryBackend for JsonlBackend {
/// Stores a connector record: writes `connector.json`, upserts the
/// connector's row in the index file, and refreshes the in-memory cache.
///
/// The index row is replaced when one with the same `connector_uid`
/// already exists, so putting the same connector again updates it instead
/// of adding a duplicate row (which `list_connectors` would otherwise
/// return twice).
///
/// # Errors
///
/// Fails if the connector directory cannot be created or if reading or
/// writing `connector.json` or the index fails.
async fn put_connector(&self, record: ConnectorRecord) -> Result<()> {
    // Write connector.json
    let connector_dir = self.paths.connector_dir(record.connector_uid);
    tokio::fs::create_dir_all(&connector_dir).await?;
    write_json(&connector_dir.join("connector.json"), &record).await?;

    // Upsert the row in index.tsv (read-modify-write).
    let index_path = self.paths.connector_index_tsv();
    let mut rows = read_connector_index(&index_path).await?;
    let row = ConnectorIndexRow {
        connector_uid: record.connector_uid,
        r#type: record.r#type.clone(),
        title: record.title.clone(),
        client_native_id: record.client_native_id.clone(),
        alias_of: record.alias_of,
        created_at: record.created_at,
        fingerprint: record.fingerprint.clone(),
    };
    match rows
        .iter()
        .position(|existing| existing.connector_uid == record.connector_uid)
    {
        Some(idx) => rows[idx] = row,
        None => rows.push(row),
    }
    write_connector_index(&index_path, &rows).await?;

    // Update cache
    self.connector_cache
        .write()
        .await
        .insert(record.connector_uid, record);
    Ok(())
}
/// Looks up a connector record by uid.
///
/// The in-memory cache is consulted first; on a miss the connector's
/// `connector.json` is read from disk. A missing file yields `Ok(None)`.
/// A record found on disk is returned without being added to the cache.
async fn get_connector(&self, connector_uid: Uuid) -> Result<Option<ConnectorRecord>> {
    // Fast path: cache hit. The read guard lives only for this statement.
    let cached = self.connector_cache.read().await.get(&connector_uid).cloned();
    if cached.is_some() {
        return Ok(cached);
    }

    // Slow path: read the record from disk.
    let json_path = self
        .paths
        .connector_dir(connector_uid)
        .join("connector.json");
    let failure = match read_json::<ConnectorRecord>(&json_path).await {
        Ok(record) => return Ok(Some(record)),
        Err(err) => err,
    };
    if failure.kind() == std::io::ErrorKind::NotFound {
        Ok(None)
    } else {
        Err(failure.into())
    }
}
/// Lists the records of all primary connectors.
///
/// Rows of the connector index that are aliases are skipped, as are rows
/// whose `connector.json` no longer exists. Any other read failure aborts
/// the listing.
async fn list_connectors(&self) -> Result<Vec<ConnectorRecord>> {
    let index_path = self.paths.connector_index_tsv();
    let rows = read_connector_index(&index_path).await?;

    let mut records = Vec::new();
    for row in rows.into_iter().filter(|r| r.alias_of.is_none()) {
        let json_path = self
            .paths
            .connector_dir(row.connector_uid)
            .join("connector.json");
        match read_json::<ConnectorRecord>(&json_path).await {
            Ok(record) => records.push(record),
            Err(e) => {
                if e.kind() != std::io::ErrorKind::NotFound {
                    return Err(e.into());
                }
            }
        }
    }
    Ok(records)
}
/// Resolves a client-supplied connector identifier to a connector uid.
///
/// Resolution is attempted in this order:
/// 1. If the identifier parses as a UUID and that UUID is a known
///    connector (in the cache, or with a `connector.json` on disk), the
///    UUID itself is returned.
/// 2. Otherwise the connector index is searched for a row whose
///    `client_native_id` equals the identifier.
///
/// An unknown identifier is logged and reported as `Ok(None)`; turning
/// that into an error is left to the caller.
async fn resolve_connector_uid(
    &self,
    client_native_id: &str,
) -> Result<Option<Uuid>> {
    // Common case: the identifier already is the connector's UUID.
    if let Ok(candidate) = Uuid::parse_str(client_native_id) {
        let in_cache = self
            .connector_cache
            .read()
            .await
            .contains_key(&candidate);
        if in_cache {
            return Ok(Some(candidate));
        }

        let on_disk = self
            .paths
            .connector_dir(candidate)
            .join("connector.json")
            .exists();
        if on_disk {
            return Ok(Some(candidate));
        }
    }

    // Not a registered UUID: fall back to the index, keyed by native id.
    let index_path = self.paths.connector_index_tsv();
    let rows = read_connector_index(&index_path).await?;
    let resolved = rows
        .into_iter()
        .find(|row| row.client_native_id == client_native_id)
        .map(|row| row.connector_uid);

    if resolved.is_none() {
        tracing::warn!(
            "Failed to resolve connector_uid for client_native_id '{}'. \
             This connector may not be registered with the archivist.",
            client_native_id
        );
    }
    Ok(resolved)
}
/// Sets the fingerprint of an existing connector in all three places it is
/// stored: `connector.json`, the in-memory cache, and the index file.
///
/// NOTE(review): every failure to read `connector.json` — not only a
/// missing file — is reported as `ArchivistError::ConnectorUnknown`;
/// confirm this is intended.
async fn update_connector_fingerprint(
    &self,
    connector_uid: Uuid,
    fingerprint: String,
) -> Result<()> {
    // Step 1: rewrite connector.json with the new fingerprint.
    let json_path = self
        .paths
        .connector_dir(connector_uid)
        .join("connector.json");
    let mut record: ConnectorRecord = match read_json(&json_path).await {
        Ok(found) => found,
        Err(_) => return Err(ArchivistError::ConnectorUnknown(connector_uid)),
    };
    record.fingerprint = Some(fingerprint.clone());
    write_json(&json_path, &record).await?;

    // Step 2: refresh the cached copy.
    self.connector_cache
        .write()
        .await
        .insert(connector_uid, record);

    // Step 3: patch the matching index row (if any) and rewrite the index.
    let index_path = self.paths.connector_index_tsv();
    let mut rows = read_connector_index(&index_path).await?;
    let target = rows
        .iter_mut()
        .find(|row| row.connector_uid == connector_uid);
    if let Some(row) = target {
        row.fingerprint = Some(fingerprint);
    }
    write_connector_index(&index_path, &rows).await?;
    Ok(())
}
}
@@ -1,69 +0,0 @@
//! `DagBackend` impl for `JsonlBackend`.
use async_trait::async_trait;
use uuid::Uuid;
use crate::backend::DagBackend;
use crate::backends::jsonl::backend::JsonlBackend;
use crate::error::Result;
use crate::storage::{append_ndjson, read_ndjson};
use crate::types::{DagEdge, SessionMetadata};
#[async_trait]
impl DagBackend for JsonlBackend {
/// Appends one edge to the DAG file, creating the file's parent directory
/// first when the path has one.
async fn append_dag_edge(&self, edge: DagEdge) -> Result<()> {
    let dag_file = self.paths.dag_path();
    match dag_file.parent() {
        Some(dir) => tokio::fs::create_dir_all(dir).await?,
        None => {}
    }
    append_ndjson(&dag_file, &edge).await?;
    Ok(())
}
/// Returns the metadata of every session recorded as a direct child of
/// `parent` in the DAG file.
///
/// The lookup is best-effort: an unreadable DAG file is treated as having
/// no edges, and children whose metadata is missing or cannot be loaded are
/// logged and left out of the result. A load failure is logged separately
/// from a genuinely missing child so the two can be told apart.
async fn get_children(&self, parent: Uuid) -> Result<Vec<SessionMetadata>> {
    let dag_path = self.paths.dag_path();
    let edges: Vec<DagEdge> = read_ndjson(&dag_path).await.unwrap_or_default();

    // Collect the ids up front so no borrow of `edges` is held across awaits.
    let child_ids: Vec<Uuid> = edges
        .iter()
        .filter(|e| e.parent == parent)
        .map(|e| e.child)
        .collect();

    let mut children = Vec::new();
    for child_id in child_ids {
        match crate::backend::ArchiveBackend::get_session(self, child_id).await {
            Ok(Some(meta)) => children.push(meta),
            Ok(None) => {
                tracing::warn!(
                    child_scroll_id = %child_id,
                    "DAG child session not found"
                );
            }
            Err(e) => {
                tracing::warn!(
                    child_scroll_id = %child_id,
                    error = %e,
                    "Failed to load DAG child session"
                );
            }
        }
    }
    Ok(children)
}
/// Returns the edges whose parent is `root`, in file order.
///
/// Only one level is read here; walking the DAG recursively is left to the
/// coordinator. An unreadable DAG file is treated as having no edges.
async fn get_dag_edges(&self, root: Uuid) -> Result<Vec<DagEdge>> {
    let dag_file = self.paths.dag_path();
    let mut edges: Vec<DagEdge> = read_ndjson(&dag_file).await.unwrap_or_default();
    edges.retain(|edge| edge.parent == root);
    Ok(edges)
}
}
@@ -1,179 +0,0 @@
//! `SessionMappingBackend` impl for `JsonlBackend`.
use async_trait::async_trait;
use chrono::Utc;
use uuid::Uuid;
use crate::backend::SessionMappingBackend;
use crate::backends::jsonl::backend::JsonlBackend;
use crate::error::Result;
use crate::storage::{append_ndjson, read_connector_index, read_ndjson, write_ndjson};
use crate::types::SessionMapping;
#[async_trait]
impl SessionMappingBackend for JsonlBackend {
/// Records that `native_session_id` on `connector_uid` maps to `scroll_id`.
///
/// The connector directory is created if needed, a `SessionMapping` row
/// stamped with the current time is appended to the connector's sessions
/// file, and the session cache is primed with the new entry. No alias
/// detection happens here: the caller has already chosen `scroll_id`.
async fn put_mapping(
    &self,
    connector_uid: Uuid,
    native_session_id: &str,
    scroll_id: Uuid,
) -> Result<()> {
    let created_at = Utc::now();

    // The directory must exist before the row can be appended.
    self.paths.ensure_connector_dir(connector_uid).await?;

    let native_id = native_session_id.to_string();
    let row = SessionMapping {
        version: 1,
        connector_uid,
        native_session_id: native_id.clone(),
        scroll_id,
        created_at,
        alias_of: None,
    };
    let sessions_file = self.paths.sessions_path_for_write(connector_uid);
    append_ndjson(&sessions_file, &row).await?;

    // Prime the cache so the next resolution skips the disk.
    let mut cache = self.session_cache.write().await;
    cache.insert((connector_uid, native_id), scroll_id);
    Ok(())
}
/// Resolves a connector-native session id to its scroll id.
///
/// The session cache is checked first. On a miss the connector's sessions
/// file is scanned for the first row with a matching native id, and a hit
/// is written back to the cache. An unknown session yields `Ok(None)`
/// rather than an error.
async fn get_mapping(
    &self,
    connector_uid: Uuid,
    native_session_id: &str,
) -> Result<Option<Uuid>> {
    let key = (connector_uid, native_session_id.to_string());

    // Cache lookup; the read guard is released at the end of the statement.
    let cached = self.session_cache.read().await.get(&key).copied();
    if cached.is_some() {
        return Ok(cached);
    }

    // Cache miss: scan the connector's mapping rows on disk.
    let sessions_file = self.paths.sessions_path_for_read(connector_uid);
    let rows: Vec<SessionMapping> = read_ndjson(&sessions_file).await?;
    let found = rows
        .into_iter()
        .find(|row| row.native_session_id == native_session_id)
        .map(|row| row.scroll_id);

    if let Some(scroll_id) = found {
        self.session_cache.write().await.insert(key, scroll_id);
    }
    Ok(found)
}
/// Returns every mapping row stored for `connector_uid`.
///
/// The file to read is chosen by `sessions_path_for_read`. A read failure
/// is not reported: it results in an empty list.
async fn list_mappings_for_connector(
    &self,
    connector_uid: Uuid,
) -> Result<Vec<SessionMapping>> {
    let sessions_file = self.paths.sessions_path_for_read(connector_uid);
    let rows: Vec<SessionMapping> = match read_ndjson(&sessions_file).await {
        Ok(found) => found,
        Err(_) => Vec::new(),
    };
    Ok(rows)
}
/// Finds which connector owns `native_session_id`, returning
/// `(connector_uid, scroll_id)`.
///
/// The session cache is scanned first. If nothing is cached, the sessions
/// file of every primary (non-alias) connector in the index is searched in
/// index order, and the first hit is cached before being returned. A
/// failure to read the index or any connector's sessions file aborts the
/// search with an error.
async fn find_owner(
    &self,
    native_session_id: &str,
) -> Result<Option<(Uuid, Uuid)>> {
    // Fast path: any cached entry with this native id.
    let cached = {
        let cache = self.session_cache.read().await;
        cache
            .iter()
            .find(|((_, native_id), _)| native_id == native_session_id)
            .map(|((connector_uid, _), scroll_id)| (*connector_uid, *scroll_id))
    };
    if cached.is_some() {
        return Ok(cached);
    }

    // Slow path: walk the connector index and scan each sessions file.
    let index_path = self.paths.connector_index_tsv();
    let connectors = read_connector_index(&index_path).await?;
    for connector in connectors.iter().filter(|c| c.alias_of.is_none()) {
        let owner_uid = connector.connector_uid;
        let sessions_file = self.paths.sessions_path_for_read(owner_uid);
        let rows: Vec<SessionMapping> = read_ndjson(&sessions_file).await?;

        let hit = rows
            .iter()
            .find(|row| row.native_session_id == native_session_id)
            .map(|row| row.scroll_id);
        if let Some(scroll_id) = hit {
            // Remember the owner for future lookups.
            self.session_cache
                .write()
                .await
                .insert((owner_uid, native_session_id.to_string()), scroll_id);
            return Ok(Some((owner_uid, scroll_id)));
        }
    }
    Ok(None)
}
/// Replaces the whole mapping table of `connector_uid` with `mappings`.
///
/// The connector's sessions file is truncated and rewritten, then the
/// session cache is brought in line with it: every cached entry for this
/// connector is dropped and the cache is re-primed from `mappings`.
///
/// The file is written before the cache is touched, so a failed write
/// leaves the cache consistent with what is still on disk.
///
/// # Errors
///
/// Fails if the connector directory cannot be created or the sessions file
/// cannot be written; in both cases the cache is left unchanged.
async fn rewrite_connector_mappings(
    &self,
    connector_uid: Uuid,
    mappings: Vec<SessionMapping>,
) -> Result<()> {
    // Ensure the connector directory exists before we write.
    self.paths.ensure_connector_dir(connector_uid).await?;

    // Truncate + re-write the canonical `.jsonl` table first.
    let write_path = self.paths.sessions_path_for_write(connector_uid);
    write_ndjson(&write_path, &mappings).await?;

    // Only after the write succeeded: invalidate this connector's cache
    // entries and re-prime from the new mapping set. Any
    // (connector_uid, native_id) entry not present in `mappings` is dropped.
    let mut cache = self.session_cache.write().await;
    cache.retain(|(cu, _), _| *cu != connector_uid);
    for m in &mappings {
        cache.insert(
            (connector_uid, m.native_session_id.clone()),
            m.scroll_id,
        );
    }
    Ok(())
}
}
@@ -1,200 +0,0 @@
//! `MetaEventsBackend` impl for `JsonlBackend`.
use async_trait::async_trait;
use chrono::Utc;
use uuid::Uuid;
use crate::backend::MetaEventsBackend;
use crate::backends::jsonl::backend::JsonlBackend;
use crate::error::{ArchivistError, Result};
use crate::storage::{append_ndjson, read_json, read_ndjson, write_json};
use crate::types::{
MetaEventRecord, SessionCompleteness, SessionKind, SessionMetadata,
};
#[async_trait]
impl MetaEventsBackend for JsonlBackend {
/// Appends `events` to the session's event log and bumps the session's
/// `updated_at` timestamp.
///
/// The session directory is created if needed. When `session.json` does not
/// exist, minimal `AcpConnection` metadata is written, using `scroll_id` as
/// a placeholder `connector_uid`.
///
/// # Errors
///
/// Fails if the directory cannot be created, an event cannot be appended,
/// `session.json` exists but cannot be read, or the metadata cannot be
/// written. An unreadable `session.json` is reported instead of being
/// replaced by placeholder metadata, so existing session data is not
/// silently lost.
async fn append_meta_events(
    &self,
    scroll_id: Uuid,
    events: Vec<MetaEventRecord>,
) -> Result<()> {
    // Ensure session directory exists
    self.paths.ensure_dirs(scroll_id).await?;

    // Append each event to events.jsonl
    let events_path = self.paths.events_path(scroll_id);
    for event in &events {
        append_ndjson(&events_path, event).await?;
    }

    // Update session.json timestamp
    let session_json_path = self.paths.session_json(scroll_id);
    let now = Utc::now();
    let session_metadata = match read_json::<SessionMetadata>(&session_json_path).await {
        Ok(mut metadata) => {
            metadata.updated_at = now;
            metadata
        }
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            // session.json doesn't exist; this shouldn't happen for meta
            // sessions, but it is handled gracefully.
            tracing::warn!(
                scroll_id = %scroll_id,
                "session.json missing when appending meta events, creating minimal metadata"
            );
            SessionMetadata {
                version: 1,
                scroll_id,
                created_at: now,
                updated_at: now,
                title: None,
                connector_uid: scroll_id, // Use scroll_id as placeholder
                native_session_id: None,
                agent_id: None,
                parent_scroll_id: None,
                continuation: None,
                tags: Vec::new(),
                metadata: serde_json::json!({}),
                no_update: false,
                kind: SessionKind::AcpConnection,
                acp_client_id: None,
                is_connected: None,
                current_session_id: None,
                models: None,
                modes: None,
                config_options: None,
                completeness: SessionCompleteness::default(),
                matrix_room_id: None,
                matrix_sharing_active: false,
                matrix_shared_at: None,
                is_subagent: false,
                subagent_type: None,
                spawning_tool_use_id: None,
            }
        }
        // Any other failure (corrupt JSON, permissions, ...) must not be
        // mistaken for "missing": overwriting would destroy real metadata.
        Err(e) => return Err(e.into()),
    };

    write_json(&session_json_path, &session_metadata).await?;
    Ok(())
}
/// Returns the session's meta events ordered by timestamp, with ties broken
/// by event id.
///
/// A failure to read the events file is not reported: it results in an
/// empty list.
async fn get_meta_events(&self, scroll_id: Uuid) -> Result<Vec<MetaEventRecord>> {
    let events_file = self.paths.events_path(scroll_id);
    let mut events: Vec<MetaEventRecord> =
        read_ndjson(&events_file).await.unwrap_or_default();

    // (ts, event_id) gives a stable, deterministic order.
    events.sort_by(|a, b| (&a.ts, &a.event_id).cmp(&(&b.ts, &b.event_id)));
    Ok(events)
}
/// Updates the connection status stored in a meta session's `session.json`.
///
/// Sets `is_connected` and `current_session_id`, bumps `updated_at`, and
/// writes the metadata back. Returns `ArchivistError::SessionUnknown` when
/// the session has no `session.json`.
async fn update_meta_session_status(
    &self,
    scroll_id: Uuid,
    is_connected: bool,
    current_session_id: Option<Uuid>,
) -> Result<()> {
    let json_path = self.paths.session_json(scroll_id);
    let mut meta = match read_json::<SessionMetadata>(&json_path).await {
        Ok(found) => found,
        Err(e) => {
            return Err(if e.kind() == std::io::ErrorKind::NotFound {
                ArchivistError::SessionUnknown(scroll_id)
            } else {
                e.into()
            });
        }
    };

    meta.is_connected = Some(is_connected);
    meta.current_session_id = current_session_id;
    meta.updated_at = Utc::now();
    write_json(&json_path, &meta).await?;

    tracing::info!(
        scroll_id = %scroll_id,
        is_connected = %is_connected,
        current_session_id = ?current_session_id,
        "Updated meta session status"
    );
    Ok(())
}
/// Lists all sessions of kind `AcpConnection`, most recently updated first.
///
/// Every subdirectory of `<root>/.contexts` is inspected for a
/// `session.json`. Directories without one are skipped silently; files
/// that cannot be read are logged and skipped. A missing `.contexts`
/// directory yields an empty list.
async fn list_meta_sessions(&self) -> Result<Vec<SessionMetadata>> {
    let contexts_dir = self.paths.root().join(".contexts");
    if !contexts_dir.exists() {
        return Ok(Vec::new());
    }

    let mut found = Vec::new();
    let mut dir = tokio::fs::read_dir(&contexts_dir).await?;
    while let Some(entry) = dir.next_entry().await? {
        let is_dir = entry.file_type().await?.is_dir();
        if !is_dir {
            continue;
        }

        let json_path = entry.path().join("session.json");
        match read_json::<SessionMetadata>(&json_path).await {
            // Keep only meta (ACP connection) sessions.
            Ok(meta) if meta.kind == SessionKind::AcpConnection => found.push(meta),
            Ok(_) => {}
            // A directory without session.json is simply not a session.
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
            Err(e) => {
                tracing::warn!(
                    path = ?json_path,
                    error = %e,
                    "Failed to read session.json while listing meta sessions"
                );
            }
        }
    }

    // Newest first.
    found.sort_by(|a, b| b.updated_at.cmp(&a.updated_at));
    Ok(found)
}
/// Returns the first meta session, in `list_meta_sessions` order, whose
/// `acp_client_id` equals `client_id`, or `None` if there is none.
async fn find_meta_session_by_client(
    &self,
    client_id: &str,
) -> Result<Option<SessionMetadata>> {
    for candidate in self.list_meta_sessions().await? {
        if candidate.acp_client_id.as_deref() == Some(client_id) {
            return Ok(Some(candidate));
        }
    }
    Ok(None)
}
}
@@ -1,12 +0,0 @@
//! NDJSON/JSON/TSV file-based backend.
//!
//! Ports the body of the former `FileBasedArchivist`. Uses the existing
//! `crate::storage` free-function primitives unchanged.
mod backend;
mod connectors;
mod dag;
mod mapping;
mod meta;
pub use backend::JsonlBackend;
@@ -1,5 +0,0 @@
//! Concrete backend implementations for `ArchiveBackend`.
pub mod jsonl;
pub use jsonl::JsonlBackend;
-558
View File
@@ -1,558 +0,0 @@
//! Backfill functionality for importing existing sessions from connectors.
//!
//! This module provides utilities to import sessions and messages from connectors
//! that support listing operations (like OpenCode connectors) into the Archivist.
use futures::future::BoxFuture;
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use crate::coordinator::Archivist;
use crate::error::{ArchivistError, Result};
use crate::types::{MessageRecord, RegisterSessionRequest, RegisterStatus};
use dirigent_protocol::{Message, Session};
/// Statistics collected during a backfill operation.
///
/// Summarizes what `backfill_from_sessions` imported, what it skipped, and
/// which sessions failed. Failures are recorded in `errors` instead of
/// aborting the run.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct BackfillStats {
/// Number of sessions handed to the backfill (the length of the input list)
pub sessions_found: usize,
/// Number of sessions newly registered with the archivist
/// (registration status `Accepted`)
pub sessions_imported: usize,
/// Number of sessions skipped because they were already archived
/// (already resolvable, or registration status `Aliased`)
pub sessions_skipped: usize,
/// Total number of messages successfully appended across all sessions
pub messages_imported: usize,
/// One human-readable message per failure (resolve, register, fetch or
/// append) encountered while importing a session
pub errors: Vec<String>,
}
impl BackfillStats {
/// Create a new BackfillStats with all counts at zero
pub fn new() -> Self {
Self {
sessions_found: 0,
sessions_imported: 0,
sessions_skipped: 0,
messages_imported: 0,
errors: Vec::new(),
}
}
}
impl Default for BackfillStats {
fn default() -> Self {
Self::new()
}
}
/// Backfill sessions from a connector into the archive.
///
/// This function imports existing sessions from a connector by:
/// 1. Attempting to register each session with the archivist
/// 2. For newly registered sessions, fetching messages via the provided closure
/// 3. Appending fetched messages to the archive
/// 4. Collecting statistics on successes, failures, and skips
///
/// # Arguments
///
/// * `archivist` - The archivist to backfill into
/// * `connector_uid` - The UID of the connector being backfilled
/// * `sessions` - List of sessions to import (from connector's list_sessions())
/// * `fetch_messages` - Async closure to fetch messages for a given native session ID
///
/// # Returns
///
/// Statistics about the backfill operation including counts and errors
///
/// # Error Handling
///
/// This function continues processing all sessions even if individual sessions fail.
/// Errors are collected in `BackfillStats.errors` rather than aborting the operation.
///
/// # Example
///
/// ```no_run
/// use dirigent_archivist::{Archivist, backfill_from_sessions};
/// use dirigent_protocol::{Session, Message};
/// use uuid::Uuid;
///
/// # async fn example(archivist: &Archivist, sessions: Vec<Session>) {
/// let connector_uid = Uuid::now_v7();
///
/// let stats = backfill_from_sessions(
/// archivist,
/// connector_uid,
/// sessions,
/// |session_id| {
/// Box::pin(async move {
/// // Fetch messages from connector
/// // Return Vec<Message>
/// Ok(vec![])
/// })
/// }
/// ).await.unwrap();
///
/// println!("Imported {} sessions, {} messages",
/// stats.sessions_imported,
/// stats.messages_imported);
/// # }
/// ```
pub async fn backfill_from_sessions<F>(
    archivist: &Archivist,
    connector_uid: Uuid,
    sessions: Vec<Session>,
    fetch_messages: F,
) -> Result<BackfillStats>
where
    F: Fn(&str) -> BoxFuture<'static, Result<Vec<Message>>> + Send + Sync,
{
    let mut stats = BackfillStats::new();
    stats.sessions_found = sessions.len();

    for session in sessions {
        let native_id = session.id.clone();

        // Step 1 — skip sessions the archive can already resolve. Only a
        // `SessionUnknown` error means "not archived yet"; any other error
        // is recorded and the session is left alone.
        let lookup = archivist
            .resolve_session(connector_uid, &native_id, None)
            .await;
        match lookup {
            Err(ArchivistError::SessionUnknown(_)) => {}
            Ok(_existing_scroll) => {
                stats.sessions_skipped += 1;
                continue;
            }
            Err(other) => {
                stats.errors.push(format!(
                    "Failed to resolve session {}: {}",
                    native_id, other
                ));
                continue;
            }
        }

        // Step 2 — register the session; the archivist picks the scroll id.
        let request = RegisterSessionRequest {
            connector_uid,
            native_session_id: native_id.clone(),
            title: Some(session.title.clone()),
            custom_scroll_id: None,
            // Metadata that fails to serialize degrades to an empty object.
            metadata: serde_json::to_value(&session.metadata)
                .unwrap_or_else(|_| serde_json::json!({})),
            completeness: Default::default(),
            parent_scroll_id: None,
            is_subagent: false,
            continuation: None,
            agent_id: None,
            subagent_type: None,
            spawning_tool_use_id: None,
        };
        let registered = match archivist.register_session(request, None).await {
            Ok(response) => response,
            Err(e) => {
                stats.errors.push(format!(
                    "Failed to register session {}: {}",
                    native_id, e
                ));
                continue;
            }
        };
        let scroll_id = match registered.status {
            RegisterStatus::Accepted => {
                // Counted as imported as soon as registration succeeds,
                // even if fetching or appending messages fails below.
                // NOTE(review): such a session will be skipped by step 1 on
                // a later backfill run, so its messages are not retried here.
                stats.sessions_imported += 1;
                registered.scroll_id
            }
            RegisterStatus::Aliased => {
                // Not expected after the lookup above; treat it as a skip.
                stats.sessions_skipped += 1;
                continue;
            }
            RegisterStatus::Rejected => {
                stats.errors.push(format!(
                    "Session registration rejected for {}: UID collision",
                    native_id
                ));
                continue;
            }
        };

        // Step 3 — pull the messages from the connector and convert them
        // into archive records bound to the new scroll.
        let fetched = fetch_messages(&native_id).await;
        let records: Vec<MessageRecord> = match fetched {
            Ok(messages) => messages
                .into_iter()
                .map(|message| convert_message_to_record(message, scroll_id))
                .collect(),
            Err(e) => {
                stats.errors.push(format!(
                    "Failed to fetch messages for session {}: {}",
                    native_id, e
                ));
                continue;
            }
        };

        // Step 4 — append; messages only count once the append succeeded.
        let imported = records.len();
        match archivist.append_messages(scroll_id, records, None).await {
            Ok(_) => {
                stats.messages_imported += imported;
            }
            Err(e) => {
                stats.errors.push(format!(
                    "Failed to append messages for session {}: {}",
                    native_id, e
                ));
            }
        }
    }

    Ok(stats)
}
/// Build the archival [`MessageRecord`] for one protocol [`Message`].
///
/// Every content part is rendered to a markdown fragment (plain text as-is,
/// thinking wrapped in `<thinking>` tags, code/tool/file parts as fenced
/// blocks) and the fragments are joined with a blank line into `content_md`.
/// The original parts are additionally serialized into `content_parts` so a
/// UI can render them richly; if that serialization fails the field is `None`.
///
/// The record is attached to `scroll_id`, keeps the message's `created_at`
/// as its timestamp, and always receives a freshly generated v7 UUID as
/// `message_id` — the protocol message's own `id` is not reused.
pub fn convert_message_to_record(msg: Message, scroll_id: Uuid) -> MessageRecord {
    // One markdown fragment per message part, in original order.
    let content_md = msg
        .content
        .iter()
        .map(|part| match part {
            dirigent_protocol::MessagePart::Text { text } => text.clone(),
            dirigent_protocol::MessagePart::Thinking { text } => {
                format!("<thinking>\n{}\n</thinking>", text)
            }
            dirigent_protocol::MessagePart::Code { language, code } => {
                format!("```{}\n{}\n```", language, code)
            }
            dirigent_protocol::MessagePart::Tool {
                tool,
                tool_call_id: _,
                input,
                output,
            } => {
                let header = format!("**Tool: {}**\n\nInput:\n```json\n{}\n```", tool, input);
                // The output section is only present once the tool has produced one.
                match output {
                    Some(out) => format!("{}\n\nOutput:\n```json\n{}\n```", header, out),
                    None => header,
                }
            }
            dirigent_protocol::MessagePart::File { path, content } => {
                format!("**File: {}**\n\n```\n{}\n```", path, content)
            }
        })
        .collect::<Vec<String>>()
        .join("\n\n");

    // Structured copy of the parts; a serialization failure yields `None`.
    let content_parts = serde_json::to_value(&msg.content).ok();

    let role = String::from(match msg.role {
        dirigent_protocol::MessageRole::User => "user",
        dirigent_protocol::MessageRole::Assistant => "assistant",
    });

    // A new time-ordered ID is minted for every record.
    let message_id = Uuid::now_v7();

    MessageRecord {
        version: 1,
        message_id,
        session: scroll_id,
        parent_id: None,
        ts: msg.created_at,
        role,
        // The protocol message carries no author information.
        author: None,
        content_md,
        content_parts,
        // Attachments are not extracted from message parts.
        attachments: Vec::new(),
        // Missing or unserializable metadata becomes an empty JSON object.
        metadata: match msg.metadata.and_then(|m| serde_json::to_value(m).ok()) {
            Some(value) => value,
            None => serde_json::json!({}),
        },
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::coordinator::Archivist;
    use chrono::Utc;
    use dirigent_protocol::{MessageRole, MessageStatus, SessionMetadata};
    use tempfile::TempDir;

    /// Build an isolated archivist on top of a JSONL backend in a fresh
    /// temporary directory. The `TempDir` is returned so it outlives the test.
    async fn setup_test_archivist() -> (Archivist, TempDir) {
        let temp_dir = TempDir::new().unwrap();
        // `from_single_backend` keeps each test isolated (no shared registry
        // file in the tempdir's parent racing against sibling tests).
        let backend = crate::backends::JsonlBackend::new(temp_dir.path().to_path_buf())
            .await
            .unwrap();
        let archivist = Archivist::from_single_backend("main".into(), std::sync::Arc::new(backend))
            .await
            .unwrap();
        (archivist, temp_dir)
    }

    /// Register the standard test connector under a fresh UID and return it.
    async fn register_test_connector(archivist: &Archivist) -> Uuid {
        let connector_uid = Uuid::now_v7();
        let request = crate::types::RegisterConnectorRequest {
            custom_uid: Some(connector_uid),
            r#type: "OpenCode".to_string(),
            title: "Test Connector".to_string(),
            client_native_id: "test-connector".to_string(),
            metadata: serde_json::json!({}),
            fingerprint: None,
        };
        archivist.register_connector(request, None).await.unwrap();
        connector_uid
    }

    /// Minimal protocol session with the given native id and title.
    fn create_test_session(id: &str, title: &str) -> Session {
        let metadata = SessionMetadata {
            project_path: "/test".to_string(),
            model: Some("test-model".to_string()),
            total_messages: 0,
            system_message: None,
            current_mode_id: None,
            _meta: None,
            project_id: None,
        };
        Session {
            id: id.to_string(),
            title: title.to_string(),
            created_at: Utc::now(),
            updated_at: Utc::now(),
            metadata,
            cwd: None,
            config_options: None,
            acp_client_id: None,
            models: None,
            modes: None,
        }
    }

    /// Completed single-text-part message.
    fn create_test_message(id: &str, session_id: &str, role: MessageRole, text: &str) -> Message {
        let body = dirigent_protocol::MessagePart::Text {
            text: text.to_string(),
        };
        Message {
            id: id.to_string(),
            session_id: session_id.to_string(),
            role,
            created_at: Utc::now(),
            content: vec![body],
            status: MessageStatus::Completed,
            metadata: None,
        }
    }

    #[tokio::test]
    async fn test_backfill_new_sessions() {
        let (archivist, _temp) = setup_test_archivist().await;
        let connector_uid = register_test_connector(&archivist).await;

        let sessions = vec![
            create_test_session("session-1", "Session 1"),
            create_test_session("session-2", "Session 2"),
        ];

        // Fetcher stub: two messages for every session.
        let fetch_messages = |session_id: &str| {
            let owner = session_id.to_string();
            Box::pin(async move {
                Ok(vec![
                    create_test_message("msg-1", &owner, MessageRole::User, "Hello"),
                    create_test_message("msg-2", &owner, MessageRole::Assistant, "Hi there"),
                ])
            }) as BoxFuture<'static, Result<Vec<Message>>>
        };

        let stats = backfill_from_sessions(&archivist, connector_uid, sessions, fetch_messages)
            .await
            .unwrap();

        assert_eq!(stats.sessions_found, 2);
        assert_eq!(stats.sessions_imported, 2);
        assert_eq!(stats.sessions_skipped, 0);
        // Two messages for each of the two sessions.
        assert_eq!(stats.messages_imported, 4);
        assert!(stats.errors.is_empty());
    }

    #[tokio::test]
    async fn test_backfill_skips_existing_sessions() {
        let (archivist, _temp) = setup_test_archivist().await;
        let connector_uid = register_test_connector(&archivist).await;

        // session-1 is already in the archive before the backfill starts.
        let session1 = create_test_session("session-1", "Session 1");
        let pre_registration = RegisterSessionRequest {
            connector_uid,
            native_session_id: session1.id.clone(),
            title: Some(session1.title.clone()),
            custom_scroll_id: None,
            metadata: serde_json::json!({}),
            completeness: Default::default(),
            parent_scroll_id: None,
            is_subagent: false,
            continuation: None,
            agent_id: None,
            subagent_type: None,
            spawning_tool_use_id: None,
        };
        archivist
            .register_session(pre_registration, None)
            .await
            .unwrap();

        let sessions = vec![session1, create_test_session("session-2", "Session 2")];

        // Fetcher stub: one message for every session.
        let fetch_messages = |session_id: &str| {
            let owner = session_id.to_string();
            Box::pin(async move {
                Ok(vec![create_test_message(
                    "msg-1",
                    &owner,
                    MessageRole::User,
                    "Test",
                )])
            }) as BoxFuture<'static, Result<Vec<Message>>>
        };

        let stats = backfill_from_sessions(&archivist, connector_uid, sessions, fetch_messages)
            .await
            .unwrap();

        assert_eq!(stats.sessions_found, 2);
        // Only session-2 is new; session-1 is skipped along with its messages.
        assert_eq!(stats.sessions_imported, 1);
        assert_eq!(stats.sessions_skipped, 1);
        assert_eq!(stats.messages_imported, 1);
        assert!(stats.errors.is_empty());
    }

    #[tokio::test]
    async fn test_backfill_handles_fetch_errors() {
        let (archivist, _temp) = setup_test_archivist().await;
        let connector_uid = register_test_connector(&archivist).await;

        let sessions = vec![create_test_session("session-1", "Session 1")];

        // Fetcher stub that always fails.
        let fetch_messages = |_session_id: &str| {
            Box::pin(async move {
                Err(ArchivistError::InvalidRequest(
                    "Failed to fetch messages".to_string(),
                ))
            }) as BoxFuture<'static, Result<Vec<Message>>>
        };

        let stats = backfill_from_sessions(&archivist, connector_uid, sessions, fetch_messages)
            .await
            .unwrap();

        // The session is registered (and counted) before the fetch fails, so
        // it shows up as imported with zero messages and one recorded error.
        assert_eq!(stats.sessions_found, 1);
        assert_eq!(stats.sessions_imported, 1);
        assert_eq!(stats.messages_imported, 0);
        assert_eq!(stats.errors.len(), 1);
        assert!(stats.errors[0].contains("Failed to fetch messages"));
    }

    #[test]
    fn test_backfill_stats_default() {
        let stats = BackfillStats::default();
        assert_eq!(stats.sessions_found, 0);
        assert_eq!(stats.sessions_imported, 0);
        assert_eq!(stats.sessions_skipped, 0);
        assert_eq!(stats.messages_imported, 0);
        assert!(stats.errors.is_empty());
    }

    #[test]
    fn test_convert_message_to_record() {
        let scroll_id = Uuid::now_v7();
        let record = convert_message_to_record(
            create_test_message("msg-1", "session-1", MessageRole::User, "Hello world"),
            scroll_id,
        );
        assert_eq!(record.session, scroll_id);
        assert_eq!(record.role, "user");
        assert_eq!(record.content_md, "Hello world");
        assert_eq!(record.version, 1);
    }

    #[test]
    fn test_convert_message_with_thinking() {
        let scroll_id = Uuid::now_v7();
        let thinking = dirigent_protocol::MessagePart::Thinking {
            text: "Let me think...".to_string(),
        };
        let msg = Message {
            id: "msg-1".to_string(),
            session_id: "session-1".to_string(),
            role: MessageRole::Assistant,
            created_at: Utc::now(),
            content: vec![thinking],
            status: MessageStatus::Completed,
            metadata: None,
        };
        let record = convert_message_to_record(msg, scroll_id);
        for needle in ["<thinking>", "Let me think...", "</thinking>"] {
            assert!(record.content_md.contains(needle));
        }
    }
}
@@ -1,70 +0,0 @@
//! Admin / inspection methods on `Archivist`.
//!
//! Split out because they aren't part of the hot-path coordinator API:
//! `shutdown` drains queued writer tasks, `list_archives_with_health`
//! snapshots every registration's health + queue depth, and the cache
//! admin methods delegate to `ReadCache`.
use std::sync::Arc;
use tokio::sync::oneshot;
use crate::error::Result;
use crate::registry::writer::WriteOp;
use crate::registry::{ArchiveRegistration, ArchiveStatus};
use super::Archivist;
impl Archivist {
    /// Ask every queued writer task to shut down and wait for it to finish.
    ///
    /// Registrations without a writer (inline backends) are skipped. Intended
    /// to be called before process exit so in-flight batches are flushed.
    /// Failures to reach or join a writer are ignored; this always returns
    /// `Ok(())`.
    pub async fn shutdown(&self) -> Result<()> {
        // Work on a snapshot so the registrations lock is not held while waiting.
        let snapshot: Vec<Arc<ArchiveRegistration>> = self.registrations.read().await.clone();
        for reg in &snapshot {
            let Some(writer) = reg.writer.as_ref() else {
                continue;
            };
            let (ack_tx, ack_rx) = oneshot::channel();
            // A failed send means the writer task is already gone, so there is
            // no acknowledgement to wait for.
            if writer.sender.send(WriteOp::Shutdown(ack_tx)).await.is_ok() {
                let _ = ack_rx.await;
            }
            // Join the task if its handle has not been taken already.
            if let Some(task) = writer.join.lock().await.take() {
                let _ = task.await;
            }
        }
        Ok(())
    }

    /// Snapshot the current status of every registered archive, including its
    /// last recorded health, last error and (for queued backends) the writer's
    /// current queue depth.
    pub async fn list_archives_with_health(&self) -> Vec<ArchiveStatus> {
        let snapshot: Vec<Arc<ArchiveRegistration>> = self.registrations.read().await.clone();
        let mut statuses = Vec::with_capacity(snapshot.len());
        for reg in &snapshot {
            statuses.push(ArchiveStatus {
                name: reg.name.clone(),
                type_name: reg.type_name.to_string(),
                enabled: reg.enabled,
                write_active: reg.write_active,
                failure_mode: reg.failure_mode,
                read_priority: reg.read_priority,
                capabilities: reg.capabilities().clone(),
                health: reg.last_health.read().await.clone(),
                last_error: reg.last_error.read().await.clone(),
                // `None` for backends that have no writer task.
                queue_depth: reg.writer.as_ref().map(|writer| writer.queue_depth_now()),
            });
        }
        statuses
    }

    /// Drop every entry from the read cache.
    pub async fn clear_read_cache(&self) {
        self.read_cache.clear().await;
    }

    /// Number of entries currently held by the read cache.
    pub async fn read_cache_size(&self) -> usize {
        self.read_cache.len().await
    }
}
@@ -1,77 +0,0 @@
//! Archive lifecycle methods for `Archivist`.
//!
//! Phase 3 is **startup-only**: the archive registry is constructed from
//! `dirigent.toml` at boot and not mutated at runtime. Accordingly,
//! `add_archive`, `remove_archive`, and `set_default_archive` all return
//! [`ArchivistError::DynamicRegistryUnsupported`]. The `list_archives`
//! and `get_default_archive` read-paths continue to operate against the
//! new `Vec<Arc<ArchiveRegistration>>` storage.
use std::path::PathBuf;
use crate::coordinator::Archivist;
use crate::error::{ArchivistError, Result};
use crate::registry::FailureMode;
impl Archivist {
    /// **Deprecated in Phase 3.** The archive registry is fixed at boot from
    /// `dirigent.toml`; always fails with `DynamicRegistryUnsupported`.
    pub async fn add_archive(&self, _name: String, _path: PathBuf) -> Result<()> {
        Err(ArchivistError::DynamicRegistryUnsupported)
    }

    /// **Deprecated in Phase 3.** The archive registry is fixed at boot from
    /// `dirigent.toml`; always fails with `DynamicRegistryUnsupported`.
    pub async fn remove_archive(&self, _name: String, _force: bool) -> Result<()> {
        Err(ArchivistError::DynamicRegistryUnsupported)
    }

    /// List all configured archives.
    ///
    /// Several fields are placeholders in the Phase 3 multi-backend
    /// coordinator: `session_count` is always `0` (no per-archive connector
    /// index is persisted; counts are to be reintroduced by the admin-status
    /// query in Task 23), `path` is always empty, and `created_at` is simply
    /// the time of this call. `is_default` marks the archive that
    /// `get_default_archive` would name.
    pub async fn list_archives(&self) -> Result<Vec<super::types::ArchiveInfo>> {
        let regs = self.registrations.read().await;

        // Default = enabled, write-active, `Required` backend with the lowest
        // read priority (first one wins on ties).
        let default_name: Option<&str> = regs
            .iter()
            .filter(|r| r.enabled)
            .filter(|r| r.write_active)
            .filter(|r| r.failure_mode == FailureMode::Required)
            .min_by_key(|r| r.read_priority)
            .map(|r| r.name.as_str());

        let mut infos = Vec::new();
        for reg in regs.iter() {
            infos.push(super::types::ArchiveInfo {
                name: reg.name.clone(),
                path: PathBuf::new(),
                created_at: chrono::Utc::now(),
                session_count: 0,
                is_default: default_name == Some(reg.name.as_str()),
            });
        }
        Ok(infos)
    }

    /// Name of the "default" archive — in Phase 3 this is the enabled,
    /// write-active, `Required` backend with the lowest `read_priority`.
    ///
    /// Fails with `PrimaryUnavailable` when no registration qualifies.
    pub async fn get_default_archive(&self) -> Result<String> {
        let regs = self.registrations.read().await;
        let primary = regs
            .iter()
            .filter(|r| r.enabled)
            .filter(|r| r.write_active)
            .filter(|r| r.failure_mode == FailureMode::Required)
            .min_by_key(|r| r.read_priority);
        match primary {
            Some(reg) => Ok(reg.name.clone()),
            None => Err(ArchivistError::PrimaryUnavailable {
                name: "<default>".into(),
                reason: "no required write-active backend".into(),
            }),
        }
    }

    /// **Deprecated in Phase 3.** The archive registry is fixed at boot from
    /// `dirigent.toml`; always fails with `DynamicRegistryUnsupported`.
    pub async fn set_default_archive(&self, _name: String) -> Result<()> {
        Err(ArchivistError::DynamicRegistryUnsupported)
    }
}
@@ -1,281 +0,0 @@
//! Boot-time construction of the `Archivist` coordinator from a parsed
//! `ArchivesConfig` and a `BackendRegistry` of factories.
use std::sync::Arc;
use tokio::sync::RwLock;
use crate::backend::HealthStatus;
use crate::error::ArchivistBootError;
use crate::registry::{
cache::ReadCache, ArchiveRegistration, ArchivesConfig, BackendRegistry, FailureMode,
WritePolicy,
};
use super::Archivist;
impl Archivist {
/// Construct the coordinator from a parsed `[[archives]]` config block
/// and a registry of backend factories.
///
/// Steps, in order:
///
/// - Validates the config via `ArchivesConfig::validate`, then applies the
///   filter-level rules described inline below.
/// - Rewrites relative `params.path` strings to live under `base_dir`
///   (only when `base_dir` is `Some`).
/// - Builds a backend for each config entry via the factory registry.
///   NOTE(review): the build loop itself does not skip entries with
///   `enabled = false`; confirm whether `validate` filters them.
/// - Runs a startup `health_check` per backend; an `Unavailable` result on
///   a `FailureMode::Required` entry aborts boot.
/// - Spawns a writer task for every entry whose runtime write policy is
///   `WritePolicy::Queued`; `Inline` entries get `writer = None`.
/// - Sorts registrations by `read_priority` (ties by declaration order).
///
/// The returned coordinator starts with a fresh `ReadCache` and an empty
/// `registry_path`.
///
/// # Errors
///
/// Returns whatever `validate` reports, plus
/// `FilterRejectsEverything`, `NoUnrestrictedPrimary`, `UnknownType`,
/// `BackendBuild`, or `UnavailableRequiredBackend` as described above.
pub async fn from_config(
mut config: ArchivesConfig,
registry: &BackendRegistry,
base_dir: Option<&std::path::Path>,
) -> Result<Self, ArchivistBootError> {
config.validate()?;
// Filter-level validation (Phase 4, Task 19).
//
// 1. At least one enabled write-active archive must have an
// unrestricted filter. Otherwise there is no default home for
// a session that does not match any filter, and the primary
// target would silently exclude sessions despite being the
// "write-always" backend.
// 2. No archive may declare a filter whose `include_connectors`
// set is `Some(empty)` — that form rejects every session
// unconditionally and is almost always a config typo.
let mut has_unrestricted_write_active = false;
for entry in &config.entries {
// Rule 2 is checked for every entry, whether enabled or not.
if let Some(inc) = &entry.filter.include_connectors {
if inc.is_empty() {
return Err(ArchivistBootError::FilterRejectsEverything {
archive: entry.name.clone(),
});
}
}
if entry.enabled && entry.write_active && entry.filter.is_unrestricted() {
has_unrestricted_write_active = true;
}
}
// An empty config is allowed through; rule 1 only applies once there is
// at least one entry.
if !config.entries.is_empty() && !has_unrestricted_write_active {
return Err(ArchivistBootError::NoUnrestrictedPrimary);
}
// Resolve relative `params.path` values against `base_dir` so that
// archives declared with relative paths land under the data directory
// rather than the binary's CWD.
if let Some(base) = base_dir {
for entry in &mut config.entries {
// Only table-shaped params with a string `path` key are touched.
if let toml::Value::Table(ref mut table) = entry.params {
if let Some(toml::Value::String(ref mut path_str)) = table.get_mut("path") {
let p = std::path::Path::new(path_str.as_str());
if p.is_relative() {
// The joined path is stored back as a string; non-UTF-8
// components are replaced lossily.
*path_str = base.join(&*path_str).to_string_lossy().into_owned();
}
}
}
}
}
let mut registrations: Vec<Arc<ArchiveRegistration>> = Vec::new();
for entry in config.entries.into_iter() {
// Unknown backend types get a dedicated boot error; every other build
// failure is wrapped together with the archive name.
let backend = registry
.build(&entry.name, &entry.type_name, entry.params)
.await
.map_err(|e| match e {
crate::registry::BackendBuildError::UnknownType(t) => {
ArchivistBootError::UnknownType {
name: entry.name.clone(),
type_name: t,
}
}
other => ArchivistBootError::BackendBuild {
name: entry.name.clone(),
source: other,
},
})?;
// The startup health result seeds the registration's health state and
// gates boot for required backends.
let initial_health = backend.health_check().await;
if entry.failure_mode == FailureMode::Required {
if let HealthStatus::Unavailable { reason } = &initial_health {
return Err(ArchivistBootError::UnavailableRequiredBackend {
name: entry.name.clone(),
reason: reason.clone(),
});
}
}
let runtime_policy: WritePolicy = entry.write_policy.into_runtime();
// Build shared drift state up-front so the writer task (if any)
// and the registration's health-drift helpers mutate the SAME
// `Arc<RwLock<_>>` cells. This keeps Task 22's drift semantics
// coherent across the inline and queued paths.
let health_state: Arc<RwLock<HealthStatus>> =
Arc::new(RwLock::new(initial_health.clone()));
let error_state: Arc<
RwLock<Option<(chrono::DateTime<chrono::Utc>, String)>>,
> = Arc::new(RwLock::new(None));
let failure_counter: Arc<RwLock<u32>> = Arc::new(RwLock::new(0u32));
// Queued backends get a dedicated writer task that shares the state
// cells created above; inline backends have no writer.
let writer = match &runtime_policy {
WritePolicy::Inline => None,
WritePolicy::Queued {
batch_window_ms,
capacity,
overflow,
} => Some(crate::registry::writer::spawn_writer(
backend.clone(),
entry.name.clone(),
*capacity,
std::time::Duration::from_millis(*batch_window_ms),
*overflow,
health_state.clone(),
error_state.clone(),
failure_counter.clone(),
)),
};
// Leak `type_name` to satisfy &'static str on the registration; safe at boot,
// and a constant number of entries (O(archives in config)).
let type_name_static: &'static str = Box::leak(entry.type_name.into_boxed_str());
let registration = ArchiveRegistration::new_with_shared_state(
entry.name,
type_name_static,
backend,
entry.write_active,
entry.failure_mode,
entry.read_priority,
entry.enabled,
runtime_policy,
writer,
health_state,
error_state,
failure_counter,
)
.with_filter(entry.filter);
registrations.push(Arc::new(registration));
}
// Sort by `read_priority`. Rust's sort is stable, so ties keep declaration order.
registrations.sort_by_key(|r| r.read_priority);
Ok(Self {
registrations: RwLock::new(registrations),
read_cache: Arc::new(ReadCache::new()),
// No registry file is associated with a config-booted coordinator.
registry_path: std::path::PathBuf::new(),
})
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::registry::{ArchivesConfig, BackendRegistry};

    /// Parse a one-archive config: a JSONL backend named `main` whose
    /// `params.path` is `path`.
    fn jsonl_archive_config(path: &str) -> ArchivesConfig {
        let toml_src = format!(
            r#"
[[archives]]
name = "main"
type = "jsonl"
[archives.params]
path = "{path}"
"#,
        );
        toml::from_str(&toml_src).unwrap()
    }

    /// Forward-slash rendering of a directory path, safe to embed in TOML.
    fn toml_safe(dir: &std::path::Path) -> String {
        dir.to_string_lossy().replace('\\', "/")
    }

    #[tokio::test]
    async fn relative_archive_path_resolved_against_base_dir() {
        let base = tempfile::tempdir().unwrap();
        let registry = BackendRegistry::with_jsonl();
        let archivist = Archivist::from_config(
            jsonl_archive_config("my_archive"),
            &registry,
            Some(base.path()),
        )
        .await
        .unwrap();

        // Booting must have created the archive under base_dir/my_archive;
        // the `.contexts` directory is the observable evidence.
        let expected = base.path().join("my_archive").join(".contexts");
        assert!(
            expected.exists(),
            "expected {expected:?} to exist after boot with relative path"
        );

        // The coordinator is functional and holds exactly one registration.
        assert_eq!(archivist.list_archives().await.unwrap().len(), 1);
    }

    #[tokio::test]
    async fn absolute_archive_path_not_affected_by_base_dir() {
        let base = tempfile::tempdir().unwrap();
        let archive_dir = tempfile::tempdir().unwrap();
        let registry = BackendRegistry::with_jsonl();
        let archivist = Archivist::from_config(
            jsonl_archive_config(&toml_safe(archive_dir.path())),
            &registry,
            Some(base.path()),
        )
        .await
        .unwrap();

        // The archive lives at the absolute path, NOT under base_dir.
        let expected = archive_dir.path().join(".contexts");
        assert!(
            expected.exists(),
            "expected {expected:?} to exist (absolute path should be used as-is)"
        );

        // If base_dir resolution had wrongly touched the absolute path there
        // would be stray entries under base_dir.
        let base_entries: Vec<_> = std::fs::read_dir(base.path()).unwrap().collect();
        assert!(
            base_entries.is_empty(),
            "base_dir should be untouched when archive path is absolute, found: {base_entries:?}"
        );

        assert_eq!(archivist.list_archives().await.unwrap().len(), 1);
    }

    #[tokio::test]
    async fn none_base_dir_preserves_existing_behavior() {
        let archive_dir = tempfile::tempdir().unwrap();
        let registry = BackendRegistry::with_jsonl();
        let archivist = Archivist::from_config(
            jsonl_archive_config(&toml_safe(archive_dir.path())),
            &registry,
            None,
        )
        .await
        .unwrap();

        let expected = archive_dir.path().join(".contexts");
        assert!(
            expected.exists(),
            "expected {expected:?} to exist with None base_dir and absolute path"
        );

        assert_eq!(archivist.list_archives().await.unwrap().len(), 1);
    }
}
@@ -1,285 +0,0 @@
//! Connector orchestration for `Archivist`.
//!
//! Alias detection and tri-state registration logic live here; persistence is
//! delegated to each backend's `ConnectorRegistryBackend` sub-trait. Ported
//! from `FileBasedArchivist::register_connector` and
//! `MultiArchiveArchivist::resolve_connector_uid`.
use chrono::Utc;
use uuid::Uuid;
use crate::backend::ArchiveCapability;
use crate::coordinator::Archivist;
use crate::error::{ArchivistError, Result};
use crate::types::{
ConnectorRecord, RegisterConnectorRequest, RegisterConnectorResponse, RegisterStatus,
};
impl Archivist {
/// Register a connector with alias detection.
///
/// Ported from `FileBasedArchivist::register_connector`. Decision order:
///
/// 1. If `custom_uid` collides with an existing connector:
/// - same `client_native_id` → `Aliased` (idempotent re-registration).
/// - different `client_native_id` → `CollisionInconsistent` error.
/// 2. If the `client_native_id` is already registered under a different
/// UID → `Aliased` to that pre-existing UID.
/// 3. If a `fingerprint` matches a pre-existing connector → `Aliased` to
/// that UID. (Identity persistence across connector re-adds.)
/// 4. Otherwise → `Accepted`; a new `ConnectorRecord` is persisted via
/// `ConnectorRegistryBackend::put_connector`.
// TODO(phase3 task 16): register_connector fanout requires replicating the
// ConnectorRecord to secondaries. Since connectors are identity-shaped (UIDs
// must match across backends), the tri-state alias detection must stay
// canonical on the primary, but the accepted record should be mirrored to
// secondaries. Deferred to a follow-up within Phase 3 — the core Task 16
// plan covers append_messages and the session mutators which are the hot
// paths. Current behaviour: single-primary via `resolve_backend`.
pub async fn register_connector(
&self,
req: RegisterConnectorRequest,
archive: Option<String>,
) -> Result<RegisterConnectorResponse> {
let backend = self.resolve_backend(archive).await?;
let registry = backend.as_connector_registry().ok_or_else(|| {
ArchivistError::CapabilityNotSupported {
capability: ArchiveCapability::ConnectorRegistry,
backend: "selected".into(),
}
})?;
// Generate connector UID (use custom_uid or generate new)
let connector_uid = req.custom_uid.unwrap_or_else(Uuid::now_v7);
// Load existing (non-alias) connectors for collision detection.
let existing_connectors = registry.list_connectors().await?;
// 1. Check for UID collision.
if let Some(existing) = existing_connectors
.iter()
.find(|c| c.connector_uid == connector_uid)
{
if existing.client_native_id == req.client_native_id {
// Same UID with same client_native_id -> ALIASED (idempotent).
return Ok(RegisterConnectorResponse {
status: RegisterStatus::Aliased,
connector_uid,
alias_of: Some(connector_uid),
note: Some("Connector already registered with this UID".to_string()),
});
} else {
// Same UID with different client_native_id -> REJECTED.
return Err(ArchivistError::CollisionInconsistent(connector_uid));
}
}
// 2. Check for existing client_native_id (different UID collision).
if let Some(existing) = existing_connectors
.iter()
.find(|c| c.client_native_id == req.client_native_id)
{
return Ok(RegisterConnectorResponse {
status: RegisterStatus::Aliased,
connector_uid: existing.connector_uid,
alias_of: Some(existing.connector_uid),
note: Some("Connector already registered with different UID".to_string()),
});
}
// 3. Check for fingerprint match (identity persistence across re-adds).
//
// Note: the original `FileBasedArchivist` additionally refreshed the
// matched connector's `title`/`metadata` on disk and in cache here.
// That refresh bypassed both the TSV index and any backend abstraction
// (direct `read_json`/`write_json` against `connector.json`). The
// `ConnectorRegistryBackend` trait does not yet expose an
// "update metadata" method, and `put_connector` would append a
// duplicate row to the index rather than mutate in place. The refresh
// was best-effort (`let _ = write_json(...)`) and is not exercised by
// existing tests; deliberately skipped here. Re-introduce via a
// dedicated backend method if a consumer relies on it.
if let Some(ref fp) = req.fingerprint {
if let Some(existing) = existing_connectors
.iter()
.find(|c| c.fingerprint.as_deref() == Some(fp.as_str()))
{
let matched_uid = existing.connector_uid;
return Ok(RegisterConnectorResponse {
status: RegisterStatus::Aliased,
connector_uid: matched_uid,
alias_of: Some(matched_uid),
note: Some(format!("Matched by fingerprint: {}", fp)),
});
}
}
// 4. No collision -> ACCEPTED, create and persist new connector.
let now = Utc::now();
let connector_record = ConnectorRecord {
version: 1,
connector_uid,
r#type: req.r#type,
title: req.title,
client_native_id: req.client_native_id,
alias_of: None,
created_at: now,
metadata: req.metadata,
fingerprint: req.fingerprint,
};
registry.put_connector(connector_record).await?;
Ok(RegisterConnectorResponse {
status: RegisterStatus::Accepted,
connector_uid,
alias_of: None,
note: None,
})
}
/// Resolve a connector UID by asking every enabled backend in turn.
///
/// Ported from `MultiArchiveArchivist::resolve_connector_uid`. The first
/// backend that recognises `client_native_id` wins. When a backend does not
/// recognise it and `client_native_id` itself parses as a UUID, that backend
/// is also asked whether it holds a connector record at that UID. Backends
/// without a connector registry are skipped, and backend errors are recorded
/// as read failures without stopping the walk.
///
/// Returns `ConnectorUnknown(Uuid::nil())` when no backend can resolve it.
pub async fn resolve_connector_uid(&self, client_native_id: &str) -> Result<Uuid> {
    // Deliberately not `read_walk_collection`: per the original design note
    // that helper stops at the first backend answering `Ok(_)`, whereas here
    // an `Ok(None)` answer must let the walk continue to the next backend.
    // Health drift is still fed through `record_read_*`.
    let candidates: Vec<_> = self.registrations.read().await.clone();

    // The id-as-UUID interpretation is the same for every backend.
    let literal_uid = Uuid::parse_str(client_native_id).ok();

    for reg in candidates.iter() {
        if !reg.enabled {
            continue;
        }
        let Some(registry) = reg.backend.as_connector_registry() else {
            continue;
        };

        let by_native_id = registry.resolve_connector_uid(client_native_id).await;
        match by_native_id {
            Err(_) => {
                self.record_read_failure(reg).await;
            }
            Ok(Some(uid)) => {
                self.record_read_success(reg).await;
                return Ok(uid);
            }
            Ok(None) => {
                self.record_read_success(reg).await;
                // Secondary path: the native id may already be a connector UID.
                let Some(parsed) = literal_uid else {
                    continue;
                };
                match registry.get_connector(parsed).await {
                    Ok(Some(_)) => return Ok(parsed),
                    Ok(None) => {}
                    Err(_) => {
                        self.record_read_failure(reg).await;
                    }
                }
            }
        }
    }

    Err(ArchivistError::ConnectorUnknown(Uuid::nil()))
}
/// List the connectors (non-aliases only) of the selected archive.
///
/// With `archive = Some(name)` the named backend is queried directly;
/// an unknown name yields `ArchiveNameUnknown` and a backend without a
/// connector registry yields `CapabilityNotSupported`. With `None`, enabled
/// backends are walked in `read_priority` order and the first
/// `ConnectorRegistry`-capable answer is returned; if the walk produces no
/// answer the result is an empty list.
pub async fn list_connectors(
    &self,
    archive: Option<String>,
) -> Result<Vec<ConnectorRecord>> {
    // No explicit archive: let the read walk pick the backend.
    let Some(name) = archive else {
        let walked = self
            .read_walk_collection(
                |reg| reg.backend.as_connector_registry().is_some(),
                |backend| async move {
                    let cr = backend
                        .as_connector_registry()
                        .expect("predicate ensured");
                    cr.list_connectors().await
                },
            )
            .await?;
        return Ok(walked.unwrap_or_default());
    };

    // Explicit override: resolve exactly that backend or fail.
    let found = self.find_registration(&name).await;
    let Some(reg) = found else {
        return Err(ArchivistError::ArchiveNameUnknown(name));
    };
    let Some(registry) = reg.backend.as_connector_registry() else {
        return Err(ArchivistError::CapabilityNotSupported {
            capability: ArchiveCapability::ConnectorRegistry,
            backend: reg.name.clone(),
        });
    };
    registry.list_connectors().await
}
/// Update the stable fingerprint of an existing connector.
///
/// The primary backend is updated first; if it lacks a connector registry
/// the call fails with `PrimaryUnavailable`, and if its update fails the
/// error is returned without touching any other backend. The update is then
/// repeated on every other enabled, write-active backend that has a
/// connector registry. A failure there is recorded, and aborts the call
/// only when that backend's failure mode is `Required`.
///
/// NOTE: read-mutate-write on the backend side; falls through to inline
/// under `WritePolicy::Queued` (no `WriteOp` variant).
pub async fn update_connector_fingerprint(
    &self,
    connector_uid: Uuid,
    fingerprint: String,
    archive: Option<String>,
) -> Result<()> {
    let primary = self.resolve_primary(archive).await?;
    let snapshot: Vec<std::sync::Arc<crate::registry::ArchiveRegistration>> =
        self.registrations.read().await.clone();

    let Some(primary_registry) = primary.backend.as_connector_registry() else {
        return Err(ArchivistError::PrimaryUnavailable {
            name: primary.name.clone(),
            reason: "backend lacks ConnectorRegistry capability".into(),
        });
    };

    // The primary is authoritative: its failure ends the call.
    match primary_registry
        .update_connector_fingerprint(connector_uid, fingerprint.clone())
        .await
    {
        Ok(_) => {
            self.record_write_success(&primary).await;
        }
        Err(e) => {
            self.record_write_failure(&primary, &e.to_string()).await;
            return Err(e);
        }
    }

    // Mirror the change to the remaining write targets.
    for reg in snapshot.iter() {
        if reg.name == primary.name || !reg.enabled || !reg.write_active {
            continue;
        }
        let Some(secondary_registry) = reg.backend.as_connector_registry() else {
            tracing::debug!(
                backend = reg.name.as_str(),
                type_name = reg.type_name,
                op = "update_connector_fingerprint",
                "capability_skip"
            );
            continue;
        };
        match secondary_registry
            .update_connector_fingerprint(connector_uid, fingerprint.clone())
            .await
        {
            Ok(_) => {
                self.record_write_success(reg).await;
            }
            Err(e) => {
                self.record_write_failure(reg, &e.to_string()).await;
                // Only a required secondary turns its failure into an error.
                if reg.failure_mode == crate::registry::FailureMode::Required {
                    return Err(e);
                }
            }
        }
    }

    Ok(())
}
}
@@ -1,526 +0,0 @@
//! Meta events, DAG, and cleanup orchestration for `Archivist`.
//!
//! Ported from `FileBasedArchivist` in `archivist.rs`. Meta events and DAG
//! methods are thin delegates over `as_meta_events()` / `as_dag()`;
//! `get_session_tree` performs a recursive DAG walk; `cleanup_empty_sessions`
//! pages through all sessions and deletes those with zero messages (skipping
//! `SessionKind::AcpConnection` meta sessions, which track events rather than
//! messages).
use uuid::Uuid;
use crate::backend::ArchiveCapability;
use crate::coordinator::Archivist;
use crate::error::{ArchivistError, Result};
use crate::types::{
DagEdge, MetaEventRecord, SessionKind, SessionListQuery, SessionMetadata, MAX_PAGE_LIMIT,
};
impl Archivist {
// ------------------------------------------------------------------
// Meta events
// ------------------------------------------------------------------
/// Append `events` to the meta-event log of session `scroll_id`.
///
/// The primary archive (resolved from `archive`) is written first, either
/// inline or by enqueueing on its writer, depending on its `write_policy`.
/// The events are then fanned out to every other enabled, write-active
/// registration that passes its session filter and exposes the `MetaEvents`
/// capability.
///
/// # Errors
/// - `PrimaryUnavailable` if the primary lacks the `MetaEvents` capability.
/// - Any error from the primary write or enqueue.
/// - An error from a secondary only when its `failure_mode` is `Required`;
///   failures on other secondaries are recorded (inline) or logged (queued)
///   and then skipped.
///
/// # Panics
/// Panics if a registration with `WritePolicy::Queued` has no writer handle.
pub async fn append_meta_events(
    &self,
    scroll_id: Uuid,
    events: Vec<MetaEventRecord>,
    archive: Option<String>,
) -> Result<()> {
    let primary = self.resolve_primary(archive).await?;
    let regs: Vec<std::sync::Arc<crate::registry::ArchiveRegistration>> =
        self.registrations.read().await.clone();

    // Primary must have MetaEvents capability even in the queued path —
    // the writer task dispatches to `as_meta_events()`, so we'd silently
    // drop events on an incapable primary. Fail fast here.
    if primary.backend.as_meta_events().is_none() {
        return Err(ArchivistError::PrimaryUnavailable {
            name: primary.name.clone(),
            reason: "backend lacks MetaEvents capability".into(),
        });
    }

    match &primary.write_policy {
        crate::registry::WritePolicy::Inline => {
            let primary_meta = primary
                .backend
                .as_meta_events()
                .expect("capability checked above");
            if let Err(e) = primary_meta
                .append_meta_events(scroll_id, events.clone())
                .await
            {
                self.record_write_failure(&primary, &format!("{e}")).await;
                return Err(e);
            }
            self.record_write_success(&primary).await;
        }
        crate::registry::WritePolicy::Queued { .. } => {
            let writer = primary
                .writer
                .as_ref()
                .expect("queued policy implies writer handle present");
            writer
                .enqueue(crate::registry::writer::WriteOp::AppendMetaEvents {
                    scroll_id,
                    events: events.clone(),
                })
                .await?;
        }
    }

    let session_metadata_for_filter = self
        .load_metadata_for_filter(scroll_id, &regs, &primary.name)
        .await;

    for reg in regs.iter() {
        if reg.name == primary.name {
            continue;
        }
        if !reg.enabled || !reg.write_active {
            continue;
        }
        if !Self::filter_allows(reg, session_metadata_for_filter.as_ref()) {
            tracing::debug!(
                archive = %reg.name,
                scroll_id = %scroll_id,
                op = "append_meta_events",
                "filter_skip"
            );
            continue;
        }
        if reg.backend.as_meta_events().is_none() {
            tracing::debug!(
                backend = reg.name.as_str(),
                type_name = reg.type_name,
                op = "append_meta_events",
                "capability_skip"
            );
            continue;
        }
        match &reg.write_policy {
            crate::registry::WritePolicy::Inline => {
                let me = reg
                    .backend
                    .as_meta_events()
                    .expect("capability checked above");
                if let Err(e) = me.append_meta_events(scroll_id, events.clone()).await {
                    self.record_write_failure(reg, &format!("{e}")).await;
                    if reg.failure_mode == crate::registry::FailureMode::Required {
                        return Err(e);
                    }
                } else {
                    self.record_write_success(reg).await;
                }
            }
            crate::registry::WritePolicy::Queued { .. } => {
                let writer = reg
                    .writer
                    .as_ref()
                    .expect("queued policy implies writer handle present");
                if let Err(e) = writer
                    .enqueue(crate::registry::writer::WriteOp::AppendMetaEvents {
                        scroll_id,
                        events: events.clone(),
                    })
                    .await
                {
                    if reg.failure_mode == crate::registry::FailureMode::Required {
                        return Err(e);
                    }
                    // Best-effort secondary: the write is dropped, but leave
                    // a trace so the loss is not completely silent.
                    tracing::warn!(
                        archive = %reg.name,
                        scroll_id = %scroll_id,
                        error = %e,
                        op = "append_meta_events",
                        "enqueue_failed"
                    );
                }
            }
        }
    }
    Ok(())
}
/// Fetch the meta events recorded for session `scroll_id`.
///
/// `_archive` is ignored: the per-session read walk chooses the backend,
/// considering only those that expose `MetaEvents`. When no backend
/// answers, an empty list is returned.
pub async fn get_meta_events(
    &self,
    scroll_id: Uuid,
    _archive: Option<String>,
) -> Result<Vec<MetaEventRecord>> {
    let located = self
        .read_walk_per_session(
            scroll_id,
            |candidate| candidate.backend.as_meta_events().is_some(),
            |backend| async move {
                let meta = backend.as_meta_events().expect("predicate ensured");
                // Any successful answer counts as "this backend has it".
                meta.get_meta_events(scroll_id).await.map(Some)
            },
        )
        .await?;
    match located {
        Some(events) => Ok(events),
        None => Ok(Vec::new()),
    }
}
/// Record the connection state of an ACP meta-session.
///
/// The primary archive (resolved from `archive`) is updated first and its
/// failure aborts the call. The same update is then applied to every other
/// enabled, write-active registration that exposes `MetaEvents`; a failure
/// there is recorded and only propagated when the registration's
/// `failure_mode` is `Required`.
///
/// NOTE: this is a read-mutate-write on the backend side (the impl rewrites
/// fields on the stored session), so it always runs inline, even under
/// `WritePolicy::Queued` (there is no `WriteOp` variant).
pub async fn update_meta_session_status(
    &self,
    scroll_id: Uuid,
    is_connected: bool,
    current_session_id: Option<Uuid>,
    archive: Option<String>,
) -> Result<()> {
    let primary = self.resolve_primary(archive.clone()).await?;
    let snapshot: Vec<std::sync::Arc<crate::registry::ArchiveRegistration>> =
        self.registrations.read().await.clone();

    let Some(primary_meta) = primary.backend.as_meta_events() else {
        return Err(ArchivistError::PrimaryUnavailable {
            name: primary.name.clone(),
            reason: "backend lacks MetaEvents capability".into(),
        });
    };

    match primary_meta
        .update_meta_session_status(scroll_id, is_connected, current_session_id)
        .await
    {
        Ok(_) => {
            self.record_write_success(&primary).await;
        }
        Err(err) => {
            self.record_write_failure(&primary, &err.to_string()).await;
            return Err(err);
        }
    }

    // Mirror the status onto the remaining write targets.
    for reg in snapshot.iter() {
        if reg.name == primary.name || !reg.enabled || !reg.write_active {
            continue;
        }
        let Some(meta) = reg.backend.as_meta_events() else {
            tracing::debug!(
                backend = reg.name.as_str(),
                type_name = reg.type_name,
                op = "update_meta_session_status",
                "capability_skip"
            );
            continue;
        };
        match meta
            .update_meta_session_status(scroll_id, is_connected, current_session_id)
            .await
        {
            Ok(_) => {
                self.record_write_success(reg).await;
            }
            Err(err) => {
                self.record_write_failure(reg, &err.to_string()).await;
                if reg.failure_mode == crate::registry::FailureMode::Required {
                    return Err(err);
                }
            }
        }
    }
    Ok(())
}
/// List the meta sessions known to the first backend that can answer.
///
/// `_archive` is not honoured: the collection walk picks the first eligible
/// backend that exposes `MetaEvents`. When no backend answers, an empty
/// list is returned.
pub async fn list_meta_sessions(
    &self,
    _archive: Option<String>,
) -> Result<Vec<SessionMetadata>> {
    let sessions = self
        .read_walk_collection(
            |candidate| candidate.backend.as_meta_events().is_some(),
            |backend| async move {
                let meta = backend.as_meta_events().expect("predicate ensured");
                meta.list_meta_sessions().await
            },
        )
        .await?;
    Ok(sessions.unwrap_or_else(Vec::new))
}
/// Look up the meta session belonging to ACP client `client_id`.
///
/// `_archive` is not honoured: the collection walk asks the first eligible
/// backend that exposes `MetaEvents`. "No backend answered" and "the
/// backend answered `None`" both come back as `Ok(None)`.
pub async fn find_meta_session_by_client(
    &self,
    client_id: &str,
    _archive: Option<String>,
) -> Result<Option<SessionMetadata>> {
    // Owned copy so every per-backend future can carry its own `String`.
    let wanted = client_id.to_owned();
    let answer = self
        .read_walk_collection(
            |candidate| candidate.backend.as_meta_events().is_some(),
            |backend| {
                let wanted = wanted.clone();
                async move {
                    let meta = backend.as_meta_events().expect("predicate ensured");
                    meta.find_meta_session_by_client(&wanted).await
                }
            },
        )
        .await?;
    // Collapse the walker's outer `Option` into the backend's inner one.
    match answer {
        Some(found) => Ok(found),
        None => Ok(None),
    }
}
// ------------------------------------------------------------------
// DAG
// ------------------------------------------------------------------
/// Append one parent→child `edge` to the session DAG.
///
/// The primary archive (resolved from `archive`) is written first, either
/// inline or by enqueueing on its writer, depending on its `write_policy`.
/// The edge is then fanned out to every other enabled, write-active
/// registration that passes its session filter (evaluated against the
/// edge's parent session) and exposes the `Dag` capability.
///
/// # Errors
/// - `PrimaryUnavailable` if the primary lacks the `Dag` capability.
/// - Any error from the primary write or enqueue.
/// - An error from a secondary only when its `failure_mode` is `Required`;
///   failures on other secondaries are recorded (inline) or logged (queued)
///   and then skipped.
///
/// # Panics
/// Panics if a registration with `WritePolicy::Queued` has no writer handle.
pub async fn append_dag_edge(
    &self,
    edge: DagEdge,
    archive: Option<String>,
) -> Result<()> {
    let primary = self.resolve_primary(archive).await?;
    let regs: Vec<std::sync::Arc<crate::registry::ArchiveRegistration>> =
        self.registrations.read().await.clone();

    // Primary must have DAG capability — even in the queued path the
    // writer task dispatches to `as_dag()`, so silently accepting a
    // non-DAG primary would lose the edge.
    if primary.backend.as_dag().is_none() {
        return Err(ArchivistError::PrimaryUnavailable {
            name: primary.name.clone(),
            reason: "backend lacks Dag capability".into(),
        });
    }

    match &primary.write_policy {
        crate::registry::WritePolicy::Inline => {
            let primary_dag = primary
                .backend
                .as_dag()
                .expect("capability checked above");
            if let Err(e) = primary_dag.append_dag_edge(edge.clone()).await {
                self.record_write_failure(&primary, &format!("{e}")).await;
                return Err(e);
            }
            self.record_write_success(&primary).await;
        }
        crate::registry::WritePolicy::Queued { .. } => {
            let writer = primary
                .writer
                .as_ref()
                .expect("queued policy implies writer handle present");
            writer
                .enqueue(crate::registry::writer::WriteOp::AppendDagEdge(edge.clone()))
                .await?;
        }
    }

    // DAG edges are indexed under the parent scroll_id, so use that for
    // filter evaluation (the session whose DAG is being extended).
    let parent_scroll_id = edge.parent;
    let session_metadata_for_filter = self
        .load_metadata_for_filter(parent_scroll_id, &regs, &primary.name)
        .await;

    for reg in regs.iter() {
        if reg.name == primary.name {
            continue;
        }
        if !reg.enabled || !reg.write_active {
            continue;
        }
        if !Self::filter_allows(reg, session_metadata_for_filter.as_ref()) {
            tracing::debug!(
                archive = %reg.name,
                scroll_id = %parent_scroll_id,
                op = "append_dag_edge",
                "filter_skip"
            );
            continue;
        }
        if reg.backend.as_dag().is_none() {
            tracing::debug!(
                backend = reg.name.as_str(),
                type_name = reg.type_name,
                op = "append_dag_edge",
                "capability_skip"
            );
            continue;
        }
        match &reg.write_policy {
            crate::registry::WritePolicy::Inline => {
                let d = reg.backend.as_dag().expect("capability checked above");
                if let Err(e) = d.append_dag_edge(edge.clone()).await {
                    self.record_write_failure(reg, &format!("{e}")).await;
                    if reg.failure_mode == crate::registry::FailureMode::Required {
                        return Err(e);
                    }
                } else {
                    self.record_write_success(reg).await;
                }
            }
            crate::registry::WritePolicy::Queued { .. } => {
                let writer = reg
                    .writer
                    .as_ref()
                    .expect("queued policy implies writer handle present");
                if let Err(e) = writer
                    .enqueue(crate::registry::writer::WriteOp::AppendDagEdge(edge.clone()))
                    .await
                {
                    if reg.failure_mode == crate::registry::FailureMode::Required {
                        return Err(e);
                    }
                    // Best-effort secondary: the edge is dropped, but leave
                    // a trace so the loss is not completely silent.
                    tracing::warn!(
                        archive = %reg.name,
                        scroll_id = %parent_scroll_id,
                        error = %e,
                        op = "append_dag_edge",
                        "enqueue_failed"
                    );
                }
            }
        }
    }
    Ok(())
}
/// Return the sessions recorded as direct children of `scroll_id`.
///
/// `_archive` is ignored: the per-session read walk chooses the backend,
/// considering only those that expose `Dag`. When no backend answers, an
/// empty list is returned.
pub async fn get_children(
    &self,
    scroll_id: Uuid,
    _archive: Option<String>,
) -> Result<Vec<SessionMetadata>> {
    let located = self
        .read_walk_per_session(
            scroll_id,
            |candidate| candidate.backend.as_dag().is_some(),
            |backend| async move {
                let dag = backend.as_dag().expect("predicate ensured");
                // Any successful answer counts as "this backend has it".
                dag.get_children(scroll_id).await.map(Some)
            },
        )
        .await?;
    match located {
        Some(children) => Ok(children),
        None => Ok(Vec::new()),
    }
}
/// Collect every DAG edge reachable from `root_scroll_id`.
///
/// Mirrors `FileBasedArchivist::get_session_tree`: the result contains the
/// edges to children, grandchildren, and so on. Edges are fetched one
/// parent at a time via `get_dag_edges`, and a visited set makes sure each
/// parent is expanded only once, which also protects against cycles.
///
/// Fails with `CapabilityNotSupported` when the resolved backend has no
/// `Dag` capability, and propagates any `get_dag_edges` error.
pub async fn get_session_tree(
    &self,
    root_scroll_id: Uuid,
    archive: Option<String>,
) -> Result<Vec<DagEdge>> {
    // TODO(phase3): consider a multi-backend DAG walk in a future phase.
    // Today a single backend (resolved once, up front) serves the whole
    // traversal, because a consistent walk needs every `get_dag_edges`
    // call to hit the SAME backend as the root and the read walker does
    // not expose that yet.
    let backend = self.resolve_backend(archive).await?;
    let Some(dag) = backend.as_dag() else {
        return Err(ArchivistError::CapabilityNotSupported {
            capability: ArchiveCapability::Dag,
            backend: "selected".into(),
        });
    };

    let mut visited = std::collections::HashSet::new();
    let mut pending = vec![root_scroll_id];
    let mut collected = Vec::new();
    while let Some(node) = pending.pop() {
        // `insert` returns false for a node that was already expanded.
        if visited.insert(node) {
            let outgoing = dag.get_dag_edges(node).await?;
            for edge in &outgoing {
                pending.push(edge.child);
            }
            collected.extend(outgoing);
        }
    }
    Ok(collected)
}
// ------------------------------------------------------------------
// Cleanup
// ------------------------------------------------------------------
/// Remove every session that holds no messages.
///
/// Ported from `FileBasedArchivist::cleanup_empty_sessions`. All sessions,
/// hidden ones included, are paged through with `list_sessions_paged`; each
/// one's messages are counted and the session is deleted when the count is
/// zero. Meta sessions (`SessionKind::AcpConnection`) are never deleted —
/// they track connection events in `events.jsonl`, not messages, so an
/// empty message log is expected for them.
///
/// Counting or deletion failures for an individual session are logged and
/// skipped; only a failure to list a page aborts the cleanup.
///
/// Returns `(deleted, total_scanned)`.
pub async fn cleanup_empty_sessions(
    &self,
    archive: Option<String>,
) -> Result<(usize, usize)> {
    let backend = self.resolve_backend(archive).await?;
    let (mut removed, mut scanned) = (0usize, 0usize);
    let mut query = SessionListQuery {
        include_hidden: true,
        limit: MAX_PAGE_LIMIT,
        ..SessionListQuery::default()
    };

    loop {
        let page = backend.list_sessions_paged(query.clone()).await?;
        for session in page.items.iter() {
            scanned += 1;

            // Meta sessions legitimately have no messages; leave them be.
            if session.kind == SessionKind::AcpConnection {
                tracing::debug!(
                    scroll_id = %session.scroll_id,
                    "Skipping meta session (AcpConnection) during cleanup"
                );
                continue;
            }

            match backend.count_messages(session.scroll_id).await {
                Ok(0) => {}
                Ok(_) => continue,
                Err(err) => {
                    // Legacy semantics: when the count is unknown, keep the
                    // session rather than risk deleting a non-empty one.
                    tracing::warn!(
                        scroll_id = %session.scroll_id,
                        error = %err,
                        "Failed to count messages for session, skipping cleanup"
                    );
                    continue;
                }
            }

            if let Err(err) = backend.delete_session(session.scroll_id).await {
                tracing::warn!(
                    scroll_id = %session.scroll_id,
                    error = %err,
                    "Failed to delete empty session during cleanup"
                );
            } else {
                tracing::info!(
                    scroll_id = %session.scroll_id,
                    "Deleted empty session during cleanup"
                );
                removed += 1;
            }
        }

        // Advance to the next page, or stop once the listing is exhausted.
        let Some(cursor) = page.next_cursor else {
            break;
        };
        query.cursor = Some(cursor);
    }

    tracing::info!(
        deleted = removed,
        total = scanned,
        "Completed empty session cleanup"
    );
    Ok((removed, scanned))
}
}
@@ -1,231 +0,0 @@
//! Concrete archivist coordinator.
//!
//! Owns a `Vec<Arc<ArchiveRegistration>>` sorted by `read_priority`, plus a
//! positive `scroll_id → backend` cache. The registry is constructed from
//! `dirigent.toml` at boot (Task 12). `Archivist::new` remains a legacy
//! convenience for the dev-instance migration path; later tasks migrate
//! consumers to `Archivist::from_config`.
mod admin;
mod archives;
mod boot;
mod connectors;
mod meta;
mod routing;
mod sessions;
pub mod types;
pub use types::{ArchiveInfo, ArchiveMetadata};
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::RwLock;
use crate::backend::ArchiveBackend;
use crate::error::{ArchivistError, Result};
use crate::registry::{
cache::ReadCache, ArchiveRegistration, FailureMode, WritePolicy,
};
/// Coordinator that fronts every registered archive backend.
///
/// Holds the shared list of registrations, the per-session read cache, and
/// the registry path the coordinator was constructed with.
pub struct Archivist {
/// All known archive registrations, guarded by an async `RwLock`.
pub(crate) registrations: RwLock<Vec<Arc<ArchiveRegistration>>>,
/// Cache handle shared via `Arc`; the per-session read walk looks up and
/// stores the name of the backend that answered for a `scroll_id`.
#[allow(dead_code)] // wired up in later tasks (cache-backed reads)
pub(crate) read_cache: Arc<ReadCache>,
/// Path handed to the constructor; empty for the single-backend and
/// test constructors.
#[allow(dead_code)] // retained for future admin endpoints / diagnostics
pub(crate) registry_path: PathBuf,
}
impl Archivist {
/// Legacy constructor that registers one `JsonlBackend` named "main".
///
/// The backend is rooted at `registry_path.parent()` (or at `registry_path`
/// itself when it has no parent). An empty `registry_path` produces a
/// coordinator with no registrations at all. Kept so dev-instance
/// migration still succeeds before Task 28 migrates consumers to
/// `from_config`.
pub async fn new(registry_path: PathBuf) -> Result<Self> {
    use crate::backends::JsonlBackend;

    let registrations: Vec<Arc<ArchiveRegistration>> = if registry_path.as_os_str().is_empty() {
        Vec::new()
    } else {
        let archive_root = match registry_path.parent() {
            Some(dir) => dir.to_path_buf(),
            None => registry_path.clone(),
        };
        let backend: Arc<dyn ArchiveBackend> =
            Arc::new(JsonlBackend::new(archive_root).await?);
        let initial_health = backend.health_check().await;
        let registration = ArchiveRegistration::new(
            "main".into(),
            "jsonl",
            backend,
            /* write_active */ true,
            FailureMode::Required,
            /* read_priority */ 0,
            /* enabled */ true,
            WritePolicy::Inline,
            /* writer */ None,
            initial_health,
        );
        vec![Arc::new(registration)]
    };

    Ok(Self {
        registrations: RwLock::new(registrations),
        read_cache: Arc::new(ReadCache::new()),
        registry_path,
    })
}
/// Build a coordinator whose only archive is a `JsonlBackend` named "main"
/// rooted at `archive_root`.
///
/// The registration is enabled, write-active, `Required`, inline-written,
/// and has read priority 0. The coordinator's `registry_path` is left empty.
pub async fn new_with_single_archive(archive_root: PathBuf) -> Result<Self> {
    use crate::backends::JsonlBackend;

    let backend: Arc<dyn ArchiveBackend> = Arc::new(JsonlBackend::new(archive_root).await?);
    let initial_health = backend.health_check().await;
    let registration = ArchiveRegistration::new(
        "main".into(),
        "jsonl",
        backend,
        /* write_active */ true,
        FailureMode::Required,
        /* read_priority */ 0,
        /* enabled */ true,
        WritePolicy::Inline,
        /* writer */ None,
        initial_health,
    );

    Ok(Self {
        registrations: RwLock::new(vec![Arc::new(registration)]),
        read_cache: Arc::new(ReadCache::new()),
        registry_path: PathBuf::new(),
    })
}
/// Wrap an already-constructed backend in a coordinator (for tests that
/// need to hold the backend directly alongside the coordinator).
///
/// The backend is registered under `name` with type name "external" as an
/// enabled, write-active, `Required`, inline-written archive with read
/// priority 0. The coordinator's `registry_path` is left empty.
pub async fn from_single_backend(
    name: String,
    backend: Arc<dyn ArchiveBackend>,
) -> Result<Self> {
    let initial_health = backend.health_check().await;
    let registration = ArchiveRegistration::new(
        name,
        "external",
        backend,
        /* write_active */ true,
        FailureMode::Required,
        /* read_priority */ 0,
        /* enabled */ true,
        WritePolicy::Inline,
        /* writer */ None,
        initial_health,
    );

    Ok(Self {
        registrations: RwLock::new(vec![Arc::new(registration)]),
        read_cache: Arc::new(ReadCache::new()),
        registry_path: PathBuf::new(),
    })
}
/// Pick one backend, optionally by name.
///
/// - `Some(name)` → the registration with exactly that name, or
///   `ArchiveNameUnknown`. Its `enabled` / `write_active` flags are not
///   consulted here.
/// - `None` → the enabled, write-active, `Required` registration with the
///   lowest `read_priority`, or `PrimaryUnavailable` when there is none.
///
/// Fails with `NoArchiveConfigured` when nothing is registered at all.
#[allow(dead_code)] // wired up in later tasks
pub(crate) async fn resolve_backend(
    &self,
    archive: Option<String>,
) -> Result<Arc<dyn ArchiveBackend>> {
    let regs = self.registrations.read().await;
    if regs.is_empty() {
        return Err(ArchivistError::NoArchiveConfigured);
    }

    if let Some(name) = archive {
        let Some(named) = regs.iter().find(|r| r.name == name) else {
            return Err(ArchivistError::ArchiveNameUnknown(name));
        };
        return Ok(Arc::clone(&named.backend));
    }

    let default_target = regs
        .iter()
        .filter(|r| r.enabled && r.write_active && r.failure_mode == FailureMode::Required)
        .min_by_key(|r| r.read_priority);
    match default_target {
        Some(reg) => Ok(Arc::clone(&reg.backend)),
        None => Err(ArchivistError::PrimaryUnavailable {
            name: "<default>".into(),
            reason: "no required write-active backend".into(),
        }),
    }
}
/// Pick the registration that a write should treat as its primary target.
///
/// - `Some(name)` → the registration with that name; `ArchiveNameUnknown`
///   if it does not exist, `PrimaryUnavailable` if it is disabled or not
///   write-active.
/// - `None` → the default write target: the enabled, write-active,
///   `Required` registration with the lowest `read_priority`, or
///   `PrimaryUnavailable` when there is none.
///
/// Fails with `NoArchiveConfigured` when nothing is registered at all.
#[allow(dead_code)] // wired up in Task 16
pub(crate) async fn resolve_primary(
    &self,
    archive: Option<String>,
) -> Result<Arc<crate::registry::ArchiveRegistration>> {
    let regs = self.registrations.read().await;
    if regs.is_empty() {
        return Err(ArchivistError::NoArchiveConfigured);
    }

    let Some(name) = archive else {
        // No explicit target: fall back to the default write target.
        let default_target = regs
            .iter()
            .filter(|r| {
                r.enabled
                    && r.write_active
                    && r.failure_mode == crate::registry::FailureMode::Required
            })
            .min_by_key(|r| r.read_priority);
        return match default_target {
            Some(reg) => Ok(Arc::clone(reg)),
            None => Err(ArchivistError::PrimaryUnavailable {
                name: "<default>".into(),
                reason: "no required write-active backend".into(),
            }),
        };
    };

    let Some(named) = regs.iter().find(|r| r.name == name) else {
        return Err(ArchivistError::ArchiveNameUnknown(name));
    };

    // A named target must actually be able to take writes.
    let refusal = if !named.enabled {
        Some("backend is disabled")
    } else if !named.write_active {
        Some("backend is not write-active")
    } else {
        None
    };
    match refusal {
        Some(reason) => Err(ArchivistError::PrimaryUnavailable {
            name: named.name.clone(),
            reason: reason.into(),
        }),
        None => Ok(Arc::clone(named)),
    }
}
}
#[cfg(any(test, feature = "test-utils"))]
impl Archivist {
    /// Test-only: wrap an already-built registration list in a coordinator.
    ///
    /// The list is stored as given; the read cache starts out fresh and the
    /// registry path is left empty.
    pub fn from_registrations(regs: Vec<Arc<ArchiveRegistration>>) -> Self {
        let registrations = RwLock::new(regs);
        let read_cache = Arc::new(ReadCache::new());
        let registry_path = PathBuf::new();
        Archivist {
            registrations,
            read_cache,
            registry_path,
        }
    }
}
#[cfg(test)]
mod tests;
@@ -1,136 +0,0 @@
//! Read priority walk shared by every per-scroll_id and collection-shape
//! coordinator method.
//!
//! The walk honours per-backend `enabled`, caller-supplied capability
//! predicates, and current health. Per-scroll_id reads populate a positive
//! LRU cache keyed on `scroll_id`, so the second read for the same session
//! can short-circuit the priority walk.
use std::sync::Arc;
use uuid::Uuid;
use crate::backend::ArchiveBackend;
use crate::error::Result;
use crate::registry::ArchiveRegistration;
use super::Archivist;
impl Archivist {
/// Find the first backend able to answer a read for session `scroll_id`.
///
/// Registrations are visited in the order they are stored in
/// `self.registrations` (the module docs describe that list as sorted by
/// `read_priority`). A registration is skipped when it is disabled, when
/// `predicate` rejects it (typically a capability check), or when its last
/// recorded health is `Unavailable`.
///
/// `op` is invoked on each remaining backend until one answers:
/// - `Ok(Some(value))` — wins the walk; the backend's name is cached for
///   `scroll_id` and the value is returned.
/// - `Ok(None)` — backend doesn't have it; a read success is recorded and
///   the walk continues.
/// - `Err(_)` — a read failure is recorded for the backend and the walk
///   continues; the error itself is discarded.
///
/// Before the walk, a cached backend name for `scroll_id` (if any) is tried
/// first. A cached answer is returned as-is; a cached miss or error
/// invalidates the cache entry and falls through to the full walk.
///
/// This function never returns `Err`: when no backend produces a value the
/// result is `Ok(None)`.
pub(crate) async fn read_walk_per_session<T, F, Fut, P>(
&self,
scroll_id: Uuid,
predicate: P,
op: F,
) -> Result<Option<T>>
where
T: Send,
P: Fn(&ArchiveRegistration) -> bool + Send + Sync,
F: Fn(Arc<dyn ArchiveBackend>) -> Fut + Send + Sync,
Fut: std::future::Future<Output = Result<Option<T>>> + Send,
{
// Cache hit: try the cached backend first.
if let Some(cached_name) = self.read_cache.get(scroll_id).await {
// The cached name may refer to a registration that no longer exists;
// in that case simply fall through to the walk.
if let Some(reg) = self.find_registration(&cached_name).await {
// Same eligibility rules as the walk below.
if predicate(&reg) && reg.enabled && !self.is_unavailable(&reg).await {
match op(reg.backend.clone()).await {
// Fast path: no health bookkeeping and no cache write happen here.
Ok(Some(value)) => return Ok(Some(value)),
Ok(None) => {
// Cached entry no longer holds — invalidate and fall through.
self.read_cache.invalidate(scroll_id).await;
}
Err(_) => {
// Cached backend failed: note the failure, drop the entry, fall through.
self.record_read_failure(&reg).await;
self.read_cache.invalidate(scroll_id).await;
}
}
}
}
}
// Priority walk.
// Work on a cloned snapshot so the registrations lock is not held while
// backends are being queried.
let regs: Vec<Arc<ArchiveRegistration>> = self.registrations.read().await.clone();
for reg in regs.iter() {
if !reg.enabled || !predicate(reg) || self.is_unavailable(reg).await {
continue;
}
match op(reg.backend.clone()).await {
Ok(Some(value)) => {
// Remember which backend answered so the next read can skip the walk.
self.record_read_success(reg).await;
self.read_cache.put(scroll_id, reg.name.clone()).await;
return Ok(Some(value));
}
Ok(None) => {
// The backend responded fine; it just does not hold this session.
self.record_read_success(reg).await;
continue;
}
Err(_) => {
self.record_read_failure(reg).await;
continue;
}
}
}
// Nobody had an answer.
Ok(None)
}
/// Collection-shape read: return the answer of the first eligible backend.
///
/// Registrations are visited in stored order. One is skipped when it is
/// disabled, when `predicate` rejects it, or when its last recorded health
/// is `Unavailable`. Unlike the per-session walk there is no cache, and
/// `op` returns `Result<T>` rather than `Result<Option<T>>`: `Ok(T)` is the
/// answer, while an error is treated as "this backend couldn't serve the
/// read" — a read failure is recorded and the next backend is tried.
///
/// Returns `Ok(None)` when no backend produced an answer; this function
/// never returns `Err`.
pub(crate) async fn read_walk_collection<T, F, Fut, P>(
    &self,
    predicate: P,
    op: F,
) -> Result<Option<T>>
where
    T: Send,
    P: Fn(&ArchiveRegistration) -> bool + Send + Sync,
    F: Fn(Arc<dyn ArchiveBackend>) -> Fut + Send + Sync,
    Fut: std::future::Future<Output = Result<T>> + Send,
{
    // Snapshot the list so the lock is released before any backend I/O.
    let snapshot: Vec<Arc<ArchiveRegistration>> = self.registrations.read().await.clone();
    for reg in snapshot.iter() {
        let eligible = reg.enabled && predicate(reg) && !self.is_unavailable(reg).await;
        if !eligible {
            continue;
        }
        let outcome = op(reg.backend.clone()).await;
        if let Ok(value) = outcome {
            self.record_read_success(reg).await;
            return Ok(Some(value));
        }
        self.record_read_failure(reg).await;
    }
    Ok(None)
}
/// Look up a registration by its exact `name`.
///
/// Returns a cloned `Arc` handle to the first match, or `None` when no
/// registration carries that name.
pub(crate) async fn find_registration(
    &self,
    name: &str,
) -> Option<Arc<ArchiveRegistration>> {
    let regs = self.registrations.read().await;
    for reg in regs.iter() {
        if reg.name == name {
            return Some(Arc::clone(reg));
        }
    }
    None
}
/// Report whether the registration's last recorded health is `Unavailable`.
async fn is_unavailable(&self, reg: &ArchiveRegistration) -> bool {
    let health = reg.last_health.read().await;
    matches!(&*health, crate::backend::HealthStatus::Unavailable { .. })
}
}
File diff suppressed because it is too large Load Diff
@@ -1,195 +0,0 @@
//! Coordinator orchestration unit tests using `MockBackend`.
//!
//! These tests exercise alias detection, move/copy semantics, DAG walks,
//! and cleanup policies without any disk I/O.
#![cfg(test)]
use std::sync::Arc;
use tokio::sync::RwLock;
use uuid::Uuid;
use crate::backend::mock::MockBackend;
use crate::backend::ArchiveBackend;
use crate::coordinator::Archivist;
use crate::registry::{
cache::ReadCache, ArchiveRegistration, FailureMode, WritePolicy,
};
use crate::types::{
DagEdge, MessageRecord, RegisterConnectorRequest, RegisterStatus, SessionCompleteness,
SessionKind, SessionMetadata,
};
/// Build a minimal `SessionMetadata` for tests.
///
/// Only `scroll_id` and `connector_uid` come from the caller. Both
/// timestamps share one "now" reading, the session is a complete `Chat`
/// session, and every optional field is empty.
fn blank_session(scroll_id: Uuid, connector_uid: Uuid) -> SessionMetadata {
    let stamp = chrono::Utc::now();
    SessionMetadata {
        // Identity and timestamps.
        version: 1,
        scroll_id,
        connector_uid,
        created_at: stamp,
        updated_at: stamp,
        kind: SessionKind::Chat,
        completeness: SessionCompleteness::Complete,

        // Descriptive fields, all left empty.
        title: None,
        native_session_id: None,
        agent_id: None,
        tags: Vec::new(),
        metadata: serde_json::Value::Null,
        no_update: false,
        models: None,
        modes: None,
        config_options: None,

        // Lineage.
        parent_scroll_id: None,
        continuation: None,
        is_subagent: false,
        subagent_type: None,
        spawning_tool_use_id: None,

        // ACP connection state.
        acp_client_id: None,
        is_connected: None,
        current_session_id: None,

        // Matrix sharing.
        matrix_room_id: None,
        matrix_sharing_active: false,
        matrix_shared_at: None,
    }
}
/// Build a minimal "user" `MessageRecord` belonging to `session`.
///
/// The message gets a fresh v7 `message_id` and the current timestamp; its
/// content is empty and every optional field is unset.
fn blank_message(session: Uuid) -> MessageRecord {
    let message_id = Uuid::now_v7();
    let ts = chrono::Utc::now();
    MessageRecord {
        version: 1,
        message_id,
        session,
        ts,
        role: "user".into(),
        parent_id: None,
        author: None,
        content_md: String::new(),
        content_parts: None,
        attachments: Vec::new(),
        metadata: serde_json::Value::Null,
    }
}
/// Build a coordinator backed by exactly one in-memory `MockBackend`.
///
/// The mock is registered as "main": enabled, write-active, `Required`,
/// inline-written, read priority 0.
async fn make_coordinator_with_single_mock() -> Archivist {
    let mock: Arc<dyn ArchiveBackend> = Arc::new(MockBackend::new());
    let health = mock.health_check().await;
    let registration = ArchiveRegistration::new(
        "main".into(),
        "mock",
        mock,
        /* write_active */ true,
        FailureMode::Required,
        /* read_priority */ 0,
        /* enabled */ true,
        WritePolicy::Inline,
        /* writer */ None,
        health,
    );
    let registrations = RwLock::new(vec![Arc::new(registration)]);
    Archivist {
        registrations,
        read_cache: Arc::new(ReadCache::new()),
        registry_path: std::path::PathBuf::from("/mock/.archives.json"),
    }
}
/// A first-time registration is accepted and receives a non-nil UID.
#[tokio::test]
async fn register_connector_assigns_uid_and_returns_accepted() {
    let coordinator = make_coordinator_with_single_mock().await;

    let response = coordinator
        .register_connector(
            RegisterConnectorRequest {
                r#type: "OpenCode".into(),
                title: "test".into(),
                client_native_id: "opencode@localhost".into(),
                custom_uid: None,
                metadata: serde_json::Value::Null,
                fingerprint: None,
            },
            None,
        )
        .await
        .expect("register");

    assert!(matches!(response.status, RegisterStatus::Accepted));
    assert_ne!(response.connector_uid, Uuid::nil());
}
#[tokio::test]
async fn register_connector_aliases_on_duplicate_native_id() {
let coord = make_coordinator_with_single_mock().await;
let mk_req = || RegisterConnectorRequest {
r#type: "OpenCode".into(),
title: "test".into(),
client_native_id: "opencode@localhost".into(),
custom_uid: None,
metadata: serde_json::Value::Null,
fingerprint: None,
};
let first = coord.register_connector(mk_req(), None).await.unwrap();
let second = coord.register_connector(mk_req(), None).await.unwrap();
assert_eq!(second.connector_uid, first.connector_uid);
assert!(matches!(second.status, RegisterStatus::Aliased));
}
/// The tree walk returns edges from every level below the root: two
/// children plus one grandchild make three edges.
#[tokio::test]
async fn get_session_tree_walks_full_dag() {
    let coordinator = make_coordinator_with_single_mock().await;
    let connector_uid = Uuid::now_v7();
    let root = Uuid::now_v7();
    let child_a = Uuid::now_v7();
    let child_b = Uuid::now_v7();
    let grand = Uuid::now_v7();

    // Seed the sessions directly on the backend.
    let backend = coordinator.registrations.read().await[0].backend.clone();
    let sessions = [root, child_a, child_b, grand];
    for scroll_id in sessions {
        let session = blank_session(scroll_id, connector_uid);
        backend.put_session(session).await.unwrap();
    }

    // Link them through the coordinator: root → {a, b}, a → grand.
    let links = [(root, child_a), (root, child_b), (child_a, grand)];
    for (parent, child) in links {
        let edge = DagEdge {
            parent,
            child,
            agent_id: String::new(),
            subagent_type: None,
            tool_use_id: None,
            ts: Some(chrono::Utc::now()),
        };
        coordinator.append_dag_edge(edge, None).await.unwrap();
    }

    let edges = coordinator.get_session_tree(root, None).await.unwrap();
    assert_eq!(edges.len(), 3, "expected 3 edges, got {}", edges.len());
}
/// Cleanup removes the session without messages and keeps the one that has
/// a message, reporting one deletion out of two scanned sessions.
#[tokio::test]
async fn cleanup_empty_sessions_deletes_only_message_less_sessions() {
    let coordinator = make_coordinator_with_single_mock().await;
    let connector_uid = Uuid::now_v7();
    let empty = Uuid::now_v7();
    let populated = Uuid::now_v7();

    let backend = coordinator.registrations.read().await[0].backend.clone();
    let sessions = [empty, populated];
    for scroll_id in sessions {
        let session = blank_session(scroll_id, connector_uid);
        backend.put_session(session).await.unwrap();
    }
    // Give exactly one of the two sessions a message.
    let message = blank_message(populated);
    backend
        .append_messages(populated, vec![message])
        .await
        .unwrap();

    let (deleted, total) = coordinator.cleanup_empty_sessions(None).await.unwrap();

    assert_eq!(deleted, 1);
    assert_eq!(total, 2);
    assert!(backend.get_session(empty).await.unwrap().is_none());
    assert!(backend.get_session(populated).await.unwrap().is_some());
}
@@ -1,60 +0,0 @@
//! Shared data types used by the archivist coordinator.
//!
//! `ArchiveMetadata` is persisted per-archive in the registry file and
//! tracks creation time, path, and the set of connectors registered in
//! the archive. `ArchiveInfo` is the display-friendly projection returned
//! from listing APIs; it extends the metadata with computed fields like
//! session count and default-archive status.
use std::path::PathBuf;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
/// Metadata describing a single archive.
///
/// Carries what is needed to track and display an archive (name, root path,
/// creation time, connector UIDs) without holding a backend instance.
/// Derives `Serialize` / `Deserialize`, so it can be stored and reloaded.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchiveMetadata {
/// Unique name for this archive (e.g., "personal", "work", "experiments").
pub name: String,
/// Filesystem path to the archive root directory.
pub path: PathBuf,
/// When this archive was first registered with the coordinator (UTC).
pub created_at: DateTime<Utc>,
/// UIDs of the connectors registered in this archive.
///
/// Gives a quick way to see which connectors belong to which archive.
/// NOTE(review): keeping this list in sync on register/unregister is the
/// responsibility of code outside this type — confirm against callers.
pub connector_uids: Vec<Uuid>,
}
/// Display-friendly view of an archive.
///
/// Intended as the return shape of listing operations: it repeats the
/// archive's name, path and creation time and adds the computed
/// `session_count` and `is_default` fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchiveInfo {
/// Unique name for this archive.
pub name: String,
/// Filesystem path to the archive root directory.
pub path: PathBuf,
/// When this archive was first registered (UTC).
pub created_at: DateTime<Utc>,
/// Total number of sessions across all connectors in this archive.
///
/// This is a computed value; producing it requires counting sessions and
/// may be expensive for large archives.
pub session_count: usize,
/// Whether this is the current default archive.
pub is_default: bool,
}
-314
View File
@@ -1,314 +0,0 @@
//! Error types for the Archivist.
//!
//! This module defines all error types that can occur during archival operations,
//! including I/O errors, JSON errors, and domain-specific errors for connectors
//! and sessions.
use std::path::PathBuf;
use thiserror::Error;
use uuid::Uuid;
/// Crate-wide `Result` alias whose error type is fixed to [`ArchivistError`].
///
/// Lets archivist APIs write `Result<T>` instead of spelling out the error
/// type on every signature.
pub type Result<T> = std::result::Result<T, ArchivistError>;
/// Errors that can occur during archival operations.
///
/// `Display` output is generated by `thiserror` from the `#[error(...)]`
/// attribute on each variant. `std::io::Error` and `serde_json::Error`
/// convert automatically via the `#[from]` variants [`ArchivistError::Io`]
/// and [`ArchivistError::Json`].
#[derive(Debug, Error)]
pub enum ArchivistError {
/// Connector with the given UID was not found
#[error("Connector not found: {0}")]
ConnectorUnknown(Uuid),
/// Session with the given scroll ID was not found
#[error("Session not found: {0}")]
SessionUnknown(Uuid),
/// UUID collision detected with inconsistent data
///
/// This occurs when a custom UUID is provided that matches an existing
/// entity but with different attributes (e.g., different connector type).
#[error("UUID collision: {0}")]
CollisionInconsistent(Uuid),
/// Invalid request (e.g., missing required fields, invalid format)
#[error("Invalid request: {0}")]
InvalidRequest(String),
/// I/O error during file operations
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// JSON serialization/deserialization error
#[error("JSON error: {0}")]
Json(#[from] serde_json::Error),
// Multi-archive errors
/// Invalid archive name (empty or contains invalid characters)
#[error("Invalid archive name: {0}")]
InvalidArchiveName(String),
/// Archive already exists with the given name
#[error("Archive already exists: {0}")]
ArchiveAlreadyExists(String),
/// Archive not found with the given name
// NOTE(review): overlaps in meaning with `ArchiveNameUnknown` below — confirm
// which call sites use which variant before consolidating.
#[error("Archive not found: {0}")]
ArchiveNotFound(String),
/// Archive path conflict (path is already used by another archive)
#[error("Archive path conflict: {0}")]
ArchivePathConflict(PathBuf),
/// Cannot remove default archive without force flag
#[error("Cannot remove default archive without force flag")]
CannotRemoveDefaultArchive,
/// Archive is not empty (has sessions)
#[error("Archive '{name}' is not empty ({session_count} sessions)")]
ArchiveNotEmpty {
/// Name of the archive that still contains sessions.
name: String,
/// Number of sessions found in the archive.
session_count: usize,
},
/// No archives configured
// NOTE(review): differs from `NoArchiveConfigured` (singular, "ephemeral
// mode") only by one letter — easy to mix up; confirm both are needed.
#[error("No archives configured")]
NoArchivesConfigured,
/// Failed to load registry file
#[error("Failed to load registry: {0}")]
RegistryLoadError(String),
/// Failed to parse registry JSON
#[error("Failed to parse registry: {0}")]
RegistryParseError(String),
/// Failed to serialize registry to JSON
#[error("Failed to serialize registry: {0}")]
RegistrySerializeError(String),
/// Failed to write registry file
#[error("Failed to write registry: {0}")]
RegistryWriteError(String),
/// Backend is unavailable (e.g., disk full, connection lost, degraded state)
#[error("Backend {name} is unavailable")]
BackendUnavailable { name: String },
/// Backend does not support the requested capability
#[error("Backend {backend} does not support capability {capability:?}")]
CapabilityNotSupported {
/// The capability that was requested (rendered with `Debug` in the message).
capability: crate::backend::ArchiveCapability,
/// Name of the backend that lacks the capability.
backend: String,
},
/// Health check for a backend failed
#[error("Health check for backend {name} failed: {reason}")]
BackendHealthCheckFailed { name: String, reason: String },
/// Primary write backend is unavailable or misconfigured.
#[error("primary write backend `{name}` is unavailable: {reason}")]
PrimaryUnavailable { name: String, reason: String },
/// Session exists on a read-only (not write_active) backend; deletion impossible.
#[error("session {scroll_id} exists in read-only backend `{backend}`; cannot delete")]
DeleteOnReadOnlyBackend { backend: String, scroll_id: uuid::Uuid },
/// Move succeeded at the destination but source-side delete failed.
#[error("partial move: copy to `{copied_to}` succeeded but source-side delete failed: {delete_error}")]
PartialMove {
/// Name of the destination the data was copied to.
copied_to: String,
/// The error returned by the source-side delete; boxed because the
/// enum is recursive here.
delete_error: Box<ArchivistError>,
},
/// Queued-write backend's queue is full.
#[error("write queue full for backend `{backend}` (op `{op}`)")]
WriteQueueFull {
/// Name of the backend whose queue rejected the write.
backend: String,
/// Static label of the operation that could not be queued.
op: &'static str,
},
/// The coordinator has no archive configured (ephemeral mode).
#[error("no archive is configured (ephemeral mode)")]
NoArchiveConfigured,
/// A requested archive name does not exist in the registry.
#[error("archive name `{0}` is unknown")]
ArchiveNameUnknown(String),
/// Runtime mutation of the archive registry is not supported in Phase 3.
#[error("dynamic registry mutation is not supported (Phase 3 is startup-only)")]
DynamicRegistryUnsupported,
/// Catch-all for injected failures / legacy call sites. Prefer a typed variant when possible.
#[error("{0}")]
Other(String),
}
/// Errors raised exclusively at boot, by `Archivist::from_config`.
///
/// Each variant's `Display` text comes from its `#[error(...)]` attribute.
#[derive(Debug, thiserror::Error)]
pub enum ArchivistBootError {
/// The same archive name appears more than once in the config.
#[error("duplicate archive name `{0}` in config")]
DuplicateName(String),
/// An archive entry declares a backend type that is not recognised.
#[error("archive `{name}` declares unknown type `{type_name}`")]
UnknownType { name: String, type_name: String },
/// No `required` write-active backend is configured.
#[error("no `required` write-active backend configured (need at least one primary)")]
NoPrimary,
/// Building the named backend failed; the underlying build error is
/// exposed as the error source.
#[error("backend `{name}` failed to build: {source}")]
BackendBuild {
name: String,
#[source]
source: crate::registry::BackendBuildError,
},
/// A backend marked as required is unavailable at boot.
#[error("required backend `{name}` is unavailable at boot: {reason}")]
UnavailableRequiredBackend { name: String, reason: String },
/// Every enabled, write-active backend has a non-empty filter.
#[error("no unrestricted write-active archive — at least one enabled, write_active backend must have an empty filter")]
NoUnrestrictedPrimary,
/// The archive's filter has an empty `include_connectors` list and
/// therefore rejects every session.
#[error("filter for archive `{archive}` rejects all sessions (include_connectors is empty)")]
FilterRejectsEverything { archive: String },
/// Config validation failed; converts automatically from
/// `ConfigValidationError` via `#[from]`.
#[error("config validation failed: {0}")]
Validation(#[from] crate::registry::ConfigValidationError),
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::io;

    /// The simple tuple variants render their payload after a fixed prefix.
    #[test]
    fn test_error_display() {
        let id = Uuid::now_v7();

        assert_eq!(
            ArchivistError::ConnectorUnknown(id).to_string(),
            format!("Connector not found: {}", id)
        );
        assert_eq!(
            ArchivistError::SessionUnknown(id).to_string(),
            format!("Session not found: {}", id)
        );
        assert_eq!(
            ArchivistError::CollisionInconsistent(id).to_string(),
            format!("UUID collision: {}", id)
        );

        let invalid = ArchivistError::InvalidRequest("missing field".to_string());
        assert_eq!(invalid.to_string(), "Invalid request: missing field");
    }

    /// `io::Error` converts into the `Io` variant and keeps kind and message.
    #[test]
    fn test_io_error_conversion() {
        let source = io::Error::new(io::ErrorKind::NotFound, "file not found");
        let converted: ArchivistError = source.into();

        if let ArchivistError::Io(inner) = converted {
            assert_eq!(inner.kind(), io::ErrorKind::NotFound);
            assert_eq!(inner.to_string(), "file not found");
        } else {
            panic!("Expected Io variant");
        }
    }

    /// `serde_json::Error` converts into the `Json` variant.
    #[test]
    fn test_json_error_conversion() {
        // Parsing garbage is the simplest way to obtain a serde_json error.
        let source = serde_json::from_str::<serde_json::Value>("invalid json").unwrap_err();
        let converted: ArchivistError = source.into();

        if let ArchivistError::Json(_) = converted {
            // Expected variant; nothing further to check.
        } else {
            panic!("Expected Json variant");
        }
    }

    /// The crate-level `Result<T>` alias composes with the `?` operator.
    #[test]
    fn test_result_type_with_question_mark() {
        fn parse_then_succeed() -> Result<String> {
            let _parsed: serde_json::Value = serde_json::from_str(r#"{"key": "value"}"#)?;
            Ok("success".to_string())
        }

        let outcome = parse_then_succeed();
        assert!(outcome.is_ok());
        assert_eq!(outcome.unwrap(), "success");
    }

    /// An `io::Error` raised in a callee surfaces as `ArchivistError::Io`
    /// when propagated with `?`.
    #[test]
    fn test_error_chain() {
        fn failing_io() -> std::io::Result<String> {
            Err(std::io::Error::new(
                std::io::ErrorKind::NotFound,
                "inner error",
            ))
        }

        fn propagate() -> Result<String> {
            // `?` performs the io::Error -> ArchivistError conversion.
            let _unused = failing_io()?;
            Ok("success".to_string())
        }

        let outcome = propagate();
        assert!(outcome.is_err());
        if let Err(ArchivistError::Io(inner)) = outcome {
            assert_eq!(inner.kind(), std::io::ErrorKind::NotFound);
        } else {
            panic!("Expected Io error");
        }
    }

    /// The derived `Debug` output names the variant and includes the payload.
    #[test]
    fn test_error_debug() {
        let id = Uuid::now_v7();
        let rendered = format!("{:?}", ArchivistError::ConnectorUnknown(id));

        assert!(rendered.contains("ConnectorUnknown"));
        assert!(rendered.contains(&id.to_string()));
    }

    /// A sample of variants all produce non-empty `Display` and `Debug` text.
    #[test]
    fn test_all_error_variants() {
        let id = Uuid::now_v7();
        let samples = vec![
            ArchivistError::ConnectorUnknown(id),
            ArchivistError::SessionUnknown(id),
            ArchivistError::CollisionInconsistent(id),
            ArchivistError::InvalidRequest("test".to_string()),
            ArchivistError::Io(io::Error::new(io::ErrorKind::Other, "test")),
            ArchivistError::Json(serde_json::from_str::<serde_json::Value>("bad").unwrap_err()),
        ];

        for sample in &samples {
            assert!(
                !sample.to_string().is_empty(),
                "Error display should not be empty"
            );
            assert!(
                !format!("{:?}", sample).is_empty(),
                "Error debug should not be empty"
            );
        }
    }
}
File diff suppressed because it is too large Load Diff
-933
View File
@@ -1,933 +0,0 @@
//! Generic import infrastructure for bringing external sessions into the archive.
//!
//! This module provides the shared types and orchestration logic that all importers
//! (Claude, ChatGPT, etc.) reuse. Each importer implements discovery and message
//! conversion, then delegates to [`import_sessions`] for the actual import.
pub mod progress;
pub mod registry;
pub mod sources;
pub mod trait_def;
/// Backwards-compatible re-export — external callers (e.g. `api`) import
/// `dirigent_archivist::import::claude::{discover_claude_import,
/// import_claude_sessions}`. Keep the path stable until those callsites
/// migrate to the `Importer` trait.
pub use sources::claude;
#[cfg(feature = "importer-claude")]
pub use sources::claude::ClaudeImporter;
pub use progress::{ImportProgressEvent, ImportProgressSink, SessionOutcome, StatsDelta};
pub use registry::ImporterRegistry;
pub use trait_def::{ConfigField, ConfigFieldKind, ImportConfig, ImportConfigShape, ImportError, ImportTarget, Importer, ImporterInfo};
use std::collections::HashMap;
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use crate::coordinator::Archivist;
use crate::error::{ArchivistError, Result};
use crate::types::{
MessageRecord, RegisterConnectorRequest, RegisterSessionRequest, RegisterStatus,
SessionCompleteness,
};
/// Statistics collected during an import operation.
///
/// All counters start at zero via `Default`. Per-session failures are
/// recorded as strings in `errors` rather than aborting the import.
#[derive(Debug, Clone, Serialize, Deserialize, Default, PartialEq)]
pub struct ImportStats {
/// Number of sessions found by the importer's discovery phase.
pub sessions_discovered: usize,
/// Number of sessions successfully imported as new.
pub sessions_imported: usize,
/// Number of sessions skipped (already present with same or more messages).
pub sessions_skipped: usize,
/// Number of sessions that were updated with new messages.
pub sessions_updated: usize,
/// Total number of message records written to the archive.
pub messages_written: usize,
/// Number of messages that were already present (from existing sessions).
pub messages_already_present: usize,
/// Number of sessions skipped because the fingerprint matched (no source changes).
///
/// `#[serde(default)]` lets payloads that lack this field still
/// deserialize, with the counter set to zero.
#[serde(default)]
pub sessions_fingerprint_skipped: usize,
/// Errors encountered during import (non-fatal; import continues).
pub errors: Vec<String>,
}
impl ImportStats {
    /// Number of sessions that reached a final state: imported, skipped,
    /// updated, or failed (one entry in `errors` counts as one session).
    pub fn total_sessions_processed(&self) -> usize {
        let failed = self.errors.len();
        let succeeded = self.sessions_imported + self.sessions_skipped + self.sessions_updated;
        succeeded + failed
    }

    /// Returns `true` when at least one error was recorded.
    pub fn has_errors(&self) -> bool {
        let clean = self.errors.is_empty();
        !clean
    }
}
/// Intermediate representation for a session discovered by any importer.
///
/// This is source-agnostic: each importer converts its native session format
/// into `DiscoveredSession` before handing it to [`import_sessions`].
/// `message_count`, `updated_at` and `file_size` also act as the change
/// signals compared by [`ImportSnapshot::matches`].
#[derive(Debug, Clone)]
pub struct DiscoveredSession {
/// The session ID from the original source (e.g., Claude's JSONL filename).
pub native_session_id: String,
/// Human-readable session title, if available.
pub title: Option<String>,
/// When the session was created in the source system.
pub created_at: Option<DateTime<Utc>>,
/// When the session was last updated in the source system.
pub updated_at: Option<DateTime<Utc>>,
/// Number of messages in the source session (used for skip/update decisions).
pub message_count: usize,
/// Arbitrary source-specific metadata preserved for provenance.
pub metadata: serde_json::Value,
/// Project path associated with the session, if known.
pub project_path: Option<String>,
/// Size of the source file in bytes, if available. Used for fingerprint-based
/// change detection to skip unchanged sessions on re-import.
pub file_size: Option<u64>,
}
/// Snapshot of source-side signals captured after a successful import.
///
/// Stored in the session's `metadata` JSON under the `"_import_snapshot"` key.
/// On re-import, comparing the current `DiscoveredSession` against the stored
/// snapshot lets us skip expensive full-parse when nothing has changed (O(1) gate).
///
/// `imported_at` is bookkeeping only; it is not consulted by
/// [`ImportSnapshot::matches`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct ImportSnapshot {
/// Number of messages in the source at the time of import.
pub source_message_count: usize,
/// Source-side `updated_at` timestamp at the time of import.
pub source_updated_at: Option<DateTime<Utc>>,
/// Source file size in bytes at the time of import.
pub source_file_size: Option<u64>,
/// When this snapshot was recorded.
pub imported_at: DateTime<Utc>,
}
/// Key used to store [`ImportSnapshot`] in session metadata JSON.
///
/// Read by `ImportSnapshot::from_metadata` and written by
/// `ImportSnapshot::write_to_metadata`.
const IMPORT_SNAPSHOT_KEY: &str = "_import_snapshot";
impl ImportSnapshot {
    /// Check whether the source signals in `discovered` match this snapshot.
    ///
    /// Message count and `updated_at` must be equal. File size is compared
    /// only when both the snapshot and `discovered` carry a value; a missing
    /// size on either side is treated as "no evidence of change".
    ///
    /// Returns `true` when nothing indicates a change, meaning a full
    /// re-parse of the session can be skipped.
    pub fn matches(&self, discovered: &DiscoveredSession) -> bool {
        if self.source_message_count != discovered.message_count
            || self.source_updated_at != discovered.updated_at
        {
            return false;
        }
        match (self.source_file_size, discovered.file_size) {
            (Some(snap_size), Some(disc_size)) => snap_size == disc_size,
            // At least one side has no size recorded: nothing to compare.
            _ => true,
        }
    }

    /// Build a snapshot from a discovered session (captures current source signals).
    ///
    /// `imported_at` is set to the current UTC time.
    pub fn from_discovered(discovered: &DiscoveredSession) -> Self {
        Self {
            source_message_count: discovered.message_count,
            source_updated_at: discovered.updated_at,
            source_file_size: discovered.file_size,
            imported_at: Utc::now(),
        }
    }

    /// Try to deserialize a snapshot from a session's metadata JSON.
    ///
    /// Returns `None` when the `_import_snapshot` key is absent or its value
    /// does not deserialize into an [`ImportSnapshot`].
    pub fn from_metadata(metadata: &serde_json::Value) -> Option<Self> {
        // `&serde_json::Value` is itself a `Deserializer`, so the stored
        // subtree can be decoded in place without cloning it first.
        metadata
            .get(IMPORT_SNAPSHOT_KEY)
            .and_then(|stored| Self::deserialize(stored).ok())
    }

    /// Serialize this snapshot into the session's metadata JSON under the
    /// `_import_snapshot` key.
    ///
    /// This is best-effort: if `metadata` is not a JSON object, or the
    /// snapshot cannot be serialized, a warning is logged and `metadata` is
    /// left untouched.
    pub fn write_to_metadata(&self, metadata: &mut serde_json::Value) {
        match metadata.as_object_mut() {
            Some(obj) => match serde_json::to_value(self) {
                Ok(val) => {
                    obj.insert(IMPORT_SNAPSHOT_KEY.to_string(), val);
                }
                // Previously dropped silently; surface it so a missing
                // snapshot can be diagnosed.
                Err(e) => {
                    tracing::warn!(
                        error = %e,
                        "cannot write import snapshot: serialization failed"
                    );
                }
            },
            None => {
                tracing::warn!("cannot write import snapshot: metadata is not a JSON object");
            }
        }
    }
}
/// Summary returned by the discovery phase before actual import begins.
///
/// Serializable via serde (`Serialize`/`Deserialize` are derived).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportDiscovery {
/// Human-readable name of the import source (e.g., "Claude Code").
pub source_name: String,
/// Filesystem path or URI that was scanned.
pub source_path: String,
/// Projects discovered, grouped by name.
pub projects: Vec<ImportProject>,
/// Total number of sessions found across all projects.
pub total_sessions: usize,
/// Estimated total messages across all discovered sessions.
pub total_estimated_messages: usize,
}
/// A project grouping within an import discovery result.
///
/// Used as the element type of `ImportDiscovery::projects`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportProject {
/// Project name (typically derived from the directory path).
pub name: String,
/// Number of sessions belonging to this project.
pub session_count: usize,
}
/// Picks the `updated_at` value to store for an imported session.
///
/// The source's own timestamp wins when `discovered.updated_at` is present;
/// otherwise the current UTC time is used as a fallback.
fn resolve_updated_at(discovered: &DiscoveredSession) -> DateTime<Utc> {
    match discovered.updated_at {
        Some(source_ts) => source_ts,
        None => chrono::Utc::now(),
    }
}
/// Generic async orchestrator that imports discovered sessions into the archive.
///
/// This function handles the full import lifecycle:
/// 1. Registers the connector (intended to be idempotent).
/// 2. For each discovered session, resolves its scroll ID; sessions unknown
///    to the archive are registered first (with `project_id` injected from
///    `project_map` when the session's `project_path` has an entry).
/// 3. For existing sessions, links a missing `project_id` from `project_map`
///    and, unless `force_deep_scan` is set, skips the session when its stored
///    [`ImportSnapshot`] still matches the source.
/// 4. Converts the source messages and appends only those whose `message_id`
///    is not already archived for the session.
/// 5. For existing sessions, updates title, model and `project_path` when the
///    source supplies a value that differs from the archived one.
/// 6. Writes a fresh [`ImportSnapshot`] into the session metadata (best-effort).
///
/// The `convert_messages` closure receives a `native_session_id` and returns
/// `MessageRecord`s with `Uuid::nil()` in the `session` field. This function
/// patches each record's `session` to the real `scroll_id` before appending.
///
/// A `SessionStarted` and a `SessionFinished` progress event are emitted for
/// every session, including the ones that fail or are skipped.
///
/// # Arguments
///
/// * `archivist` - The archivist to import into.
/// * `connector_req` - Registration request for the import connector.
/// * `sessions` - Sessions discovered by the importer.
/// * `convert_messages` - Closure that converts a native session into `MessageRecord`s.
/// * `archive` - Optional archive name (`None` for default archive).
/// * `progress` - Sink for per-session progress events (use
/// [`ImportProgressSink::noop`] when progress reporting is not needed).
/// * `force_deep_scan` - When `true`, the fingerprint gate is bypassed and
/// every existing session goes through message conversion and comparison.
/// * `project_map` - Lookup from project path to project ID, used to set
/// `project_id` in session metadata.
///
/// # Errors
///
/// Returns `Err` only when connector registration fails. Per-session failures
/// are appended to `ImportStats::errors` and the loop moves on to the next
/// session.
pub async fn import_sessions<F>(
archivist: &Archivist,
connector_req: RegisterConnectorRequest,
sessions: Vec<DiscoveredSession>,
convert_messages: F,
archive: Option<String>,
progress: &ImportProgressSink,
force_deep_scan: bool,
project_map: &HashMap<String, String>,
) -> Result<ImportStats>
where
F: Fn(&str) -> Result<Vec<MessageRecord>> + Send + Sync,
{
let mut stats = ImportStats::default();
stats.sessions_discovered = sessions.len();
// Step 1: Register the connector (idempotent).
let connector_resp = archivist
.register_connector(connector_req, archive.clone())
.await?;
let connector_uid = connector_resp.connector_uid;
tracing::info!(
connector_uid = %connector_uid,
status = ?connector_resp.status,
"Import connector registered"
);
// Step 2: Process each discovered session.
let total = sessions.len();
for (index, session) in sessions.iter().enumerate() {
let native_id = &session.native_session_id;
progress
.send(ImportProgressEvent::SessionStarted {
native_id: native_id.clone(),
index,
total,
})
.await;
// Per-session outcome + stats delta. Updated as we go; on the early
// `continue` paths we emit Failed/Skipped before moving on.
let mut messages_written_delta: u64 = 0;
let mut messages_already_present_delta: u64 = 0;
let mut session_changed = false;
// Helper: emit SessionFinished carrying the deltas accumulated so far.
// The macro itself does not `continue`; each call site does that.
macro_rules! emit_finished {
($outcome:expr) => {{
progress
.send(ImportProgressEvent::SessionFinished {
native_id: native_id.clone(),
outcome: $outcome,
stats_delta: StatsDelta {
messages_written: messages_written_delta,
messages_already_present: messages_already_present_delta,
},
})
.await;
}};
}
// --- Step 1: Resolve or create scroll_id BEFORE convert_messages ---
let (scroll_id, session_is_new) = match archivist
.resolve_session(connector_uid, native_id, archive.clone())
.await
{
Ok(id) => (id, false),
Err(ArchivistError::SessionUnknown(_)) => {
// Inject project_id from project_map if the session has a
// project_path that maps to a known project.
let mut metadata = session.metadata.clone();
if let Some(project_path) = session.project_path.as_deref() {
if let Some(pid) = project_map.get(project_path) {
if let Some(obj) = metadata.as_object_mut() {
obj.insert(
"project_id".to_string(),
serde_json::Value::String(pid.clone()),
);
}
}
}
let register_req = RegisterSessionRequest {
connector_uid,
native_session_id: native_id.clone(),
title: session.title.clone(),
custom_scroll_id: None,
metadata,
completeness: SessionCompleteness::Complete,
parent_scroll_id: None,
is_subagent: false,
continuation: None,
agent_id: None,
subagent_type: None,
spawning_tool_use_id: None,
};
match archivist
.register_session(register_req, archive.clone())
.await
{
Ok(resp) => match resp.status {
RegisterStatus::Accepted => (resp.scroll_id, true),
// An aliased registration is counted as skipped.
RegisterStatus::Aliased => {
stats.sessions_skipped += 1;
emit_finished!(SessionOutcome::Skipped);
continue;
}
RegisterStatus::Rejected => {
stats.errors.push(format!(
"Session registration rejected for {native_id}"
));
emit_finished!(SessionOutcome::Failed);
continue;
}
},
Err(e) => {
stats.errors.push(format!(
"Failed to register session {native_id}: {e}"
));
emit_finished!(SessionOutcome::Failed);
continue;
}
}
}
Err(e) => {
stats.errors.push(format!(
"Failed to resolve session {native_id}: {e}"
));
emit_finished!(SessionOutcome::Failed);
continue;
}
};
// --- Step 2: Hoist metadata read for existing sessions ---
// Load metadata once; reused for fingerprint check AND title/model diff.
let existing_meta = if !session_is_new {
match archivist
.get_session_metadata(scroll_id, archive.clone())
.await
{
Ok(m) => Some(m),
Err(e) => {
stats.errors.push(format!(
"Failed to read session metadata for {native_id}: {e}"
));
emit_finished!(SessionOutcome::Failed);
continue;
}
}
} else {
None
};
// --- Step 2b: Retroactive project_id linking for existing sessions ---
// Sessions imported before project detection (or before the project was
// created) have project_path but no project_id. Patch it now if the
// project_map has a match — this runs even for fingerprint-skipped
// sessions so re-import can link them without any source-side changes.
if !session_is_new {
if let Some(ref meta) = existing_meta {
let has_project_path = meta
.metadata
.get("project_path")
.and_then(|v| v.as_str())
.is_some();
// An empty-string project_id is treated the same as a missing one.
let has_project_id = meta
.metadata
.get("project_id")
.and_then(|v| v.as_str())
.filter(|s| !s.is_empty())
.is_some();
if has_project_path && !has_project_id {
// The unwrap cannot fail: `has_project_path` just verified the
// same lookup yields `Some`.
let stored_path = meta
.metadata
.get("project_path")
.and_then(|v| v.as_str())
.unwrap();
if let Some(pid) = project_map.get(stored_path) {
// A failure to resolve the primary backend is ignored here;
// the link is simply not written in that case.
if let Ok(primary) =
archivist.resolve_primary(archive.clone()).await
{
let mut patched = meta.clone();
if let Some(obj) = patched.metadata.as_object_mut() {
obj.insert(
"project_id".to_string(),
serde_json::Value::String(pid.clone()),
);
}
patched.updated_at = resolve_updated_at(session);
match primary.backend.put_session(patched).await {
Ok(_) => {
tracing::info!(
scroll_id = %scroll_id,
project_id = %pid,
"Retroactively linked session to project"
);
session_changed = true;
}
Err(e) => {
tracing::warn!(
scroll_id = %scroll_id,
error = %e,
"Failed to retroactively link session to project"
);
}
}
}
}
}
}
}
// --- Step 3: Fingerprint gate — skip unchanged sessions ---
if !session_is_new && !force_deep_scan {
if let Some(ref meta) = existing_meta {
if let Some(snapshot) = ImportSnapshot::from_metadata(&meta.metadata) {
if snapshot.matches(session) {
// Counted in addition to sessions_updated / sessions_skipped below.
stats.sessions_fingerprint_skipped += 1;
if session_changed {
tracing::debug!(
native_id = %native_id,
"Fingerprint match — skipping message scan (metadata was updated)"
);
stats.sessions_updated += 1;
emit_finished!(SessionOutcome::Updated);
} else {
tracing::debug!(
native_id = %native_id,
"Fingerprint match — skipping unchanged session"
);
stats.sessions_skipped += 1;
emit_finished!(SessionOutcome::Skipped);
}
continue;
}
}
}
}
// --- Step 4: Convert messages (EXPENSIVE — after fingerprint gate) ---
let source_records = match convert_messages(native_id) {
Ok(r) => r,
Err(e) => {
stats.errors.push(format!(
"Failed to convert messages for session {native_id}: {e}"
));
emit_finished!(SessionOutcome::Failed);
continue;
}
};
// Build existing_ids set — empty for brand-new sessions.
let existing_ids: std::collections::HashSet<Uuid> = if session_is_new {
std::collections::HashSet::new()
} else {
match archivist.get_messages(scroll_id, archive.clone()).await {
Ok(msgs) => msgs.into_iter().map(|m| m.message_id).collect(),
Err(e) => {
stats.errors.push(format!(
"Failed to read existing messages for session {native_id}: {e}"
));
emit_finished!(SessionOutcome::Failed);
continue;
}
}
};
// Patch placeholder session field and partition.
let mut new_messages: Vec<MessageRecord> = Vec::new();
let mut already_present_count: usize = 0;
for mut record in source_records {
if record.session == Uuid::nil() {
record.session = scroll_id;
}
if existing_ids.contains(&record.message_id) {
already_present_count += 1;
} else {
new_messages.push(record);
}
}
let new_count = new_messages.len();
if new_count > 0 {
if let Err(e) = archivist
.append_messages(scroll_id, new_messages, archive.clone())
.await
{
stats.errors.push(format!(
"Failed to append messages for session {native_id}: {e}"
));
emit_finished!(SessionOutcome::Failed);
continue;
}
stats.messages_written += new_count;
messages_written_delta = new_count as u64;
session_changed = true;
}
stats.messages_already_present += already_present_count;
messages_already_present_delta = already_present_count as u64;
// --- Step 5: Metadata diff (reuse hoisted metadata) ---
if !session_is_new {
// INVARIANT: existing_meta is Some when !session_is_new (guarded above).
let current_meta = existing_meta.unwrap();
let new_title = session.title.as_ref();
// A missing source title never overwrites an archived one.
let title_differs = new_title.is_some() && new_title != current_meta.title.as_ref();
let new_model = session
.metadata
.get("model")
.and_then(|v| v.as_str())
.map(String::from);
let current_model = current_meta
.metadata
.get("model")
.and_then(|v| v.as_str())
.map(String::from);
let model_differs = new_model.is_some() && new_model != current_model;
if title_differs || model_differs {
if let Err(e) = archivist
.update_session_metadata(
scroll_id,
if title_differs { new_title.cloned() } else { None },
if model_differs { new_model } else { None },
archive.clone(),
)
.await
{
stats.errors.push(format!(
"Failed to update session metadata for {native_id}: {e}"
));
emit_finished!(SessionOutcome::Failed);
continue;
}
session_changed = true;
}
// NOTE(review): this reads `project_path` from the session's metadata
// JSON, whereas registration above uses `session.project_path` —
// confirm importers populate both consistently.
let new_project_path = session
.metadata
.get("project_path")
.and_then(|v| v.as_str())
.map(String::from);
let current_project_path = current_meta
.metadata
.get("project_path")
.and_then(|v| v.as_str())
.map(String::from);
let project_path_differs =
new_project_path.is_some() && new_project_path != current_project_path;
if project_path_differs {
// project_path lives in the free-form metadata JSON.
// Re-read to pick up any title/model changes applied above.
let mut patched_meta = archivist
.get_session_metadata(scroll_id, archive.clone())
.await
.unwrap_or(current_meta);
if let Some(obj) = patched_meta.metadata.as_object_mut() {
let path_val = new_project_path.clone().unwrap_or_default();
obj.insert(
"project_path".to_string(),
serde_json::Value::String(path_val.clone()),
);
if let Some(pid) = project_map.get(&path_val) {
obj.insert(
"project_id".to_string(),
serde_json::Value::String(pid.clone()),
);
}
}
patched_meta.updated_at = resolve_updated_at(session);
if let Ok(primary) = archivist.resolve_primary(archive.clone()).await {
if let Err(e) = primary.backend.put_session(patched_meta).await {
tracing::warn!(
scroll_id = %scroll_id,
error = %e,
"Failed to update project_path in session metadata"
);
}
}
// NOTE(review): set even when the write above failed or the primary
// could not be resolved, so the session is reported as changed.
session_changed = true;
}
}
// --- Step 6: Write import snapshot after successful import/update ---
{
let snapshot = ImportSnapshot::from_discovered(session);
// Re-read metadata to get the latest state (may have been updated above).
let write_result = async {
let mut meta = archivist
.get_session_metadata(scroll_id, archive.clone())
.await?;
snapshot.write_to_metadata(&mut meta.metadata);
meta.updated_at = resolve_updated_at(session);
let primary = archivist.resolve_primary(archive.clone()).await?;
primary.backend.put_session(meta).await.map_err(|e| {
ArchivistError::InvalidRequest(format!(
"Failed to write import snapshot: {e}"
))
})
}
.await;
// Best-effort: a snapshot failure is logged and does not fail the session.
if let Err(e) = write_result {
tracing::warn!(
scroll_id = %scroll_id,
error = %e,
"Failed to write import snapshot (session still imported)"
);
}
}
// Accounting: exactly one of {imported, updated, skipped} per session.
let outcome = if session_is_new {
stats.sessions_imported += 1;
SessionOutcome::Imported
} else if session_changed {
stats.sessions_updated += 1;
SessionOutcome::Updated
} else {
stats.sessions_skipped += 1;
SessionOutcome::Skipped
};
emit_finished!(outcome);
}
Ok(stats)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A default `ImportStats` has every checked counter at zero and no errors.
    #[test]
    fn test_import_stats_default() {
        let fresh = ImportStats::default();

        assert_eq!(fresh.sessions_discovered, 0);
        assert_eq!(fresh.sessions_imported, 0);
        assert_eq!(fresh.sessions_skipped, 0);
        assert_eq!(fresh.sessions_updated, 0);
        assert_eq!(fresh.messages_written, 0);
        assert_eq!(fresh.messages_already_present, 0);
        assert!(fresh.errors.is_empty());
    }

    /// Imported + skipped + updated + one error adds up to the processed total.
    #[test]
    fn test_import_stats_total_sessions_processed() {
        let tally = ImportStats {
            sessions_imported: 3,
            sessions_skipped: 2,
            sessions_updated: 1,
            errors: vec!["oops".to_string()],
            ..ImportStats::default()
        };

        assert_eq!(tally.total_sessions_processed(), 7);
    }

    /// `has_errors` flips to `true` once an error string is recorded.
    #[test]
    fn test_import_stats_has_errors() {
        let mut tally = ImportStats::default();
        assert!(!tally.has_errors());

        tally.errors.push("something went wrong".to_string());
        assert!(tally.has_errors());
    }
}
#[cfg(test)]
mod idempotency_tests {
use super::*;
use crate::Archivist;
use chrono::Utc;
use uuid::Uuid;
/// Builds an `Archivist` over a fresh JSONL backend in a unique temp
/// directory and returns it together with that directory's path.
async fn mk() -> (Archivist, std::path::PathBuf) {
    let root = std::env::temp_dir().join(format!("import_idem_{}", Uuid::now_v7()));

    // `from_single_backend` is used instead of `new_with_single_archive` so
    // that every test owns a fully self-contained archive; there is no shared
    // `.archives.json` in the parent tempdir for sibling tests to race on.
    let jsonl = crate::backends::JsonlBackend::new(root.clone()).await.unwrap();
    let backend = std::sync::Arc::new(jsonl);
    let archivist = Archivist::from_single_backend("main".into(), backend)
        .await
        .unwrap();

    (archivist, root)
}
/// Connector registration request shared by all runs within a test.
///
/// The `client_native_id` is deliberately fixed: registering again inside the
/// same test (each test has its own temp dir) must alias onto the same
/// `connector_uid`, otherwise every call would create a new connector and
/// the idempotency checks would be meaningless.
fn connector() -> RegisterConnectorRequest {
    let stable_native_id = "fake@local:stable";
    RegisterConnectorRequest {
        r#type: "Fake".into(),
        title: "fake".into(),
        client_native_id: stable_native_id.into(),
        custom_uid: None,
        metadata: serde_json::json!({}),
        fingerprint: None,
    }
}
/// Builds a minimal version-1 `MessageRecord` with the given session, id,
/// role and markdown content; every optional field is left empty and the
/// timestamp is the current UTC time.
fn record(session: Uuid, id: Uuid, role: &str, content: &str) -> MessageRecord {
    let role = role.to_string();
    let content_md = content.to_string();
    MessageRecord {
        version: 1,
        message_id: id,
        session,
        parent_id: None,
        ts: Utc::now(),
        role,
        author: None,
        content_md,
        content_parts: None,
        attachments: Vec::new(),
        metadata: serde_json::json!({}),
    }
}
/// Importing the same three records twice writes them once; the second run
/// reports them as already present and counts the session as skipped.
#[tokio::test]
async fn import_skips_already_present_messages() {
    let (archivist, tmp) = mk().await;
    let id_a = Uuid::now_v7();
    let id_b = Uuid::now_v7();
    let id_c = Uuid::now_v7();

    let session = DiscoveredSession {
        native_session_id: "s1".into(),
        title: Some("t".into()),
        created_at: None,
        updated_at: None,
        message_count: 3,
        metadata: serde_json::json!({}),
        project_path: None,
        file_size: None,
    };
    let discovered = vec![session];

    // First pass: everything is new.
    let first_convert = |_: &str| -> Result<Vec<MessageRecord>> {
        Ok(vec![
            record(Uuid::nil(), id_a, "user", "hi-a"),
            record(Uuid::nil(), id_b, "user", "hi-b"),
            record(Uuid::nil(), id_c, "user", "hi-c"),
        ])
    };
    let first_stats = import_sessions(
        &archivist,
        connector(),
        discovered.clone(),
        first_convert,
        None,
        &ImportProgressSink::noop(),
        true,
        &HashMap::new(),
    )
    .await
    .unwrap();
    assert_eq!(first_stats.sessions_imported, 1);
    assert_eq!(first_stats.messages_written, 3);

    // Second pass: the source yields the very same message ids, so nothing
    // may be written again.
    let second_convert = |_: &str| -> Result<Vec<MessageRecord>> {
        Ok(vec![
            record(Uuid::nil(), id_a, "user", "hi-a"),
            record(Uuid::nil(), id_b, "user", "hi-b"),
            record(Uuid::nil(), id_c, "user", "hi-c"),
        ])
    };
    let second_stats = import_sessions(
        &archivist,
        connector(),
        discovered,
        second_convert,
        None,
        &ImportProgressSink::noop(),
        true,
        &HashMap::new(),
    )
    .await
    .unwrap();
    assert_eq!(second_stats.messages_written, 0);
    assert_eq!(second_stats.messages_already_present, 3);
    assert_eq!(second_stats.sessions_skipped, 1);
    assert_eq!(second_stats.sessions_imported, 0);
    assert_eq!(second_stats.sessions_updated, 0);

    let _ = tokio::fs::remove_dir_all(tmp).await;
}
/// When the source grows from two to four messages, the re-import appends
/// only the two new ones and counts the session as updated.
#[tokio::test]
async fn import_appends_new_messages_only() {
    let (archivist, tmp) = mk().await;
    let id_a = Uuid::now_v7();
    let id_b = Uuid::now_v7();
    let id_c = Uuid::now_v7();
    let id_d = Uuid::now_v7();

    let session = DiscoveredSession {
        native_session_id: "s1".into(),
        title: Some("t".into()),
        created_at: None,
        updated_at: None,
        message_count: 2,
        metadata: serde_json::json!({}),
        project_path: None,
        file_size: None,
    };
    let discovered = vec![session];

    // Initial import with the first two messages.
    let initial_convert = |_: &str| -> Result<Vec<MessageRecord>> {
        Ok(vec![
            record(Uuid::nil(), id_a, "user", "hi-a"),
            record(Uuid::nil(), id_b, "user", "hi-b"),
        ])
    };
    let _ = import_sessions(
        &archivist,
        connector(),
        discovered.clone(),
        initial_convert,
        None,
        &ImportProgressSink::noop(),
        true,
        &HashMap::new(),
    )
    .await
    .unwrap();

    // Re-import after the source has grown to four messages.
    let grown_convert = |_: &str| -> Result<Vec<MessageRecord>> {
        Ok(vec![
            record(Uuid::nil(), id_a, "user", "hi-a"),
            record(Uuid::nil(), id_b, "user", "hi-b"),
            record(Uuid::nil(), id_c, "user", "hi-c"),
            record(Uuid::nil(), id_d, "user", "hi-d"),
        ])
    };
    let stats = import_sessions(
        &archivist,
        connector(),
        discovered,
        grown_convert,
        None,
        &ImportProgressSink::noop(),
        true,
        &HashMap::new(),
    )
    .await
    .unwrap();
    assert_eq!(stats.messages_written, 2);
    assert_eq!(stats.messages_already_present, 2);
    assert_eq!(stats.sessions_updated, 1);
    assert_eq!(stats.sessions_skipped, 0);
    assert_eq!(stats.sessions_imported, 0);

    let _ = tokio::fs::remove_dir_all(tmp).await;
}
/// Re-importing unchanged messages under a new title writes no messages,
/// counts the session as updated, and the new title shows up in the listing.
#[tokio::test]
async fn import_updates_metadata_only() {
    let (archivist, tmp) = mk().await;
    let message_id = Uuid::now_v7();
    // Same single-message session; only the title varies between runs.
    let session_titled = |title: &'static str| {
        vec![DiscoveredSession {
            native_session_id: "s1".into(),
            title: Some(title.into()),
            created_at: None,
            updated_at: None,
            message_count: 1,
            metadata: serde_json::json!({}),
            project_path: None,
            file_size: None,
        }]
    };

    let first_convert = |_: &str| -> Result<Vec<MessageRecord>> {
        Ok(vec![record(Uuid::nil(), message_id, "user", "hi")])
    };
    let _ = import_sessions(
        &archivist,
        connector(),
        session_titled("old title"),
        first_convert,
        None,
        &ImportProgressSink::noop(),
        true,
        &HashMap::new(),
    )
    .await
    .unwrap();

    // Re-import with same messages but new title.
    let second_convert = |_: &str| -> Result<Vec<MessageRecord>> {
        Ok(vec![record(Uuid::nil(), message_id, "user", "hi")])
    };
    let stats = import_sessions(
        &archivist,
        connector(),
        session_titled("new title"),
        second_convert,
        None,
        &ImportProgressSink::noop(),
        true,
        &HashMap::new(),
    )
    .await
    .unwrap();
    assert_eq!(stats.messages_written, 0);
    assert_eq!(stats.sessions_updated, 1);
    assert_eq!(stats.sessions_skipped, 0);

    // Verify title landed on disk.
    let query = crate::types::SessionListQuery::default().with_limit(50);
    let page = archivist.list_sessions_paged(query).await.unwrap();
    assert!(page
        .items
        .iter()
        .any(|meta| meta.title.as_deref() == Some("new title")));

    let _ = tokio::fs::remove_dir_all(tmp).await;
}
/// Re-importing with identical messages and identical metadata is a skip:
/// nothing is written and the session is not counted as updated.
#[tokio::test]
async fn import_handles_metadata_unchanged() {
    let (archivist, tmp) = mk().await;
    let message_id = Uuid::now_v7();
    let sessions = vec![DiscoveredSession {
        native_session_id: "s1".into(),
        title: Some("t".into()),
        created_at: None,
        updated_at: None,
        message_count: 1,
        metadata: serde_json::json!({"model": "claude"}),
        project_path: None,
        file_size: None,
    }];

    let first_convert = |_: &str| -> Result<Vec<MessageRecord>> {
        Ok(vec![record(Uuid::nil(), message_id, "user", "hi")])
    };
    let _ = import_sessions(
        &archivist,
        connector(),
        sessions.clone(),
        first_convert,
        None,
        &ImportProgressSink::noop(),
        true,
        &HashMap::new(),
    )
    .await
    .unwrap();

    let second_convert = |_: &str| -> Result<Vec<MessageRecord>> {
        Ok(vec![record(Uuid::nil(), message_id, "user", "hi")])
    };
    let stats = import_sessions(
        &archivist,
        connector(),
        sessions,
        second_convert,
        None,
        &ImportProgressSink::noop(),
        true,
        &HashMap::new(),
    )
    .await
    .unwrap();
    assert_eq!(stats.sessions_skipped, 1);
    assert_eq!(stats.sessions_updated, 0);
    assert_eq!(stats.messages_written, 0);

    let _ = tokio::fs::remove_dir_all(tmp).await;
}
}
@@ -1,117 +0,0 @@
//! ImportProgressSink: bounded mpsc channel with best-effort progress delivery.
//! Non-terminal events are sent with `try_send`; when the channel is full the
//! new event is dropped, so routine progress never blocks the import.
//! Terminal events (ImportDone / ImportFailed) are never dropped: they are sent
//! with an awaiting `send`, which waits until the consumer frees a slot.
use serde::{Deserialize, Serialize};
use tokio::sync::mpsc;
use super::ImportDiscovery;
use super::ImportStats;
const DEFAULT_CAPACITY: usize = 64;
/// Progress notification accepted by `ImportProgressSink::send`.
///
/// Serialized with an internal `kind` tag whose value is the snake_case
/// variant name. `ImportDone` and `ImportFailed` are the terminal events
/// (see `is_terminal`); every other variant is non-terminal.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case", tag = "kind")]
pub enum ImportProgressEvent {
DiscoveryStarted { source: String },
DiscoveryProgress { scanned: usize, estimated_total: Option<usize> },
DiscoveryDone { discovered: ImportDiscovery },
SessionStarted { native_id: String, index: usize, total: usize },
SessionFinished { native_id: String, outcome: SessionOutcome, stats_delta: StatsDelta },
// Terminal: carries the final statistics.
ImportDone { stats: ImportStats },
// Terminal: carries the failure rendered as text.
ImportFailed { error: String },
}
impl ImportProgressEvent {
pub fn is_terminal(&self) -> bool {
matches!(self, ImportProgressEvent::ImportDone { .. } | ImportProgressEvent::ImportFailed { .. })
}
}
/// Per-session result carried by `ImportProgressEvent::SessionFinished`.
/// The unit variants serialize as snake_case strings.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum SessionOutcome { Imported, Skipped, Updated, Failed }
/// Message counters carried by `ImportProgressEvent::SessionFinished`.
/// `Default` yields zero for both counters.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct StatsDelta {
// NOTE(review): presumably messages newly written for this session — confirm against the emitter.
pub messages_written: u64,
// NOTE(review): presumably messages found already archived for this session — confirm against the emitter.
pub messages_already_present: u64,
}
/// Handle through which an import reports `ImportProgressEvent`s.
/// Built either by `channel()` (events go to a bounded channel) or by
/// `noop()` (events are discarded).
pub struct ImportProgressSink {
inner: SinkInner,
}
/// Backing implementation of `ImportProgressSink`.
enum SinkInner {
// Events are forwarded through this sender to the paired receiver.
Live { tx: mpsc::Sender<ImportProgressEvent> },
// Events are discarded.
Noop,
}
impl ImportProgressSink {
    /// Creates a live sink together with the receiver that observes its
    /// events. The channel is bounded to `DEFAULT_CAPACITY` pending events.
    pub fn channel() -> (Self, mpsc::Receiver<ImportProgressEvent>) {
        let (tx, rx) = mpsc::channel(DEFAULT_CAPACITY);
        (Self { inner: SinkInner::Live { tx } }, rx)
    }

    /// Creates a sink that discards every event.
    pub fn noop() -> Self {
        Self { inner: SinkInner::Noop }
    }

    /// Reports `evt` to the consumer, if there is one.
    ///
    /// * Terminal events (`ImportDone` / `ImportFailed`) are sent with an
    ///   awaiting `send`: when the channel is full this waits for the
    ///   consumer to free a slot, so the event is not dropped while the
    ///   receiver is alive.
    /// * Non-terminal events are sent with `try_send`: when the channel is
    ///   full the new event is dropped and the caller is not blocked.
    ///
    /// This method never fails; a closed channel is only logged.
    pub async fn send(&self, evt: ImportProgressEvent) {
        let tx = match &self.inner {
            SinkInner::Noop => return,
            SinkInner::Live { tx } => tx,
        };

        if evt.is_terminal() {
            // `send` only errors when the receiver has been dropped; log it
            // like the non-terminal path instead of discarding it silently.
            if tx.send(evt).await.is_err() {
                tracing::warn!("import progress: consumer gone, terminal event not delivered");
            }
            return;
        }

        // Best-effort: drop non-terminal events when the channel is full.
        match tx.try_send(evt) {
            Ok(()) => {}
            Err(mpsc::error::TrySendError::Full(_)) => {
                tracing::debug!("import progress: dropped non-terminal event (queue full)");
            }
            Err(mpsc::error::TrySendError::Closed(_)) => {
                tracing::warn!("import progress: consumer gone");
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A terminal event sent after 1000 non-terminal ones still reaches a
    /// consumer that starts draining late.
    #[tokio::test]
    async fn terminal_events_always_delivered() {
        let (sink, mut rx) = ImportProgressSink::channel();

        // Fill the channel with non-terminal events (mostly drop).
        for index in 0..1000 {
            let started = ImportProgressEvent::SessionStarted {
                native_id: format!("s{index}"),
                index,
                total: 1000,
            };
            sink.send(started).await;
        }

        // Consumer drains in background and reports whether it saw ImportDone.
        let consumer = tokio::spawn(async move {
            while let Some(event) = rx.recv().await {
                if matches!(event, ImportProgressEvent::ImportDone { .. }) {
                    return true;
                }
            }
            false
        });

        let done = ImportProgressEvent::ImportDone { stats: ImportStats::default() };
        sink.send(done).await;

        let deadline = std::time::Duration::from_secs(2);
        let saw_done = tokio::time::timeout(deadline, consumer)
            .await
            .unwrap()
            .unwrap();
        assert!(saw_done);
    }

    /// Sending through a no-op sink completes even for a terminal event.
    #[tokio::test]
    async fn noop_sink_never_fails() {
        let sink = ImportProgressSink::noop();
        let done = ImportProgressEvent::ImportDone { stats: ImportStats::default() };
        sink.send(done).await;
    }
}
@@ -1,93 +0,0 @@
//! Dynamic registry of Importer implementations. Populated at boot.
use std::collections::HashMap;
use std::sync::Arc;
use super::trait_def::{Importer, ImporterInfo};
/// Lookup table of importers, keyed by each importer's `source_name()`.
pub struct ImporterRegistry {
importers: HashMap<&'static str, Arc<dyn Importer>>,
}
impl ImporterRegistry {
    /// Creates an empty registry.
    pub fn new() -> Self {
        Self {
            importers: HashMap::new(),
        }
    }

    /// Populate with all built-in importers. Feature flags select which ship.
    pub fn with_defaults() -> Self {
        let mut r = Self::new();
        #[cfg(feature = "importer-claude")]
        r.register(Arc::new(super::sources::claude::ClaudeImporter));
        #[cfg(feature = "importer-chatgpt")]
        r.register(Arc::new(super::sources::chatgpt::ChatGptImporter));
        #[cfg(feature = "importer-codex")]
        r.register(Arc::new(super::sources::codex::CodexImporter));
        r
    }

    /// Registers `importer` under its `source_name()`, replacing any importer
    /// previously registered under the same name.
    pub fn register(&mut self, importer: Arc<dyn Importer>) {
        self.importers.insert(importer.source_name(), importer);
    }

    /// Returns a shared handle to the importer registered as `name`, if any.
    pub fn get(&self, name: &str) -> Option<Arc<dyn Importer>> {
        self.importers.get(name).cloned()
    }

    /// Describes every registered importer, ordered by `source_name`.
    ///
    /// The backing `HashMap` iterates in arbitrary order, so the result is
    /// sorted to give callers a stable listing from one call to the next.
    pub fn list(&self) -> Vec<ImporterInfo> {
        let mut infos: Vec<ImporterInfo> = self
            .importers
            .values()
            .map(|i| ImporterInfo {
                source_name: i.source_name().to_string(),
                display_name: pretty_name(i.source_name()),
                config_shape: i.config_shape(),
            })
            .collect();
        infos.sort_unstable_by(|a, b| a.source_name.cmp(&b.source_name));
        infos
    }
}
/// Maps a source identifier to its human-readable label; identifiers without
/// a dedicated label are returned unchanged.
fn pretty_name(source: &str) -> String {
    let label = match source {
        "claude" => "Claude Code",
        "chatgpt" => "ChatGPT (OpenAI)",
        "codex" => "OpenAI Codex",
        unknown => unknown,
    };
    label.to_owned()
}
impl Default for ImporterRegistry {
fn default() -> Self {
Self::with_defaults()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The Claude importer is registered exactly when the `importer-claude`
    /// feature is enabled.
    #[test]
    fn defaults_include_claude_when_feature_enabled() {
        let registry = ImporterRegistry::with_defaults();
        let infos = registry.list();
        #[cfg(feature = "importer-claude")]
        {
            assert!(infos.iter().any(|info| info.source_name == "claude"));
            assert!(registry.get("claude").is_some());
        }
        #[cfg(not(feature = "importer-claude"))]
        {
            let _ = infos;
            assert!(registry.get("claude").is_none());
        }
    }

    /// Known sources get their display label; unknown ones pass through.
    #[test]
    fn pretty_name_known_sources() {
        let expectations = [
            ("claude", "Claude Code"),
            ("chatgpt", "ChatGPT (OpenAI)"),
            ("codex", "OpenAI Codex"),
            ("custom", "custom"),
        ];
        for (source, expected) in expectations.iter() {
            assert_eq!(pretty_name(source), *expected);
        }
    }
}
@@ -1,361 +0,0 @@
//! ChatGPT importer: takes a path to a conversations.json file.
use std::path::PathBuf;
use async_trait::async_trait;
use chrono::Utc;
use uuid::Uuid;
use dirigent_chatgpt::{ContentPart, ParsedConversation, ParsedMessage};
use super::super::progress::ImportProgressSink;
use super::super::trait_def::{
ConfigField, ConfigFieldKind, ImportConfig, ImportConfigShape, ImportError, ImportTarget,
Importer,
};
use super::super::{
import_sessions, DiscoveredSession, ImportDiscovery, ImportProject, ImportStats,
};
use crate::coordinator::Archivist;
use crate::error::{ArchivistError, Result};
use crate::types::{MessageRecord, RegisterConnectorRequest};
/// Connector type string used for imported ChatGPT sessions.
pub const CHATGPT_CONNECTOR_TYPE: &str = "ChatGPT";
/// Fingerprint prefix for locally-imported ChatGPT exports.
pub const CHATGPT_FINGERPRINT_PREFIX: &str = "import/local:chatgpt";
/// Namespace UUID for deterministic UUIDv5 derivations on ChatGPT message ids
/// that are not already valid UUIDs.
const CHATGPT_MESSAGE_NS: Uuid = Uuid::from_u128(0x4e58_a7cb_bf1c_4de2_b7c9_8c31_11b3_1112);
/// Stateless `Importer` for ChatGPT exports; registered under the source
/// name `"chatgpt"` and configured with the path to a `conversations.json`.
pub struct ChatGptImporter;
#[async_trait]
impl Importer for ChatGptImporter {
    /// Identifier under which this importer is registered.
    fn source_name(&self) -> &'static str {
        "chatgpt"
    }

    /// Declares the single required `path` parameter (a `.json` file) and an
    /// example configuration.
    fn config_shape(&self) -> ImportConfigShape {
        let path_field = ConfigField {
            key: "path".into(),
            label: "conversations.json path".into(),
            kind: ConfigFieldKind::File {
                extension: Some("json".into()),
            },
            required: true,
            help: Some("Unzipped OpenAI data export \u{2192} conversations.json".into()),
        };

        let mut example_params = std::collections::BTreeMap::new();
        example_params.insert(
            "path".into(),
            serde_json::json!("~/Downloads/chatgpt-export/conversations.json"),
        );

        ImportConfigShape {
            fields: vec![path_field],
            example: ImportConfig {
                source: "chatgpt".into(),
                params: example_params,
            },
        }
    }

    /// Parses the export at `path` and reports how many conversations and
    /// messages it contains. Parse failures map to `ImportError::Discovery`.
    async fn discover(
        &self,
        cfg: &ImportConfig,
    ) -> std::result::Result<ImportDiscovery, ImportError> {
        let export_path = require_path(cfg)?;
        let conversations = dirigent_chatgpt::parse_export(&export_path)
            .map_err(|e| ImportError::Discovery(e.to_string()))?;

        let total_sessions = conversations.len();
        let mut total_estimated_messages: usize = 0;
        for conversation in &conversations {
            total_estimated_messages += conversation.messages.len();
        }

        // ChatGPT exports don't carry per-project information, so everything
        // goes into one synthetic project named after the file.
        let project_name = match export_path.file_name().and_then(|name| name.to_str()) {
            Some(name) => name.to_string(),
            None => "ChatGPT export".to_string(),
        };
        let project = ImportProject {
            name: project_name,
            session_count: total_sessions,
        };

        Ok(ImportDiscovery {
            source_name: "ChatGPT".to_string(),
            source_path: export_path.display().to_string(),
            projects: vec![project],
            total_sessions,
            total_estimated_messages,
        })
    }

    /// Parses the export at `path` and hands every conversation to
    /// `import_sessions`. Parse failures map to `ImportError::Parser`,
    /// orchestrator failures to `ImportError::Archivist`.
    async fn import(
        &self,
        cfg: &ImportConfig,
        archivist: &Archivist,
        target: ImportTarget,
        progress: ImportProgressSink,
    ) -> std::result::Result<ImportStats, ImportError> {
        let export_path = require_path(cfg)?;
        let conversations = dirigent_chatgpt::parse_export(&export_path)
            .map_err(|e| ImportError::Parser(e.to_string()))?;

        // One discovered session per parsed conversation.
        let discovered: Vec<DiscoveredSession> = conversations
            .iter()
            .map(|c| DiscoveredSession {
                native_session_id: c.id.clone(),
                title: c.title.clone(),
                created_at: c.created_at,
                updated_at: c.updated_at,
                message_count: c.messages.len(),
                metadata: serde_json::json!({
                    "source": "chatgpt",
                    "conversation_id": c.id,
                    "parser_metadata": c.metadata.clone(),
                }),
                project_path: None,
                file_size: None,
            })
            .collect();

        // Index the parsed conversations by native id so `convert` can look
        // them up directly.
        let mut by_native_id: std::collections::HashMap<String, ParsedConversation> =
            std::collections::HashMap::with_capacity(conversations.len());
        for conversation in conversations {
            by_native_id.insert(conversation.id.clone(), conversation);
        }

        // The fingerprint is derived from the canonical path (falling back to
        // the path as given when canonicalization fails).
        let canonical_path = match export_path.canonicalize() {
            Ok(resolved) => resolved,
            Err(_) => export_path.clone(),
        };
        let fingerprint = format!("{}:{}", CHATGPT_FINGERPRINT_PREFIX, canonical_path.display());
        let connector_req = RegisterConnectorRequest {
            r#type: CHATGPT_CONNECTOR_TYPE.to_string(),
            title: format!("ChatGPT ({})", canonical_path.display()),
            client_native_id: fingerprint.clone(),
            custom_uid: None,
            metadata: serde_json::json!({}),
            fingerprint: Some(fingerprint),
        };

        let convert = |native_id: &str| -> Result<Vec<MessageRecord>> {
            match by_native_id.get(native_id) {
                Some(conversation) => Ok(convert_conversation_to_records(conversation)),
                None => Err(ArchivistError::InvalidRequest(format!(
                    "Parsed conversation not found for native_id: {}",
                    native_id
                ))),
            }
        };

        let outcome = import_sessions(
            archivist,
            connector_req,
            discovered,
            convert,
            target.archive,
            &progress,
            false,
            &target.project_map,
        )
        .await;
        outcome.map_err(|e| ImportError::Archivist(e.to_string()))
    }
}
// ---------------------------------------------------------------------------
// Conversion helpers
// ---------------------------------------------------------------------------
/// Reads the `path` parameter from `cfg` as a filesystem path.
///
/// # Errors
/// Returns `ImportError::Config` when `path` is absent or not a JSON string.
fn require_path(cfg: &ImportConfig) -> std::result::Result<PathBuf, ImportError> {
    match cfg.params.get("path").and_then(|value| value.as_str()) {
        Some(raw) => Ok(PathBuf::from(raw)),
        None => Err(ImportError::Config("missing `path`".into())),
    }
}
/// Returns `native_id` parsed as a UUID when it is one; otherwise a UUIDv5
/// of its bytes under [`CHATGPT_MESSAGE_NS`], so the same id always maps to
/// the same UUID.
fn parse_or_derive_uuid(native_id: &str) -> Uuid {
    match Uuid::parse_str(native_id) {
        Ok(parsed) => parsed,
        Err(_) => Uuid::new_v5(&CHATGPT_MESSAGE_NS, native_id.as_bytes()),
    }
}
/// Maps each parsed `ContentPart` onto the matching
/// `dirigent_protocol::MessagePart`, preserving order. A code part without a
/// language gets an empty language string; tool parts carry no call id.
fn parts_to_message_parts(parts: &[ContentPart]) -> Vec<dirigent_protocol::MessagePart> {
    let mut converted = Vec::with_capacity(parts.len());
    for part in parts {
        let mapped = match part {
            ContentPart::Text { text } => dirigent_protocol::MessagePart::Text {
                text: text.clone(),
            },
            ContentPart::Code { language, text } => dirigent_protocol::MessagePart::Code {
                language: language.clone().unwrap_or_default(),
                code: text.clone(),
            },
            ContentPart::Tool { name, input, output } => dirigent_protocol::MessagePart::Tool {
                tool: name.clone(),
                tool_call_id: None,
                input: input.clone(),
                output: output.clone(),
            },
        };
        converted.push(mapped);
    }
    converted
}
/// Renders parsed content parts as one markdown-like string for the
/// `content_md` fallback surface: text verbatim, code as a fenced block,
/// tools as a `[Tool: name]` marker, joined by blank lines.
fn parts_to_markdown(parts: &[ContentPart]) -> String {
    let mut sections: Vec<String> = Vec::with_capacity(parts.len());
    for part in parts {
        let rendered = match part {
            ContentPart::Text { text } => text.clone(),
            ContentPart::Code { language, text } => {
                format!("```{}\n{}\n```", language.as_deref().unwrap_or(""), text)
            }
            ContentPart::Tool { name, .. } => format!("[Tool: {}]", name),
        };
        sections.push(rendered);
    }
    sections.join("\n\n")
}
/// Convert a parsed ChatGPT conversation into a vector of `MessageRecord`s.
///
/// Each message's `session` field is left as `Uuid::nil()`; the generic
/// `import_sessions` orchestrator patches it to the real scroll id.
/// Messages with no archivable content are omitted.
fn convert_conversation_to_records(conv: &ParsedConversation) -> Vec<MessageRecord> {
    conv.messages
        .iter()
        .enumerate()
        .filter_map(|(index, msg)| convert_message_at(index, msg))
        .collect()
}

/// Converts a single parsed message, treating it as the first message of its
/// conversation. Returns `None` when the message has nothing to archive.
// Only the unit tests call this directly; production code goes through
// `convert_conversation_to_records`, which supplies the real message index.
#[cfg_attr(not(test), allow(dead_code))]
fn convert_parsed_message(msg: &ParsedMessage) -> Option<MessageRecord> {
    convert_message_at(0, msg)
}

/// Converts the message at position `index` of its conversation.
///
/// The message id is the native id when present (parsed as a UUID, or hashed
/// to a UUIDv5). When the native id is empty the id is derived from role,
/// content, and the source timestamp; if the source has no timestamp, the
/// message's position stands in for it. The derivation never uses the
/// import-time clock, so re-importing the same export yields the same ids.
fn convert_message_at(index: usize, msg: &ParsedMessage) -> Option<MessageRecord> {
    // Skip messages with entirely empty text payloads (nothing to archive).
    let content_md = parts_to_markdown(&msg.content);
    if content_md.trim().is_empty() && msg.content.iter().all(is_part_empty) {
        return None;
    }

    let parts = parts_to_message_parts(&msg.content);
    let content_parts = serde_json::to_value(&parts).ok();

    let message_id = if msg.id.is_empty() {
        // With a source timestamp the key is unchanged from earlier versions,
        // so ids derived from timestamped messages stay the same.
        let discriminator = match msg.ts {
            Some(source_ts) => source_ts.to_rfc3339(),
            None => format!("#{}", index),
        };
        let key = format!("{}:{}:{}", msg.role, discriminator, content_md);
        Uuid::new_v5(&CHATGPT_MESSAGE_NS, key.as_bytes())
    } else {
        parse_or_derive_uuid(&msg.id)
    };

    // The record still needs a timestamp; fall back to import time for
    // display only — it no longer feeds the id.
    let ts = msg.ts.unwrap_or_else(Utc::now);

    Some(MessageRecord {
        version: 1,
        message_id,
        session: Uuid::nil(),
        parent_id: None,
        ts,
        role: msg.role.clone(),
        author: None,
        content_md,
        content_parts,
        attachments: Vec::new(),
        metadata: msg.metadata.clone(),
    })
}
/// Reports whether a part has nothing to archive: text and code parts are
/// empty when their text is blank; tool parts are never empty.
fn is_part_empty(p: &ContentPart) -> bool {
    match p {
        ContentPart::Text { text } | ContentPart::Code { text, .. } => text.trim().is_empty(),
        ContentPart::Tool { .. } => false,
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed UUID string is used as-is.
    #[test]
    fn parse_or_derive_uuid_parses_real_uuid() {
        let real = "12345678-1234-5678-1234-567812345678";
        assert_eq!(parse_or_derive_uuid(real).to_string(), real);
    }

    /// Non-UUID ids hash deterministically, and distinct ids hash apart.
    #[test]
    fn parse_or_derive_uuid_falls_back_to_v5() {
        let first = parse_or_derive_uuid("not-a-uuid");
        let again = parse_or_derive_uuid("not-a-uuid");
        assert_eq!(first, again, "deterministic UUIDv5 derivation");
        let other = parse_or_derive_uuid("different");
        assert_ne!(first, other);
    }

    /// Text, code and tool parts each map onto the same-named protocol part.
    #[test]
    fn parts_to_message_parts_covers_all_variants() {
        let text = ContentPart::Text { text: "hi".into() };
        let code = ContentPart::Code {
            language: Some("rust".into()),
            text: "fn main() {}".into(),
        };
        let tool = ContentPart::Tool {
            name: "browser".into(),
            input: serde_json::json!({"url": "https://example.com"}),
            output: Some(serde_json::json!({"status": 200})),
        };

        let converted = parts_to_message_parts(&[text, code, tool]);
        assert_eq!(converted.len(), 3);
        assert!(matches!(&converted[0], dirigent_protocol::MessagePart::Text { .. }));
        assert!(matches!(&converted[1], dirigent_protocol::MessagePart::Code { .. }));
        assert!(matches!(&converted[2], dirigent_protocol::MessagePart::Tool { .. }));
    }

    /// A message whose only part is blank text produces no record.
    #[test]
    fn empty_parsed_message_is_skipped() {
        let blank = ParsedMessage {
            id: "m1".into(),
            role: "system".into(),
            ts: None,
            content: vec![ContentPart::Text { text: " ".into() }],
            metadata: serde_json::Value::Null,
        };
        assert!(convert_parsed_message(&blank).is_none());
    }

    /// A message with text keeps its role and body; the session is nil.
    #[test]
    fn non_empty_parsed_message_round_trips() {
        let greeting = ParsedMessage {
            id: "m1".into(),
            role: "user".into(),
            ts: None,
            content: vec![ContentPart::Text {
                text: "hello".into(),
            }],
            metadata: serde_json::Value::Null,
        };
        let converted = convert_parsed_message(&greeting).expect("should convert");
        assert_eq!(converted.role, "user");
        assert_eq!(converted.content_md, "hello");
        assert_eq!(converted.session, Uuid::nil());
        assert!(converted.content_parts.is_some());
    }
}
File diff suppressed because it is too large Load Diff
@@ -1,331 +0,0 @@
//! OpenAI Codex CLI importer: takes a path to a directory of JSONL session files.
use std::path::PathBuf;
use async_trait::async_trait;
use chrono::Utc;
use uuid::Uuid;
use dirigent_codex::{ParsedMessage, ParsedSession};
use super::super::progress::ImportProgressSink;
use super::super::trait_def::{
ConfigField, ConfigFieldKind, ImportConfig, ImportConfigShape, ImportError, ImportTarget,
Importer,
};
use super::super::{
import_sessions, DiscoveredSession, ImportDiscovery, ImportProject, ImportStats,
};
use crate::coordinator::Archivist;
use crate::error::{ArchivistError, Result};
use crate::types::{MessageRecord, RegisterConnectorRequest};
/// Connector type string used for imported Codex sessions.
pub const CODEX_CONNECTOR_TYPE: &str = "Codex";
/// Fingerprint prefix for locally-imported Codex sessions.
pub const CODEX_FINGERPRINT_PREFIX: &str = "import/local:codex";
/// Namespace UUID for deterministic UUIDv5 derivations of message ids that
/// Codex does not expose natively.
const CODEX_MESSAGE_NS: Uuid = Uuid::from_u128(0x9e28_b7d4_af9c_4fe2_a8d1_8c41_21b3_2222);
/// Stateless `Importer` for OpenAI Codex CLI sessions; registered under the
/// source name `"codex"` and configured with a directory of JSONL files.
pub struct CodexImporter;
#[async_trait]
impl Importer for CodexImporter {
    /// Identifier under which this importer is registered.
    fn source_name(&self) -> &'static str {
        "codex"
    }

    /// Declares the single required `path` parameter (a directory) and an
    /// example configuration.
    fn config_shape(&self) -> ImportConfigShape {
        let directory_field = ConfigField {
            key: "path".into(),
            label: "Codex sessions directory".into(),
            kind: ConfigFieldKind::Path { directory: true },
            required: true,
            help: Some("Usually ~/.codex/sessions".into()),
        };

        let mut example_params = std::collections::BTreeMap::new();
        example_params.insert("path".into(), serde_json::json!("~/.codex/sessions"));

        ImportConfigShape {
            fields: vec![directory_field],
            example: ImportConfig {
                source: "codex".into(),
                params: example_params,
            },
        }
    }

    /// Lists the session files under `path` and estimates the message total
    /// by parsing each one. Files that fail to parse still count as sessions
    /// but contribute no messages.
    async fn discover(
        &self,
        cfg: &ImportConfig,
    ) -> std::result::Result<ImportDiscovery, ImportError> {
        let sessions_dir = require_path(cfg)?;
        let files = dirigent_codex::discover_sessions(&sessions_dir)
            .map_err(|e| ImportError::Discovery(e.to_string()))?;

        // Best-effort estimate: only files that parse are counted.
        let total_estimated_messages: usize = files
            .iter()
            .filter_map(|file| dirigent_codex::parse_file(file).ok())
            .map(|session| session.messages.len())
            .sum();
        let total_sessions = files.len();

        // Codex sessions live flat in one directory, so they all go into one
        // synthetic project named after that directory.
        let project_name = match sessions_dir.file_name().and_then(|name| name.to_str()) {
            Some(name) => name.to_string(),
            None => "Codex sessions".to_string(),
        };
        let project = ImportProject {
            name: project_name,
            session_count: total_sessions,
        };

        Ok(ImportDiscovery {
            source_name: "Codex".to_string(),
            source_path: sessions_dir.display().to_string(),
            projects: vec![project],
            total_sessions,
            total_estimated_messages,
        })
    }

    /// Parses every session file under `path` and hands the results to
    /// `import_sessions`. Unparseable files are logged and skipped; listing
    /// failures map to `ImportError::Discovery`, orchestrator failures to
    /// `ImportError::Archivist`.
    async fn import(
        &self,
        cfg: &ImportConfig,
        archivist: &Archivist,
        target: ImportTarget,
        progress: ImportProgressSink,
    ) -> std::result::Result<ImportStats, ImportError> {
        let sessions_dir = require_path(cfg)?;
        let files = dirigent_codex::discover_sessions(&sessions_dir)
            .map_err(|e| ImportError::Discovery(e.to_string()))?;

        // Parse everything up front so `convert` only needs a map lookup.
        let mut parsed: Vec<ParsedSession> = Vec::with_capacity(files.len());
        for file in &files {
            let session = match dirigent_codex::parse_file(file) {
                Ok(session) => session,
                Err(e) => {
                    tracing::warn!(
                        path = %file.display(),
                        error = %e,
                        "Skipping unreadable Codex session file"
                    );
                    continue;
                }
            };
            parsed.push(session);
        }

        // One discovered session per successfully parsed file.
        let discovered: Vec<DiscoveredSession> = parsed
            .iter()
            .map(|s| DiscoveredSession {
                native_session_id: s.native_id.clone(),
                title: None,
                created_at: s.created_at,
                updated_at: s.updated_at,
                message_count: s.messages.len(),
                metadata: serde_json::json!({
                    "source": "codex",
                    "source_path": s.source_path.display().to_string(),
                    "native_id": s.native_id,
                }),
                project_path: None,
                file_size: std::fs::metadata(&s.source_path).ok().map(|m| m.len()),
            })
            .collect();

        // Index the parsed sessions by native id for `convert`.
        let mut by_native_id: std::collections::HashMap<String, ParsedSession> =
            std::collections::HashMap::with_capacity(parsed.len());
        for session in parsed {
            by_native_id.insert(session.native_id.clone(), session);
        }

        // The fingerprint is derived from the canonical directory path
        // (falling back to the path as given when canonicalization fails).
        let canonical_path = match sessions_dir.canonicalize() {
            Ok(resolved) => resolved,
            Err(_) => sessions_dir.clone(),
        };
        let fingerprint = format!("{}:{}", CODEX_FINGERPRINT_PREFIX, canonical_path.display());
        let connector_req = RegisterConnectorRequest {
            r#type: CODEX_CONNECTOR_TYPE.to_string(),
            title: format!("Codex ({})", canonical_path.display()),
            client_native_id: fingerprint.clone(),
            custom_uid: None,
            metadata: serde_json::json!({}),
            fingerprint: Some(fingerprint),
        };

        let convert = |native_id: &str| -> Result<Vec<MessageRecord>> {
            match by_native_id.get(native_id) {
                Some(session) => Ok(convert_session_to_records(session)),
                None => Err(ArchivistError::InvalidRequest(format!(
                    "Parsed session not found for native_id: {}",
                    native_id
                ))),
            }
        };

        let outcome = import_sessions(
            archivist,
            connector_req,
            discovered,
            convert,
            target.archive,
            &progress,
            false,
            &target.project_map,
        )
        .await;
        outcome.map_err(|e| ImportError::Archivist(e.to_string()))
    }
}
// ---------------------------------------------------------------------------
// Conversion helpers
// ---------------------------------------------------------------------------
/// Reads the `path` parameter from `cfg` as a filesystem path.
///
/// # Errors
/// Returns `ImportError::Config` when `path` is absent or not a JSON string.
fn require_path(cfg: &ImportConfig) -> std::result::Result<PathBuf, ImportError> {
    let raw = cfg.params.get("path").and_then(|value| value.as_str());
    if let Some(raw) = raw {
        return Ok(PathBuf::from(raw));
    }
    Err(ImportError::Config("missing `path`".into()))
}
/// Turns each [`ParsedMessage`] of `session` into a [`MessageRecord`],
/// passing along its position in the session. Messages that convert to
/// nothing are omitted. `session` stays `Uuid::nil()` on every record for
/// the generic orchestrator to patch.
fn convert_session_to_records(session: &ParsedSession) -> Vec<MessageRecord> {
    let mut records = Vec::new();
    for (position, message) in session.messages.iter().enumerate() {
        if let Some(record) = convert_parsed_message(&session.native_id, position, message) {
            records.push(record);
        }
    }
    records
}
/// Converts one parsed Codex message into a `MessageRecord`.
///
/// Returns `None` when the message content is blank. The record's `session`
/// is left as `Uuid::nil()`.
///
/// The message id is a UUIDv5 of `(native_session_id, index, role, source
/// timestamp)`. Only the timestamp recorded in the source takes part: when
/// the source has none, that component is empty rather than the import-time
/// clock, so re-importing the same session yields the same ids.
fn convert_parsed_message(
    native_session_id: &str,
    index: usize,
    msg: &ParsedMessage,
) -> Option<MessageRecord> {
    // Skip purely empty messages — nothing to archive.
    if msg.content.trim().is_empty() {
        return None;
    }

    // Codex events don't carry per-message UUIDs, so always derive one.
    // Index disambiguates otherwise-identical back-to-back messages. For
    // messages that do have a timestamp the key is unchanged from earlier
    // versions.
    let ts_component = msg.ts.map(|source_ts| source_ts.to_rfc3339()).unwrap_or_default();
    let key = format!(
        "{}:{}:{}:{}",
        native_session_id, index, msg.role, ts_component,
    );
    let message_id = Uuid::new_v5(&CODEX_MESSAGE_NS, key.as_bytes());

    // The record still needs a timestamp; import time is used for display
    // only and does not feed the id.
    let ts = msg.ts.unwrap_or_else(Utc::now);

    let parts = vec![dirigent_protocol::MessagePart::Text {
        text: msg.content.clone(),
    }];
    let content_parts = serde_json::to_value(&parts).ok();

    Some(MessageRecord {
        version: 1,
        message_id,
        session: Uuid::nil(),
        parent_id: None,
        ts,
        role: msg.role.clone(),
        author: None,
        content_md: msg.content.clone(),
        content_parts,
        attachments: Vec::new(),
        metadata: msg.metadata.clone(),
    })
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a message with null metadata and an optional timestamp.
    fn build_message(
        role: &str,
        content: &str,
        ts: Option<chrono::DateTime<Utc>>,
    ) -> ParsedMessage {
        ParsedMessage {
            ts,
            role: role.into(),
            content: content.into(),
            metadata: serde_json::Value::Null,
        }
    }

    /// Message without a source timestamp.
    fn sample_message(role: &str, content: &str) -> ParsedMessage {
        build_message(role, content, None)
    }

    /// Message with a fixed source timestamp.
    fn sample_message_at(role: &str, content: &str, ts: chrono::DateTime<Utc>) -> ParsedMessage {
        build_message(role, content, Some(ts))
    }

    /// Blank content produces no record.
    #[test]
    fn empty_content_is_skipped() {
        let blank = sample_message("user", " ");
        assert!(convert_parsed_message("s", 0, &blank).is_none());
    }

    /// Non-blank content keeps its role and body; the session is nil.
    #[test]
    fn non_empty_message_converts() {
        let greeting = sample_message("user", "hello");
        let converted = convert_parsed_message("s", 0, &greeting).expect("converts");
        assert_eq!(converted.role, "user");
        assert_eq!(converted.content_md, "hello");
        assert_eq!(converted.session, Uuid::nil());
        assert!(converted.content_parts.is_some());
    }

    /// Same (session, index) gives the same id; changing either changes it.
    #[test]
    fn message_id_is_deterministic_per_session_index() {
        // Fix ts so we don't accidentally hash Utc::now() into the id key.
        let fixed_ts = chrono::TimeZone::timestamp_opt(&Utc, 1_735_732_800, 0)
            .single()
            .unwrap();
        let message = sample_message_at("user", "hello", fixed_ts);

        let baseline = convert_parsed_message("session-a", 0, &message).unwrap();
        let repeat = convert_parsed_message("session-a", 0, &message).unwrap();
        assert_eq!(baseline.message_id, repeat.message_id);

        // Different index → different id.
        let next_index = convert_parsed_message("session-a", 1, &message).unwrap();
        assert_ne!(baseline.message_id, next_index.message_id);

        // Different session → different id.
        let other_session = convert_parsed_message("session-b", 0, &message).unwrap();
        assert_ne!(baseline.message_id, other_session.message_id);
    }

    /// A config without `path` is a `Config` error.
    #[test]
    fn require_path_reports_missing_config() {
        let cfg = ImportConfig {
            source: "codex".into(),
            params: Default::default(),
        };
        let err = require_path(&cfg).expect_err("should fail");
        assert!(matches!(err, ImportError::Config(_)));
    }
}
@@ -1,7 +0,0 @@
//! Per-source importer implementations.
pub mod claude;
#[cfg(feature = "importer-chatgpt")]
pub mod chatgpt;
#[cfg(feature = "importer-codex")]
pub mod codex;
@@ -1,113 +0,0 @@
//! Importer trait and config-shape types consumed by the UI (dynamic form
//! rendering) and the CLI (future). Scripts can serialise ImportConfig as JSON.
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use std::collections::{BTreeMap, HashMap};
use thiserror::Error;
use uuid::Uuid;
use crate::coordinator::Archivist;
use super::progress::ImportProgressSink;
/// Contract implemented by every import source. Implementations must be
/// `Send + Sync`.
#[async_trait]
pub trait Importer: Send + Sync {
/// Identifier of the source handled by this importer (for example
/// `"claude"`, as used by this file's round-trip test).
fn source_name(&self) -> &'static str;
/// Describes the configuration fields this importer accepts, together
/// with an example configuration.
fn config_shape(&self) -> ImportConfigShape;
/// Inspects the source described by `cfg` and summarizes what it contains.
/// Takes no `Archivist`.
async fn discover(
&self,
cfg: &ImportConfig,
) -> Result<super::ImportDiscovery, ImportError>;
/// Imports the source described by `cfg` into `archivist` according to
/// `target`, reporting through `progress`, and returns the resulting
/// statistics.
async fn import(
&self,
cfg: &ImportConfig,
archivist: &Archivist,
target: ImportTarget,
progress: ImportProgressSink,
) -> Result<super::ImportStats, ImportError>;
/// Attempt to auto-detect default configuration values.
///
/// Importers that can discover their source location automatically
/// (e.g., Claude Code's `~/.claude` directory) should override this.
/// Returns `None` when auto-detection is not supported or fails.
fn detect_defaults(&self) -> Option<ImportConfig> {
None
}
}
/// Serializable description of one importer: its source identifier, a
/// human-readable name, and its configuration shape.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImporterInfo {
pub source_name: String,
pub display_name: String,
pub config_shape: ImportConfigShape,
}
/// Declares which configuration fields an importer accepts, plus an example
/// `ImportConfig`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImportConfigShape {
pub fields: Vec<ConfigField>,
pub example: ImportConfig,
}
/// One configurable parameter of an importer.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ConfigField {
// NOTE(review): presumably the key under which the value is stored in `ImportConfig::params` — confirm against the importers.
pub key: String,
pub label: String,
pub kind: ConfigFieldKind,
pub required: bool,
// Optional help text for the field.
pub help: Option<String>,
}
/// Kind of value a `ConfigField` expects. Serialized with an internal `type`
/// tag whose value is the snake_case variant name.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "snake_case", tag = "type")]
pub enum ConfigFieldKind {
// NOTE(review): `directory` presumably marks that the path must be a directory — confirm against the form renderer.
Path { directory: bool },
// NOTE(review): `extension` presumably restricts the accepted file extension — confirm against the form renderer.
File { extension: Option<String> },
String,
Bool,
Enum { variants: Vec<String> },
}
/// Importer configuration: the source identifier plus free-form JSON
/// parameters keyed by name. `params` defaults to an empty map when absent
/// from the serialized form.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ImportConfig {
pub source: String,
#[serde(default)]
pub params: BTreeMap<String, serde_json::Value>,
}
/// Destination options for an import, passed by value to `Importer::import`.
/// Every field is optional or defaults to empty.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ImportTarget {
// NOTE(review): presumably the name of the archive to import into, `None` meaning a default — confirm against `import_sessions`.
pub archive: Option<String>,
// NOTE(review): meaning not visible in this file — confirm against consumers.
pub connector_alias: Option<String>,
// NOTE(review): meaning not visible in this file — confirm against consumers.
pub project_id: Option<Uuid>,
/// Maps normalized project_path -> project_id (as string UUID).
/// When a session's project_path is found in this map, the corresponding
/// project_id is injected into the session metadata during import.
#[serde(default)]
pub project_map: HashMap<String, String>,
}
/// Failure modes of discovery and import. The string-carrying variants hold
/// a message that is appended to the variant's prefix when displayed; `Io`
/// converts from `std::io::Error` via `From`.
#[derive(Debug, Error)]
pub enum ImportError {
#[error("source not found: {0}")] SourceNotFound(String),
#[error("config: {0}")] Config(String),
#[error("discovery: {0}")] Discovery(String),
#[error("I/O: {0}")] Io(#[from] std::io::Error),
#[error("archivist: {0}")] Archivist(String),
#[error("parser: {0}")] Parser(String),
#[error("cancelled")] Cancelled,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// An `ImportConfig` survives a JSON serialize/deserialize round trip.
    #[test]
    fn config_round_trips() {
        let original = ImportConfig {
            source: "claude".into(),
            params: BTreeMap::new(),
        };
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: ImportConfig = serde_json::from_str(&encoded).unwrap();
        assert_eq!(decoded.source, "claude");
    }
}
-45
View File
@@ -1,45 +0,0 @@
//! Dirigent Archivist
//!
//! Persistent storage for all agentic interactions in Dirigent.
//!
//! The Archivist provides file-based archival storage using NDJSON, JSON, and TSV
//! formats for durability and human-readability. It implements an archive-first
//! architecture with connector API fallback for session data.
//!
//! # Key Features
//!
//! - File-based storage for easy curation and grep-ability
//! - Content-addressable file storage for attachments
//! - Session lineage tracking (splits, continuations, mutations)
//! - Connector registry with UID coordination
//! - Real-time event streaming for archive updates
//!
//! # Architecture
//!
//! See `docs/building/05_archivist/vision.md` for detailed design.
pub mod accumulator;
pub mod backend;
pub mod backends;
pub mod backfill;
pub mod coordinator;
pub mod error;
pub mod events;
pub mod import;
pub mod registry;
pub mod session;
pub mod storage;
pub mod types;
// Re-export commonly used types
pub use accumulator::{MessageAccumulator, ToolCallData};
pub use backend::{
ArchiveBackend, ArchiveCapability, CapabilitySet, ConnectorRegistryBackend,
DagBackend, HealthStatus, MetaEventsBackend, SearchBackend, SessionMappingBackend,
};
pub use backends::JsonlBackend;
pub use backfill::{backfill_from_sessions, convert_message_to_record, BackfillStats};
pub use coordinator::{ArchiveInfo, ArchiveMetadata, Archivist};
pub use error::{ArchivistError, Result};
pub use events::EventHandler;
pub use types::*;
@@ -1,116 +0,0 @@
//! Positive LRU cache mapping `scroll_id` to the backend that holds the
//! authoritative session metadata, populated on the first successful read.
use std::num::NonZeroUsize;
use lru::LruCache;
use tokio::sync::Mutex;
use uuid::Uuid;
const DEFAULT_CAPACITY: usize = 10_000;
/// LRU-bounded map from a session's `scroll_id` to a backend name.
///
/// The map lives behind a `tokio::sync::Mutex`, so every accessor is `async`
/// and only needs `&self`.
pub struct ReadCache {
    inner: Mutex<LruCache<Uuid, String>>,
}

impl ReadCache {
    /// Creates a cache sized to `DEFAULT_CAPACITY` entries.
    pub fn new() -> Self {
        Self::with_capacity(DEFAULT_CAPACITY)
    }

    /// Creates a cache holding at most `capacity` entries.
    ///
    /// A `capacity` of zero is rounded up to one.
    pub fn with_capacity(capacity: usize) -> Self {
        // `max(1)` guarantees the value is non-zero, so the unwrap cannot fail.
        let slots = NonZeroUsize::new(capacity.max(1)).unwrap();
        let inner = Mutex::new(LruCache::new(slots));
        Self { inner }
    }

    /// Returns a copy of the backend name cached for `scroll_id`, if any.
    pub async fn get(&self, scroll_id: Uuid) -> Option<String> {
        let mut cache = self.inner.lock().await;
        let hit = cache.get(&scroll_id);
        hit.cloned()
    }

    /// Stores `backend_name` for `scroll_id`, replacing any previous entry.
    pub async fn put(&self, scroll_id: Uuid, backend_name: String) {
        self.inner.lock().await.put(scroll_id, backend_name);
    }

    /// Removes the entry for `scroll_id`, if present.
    pub async fn invalidate(&self, scroll_id: Uuid) {
        self.inner.lock().await.pop(&scroll_id);
    }

    /// Points `scroll_id` at `new_backend`; same effect as [`ReadCache::put`].
    pub async fn rewrite(&self, scroll_id: Uuid, new_backend: String) {
        self.put(scroll_id, new_backend).await;
    }

    /// Drops every cached entry.
    pub async fn clear(&self) {
        self.inner.lock().await.clear();
    }

    /// Returns the number of cached entries.
    pub async fn len(&self) -> usize {
        self.inner.lock().await.len()
    }
}

impl Default for ReadCache {
    /// Same as [`ReadCache::new`].
    fn default() -> Self {
        ReadCache::new()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a deterministic UUID whose sixteen bytes all equal `b`.
    fn id(b: u8) -> Uuid {
        Uuid::from_bytes([b; 16])
    }

    /// A stored entry is returned; an unknown id is a miss.
    #[tokio::test]
    async fn put_then_get() {
        let cache = ReadCache::new();
        cache.put(id(1), String::from("main")).await;

        let hit = cache.get(id(1)).await;
        assert_eq!(hit.as_deref(), Some("main"));

        let miss = cache.get(id(2)).await;
        assert!(miss.is_none());
    }

    /// `invalidate` removes a previously stored entry.
    #[tokio::test]
    async fn invalidate_removes_entry() {
        let cache = ReadCache::new();
        cache.put(id(1), String::from("main")).await;
        cache.invalidate(id(1)).await;

        let lookup = cache.get(id(1)).await;
        assert!(lookup.is_none());
    }

    /// `rewrite` replaces the backend name stored for an id.
    #[tokio::test]
    async fn rewrite_changes_backend() {
        let cache = ReadCache::new();
        cache.put(id(1), String::from("a")).await;
        cache.rewrite(id(1), String::from("b")).await;

        let lookup = cache.get(id(1)).await;
        assert_eq!(lookup.as_deref(), Some("b"));
    }

    /// With capacity two, inserting a third entry evicts the first one.
    #[tokio::test]
    async fn lru_evicts_oldest() {
        let cache = ReadCache::with_capacity(2);
        cache.put(id(1), String::from("a")).await;
        cache.put(id(2), String::from("b")).await;
        // Third insert pushes out id(1).
        cache.put(id(3), String::from("c")).await;

        let first = cache.get(id(1)).await;
        assert!(first.is_none());
        let second = cache.get(id(2)).await;
        assert_eq!(second.as_deref(), Some("b"));
        let third = cache.get(id(3)).await;
        assert_eq!(third.as_deref(), Some("c"));
    }

    /// `clear` leaves the cache empty.
    #[tokio::test]
    async fn clear_empties() {
        let cache = ReadCache::new();
        cache.put(id(1), String::from("a")).await;
        cache.put(id(2), String::from("b")).await;
        cache.clear().await;

        let remaining = cache.len().await;
        assert_eq!(remaining, 0);
    }
}
@@ -1,253 +0,0 @@
//! Declarative `[[archives]]` config block parsed from `dirigent.toml`.
//!
//! The TOML schema is documented in `docs/plans/2026-04-19-archivist-phase3-design.md`.
use serde::{Deserialize, Serialize};
use super::filter::ArchiveFilter;
use super::registration::{FailureMode, OverflowPolicy};
/// Top-level container for the `[[archives]]` array of tables.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ArchivesConfig {
    /// Declared archives, read from the `archives` key; empty when the key
    /// is absent.
    #[serde(rename = "archives", default)]
    pub entries: Vec<ArchiveConfig>,
}
/// One `[[archives]]` entry.
///
/// Only `name` and `type` are mandatory; every other key has a default.
#[derive(Debug, Clone, Deserialize, Serialize)]
pub struct ArchiveConfig {
    /// Archive name; `ArchivesConfig::validate` rejects duplicates.
    pub name: String,

    /// Backend type string, read from the `type` key.
    #[serde(rename = "type")]
    pub type_name: String,

    /// Defaults to `true`.
    #[serde(default = "default_write_active")]
    pub write_active: bool,

    /// Defaults to `FailureMode::default()`.
    #[serde(default)]
    pub failure_mode: FailureMode,

    /// Defaults to `0`.
    #[serde(default)]
    pub read_priority: u32,

    /// Defaults to `true`.
    #[serde(default = "default_enabled")]
    pub enabled: bool,

    /// Defaults to the inline write policy.
    #[serde(default)]
    pub write_policy: WritePolicyConfig,

    /// Include/exclude filter for this archive, applied during non-primary
    /// write fanout. A missing or empty (`{}`) table means unrestricted.
    #[serde(default)]
    pub filter: ArchiveFilter,

    /// Backend-specific parameters; defaults to an empty TOML table.
    #[serde(default = "default_params")]
    pub params: toml::Value,
}
/// Serde default for `ArchiveConfig::params`: an empty TOML table.
fn default_params() -> toml::Value {
    let empty = toml::value::Table::new();
    toml::Value::Table(empty)
}
/// Serde default for `ArchiveConfig::write_active`: archives are
/// write-active unless configured otherwise.
fn default_write_active() -> bool {
    true
}
/// Serde default for `ArchiveConfig::enabled`: archives are enabled unless
/// configured otherwise.
fn default_enabled() -> bool {
    true
}
/// Write policy as written in the config file.
///
/// Untagged: serde tries the bare tag form first, then the detailed table
/// form.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum WritePolicyConfig {
    /// Bare form, e.g. a plain `inline` string.
    Tag(WritePolicyTag),
    /// Table form carrying a `type` key plus per-policy settings.
    Detailed(WritePolicyDetailed),
}
/// Bare write-policy names (serialized in `snake_case`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum WritePolicyTag {
    /// Maps to the runtime inline policy.
    Inline,
}
/// Table form of the write policy, discriminated by a `snake_case` `type`
/// key.
#[derive(Debug, Clone, Deserialize, Serialize)]
#[serde(rename_all = "snake_case", tag = "type")]
pub enum WritePolicyDetailed {
    /// Maps to the runtime inline policy.
    Inline,
    /// Maps to the runtime queued policy.
    Queued {
        /// Batch window in milliseconds; defaults to 50.
        #[serde(default = "default_batch_window_ms")]
        batch_window_ms: u64,

        /// Queue capacity; defaults to 1024.
        #[serde(default = "default_capacity")]
        capacity: usize,

        /// Overflow policy; defaults to `OverflowPolicy::default()`.
        #[serde(default)]
        overflow: OverflowPolicy,
    },
}
/// Serde default for `Queued::batch_window_ms`, in milliseconds.
fn default_batch_window_ms() -> u64 {
    50
}
/// Serde default for `Queued::capacity`.
fn default_capacity() -> usize {
    1024
}
impl Default for WritePolicyConfig {
    /// The default policy is the bare `inline` tag.
    fn default() -> Self {
        Self::Tag(WritePolicyTag::Inline)
    }
}
impl WritePolicyConfig {
    /// Converts the config-file representation into the runtime
    /// `WritePolicy`.
    ///
    /// Both spellings of "inline" (bare tag and detailed table) collapse to
    /// `WritePolicy::Inline`; the queued settings are carried over unchanged.
    pub fn into_runtime(self) -> super::registration::WritePolicy {
        use super::registration::WritePolicy;

        match self {
            Self::Tag(WritePolicyTag::Inline) | Self::Detailed(WritePolicyDetailed::Inline) => {
                WritePolicy::Inline
            }
            Self::Detailed(WritePolicyDetailed::Queued {
                batch_window_ms,
                capacity,
                overflow,
            }) => WritePolicy::Queued {
                batch_window_ms,
                capacity,
                overflow,
            },
        }
    }
}
use std::collections::BTreeSet;
#[derive(Debug, thiserror::Error, PartialEq)]
pub enum ConfigValidationError {
#[error("duplicate archive name `{0}`")]
DuplicateName(String),
#[error("no `required` write-active backend configured (need at least one)")]
NoPrimary,
}
impl ArchivesConfig {
pub fn validate(&self) -> Result<(), ConfigValidationError> {
let mut seen: BTreeSet<&str> = BTreeSet::new();
for entry in &self.entries {
if !seen.insert(entry.name.as_str()) {
return Err(ConfigValidationError::DuplicateName(entry.name.clone()));
}
}
// Empty config is allowed (ephemeral mode).
if self.entries.is_empty() {
return Ok(());
}
let has_primary = self
.entries
.iter()
.any(|e| e.enabled && e.write_active && e.failure_mode == FailureMode::Required);
if !has_primary {
return Err(ConfigValidationError::NoPrimary);
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use super::*;
/// Parses `toml_src` into an `ArchivesConfig`, panicking on malformed input.
fn parse(toml_src: &str) -> ArchivesConfig {
toml::from_str(toml_src).expect("parse")
}
/// An empty document yields no entries and still validates.
#[test]
fn empty_config_is_ephemeral() {
let cfg: ArchivesConfig = toml::from_str("").unwrap();
assert!(cfg.entries.is_empty());
assert!(cfg.validate().is_ok());
}
/// A single entry with only `name`, `type` and `params` picks up every
/// documented default and validates.
#[test]
fn minimal_single_archive() {
let cfg = parse(
r#"
[[archives]]
name = "main"
type = "jsonl"
[archives.params]
path = "dirigent_archive"
"#,
);
assert_eq!(cfg.entries.len(), 1);
let e = &cfg.entries[0];
assert_eq!(e.name, "main");
assert_eq!(e.type_name, "jsonl");
assert!(e.write_active);
assert_eq!(e.failure_mode, FailureMode::Required);
assert_eq!(e.read_priority, 0);
assert!(e.enabled);
assert!(matches!(e.write_policy, WritePolicyConfig::Tag(WritePolicyTag::Inline)));
cfg.validate().unwrap();
}
/// Two entries with the same name are rejected with `DuplicateName`.
#[test]
fn duplicate_name_rejected() {
let cfg = parse(
r#"
[[archives]]
name = "main"
type = "jsonl"
[archives.params]
path = "a"
[[archives]]
name = "main"
type = "jsonl"
[archives.params]
path = "b"
"#,
);
assert_eq!(
cfg.validate(),
Err(ConfigValidationError::DuplicateName("main".into()))
);
}
/// A config whose only entry is `best_effort` has no primary and is rejected.
#[test]
fn no_primary_rejected() {
let cfg = parse(
r#"
[[archives]]
name = "mirror"
type = "jsonl"
failure_mode = "best_effort"
[archives.params]
path = "a"
"#,
);
assert_eq!(cfg.validate(), Err(ConfigValidationError::NoPrimary));
}
/// The detailed `queued` write policy table parses with all of its fields.
#[test]
fn queued_write_policy_parses() {
let cfg = parse(
r#"
[[archives]]
name = "main"
type = "jsonl"
[archives.params]
path = "a"
[archives.write_policy]
type = "queued"
batch_window_ms = 100
capacity = 4096
overflow = "drop_oldest"
"#,
);
let entry = &cfg.entries[0];
match &entry.write_policy {
WritePolicyConfig::Detailed(WritePolicyDetailed::Queued {
batch_window_ms,
capacity,
overflow,
}) => {
assert_eq!(*batch_window_ms, 100);
assert_eq!(*capacity, 4096);
assert_eq!(*overflow, OverflowPolicy::DropOldest);
}
other => panic!("unexpected write_policy: {:?}", other),
}
}
}
@@ -1,192 +0,0 @@
//! Pluggable backend instantiation: type-string → factory → backend.
use std::collections::HashMap;
use std::sync::Arc;
use async_trait::async_trait;
use crate::backend::ArchiveBackend;
/// Failures that can occur while turning a config entry into a backend.
#[derive(Debug, thiserror::Error)]
pub enum BackendBuildError {
    /// No factory is registered for the requested type string.
    #[error("unknown backend type `{0}`")]
    UnknownType(String),

    /// The backend's `params` could not be interpreted.
    #[error("invalid params for backend `{name}` (type `{type_name}`): {source}")]
    InvalidParams {
        /// Archive name from the config entry.
        name: String,
        /// Backend type string.
        type_name: String,
        /// Underlying cause.
        #[source]
        source: anyhow::Error,
    },

    /// The params were accepted but constructing the backend failed.
    #[error("backend `{name}` (type `{type_name}`) failed to initialise: {source}")]
    BackendInit {
        /// Archive name from the config entry.
        name: String,
        /// Backend type string.
        type_name: String,
        /// Underlying cause.
        #[source]
        source: anyhow::Error,
    },
}
/// Builds backends of one particular type from their TOML parameters.
#[async_trait]
pub trait BackendFactory: Send + Sync {
    /// The type string this factory is registered under.
    fn type_name(&self) -> &'static str;

    /// Builds a backend for the archive called `archive_name` from `params`.
    ///
    /// # Errors
    ///
    /// Returns a [`BackendBuildError`] when the backend cannot be built.
    async fn build(
        &self,
        archive_name: &str,
        params: toml::Value,
    ) -> Result<Arc<dyn ArchiveBackend>, BackendBuildError>;
}
/// Lookup table from backend type string to the factory that builds it.
pub struct BackendRegistry {
    factories: HashMap<&'static str, Arc<dyn BackendFactory>>,
}

impl BackendRegistry {
    /// Creates a registry with no factories.
    pub fn new() -> Self {
        let factories = HashMap::new();
        Self { factories }
    }

    /// Registers `factory` under its own `type_name()`, replacing any
    /// factory previously registered under that name.
    pub fn register(&mut self, factory: Arc<dyn BackendFactory>) {
        let key = factory.type_name();
        self.factories.insert(key, factory);
    }

    /// Returns the factory registered for `type_name`, if any.
    pub fn get(&self, type_name: &str) -> Option<&Arc<dyn BackendFactory>> {
        self.factories.get(type_name)
    }

    /// Builds a backend of type `type_name` for the archive `archive_name`.
    ///
    /// # Errors
    ///
    /// `BackendBuildError::UnknownType` when no factory is registered for
    /// `type_name`; otherwise whatever the factory itself reports.
    pub async fn build(
        &self,
        archive_name: &str,
        type_name: &str,
        params: toml::Value,
    ) -> Result<Arc<dyn ArchiveBackend>, BackendBuildError> {
        match self.get(type_name) {
            Some(factory) => factory.build(archive_name, params).await,
            None => Err(BackendBuildError::UnknownType(type_name.to_owned())),
        }
    }
}

impl Default for BackendRegistry {
    /// Same as [`BackendRegistry::new`]: an empty registry.
    fn default() -> Self {
        BackendRegistry::new()
    }
}
use std::path::PathBuf;
use crate::backends::JsonlBackend;
/// Parameters accepted by the `jsonl` backend factory.
#[derive(Debug, serde::Deserialize)]
struct JsonlParams {
    /// Path handed to `JsonlBackend::new`.
    path: PathBuf,
}
pub struct JsonlFactory;
#[async_trait]
impl BackendFactory for JsonlFactory {
fn type_name(&self) -> &'static str {
"jsonl"
}
async fn build(
&self,
archive_name: &str,
params: toml::Value,
) -> Result<Arc<dyn ArchiveBackend>, BackendBuildError> {
let parsed: JsonlParams =
params
.try_into()
.map_err(|e: toml::de::Error| BackendBuildError::InvalidParams {
name: archive_name.into(),
type_name: "jsonl".into(),
source: anyhow::Error::new(e),
})?;
let backend = JsonlBackend::new(parsed.path).await.map_err(|e| {
BackendBuildError::BackendInit {
name: archive_name.into(),
type_name: "jsonl".into(),
source: anyhow::Error::new(e),
}
})?;
Ok(Arc::new(backend) as Arc<dyn ArchiveBackend>)
}
}
impl BackendRegistry {
/// Convenience: a registry with `jsonl` pre-registered.
pub fn with_jsonl() -> Self {
let mut r = Self::new();
r.register(Arc::new(JsonlFactory));
r
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::backend::mock::MockBackend;

    /// Factory registered as `mock` that ignores its inputs and returns a
    /// fresh `MockBackend`.
    struct MockFactory;

    #[async_trait]
    impl BackendFactory for MockFactory {
        fn type_name(&self) -> &'static str {
            "mock"
        }

        async fn build(
            &self,
            _archive_name: &str,
            _params: toml::Value,
        ) -> Result<Arc<dyn ArchiveBackend>, BackendBuildError> {
            let backend: Arc<dyn ArchiveBackend> = Arc::new(MockBackend::new());
            Ok(backend)
        }
    }

    /// Returns an empty TOML table value.
    fn empty_params() -> toml::Value {
        toml::Value::Table(toml::value::Table::new())
    }

    /// Building an unregistered type reports `UnknownType` with that name.
    #[tokio::test]
    async fn unknown_type_rejected() {
        let registry = BackendRegistry::new();
        let outcome = registry.build("a", "nope", empty_params()).await;
        // The `Ok` payload is not `Debug`, so discard it before unwrapping.
        let err = outcome.map(|_| ()).unwrap_err();
        assert!(matches!(err, BackendBuildError::UnknownType(s) if s == "nope"));
    }

    /// A registered factory is found and produces a backend.
    #[tokio::test]
    async fn registered_factory_builds() {
        let mut registry = BackendRegistry::new();
        registry.register(Arc::new(MockFactory));
        let built = registry
            .build("a", "mock", empty_params())
            .await
            .unwrap();
        let _: &dyn ArchiveBackend = &*built;
    }

    /// The pre-registered `jsonl` factory builds a usable backend inside a
    /// temporary directory.
    #[tokio::test]
    async fn jsonl_factory_builds_under_tempdir() {
        let scratch = tempfile::tempdir().unwrap();
        let registry = BackendRegistry::with_jsonl();

        let path_text = scratch.path().to_string_lossy().into_owned();
        let mut table = toml::value::Table::new();
        table.insert("path".into(), toml::Value::String(path_text));

        let built = registry
            .build("main", "jsonl", toml::Value::Table(table))
            .await
            .unwrap();

        let status = built.health_check().await;
        assert!(matches!(
            status,
            crate::backend::HealthStatus::Healthy | crate::backend::HealthStatus::Degraded { .. }
        ));
    }
}
@@ -1,187 +0,0 @@
//! Per-archive include/exclude filter. Consulted during non-primary write
//! fanout (Task 20). Primary always writes regardless of filter.
use std::collections::HashSet;
use serde::{Deserialize, Serialize};
use uuid::Uuid;
use crate::types::SessionMetadata;
/// Include/exclude rules deciding which sessions are replicated to a
/// non-primary archive.
///
/// The primary write target never consults its filter and always writes.
/// `ArchiveFilter::default()` places no restriction and accepts every
/// session.
#[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)]
pub struct ArchiveFilter {
    /// Allow-list of connector UIDs. `Some(set)` accepts only sessions
    /// whose `connector_uid` is in `set`; `None` accepts any connector
    /// (the remaining rules still apply).
    #[serde(default)]
    pub include_connectors: Option<HashSet<Uuid>>,

    /// Deny-list of connector UIDs; wins over `include_connectors`.
    #[serde(default)]
    pub exclude_connectors: HashSet<Uuid>,

    /// When non-empty, a session needs at least one of these tags.
    #[serde(default)]
    pub include_tags: HashSet<String>,

    /// A session carrying any of these tags is rejected.
    #[serde(default)]
    pub exclude_tags: HashSet<String>,

    /// Set to `false` to reject sessions whose `metadata.hidden == true`.
    /// Defaults to `true`.
    #[serde(default = "default_include_hidden")]
    pub include_hidden: bool,
}
/// Serde default for `ArchiveFilter::include_hidden`: hidden sessions are
/// accepted unless configured otherwise.
fn default_include_hidden() -> bool {
    true
}
impl Default for ArchiveFilter {
    /// The unrestricted filter: no connector or tag rules, hidden sessions
    /// included.
    fn default() -> Self {
        let include_connectors = None;
        let include_hidden = true;
        Self {
            include_connectors,
            exclude_connectors: HashSet::default(),
            include_tags: HashSet::default(),
            exclude_tags: HashSet::default(),
            include_hidden,
        }
    }
}
impl ArchiveFilter {
/// Returns true when this session should be written to the archive.
pub fn allows(&self, session: &SessionMetadata, connector_uid: &Uuid) -> bool {
// Exclude rules win.
if self.exclude_connectors.contains(connector_uid) {
return false;
}
if let Some(inc) = &self.include_connectors {
if !inc.contains(connector_uid) {
return false;
}
}
if session.tags.iter().any(|t| self.exclude_tags.contains(t)) {
return false;
}
if !self.include_tags.is_empty()
&& !session.tags.iter().any(|t| self.include_tags.contains(t))
{
return false;
}
if !self.include_hidden
&& session.metadata.get("hidden") == Some(&serde_json::Value::Bool(true))
{
return false;
}
true
}
/// A filter that allows everything is equivalent to no filter.
pub fn is_unrestricted(&self) -> bool {
self.include_connectors.is_none()
&& self.exclude_connectors.is_empty()
&& self.include_tags.is_empty()
&& self.exclude_tags.is_empty()
&& self.include_hidden
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::types::SessionMetadata;

    /// Builds a stub session with the given tags; `hidden` controls whether
    /// the metadata carries `"hidden": true` or is `null`.
    fn make_session(tags: Vec<String>, hidden: bool) -> SessionMetadata {
        let mut session = SessionMetadata::stub(Uuid::now_v7());
        session.tags = tags;
        session.metadata = match hidden {
            true => serde_json::json!({ "hidden": true }),
            false => serde_json::Value::Null,
        };
        session
    }

    /// The default filter accepts everything and reports itself unrestricted.
    #[test]
    fn default_allows_all() {
        let filter = ArchiveFilter::default();
        let session = make_session(vec![], false);
        let connector = Uuid::new_v4();
        assert!(filter.allows(&session, &connector));
        assert!(filter.is_unrestricted());
    }

    /// An excluded connector is rejected while other connectors still pass.
    #[test]
    fn exclude_connector_rejects() {
        let banned = Uuid::new_v4();
        let mut filter = ArchiveFilter::default();
        filter.exclude_connectors.insert(banned);
        let session = make_session(vec![], false);
        assert!(!filter.allows(&session, &banned));
        assert!(filter.allows(&session, &Uuid::new_v4()));
        assert!(!filter.is_unrestricted());
    }

    /// With an allow-list, only listed connectors pass.
    #[test]
    fn include_connector_only_allows_listed() {
        let listed = Uuid::new_v4();
        let filter = ArchiveFilter {
            include_connectors: Some(HashSet::from([listed])),
            ..ArchiveFilter::default()
        };
        let session = make_session(vec![], false);
        assert!(filter.allows(&session, &listed));
        assert!(!filter.allows(&session, &Uuid::new_v4()));
    }

    /// `include_tags` requires an overlap with the session's tags.
    #[test]
    fn tag_intersection_semantics() {
        let filter = ArchiveFilter {
            include_tags: HashSet::from(["prod".to_string()]),
            ..ArchiveFilter::default()
        };
        let prod_session = make_session(vec!["prod".to_string()], false);
        let dev_session = make_session(vec!["dev".to_string()], false);
        let connector = Uuid::new_v4();
        assert!(filter.allows(&prod_session, &connector));
        assert!(!filter.allows(&dev_session, &connector));
    }

    /// An excluded tag rejects the session even when an included tag matches.
    #[test]
    fn exclude_tag_wins_over_include() {
        let filter = ArchiveFilter {
            include_tags: HashSet::from(["prod".to_string()]),
            exclude_tags: HashSet::from(["sensitive".to_string()]),
            ..ArchiveFilter::default()
        };
        let session = make_session(vec!["prod".to_string(), "sensitive".to_string()], false);
        let connector = Uuid::new_v4();
        assert!(!filter.allows(&session, &connector));
    }

    /// With `include_hidden = false`, hidden sessions are rejected and
    /// visible ones pass.
    #[test]
    fn include_hidden_false_rejects_hidden_sessions() {
        let filter = ArchiveFilter {
            include_hidden: false,
            ..ArchiveFilter::default()
        };
        let hidden_session = make_session(vec![], true);
        let visible_session = make_session(vec![], false);
        let connector = Uuid::new_v4();
        assert!(!filter.allows(&hidden_session, &connector));
        assert!(filter.allows(&visible_session, &connector));
    }

    /// By default hidden sessions are accepted.
    #[test]
    fn default_include_hidden_accepts_hidden_sessions() {
        let filter = ArchiveFilter::default();
        let session = make_session(vec![], true);
        let connector = Uuid::new_v4();
        assert!(filter.allows(&session, &connector));
    }

    /// Round-trips the default filter through JSON.
    #[test]
    fn toml_roundtrip_default() {
        // TOML serialization is covered via ArchiveConfig; a plain JSON
        // round-trip of the struct is enough to catch serde attribute typos.
        let original = ArchiveFilter::default();
        let encoded = serde_json::to_string(&original).unwrap();
        let decoded: ArchiveFilter = serde_json::from_str(&encoded).unwrap();
        assert_eq!(original, decoded);
    }
}
@@ -1,72 +0,0 @@
//! Health drift helpers used by both read and write paths.
//!
//! The drift model: successful writes reset `consecutive_failures` to 0 and
//! promote any non-`Healthy` state back to `Healthy`. Write failures bump the
//! counter and drift
//! to `Degraded { reason }`; after K consecutive failures, the registration
//! drifts to `Unavailable { reason }`, which causes read walks to skip it.
//!
//! Read successes rescue `Degraded` → `Healthy` but don't touch the failure
//! counter (writes are the authoritative health signal). Read failures drift
//! `Healthy` → `Degraded` but never to `Unavailable` by themselves (a truly
//! broken backend will be caught on the next write attempt).
use chrono::Utc;
use crate::backend::HealthStatus;
use crate::registry::ArchiveRegistration;
const FAILURE_THRESHOLD: u32 = 5;
impl crate::coordinator::Archivist {
    /// Records a successful write: zeroes the consecutive-failure counter
    /// and marks the registration `Healthy`, whatever its previous state.
    pub(crate) async fn record_write_success(&self, reg: &ArchiveRegistration) {
        {
            let mut failures = reg.consecutive_failures.write().await;
            *failures = 0;
        }
        let mut health = reg.last_health.write().await;
        if matches!(*health, HealthStatus::Healthy) {
            return;
        }
        *health = HealthStatus::Healthy;
    }

    /// Records a successful read.
    ///
    /// The failure counter is left alone because writes are the
    /// authoritative health signal, but a `Degraded` registration is
    /// promoted back to `Healthy`.
    pub(crate) async fn record_read_success(&self, reg: &ArchiveRegistration) {
        let mut health = reg.last_health.write().await;
        if let HealthStatus::Degraded { .. } = *health {
            *health = HealthStatus::Healthy;
        }
    }

    /// Records a failed write with the given `reason`.
    ///
    /// Bumps the consecutive-failure counter, stores `(now, reason)` as the
    /// last error, and sets health to `Unavailable` once the counter reaches
    /// `FAILURE_THRESHOLD`, or to `Degraded` before that.
    pub(crate) async fn record_write_failure(
        &self,
        reg: &ArchiveRegistration,
        reason: &str,
    ) {
        // Lock order (counter, then last error, then health) is kept as-is;
        // the counter guard stays held until the end of the function.
        let mut failures = reg.consecutive_failures.write().await;
        *failures = failures.saturating_add(1);

        let stamped = (Utc::now(), reason.to_owned());
        *reg.last_error.write().await = Some(stamped);

        let mut health = reg.last_health.write().await;
        *health = if *failures >= FAILURE_THRESHOLD {
            HealthStatus::Unavailable {
                reason: format!("{} consecutive failures: {reason}", *failures),
            }
        } else {
            HealthStatus::Degraded {
                reason: reason.to_owned(),
            }
        };
    }

    /// Records a failed read.
    ///
    /// Reads on their own never drift to `Unavailable`; only `Healthy` is
    /// demoted, to `Degraded`.
    pub(crate) async fn record_read_failure(&self, reg: &ArchiveRegistration) {
        let mut health = reg.last_health.write().await;
        if let HealthStatus::Healthy = *health {
            *health = HealthStatus::Degraded {
                reason: "read failure".into(),
            };
        }
    }

    /// Returns a snapshot of the registration's current health.
    #[allow(dead_code)]
    pub(crate) async fn current_health(&self, reg: &ArchiveRegistration) -> HealthStatus {
        let health = reg.last_health.read().await;
        health.clone()
    }
}
@@ -1,22 +0,0 @@
//! Multi-backend registry: configuration, factory, registration entries,
//! read cache, queued writer tasks, and health drift helpers.
//!
//! The single `registry.rs` file from Phase 2 (on-disk archive metadata
//! persistence) has been replaced; archive declaration moves to
//! `dirigent.toml` and is consumed at boot via
//! `coordinator::boot::Archivist::from_config` in later Phase 3 tasks.
pub mod cache;
pub mod config;
pub mod factory;
pub mod filter;
pub mod health;
pub mod registration;
pub mod writer;
pub use config::{ArchiveConfig, ArchivesConfig, ConfigValidationError};
pub use factory::{BackendBuildError, BackendFactory, BackendRegistry, JsonlFactory};
pub use filter::ArchiveFilter;
pub use registration::{
ArchiveRegistration, ArchiveStatus, FailureMode, OverflowPolicy, WritePolicy,
};
@@ -1,181 +0,0 @@
//! Per-backend configuration value types used by `ArchiveRegistration`.
use serde::{Deserialize, Serialize};
/// Failure mode configured for an archive (serialized in `snake_case`).
///
/// `Required` is the default.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum FailureMode {
    /// Serialized as `required`.
    #[default]
    Required,
    /// Serialized as `best_effort`.
    BestEffort,
}
/// Runtime write policy of a registration.
///
/// `Inline` is the default.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum WritePolicy {
    /// No queue settings.
    #[default]
    Inline,
    /// Queued writes with the given settings.
    Queued {
        /// Batch window in milliseconds.
        batch_window_ms: u64,
        /// Queue capacity.
        capacity: usize,
        /// Overflow policy for a full queue.
        overflow: OverflowPolicy,
    },
}
/// Overflow policy for a queued writer (serialized in `snake_case`).
///
/// `Block` is the default.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum OverflowPolicy {
    /// Serialized as `block`.
    #[default]
    Block,
    /// Serialized as `drop_oldest`.
    DropOldest,
    /// Serialized as `error`.
    Error,
}
use std::sync::Arc;
use chrono::{DateTime, Utc};
use tokio::sync::RwLock;
use crate::backend::{ArchiveBackend, CapabilitySet, HealthStatus};
use crate::registry::filter::ArchiveFilter;
use super::writer::WriterHandle;
/// Runtime entry for one configured archive: the backend, its settings, and
/// the shared health-drift state.
pub struct ArchiveRegistration {
    /// Archive name.
    pub name: String,
    /// Backend type string.
    pub type_name: &'static str,
    /// The backend instance.
    pub backend: Arc<dyn ArchiveBackend>,
    pub write_active: bool,
    pub failure_mode: FailureMode,
    pub read_priority: u32,
    pub enabled: bool,
    pub write_policy: WritePolicy,
    /// Include/exclude filter consulted during non-primary write fanout.
    /// The default (unrestricted) filter accepts every session, and the
    /// primary target writes regardless of its filter.
    pub filter: ArchiveFilter,
    /// Most recently recorded health.
    pub last_health: Arc<RwLock<HealthStatus>>,
    /// Timestamp and message of the most recent recorded error, if any.
    pub last_error: Arc<RwLock<Option<(DateTime<Utc>, String)>>>,
    /// Number of failures recorded since the counter was last reset.
    pub consecutive_failures: Arc<RwLock<u32>>,
    /// Writer handle, when one was supplied for this registration.
    pub writer: Option<WriterHandle>,
}
impl ArchiveRegistration {
/// Convenience constructor: builds new `Arc<RwLock<_>>` instances for the
/// drift trio. Use this for single-process, single-owner registrations
/// (tests and the simple single-archive constructors).
#[allow(clippy::too_many_arguments)]
pub fn new(
name: String,
type_name: &'static str,
backend: Arc<dyn ArchiveBackend>,
write_active: bool,
failure_mode: FailureMode,
read_priority: u32,
enabled: bool,
write_policy: WritePolicy,
writer: Option<WriterHandle>,
initial_health: HealthStatus,
) -> Self {
Self::new_with_shared_state(
name,
type_name,
backend,
write_active,
failure_mode,
read_priority,
enabled,
write_policy,
writer,
Arc::new(RwLock::new(initial_health)),
Arc::new(RwLock::new(None)),
Arc::new(RwLock::new(0)),
)
}
/// Constructor used by `from_config` so the writer task and the
/// registration share the same drift state (both mutate it).
#[allow(clippy::too_many_arguments)]
pub fn new_with_shared_state(
name: String,
type_name: &'static str,
backend: Arc<dyn ArchiveBackend>,
write_active: bool,
failure_mode: FailureMode,
read_priority: u32,
enabled: bool,
write_policy: WritePolicy,
writer: Option<WriterHandle>,
last_health: Arc<RwLock<HealthStatus>>,
last_error: Arc<RwLock<Option<(DateTime<Utc>, String)>>>,
consecutive_failures: Arc<RwLock<u32>>,
) -> Self {
Self {
name,
type_name,
backend,
write_active,
failure_mode,
read_priority,
enabled,
write_policy,
filter: ArchiveFilter::default(),
last_health,
last_error,
consecutive_failures,
writer,
}
}
/// Override the registration's filter. Intended for boot-time wiring
/// (`from_config`) and tests; the field itself is public for other
/// direct consumers.
pub fn with_filter(mut self, filter: ArchiveFilter) -> Self {
self.filter = filter;
self
}
pub fn capabilities(&self) -> &CapabilitySet {
self.backend.capabilities()
}
}
/// Serializable status record for one archive, with owned values only.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchiveStatus {
    /// Archive name.
    pub name: String,
    /// Backend type string.
    pub type_name: String,
    pub enabled: bool,
    pub write_active: bool,
    pub failure_mode: FailureMode,
    pub read_priority: u32,
    /// Capability set of the backend.
    pub capabilities: CapabilitySet,
    /// Health status.
    pub health: HealthStatus,
    /// Timestamp and message of the last error, if any.
    pub last_error: Option<(DateTime<Utc>, String)>,
    /// Queue depth, when available.
    pub queue_depth: Option<usize>,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The defaults are the conservative choices: required, inline, block.
    #[test]
    fn defaults_are_safe() {
        let failure_mode = FailureMode::default();
        let write_policy = WritePolicy::default();
        let overflow = OverflowPolicy::default();

        assert_eq!(failure_mode, FailureMode::Required);
        assert_eq!(write_policy, WritePolicy::Inline);
        assert_eq!(overflow, OverflowPolicy::Block);
    }
}
@@ -1,256 +0,0 @@
//! Per-backend writer task for `WritePolicy::Queued` backends.
//!
//! The task drains a per-backend mpsc, optionally batching/coalescing within
//! a configured window, and invokes `ArchiveBackend` methods directly. Errors
//! drift health on the parent registration; they do not propagate to the
//! caller.
use std::sync::Arc;
use std::time::Duration;
use chrono::Utc;
use tokio::sync::{mpsc, oneshot, watch, RwLock};
use tokio::task::JoinHandle;
use tracing::{debug, warn};
use uuid::Uuid;
use crate::backend::{ArchiveBackend, HealthStatus};
use super::OverflowPolicy;
/// A unit of work sent to a backend's writer task.
#[derive(Debug)]
pub enum WriteOp {
    /// Store session metadata.
    PutSession(crate::types::SessionMetadata),

    /// Append `msgs` to the session identified by `scroll_id`.
    AppendMessages {
        scroll_id: Uuid,
        msgs: Vec<crate::types::MessageRecord>,
    },

    /// Delete the session identified by `scroll_id`.
    DeleteSession { scroll_id: Uuid },

    /// Clear the messages of the session identified by `scroll_id`.
    ClearSessionMessages { scroll_id: Uuid },

    /// Append one DAG edge.
    AppendDagEdge(crate::types::DagEdge),

    /// Append `events` to the session identified by `scroll_id`.
    AppendMetaEvents {
        scroll_id: Uuid,
        events: Vec<crate::types::MetaEventRecord>,
    },

    /// Ask the writer task to stop; the sender is signalled once it has.
    Shutdown(oneshot::Sender<()>),
}
impl WriteOp {
pub fn op_label(&self) -> &'static str {
match self {
WriteOp::PutSession(_) => "put_session",
WriteOp::AppendMessages { .. } => "append_messages",
WriteOp::DeleteSession { .. } => "delete_session",
WriteOp::ClearSessionMessages { .. } => "clear_session_messages",
WriteOp::AppendDagEdge(_) => "append_dag_edge",
WriteOp::AppendMetaEvents { .. } => "append_meta_events",
WriteOp::Shutdown(_) => "shutdown",
}
}
}
/// Caller-side handle to a spawned writer task.
#[derive(Debug)]
pub struct WriterHandle {
    /// Sending half of the writer's operation queue.
    pub sender: mpsc::Sender<WriteOp>,
    /// Policy applied by `enqueue` when the queue is full.
    pub overflow: OverflowPolicy,
    /// Watch channel on which the writer task publishes its queue depth.
    pub queue_depth: watch::Receiver<usize>,
    /// Join handle of the writer task, behind a mutex so it can be taken
    /// through a shared reference.
    pub join: tokio::sync::Mutex<Option<JoinHandle<()>>>,
    /// Backend name used in error messages.
    pub backend_name: String,
}
impl WriterHandle {
pub async fn enqueue(&self, op: WriteOp) -> Result<(), crate::error::ArchivistError> {
match self.overflow {
OverflowPolicy::Block => self.sender.send(op).await.map_err(|_| {
crate::error::ArchivistError::Other(format!(
"writer task for `{}` has closed",
self.backend_name
))
}),
OverflowPolicy::Error => self.sender.try_send(op).map_err(|e| match e {
mpsc::error::TrySendError::Full(op) => {
crate::error::ArchivistError::WriteQueueFull {
backend: self.backend_name.clone(),
op: op.op_label(),
}
}
mpsc::error::TrySendError::Closed(_) => {
crate::error::ArchivistError::Other(format!(
"writer task for `{}` has closed",
self.backend_name
))
}
}),
OverflowPolicy::DropOldest => {
// Tokio mpsc can't truly "drop oldest" without draining from the
// other side; we approximate with "drop newest when full". For
// observability sinks this is acceptable — the contract is
// "never block, may lose data".
let _ = self.sender.try_send(op);
Ok(())
}
}
}
pub fn queue_depth_now(&self) -> usize {
*self.queue_depth.borrow()
}
}
#[allow(clippy::too_many_arguments)]
pub fn spawn_writer(
backend: Arc<dyn ArchiveBackend>,
backend_name: String,
capacity: usize,
batch_window: Duration,
overflow: OverflowPolicy,
health: Arc<RwLock<HealthStatus>>,
last_error: Arc<RwLock<Option<(chrono::DateTime<chrono::Utc>, String)>>>,
consecutive_failures: Arc<RwLock<u32>>,
) -> WriterHandle {
let (tx, mut rx) = mpsc::channel::<WriteOp>(capacity);
let (depth_tx, depth_rx) = watch::channel(0usize);
let join = tokio::spawn({
let backend_name = backend_name.clone();
async move {
const FAILURE_THRESHOLD: u32 = 5;
loop {
let Some(first) = rx.recv().await else { break };
let mut batch: Vec<WriteOp> = vec![first];
let deadline = tokio::time::Instant::now() + batch_window;
while tokio::time::Instant::now() < deadline {
match tokio::time::timeout_at(deadline, rx.recv()).await {
Ok(Some(op)) => batch.push(op),
Ok(None) => break,
Err(_) => break,
}
}
let _ = depth_tx.send(rx.len());
let coalesced = coalesce(batch);
let mut shutdown_ack: Option<oneshot::Sender<()>> = None;
for op in coalesced {
if let WriteOp::Shutdown(ack) = op {
shutdown_ack = Some(ack);
break;
}
match dispatch_op(&*backend, op).await {
Ok(()) => {
*consecutive_failures.write().await = 0;
let mut h = health.write().await;
if matches!(*h, HealthStatus::Degraded { .. }) {
*h = HealthStatus::Healthy;
}
}
Err(e) => {
warn!(
backend = backend_name.as_str(),
error = %e,
"queued write failed; drifting health"
);
let mut n = consecutive_failures.write().await;
*n = n.saturating_add(1);
*last_error.write().await = Some((Utc::now(), format!("{e}")));
let mut h = health.write().await;
if *n >= FAILURE_THRESHOLD {
*h = HealthStatus::Unavailable {
reason: format!("{} consecutive failures", *n),
};
} else {
*h = HealthStatus::Degraded { reason: format!("{e}") };
}
}
}
}
if let Some(ack) = shutdown_ack {
debug!(backend = backend_name.as_str(), "writer task shutting down");
let _ = ack.send(());
break;
}
}
}
});
WriterHandle {
sender: tx,
overflow,
queue_depth: depth_rx,
join: tokio::sync::Mutex::new(Some(join)),
backend_name,
}
}
/// Merges adjacent append operations that target the same session.
///
/// An `AppendMessages` (or `AppendMetaEvents`) op directly following another
/// op of the same kind with the same `scroll_id` has its payload appended to
/// that previous op. Every other op is passed through unchanged, and
/// relative order is preserved.
fn coalesce(batch: Vec<WriteOp>) -> Vec<WriteOp> {
    let mut merged: Vec<WriteOp> = Vec::with_capacity(batch.len());

    for op in batch {
        // Each arm either folds the payload into the previous op and moves
        // on, or hands the op back to be pushed as-is.
        let standalone = match op {
            WriteOp::AppendMessages { scroll_id, msgs } => {
                if let Some(WriteOp::AppendMessages {
                    scroll_id: previous,
                    msgs: tail,
                }) = merged.last_mut()
                {
                    if *previous == scroll_id {
                        tail.extend(msgs);
                        continue;
                    }
                }
                WriteOp::AppendMessages { scroll_id, msgs }
            }
            WriteOp::AppendMetaEvents { scroll_id, events } => {
                if let Some(WriteOp::AppendMetaEvents {
                    scroll_id: previous,
                    events: tail,
                }) = merged.last_mut()
                {
                    if *previous == scroll_id {
                        tail.extend(events);
                        continue;
                    }
                }
                WriteOp::AppendMetaEvents { scroll_id, events }
            }
            other => other,
        };
        merged.push(standalone);
    }

    merged
}
/// Executes a single `op` against `backend`.
///
/// DAG-edge and meta-event appends are forwarded only when the backend
/// exposes the matching optional interface (`as_dag` / `as_meta_events`);
/// otherwise they succeed without doing anything. `Shutdown` is a no-op
/// here.
async fn dispatch_op(backend: &dyn ArchiveBackend, op: WriteOp) -> crate::error::Result<()> {
    match op {
        WriteOp::Shutdown(_) => Ok(()),
        WriteOp::PutSession(meta) => backend.put_session(meta).await,
        WriteOp::DeleteSession { scroll_id } => backend.delete_session(scroll_id).await,
        WriteOp::ClearSessionMessages { scroll_id } => {
            backend.clear_session_messages(scroll_id).await
        }
        WriteOp::AppendMessages { scroll_id, msgs } => {
            backend.append_messages(scroll_id, msgs).await
        }
        WriteOp::AppendDagEdge(edge) => match backend.as_dag() {
            Some(dag) => dag.append_dag_edge(edge).await,
            None => Ok(()),
        },
        WriteOp::AppendMetaEvents { scroll_id, events } => match backend.as_meta_events() {
            Some(sink) => sink.append_meta_events(scroll_id, events).await,
            None => Ok(()),
        },
    }
}
-24
View File
@@ -1,24 +0,0 @@
//! Session management and lineage tracking.
//!
//! Handles session metadata, lineage relationships (splits, continuations),
//! and session lifecycle operations.
use crate::error::Result;
/// Session manager for tracking session metadata and lineage.
///
/// At present this is an empty placeholder type: it declares no fields and
/// carries no state.
pub struct SessionManager {
// Placeholder - will be populated in implementation phases
}
impl SessionManager {
/// Create a new session manager
pub fn new() -> Result<Self> {
Ok(Self {})
}
}
impl Default for SessionManager {
fn default() -> Self {
Self::new().expect("Failed to create default SessionManager")
}
}
@@ -1,465 +0,0 @@
//! Content-addressable file storage.
//!
//! Handles storage and retrieval of binary files (images, documents, etc.)
//! using content-addressable naming based on SHA-256 hashes.
//!
//! Files are stored with deduplication:
//! - Same content = same file_id = stored once
//! - Multiple sessions can reference the same file
//! - File index tracks all referencing sessions
use crate::storage::{ndjson, paths::ArchivePaths};
use crate::types::FileRecord;
use sha2::{Digest, Sha256};
use uuid::Uuid;
/// Store a file in the archive
///
/// This function:
/// 1. Computes SHA-256 hash of content
/// 2. Generates file_id: `sha256:{hex_digest}`
/// 3. Stores blob at the path given by `ArchivePaths::file_blob_path`
///    (skipped when a blob already exists there - deduplication)
/// 4. Updates `.files/file_index.jsonl`: adds `session` to the existing record
///    for this file_id, or appends a new record if there is none
/// 5. Returns the file_id
///
/// The index is rewritten as a whole through `ndjson::write_ndjson`
/// (temp file + rename) while holding the archive's file-index lock.
///
/// # Arguments
/// * `paths` - Archive paths helper
/// * `content` - File content bytes
/// * `original_name` - Original filename (recorded only when the record is new)
/// * `mime` - Optional MIME type (recorded only when the record is new)
/// * `session` - Session UUID that references this file
///
/// # Returns
/// The file_id (e.g., "sha256:abc123...")
///
/// # Errors
/// Returns any I/O error from creating directories, writing the blob, or
/// reading / rewriting the index.
pub async fn store_file(
    paths: &ArchivePaths,
    content: &[u8],
    original_name: String,
    mime: Option<String>,
    session: Uuid,
) -> std::io::Result<String> {
    // Content address: the id depends only on the bytes being stored.
    let hash = Sha256::digest(content);
    let hex_digest = hex::encode(hash);
    let file_id = format!("sha256:{}", hex_digest);

    // Make sure the blob's directory exists, then write the blob unless an
    // identical one is already there (deduplication).
    let blob_path = paths.file_blob_path(&file_id);
    if let Some(parent) = blob_path.parent() {
        tokio::fs::create_dir_all(parent).await?;
    }
    if !blob_path.exists() {
        tokio::fs::write(&blob_path, content).await?;
    }

    let index_path = paths.root().join(".files").join("file_index.jsonl");
    if let Some(parent) = index_path.parent() {
        tokio::fs::create_dir_all(parent).await?;
    }

    // Serialize the read-modify-rewrite below. Concurrent callers against the
    // same archive would otherwise lose records (both read the same snapshot)
    // and race on the temp file that the rewrite renames into place.
    let index_lock = paths.file_index_lock();
    let _index_guard = index_lock.lock().await;

    let mut records: Vec<FileRecord> = ndjson::read_ndjson(&index_path).await?;

    if let Some(existing) = records.iter_mut().find(|r| r.file_id == file_id) {
        // Known file: just make sure this session is listed once.
        if !existing.sessions.contains(&session) {
            existing.sessions.push(session);
        }
    } else {
        // New file: store the blob path relative to the archive root when
        // possible, falling back to the full path otherwise.
        let relative_path = blob_path
            .strip_prefix(paths.root())
            .unwrap_or(&blob_path)
            .to_string_lossy()
            .to_string();
        records.push(FileRecord {
            version: 1,
            file_id: file_id.clone(),
            path: relative_path,
            size: content.len() as u64,
            mime,
            original_name,
            sessions: vec![session],
            metadata: serde_json::json!({}),
        });
    }

    // Rewrite the whole index via the shared helper: a single temp file that
    // is fsynced once and then renamed over the index, instead of re-opening
    // and fsyncing the temp file once per record.
    ndjson::write_ndjson(&index_path, &records).await?;

    Ok(file_id)
}
/// Read a stored blob back from the archive.
///
/// # Arguments
/// * `paths` - Archive paths helper used to resolve the blob location
/// * `file_id` - File identifier (e.g., "sha256:abc123...")
///
/// # Returns
/// The blob's bytes.
///
/// # Errors
/// Propagates the I/O error from reading the resolved blob path.
pub async fn get_file(paths: &ArchivePaths, file_id: &str) -> std::io::Result<Vec<u8>> {
    let location = paths.file_blob_path(file_id);
    let bytes = tokio::fs::read(&location).await;
    bytes
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns a unique scratch root under the system temp directory together
    /// with an `ArchivePaths` built from it. Nothing is created on disk here.
    fn scratch_archive() -> (std::path::PathBuf, ArchivePaths) {
        let root = std::env::temp_dir().join(format!("archivist_files_test_{}", Uuid::now_v7()));
        let paths = ArchivePaths::new(root.clone());
        (root, paths)
    }

    /// Loads every record currently present in the archive's file index.
    async fn load_index(paths: &ArchivePaths) -> Vec<FileRecord> {
        let index_path = paths.root().join(".files").join("file_index.jsonl");
        ndjson::read_ndjson(&index_path).await.unwrap()
    }

    /// Finds the index record for `file_id`, panicking when it is missing.
    fn record_for<'a>(records: &'a [FileRecord], file_id: &str) -> &'a FileRecord {
        records.iter().find(|r| r.file_id == file_id).unwrap()
    }

    /// Best-effort removal of the scratch directory; failures are ignored.
    async fn cleanup(root: &std::path::Path) {
        tokio::fs::remove_dir_all(root).await.ok();
    }

    #[tokio::test]
    async fn test_content_deduplication() {
        let (root, paths) = scratch_archive();
        let payload = b"Hello, world! This is test content.";
        let first_session = Uuid::now_v7();
        let second_session = Uuid::now_v7();

        // Identical bytes arrive from two sessions under different names.
        let first_id = store_file(
            &paths,
            payload,
            "test1.txt".to_string(),
            Some("text/plain".to_string()),
            first_session,
        )
        .await
        .unwrap();
        let second_id = store_file(
            &paths,
            payload,
            "test2.txt".to_string(), // Different name
            Some("text/plain".to_string()),
            second_session,
        )
        .await
        .unwrap();

        // Identical content maps to one id and one blob on disk.
        assert_eq!(first_id, second_id);
        assert!(paths.file_blob_path(&first_id).exists());

        // The single index record lists both sessions.
        let records = load_index(&paths).await;
        let record = record_for(&records, &first_id);
        assert_eq!(record.sessions.len(), 2);
        assert!(record.sessions.contains(&first_session));
        assert!(record.sessions.contains(&second_session));

        cleanup(&root).await;
    }

    #[tokio::test]
    async fn test_sharding_distributes_files() {
        let (root, paths) = scratch_archive();
        let session = Uuid::now_v7();

        let id_a = store_file(&paths, b"Content A", "file1.txt".to_string(), None, session)
            .await
            .unwrap();
        let id_b = store_file(&paths, b"Content B", "file2.txt".to_string(), None, session)
            .await
            .unwrap();
        let id_c = store_file(&paths, b"Content C", "file3.txt".to_string(), None, session)
            .await
            .unwrap();

        // Distinct content yields distinct ids.
        assert_ne!(id_a, id_b);
        assert_ne!(id_b, id_c);

        // Every blob landed at its computed location.
        assert!(paths.file_blob_path(&id_a).exists());
        assert!(paths.file_blob_path(&id_b).exists());
        assert!(paths.file_blob_path(&id_c).exists());

        // At least one subdirectory was created beneath `.files`.
        let shard_dirs: Vec<_> = std::fs::read_dir(paths.root().join(".files"))
            .unwrap()
            .map(|entry| entry.unwrap())
            .filter(|entry| entry.file_type().unwrap().is_dir())
            .map(|entry| entry.path())
            .collect();
        assert!(!shard_dirs.is_empty());

        cleanup(&root).await;
    }

    #[tokio::test]
    async fn test_index_tracks_sessions() {
        let (root, paths) = scratch_archive();
        let payload = b"Shared content";
        let session_a = Uuid::now_v7();
        let session_b = Uuid::now_v7();
        let session_c = Uuid::now_v7();

        // First reference.
        let file_id = store_file(
            &paths,
            payload,
            "file.txt".to_string(),
            Some("text/plain".to_string()),
            session_a,
        )
        .await
        .unwrap();
        let records = load_index(&paths).await;
        let record = record_for(&records, &file_id);
        assert_eq!(record.sessions.len(), 1);
        assert_eq!(record.sessions[0], session_a);

        // Second reference.
        store_file(
            &paths,
            payload,
            "file2.txt".to_string(),
            Some("text/plain".to_string()),
            session_b,
        )
        .await
        .unwrap();
        let records = load_index(&paths).await;
        let record = record_for(&records, &file_id);
        assert_eq!(record.sessions.len(), 2);
        assert!(record.sessions.contains(&session_a));
        assert!(record.sessions.contains(&session_b));

        // Third reference.
        store_file(&paths, payload, "file3.txt".to_string(), None, session_c)
            .await
            .unwrap();
        let records = load_index(&paths).await;
        let record = record_for(&records, &file_id);
        assert_eq!(record.sessions.len(), 3);
        assert!(record.sessions.contains(&session_a));
        assert!(record.sessions.contains(&session_b));
        assert!(record.sessions.contains(&session_c));

        cleanup(&root).await;
    }

    #[tokio::test]
    async fn test_concurrent_writes_different_files() {
        let (root, paths) = scratch_archive();
        let content_a = b"Content 1";
        let content_b = b"Content 2";
        let session = Uuid::now_v7();

        // Drive both stores concurrently on the same archive.
        let (outcome_a, outcome_b) = tokio::join!(
            store_file(&paths, content_a, "file1.txt".to_string(), None, session),
            store_file(&paths, content_b, "file2.txt".to_string(), None, session)
        );
        let id_a = outcome_a.unwrap();
        let id_b = outcome_b.unwrap();

        assert_ne!(id_a, id_b);
        let fetched_a = get_file(&paths, &id_a).await.unwrap();
        let fetched_b = get_file(&paths, &id_b).await.unwrap();
        assert_eq!(fetched_a, content_a);
        assert_eq!(fetched_b, content_b);

        cleanup(&root).await;
    }

    #[tokio::test]
    async fn test_get_file_missing() {
        let (root, paths) = scratch_archive();

        // Nothing was stored, so the lookup has to fail with NotFound.
        let result = get_file(&paths, "sha256:nonexistent").await;
        assert!(result.is_err());
        let kind = match result {
            Err(e) => e.kind(),
            Ok(_) => panic!("Expected NotFound error"),
        };
        assert_eq!(kind, std::io::ErrorKind::NotFound);

        cleanup(&root).await;
    }

    #[tokio::test]
    async fn test_roundtrip_binary_content() {
        let (root, paths) = scratch_archive();

        // Every byte value once: deliberately not valid UTF-8.
        let content: Vec<u8> = (0..=255u8).collect();
        let session = Uuid::now_v7();

        let file_id = store_file(
            &paths,
            &content,
            "binary.dat".to_string(),
            Some("application/octet-stream".to_string()),
            session,
        )
        .await
        .unwrap();

        let retrieved = get_file(&paths, &file_id).await.unwrap();
        assert_eq!(retrieved, content);

        cleanup(&root).await;
    }

    #[tokio::test]
    async fn test_file_metadata_preserved() {
        let (root, paths) = scratch_archive();
        let content = b"Test content";
        let session = Uuid::now_v7();

        let file_id = store_file(
            &paths,
            content,
            "document.pdf".to_string(),
            Some("application/pdf".to_string()),
            session,
        )
        .await
        .unwrap();

        let records = load_index(&paths).await;
        let record = record_for(&records, &file_id);

        // Name, MIME type, size and path were all recorded.
        assert_eq!(record.original_name, "document.pdf");
        assert_eq!(record.mime.as_deref(), Some("application/pdf"));
        assert_eq!(record.size, content.len() as u64);
        assert!(record.path.contains(".files"));

        cleanup(&root).await;
    }

    #[tokio::test]
    async fn test_deduplicate_same_session() {
        let (root, paths) = scratch_archive();
        let content = b"Duplicate content";
        let session = Uuid::now_v7();

        // The same session stores the same bytes twice.
        let first_id = store_file(&paths, content, "file1.txt".to_string(), None, session)
            .await
            .unwrap();
        let second_id = store_file(&paths, content, "file2.txt".to_string(), None, session)
            .await
            .unwrap();
        assert_eq!(first_id, second_id);

        // The session is listed exactly once.
        let records = load_index(&paths).await;
        let record = record_for(&records, &first_id);
        assert_eq!(record.sessions.len(), 1);
        assert_eq!(record.sessions[0], session);

        cleanup(&root).await;
    }
}
@@ -1,342 +0,0 @@
//! JSON storage utilities for session metadata.
//!
//! Handles reading and writing JSON files for session and connector metadata.
//! Uses atomic write operations (write-to-temp + rename) to ensure consistency.
use serde::{Deserialize, Serialize};
use std::path::Path;
use tokio::io::AsyncWriteExt;
/// Serialize `value` as pretty-printed JSON and install it at `path`.
///
/// Steps:
/// 1. Serialize the value to pretty-printed JSON
/// 2. Write and fsync the text to a sibling temp file (`path` with its
///    extension replaced by `tmp`)
/// 3. Rename the temp file onto `path`
///
/// Because the final step is a rename, readers of `path` are intended to see
/// either the previous complete file or the new complete file, not a
/// partially written one. Parent directories are not created here.
///
/// # Arguments
/// * `path` - Path to the JSON file
/// * `value` - Value to serialize and write
///
/// # Errors
/// Serialization failures are reported as `ErrorKind::InvalidData`; file
/// creation, write, sync and rename errors are propagated unchanged.
///
/// # Example
/// ```no_run
/// use dirigent_archivist::storage::json::write_json;
/// use serde::{Serialize, Deserialize};
///
/// #[derive(Serialize, Deserialize)]
/// struct Config {
///     setting: String,
/// }
///
/// # async fn example() -> std::io::Result<()> {
/// let config = Config { setting: "value".to_string() };
/// write_json(std::path::Path::new("config.json"), &config).await?;
/// # Ok(())
/// # }
/// ```
pub async fn write_json<T: Serialize>(path: &Path, value: &T) -> std::io::Result<()> {
    let payload = match serde_json::to_string_pretty(value) {
        Ok(text) => text,
        Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
    };

    // Stage next to the target so the rename stays within one directory.
    let staging = path.with_extension("tmp");
    {
        let mut out = tokio::fs::File::create(&staging).await?;
        out.write_all(payload.as_bytes()).await?;
        out.sync_all().await?;
        // `out` is closed at the end of this scope, before the rename.
    }

    tokio::fs::rename(&staging, path).await
}
/// Load and deserialize a JSON file.
///
/// # Arguments
/// * `path` - Path to the JSON file
///
/// # Returns
/// The deserialized value.
///
/// # Errors
/// Errors from reading the file are propagated unchanged (a missing file
/// therefore surfaces as `NotFound`). Content that does not deserialize into
/// `T` is reported as `ErrorKind::InvalidData`.
///
/// # Example
/// ```no_run
/// use dirigent_archivist::storage::json::read_json;
/// use serde::{Serialize, Deserialize};
///
/// #[derive(Serialize, Deserialize)]
/// struct Config {
///     setting: String,
/// }
///
/// # async fn example() -> std::io::Result<()> {
/// let config: Config = read_json(std::path::Path::new("config.json")).await?;
/// # Ok(())
/// # }
/// ```
pub async fn read_json<T: for<'de> Deserialize<'de>>(path: &Path) -> std::io::Result<T> {
    let raw = tokio::fs::read_to_string(path).await?;
    serde_json::from_str(&raw)
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde::{Deserialize, Serialize};
    use uuid::Uuid;

    #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
    struct TestData {
        id: String,
        value: i32,
        nested: NestedData,
    }

    #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
    struct NestedData {
        flag: bool,
        items: Vec<String>,
    }

    /// Unique `<prefix>_<uuid>.json` path inside the system temp directory.
    fn temp_json(prefix: &str) -> std::path::PathBuf {
        std::env::temp_dir().join(format!("{}_{}.json", prefix, Uuid::now_v7()))
    }

    /// Shorthand for building a `TestData` fixture.
    fn sample(id: &str, value: i32, flag: bool, items: &[&str]) -> TestData {
        TestData {
            id: id.to_string(),
            value,
            nested: NestedData {
                flag,
                items: items.iter().map(|s| s.to_string()).collect(),
            },
        }
    }

    #[tokio::test]
    async fn test_write_and_read_roundtrip() {
        let file_path = temp_json("test_json");
        let data = sample("test-123", 42, true, &["a", "b", "c"]);

        write_json(&file_path, &data).await.unwrap();
        let read_data: TestData = read_json(&file_path).await.unwrap();
        assert_eq!(read_data, data);

        tokio::fs::remove_file(&file_path).await.ok();
    }

    #[tokio::test]
    async fn test_pretty_printed_output() {
        let file_path = temp_json("test_pretty");
        let data = sample("test", 100, false, &["x"]);

        write_json(&file_path, &data).await.unwrap();

        // Inspect the raw text rather than the parsed value.
        let content = tokio::fs::read_to_string(&file_path).await.unwrap();
        assert!(content.contains('\n'));
        assert!(content.contains(" ")); // Indentation
        assert!(content.contains(r#""id""#));
        assert!(content.contains(r#""value""#));

        tokio::fs::remove_file(&file_path).await.ok();
    }

    #[tokio::test]
    async fn test_read_missing_file() {
        let file_path = temp_json("nonexistent");

        // A path that was never written must report NotFound.
        let result: std::io::Result<TestData> = read_json(&file_path).await;
        assert!(result.is_err());
        let kind = match result {
            Err(e) => e.kind(),
            Ok(_) => panic!("Expected NotFound error"),
        };
        assert_eq!(kind, std::io::ErrorKind::NotFound);
    }

    #[tokio::test]
    async fn test_atomic_write() {
        let file_path = temp_json("test_atomic");
        let data1 = sample("first", 1, true, &[]);
        let data2 = sample("second", 2, false, &["updated"]);

        // First version lands.
        write_json(&file_path, &data1).await.unwrap();
        let read1: TestData = read_json(&file_path).await.unwrap();
        assert_eq!(read1.id, "first");

        // Second version replaces it.
        write_json(&file_path, &data2).await.unwrap();
        let read2: TestData = read_json(&file_path).await.unwrap();
        assert_eq!(read2.id, "second");
        assert_eq!(read2.value, 2);

        // Temp file should not exist after rename
        let temp_path = file_path.with_extension("tmp");
        assert!(!temp_path.exists());

        tokio::fs::remove_file(&file_path).await.ok();
    }

    #[tokio::test]
    async fn test_invalid_json_error() {
        let file_path = temp_json("test_invalid");

        // Plant malformed JSON directly, bypassing `write_json`.
        tokio::fs::write(&file_path, "{ invalid json }")
            .await
            .unwrap();

        let result: std::io::Result<TestData> = read_json(&file_path).await;
        assert!(result.is_err());
        let kind = match result {
            Err(e) => e.kind(),
            Ok(_) => panic!("Expected InvalidData error"),
        };
        assert_eq!(kind, std::io::ErrorKind::InvalidData);

        tokio::fs::remove_file(&file_path).await.ok();
    }

    #[tokio::test]
    async fn test_concurrent_writes_different_files() {
        let file1 = temp_json("test_concurrent_1");
        let file2 = temp_json("test_concurrent_2");
        let data1 = sample("file1", 1, true, &[]);
        let data2 = sample("file2", 2, false, &[]);

        // Drive both writes concurrently.
        let (r1, r2) = tokio::join!(write_json(&file1, &data1), write_json(&file2, &data2));
        r1.unwrap();
        r2.unwrap();

        let read1: TestData = read_json(&file1).await.unwrap();
        let read2: TestData = read_json(&file2).await.unwrap();
        assert_eq!(read1, data1);
        assert_eq!(read2, data2);

        tokio::fs::remove_file(&file1).await.ok();
        tokio::fs::remove_file(&file2).await.ok();
    }

    #[tokio::test]
    async fn test_write_creates_parent_directory() {
        // Despite the name, this verifies that `write_json` does NOT create
        // missing parent directories; that is the caller's job.
        let base_dir = std::env::temp_dir().join(format!("test_parent_{}", Uuid::now_v7()));
        let file_path = base_dir.join("subdir").join("test.json");
        let data = sample("test", 42, true, &[]);

        // Parent directory is absent, so the write must fail.
        let result = write_json(&file_path, &data).await;
        assert!(result.is_err());

        // Once the parent exists the same write succeeds.
        tokio::fs::create_dir_all(file_path.parent().unwrap())
            .await
            .unwrap();
        write_json(&file_path, &data).await.unwrap();

        let read_data: TestData = read_json(&file_path).await.unwrap();
        assert_eq!(read_data, data);

        tokio::fs::remove_dir_all(&base_dir).await.ok();
    }
}
@@ -1,118 +0,0 @@
//! Storage layer for the Archivist.
//!
//! Provides file-based storage using NDJSON, JSON, and TSV formats,
//! along with content-addressable file storage for attachments.
use uuid::Uuid;
pub mod files;
pub mod json;
pub mod ndjson;
pub mod paths;
pub mod tsv;
// Re-export commonly used types and functions
pub use files::{get_file, store_file};
pub use json::{read_json, write_json};
pub use ndjson::{append_ndjson, read_ndjson, write_ndjson};
pub use paths::ArchivePaths;
pub use tsv::{read_connector_index, write_connector_index};
/// Report whether `uuid` carries version number 7.
///
/// Version 7 UUIDs are the time-ordered kind; this module's tests and docs
/// treat them as the expected form for archivist identifiers such as
/// scroll_ids.
///
/// # Examples
///
/// ```
/// use uuid::Uuid;
/// use dirigent_archivist::storage::is_uuid7;
///
/// let uuid7 = Uuid::now_v7();
/// assert!(is_uuid7(&uuid7));
///
/// let uuid4 = Uuid::new_v4();
/// assert!(!is_uuid7(&uuid4));
/// ```
pub fn is_uuid7(uuid: &Uuid) -> bool {
    matches!(uuid.get_version_num(), 7)
}
/// Parse `s` as a UUID and keep it only if it is version 7.
///
/// Returns `None` both when `s` is not a valid UUID at all and when it parses
/// to a UUID of any other version (v1, v4, v5, ...).
///
/// # Examples
///
/// ```
/// use uuid::Uuid;
/// use dirigent_archivist::storage::parse_uuid7;
///
/// // UUID7 string parses successfully
/// let uuid7_str = Uuid::now_v7().to_string();
/// assert!(parse_uuid7(&uuid7_str).is_some());
///
/// // UUID4 string is rejected
/// let uuid4_str = Uuid::new_v4().to_string();
/// assert!(parse_uuid7(&uuid4_str).is_none());
///
/// // Invalid UUID string is rejected
/// assert!(parse_uuid7("not-a-uuid").is_none());
/// ```
pub fn parse_uuid7(s: &str) -> Option<Uuid> {
    match Uuid::parse_str(s) {
        Ok(candidate) if is_uuid7(&candidate) => Some(candidate),
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_is_uuid7_accepts_uuid7() {
        assert!(
            is_uuid7(&Uuid::now_v7()),
            "UUID7 should be recognized as version 7"
        );
    }

    #[test]
    fn test_is_uuid7_rejects_uuid4() {
        assert!(
            !is_uuid7(&Uuid::new_v4()),
            "UUID4 should not be recognized as version 7"
        );
    }

    #[test]
    fn test_parse_uuid7_accepts_valid_uuid7_string() {
        let original = Uuid::now_v7();
        let parsed = parse_uuid7(&original.to_string());
        assert!(parsed.is_some(), "Valid UUID7 string should parse");
        assert_eq!(parsed.unwrap(), original, "Parsed UUID should match original");
    }

    #[test]
    fn test_parse_uuid7_rejects_uuid4_string() {
        let text = Uuid::new_v4().to_string();
        assert!(parse_uuid7(&text).is_none(), "UUID4 string should be rejected");
    }

    #[test]
    fn test_parse_uuid7_rejects_invalid_uuid_string() {
        // None of these is a well-formed UUID.
        let invalid_strings = [
            "not-a-uuid",
            "12345678-1234-1234-1234-123456789012-extra",
            "",
            "invalid",
        ];
        for &invalid in invalid_strings.iter() {
            assert!(
                parse_uuid7(invalid).is_none(),
                "Invalid UUID string '{}' should be rejected",
                invalid
            );
        }
    }
}
@@ -1,361 +0,0 @@
//! NDJSON (Newline Delimited JSON) storage utilities.
//!
//! Handles reading and writing NDJSON files for incremental message logs.
//! NDJSON format stores one JSON object per line, making it ideal for
//! append-only logs that can be read incrementally.
use serde::{Deserialize, Serialize};
use std::path::Path;
use tokio::fs::OpenOptions;
use tokio::io::AsyncWriteExt;
/// Append a record to an NDJSON file
///
/// This function:
/// 1. Serializes the record to JSON and adds the terminating newline to the
///    same buffer
/// 2. Opens the file in append mode (creates if not exists)
/// 3. Hands the complete line to the file in a single `write_all` call
/// 4. Calls fsync to ensure durability
///
/// The record and its newline are written together rather than as two
/// separate writes, so there is no point between them at which another
/// appender or an interruption can leave a record without its terminator.
///
/// # Arguments
/// * `path` - Path to the NDJSON file
/// * `record` - Record to append (must be serializable)
///
/// # Errors
/// Serialization failures are reported as `ErrorKind::InvalidData`; open,
/// write and sync errors are propagated unchanged.
///
/// # Example
/// ```no_run
/// use dirigent_archivist::storage::ndjson::append_ndjson;
/// use serde::{Serialize, Deserialize};
///
/// #[derive(Serialize, Deserialize)]
/// struct LogEntry {
///     message: String,
/// }
///
/// # async fn example() -> std::io::Result<()> {
/// let entry = LogEntry { message: "Hello".to_string() };
/// append_ndjson(std::path::Path::new("log.ndjson"), &entry).await?;
/// # Ok(())
/// # }
/// ```
pub async fn append_ndjson<T: Serialize>(path: &Path, record: &T) -> std::io::Result<()> {
    // Build the full line (JSON + '\n') up front so it is written as one unit.
    let mut line = serde_json::to_vec(record)
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
    line.push(b'\n');

    // Open file in append mode (create if not exists)
    let mut file = OpenOptions::new()
        .create(true)
        .append(true)
        .open(path)
        .await?;

    file.write_all(&line).await?;

    // Fsync for durability
    file.sync_all().await?;
    Ok(())
}
/// Replace the contents of an NDJSON file with `records`.
///
/// The records are written, one JSON object per line, to a sibling temp file
/// (`path` with its extension replaced by `jsonl.tmp`). Only after that file
/// has been flushed and fsynced is it renamed onto `path`, so an interruption
/// during the write leaves the original file untouched.
///
/// # Arguments
/// * `path` - Path to the NDJSON file (will be created or overwritten)
/// * `records` - Records to write (one per line)
///
/// # Errors
/// Serialization failures are reported as `ErrorKind::InvalidData`; open,
/// write, flush, sync and rename errors are propagated unchanged.
///
/// # Example
/// ```no_run
/// use dirigent_archivist::storage::ndjson::write_ndjson;
/// use serde::{Serialize, Deserialize};
///
/// #[derive(Serialize, Deserialize)]
/// struct LogEntry {
///     message: String,
/// }
///
/// # async fn example() -> std::io::Result<()> {
/// let entries = vec![
///     LogEntry { message: "First".to_string() },
///     LogEntry { message: "Second".to_string() },
/// ];
/// write_ndjson(std::path::Path::new("log.ndjson"), &entries).await?;
/// # Ok(())
/// # }
/// ```
pub async fn write_ndjson<T: Serialize>(path: &Path, records: &[T]) -> std::io::Result<()> {
    let staging = path.with_extension("jsonl.tmp");
    {
        // `truncate(true)` discards anything left in the staging file.
        let mut out = OpenOptions::new()
            .create(true)
            .write(true)
            .truncate(true)
            .open(&staging)
            .await?;
        for item in records {
            let encoded = match serde_json::to_string(item) {
                Ok(text) => text,
                Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
            };
            out.write_all(encoded.as_bytes()).await?;
            out.write_all(b"\n").await?;
        }
        out.flush().await?;
        out.sync_all().await?;
        // `out` is closed at the end of this scope, before the rename.
    }
    tokio::fs::rename(&staging, path).await
}
/// Read all records from an NDJSON file
///
/// This function:
/// 1. Reads the entire file to a string
/// 2. Splits by newlines
/// 3. Deserializes each non-empty line
///
/// If the file doesn't exist, returns an empty vector. Absence is detected
/// from the read itself (`ErrorKind::NotFound`) rather than by a separate
/// existence check, so a file removed just before the read still yields the
/// empty vector, and other failures (for example permission errors) are
/// reported instead of being mistaken for a missing file.
///
/// # Arguments
/// * `path` - Path to the NDJSON file
///
/// # Returns
/// Vector of deserialized records
///
/// # Errors
/// Read errors other than `NotFound` are propagated unchanged. The first line
/// that fails to deserialize aborts the read with `ErrorKind::InvalidData`
/// and a message naming its 1-based line number.
///
/// # Example
/// ```no_run
/// use dirigent_archivist::storage::ndjson::read_ndjson;
/// use serde::{Serialize, Deserialize};
///
/// #[derive(Serialize, Deserialize)]
/// struct LogEntry {
///     message: String,
/// }
///
/// # async fn example() -> std::io::Result<()> {
/// let entries: Vec<LogEntry> = read_ndjson(std::path::Path::new("log.ndjson")).await?;
/// # Ok(())
/// # }
/// ```
pub async fn read_ndjson<T: for<'de> Deserialize<'de>>(path: &Path) -> std::io::Result<Vec<T>> {
    // Read entire file to string; a missing file is the "empty log" case.
    let content = match tokio::fs::read_to_string(path).await {
        Ok(content) => content,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
        Err(e) => return Err(e),
    };

    // Parse line by line, skipping blank lines; collecting into a `Result`
    // stops at the first line that fails to parse.
    content
        .lines()
        .enumerate()
        .filter(|(_, line)| !line.trim().is_empty())
        .map(|(line_num, line)| {
            serde_json::from_str(line).map_err(|e| {
                std::io::Error::new(
                    std::io::ErrorKind::InvalidData,
                    format!("Failed to parse line {}: {}", line_num + 1, e),
                )
            })
        })
        .collect()
}
#[cfg(test)]
mod tests {
use super::*;
use serde::{Deserialize, Serialize};
use uuid::Uuid;
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
struct TestRecord {
id: String,
value: i32,
}
#[tokio::test]
async fn test_append_and_read_roundtrip() {
// Create a temporary file
let temp_dir = std::env::temp_dir();
let file_path = temp_dir.join(format!("test_ndjson_{}.ndjson", Uuid::now_v7()));
// Append multiple records
let record1 = TestRecord {
id: "rec1".to_string(),
value: 42,
};
let record2 = TestRecord {
id: "rec2".to_string(),
value: 100,
};
let record3 = TestRecord {
id: "rec3".to_string(),
value: -5,
};
append_ndjson(&file_path, &record1).await.unwrap();
append_ndjson(&file_path, &record2).await.unwrap();
append_ndjson(&file_path, &record3).await.unwrap();
// Read back
let records: Vec<TestRecord> = read_ndjson(&file_path).await.unwrap();
// Verify
assert_eq!(records.len(), 3);
assert_eq!(records[0], record1);
assert_eq!(records[1], record2);
assert_eq!(records[2], record3);
// Clean up
tokio::fs::remove_file(&file_path).await.ok();
}
#[tokio::test]
async fn test_read_empty_file() {
// Create a temporary empty file
let temp_dir = std::env::temp_dir();
let file_path = temp_dir.join(format!("test_empty_{}.ndjson", Uuid::now_v7()));
tokio::fs::write(&file_path, "").await.unwrap();
// Read should return empty vector
let records: Vec<TestRecord> = read_ndjson(&file_path).await.unwrap();
assert_eq!(records.len(), 0);
// Clean up
tokio::fs::remove_file(&file_path).await.ok();
}
#[tokio::test]
async fn test_read_missing_file() {
// Read from a non-existent file
let temp_dir = std::env::temp_dir();
let file_path = temp_dir.join(format!("nonexistent_{}.ndjson", Uuid::now_v7()));
// Should return empty vector, not error
let records: Vec<TestRecord> = read_ndjson(&file_path).await.unwrap();
assert_eq!(records.len(), 0);
}
#[tokio::test]
async fn test_trailing_newlines() {
// Create a file with trailing newlines
let temp_dir = std::env::temp_dir();
let file_path = temp_dir.join(format!("test_trailing_{}.ndjson", Uuid::now_v7()));
// Write manually with extra newlines
let content = r#"{"id":"rec1","value":42}
{"id":"rec2","value":100}
"#;
tokio::fs::write(&file_path, content).await.unwrap();
// Read should skip empty lines
let records: Vec<TestRecord> = read_ndjson(&file_path).await.unwrap();
assert_eq!(records.len(), 2);
assert_eq!(records[0].id, "rec1");
assert_eq!(records[1].id, "rec2");
// Clean up
tokio::fs::remove_file(&file_path).await.ok();
}
/// Two appends to *different* files, driven concurrently, each land in their own file.
#[tokio::test]
async fn test_concurrent_appends() {
    let scratch = std::env::temp_dir();
    let path_a = scratch.join(format!("test_concurrent_1_{}.ndjson", Uuid::now_v7()));
    let path_b = scratch.join(format!("test_concurrent_2_{}.ndjson", Uuid::now_v7()));

    let first = TestRecord {
        id: String::from("file1"),
        value: 1,
    };
    let second = TestRecord {
        id: String::from("file2"),
        value: 2,
    };

    // `join!` polls both futures on the current task, interleaving them at await points.
    let (outcome_a, outcome_b) = tokio::join!(
        append_ndjson(&path_a, &first),
        append_ndjson(&path_b, &second)
    );
    outcome_a.unwrap();
    outcome_b.unwrap();

    // Each file must contain exactly the record that was appended to it.
    let loaded_a: Vec<TestRecord> = read_ndjson(&path_a).await.unwrap();
    let loaded_b: Vec<TestRecord> = read_ndjson(&path_b).await.unwrap();
    assert_eq!(loaded_a, [first]);
    assert_eq!(loaded_b, [second]);

    // Best-effort cleanup of both scratch files.
    for leftover in [&path_a, &path_b] {
        tokio::fs::remove_file(leftover).await.ok();
    }
}
/// A malformed line makes the whole read fail with `InvalidData`, naming the bad line.
#[tokio::test]
async fn test_invalid_json_error() {
    let scratch = std::env::temp_dir().join(format!("test_invalid_{}.ndjson", Uuid::now_v7()));

    // Line 2 is not JSON; lines 1 and 3 are valid records.
    let payload = r#"{"id":"rec1","value":42}
invalid json here
{"id":"rec2","value":100}"#;
    tokio::fs::write(&scratch, payload).await.unwrap();

    let outcome: std::io::Result<Vec<TestRecord>> = read_ndjson(&scratch).await;
    match outcome {
        Ok(_) => panic!("Expected error"),
        Err(err) => {
            assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
            // The message must identify the 1-based line that failed to parse.
            assert!(err.to_string().contains("line 2"));
        }
    }

    // Best-effort cleanup.
    tokio::fs::remove_file(&scratch).await.ok();
}
/// Smoke test: a single append completes successfully and leaves a file behind.
///
/// NOTE(review): the name suggests `append_ndjson` fsyncs before returning; that
/// function's body is not visible here, so this test can only observe that the
/// call as a whole did not fail.
#[tokio::test]
async fn test_fsync_called() {
    let target = std::env::temp_dir().join(format!("test_fsync_{}.ndjson", Uuid::now_v7()));
    let sample = TestRecord {
        id: String::from("test"),
        value: 42,
    };

    // Any error raised inside the append (including a failed sync) would surface here.
    append_ndjson(&target, &sample).await.unwrap();

    // The append must have created the file.
    assert!(target.exists());

    // Best-effort cleanup.
    tokio::fs::remove_file(&target).await.ok();
}
}
@@ -1,436 +0,0 @@
//! Path management for archive directory structure.
//!
//! Defines the archive directory layout and provides utilities for
//! constructing paths to various archive components.
use std::path::PathBuf;
use std::sync::Arc;
use tokio::sync::Mutex;
use uuid::Uuid;
/// Archive path utilities
///
/// Provides methods to generate paths for all archive components, all of
/// them relative to a single archive root:
/// - Sessions: `.contexts/{scroll_id}/`
/// - Connectors: `.db/connectors/{connector_uid}/`
/// - Files: `.files/{ab}/{cd}/{ef}/{full hash}` (sharded by the first
///   characters of the digest)
///
/// Also carries the per-archive mutex that serialises `store_file`'s
/// read-modify-rewrite of `.files/file_index.ndjson` — the shared
/// `file_index.tmp` path made concurrent calls race on `rename`.
pub struct ArchivePaths {
    /// Directory every generated path is joined onto.
    root: PathBuf,
    /// Guards the critical section in `storage::files::store_file` that
    /// rewrites the per-archive file index. Cloneable `Arc` so callers
    /// that share the same `ArchivePaths` instance serialise correctly.
    file_index_lock: Arc<Mutex<()>>,
}

impl ArchivePaths {
    /// Create a new `ArchivePaths` rooted at `root`, with a fresh file-index lock.
    ///
    /// Nothing is created on disk; see [`ArchivePaths::ensure_dirs`] and
    /// [`ArchivePaths::ensure_connector_dir`] for that.
    pub fn new(root: PathBuf) -> Self {
        Self {
            root,
            file_index_lock: Arc::new(Mutex::new(())),
        }
    }

    /// Get the archive root directory
    pub fn root(&self) -> &PathBuf {
        &self.root
    }

    /// Get a handle to the per-archive file-index lock.
    ///
    /// Returns a clone of the shared `Arc`; the caller locks it and holds the
    /// guard across the `read → modify → append-temp → rename` sequence in
    /// `storage::files::store_file`.
    pub(crate) fn file_index_lock(&self) -> Arc<Mutex<()>> {
        Arc::clone(&self.file_index_lock)
    }

    // ========================================================================
    // Session Paths
    // ========================================================================

    /// Get the directory for a specific session
    ///
    /// Returns: `{root}/.contexts/{scroll_id}`
    pub fn session_dir(&self, scroll_id: Uuid) -> PathBuf {
        self.root.join(".contexts").join(scroll_id.to_string())
    }

    /// Get the session metadata JSON file path
    ///
    /// Returns: `{root}/.contexts/{scroll_id}/session.json`
    pub fn session_json(&self, scroll_id: Uuid) -> PathBuf {
        self.session_dir(scroll_id).join("session.json")
    }

    /// Get the legacy `.ndjson` messages file path.
    ///
    /// Kept only for callers that still address the old file name directly.
    /// New code should use `messages_path_for_write()` for writes and
    /// `messages_path_for_read()` for reads (the latter understands both
    /// `.jsonl` and `.ndjson`).
    ///
    /// Returns: `{root}/.contexts/{scroll_id}/messages.ndjson`
    #[deprecated(since = "0.2.0", note = "Use messages_path_for_write() instead")]
    pub fn messages_ndjson(&self, scroll_id: Uuid) -> PathBuf {
        self.session_dir(scroll_id).join("messages.ndjson")
    }

    /// Resolve messages file path for reading.
    /// Checks for .jsonl first, falls back to .ndjson
    pub fn messages_path_for_read(&self, scroll_id: Uuid) -> PathBuf {
        let session_dir = self.session_dir(scroll_id);
        self.resolve_ndjson_or_jsonl(&session_dir, "messages")
    }

    /// Get the messages file path for WRITE operations.
    /// Always returns .jsonl path (new canonical format).
    ///
    /// Returns: `{root}/.contexts/{scroll_id}/messages.jsonl`
    pub fn messages_path_for_write(&self, scroll_id: Uuid) -> PathBuf {
        self.session_dir(scroll_id).join("messages.jsonl")
    }

    /// Get the events file path for meta sessions (.jsonl format)
    ///
    /// Meta sessions (AcpConnection) store connection events in events.jsonl
    /// instead of messages. These events track connection lifecycle and session navigation.
    ///
    /// Returns: `{root}/.contexts/{scroll_id}/events.jsonl`
    pub fn events_path(&self, scroll_id: Uuid) -> PathBuf {
        self.session_dir(scroll_id).join("events.jsonl")
    }

    /// Get the DAG index file path.
    ///
    /// Returns: `{root}/.db/dag.jsonl`
    pub fn dag_path(&self) -> PathBuf {
        self.root.join(".db").join("dag.jsonl")
    }

    /// Resolve sessions mapping file path for reading.
    /// Checks for .jsonl first, falls back to .ndjson
    pub fn sessions_path_for_read(&self, connector_uid: Uuid) -> PathBuf {
        let connector_dir = self.connector_dir(connector_uid);
        self.resolve_ndjson_or_jsonl(&connector_dir, "sessions")
    }

    /// Get the sessions file path for WRITE operations.
    /// Always returns .jsonl path (new canonical format).
    ///
    /// Returns: `{root}/.db/connectors/{connector_uid}/sessions.jsonl`
    pub fn sessions_path_for_write(&self, connector_uid: Uuid) -> PathBuf {
        self.connector_dir(connector_uid).join("sessions.jsonl")
    }

    // ========================================================================
    // Connector Paths
    // ========================================================================

    /// Get the directory for a specific connector
    ///
    /// Returns: `{root}/.db/connectors/{connector_uid}`
    pub fn connector_dir(&self, connector_uid: Uuid) -> PathBuf {
        self.root
            .join(".db")
            .join("connectors")
            .join(connector_uid.to_string())
    }

    /// Get the connector index TSV file path
    ///
    /// Returns: `{root}/.db/connectors/index.tsv`
    pub fn connector_index_tsv(&self) -> PathBuf {
        self.root.join(".db").join("connectors").join("index.tsv")
    }

    // ========================================================================
    // File Storage Paths
    // ========================================================================

    /// Get the blob path for a file using sharded storage
    ///
    /// Sharding strategy:
    /// - Input: `sha256:abcdef0123456789...`
    /// - Strip `sha256:` prefix (an id without the prefix is used as-is)
    /// - Shard by the first 6 characters (three 2-character segments)
    /// - The file name is the FULL hash, including the sharded characters
    /// - Returns: `{root}/.files/ab/cd/ef/abcdef0123456789...`
    ///
    /// Ids shorter than 6 characters produce fewer and/or shorter shard
    /// directories (e.g. `abc` → `.files/ab/c/abc`).
    ///
    /// Shards are cut on character boundaries, so an id containing non-ASCII
    /// text cannot cause a slicing panic.
    ///
    /// NOTE(review): `file_id` is joined into the path verbatim. An id that
    /// contains path separators or `..` would resolve outside `.files`;
    /// presumably callers only pass digests they computed themselves —
    /// confirm before accepting ids from untrusted input.
    ///
    /// # Arguments
    /// * `file_id` - File identifier (e.g., "sha256:abcdef...")
    pub fn file_blob_path(&self, file_id: &str) -> PathBuf {
        // Splits off at most the first two characters of `s`, always on a
        // char boundary. For ASCII input this is exactly `s.split_at(2)`
        // (or `(s, "")` when `s` is shorter than two characters).
        fn split_shard(s: &str) -> (&str, &str) {
            let cut = s.char_indices().nth(2).map_or(s.len(), |(idx, _)| idx);
            s.split_at(cut)
        }

        // Strip "sha256:" prefix if present
        let hash = file_id.strip_prefix("sha256:").unwrap_or(file_id);

        // Three shard levels of up to two characters each.
        let (shard1, rest) = split_shard(hash);
        let (shard2, rest) = split_shard(rest);
        let (shard3, _) = split_shard(rest);

        // Build the sharded path, skipping levels a short hash cannot fill.
        let mut path = self.root.join(".files");
        for shard in [shard1, shard2, shard3] {
            if !shard.is_empty() {
                path.push(shard);
            }
        }

        // Use the full hash (without prefix) as filename
        path.push(hash);
        path
    }

    // ========================================================================
    // Directory Creation
    // ========================================================================

    /// Ensure all required directories exist for a session
    ///
    /// Creates the session directory (and any missing parents) if it doesn't exist.
    ///
    /// # Errors
    /// Returns the underlying I/O error if the directory cannot be created.
    pub async fn ensure_dirs(&self, scroll_id: Uuid) -> std::io::Result<()> {
        let session_dir = self.session_dir(scroll_id);
        tokio::fs::create_dir_all(session_dir).await
    }

    /// Ensure the connector directory exists
    ///
    /// Creates `.db/connectors/{connector_uid}/` if it doesn't exist.
    /// This should be called before any operations that write to connector-specific files.
    ///
    /// # Errors
    /// Returns the underlying I/O error if the directory cannot be created.
    pub async fn ensure_connector_dir(&self, connector_uid: Uuid) -> std::io::Result<()> {
        let connector_dir = self.connector_dir(connector_uid);
        tokio::fs::create_dir_all(&connector_dir).await
    }

    /// Generic resolution: prefer .jsonl, fall back to .ndjson
    ///
    /// This enables backward compatibility with existing .ndjson archives
    /// while supporting the more widely-recognized .jsonl extension.
    ///
    /// The `.ndjson` path is returned whenever the `.jsonl` file is absent,
    /// whether or not the `.ndjson` file itself exists.
    fn resolve_ndjson_or_jsonl(&self, dir: &std::path::Path, base_name: &str) -> PathBuf {
        // Check for .jsonl first (the canonical format used by the write paths)
        let jsonl_path = dir.join(format!("{}.jsonl", base_name));
        if jsonl_path.exists() {
            return jsonl_path;
        }
        // Fall back to .ndjson (legacy format, kept readable for existing archives)
        dir.join(format!("{}.ndjson", base_name))
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Archive rooted at a fixed, purely nominal directory (nothing is created on disk).
    fn archive() -> ArchivePaths {
        ArchivePaths::new(PathBuf::from("/archive"))
    }

    /// Fixed identifier shared by the pure path-construction tests.
    fn sample_id() -> Uuid {
        Uuid::parse_str("018c8f7e-7b6a-7e3c-9f2d-1a2b3c4d5e6f").unwrap()
    }

    /// Archive rooted at a unique directory under the system temp dir.
    ///
    /// The directory is NOT created here; the returned root is what a test
    /// removes during cleanup.
    fn scratch_archive() -> (PathBuf, ArchivePaths) {
        let root = std::env::temp_dir().join(format!("archivist_test_{}", Uuid::now_v7()));
        let paths = ArchivePaths::new(root.clone());
        (root, paths)
    }

    #[test]
    fn test_session_dir() {
        assert_eq!(
            archive().session_dir(sample_id()),
            Path::new("/archive/.contexts/018c8f7e-7b6a-7e3c-9f2d-1a2b3c4d5e6f")
        );
    }

    #[test]
    fn test_session_json() {
        assert_eq!(
            archive().session_json(sample_id()),
            Path::new("/archive/.contexts/018c8f7e-7b6a-7e3c-9f2d-1a2b3c4d5e6f/session.json")
        );
    }

    // The legacy accessor is exercised on purpose; silence the deprecation lint.
    #[test]
    #[allow(deprecated)]
    fn test_messages_ndjson() {
        assert_eq!(
            archive().messages_ndjson(sample_id()),
            Path::new("/archive/.contexts/018c8f7e-7b6a-7e3c-9f2d-1a2b3c4d5e6f/messages.ndjson")
        );
    }

    /// Write-side accessors always point at the canonical `.jsonl` files.
    #[test]
    fn test_write_paths_use_jsonl() {
        let paths = archive();
        let id = sample_id();
        assert_eq!(
            paths.messages_path_for_write(id),
            Path::new("/archive/.contexts/018c8f7e-7b6a-7e3c-9f2d-1a2b3c4d5e6f/messages.jsonl")
        );
        assert_eq!(
            paths.sessions_path_for_write(id),
            Path::new("/archive/.db/connectors/018c8f7e-7b6a-7e3c-9f2d-1a2b3c4d5e6f/sessions.jsonl")
        );
    }

    #[test]
    fn test_events_and_dag_paths() {
        let paths = archive();
        assert_eq!(
            paths.events_path(sample_id()),
            Path::new("/archive/.contexts/018c8f7e-7b6a-7e3c-9f2d-1a2b3c4d5e6f/events.jsonl")
        );
        assert_eq!(paths.dag_path(), Path::new("/archive/.db/dag.jsonl"));
    }

    #[test]
    fn test_connector_dir() {
        assert_eq!(
            archive().connector_dir(sample_id()),
            Path::new("/archive/.db/connectors/018c8f7e-7b6a-7e3c-9f2d-1a2b3c4d5e6f")
        );
    }

    #[test]
    fn test_connector_index_tsv() {
        assert_eq!(
            archive().connector_index_tsv(),
            Path::new("/archive/.db/connectors/index.tsv")
        );
    }

    #[test]
    fn test_file_blob_path_with_prefix() {
        assert_eq!(
            archive().file_blob_path("sha256:abcdef0123456789"),
            Path::new("/archive/.files/ab/cd/ef/abcdef0123456789")
        );
    }

    #[test]
    fn test_file_blob_path_without_prefix() {
        assert_eq!(
            archive().file_blob_path("abcdef0123456789"),
            Path::new("/archive/.files/ab/cd/ef/abcdef0123456789")
        );
    }

    /// Hashes shorter than six characters yield fewer and/or shorter shard levels.
    #[test]
    fn test_file_blob_path_short_hash() {
        let paths = archive();
        let cases = [
            ("sha256:abc", "/archive/.files/ab/c/abc"),
            ("sha256:abcd", "/archive/.files/ab/cd/abcd"),
            ("sha256:abcde", "/archive/.files/ab/cd/e/abcde"),
        ];
        for (file_id, expected) in cases {
            assert_eq!(paths.file_blob_path(file_id), Path::new(expected));
        }
    }

    #[test]
    fn test_file_blob_path_long_hash() {
        let file_id = "sha256:abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789";
        assert_eq!(
            archive().file_blob_path(file_id),
            Path::new("/archive/.files/ab/cd/ef/abcdef0123456789abcdef0123456789abcdef0123456789abcdef0123456789")
        );
    }

    // Uses the legacy accessor on purpose; silence the deprecation lint.
    #[test]
    #[allow(deprecated)]
    fn test_paths_use_correct_separators() {
        let paths = archive();
        let id = sample_id();

        // `PathBuf` inserts the platform separator itself, so only the
        // component names are checked rather than a full string.
        assert!(paths.session_dir(id).to_string_lossy().contains(".contexts"));
        assert!(paths
            .session_json(id)
            .to_string_lossy()
            .contains("session.json"));
        assert!(paths
            .messages_ndjson(id)
            .to_string_lossy()
            .contains("messages.ndjson"));
    }

    #[tokio::test]
    async fn test_ensure_dirs() {
        let (root, paths) = scratch_archive();
        let scroll_id = Uuid::now_v7();

        // Nothing exists before the call ...
        assert!(!paths.session_dir(scroll_id).exists());

        paths.ensure_dirs(scroll_id).await.unwrap();

        // ... and the session directory exists afterwards.
        assert!(paths.session_dir(scroll_id).exists());

        // Best-effort cleanup.
        tokio::fs::remove_dir_all(root).await.ok();
    }

    #[tokio::test]
    async fn test_messages_path_for_read_ndjson_only() {
        let (root, paths) = scratch_archive();
        let scroll_id = Uuid::now_v7();

        // Only the legacy `.ndjson` file is present in the session directory.
        paths.ensure_dirs(scroll_id).await.unwrap();
        let ndjson_path = paths.session_dir(scroll_id).join("messages.ndjson");
        tokio::fs::write(&ndjson_path, "test content").await.unwrap();

        // With no `.jsonl` file, the reader must fall back to `.ndjson`.
        let resolved_path = paths.messages_path_for_read(scroll_id);
        assert_eq!(resolved_path, ndjson_path);
        assert!(resolved_path.to_string_lossy().ends_with("messages.ndjson"));

        // Best-effort cleanup.
        tokio::fs::remove_dir_all(root).await.ok();
    }

    #[tokio::test]
    async fn test_messages_path_for_read_jsonl_preferred() {
        let (root, paths) = scratch_archive();
        let scroll_id = Uuid::now_v7();

        // Both the legacy and the canonical file are present.
        paths.ensure_dirs(scroll_id).await.unwrap();
        let session_dir = paths.session_dir(scroll_id);
        let ndjson_path = session_dir.join("messages.ndjson");
        let jsonl_path = session_dir.join("messages.jsonl");
        tokio::fs::write(&ndjson_path, "old content").await.unwrap();
        tokio::fs::write(&jsonl_path, "new content").await.unwrap();

        // `.jsonl` wins whenever it exists.
        let resolved_path = paths.messages_path_for_read(scroll_id);
        assert_eq!(resolved_path, jsonl_path);
        assert!(resolved_path.to_string_lossy().ends_with("messages.jsonl"));

        // Best-effort cleanup.
        tokio::fs::remove_dir_all(root).await.ok();
    }

    /// With no files on disk at all, the sessions reader resolves to the legacy name.
    #[test]
    fn test_sessions_path_for_read_falls_back_to_ndjson() {
        // The scratch root is never created, so no `.jsonl` file can exist under it.
        let (_root, paths) = scratch_archive();
        let connector_uid = Uuid::now_v7();

        assert_eq!(
            paths.sessions_path_for_read(connector_uid),
            paths.connector_dir(connector_uid).join("sessions.ndjson")
        );
    }
}
@@ -1,552 +0,0 @@
//! TSV (Tab-Separated Values) storage utilities.
//!
//! Handles reading and writing TSV files for session listings and indices.
//! TSV format is human-readable and grep-able, making it ideal for manual
//! inspection and command-line processing.
use crate::types::ConnectorIndexRow;
use std::path::Path;
use tokio::io::AsyncWriteExt;
use uuid::Uuid;
/// Write connector index to a TSV file atomically
///
/// This function:
/// 1. Generates the header line
/// 2. Formats each row as tab-separated values
/// 3. Writes to a temporary file (`path` with its extension replaced by `tmp`)
///    and syncs it
/// 4. Renames to the target path (atomic operation)
///
/// TSV format (seven columns; `alias_of` and `fingerprint` are written as
/// empty strings when absent):
/// ```text
/// connector_uid\ttype\ttitle\tclient_native_id\talias_of\tcreated_at\tfingerprint
/// 018c8f7e-...\tOpenCode\tLocal Dev\topencode@...\t\t2025-01-15T12:34:56Z\t
/// ```
///
/// Tab, carriage-return and line-feed characters inside the free-text
/// columns (`type`, `title`, `client_native_id`, `fingerprint`) are each
/// replaced by a single space. Written verbatim they would add columns or
/// rows and make the file unreadable for `read_connector_index`.
///
/// NOTE(review): the temporary path is derived from `path` alone, so two
/// concurrent writers of the same index share it; presumably callers
/// serialise writes — confirm.
///
/// # Arguments
/// * `path` - Path to the TSV file
/// * `rows` - Rows to write
///
/// # Errors
/// Returns the underlying I/O error if creating, writing, syncing or renaming
/// the temporary file fails. On failure the temporary file is removed on a
/// best-effort basis and the previous contents of `path` are left untouched.
pub async fn write_connector_index(path: &Path, rows: &[ConnectorIndexRow]) -> std::io::Result<()> {
    // Generate header
    let header = "connector_uid\ttype\ttitle\tclient_native_id\talias_of\tcreated_at\tfingerprint\n";

    // Format rows
    let mut content = String::from(header);
    for row in rows {
        let alias_of_str = row
            .alias_of
            .as_ref()
            .map(|u| u.to_string())
            .unwrap_or_default();
        let fingerprint_str = row.fingerprint.as_deref().unwrap_or("");
        let line = format!(
            "{}\t{}\t{}\t{}\t{}\t{}\t{}\n",
            row.connector_uid,
            tsv_field(&row.r#type),
            tsv_field(&row.title),
            tsv_field(&row.client_native_id),
            alias_of_str,
            row.created_at.to_rfc3339(),
            tsv_field(fingerprint_str),
        );
        content.push_str(&line);
    }

    // Write to the temp file, then atomically swap it into place.
    let temp_path = path.with_extension("tmp");
    let outcome = write_then_rename(&temp_path, path, content.as_bytes()).await;
    if outcome.is_err() {
        // Don't leave a partial temp file behind. The removal is best effort:
        // the file may never have been created, and the original error is the
        // one worth reporting.
        let _ = tokio::fs::remove_file(&temp_path).await;
    }
    outcome
}

/// Make `value` safe to embed in a single TSV cell.
///
/// Returns the input unchanged (borrowed) unless it contains a tab, carriage
/// return or line feed, in which case each of those characters is replaced by
/// one space.
fn tsv_field(value: &str) -> std::borrow::Cow<'_, str> {
    const BREAKS: &[char] = &['\t', '\n', '\r'];
    if value.contains(BREAKS) {
        std::borrow::Cow::Owned(value.replace(BREAKS, " "))
    } else {
        std::borrow::Cow::Borrowed(value)
    }
}

/// Write `bytes` to `temp_path`, sync them to disk, then rename onto `path`.
async fn write_then_rename(temp_path: &Path, path: &Path, bytes: &[u8]) -> std::io::Result<()> {
    let mut file = tokio::fs::File::create(temp_path).await?;
    file.write_all(bytes).await?;
    file.sync_all().await?;
    // Close the handle before renaming.
    drop(file);
    tokio::fs::rename(temp_path, path).await
}
/// Read connector index from a TSV file
///
/// If the file doesn't exist, returns an empty vector.
///
/// The first line is always treated as the header and skipped without being
/// inspected; blank lines are ignored. Each remaining line must have either
/// 6 tab-separated columns (legacy layout, no fingerprint) or 7 columns
/// (with fingerprint).
///
/// # Arguments
/// * `path` - Path to the TSV file
///
/// # Returns
/// Vector of connector index rows
///
/// # Errors
/// * Any I/O error from reading the file other than "not found" (for example
///   a permission error) is returned as-is.
/// * `InvalidData` if a row has the wrong number of columns, an unparsable
///   `connector_uid` / `alias_of` UUID, or a `created_at` that is not
///   RFC 3339. The message names the 1-based line number.
pub async fn read_connector_index(path: &Path) -> std::io::Result<Vec<ConnectorIndexRow>> {
    // Read first and interpret "not found" afterwards. Checking for existence
    // beforehand would block the async executor, race with concurrent
    // writers, and report unrelated failures as an empty index.
    let content = match tokio::fs::read_to_string(path).await {
        Ok(content) => content,
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
        Err(e) => return Err(e),
    };

    content
        .lines()
        .enumerate()
        // Skip header (line 0)
        .skip(1)
        // Skip empty lines
        .filter(|(_, line)| !line.trim().is_empty())
        // Line numbers in error messages are 1-based.
        .map(|(idx, line)| parse_connector_row(line, idx + 1))
        // Stops at, and returns, the first row error.
        .collect()
}

/// Parse one data line of the connector index; `line_no` is the 1-based line
/// number used in error messages.
fn parse_connector_row(line: &str, line_no: usize) -> std::io::Result<ConnectorIndexRow> {
    fn invalid_data(message: String) -> std::io::Error {
        std::io::Error::new(std::io::ErrorKind::InvalidData, message)
    }

    // Split by tab
    let parts: Vec<&str> = line.split('\t').collect();

    // Accept 6 columns (legacy, no fingerprint) or 7 columns (with fingerprint)
    if parts.len() != 6 && parts.len() != 7 {
        return Err(invalid_data(format!(
            "Invalid TSV format at line {}: expected 6 or 7 fields, got {}",
            line_no,
            parts.len()
        )));
    }

    let connector_uid = Uuid::parse_str(parts[0])
        .map_err(|e| invalid_data(format!("Invalid UUID at line {}: {}", line_no, e)))?;

    // An empty alias_of cell means "no alias".
    let alias_of = if parts[4].is_empty() {
        None
    } else {
        Some(Uuid::parse_str(parts[4]).map_err(|e| {
            invalid_data(format!("Invalid alias_of UUID at line {}: {}", line_no, e))
        })?)
    };

    let created_at = chrono::DateTime::parse_from_rfc3339(parts[5])
        .map_err(|e| invalid_data(format!("Invalid timestamp at line {}: {}", line_no, e)))?
        .with_timezone(&chrono::Utc);

    // Optional fingerprint (7th column, absent in legacy files; empty means none)
    let fingerprint = parts
        .get(6)
        .filter(|cell| !cell.is_empty())
        .map(|cell| (*cell).to_string());

    Ok(ConnectorIndexRow {
        connector_uid,
        r#type: parts[1].to_string(),
        title: parts[2].to_string(),
        client_native_id: parts[3].to_string(),
        alias_of,
        created_at,
        fingerprint,
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use chrono::{DateTime, Utc};
    use std::time::SystemTime;

    /// Header line of the legacy six-column layout (no `fingerprint` column).
    const LEGACY_HEADER: &str =
        "connector_uid\ttype\ttitle\tclient_native_id\talias_of\tcreated_at\n";

    /// Unique scratch path `{temp}/{stem}_{uuid}.tsv`; the file itself is not created.
    fn scratch_tsv(stem: &str) -> std::path::PathBuf {
        std::env::temp_dir().join(format!("{}_{}.tsv", stem, Uuid::now_v7()))
    }

    /// Current wall-clock time as a UTC timestamp.
    fn wall_clock_now() -> DateTime<Utc> {
        DateTime::<Utc>::from(SystemTime::now())
    }

    /// Build an index row from borrowed pieces.
    fn make_row(
        connector_uid: Uuid,
        kind: &str,
        title: &str,
        client_native_id: &str,
        alias_of: Option<Uuid>,
        created_at: DateTime<Utc>,
        fingerprint: Option<&str>,
    ) -> ConnectorIndexRow {
        ConnectorIndexRow {
            connector_uid,
            r#type: kind.to_string(),
            title: title.to_string(),
            client_native_id: client_native_id.to_string(),
            alias_of,
            created_at,
            fingerprint: fingerprint.map(str::to_string),
        }
    }

    /// Assert that `outcome` is an `InvalidData` error whose message contains `needle`.
    fn assert_invalid_data(outcome: std::io::Result<Vec<ConnectorIndexRow>>, needle: &str) {
        match outcome {
            Ok(_) => panic!("Expected error"),
            Err(err) => {
                assert_eq!(err.kind(), std::io::ErrorKind::InvalidData);
                assert!(err.to_string().contains(needle));
            }
        }
    }

    #[tokio::test]
    async fn test_write_and_read_roundtrip() {
        let path = scratch_tsv("test_tsv");
        let (uid1, uid2, uid3) = (Uuid::now_v7(), Uuid::now_v7(), Uuid::now_v7());
        let now = wall_clock_now();
        let rows = [
            make_row(uid1, "OpenCode", "Local Dev", "opencode@localhost:12225", None, now, None),
            make_row(uid2, "ACP", "Remote Agent", "acp@http://localhost:3000", Some(uid3), now, None),
        ];

        write_connector_index(&path, &rows).await.unwrap();
        let loaded = read_connector_index(&path).await.unwrap();

        // Both rows come back, in order, with their fields intact.
        assert_eq!(loaded.len(), 2);
        let (first, second) = (&loaded[0], &loaded[1]);
        assert_eq!(first.connector_uid, uid1);
        assert_eq!(first.r#type, "OpenCode");
        assert_eq!(first.title, "Local Dev");
        assert_eq!(first.alias_of, None);
        assert_eq!(second.connector_uid, uid2);
        assert_eq!(second.r#type, "ACP");
        assert_eq!(second.alias_of, Some(uid3));

        // Best-effort cleanup.
        tokio::fs::remove_file(&path).await.ok();
    }

    #[tokio::test]
    async fn test_optional_field_handling() {
        let path = scratch_tsv("test_optional");
        let (uid1, uid2) = (Uuid::now_v7(), Uuid::now_v7());
        let now = wall_clock_now();
        let rows = [
            // No alias: the cell is written empty.
            make_row(uid1, "Type1", "Title1", "client1", None, now, None),
            // Aliased to the first connector.
            make_row(uid2, "Type2", "Title2", "client2", Some(uid1), now, None),
        ];

        write_connector_index(&path, &rows).await.unwrap();

        // Inspect the raw text: line 0 is the header, data starts at line 1.
        let raw = tokio::fs::read_to_string(&path).await.unwrap();
        let lines: Vec<&str> = raw.lines().collect();
        // An empty alias_of shows up as two consecutive tabs.
        assert!(lines[1].contains("\t\t"));
        // The aliased row carries the target UUID.
        assert!(lines[2].contains(&uid1.to_string()));

        // Parsing maps the empty cell back to `None`.
        let loaded = read_connector_index(&path).await.unwrap();
        assert_eq!(loaded[0].alias_of, None);
        assert_eq!(loaded[1].alias_of, Some(uid1));

        // Best-effort cleanup.
        tokio::fs::remove_file(&path).await.ok();
    }

    #[tokio::test]
    async fn test_header_generation() {
        let path = scratch_tsv("test_header");

        // An empty index still gets its header line.
        write_connector_index(&path, &[]).await.unwrap();

        let raw = tokio::fs::read_to_string(&path).await.unwrap();
        assert_eq!(
            raw.trim(),
            "connector_uid\ttype\ttitle\tclient_native_id\talias_of\tcreated_at\tfingerprint"
        );

        // Best-effort cleanup.
        tokio::fs::remove_file(&path).await.ok();
    }

    #[tokio::test]
    async fn test_rfc3339_timestamp_formatting() {
        let path = scratch_tsv("test_timestamp");
        let uid = Uuid::now_v7();
        let stamp = wall_clock_now();
        let rows = [make_row(uid, "Test", "Title", "client", None, stamp, None)];

        write_connector_index(&path, &rows).await.unwrap();

        // RFC 3339 text has a `T` separator and a `Z` or `+hh:mm` zone designator.
        let raw = tokio::fs::read_to_string(&path).await.unwrap();
        assert!(raw.contains('T'));
        assert!(raw.contains('Z') || raw.contains('+'));

        // The parsed timestamp matches the written one to within a second.
        let loaded = read_connector_index(&path).await.unwrap();
        let written_ms = stamp.timestamp_millis();
        let parsed_ms = loaded[0].created_at.timestamp_millis();
        let diff = (written_ms - parsed_ms).abs();
        assert!(diff < 1000, "Timestamp difference too large");

        // Best-effort cleanup.
        tokio::fs::remove_file(&path).await.ok();
    }

    #[tokio::test]
    async fn test_missing_file_returns_empty_vec() {
        let absent = scratch_tsv("nonexistent");

        // A missing index reads as "no connectors", not as an error.
        let loaded = read_connector_index(&absent).await.unwrap();
        assert_eq!(loaded.len(), 0);
    }

    #[tokio::test]
    async fn test_malformed_tsv_error() {
        let path = scratch_tsv("test_malformed");

        // The data row has only two of the required columns.
        let payload = format!("{}uid1\ttype1\n", LEGACY_HEADER);
        tokio::fs::write(&path, payload).await.unwrap();

        let outcome = read_connector_index(&path).await;
        assert!(outcome.is_err());
        assert_invalid_data(outcome, "expected 6 or 7 fields");

        // Best-effort cleanup.
        tokio::fs::remove_file(&path).await.ok();
    }

    #[tokio::test]
    async fn test_invalid_uuid_error() {
        let path = scratch_tsv("test_invalid_uuid");

        // Right number of columns, but the first one is not a UUID.
        let payload = format!(
            "{}invalid-uuid\tType\tTitle\tClient\t\t2025-01-15T12:34:56Z\n",
            LEGACY_HEADER
        );
        tokio::fs::write(&path, payload).await.unwrap();

        let outcome = read_connector_index(&path).await;
        assert!(outcome.is_err());
        assert_invalid_data(outcome, "Invalid UUID");

        // Best-effort cleanup.
        tokio::fs::remove_file(&path).await.ok();
    }

    #[tokio::test]
    async fn test_invalid_timestamp_error() {
        let path = scratch_tsv("test_invalid_timestamp");
        let uid = Uuid::now_v7();

        // Valid UUID, but `created_at` is not RFC 3339.
        let payload = format!(
            "{}{}\tType\tTitle\tClient\t\tinvalid-timestamp\n",
            LEGACY_HEADER, uid
        );
        tokio::fs::write(&path, payload).await.unwrap();

        let outcome = read_connector_index(&path).await;
        assert!(outcome.is_err());
        assert_invalid_data(outcome, "Invalid timestamp");

        // Best-effort cleanup.
        tokio::fs::remove_file(&path).await.ok();
    }

    #[tokio::test]
    async fn test_atomic_write() {
        let path = scratch_tsv("test_atomic");
        let (uid1, uid2) = (Uuid::now_v7(), Uuid::now_v7());
        let now = wall_clock_now();
        let first_version = [make_row(uid1, "First", "First Write", "client1", None, now, None)];
        let second_version = [make_row(uid2, "Second", "Second Write", "client2", None, now, None)];

        // First write creates the file.
        write_connector_index(&path, &first_version).await.unwrap();
        let after_first = read_connector_index(&path).await.unwrap();
        assert_eq!(after_first.len(), 1);
        assert_eq!(after_first[0].title, "First Write");

        // Second write replaces the contents wholesale.
        write_connector_index(&path, &second_version).await.unwrap();
        let after_second = read_connector_index(&path).await.unwrap();
        assert_eq!(after_second.len(), 1);
        assert_eq!(after_second[0].title, "Second Write");

        // The temp file used for the swap must be gone after a successful write.
        assert!(!path.with_extension("tmp").exists());

        // Best-effort cleanup.
        tokio::fs::remove_file(&path).await.ok();
    }

    #[tokio::test]
    async fn test_legacy_six_column_tsv_compatibility() {
        let path = scratch_tsv("test_legacy_tsv");
        let uid = Uuid::now_v7();

        // A file in the old layout: six columns, no fingerprint.
        let payload = format!(
            "{}{}\tOpenCode\tLegacy\tclient-legacy\t\t2025-01-15T12:34:56Z\n",
            LEGACY_HEADER, uid
        );
        tokio::fs::write(&path, payload).await.unwrap();

        // It parses, and the missing column reads as `None`.
        let loaded = read_connector_index(&path).await.unwrap();
        assert_eq!(loaded.len(), 1);
        let only = &loaded[0];
        assert_eq!(only.connector_uid, uid);
        assert_eq!(only.r#type, "OpenCode");
        assert_eq!(only.title, "Legacy");
        assert_eq!(only.fingerprint, None);

        // Best-effort cleanup.
        tokio::fs::remove_file(&path).await.ok();
    }

    #[tokio::test]
    async fn test_fingerprint_roundtrip() {
        let path = scratch_tsv("test_fingerprint");
        let (uid1, uid2) = (Uuid::now_v7(), Uuid::now_v7());
        let now = wall_clock_now();
        let rows = [
            make_row(
                uid1,
                "ACP",
                "Claude CLI",
                "acp-claude-1",
                None,
                now,
                Some("acp/stdio:/usr/bin/claude"),
            ),
            make_row(uid2, "OpenCode", "No Fingerprint", "opencode@localhost", None, now, None),
        ];

        write_connector_index(&path, &rows).await.unwrap();
        let loaded = read_connector_index(&path).await.unwrap();

        // A present fingerprint survives; an absent one stays absent.
        assert_eq!(loaded.len(), 2);
        assert_eq!(
            loaded[0].fingerprint,
            Some("acp/stdio:/usr/bin/claude".to_string())
        );
        assert_eq!(loaded[1].fingerprint, None);

        // Best-effort cleanup.
        tokio::fs::remove_file(&path).await.ok();
    }
}
File diff suppressed because it is too large Load Diff
@@ -1,334 +0,0 @@
//! Two-archive fanout tests exercising `ArchiveFilter` semantics.
//!
//! The primary backend is unfiltered; the secondary backend carries a
//! restricted filter. Writes should always reach the primary but only
//! fan out to the secondary when the session passes the filter.
#![cfg(feature = "test-utils")]
use std::collections::HashSet;
use std::sync::Arc;
use chrono::Utc;
use uuid::Uuid;
use dirigent_archivist::backend::mock::MockBackend;
use dirigent_archivist::backend::{ArchiveBackend, HealthStatus};
use dirigent_archivist::coordinator::Archivist;
use dirigent_archivist::registry::{
ArchiveFilter, ArchiveRegistration, FailureMode, WritePolicy,
};
use dirigent_archivist::types::{
ConnectorRecord, MessageRecord, RegisterSessionRequest,
};
/// Wrap a mock backend in an `ArchiveRegistration` carrying `filter`.
///
/// Every registration built here is write-active, enabled, `Required`,
/// written inline with no background writer, and starts out `Healthy`; only
/// the name, backend, priority and filter vary between callers.
fn reg(
    name: &str,
    backend: Arc<MockBackend>,
    priority: u32,
    filter: ArchiveFilter,
) -> Arc<ArchiveRegistration> {
    // The registration stores the backend behind the trait object.
    let backend: Arc<dyn ArchiveBackend> = backend;
    let registration = ArchiveRegistration::new(
        name.into(),
        "mock",
        backend,
        true, // write_active
        FailureMode::Required,
        priority,
        true, // enabled
        WritePolicy::Inline,
        None, // writer
        HealthStatus::Healthy,
    );
    Arc::new(registration.with_filter(filter))
}
/// Store a connector record straight into `backend`, without going through
/// the coordinator.
///
/// Panics if the backend rejects the record.
async fn seed_connector(backend: &MockBackend, connector_uid: Uuid, client_native_id: &str) {
    // Brings `put_connector` into scope for the mock.
    use dirigent_archivist::backend::ConnectorRegistryBackend;

    let record = ConnectorRecord {
        version: 1,
        connector_uid,
        r#type: "Mock".into(),
        title: "Mock connector".into(),
        client_native_id: client_native_id.into(),
        alias_of: None,
        created_at: Utc::now(),
        metadata: serde_json::Value::Null,
        fingerprint: None,
    };

    let outcome = backend.put_connector(record).await;
    outcome.expect("put_connector succeeds");
}
/// Build a minimal user message for `session` whose body is `msg {n}`.
///
/// Each call generates a fresh message id and timestamps the message with the
/// current time; all optional fields are left empty.
fn make_msg(session: Uuid, n: u32) -> MessageRecord {
    let message_id = Uuid::now_v7();
    let ts = Utc::now();
    let content_md = format!("msg {}", n);

    MessageRecord {
        version: 1,
        message_id,
        session,
        parent_id: None,
        ts,
        role: "user".into(),
        author: None,
        content_md,
        content_parts: None,
        attachments: Vec::new(),
        metadata: serde_json::Value::Null,
    }
}
#[tokio::test]
async fn secondary_archive_filters_by_exclude_connector() {
let primary_backend = Arc::new(MockBackend::new());
let secondary_backend = Arc::new(MockBackend::new());
let connector_a = Uuid::now_v7();
let connector_b = Uuid::now_v7();
// Connector A is excluded from the secondary.
let mut excluded = HashSet::new();
excluded.insert(connector_a);
let secondary_filter = ArchiveFilter {
exclude_connectors: excluded,
..Default::default()
};
// Seed connectors on primary (and on secondary so mapping writes that DO
// pass the filter don't fail for unrelated reasons).
seed_connector(&primary_backend, connector_a, "native/a").await;
seed_connector(&primary_backend, connector_b, "native/b").await;
let archivist = Archivist::from_registrations(vec![
reg("primary", primary_backend.clone(), 0, ArchiveFilter::default()),
reg("secondary", secondary_backend.clone(), 10, secondary_filter),
]);
// Register a session for each connector.
let resp_a = archivist
.register_session(
RegisterSessionRequest {
connector_uid: connector_a,
native_session_id: "sess-a".into(),
title: None,
custom_scroll_id: None,
metadata: serde_json::Value::Null,
completeness: Default::default(),
parent_scroll_id: None,
is_subagent: false,
continuation: None,
agent_id: None,
subagent_type: None,
spawning_tool_use_id: None,
},
None,
)
.await
.expect("register session a");
let scroll_a = resp_a.scroll_id;
let resp_b = archivist
.register_session(
RegisterSessionRequest {
connector_uid: connector_b,
native_session_id: "sess-b".into(),
title: None,
custom_scroll_id: None,
metadata: serde_json::Value::Null,
completeness: Default::default(),
parent_scroll_id: None,
is_subagent: false,
continuation: None,
agent_id: None,
subagent_type: None,
spawning_tool_use_id: None,
},
None,
)
.await
.expect("register session b");
let scroll_b = resp_b.scroll_id;
// Append 3 messages to each session.
archivist
.append_messages(
scroll_a,
vec![make_msg(scroll_a, 1), make_msg(scroll_a, 2), make_msg(scroll_a, 3)],
None,
)
.await
.expect("append to a");
archivist
.append_messages(
scroll_b,
vec![make_msg(scroll_b, 1), make_msg(scroll_b, 2), make_msg(scroll_b, 3)],
None,
)
.await
.expect("append to b");
// Primary sees every message.
assert_eq!(primary_backend.appended_count(scroll_a), 3);
assert_eq!(primary_backend.appended_count(scroll_b), 3);
// Secondary excludes connector_a: scroll_a is filtered out,
// scroll_b is replicated.
assert_eq!(
secondary_backend.appended_count(scroll_a),
0,
"secondary should NOT receive messages for the excluded connector"
);
assert_eq!(
secondary_backend.appended_count(scroll_b),
3,
"secondary should receive messages for the allowed connector"
);
// Session metadata fanout follows the same rule.
assert!(
primary_backend
.get_session(scroll_a)
.await
.unwrap()
.is_some(),
"primary has scroll_a"
);
assert!(
secondary_backend
.get_session(scroll_a)
.await
.unwrap()
.is_none(),
"secondary should NOT have scroll_a (excluded connector)"
);
assert!(
secondary_backend
.get_session(scroll_b)
.await
.unwrap()
.is_some(),
"secondary should have scroll_b (allowed connector)"
);
}
#[tokio::test]
async fn secondary_archive_filters_by_include_tag() {
let primary_backend = Arc::new(MockBackend::new());
let secondary_backend = Arc::new(MockBackend::new());
let connector = Uuid::now_v7();
seed_connector(&primary_backend, connector, "native/tagged").await;
let mut include = HashSet::new();
include.insert("prod".to_string());
let secondary_filter = ArchiveFilter {
include_tags: include,
..Default::default()
};
let archivist = Archivist::from_registrations(vec![
reg("primary", primary_backend.clone(), 0, ArchiveFilter::default()),
reg("secondary", secondary_backend.clone(), 10, secondary_filter),
]);
// Register two sessions on the same connector.
let prod_resp = archivist
.register_session(
RegisterSessionRequest {
connector_uid: connector,
native_session_id: "sess-prod".into(),
title: None,
custom_scroll_id: None,
metadata: serde_json::Value::Null,
completeness: Default::default(),
parent_scroll_id: None,
is_subagent: false,
continuation: None,
agent_id: None,
subagent_type: None,
spawning_tool_use_id: None,
},
None,
)
.await
.expect("register prod session");
let scroll_prod = prod_resp.scroll_id;
let dev_resp = archivist
.register_session(
RegisterSessionRequest {
connector_uid: connector,
native_session_id: "sess-dev".into(),
title: None,
custom_scroll_id: None,
metadata: serde_json::Value::Null,
completeness: Default::default(),
parent_scroll_id: None,
is_subagent: false,
continuation: None,
agent_id: None,
subagent_type: None,
spawning_tool_use_id: None,
},
None,
)
.await
.expect("register dev session");
let scroll_dev = dev_resp.scroll_id;
// Tag the prod session directly on the primary so the coordinator can
// see it on the next fanout metadata lookup. We mutate via the primary
// backend to avoid going through update_session_metadata (which doesn't
// expose a tag API).
{
use dirigent_archivist::backend::ArchiveBackend as _;
let mut md = primary_backend
.get_session(scroll_prod)
.await
.unwrap()
.expect("prod session on primary");
md.tags.push("prod".into());
primary_backend.put_session(md).await.unwrap();
}
// Append messages AFTER tagging — now the filter consults the tagged metadata.
archivist
.append_messages(
scroll_prod,
vec![
make_msg(scroll_prod, 1),
make_msg(scroll_prod, 2),
make_msg(scroll_prod, 3),
],
None,
)
.await
.expect("append prod");
archivist
.append_messages(
scroll_dev,
vec![make_msg(scroll_dev, 1), make_msg(scroll_dev, 2), make_msg(scroll_dev, 3)],
None,
)
.await
.expect("append dev");
// Primary keeps both.
assert_eq!(primary_backend.appended_count(scroll_prod), 3);
assert_eq!(primary_backend.appended_count(scroll_dev), 3);
// Secondary only keeps the tagged session.
assert_eq!(
secondary_backend.appended_count(scroll_prod),
3,
"secondary receives messages for the `prod`-tagged session"
);
assert_eq!(
secondary_backend.appended_count(scroll_dev),
0,
"secondary rejects the untagged session"
);
}
@@ -1,2 +0,0 @@
{"type":"user","uuid":"11111111-1111-7111-8111-111111111111","parentUuid":null,"timestamp":"2024-01-01T00:00:00Z","sessionId":"abc12345-1234-1234-1234-abcdef123456","cwd":"/home/user/myproj","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"userType":"external","message":{"role":"user","content":"hello"}}
{"type":"assistant","uuid":"22222222-2222-7222-8222-222222222222","parentUuid":"11111111-1111-7111-8111-111111111111","timestamp":"2024-01-01T00:00:01Z","sessionId":"abc12345-1234-1234-1234-abcdef123456","cwd":"/home/user/myproj","version":"2.1.71","gitBranch":"main","isSidechain":false,"requestId":"req-001","message":{"model":"claude-3-5-sonnet","id":"msg-abc","type":"message","role":"assistant","content":[{"type":"text","text":"hi back"}],"stop_reason":"end_turn","usage":{"input_tokens":10,"output_tokens":5}}}
@@ -1,153 +0,0 @@
//! End-to-end test: import a Claude fixture twice, expect no duplication;
//! then append a new message and re-import, expect exactly 1 new message.
use camino::Utf8PathBuf;
use dirigent_archivist::{
backends::JsonlBackend,
import::{claude::import_claude_sessions, ImportProgressSink},
Archivist, SessionListQuery,
};
use std::sync::Arc;
use uuid::Uuid;
/// Path of the `claude_minimal` fixture directory under `CARGO_MANIFEST_DIR`.
///
/// Panics if `CARGO_MANIFEST_DIR` is not set in the environment.
fn fixture_root() -> Utf8PathBuf {
    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").unwrap();
    let crate_root = Utf8PathBuf::from(manifest_dir);
    crate_root.join("tests/fixtures/claude_minimal")
}
/// Create a coordinator with one `JsonlBackend` named `main`, rooted at `root`.
///
/// `from_single_backend` is used instead of `new_with_single_archive` so that
/// tests running in parallel do not race on a shared `.archives.json` in the
/// tempdir's parent.
async fn mk_archivist(root: std::path::PathBuf) -> dirigent_archivist::Result<Archivist> {
    let jsonl = JsonlBackend::new(root).await?;
    let backend = Arc::new(jsonl);
    Archivist::from_single_backend("main".into(), backend).await
}
/// Importing the same Claude fixture twice must write messages only on the
/// first pass and never leave duplicate message ids in any session.
#[tokio::test]
async fn claude_import_twice_is_idempotent() -> dirigent_archivist::Result<()> {
    let archive_root = std::env::temp_dir().join(format!("claude_idem_{}", Uuid::now_v7()));
    let archivist = mk_archivist(archive_root.clone()).await?;
    let fixture = fixture_root();

    // Pass 1: everything in the fixture is new.
    let first = import_claude_sessions(
        &archivist,
        &fixture,
        None,
        &ImportProgressSink::noop(),
        &std::collections::HashMap::new(),
    )
    .await?;
    assert!(
        first.sessions_imported >= 1,
        "expected at least one imported session, got stats {:?}",
        first
    );
    assert!(
        first.messages_written >= 2,
        "expected >=2 messages written, got {:?}",
        first
    );

    // Pass 2: unchanged sessions are expected to be skipped, so nothing is
    // written again.
    let second = import_claude_sessions(
        &archivist,
        &fixture,
        None,
        &ImportProgressSink::noop(),
        &std::collections::HashMap::new(),
    )
    .await?;
    assert_eq!(
        second.messages_written, 0,
        "expected no re-write on second import, got {:?}",
        second
    );
    assert_eq!(second.sessions_imported, 0);
    assert!(
        second.sessions_skipped >= 1,
        "expected at least one skipped session, got {:?}",
        second
    );

    // Read everything back: each message id may appear once per session.
    let query = SessionListQuery::default().with_limit(200);
    let page = archivist.list_sessions_paged(query).await?;
    for session in &page.items {
        let scroll_id = session.scroll_id;
        let messages = archivist.get_messages(scroll_id, None).await?;
        let mut seen_ids = std::collections::HashSet::new();
        for message in &messages {
            let first_occurrence = seen_ids.insert(message.message_id);
            assert!(
                first_occurrence,
                "duplicate message_id {} in session {}",
                message.message_id,
                scroll_id
            );
        }
    }

    // Best-effort cleanup; only reached when every assertion passed.
    let _ = tokio::fs::remove_dir_all(archive_root).await;
    Ok(())
}
/// After a first import, appending one line to the source JSONL and
/// re-importing must write exactly one new message and report one updated
/// session.
#[tokio::test]
async fn claude_import_picks_up_additive_growth() -> dirigent_archivist::Result<()> {
    // Copy the fixture to a mutable temp dir so we can append a message.
    let tmp_src = std::env::temp_dir().join(format!("claude_grow_src_{}", Uuid::now_v7()));
    let fixture = fixture_root();
    // `as_std_path` already yields the `&Path` the helper wants; no owned
    // `PathBuf` copy is needed.
    copy_dir_recursive(fixture.as_std_path(), &tmp_src).await;

    let tmp_arch = std::env::temp_dir().join(format!("claude_grow_arch_{}", Uuid::now_v7()));
    let archivist = mk_archivist(tmp_arch.clone()).await?;

    // Baseline import of the untouched copy; its stats are not inspected.
    let src = Utf8PathBuf::from_path_buf(tmp_src.clone()).unwrap();
    let _ = import_claude_sessions(
        &archivist,
        &src,
        None,
        &ImportProgressSink::noop(),
        &std::collections::HashMap::new(),
    )
    .await?;

    // Append a new message to the existing JSONL.
    let jsonl = find_jsonl(&tmp_src).expect("fixture jsonl not found");
    let extra = r#"{"type":"user","uuid":"33333333-3333-7333-8333-333333333333","parentUuid":"22222222-2222-7222-8222-222222222222","timestamp":"2024-01-01T00:00:02Z","sessionId":"abc12345-1234-1234-1234-abcdef123456","cwd":"/home/user/myproj","version":"2.1.71","gitBranch":"main","isSidechain":false,"isMeta":false,"userType":"external","message":{"role":"user","content":"follow up"}}"#;
    use tokio::io::AsyncWriteExt;
    let mut f = tokio::fs::OpenOptions::new()
        .append(true)
        .open(&jsonl)
        .await
        .unwrap();
    f.write_all(extra.as_bytes()).await.unwrap();
    f.write_all(b"\n").await.unwrap();
    // Close the handle before the importer reads the file again.
    drop(f);

    let stats = import_claude_sessions(
        &archivist,
        &src,
        None,
        &ImportProgressSink::noop(),
        &std::collections::HashMap::new(),
    )
    .await?;
    assert_eq!(
        stats.messages_written, 1,
        "expected 1 new message to be imported, got {:?}",
        stats
    );
    assert_eq!(
        stats.sessions_updated, 1,
        "expected 1 session updated, got {:?}",
        stats
    );

    // Best-effort cleanup; only reached when every assertion passed.
    let _ = tokio::fs::remove_dir_all(tmp_src).await;
    let _ = tokio::fs::remove_dir_all(tmp_arch).await;
    Ok(())
}
/// Copy the directory tree under `src` into `dst`, creating `dst` and any
/// sub-directories as needed.
///
/// Uses an explicit work stack instead of recursion. Panics on any I/O error.
async fn copy_dir_recursive(src: &std::path::Path, dst: &std::path::Path) {
    tokio::fs::create_dir_all(dst).await.unwrap();
    let mut pending = vec![(src.to_path_buf(), dst.to_path_buf())];
    while let Some((from_dir, to_dir)) = pending.pop() {
        let mut listing = tokio::fs::read_dir(&from_dir).await.unwrap();
        while let Some(entry) = listing.next_entry().await.unwrap() {
            let source = entry.path();
            let target = to_dir.join(entry.file_name());
            let is_dir = entry.file_type().await.unwrap().is_dir();
            if !is_dir {
                tokio::fs::copy(&source, &target).await.unwrap();
                continue;
            }
            // Create the directory now and visit its contents later.
            tokio::fs::create_dir_all(&target).await.unwrap();
            pending.push((source, target));
        }
    }
}
/// Return the first regular file with a `jsonl` extension found while walking
/// `dir`, or `None` if there is none. Entries that fail to read are skipped.
fn find_jsonl(dir: &std::path::Path) -> Option<std::path::PathBuf> {
    walkdir::WalkDir::new(dir)
        .into_iter()
        .flatten()
        .find(|entry| {
            entry.file_type().is_file()
                && entry.path().extension().and_then(|ext| ext.to_str()) == Some("jsonl")
        })
        .map(|entry| entry.path().to_path_buf())
}
@@ -1,89 +0,0 @@
//! Integration test: importer trait progress events fire in expected order.
//!
//! Drives a full `ChatGptImporter::import` against a fixture and asserts on
//! the `ImportProgressEvent` sequence observed on the paired receiver.
use std::sync::Arc;
use tempfile::TempDir;
use dirigent_archivist::{
backends::JsonlBackend,
coordinator::Archivist,
import::{
ImportConfig, ImportProgressEvent, ImportProgressSink, ImportTarget, ImporterRegistry,
},
};
/// Runs the `chatgpt` importer against the minimal fixture and checks that the
/// progress channel reports a `SessionStarted` before the first
/// `SessionFinished`, and that the returned stats show one imported session.
#[tokio::test]
async fn progress_event_sequence_is_well_formed() {
    // 1. Archivist over a single JsonlBackend rooted in a throwaway tempdir.
    let dir = TempDir::new().unwrap();
    let root = dir.path().to_path_buf();
    let backend = Arc::new(JsonlBackend::new(root).await.unwrap());
    let archivist = Archivist::from_single_backend("main".into(), backend)
        .await
        .unwrap();
    let archivist = Arc::new(archivist);

    // 2. The chatgpt fixture: a minimal conversations.json with a
    //    user + assistant message pair.
    let manifest_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let fixture = manifest_dir.join("../dirigent_chatgpt/tests/fixtures/minimal.json");
    assert!(
        fixture.exists(),
        "chatgpt fixture missing at {}",
        fixture.display()
    );
    let mut params = std::collections::BTreeMap::new();
    params.insert("path".into(), serde_json::json!(fixture.display().to_string()));
    let cfg = ImportConfig {
        source: "chatgpt".into(),
        params,
    };

    // 3. Run the import on its own task, reporting through a channel sink.
    let registry = ImporterRegistry::default();
    let importer = registry.get("chatgpt").expect("chatgpt registered");
    let (sink, mut rx) = ImportProgressSink::channel();
    let job_archivist = archivist.clone();
    let job = tokio::spawn(async move {
        importer
            .import(&cfg, &*job_archivist, ImportTarget::default(), sink)
            .await
    });

    // 4. Drain the receiver; `recv` yields `None` once the sending side is gone.
    let mut events = Vec::new();
    while let Some(event) = rx.recv().await {
        events.push(event);
    }
    let stats = job.await.unwrap().expect("import");

    // 5. The first SessionStarted must come before the first SessionFinished.
    let started_idx = events
        .iter()
        .position(|event| matches!(event, ImportProgressEvent::SessionStarted { .. }));
    let finished_idx = events
        .iter()
        .position(|event| matches!(event, ImportProgressEvent::SessionFinished { .. }));
    assert!(started_idx.is_some(), "expected a SessionStarted event");
    assert!(finished_idx.is_some(), "expected a SessionFinished event");
    // Both are `Some` here, so comparing the options compares the indices.
    assert!(
        started_idx < finished_idx,
        "SessionStarted must precede SessionFinished"
    );

    // The fixture's user + assistant pair means at least 2 messages written.
    assert!(
        stats.messages_written >= 2,
        "expected messages to be written, got stats {:?}",
        stats
    );
    assert_eq!(stats.sessions_imported, 1);
}
File diff suppressed because it is too large Load Diff
@@ -1,364 +0,0 @@
//! Tests for `Archivist::list_sessions_paged` — pagination, filters, cursor stability.
use chrono::{Duration, Utc};
use dirigent_archivist::{
backends::JsonlBackend, Archivist, RegisterConnectorRequest, RegisterSessionRequest,
Result, SessionListQuery,
};
use std::sync::Arc;
use uuid::Uuid;
/// Test scaffold: build a coordinator over one `JsonlBackend` in a uniquely
/// named temp directory.
///
/// Returns the coordinator, the backend (so tests can resolve on-disk paths
/// through `backend.paths()`), and the temp directory for later cleanup.
async fn mk_archivist() -> Result<(Archivist, Arc<JsonlBackend>, std::path::PathBuf)> {
    let dir_name = format!("paged_test_{}", Uuid::now_v7());
    let temp_dir = std::env::temp_dir().join(dir_name);
    let backend = JsonlBackend::new(temp_dir.clone()).await.map(Arc::new)?;
    let archivist = Archivist::from_single_backend("main".into(), backend.clone()).await?;
    Ok((archivist, backend, temp_dir))
}
/// Register an `OpenCode` connector titled `title` and return its UID.
///
/// The native id gets a fresh v7 UUID suffix, so repeated calls with the same
/// title produce distinct native ids.
async fn mk_connector(archivist: &Archivist, title: &str) -> Result<Uuid> {
    let request = RegisterConnectorRequest {
        r#type: "OpenCode".to_string(),
        title: title.to_string(),
        client_native_id: format!("{title}@local:{}", Uuid::now_v7()),
        custom_uid: None,
        metadata: serde_json::json!({}),
        fingerprint: None,
    };
    let registered = archivist.register_connector(request, None).await?;
    Ok(registered.connector_uid)
}
/// Register a session, then patch the fields `register_session` does not
/// expose (`tags`, `no_update`) by rewriting `session.json` on disk.
///
/// `model` and `project_id`, when given, go into the session's metadata
/// object under those keys. Returns the new session's scroll id.
#[allow(clippy::too_many_arguments)]
async fn mk_session(
    archivist: &Archivist,
    backend: &JsonlBackend,
    connector_uid: Uuid,
    native_id: &str,
    title: Option<&str>,
    tags: Vec<String>,
    model: Option<&str>,
    project_id: Option<&str>,
    no_update: bool,
) -> Result<Uuid> {
    // Only the metadata keys that were actually supplied are inserted.
    let mut metadata = serde_json::Map::new();
    for (key, value) in [("model", model), ("project_id", project_id)] {
        if let Some(value) = value {
            metadata.insert(key.to_string(), serde_json::Value::String(value.to_string()));
        }
    }

    let request = RegisterSessionRequest {
        connector_uid,
        native_session_id: native_id.to_string(),
        title: title.map(String::from),
        custom_scroll_id: None,
        metadata: serde_json::Value::Object(metadata),
        completeness: Default::default(),
        parent_scroll_id: None,
        is_subagent: false,
        continuation: None,
        agent_id: None,
        subagent_type: None,
        spawning_tool_use_id: None,
    };
    let scroll_id = archivist.register_session(request, None).await?.scroll_id;

    // Skip the on-disk rewrite when there is nothing to patch.
    let needs_patch = no_update || !tags.is_empty();
    if needs_patch {
        let mut meta = archivist.get_session_metadata(scroll_id, None).await?;
        meta.tags = tags;
        meta.no_update = no_update;
        let session_file = backend.paths().session_json(scroll_id);
        dirigent_archivist::storage::json::write_json(&session_file, &meta)
            .await
            .map_err(dirigent_archivist::ArchivistError::Io)?;
    }
    Ok(scroll_id)
}
/// Force a session's `updated_at` to `when` by rewriting its `session.json`,
/// so tests get a deterministic ordering.
async fn set_updated_at(
    archivist: &Archivist,
    backend: &JsonlBackend,
    scroll_id: Uuid,
    when: chrono::DateTime<chrono::Utc>,
) -> Result<()> {
    let mut session_meta = archivist.get_session_metadata(scroll_id, None).await?;
    session_meta.updated_at = when;

    let session_file = backend.paths().session_json(scroll_id);
    dirigent_archivist::storage::json::write_json(&session_file, &session_meta)
        .await
        .map_err(dirigent_archivist::ArchivistError::Io)?;
    Ok(())
}
/// Best-effort removal of a test's temp directory; any error (including a
/// missing directory) is ignored.
fn cleanup(path: std::path::PathBuf) {
    std::fs::remove_dir_all(path).ok();
}
/// With 30 sessions and a limit of 10, the first page holds exactly 10 items
/// and carries a continuation cursor.
#[tokio::test]
async fn list_sessions_paged_respects_limit() -> Result<()> {
    let (archivist, backend, temp) = mk_archivist().await?;
    let uid = mk_connector(&archivist, "connector-a").await?;

    // Spread `updated_at` one second apart, counting back from now.
    let newest = Utc::now();
    for age_secs in 0..30 {
        let native_id = format!("native-{age_secs}");
        let title = format!("title-{age_secs}");
        let scroll = mk_session(
            &archivist,
            &backend,
            uid,
            &native_id,
            Some(title.as_str()),
            Vec::new(),
            None,
            None,
            false,
        )
        .await?;
        let stamp = newest - Duration::seconds(age_secs);
        set_updated_at(&archivist, &backend, scroll, stamp).await?;
    }

    let query = SessionListQuery::default().with_connector(uid).with_limit(10);
    let page = archivist.list_sessions_paged(query).await?;
    assert_eq!(page.items.len(), 10);
    assert!(page.next_cursor.is_some());

    cleanup(temp);
    Ok(())
}
/// When the limit exceeds the number of sessions, all of them are returned
/// and no continuation cursor is produced.
#[tokio::test]
async fn list_sessions_paged_end_of_list() -> Result<()> {
    let (archivist, backend, temp) = mk_archivist().await?;
    let uid = mk_connector(&archivist, "connector-b").await?;

    // Five sessions, `updated_at` one second apart counting back from now.
    let newest = Utc::now();
    for age_secs in 0..5 {
        let native_id = format!("native-{age_secs}");
        let title = format!("title-{age_secs}");
        let scroll = mk_session(
            &archivist,
            &backend,
            uid,
            &native_id,
            Some(title.as_str()),
            Vec::new(),
            None,
            None,
            false,
        )
        .await?;
        let stamp = newest - Duration::seconds(age_secs);
        set_updated_at(&archivist, &backend, scroll, stamp).await?;
    }

    let query = SessionListQuery::default().with_connector(uid).with_limit(100);
    let page = archivist.list_sessions_paged(query).await?;
    assert_eq!(page.items.len(), 5);
    assert!(page.next_cursor.is_none());

    cleanup(temp);
    Ok(())
}
/// Six sessions sharing one `updated_at` are paged 3 + 3: the two pages must
/// not overlap, and the second page must end the listing.
#[tokio::test]
async fn list_sessions_paged_cursor_stability() -> Result<()> {
    let (archivist, backend, temp) = mk_archivist().await?;
    let uid = mk_connector(&archivist, "connector-c").await?;

    // Every session gets the same timestamp, so ordering cannot rely on
    // `updated_at` alone.
    let shared_stamp = Utc::now();
    for n in 0..6 {
        let native_id = format!("native-{n}");
        let title = format!("title-{n}");
        let scroll = mk_session(
            &archivist,
            &backend,
            uid,
            &native_id,
            Some(title.as_str()),
            Vec::new(),
            None,
            None,
            false,
        )
        .await?;
        set_updated_at(&archivist, &backend, scroll, shared_stamp).await?;
    }

    let first_query = SessionListQuery::default().with_connector(uid).with_limit(3);
    let p1 = archivist.list_sessions_paged(first_query).await?;
    assert_eq!(p1.items.len(), 3);
    assert!(p1.next_cursor.is_some());

    let second_query = SessionListQuery::default()
        .with_connector(uid)
        .with_limit(3)
        .with_cursor(p1.next_cursor.clone());
    let p2 = archivist.list_sessions_paged(second_query).await?;
    assert_eq!(p2.items.len(), 3);

    let ids1: std::collections::HashSet<_> =
        p1.items.iter().map(|session| session.scroll_id).collect();
    let ids2: std::collections::HashSet<_> =
        p2.items.iter().map(|session| session.scroll_id).collect();
    assert!(ids1.is_disjoint(&ids2), "page 1 and page 2 must not overlap");
    assert!(p2.next_cursor.is_none());

    cleanup(temp);
    Ok(())
}
/// A title query of `beta` must match both `Alpha beta` and `BETA only`, and
/// skip the non-matching and untitled sessions.
#[tokio::test]
async fn list_sessions_paged_title_filter() -> Result<()> {
    let (archivist, backend, temp) = mk_archivist().await?;
    let uid = mk_connector(&archivist, "connector-d").await?;

    let seeds = [
        ("n1", Some("Alpha beta")),
        ("n2", Some("BETA only")),
        ("n3", Some("gamma")),
        ("n4", None),
    ];
    for (native_id, title) in seeds {
        mk_session(&archivist, &backend, uid, native_id, title, vec![], None, None, false).await?;
    }

    let query = SessionListQuery::default()
        .with_connector(uid)
        .with_limit(50)
        .with_title_query("beta");
    let page = archivist.list_sessions_paged(query).await?;

    let titles: Vec<_> = page
        .items
        .iter()
        .filter_map(|session| session.title.clone())
        .collect();
    assert_eq!(titles.len(), 2, "got {titles:?}");
    assert!(titles.iter().any(|t| t == "Alpha beta"));
    assert!(titles.iter().any(|t| t == "BETA only"));

    cleanup(temp);
    Ok(())
}
/// Querying with two tags must return only the session carrying both of them.
#[tokio::test]
async fn list_sessions_paged_tags_and() -> Result<()> {
    let (archivist, backend, temp) = mk_archivist().await?;
    let uid = mk_connector(&archivist, "connector-e").await?;

    let seeds = [
        ("n1", "s1", vec!["red", "blue"]),
        ("n2", "s2", vec!["red"]),
        ("n3", "s3", vec!["blue"]),
    ];
    for (native_id, title, tags) in seeds {
        let tags = tags.into_iter().map(String::from).collect();
        mk_session(&archivist, &backend, uid, native_id, Some(title), tags, None, None, false)
            .await?;
    }

    let mut query = SessionListQuery::default().with_connector(uid).with_limit(50);
    query.tags = vec!["red".into(), "blue".into()];
    let page = archivist.list_sessions_paged(query).await?;
    assert_eq!(page.items.len(), 1);
    assert_eq!(page.items[0].title.as_deref(), Some("s1"));

    cleanup(temp);
    Ok(())
}
/// A model filter of `sonnet` must match only the session whose metadata
/// model is `claude-3-5-sonnet`.
#[tokio::test]
async fn list_sessions_paged_model_filter() -> Result<()> {
    let (archivist, backend, temp) = mk_archivist().await?;
    let uid = mk_connector(&archivist, "connector-f").await?;

    let seeds = [
        ("n1", "s1", Some("claude-3-5-sonnet")),
        ("n2", "s2", Some("gpt-4o")),
        ("n3", "s3", None),
    ];
    for (native_id, title, model) in seeds {
        mk_session(&archivist, &backend, uid, native_id, Some(title), vec![], model, None, false)
            .await?;
    }

    let mut query = SessionListQuery::default().with_connector(uid).with_limit(50);
    query.model_filter = Some("sonnet".into());
    let page = archivist.list_sessions_paged(query).await?;
    assert_eq!(page.items.len(), 1);
    assert_eq!(page.items[0].title.as_deref(), Some("s1"));

    cleanup(temp);
    Ok(())
}
/// Sessions flagged `no_update` are left out of the default listing and only
/// show up when the query asks for hidden sessions.
#[tokio::test]
async fn list_sessions_paged_include_hidden() -> Result<()> {
    let (archivist, backend, temp) = mk_archivist().await?;
    let uid = mk_connector(&archivist, "connector-g").await?;

    // The last tuple field is the `no_update` flag passed to `mk_session`.
    for (native_id, title, no_update) in [("n1", "visible", false), ("n2", "hidden", true)] {
        mk_session(&archivist, &backend, uid, native_id, Some(title), vec![], None, None, no_update)
            .await?;
    }

    let default_query = SessionListQuery::default().with_connector(uid).with_limit(50);
    let visible_only = archivist.list_sessions_paged(default_query).await?;
    assert_eq!(visible_only.items.len(), 1);
    assert_eq!(visible_only.items[0].title.as_deref(), Some("visible"));

    let hidden_query = SessionListQuery::default()
        .with_connector(uid)
        .with_limit(50)
        .with_include_hidden(true);
    let all = archivist.list_sessions_paged(hidden_query).await?;
    assert_eq!(all.items.len(), 2);

    cleanup(temp);
    Ok(())
}
/// A project-scoped query with no connector filter must return the `proj-a`
/// sessions from both connectors and nothing from `proj-b`.
#[tokio::test]
async fn list_sessions_paged_project_scope() -> Result<()> {
    let (archivist, backend, temp) = mk_archivist().await?;
    let c1 = mk_connector(&archivist, "connector-h1").await?;
    let c2 = mk_connector(&archivist, "connector-h2").await?;

    let seeds = [
        (c1, "n1", "proj-a-1", "proj-a"),
        (c1, "n2", "proj-b-1", "proj-b"),
        (c2, "n3", "proj-a-2", "proj-a"),
    ];
    for (connector, native_id, title, project) in seeds {
        mk_session(
            &archivist,
            &backend,
            connector,
            native_id,
            Some(title),
            vec![],
            None,
            Some(project),
            false,
        )
        .await?;
    }

    let query = SessionListQuery::default().with_project("proj-a").with_limit(50);
    let page = archivist.list_sessions_paged(query).await?;
    assert_eq!(page.items.len(), 2);
    for session in &page.items {
        let project_id = session.metadata.get("project_id").and_then(|v| v.as_str());
        assert_eq!(project_id, Some("proj-a"));
    }

    cleanup(temp);
    Ok(())
}
@@ -1,130 +0,0 @@
use dirigent_archivist::{
coordinator::Archivist,
error::ArchivistBootError,
registry::{ArchivesConfig, BackendRegistry},
};
/// Deserialize `toml_src` into an `ArchivesConfig`, panicking on invalid input.
fn parse(toml_src: &str) -> ArchivesConfig {
    let parsed: Result<ArchivesConfig, _> = toml::from_str(toml_src);
    parsed.unwrap()
}
/// A config declaring one `jsonl` archive at a fresh tempdir must boot.
#[tokio::test]
async fn boot_with_one_jsonl_archive() {
    let dir = tempfile::tempdir().unwrap();
    // Forward slashes keep a Windows path from being read as TOML escapes.
    let archive_path = dir.path().to_string_lossy().replace('\\', "/");
    let cfg = parse(&format!(
        r#"
[[archives]]
name = "main"
type = "jsonl"
[archives.params]
path = "{}"
"#,
        archive_path
    ));

    let registry = BackendRegistry::with_jsonl();
    let _archivist = Archivist::from_config(cfg, &registry, None).await.unwrap();
}
/// An empty config must still boot, and the resulting coordinator must list
/// no archives.
#[tokio::test]
async fn boot_empty_config_is_ephemeral() {
    let cfg = toml::from_str::<ArchivesConfig>("").unwrap();
    let registry = BackendRegistry::with_jsonl();

    let archivist = Archivist::from_config(cfg, &registry, None).await.unwrap();
    let listed = archivist.list_archives().await.unwrap();
    assert!(listed.is_empty());
}
/// An archive whose `type` is not in the registry must fail boot with
/// `ArchivistBootError::UnknownType`.
#[tokio::test]
async fn boot_unknown_type_errors() {
    let cfg = parse(
        r#"
[[archives]]
name = "x"
type = "nope"
[archives.params]
"#,
    );
    let registry = BackendRegistry::with_jsonl();

    let err = match Archivist::from_config(cfg, &registry, None).await {
        Ok(_) => panic!("expected UnknownType error"),
        Err(err) => err,
    };
    assert!(
        matches!(err, ArchivistBootError::UnknownType { .. }),
        "expected UnknownType, got {err:?}"
    );
}
/// A config whose only archive is `best_effort` must be rejected at boot with
/// a `Validation` error.
#[tokio::test]
async fn boot_no_primary_errors() {
    let cfg = parse(
        r#"
[[archives]]
name = "mirror"
type = "jsonl"
failure_mode = "best_effort"
[archives.params]
path = "/tmp/whatever"
"#,
    );
    let registry = BackendRegistry::with_jsonl();

    let err = match Archivist::from_config(cfg, &registry, None).await {
        Ok(_) => panic!("expected Validation error"),
        Err(err) => err,
    };
    assert!(
        matches!(err, ArchivistBootError::Validation(_)),
        "expected Validation, got {err:?}"
    );
}
/// Two archives sharing the name `main` must be rejected at boot with a
/// `Validation` error.
#[tokio::test]
async fn boot_duplicate_name_errors() {
    let dir = tempfile::tempdir().unwrap();
    // Forward slashes keep a Windows path from being read as TOML escapes.
    let archive_path = dir.path().to_string_lossy().replace('\\', "/");
    let cfg = parse(&format!(
        r#"
[[archives]]
name = "main"
type = "jsonl"
[archives.params]
path = "{p}"
[[archives]]
name = "main"
type = "jsonl"
[archives.params]
path = "{p}"
"#,
        p = archive_path,
    ));
    let registry = BackendRegistry::with_jsonl();

    let err = match Archivist::from_config(cfg, &registry, None).await {
        Ok(_) => panic!("expected Validation error"),
        Err(err) => err,
    };
    assert!(
        matches!(err, ArchivistBootError::Validation(_)),
        "expected Validation, got {err:?}"
    );
}
/// The shipped `dirigent.toml.example` must deserialize into `ArchivesConfig`,
/// pass validation, and declare at least one archive.
#[test]
fn example_toml_parses() {
    // The example lives two levels above this crate's manifest directory.
    let example_path =
        std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("../../dirigent.toml.example");
    let src = std::fs::read_to_string(example_path)
        .expect("dirigent.toml.example present at workspace root");

    // The whole document is deserialized straight into `ArchivesConfig`.
    // NOTE(review): this relies on `ArchivesConfig` ignoring the other
    // top-level tables (connectors, matrix, ...) and reading only `archives`;
    // confirm against the type's serde attributes.
    let cfg: ArchivesConfig = toml::from_str(&src).expect("ArchivesConfig from full example");
    cfg.validate().expect("example config validates");
    assert!(!cfg.entries.is_empty(), "example must declare at least one archive");
}
@@ -1,76 +0,0 @@
#![cfg(feature = "test-utils")]
use std::sync::Arc;
use dirigent_archivist::backend::mock::MockBackend;
use dirigent_archivist::backend::{ArchiveBackend, ArchiveCapability, CapabilitySet, HealthStatus};
use dirigent_archivist::coordinator::Archivist;
use dirigent_archivist::registry::{ArchiveRegistration, FailureMode, WritePolicy};
use dirigent_archivist::types::{MetaEventRecord, MetaEventType};
use uuid::Uuid;
/// Wrap `backend` in a mock registration called `name` with the given
/// priority, using `Required` failure mode and the `Inline` write policy.
fn reg(name: &str, backend: Arc<MockBackend>, priority: u32) -> Arc<ArchiveRegistration> {
    let backend: Arc<dyn ArchiveBackend> = backend;
    let registration = ArchiveRegistration::new(
        name.into(),
        "mock",
        backend,
        true,
        FailureMode::Required,
        priority,
        true,
        WritePolicy::Inline,
        None,
        HealthStatus::Healthy,
    );
    Arc::new(registration)
}
/// Build a `ClientConnected` meta event for `scroll_id` with a fresh event id,
/// the current timestamp, and no linked session or connector.
fn stub_meta_event(scroll_id: Uuid) -> MetaEventRecord {
    let event_id = Uuid::now_v7();
    let ts = chrono::Utc::now();
    MetaEventRecord {
        version: 1,
        event_id,
        session: scroll_id,
        ts,
        event_type: MetaEventType::ClientConnected,
        description: "test event".into(),
        linked_session_id: None,
        linked_connector_id: None,
        linked_connector_title: None,
        metadata: serde_json::Value::Null,
    }
}
#[tokio::test]
async fn capability_filter_skips_backend_without_meta_events() {
let mut caps_with_meta = CapabilitySet::new();
caps_with_meta.insert(ArchiveCapability::MetaEvents);
caps_with_meta.insert(ArchiveCapability::SessionMapping);
caps_with_meta.insert(ArchiveCapability::ConnectorRegistry);
let with_meta = Arc::new(MockBackend::with_capabilities(caps_with_meta));
let mut caps_without_meta = CapabilitySet::new();
caps_without_meta.insert(ArchiveCapability::SessionMapping);
caps_without_meta.insert(ArchiveCapability::ConnectorRegistry);
let without_meta = Arc::new(MockBackend::with_capabilities(caps_without_meta));
let archivist = Archivist::from_registrations(vec![
reg("primary", with_meta.clone(), 0),
reg("secondary", without_meta.clone(), 10),
]);
let scroll = Uuid::new_v4();
archivist
.append_meta_events(scroll, vec![stub_meta_event(scroll)], None)
.await
.unwrap();
// Primary received the meta event.
assert!(
with_meta.has_meta_events(scroll),
"primary should receive meta event"
);
// Secondary was capability-skipped.
assert!(
!without_meta.has_meta_events(scroll),
"secondary should be skipped"
);
}
@@ -1,121 +0,0 @@
#![cfg(feature = "test-utils")]
use std::sync::Arc;
use dirigent_archivist::backend::mock::MockBackend;
use dirigent_archivist::backend::ArchiveBackend;
use dirigent_archivist::backend::HealthStatus;
use dirigent_archivist::coordinator::Archivist;
use dirigent_archivist::error::ArchivistError;
use dirigent_archivist::registry::{ArchiveRegistration, FailureMode, WritePolicy};
use dirigent_archivist::types::SessionMetadata;
use uuid::Uuid;
async fn dual_backend_coordinator() -> (Archivist, Arc<MockBackend>, Arc<MockBackend>) {
let a = Arc::new(MockBackend::new());
let b = Arc::new(MockBackend::new());
let regs = vec![
Arc::new(ArchiveRegistration::new(
"a".into(),
"mock",
a.clone() as Arc<dyn ArchiveBackend>,
true,
FailureMode::Required,
0,
true,
WritePolicy::Inline,
None,
HealthStatus::Healthy,
)),
Arc::new(ArchiveRegistration::new(
"b".into(),
"mock",
b.clone() as Arc<dyn ArchiveBackend>,
true,
FailureMode::Required,
10,
true,
WritePolicy::Inline,
None,
HealthStatus::Healthy,
)),
];
(Archivist::from_registrations(regs), a, b)
}
/// Copying a session from `a` to `b` must make it visible on `b` while
/// leaving it in place on `a`.
#[tokio::test]
async fn copy_session_carries_metadata_and_messages() {
    let (archivist, source, target) = dual_backend_coordinator().await;
    let scroll = Uuid::new_v4();

    // Only the source backend is seeded (with an empty message batch).
    source
        .put_session(SessionMetadata::stub(scroll))
        .await
        .unwrap();
    source.append_messages(scroll, Vec::new()).await.unwrap();

    archivist.copy_session(scroll, "a", "b").await.unwrap();

    let on_target = target.get_session(scroll).await.unwrap();
    assert!(on_target.is_some());
    let on_source = source.get_session(scroll).await.unwrap();
    assert!(on_source.is_some());
}
/// Moving a session from `a` to `b` must delete it from `a`, create it on
/// `b`, and leave exactly one entry in the coordinator's read cache.
#[tokio::test]
async fn move_session_removes_from_source() {
    let (archivist, source, target) = dual_backend_coordinator().await;
    let scroll = Uuid::new_v4();
    source
        .put_session(SessionMetadata::stub(scroll))
        .await
        .unwrap();

    archivist.move_session(scroll, "a", "b").await.unwrap();

    let on_source = source.get_session(scroll).await.unwrap();
    assert!(on_source.is_none());
    let on_target = target.get_session(scroll).await.unwrap();
    assert!(on_target.is_some());

    let cached = archivist.read_cache_size().await;
    assert_eq!(cached, 1, "cache should now reflect the move");
}
// A move whose source-side delete fails must surface `ArchivistError::PartialMove`
// and leave the session present in both backends.
#[tokio::test]
async fn move_session_partial_failure_returns_partial_move_error() {
    let (archivist, source, target) = dual_backend_coordinator().await;
    let session_id = Uuid::new_v4();

    // Seed first, so the injected failure below is not consumed by this write.
    source
        .put_session(SessionMetadata::stub(session_id))
        .await
        .unwrap();

    // `MockBackend::inject_write_failures` is consumed by mutating calls. The copy
    // phase of a move reads from `a` and writes to `b`, so the only write `a` sees
    // during `move_session` is the trailing delete. Arming a single failure up
    // front therefore makes exactly that delete fail.
    source.inject_write_failures(1);

    let failure = archivist
        .move_session(session_id, "a", "b")
        .await
        .unwrap_err();
    assert!(matches!(failure, ArchivistError::PartialMove { .. }));

    // The copy succeeded and the delete did not: both backends hold the session.
    let in_source = source.get_session(session_id).await.unwrap();
    assert!(in_source.is_some());
    let in_target = target.get_session(session_id).await.unwrap();
    assert!(in_target.is_some());
}
// Deleting without naming an archive must remove the session from every backend
// and drop the read-cache size back to zero.
#[tokio::test]
async fn delete_session_fans_out_and_invalidates_cache() {
    let (archivist, first, second) = dual_backend_coordinator().await;
    let session_id = Uuid::new_v4();

    // Seed both backends with the same session.
    first
        .put_session(SessionMetadata::stub(session_id))
        .await
        .unwrap();
    second
        .put_session(SessionMetadata::stub(session_id))
        .await
        .unwrap();

    // A read through the coordinator populates the cache; the value itself is unused.
    let _ = archivist
        .get_session_metadata(session_id, None)
        .await
        .unwrap();
    let cached_before = archivist.read_cache_size().await;
    assert_eq!(cached_before, 1);

    archivist.delete_session(session_id, None).await.unwrap();

    let in_first = first.get_session(session_id).await.unwrap();
    assert!(in_first.is_none());
    let in_second = second.get_session(session_id).await.unwrap();
    assert!(in_second.is_none());

    let cached_after = archivist.read_cache_size().await;
    assert_eq!(cached_after, 0);
}
@@ -1,124 +0,0 @@
#![cfg(feature = "test-utils")]
use std::sync::Arc;
use dirigent_archivist::backend::mock::MockBackend;
use dirigent_archivist::backend::{ArchiveBackend, HealthStatus};
use dirigent_archivist::coordinator::Archivist;
use dirigent_archivist::registry::{ArchiveRegistration, FailureMode, WritePolicy};
use uuid::Uuid;
/// Wraps a mock backend in a shared [`ArchiveRegistration`] for use in tests.
///
/// `name`, `priority` and `failure` are forwarded to `ArchiveRegistration::new`;
/// every other argument is fixed: kind `"mock"`, both boolean flags `true`,
/// `WritePolicy::Inline`, `None`, and an initial `HealthStatus::Healthy`.
fn reg(
    name: &str,
    backend: Arc<MockBackend>,
    priority: u32,
    failure: FailureMode,
) -> Arc<ArchiveRegistration> {
    // The registration stores the backend as a trait object.
    let erased: Arc<dyn ArchiveBackend> = backend;
    let registration = ArchiveRegistration::new(
        name.into(),
        "mock",
        erased,
        true,
        failure,
        priority,
        true,
        WritePolicy::Inline,
        None,
        HealthStatus::Healthy,
    );
    Arc::new(registration)
}
/// Builds a minimal user-role message record belonging to `session`.
///
/// The message id is a fresh v7 UUID and the timestamp is the current UTC time, so
/// two calls never yield identical records. All optional fields are left empty and
/// the body is the fixed text `"hi"`.
fn sample_message(session: Uuid) -> dirigent_archivist::types::MessageRecord {
    use dirigent_archivist::types::MessageRecord;

    // Generated up front, in the same order the struct fields are declared below.
    let message_id = Uuid::now_v7();
    let ts = chrono::Utc::now();

    MessageRecord {
        version: 1,
        message_id,
        session,
        parent_id: None,
        ts,
        role: "user".into(),
        author: None,
        content_md: "hi".into(),
        content_parts: None,
        attachments: Vec::new(),
        metadata: serde_json::Value::Null,
    }
}
// An append with no explicit archive must reach both registered backends.
#[tokio::test]
async fn write_fans_out_to_both_backends() {
    let first = Arc::new(MockBackend::new());
    let second = Arc::new(MockBackend::new());
    let registrations = vec![
        reg("a", Arc::clone(&first), 0, FailureMode::Required),
        reg("b", Arc::clone(&second), 10, FailureMode::BestEffort),
    ];
    let archivist = Archivist::from_registrations(registrations);

    // Send one real message so the per-backend counts below are a positive check
    // rather than a comparison against zero.
    let session_id = Uuid::new_v4();
    let batch = vec![sample_message(session_id)];
    archivist
        .append_messages(session_id, batch, None)
        .await
        .unwrap();

    assert_eq!(first.appended_count(session_id), 1);
    assert_eq!(second.appended_count(session_id), 1);
}
// A write failure on a `BestEffort` backend must not fail the append, but the
// backend must afterwards be reported as `Degraded`.
#[tokio::test]
async fn best_effort_failure_does_not_propagate() {
    let first = Arc::new(MockBackend::new());
    let second = Arc::new(MockBackend::new());
    // Arm one write failure on the backend that will be registered as best-effort.
    second.inject_write_failures(1);

    let registrations = vec![
        reg("a", Arc::clone(&first), 0, FailureMode::Required),
        reg("b", Arc::clone(&second), 10, FailureMode::BestEffort),
    ];
    let archivist = Archivist::from_registrations(registrations);

    // Must be `Ok` even though backend `b` is primed to fail.
    archivist
        .append_messages(Uuid::new_v4(), Vec::new(), None)
        .await
        .unwrap();

    let statuses = archivist.list_archives_with_health().await;
    let status_of_b = statuses.iter().find(|entry| entry.name == "b").unwrap();
    assert!(matches!(status_of_b.health, HealthStatus::Degraded { .. }));
}
// When both backends are `Required`, a write failure on `b` must fail the append.
#[tokio::test]
async fn required_secondary_failure_propagates() {
    let first = Arc::new(MockBackend::new());
    let second = Arc::new(MockBackend::new());
    // Arm one write failure on backend `b`.
    second.inject_write_failures(1);

    let registrations = vec![
        reg("a", Arc::clone(&first), 0, FailureMode::Required),
        reg("b", Arc::clone(&second), 10, FailureMode::Required),
    ];
    let archivist = Archivist::from_registrations(registrations);

    let outcome = archivist
        .append_messages(Uuid::new_v4(), Vec::new(), None)
        .await;
    assert!(
        outcome.is_err(),
        "expected error when required secondary fails"
    );
}
// Naming archive `"b"` explicitly on an append must still deliver the message to
// both backends.
#[tokio::test]
async fn explicit_archive_overrides_default_primary() {
    let first = Arc::new(MockBackend::new());
    let second = Arc::new(MockBackend::new());
    let registrations = vec![
        reg("a", Arc::clone(&first), 0, FailureMode::Required),
        reg("b", Arc::clone(&second), 10, FailureMode::Required),
    ];
    let archivist = Archivist::from_registrations(registrations);

    let session_id = Uuid::new_v4();
    let batch = vec![sample_message(session_id)];
    archivist
        .append_messages(session_id, batch, Some("b".into()))
        .await
        .unwrap();

    // `b` is written as the explicitly chosen primary; `a` receives the same
    // message as a secondary through fan-out.
    assert_eq!(first.appended_count(session_id), 1);
    assert_eq!(second.appended_count(session_id), 1);
}

Some files were not shown because too many files have changed in this diff Show More