sync from monorepo @ 2452e92e

This commit is contained in:
2026-05-08 01:59:04 +02:00
commit b03dc15371
459 changed files with 129586 additions and 0 deletions
+30
View File
@@ -0,0 +1,30 @@
# Package: dirigent_codex
Pure-Rust parser for OpenAI Codex JSONL session files.
## Scope
- `parse_file(path)` — reads one `*.jsonl` session file on disk and
returns a `ParsedSession`.
- `discover_sessions(dir)` — scans a directory (e.g.
`~/.codex/sessions/`) for session files.
- Types: `ParsedSession`, `ParsedMessage`.
No dirigent-specific types. `dirigent_archivist::import::sources::codex`
consumes this crate and maps into the archivist's internal types.
## Example
```rust
for path in dirigent_codex::discover_sessions(dir)? {
    let s = dirigent_codex::parse_file(&path)?;
    println!("{}: {} messages", s.native_id, s.messages.len());
}
```
## Failure modes
- Individual malformed JSONL lines are skipped where possible.
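- Files that cannot be read at all return `ParseError::Io`; a missing
sessions directory returns `ParseError::NotFound`.
- `ParseError::Json` is reserved for JSON failures that cannot be skipped;
`parse_file` currently skips every malformed line instead of returning it.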
- Unknown message shapes are preserved as best-effort text so no user
data is silently lost.
+14
View File
@@ -0,0 +1,14 @@
[package]
name = "dirigent_codex"
version = "0.1.0"
edition = "2021"
[dependencies]
chrono = { version = "0.4", features = ["serde"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
thiserror = "1"
uuid = { version = "1", features = ["v4", "v7", "serde"] }
[dev-dependencies]
tempfile = "3"
+14
View File
@@ -0,0 +1,14 @@
//! OpenAI Codex on-disk session parser. Zero dirigent-specific types.
//!
//! The Codex CLI persists its sessions as JSONL files under
//! `~/.codex/sessions/*.jsonl` (or a caller-supplied equivalent). Each line
//! is expected to be a JSON event object with a `role`, some `content`, and
//! an optional timestamp. The exact schema varies across Codex versions, so
//! this parser is intentionally lenient: unknown or malformed lines are
//! skipped rather than treated as errors.
pub mod parser;
pub mod types;
pub use parser::{discover_sessions, parse_file, ParseError};
pub use types::{ParsedMessage, ParsedSession};
+274
View File
@@ -0,0 +1,274 @@
use std::path::{Path, PathBuf};
use chrono::{DateTime, TimeZone, Utc};
use thiserror::Error;
use crate::types::{ParsedMessage, ParsedSession};
/// Errors returned by this module's public entry points.
#[derive(Debug, Error)]
pub enum ParseError {
/// A filesystem operation failed (reading a directory, a directory entry,
/// or a session file). Produced via `?` from `std::io::Error`.
#[error("I/O: {0}")]
Io(#[from] std::io::Error),
/// A JSON (de)serialization failure.
///
/// NOTE(review): nothing in this module constructs this variant — malformed
/// lines are skipped by `parse_file` — so it appears to exist for callers
/// or future use; confirm before relying on it.
#[error("JSON: {0}")]
Json(#[from] serde_json::Error),
/// The requested path does not exist. The payload is the path rendered
/// with `Path::display`.
#[error("not found: {0}")]
NotFound(String),
}
/// List the Codex session files that sit directly inside `dir`.
///
/// Only entries that are files and whose extension is exactly `jsonl` are
/// returned; subdirectories are not descended into. The result is sorted
/// lexically by path so repeated calls yield the same order.
///
/// # Errors
///
/// - [`ParseError::NotFound`] when `dir` does not exist.
/// - [`ParseError::Io`] when the directory, or one of its entries, cannot
///   be read.
pub fn discover_sessions(dir: &Path) -> Result<Vec<PathBuf>, ParseError> {
    if !dir.exists() {
        return Err(ParseError::NotFound(dir.display().to_string()));
    }

    // A session file is a file (not a directory) with a `.jsonl` extension.
    let is_session_file = |candidate: &Path| {
        candidate.is_file()
            && candidate.extension().and_then(|ext| ext.to_str()) == Some("jsonl")
    };

    // Entry errors are kept in the stream so that `collect` stops at the
    // first one and `?` propagates it as `ParseError::Io`.
    let mut sessions = std::fs::read_dir(dir)?
        .map(|entry| entry.map(|e| e.path()))
        .filter(|candidate| match candidate {
            Ok(path) => is_session_file(path.as_path()),
            Err(_) => true,
        })
        .collect::<Result<Vec<PathBuf>, std::io::Error>>()?;

    sessions.sort();
    Ok(sessions)
}
/// Parse a single Codex session JSONL file.
///
/// The file is processed line by line. Blank lines, lines that are not
/// valid JSON (including lines containing invalid UTF-8), and JSON values
/// that do not expose both a `role` and a `content` are skipped rather than
/// treated as errors. Every remaining line becomes a [`ParsedMessage`], in
/// file order.
///
/// `native_id` is the file stem of `path` (or `"unknown"` if the stem is
/// missing or not valid UTF-8). `created_at` / `updated_at` are the
/// timestamps of the first and last messages *in file order* that carried
/// one; both stay `None` when no message has a timestamp.
///
/// # Errors
///
/// Returns [`ParseError::Io`] if the file cannot be read.
pub fn parse_file(path: &Path) -> Result<ParsedSession, ParseError> {
    // Read raw bytes instead of a `String`: a stray invalid UTF-8 sequence
    // then only invalidates its own line (it fails JSON parsing and is
    // skipped) instead of failing the whole file.
    let bytes = std::fs::read(path)?;
    let native_id = path
        .file_stem()
        .and_then(|s| s.to_str())
        .unwrap_or("unknown")
        .to_string();

    let mut messages = Vec::new();
    let mut created_at: Option<DateTime<Utc>> = None;
    let mut updated_at: Option<DateTime<Utc>> = None;

    for line in bytes.split(|b| *b == b'\n') {
        // Cheap pre-filter for blank lines (also covers the empty slice
        // after a trailing newline and a lone `\r` from CRLF files).
        if line.iter().all(u8::is_ascii_whitespace) {
            continue;
        }
        // JSON permits trailing whitespace, so a trailing `\r` is harmless.
        let val: serde_json::Value = match serde_json::from_slice(line) {
            Ok(v) => v,
            Err(_) => continue, // skip malformed lines
        };
        let msg = match extract_message(&val) {
            Some(m) => m,
            None => continue, // no role/content: not a message event
        };
        if let Some(ts) = msg.ts {
            // Only the first timestamp seen sets `created_at`.
            created_at.get_or_insert(ts);
            updated_at = Some(ts);
        }
        messages.push(msg);
    }

    Ok(ParsedSession {
        native_id,
        source_path: path.to_path_buf(),
        created_at,
        updated_at,
        messages,
    })
}
/// Best-effort extraction of a [`ParsedMessage`] from an arbitrary JSONL
/// event. Returns `None` if the shape doesn't carry a role + content.
fn extract_message(val: &serde_json::Value) -> Option<ParsedMessage> {
let role = val.get("role").and_then(|v| v.as_str()).map(String::from)?;
let content = extract_content(val.get("content")?)?;
let ts = extract_ts(val);
Some(ParsedMessage {
ts,
role,
content,
metadata: val.clone(),
})
}
/// Collapse a `content` value into one string.
///
/// - `null` yields `None`.
/// - An array yields its elements, each rendered by the rule below, joined
///   with `\n` (an empty array yields an empty string).
/// - Anything else is rendered by the rule below.
///
/// Rendering rule for a single value: a JSON string is used verbatim; a
/// value with a string `text` field uses that field; every other value is
/// rendered as its JSON text.
fn extract_content(content: &serde_json::Value) -> Option<String> {
    /// Render one non-array value (or one array element) as text.
    fn render(value: &serde_json::Value) -> String {
        value
            .as_str()
            .or_else(|| value.get("text").and_then(|t| t.as_str()))
            .map(str::to_string)
            .unwrap_or_else(|| value.to_string())
    }

    match content {
        serde_json::Value::Null => None,
        serde_json::Value::Array(items) => {
            let rendered: Vec<String> = items.iter().map(render).collect();
            Some(rendered.join("\n"))
        }
        single => Some(render(single)),
    }
}
/// Extract a timestamp from one of several possible fields.
///
/// The keys `ts`, `timestamp`, `created_at` and `time` are probed in that
/// order, and the first one whose value can actually be decoded wins. A key
/// that is present but null or unparseable no longer hides a valid later
/// key.
///
/// Accepts RFC 3339 strings or numeric unix-seconds (integer or float).
/// Returns `None` when no candidate field holds a decodable timestamp.
fn extract_ts(val: &serde_json::Value) -> Option<DateTime<Utc>> {
    ["ts", "timestamp", "created_at", "time"]
        .iter()
        .filter_map(|key| val.get(*key))
        .find_map(parse_ts_value)
}

/// Decode a single JSON value as a UTC timestamp (RFC 3339 string or unix seconds).
fn parse_ts_value(candidate: &serde_json::Value) -> Option<DateTime<Utc>> {
    if let Some(s) = candidate.as_str() {
        return DateTime::parse_from_rfc3339(s)
            .ok()
            .map(|dt| dt.with_timezone(&Utc));
    }

    // Integers first: `as_f64` succeeds for every JSON number, so checking
    // it first would route integers through a lossy float conversion.
    if let Some(secs) = candidate.as_i64() {
        return Utc.timestamp_opt(secs, 0).single();
    }

    let f = candidate.as_f64().filter(|f| f.is_finite())?;
    // Split with `floor` so the fractional part is always in `[0, 1)`, even
    // for negative timestamps (truncation would give a negative fraction,
    // which cannot be represented as `u32` nanoseconds).
    let whole = f.floor();
    let mut secs = whole as i64; // saturates for out-of-range values; rejected below
    let mut nanos = ((f - whole) * 1_000_000_000.0).round() as u32;
    // Rounding can land exactly on one full second; carry it over instead
    // of handing chrono a nanosecond value it only accepts for leap seconds.
    if nanos >= 1_000_000_000 {
        secs = secs.checked_add(1)?;
        nanos -= 1_000_000_000;
    }
    Utc.timestamp_opt(secs, nanos).single()
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Create file `name` inside `dir` holding `lines` (one per line) and
    /// return its path.
    fn write_jsonl(dir: &Path, name: &str, lines: &[&str]) -> PathBuf {
        let path = dir.join(name);
        let mut f = std::fs::File::create(&path).unwrap();
        for line in lines {
            writeln!(f, "{}", line).unwrap();
        }
        path
    }

    #[test]
    fn discover_sessions_missing_dir_returns_not_found() {
        // Build the missing path inside a fresh temp dir so the test does
        // not depend on the host's filesystem layout.
        let tmp = tempfile::tempdir().unwrap();
        let missing = tmp.path().join("does-not-exist");
        let err = discover_sessions(&missing).expect_err("should fail");
        assert!(matches!(err, ParseError::NotFound(_)));
    }

    #[test]
    fn discover_sessions_lists_only_jsonl() {
        let tmp = tempfile::tempdir().unwrap();
        let _ = write_jsonl(tmp.path(), "a.jsonl", &[]);
        let _ = write_jsonl(tmp.path(), "b.jsonl", &[]);
        let _ = write_jsonl(tmp.path(), "not-this.txt", &[]);

        let found = discover_sessions(tmp.path()).unwrap();
        assert_eq!(found.len(), 2);
        assert!(found[0].ends_with("a.jsonl"));
        assert!(found[1].ends_with("b.jsonl"));
    }

    #[test]
    fn parse_file_extracts_basic_messages() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_jsonl(
            tmp.path(),
            "session-abc.jsonl",
            &[
                r#"{"role":"user","content":"hi","ts":"2025-01-01T12:00:00Z"}"#,
                r#"{"role":"assistant","content":"hello","ts":"2025-01-01T12:00:01Z"}"#,
            ],
        );

        let session = parse_file(&path).unwrap();
        assert_eq!(session.native_id, "session-abc");
        assert_eq!(session.messages.len(), 2);
        assert_eq!(session.messages[0].role, "user");
        assert_eq!(session.messages[0].content, "hi");
        assert_eq!(session.messages[1].role, "assistant");
        assert!(session.created_at.is_some());
        assert!(session.updated_at.is_some());
        assert_ne!(session.created_at, session.updated_at);
        // The session bounds follow file order: first message, then last.
        assert!(session.created_at < session.updated_at);
    }

    #[test]
    fn parse_file_skips_malformed_and_empty_lines() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_jsonl(
            tmp.path(),
            "session.jsonl",
            &[
                r#"{"role":"user","content":"hi"}"#,
                "",
                "not json at all",
                r#"{"garbled":true}"#, // no role/content → skipped
                r#"{"role":"assistant","content":"ok"}"#,
            ],
        );

        let session = parse_file(&path).unwrap();
        assert_eq!(session.messages.len(), 2);
    }

    #[test]
    fn parse_file_handles_content_array_and_object() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_jsonl(
            tmp.path(),
            "session.jsonl",
            &[
                r#"{"role":"user","content":["a","b","c"]}"#,
                r#"{"role":"user","content":[{"text":"x"},{"text":"y"}]}"#,
                r#"{"role":"assistant","content":{"text":"nested"}}"#,
            ],
        );

        let session = parse_file(&path).unwrap();
        assert_eq!(session.messages.len(), 3);
        assert_eq!(session.messages[0].content, "a\nb\nc");
        assert_eq!(session.messages[1].content, "x\ny");
        assert_eq!(session.messages[2].content, "nested");
    }

    #[test]
    fn parse_file_accepts_unix_ts() {
        let tmp = tempfile::tempdir().unwrap();
        let path = write_jsonl(
            tmp.path(),
            "session.jsonl",
            &[r#"{"role":"user","content":"hi","ts":1735732800}"#],
        );

        let session = parse_file(&path).unwrap();
        assert_eq!(session.messages.len(), 1);
        // Check the decoded value, not just that something was decoded.
        let ts = session.messages[0].ts.expect("unix timestamp should parse");
        assert_eq!(ts.timestamp(), 1_735_732_800);
    }
}
+32
View File
@@ -0,0 +1,32 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
/// A Codex session parsed from a single JSONL file.
///
/// Produced by the parser's `parse_file`; one value corresponds to exactly
/// one file on disk.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsedSession {
/// The native session id. For Codex this is the JSONL file stem
/// (`"unknown"` when the stem is missing or not valid UTF-8).
pub native_id: String,
/// The source file the session was loaded from.
pub source_path: PathBuf,
/// Timestamp of the first message, in file order, that carried one.
/// `None` when no message had a timestamp. This is not necessarily the
/// earliest timestamp in the file.
pub created_at: Option<DateTime<Utc>>,
/// Timestamp of the last message, in file order, that carried one.
/// `None` when no message had a timestamp. This is not necessarily the
/// latest timestamp in the file.
pub updated_at: Option<DateTime<Utc>>,
/// Parsed messages in file order.
pub messages: Vec<ParsedMessage>,
}
/// A single message event from a Codex JSONL session file.
///
/// Only events exposing both a `role` and a `content` become messages.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsedMessage {
/// Timestamp if one could be extracted (RFC 3339 string or unix epoch
/// seconds), normalized to UTC.
pub ts: Option<DateTime<Utc>>,
/// Free-form role, e.g. "user", "assistant", "system", "tool". Copied
/// verbatim from the event; not validated against a fixed set.
pub role: String,
/// Best-effort concatenated text content. Array contents are joined with
/// newlines; non-text values are kept as their JSON text.
pub content: String,
/// Raw event for provenance: a clone of the full JSON value the message
/// was extracted from. Defaults to JSON `null` when absent on
/// deserialization.
#[serde(default)]
pub metadata: serde_json::Value,
}