sync from monorepo @ 2452e92e
This commit is contained in:
@@ -0,0 +1,30 @@
|
||||
# Package: dirigent_codex
|
||||
|
||||
Pure-Rust parser for OpenAI Codex JSONL session files.
|
||||
|
||||
## Scope
|
||||
|
||||
- `parse_file(path)` — reads one `*.jsonl` session file on disk and
|
||||
returns a `ParsedSession`.
|
||||
- `discover_sessions(dir)` — scans a directory (e.g.
|
||||
`~/.codex/sessions/`) for session files.
|
||||
- Types: `ParsedSession`, `ParsedMessage`.
|
||||
|
||||
No dirigent-specific types. `dirigent_archivist::import::sources::codex`
|
||||
consumes this crate and maps into the archivist's internal types.
|
||||
|
||||
## Example
|
||||
|
||||
```rust
|
||||
let sessions = dirigent_codex::discover_sessions(dir)?;
|
||||
for s in sessions {
|
||||
let session = dirigent_codex::parse_file(&s)?;
println!("{}: {} messages", session.native_id, session.messages.len());
|
||||
}
|
||||
```
|
||||
|
||||
## Failure modes
|
||||
|
||||
- Individual malformed JSONL lines are skipped where possible.
|
||||
- Files that cannot be opened or read return `ParseError::Io`; malformed JSON
  inside a file never fails the whole parse.
|
||||
- Unknown message shapes are preserved as best-effort text so no user
|
||||
data is silently lost.
|
||||
@@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "dirigent_codex"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
thiserror = "1"
|
||||
uuid = { version = "1", features = ["v4", "v7", "serde"] }
|
||||
|
||||
[dev-dependencies]
|
||||
tempfile = "3"
|
||||
@@ -0,0 +1,14 @@
|
||||
//! OpenAI Codex on-disk session parser. Zero dirigent-specific types.
|
||||
//!
|
||||
//! The Codex CLI persists its sessions as JSONL files under
|
||||
//! `~/.codex/sessions/*.jsonl` (or a caller-supplied equivalent). Each line
|
||||
//! is a best-effort event object with a `role`, some `content`, and an
|
||||
//! optional timestamp. Exact schema varies across Codex versions, so this
|
||||
//! parser is intentionally lenient: unknown/malformed lines are skipped,
|
||||
//! not failed.
|
||||
|
||||
pub mod parser;
|
||||
pub mod types;
|
||||
|
||||
pub use parser::{discover_sessions, parse_file, ParseError};
|
||||
pub use types::{ParsedMessage, ParsedSession};
|
||||
@@ -0,0 +1,274 @@
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
use chrono::{DateTime, TimeZone, Utc};
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::types::{ParsedMessage, ParsedSession};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ParseError {
|
||||
#[error("I/O: {0}")]
|
||||
Io(#[from] std::io::Error),
|
||||
#[error("JSON: {0}")]
|
||||
Json(#[from] serde_json::Error),
|
||||
#[error("not found: {0}")]
|
||||
NotFound(String),
|
||||
}
|
||||
|
||||
/// Walk `dir` (non-recursively) for Codex session JSONL files.
|
||||
///
|
||||
/// Returns a deterministically ordered list (lexical by path) of every
|
||||
/// `*.jsonl` file directly under `dir`. Returns `NotFound` if the directory
|
||||
/// itself doesn't exist.
|
||||
pub fn discover_sessions(dir: &Path) -> Result<Vec<PathBuf>, ParseError> {
|
||||
if !dir.exists() {
|
||||
return Err(ParseError::NotFound(dir.display().to_string()));
|
||||
}
|
||||
let mut out = Vec::new();
|
||||
for entry in std::fs::read_dir(dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
if path.is_file()
|
||||
&& path.extension().and_then(|s| s.to_str()) == Some("jsonl")
|
||||
{
|
||||
out.push(path);
|
||||
}
|
||||
}
|
||||
out.sort();
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
/// Parse a single Codex session JSONL file.
|
||||
///
|
||||
/// Malformed / unexpected lines are skipped (not fatal). Every line that
|
||||
/// exposes a `role` and a `content` is turned into a [`ParsedMessage`].
|
||||
/// The session's `created_at` / `updated_at` bracket the first and last
|
||||
/// message timestamps seen.
|
||||
pub fn parse_file(path: &Path) -> Result<ParsedSession, ParseError> {
|
||||
let text = std::fs::read_to_string(path)?;
|
||||
let native_id = path
|
||||
.file_stem()
|
||||
.and_then(|s| s.to_str())
|
||||
.unwrap_or("unknown")
|
||||
.to_string();
|
||||
|
||||
let mut messages = Vec::new();
|
||||
let mut created_at: Option<DateTime<Utc>> = None;
|
||||
let mut updated_at: Option<DateTime<Utc>> = None;
|
||||
|
||||
for line in text.lines() {
|
||||
if line.trim().is_empty() {
|
||||
continue;
|
||||
}
|
||||
let val: serde_json::Value = match serde_json::from_str(line) {
|
||||
Ok(v) => v,
|
||||
Err(_) => continue, // skip malformed lines
|
||||
};
|
||||
if let Some(msg) = extract_message(&val) {
|
||||
if let Some(ts) = msg.ts {
|
||||
if created_at.is_none() {
|
||||
created_at = Some(ts);
|
||||
}
|
||||
updated_at = Some(ts);
|
||||
}
|
||||
messages.push(msg);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ParsedSession {
|
||||
native_id,
|
||||
source_path: path.to_path_buf(),
|
||||
created_at,
|
||||
updated_at,
|
||||
messages,
|
||||
})
|
||||
}
|
||||
|
||||
/// Best-effort extraction of a [`ParsedMessage`] from an arbitrary JSONL
|
||||
/// event. Returns `None` if the shape doesn't carry a role + content.
|
||||
fn extract_message(val: &serde_json::Value) -> Option<ParsedMessage> {
|
||||
let role = val.get("role").and_then(|v| v.as_str()).map(String::from)?;
|
||||
let content = extract_content(val.get("content")?)?;
|
||||
let ts = extract_ts(val);
|
||||
Some(ParsedMessage {
|
||||
ts,
|
||||
role,
|
||||
content,
|
||||
metadata: val.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
/// Flatten a `content` field into a single string.
|
||||
///
|
||||
/// - string → as-is
|
||||
/// - array → strings joined by `\n`; objects with a `text` field use that,
|
||||
/// otherwise their raw JSON is stringified
|
||||
/// - object → `text` field if present, otherwise the raw JSON
|
||||
/// - null → `None`
|
||||
fn extract_content(content: &serde_json::Value) -> Option<String> {
|
||||
match content {
|
||||
serde_json::Value::String(s) => Some(s.clone()),
|
||||
serde_json::Value::Array(arr) => {
|
||||
let parts: Vec<String> = arr
|
||||
.iter()
|
||||
.filter_map(|p| {
|
||||
if let Some(s) = p.as_str() {
|
||||
Some(s.to_string())
|
||||
} else if let Some(t) = p.get("text").and_then(|v| v.as_str()) {
|
||||
Some(t.to_string())
|
||||
} else {
|
||||
Some(p.to_string())
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
Some(parts.join("\n"))
|
||||
}
|
||||
serde_json::Value::Object(_) => {
|
||||
if let Some(t) = content.get("text").and_then(|v| v.as_str()) {
|
||||
Some(t.to_string())
|
||||
} else {
|
||||
Some(content.to_string())
|
||||
}
|
||||
}
|
||||
serde_json::Value::Null => None,
|
||||
other => Some(other.to_string()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract a timestamp from one of several possible fields.
|
||||
///
|
||||
/// Accepts RFC 3339 strings or numeric unix-seconds (integer or float).
|
||||
fn extract_ts(val: &serde_json::Value) -> Option<DateTime<Utc>> {
|
||||
let candidate = val
|
||||
.get("ts")
|
||||
.or_else(|| val.get("timestamp"))
|
||||
.or_else(|| val.get("created_at"))
|
||||
.or_else(|| val.get("time"))?;
|
||||
|
||||
if let Some(s) = candidate.as_str() {
|
||||
if let Ok(dt) = DateTime::parse_from_rfc3339(s) {
|
||||
return Some(dt.with_timezone(&Utc));
|
||||
}
|
||||
}
|
||||
if let Some(f) = candidate.as_f64() {
|
||||
if f.is_finite() {
|
||||
let secs = f.trunc() as i64;
|
||||
let nanos = ((f - secs as f64) * 1_000_000_000.0).round() as u32;
|
||||
return Utc.timestamp_opt(secs, nanos).single();
|
||||
}
|
||||
}
|
||||
if let Some(i) = candidate.as_i64() {
|
||||
return Utc.timestamp_opt(i, 0).single();
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Create `name` inside `dir`, writing every entry of `lines` followed by
    /// a newline, and return the path of the new file.
    fn write_jsonl(dir: &Path, name: &str, lines: &[&str]) -> PathBuf {
        let target = dir.join(name);
        let mut body = String::new();
        for line in lines {
            body.push_str(line);
            body.push('\n');
        }
        std::fs::write(&target, body).unwrap();
        target
    }

    /// A directory that does not exist is reported as `NotFound`.
    #[test]
    fn discover_sessions_missing_dir_returns_not_found() {
        let missing = Path::new("/tmp/this/does/not/exist/ever");
        let outcome = discover_sessions(missing);
        assert!(matches!(
            outcome.expect_err("should fail"),
            ParseError::NotFound(_)
        ));
    }

    /// Only `*.jsonl` files are returned, in lexical order.
    #[test]
    fn discover_sessions_lists_only_jsonl() {
        let tmp = tempfile::tempdir().unwrap();
        for name in ["a.jsonl", "b.jsonl", "not-this.txt"] {
            write_jsonl(tmp.path(), name, &[]);
        }

        let found = discover_sessions(tmp.path()).unwrap();

        assert_eq!(found.len(), 2);
        assert!(found[0].ends_with("a.jsonl"));
        assert!(found[1].ends_with("b.jsonl"));
    }

    /// Two well-formed events yield two messages and distinct session bounds.
    #[test]
    fn parse_file_extracts_basic_messages() {
        let tmp = tempfile::tempdir().unwrap();
        let events = [
            r#"{"role":"user","content":"hi","ts":"2025-01-01T12:00:00Z"}"#,
            r#"{"role":"assistant","content":"hello","ts":"2025-01-01T12:00:01Z"}"#,
        ];
        let path = write_jsonl(tmp.path(), "session-abc.jsonl", &events);

        let session = parse_file(&path).unwrap();

        assert_eq!(session.native_id, "session-abc");
        assert_eq!(session.messages.len(), 2);
        assert_eq!(session.messages[0].role, "user");
        assert_eq!(session.messages[0].content, "hi");
        assert_eq!(session.messages[1].role, "assistant");
        assert!(session.created_at.is_some());
        assert!(session.updated_at.is_some());
        assert_ne!(session.created_at, session.updated_at);
    }

    /// Blank lines, non-JSON lines and role-less events are all ignored.
    #[test]
    fn parse_file_skips_malformed_and_empty_lines() {
        let tmp = tempfile::tempdir().unwrap();
        let events = [
            r#"{"role":"user","content":"hi"}"#,
            "",
            "not json at all",
            r#"{"garbled":true}"#, // no role/content → skipped
            r#"{"role":"assistant","content":"ok"}"#,
        ];
        let path = write_jsonl(tmp.path(), "session.jsonl", &events);

        let session = parse_file(&path).unwrap();

        assert_eq!(session.messages.len(), 2);
    }

    /// Array and object `content` shapes are flattened to plain text.
    #[test]
    fn parse_file_handles_content_array_and_object() {
        let tmp = tempfile::tempdir().unwrap();
        let events = [
            r#"{"role":"user","content":["a","b","c"]}"#,
            r#"{"role":"user","content":[{"text":"x"},{"text":"y"}]}"#,
            r#"{"role":"assistant","content":{"text":"nested"}}"#,
        ];
        let path = write_jsonl(tmp.path(), "session.jsonl", &events);

        let session = parse_file(&path).unwrap();

        assert_eq!(session.messages.len(), 3);
        let contents: Vec<&str> = session
            .messages
            .iter()
            .map(|m| m.content.as_str())
            .collect();
        assert_eq!(contents[0], "a\nb\nc");
        assert_eq!(contents[1], "x\ny");
        assert_eq!(contents[2], "nested");
    }

    /// An integer unix-seconds `ts` is accepted as a timestamp.
    #[test]
    fn parse_file_accepts_unix_ts() {
        let tmp = tempfile::tempdir().unwrap();
        let events = [r#"{"role":"user","content":"hi","ts":1735732800}"#];
        let path = write_jsonl(tmp.path(), "session.jsonl", &events);

        let session = parse_file(&path).unwrap();

        assert_eq!(session.messages.len(), 1);
        assert!(session.messages[0].ts.is_some());
    }
}
|
||||
@@ -0,0 +1,32 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::path::PathBuf;
|
||||
|
||||
/// A Codex session parsed from a single JSONL file.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ParsedSession {
|
||||
/// The native session id. For Codex this is the JSONL file stem.
|
||||
pub native_id: String,
|
||||
/// The source file the session was loaded from.
|
||||
pub source_path: PathBuf,
|
||||
/// First message timestamp seen (if any).
|
||||
pub created_at: Option<DateTime<Utc>>,
|
||||
/// Last message timestamp seen (if any).
|
||||
pub updated_at: Option<DateTime<Utc>>,
|
||||
/// Parsed messages in file order.
|
||||
pub messages: Vec<ParsedMessage>,
|
||||
}
|
||||
|
||||
/// A single message event from a Codex JSONL session file.
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ParsedMessage {
|
||||
/// Timestamp if one could be extracted (RFC 3339 or unix epoch).
|
||||
pub ts: Option<DateTime<Utc>>,
|
||||
/// Free-form role, e.g. "user", "assistant", "system", "tool".
|
||||
pub role: String,
|
||||
/// Best-effort concatenated text content.
|
||||
pub content: String,
|
||||
/// Raw event for provenance.
|
||||
#[serde(default)]
|
||||
pub metadata: serde_json::Value,
|
||||
}
|
||||
Reference in New Issue
Block a user