sync from monorepo @ 2452e92e

This commit is contained in:
2026-05-08 01:59:04 +02:00
commit b03dc15371
459 changed files with 129586 additions and 0 deletions
+32
View File
@@ -0,0 +1,32 @@
# Package: dirigent_chatgpt
Pure-Rust parser for OpenAI's ChatGPT `conversations.json` data export.
## Scope
- `parse_export(path)` — reads a `conversations.json` file on disk and
returns `Vec<ParsedConversation>`.
- `parse_str(json)` — parses an in-memory JSON string (useful for tests
and piped inputs).
- Types: `ParsedConversation`, `ParsedMessage`, `ContentPart` (`Text`,
`Code`, `Tool`).
No dirigent-specific types. `dirigent_archivist::import::sources::chatgpt`
consumes this crate and maps into the archivist's internal types.
## Example
```rust
let convs = dirigent_chatgpt::parse_export(path)?;
for c in convs {
println!("{}: {} messages", c.title.as_deref().unwrap_or("(untitled)"), c.messages.len());
}
```
## Failure modes
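- Truly broken JSON → `ParseError::Json`; valid JSON whose top level is not an array → `ParseError::UnsupportedShape`; an unreadable file → `ParseError::Io`.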
- Malformed individual messages are skipped where possible.
- Unknown content shapes are preserved as best-effort text in
`ContentPart::Text { text: raw_json }` so no user data is silently
lost.
+13
View File
@@ -0,0 +1,13 @@
[package]
name = "dirigent_chatgpt"
version = "0.1.0"
edition = "2021"
[dependencies]
chrono = { version = "0.4", features = ["serde"] }
serde = { version = "1", features = ["derive"] }
serde_json = "1"
thiserror = "1"
uuid = { version = "1", features = ["v4", "v7", "serde"] }
[dev-dependencies]
+7
View File
@@ -0,0 +1,7 @@
//! ChatGPT export parser. Zero dirigent-specific types.
pub mod parser;
pub mod types;
pub use parser::{parse_export, parse_str, ParseError};
pub use types::{ContentPart, ParsedConversation, ParsedMessage};
+349
View File
@@ -0,0 +1,349 @@
use std::path::Path;
use chrono::{DateTime, TimeZone, Utc};
use thiserror::Error;
use crate::types::{ContentPart, ParsedConversation, ParsedMessage};
#[derive(Debug, Error)]
pub enum ParseError {
#[error("I/O: {0}")] Io(#[from] std::io::Error),
#[error("JSON: {0}")] Json(#[from] serde_json::Error),
#[error("unsupported shape: {0}")] UnsupportedShape(String),
}
/// Parse a ChatGPT `conversations.json` file into a list of conversations.
pub fn parse_export(path: &Path) -> Result<Vec<ParsedConversation>, ParseError> {
let text = std::fs::read_to_string(path)?;
parse_str(&text)
}
/// Parse a JSON string of conversations.
pub fn parse_str(json: &str) -> Result<Vec<ParsedConversation>, ParseError> {
// ChatGPT conversations.json is a JSON array of conversation objects.
let root: serde_json::Value = serde_json::from_str(json)?;
let arr = root.as_array()
.ok_or_else(|| ParseError::UnsupportedShape("expected JSON array at root".into()))?;
let mut out = Vec::with_capacity(arr.len());
for conv in arr {
out.push(convert_conversation(conv)?);
}
Ok(out)
}
fn convert_conversation(conv: &serde_json::Value) -> Result<ParsedConversation, ParseError> {
let id = conv.get("id")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let title = conv.get("title").and_then(|v| v.as_str()).map(String::from);
let created_at = conv.get("create_time").and_then(parse_unix_time);
let updated_at = conv.get("update_time").and_then(parse_unix_time);
// Walk the mapping tree if present; otherwise return empty messages.
let messages = if let Some(mapping) = conv.get("mapping").and_then(|v| v.as_object()) {
walk_mapping(mapping)
} else {
Vec::new()
};
// Preserve whatever metadata we can't otherwise capture.
let mut metadata = serde_json::Map::new();
for key in &["conversation_id", "gizmo_id", "model", "default_model_slug", "moderation_results"] {
if let Some(v) = conv.get(*key) {
metadata.insert((*key).to_string(), v.clone());
}
}
Ok(ParsedConversation {
id,
title,
created_at,
updated_at,
messages,
metadata: if metadata.is_empty() {
serde_json::Value::Null
} else {
serde_json::Value::Object(metadata)
},
})
}
/// Flatten a conversation's `mapping` tree into a list of messages.
///
/// Traversal is a pre-order depth-first walk (see `dfs_collect`), run in three
/// passes that share one `visited` set so every node is emitted at most once:
///
/// 1. **Roots by `parent`.** A node is a root when its `parent` is explicitly
///    `null`. A node with no `parent` key at all carries no parent signal, so
///    it is a root only if no other node lists it under `children`.
/// 2. **Unreferenced nodes.** Any still-unvisited node that nobody lists as a
///    child is the top of a detached subtree and is walked next. This also
///    covers the case where pass 1 found no roots.
/// 3. **Everything else.** Nodes that are still unvisited (e.g. members of a
///    cycle) are walked in map iteration order so no message is dropped.
///
/// When several start nodes exist within a pass they are visited in the
/// iteration order of `mapping`.
fn walk_mapping(
    mapping: &serde_json::Map<String, serde_json::Value>,
) -> Vec<ParsedMessage> {
    use std::collections::HashSet;

    // Every id that appears in some node's `children` array.
    let referenced: HashSet<&str> = mapping
        .values()
        .filter_map(|node| node.get("children").and_then(|c| c.as_array()))
        .flatten()
        .filter_map(|child| child.as_str())
        .collect();

    let roots: Vec<&str> = mapping
        .iter()
        .filter(|(id, node)| match node.get("parent") {
            Some(serde_json::Value::Null) => true,
            // Missing key: treating it as a root unconditionally would make
            // every node of a parent-less export a root and let map key order
            // dictate message order, so consult the child set instead.
            None => !referenced.contains(id.as_str()),
            Some(_) => false,
        })
        .map(|(id, _)| id.as_str())
        .collect();

    let mut out: Vec<ParsedMessage> = Vec::new();
    let mut visited: HashSet<String> = HashSet::new();

    // Pass 1: roots identified through the `parent` signal.
    for root_id in roots {
        dfs_collect(mapping, root_id, &mut out, &mut visited);
    }
    // Pass 2: tops of subtrees that pass 1 could not reach.
    for id in mapping.keys().filter(|id| !referenced.contains(id.as_str())) {
        dfs_collect(mapping, id, &mut out, &mut visited);
    }
    // Pass 3: whatever is left; already-visited ids return immediately.
    for id in mapping.keys() {
        dfs_collect(mapping, id, &mut out, &mut visited);
    }
    out
}
fn dfs_collect(
mapping: &serde_json::Map<String, serde_json::Value>,
node_id: &str,
out: &mut Vec<ParsedMessage>,
visited: &mut std::collections::HashSet<String>,
) {
if !visited.insert(node_id.to_string()) {
return;
}
let node = match mapping.get(node_id) {
Some(n) => n,
None => return,
};
if let Some(msg) = node.get("message") {
if !msg.is_null() {
if let Some(parsed) = parse_mapping_message(msg) {
out.push(parsed);
}
}
}
// Children, sorted by create_time when available for deterministic order.
if let Some(children) = node.get("children").and_then(|c| c.as_array()) {
let mut child_refs: Vec<(&str, Option<f64>)> = children
.iter()
.filter_map(|v| v.as_str())
.map(|id| {
let ct = mapping
.get(id)
.and_then(|n| n.get("message"))
.and_then(|m| m.get("create_time"))
.and_then(|v| v.as_f64());
(id, ct)
})
.collect();
child_refs.sort_by(|a, b| match (a.1, b.1) {
(Some(x), Some(y)) => x.partial_cmp(&y).unwrap_or(std::cmp::Ordering::Equal),
(Some(_), None) => std::cmp::Ordering::Less,
(None, Some(_)) => std::cmp::Ordering::Greater,
(None, None) => std::cmp::Ordering::Equal,
});
for (child_id, _) in child_refs {
dfs_collect(mapping, child_id, out, visited);
}
}
}
fn parse_mapping_message(msg: &serde_json::Value) -> Option<ParsedMessage> {
let id = msg.get("id").and_then(|v| v.as_str()).unwrap_or("").to_string();
let role = msg
.get("author")
.and_then(|a| a.get("role"))
.and_then(|r| r.as_str())
.unwrap_or("unknown")
.to_string();
let ts = msg.get("create_time").and_then(parse_unix_time);
let content = msg
.get("content")
.map(content_to_parts)
.unwrap_or_default();
// Skip purely empty system placeholders (common at the root of chats).
if content.iter().all(|p| is_part_empty(p)) && role == "system" {
return None;
}
let mut metadata = serde_json::Map::new();
if let Some(m) = msg.get("metadata").and_then(|v| v.as_object()) {
for (k, v) in m {
metadata.insert(k.clone(), v.clone());
}
}
if let Some(author) = msg.get("author").and_then(|v| v.as_object()) {
if let Some(name) = author.get("name") {
metadata.insert("author_name".to_string(), name.clone());
}
}
Some(ParsedMessage {
id,
role,
ts,
content,
metadata: if metadata.is_empty() {
serde_json::Value::Null
} else {
serde_json::Value::Object(metadata)
},
})
}
/// Report whether a content part carries no visible text.
///
/// Text and code parts are empty when their text is blank after trimming
/// whitespace; tool parts are never considered empty.
fn is_part_empty(p: &ContentPart) -> bool {
    let text = match p {
        ContentPart::Tool { .. } => return false,
        ContentPart::Text { text } | ContentPart::Code { text, .. } => text,
    };
    text.trim().is_empty()
}
/// Normalise a message's `content` value into a list of [`ContentPart`]s.
///
/// Resolution order:
/// 1. A `parts` array: each element is converted with `part_to_content_part`,
///    using `content_type` (default `"text"`) as the outer type.
/// 2. `content_type == "code"`: a single code part built from `language` and
///    `text` (empty text when absent).
/// 3. A string `text` field, then a string `result` field: a single text part.
/// 4. Anything else: a single text part holding the value's raw JSON.
fn content_to_parts(content: &serde_json::Value) -> Vec<ContentPart> {
    /// Fetch `key` from `v` when it is present and holds a string.
    fn str_of<'a>(v: &'a serde_json::Value, key: &str) -> Option<&'a str> {
        v.get(key).and_then(|field| field.as_str())
    }

    let content_type = str_of(content, "content_type").unwrap_or("text");

    if let Some(parts) = content.get("parts").and_then(|v| v.as_array()) {
        let mut converted = Vec::with_capacity(parts.len());
        for part in parts {
            converted.push(part_to_content_part(part, content_type));
        }
        return converted;
    }

    if content_type == "code" {
        return vec![ContentPart::Code {
            language: str_of(content, "language").map(str::to_owned),
            text: str_of(content, "text").unwrap_or_default().to_owned(),
        }];
    }

    // Shapes without `parts` keep their payload in `text` or `result`; when
    // neither is a string, fall back to the serialized JSON of the whole blob.
    let text = str_of(content, "text")
        .or_else(|| str_of(content, "result"))
        .map(str::to_owned)
        .unwrap_or_else(|| content.to_string());
    vec![ContentPart::Text { text }]
}
/// Convert one element of a `parts` array into a [`ContentPart`].
///
/// * A JSON string becomes a code part (no language) when `outer_type` is
///   `"code"`, otherwise a text part.
/// * A JSON object is inspected in this order: a string `text` field gives a
///   text part; a string `name` (or, if `name` is absent, `tool`) together
///   with an `input` field gives a tool part; a string `asset_pointer` gives a
///   text part of the form `[asset: …]`.
/// * Everything else is kept as a text part containing the raw JSON.
fn part_to_content_part(part: &serde_json::Value, outer_type: &str) -> ContentPart {
    match part {
        serde_json::Value::String(s) => {
            if outer_type == "code" {
                ContentPart::Code {
                    language: None,
                    text: s.clone(),
                }
            } else {
                ContentPart::Text { text: s.clone() }
            }
        }
        serde_json::Value::Object(obj) => {
            let string_at = |key: &str| obj.get(key).and_then(|v| v.as_str());

            if let Some(text) = string_at("text") {
                return ContentPart::Text {
                    text: text.to_string(),
                };
            }

            // Best-effort tool detection; the export's tool shape varies.
            let tool_name = obj
                .get("name")
                .or_else(|| obj.get("tool"))
                .and_then(|v| v.as_str());
            if let (Some(name), Some(input)) = (tool_name, obj.get("input")) {
                return ContentPart::Tool {
                    name: name.to_string(),
                    input: input.clone(),
                    output: obj.get("output").cloned(),
                };
            }

            if let Some(asset) = string_at("asset_pointer") {
                return ContentPart::Text {
                    text: format!("[asset: {}]", asset),
                };
            }

            ContentPart::Text {
                text: part.to_string(),
            }
        }
        _ => ContentPart::Text {
            text: part.to_string(),
        },
    }
}
/// Parse a ChatGPT unix-seconds timestamp (integer or float JSON number).
///
/// Returns `None` when the value is not a number (including `null`), is not
/// finite, or lies outside the range chrono can represent.
///
/// The value is split into whole seconds and a nanosecond remainder using
/// `floor`, so the remainder is always in `[0, 1)` second, also for negative
/// timestamps. If rounding the remainder reaches a full second it is carried
/// into the seconds component instead of being passed on as an out-of-range
/// nanosecond count.
fn parse_unix_time(v: &serde_json::Value) -> Option<DateTime<Utc>> {
    const NANOS_PER_SEC: u32 = 1_000_000_000;

    let seconds = v.as_f64()?;
    if !seconds.is_finite() {
        return None;
    }

    let whole = seconds.floor();
    // `as` saturates for out-of-range floats; `timestamp_opt` rejects those.
    let mut secs = whole as i64;
    let mut nanos = ((seconds - whole) * f64::from(NANOS_PER_SEC)).round() as u32;
    if nanos >= NANOS_PER_SEC {
        secs = secs.checked_add(1)?;
        nanos = 0;
    }
    Utc.timestamp_opt(secs, nanos).single()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse the shared fixture; the path is relative to the working
    /// directory the tests are run from.
    fn load_minimal() -> Vec<ParsedConversation> {
        let fixture = std::path::Path::new("tests/fixtures/minimal.json");
        parse_export(fixture).expect("parse")
    }

    #[test]
    fn parses_minimal_fixture() {
        let convs = load_minimal();
        assert_eq!(convs.len(), 1);

        let conv = &convs[0];
        assert_eq!(conv.title.as_deref(), Some("Hello"));
        assert_eq!(conv.messages.len(), 2);

        let roles: Vec<&str> = conv.messages.iter().map(|m| m.role.as_str()).collect();
        assert_eq!(roles, ["user", "assistant"]);
    }

    #[test]
    fn parses_timestamps() {
        let convs = load_minimal();
        let conv = &convs[0];
        assert!(conv.created_at.is_some());
        assert!(conv.updated_at.is_some());
        // Each message's timestamp comes from its own `create_time`.
        for msg in &conv.messages[..2] {
            assert!(msg.ts.is_some());
        }
    }

    #[test]
    fn text_content_extracted() {
        let convs = load_minimal();
        let first = &convs[0].messages[0];
        assert_eq!(first.content.len(), 1);

        let part = &first.content[0];
        if let ContentPart::Text { text } = part {
            assert_eq!(text, "Hello, world");
        } else {
            panic!("expected Text, got {:?}", part);
        }
    }
}
+31
View File
@@ -0,0 +1,31 @@
use chrono::{DateTime, Utc};
use serde::{Deserialize, Serialize};
/// One conversation extracted from a ChatGPT export.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsedConversation {
    /// ChatGPT's identifier for the conversation (hex-ish; may or may not be
    /// a UUID).
    pub id: String,
    /// Conversation title, when the export provides one.
    pub title: Option<String>,
    /// Creation time, when the export provides a usable timestamp.
    pub created_at: Option<DateTime<Utc>>,
    /// Last-update time, when the export provides a usable timestamp.
    pub updated_at: Option<DateTime<Utc>>,
    /// Messages of the conversation, in the order the parser emitted them.
    pub messages: Vec<ParsedMessage>,
    /// Extra export fields preserved as raw JSON; defaults to `Null` when the
    /// field is absent during deserialization.
    #[serde(default)]
    pub metadata: serde_json::Value,
}
/// One message within a [`ParsedConversation`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParsedMessage {
    /// Identifier of the message as found in the export.
    pub id: String,
    /// Author role: "user" | "assistant" | "system" | "tool".
    pub role: String,
    /// Time the message was created, when the export provides a usable
    /// timestamp.
    pub ts: Option<DateTime<Utc>>,
    /// Message body, split into typed parts.
    pub content: Vec<ContentPart>,
    /// Extra per-message fields preserved as raw JSON; defaults to `Null`
    /// when the field is absent during deserialization.
    #[serde(default)]
    pub metadata: serde_json::Value,
}
/// A typed fragment of a message body.
///
/// Serialized with an internal `type` tag whose value is the variant name in
/// snake_case.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ContentPart {
    /// Plain text.
    Text {
        text: String,
    },
    /// Source code, with its language when known.
    Code {
        language: Option<String>,
        text: String,
    },
    /// A tool invocation: the tool's name, its input, and its output if any.
    Tool {
        name: String,
        input: serde_json::Value,
        output: Option<serde_json::Value>,
    },
}
+31
View File
@@ -0,0 +1,31 @@
[
{
"id": "c1",
"title": "Hello",
"create_time": 1700000000.0,
"update_time": 1700000100.0,
"mapping": {
"root": { "id": "root", "message": null, "children": ["m1"] },
"m1": {
"id": "m1",
"message": {
"id": "m1",
"author": { "role": "user" },
"create_time": 1700000010.0,
"content": { "content_type": "text", "parts": ["Hello, world"] }
},
"children": ["m2"]
},
"m2": {
"id": "m2",
"message": {
"id": "m2",
"author": { "role": "assistant" },
"create_time": 1700000020.0,
"content": { "content_type": "text", "parts": ["Hi!"] }
},
"children": []
}
}
}
]