sync from monorepo @ 2452e92e
This commit is contained in:
@@ -0,0 +1,32 @@
|
||||
# Package: dirigent_chatgpt
|
||||
|
||||
Pure-Rust parser for OpenAI's ChatGPT `conversations.json` data export.
|
||||
|
||||
## Scope
|
||||
|
||||
- `parse_export(path)` — reads a `conversations.json` file on disk and
|
||||
returns `Vec<ParsedConversation>`.
|
||||
- `parse_str(json)` — parses an in-memory JSON string (useful for tests
|
||||
and piped inputs).
|
||||
- Types: `ParsedConversation`, `ParsedMessage`, `ContentPart` (`Text`,
|
||||
`Code`, `Tool`).
|
||||
|
||||
No dirigent-specific types. `dirigent_archivist::import::sources::chatgpt`
|
||||
consumes this crate and maps into the archivist's internal types.
|
||||
|
||||
## Example
|
||||
|
||||
```rust
|
||||
let convs = dirigent_chatgpt::parse_export(path)?;
|
||||
for c in convs {
|
||||
println!("{}: {} messages", c.title.as_deref().unwrap_or("(untitled)"), c.messages.len());
|
||||
}
|
||||
```
|
||||
|
||||
## Failure modes
|
||||
|
||||
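- Truly broken JSON → `ParseError::Json`; valid JSON whose root is not an array → `ParseError::UnsupportedShape`; a file that cannot be read → `ParseError::Io`.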
|
||||
- Malformed individual messages are skipped where possible.
|
||||
- Unknown content shapes are preserved as best-effort text in
|
||||
`ContentPart::Text { text: raw_json }` so no user data is silently
|
||||
lost.
|
||||
@@ -0,0 +1,13 @@
|
||||
[package]
|
||||
name = "dirigent_chatgpt"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
chrono = { version = "0.4", features = ["serde"] }
|
||||
serde = { version = "1", features = ["derive"] }
|
||||
serde_json = "1"
|
||||
thiserror = "1"
|
||||
uuid = { version = "1", features = ["v4", "v7", "serde"] }
|
||||
|
||||
[dev-dependencies]
|
||||
@@ -0,0 +1,7 @@
|
||||
//! ChatGPT export parser. Zero dirigent-specific types.
|
||||
|
||||
pub mod parser;
|
||||
pub mod types;
|
||||
|
||||
pub use parser::{parse_export, parse_str, ParseError};
|
||||
pub use types::{ContentPart, ParsedConversation, ParsedMessage};
|
||||
@@ -0,0 +1,349 @@
|
||||
use std::path::Path;
|
||||
use chrono::{DateTime, TimeZone, Utc};
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::types::{ContentPart, ParsedConversation, ParsedMessage};
|
||||
|
||||
#[derive(Debug, Error)]
|
||||
pub enum ParseError {
|
||||
#[error("I/O: {0}")] Io(#[from] std::io::Error),
|
||||
#[error("JSON: {0}")] Json(#[from] serde_json::Error),
|
||||
#[error("unsupported shape: {0}")] UnsupportedShape(String),
|
||||
}
|
||||
|
||||
/// Parse a ChatGPT `conversations.json` file into a list of conversations.
|
||||
pub fn parse_export(path: &Path) -> Result<Vec<ParsedConversation>, ParseError> {
|
||||
let text = std::fs::read_to_string(path)?;
|
||||
parse_str(&text)
|
||||
}
|
||||
|
||||
/// Parse a JSON string of conversations.
|
||||
pub fn parse_str(json: &str) -> Result<Vec<ParsedConversation>, ParseError> {
|
||||
// ChatGPT conversations.json is a JSON array of conversation objects.
|
||||
let root: serde_json::Value = serde_json::from_str(json)?;
|
||||
let arr = root.as_array()
|
||||
.ok_or_else(|| ParseError::UnsupportedShape("expected JSON array at root".into()))?;
|
||||
let mut out = Vec::with_capacity(arr.len());
|
||||
for conv in arr {
|
||||
out.push(convert_conversation(conv)?);
|
||||
}
|
||||
Ok(out)
|
||||
}
|
||||
|
||||
fn convert_conversation(conv: &serde_json::Value) -> Result<ParsedConversation, ParseError> {
|
||||
let id = conv.get("id")
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or("")
|
||||
.to_string();
|
||||
let title = conv.get("title").and_then(|v| v.as_str()).map(String::from);
|
||||
let created_at = conv.get("create_time").and_then(parse_unix_time);
|
||||
let updated_at = conv.get("update_time").and_then(parse_unix_time);
|
||||
|
||||
// Walk the mapping tree if present; otherwise return empty messages.
|
||||
let messages = if let Some(mapping) = conv.get("mapping").and_then(|v| v.as_object()) {
|
||||
walk_mapping(mapping)
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
|
||||
// Preserve whatever metadata we can't otherwise capture.
|
||||
let mut metadata = serde_json::Map::new();
|
||||
for key in &["conversation_id", "gizmo_id", "model", "default_model_slug", "moderation_results"] {
|
||||
if let Some(v) = conv.get(*key) {
|
||||
metadata.insert((*key).to_string(), v.clone());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(ParsedConversation {
|
||||
id,
|
||||
title,
|
||||
created_at,
|
||||
updated_at,
|
||||
messages,
|
||||
metadata: if metadata.is_empty() {
|
||||
serde_json::Value::Null
|
||||
} else {
|
||||
serde_json::Value::Object(metadata)
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
/// Walk the `mapping` tree starting at the root (parent=null), DFS in
|
||||
/// `create_time` order, collecting non-null messages.
|
||||
fn walk_mapping(
|
||||
mapping: &serde_json::Map<String, serde_json::Value>,
|
||||
) -> Vec<ParsedMessage> {
|
||||
// Find roots: nodes with parent == null, or (fallback) nodes not
|
||||
// referenced as a child by any other node.
|
||||
let mut roots: Vec<&str> = mapping
|
||||
.iter()
|
||||
.filter_map(|(id, node)| {
|
||||
let parent = node.get("parent");
|
||||
let is_root = match parent {
|
||||
None => true,
|
||||
Some(serde_json::Value::Null) => true,
|
||||
_ => false,
|
||||
};
|
||||
if is_root {
|
||||
Some(id.as_str())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Fallback: if we didn't find any via the `parent` signal, derive from
|
||||
// the child-set (a root is a node that nobody lists as a child).
|
||||
if roots.is_empty() {
|
||||
let mut referenced: std::collections::HashSet<&str> = std::collections::HashSet::new();
|
||||
for node in mapping.values() {
|
||||
if let Some(children) = node.get("children").and_then(|c| c.as_array()) {
|
||||
for child in children {
|
||||
if let Some(s) = child.as_str() {
|
||||
referenced.insert(s);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
roots = mapping
|
||||
.keys()
|
||||
.filter(|k| !referenced.contains(k.as_str()))
|
||||
.map(|s| s.as_str())
|
||||
.collect();
|
||||
}
|
||||
|
||||
let mut out: Vec<ParsedMessage> = Vec::new();
|
||||
let mut visited: std::collections::HashSet<String> = std::collections::HashSet::new();
|
||||
for root_id in roots {
|
||||
dfs_collect(mapping, root_id, &mut out, &mut visited);
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
fn dfs_collect(
|
||||
mapping: &serde_json::Map<String, serde_json::Value>,
|
||||
node_id: &str,
|
||||
out: &mut Vec<ParsedMessage>,
|
||||
visited: &mut std::collections::HashSet<String>,
|
||||
) {
|
||||
if !visited.insert(node_id.to_string()) {
|
||||
return;
|
||||
}
|
||||
let node = match mapping.get(node_id) {
|
||||
Some(n) => n,
|
||||
None => return,
|
||||
};
|
||||
if let Some(msg) = node.get("message") {
|
||||
if !msg.is_null() {
|
||||
if let Some(parsed) = parse_mapping_message(msg) {
|
||||
out.push(parsed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Children, sorted by create_time when available for deterministic order.
|
||||
if let Some(children) = node.get("children").and_then(|c| c.as_array()) {
|
||||
let mut child_refs: Vec<(&str, Option<f64>)> = children
|
||||
.iter()
|
||||
.filter_map(|v| v.as_str())
|
||||
.map(|id| {
|
||||
let ct = mapping
|
||||
.get(id)
|
||||
.and_then(|n| n.get("message"))
|
||||
.and_then(|m| m.get("create_time"))
|
||||
.and_then(|v| v.as_f64());
|
||||
(id, ct)
|
||||
})
|
||||
.collect();
|
||||
child_refs.sort_by(|a, b| match (a.1, b.1) {
|
||||
(Some(x), Some(y)) => x.partial_cmp(&y).unwrap_or(std::cmp::Ordering::Equal),
|
||||
(Some(_), None) => std::cmp::Ordering::Less,
|
||||
(None, Some(_)) => std::cmp::Ordering::Greater,
|
||||
(None, None) => std::cmp::Ordering::Equal,
|
||||
});
|
||||
for (child_id, _) in child_refs {
|
||||
dfs_collect(mapping, child_id, out, visited);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_mapping_message(msg: &serde_json::Value) -> Option<ParsedMessage> {
|
||||
let id = msg.get("id").and_then(|v| v.as_str()).unwrap_or("").to_string();
|
||||
|
||||
let role = msg
|
||||
.get("author")
|
||||
.and_then(|a| a.get("role"))
|
||||
.and_then(|r| r.as_str())
|
||||
.unwrap_or("unknown")
|
||||
.to_string();
|
||||
|
||||
let ts = msg.get("create_time").and_then(parse_unix_time);
|
||||
|
||||
let content = msg
|
||||
.get("content")
|
||||
.map(content_to_parts)
|
||||
.unwrap_or_default();
|
||||
|
||||
// Skip purely empty system placeholders (common at the root of chats).
|
||||
if content.iter().all(|p| is_part_empty(p)) && role == "system" {
|
||||
return None;
|
||||
}
|
||||
|
||||
let mut metadata = serde_json::Map::new();
|
||||
if let Some(m) = msg.get("metadata").and_then(|v| v.as_object()) {
|
||||
for (k, v) in m {
|
||||
metadata.insert(k.clone(), v.clone());
|
||||
}
|
||||
}
|
||||
if let Some(author) = msg.get("author").and_then(|v| v.as_object()) {
|
||||
if let Some(name) = author.get("name") {
|
||||
metadata.insert("author_name".to_string(), name.clone());
|
||||
}
|
||||
}
|
||||
|
||||
Some(ParsedMessage {
|
||||
id,
|
||||
role,
|
||||
ts,
|
||||
content,
|
||||
metadata: if metadata.is_empty() {
|
||||
serde_json::Value::Null
|
||||
} else {
|
||||
serde_json::Value::Object(metadata)
|
||||
},
|
||||
})
|
||||
}
|
||||
|
||||
fn is_part_empty(p: &ContentPart) -> bool {
|
||||
match p {
|
||||
ContentPart::Text { text } => text.trim().is_empty(),
|
||||
ContentPart::Code { text, .. } => text.trim().is_empty(),
|
||||
ContentPart::Tool { .. } => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert a `content` blob (various shapes) into a list of `ContentPart`s.
|
||||
fn content_to_parts(content: &serde_json::Value) -> Vec<ContentPart> {
|
||||
// Typical shape: { "content_type": "text", "parts": [ ... ] }
|
||||
// Other content types seen in the wild: "code", "tether_browsing_display",
|
||||
// "multimodal_text", "execution_output", "system_error". We do a best-effort
|
||||
// normalisation here; Task 8+ can specialise further.
|
||||
let content_type = content.get("content_type").and_then(|v| v.as_str()).unwrap_or("text");
|
||||
|
||||
if let Some(parts) = content.get("parts").and_then(|v| v.as_array()) {
|
||||
return parts.iter().map(|p| part_to_content_part(p, content_type)).collect();
|
||||
}
|
||||
|
||||
// `content_type = "code"` carries { language, text }
|
||||
if content_type == "code" {
|
||||
let language = content.get("language").and_then(|v| v.as_str()).map(String::from);
|
||||
let text = content
|
||||
.get("text")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(String::from)
|
||||
.unwrap_or_default();
|
||||
return vec![ContentPart::Code { language, text }];
|
||||
}
|
||||
|
||||
// `content_type = "tether_browsing_display"` / execution_output carry
|
||||
// `text` or `result` fields — treat as text.
|
||||
if let Some(text) = content.get("text").and_then(|v| v.as_str()) {
|
||||
return vec![ContentPart::Text { text: text.to_string() }];
|
||||
}
|
||||
if let Some(text) = content.get("result").and_then(|v| v.as_str()) {
|
||||
return vec![ContentPart::Text { text: text.to_string() }];
|
||||
}
|
||||
|
||||
// Unknown shape — serialize the raw JSON.
|
||||
vec![ContentPart::Text { text: content.to_string() }]
|
||||
}
|
||||
|
||||
fn part_to_content_part(part: &serde_json::Value, outer_type: &str) -> ContentPart {
|
||||
// String — plain text (or code, depending on outer content_type).
|
||||
if let Some(s) = part.as_str() {
|
||||
if outer_type == "code" {
|
||||
return ContentPart::Code { language: None, text: s.to_string() };
|
||||
}
|
||||
return ContentPart::Text { text: s.to_string() };
|
||||
}
|
||||
|
||||
// Object — inspect fields.
|
||||
if let Some(obj) = part.as_object() {
|
||||
// Multimodal text part: { "text": "...", ... }
|
||||
if let Some(text) = obj.get("text").and_then(|v| v.as_str()) {
|
||||
return ContentPart::Text { text: text.to_string() };
|
||||
}
|
||||
|
||||
// Tool-ish shape: { "tool": "...", "input": {...}, "output": ... }
|
||||
// (ChatGPT's actual tool shape varies; this is a best-effort catch.)
|
||||
if let (Some(name), Some(input)) = (
|
||||
obj.get("name").or_else(|| obj.get("tool")).and_then(|v| v.as_str()),
|
||||
obj.get("input"),
|
||||
) {
|
||||
return ContentPart::Tool {
|
||||
name: name.to_string(),
|
||||
input: input.clone(),
|
||||
output: obj.get("output").cloned(),
|
||||
};
|
||||
}
|
||||
|
||||
// Image / asset-pointer parts: describe them inline.
|
||||
if let Some(asset) = obj.get("asset_pointer").and_then(|v| v.as_str()) {
|
||||
return ContentPart::Text { text: format!("[asset: {}]", asset) };
|
||||
}
|
||||
}
|
||||
|
||||
// Unknown shape — serialise the raw JSON.
|
||||
ContentPart::Text { text: part.to_string() }
|
||||
}
|
||||
|
||||
/// Parse a ChatGPT unix-seconds timestamp (may be float or int, may be null).
|
||||
fn parse_unix_time(v: &serde_json::Value) -> Option<DateTime<Utc>> {
|
||||
let seconds = v.as_f64()?;
|
||||
if !seconds.is_finite() {
|
||||
return None;
|
||||
}
|
||||
let secs = seconds.trunc() as i64;
|
||||
let nanos = ((seconds - secs as f64) * 1_000_000_000.0).round() as u32;
|
||||
Utc.timestamp_opt(secs, nanos).single()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Parse the shared minimal fixture; the path is relative to the
    /// directory the tests are run from.
    fn load_minimal() -> Vec<ParsedConversation> {
        parse_export(std::path::Path::new("tests/fixtures/minimal.json")).expect("parse")
    }

    #[test]
    fn parses_minimal_fixture() {
        let convs = load_minimal();
        assert_eq!(convs.len(), 1);

        let conv = &convs[0];
        assert_eq!(conv.title.as_deref(), Some("Hello"));

        // Exactly two messages, user first and assistant second.
        let roles: Vec<&str> = conv.messages.iter().map(|m| m.role.as_str()).collect();
        assert_eq!(roles, ["user", "assistant"]);
    }

    #[test]
    fn parses_timestamps() {
        let convs = load_minimal();
        let conv = &convs[0];
        assert!(conv.created_at.is_some());
        assert!(conv.updated_at.is_some());

        // Message timestamps should be derived from create_time.
        for msg in &conv.messages[..2] {
            assert!(msg.ts.is_some());
        }
    }

    #[test]
    fn text_content_extracted() {
        let convs = load_minimal();
        let first = &convs[0].messages[0];
        assert_eq!(first.content.len(), 1);

        let part = &first.content[0];
        if let ContentPart::Text { text } = part {
            assert_eq!(text, "Hello, world");
        } else {
            panic!("expected Text, got {:?}", part);
        }
    }
}
|
||||
@@ -0,0 +1,31 @@
|
||||
use chrono::{DateTime, Utc};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ParsedConversation {
|
||||
pub id: String, // ChatGPT's conversation_id (hex-ish; may or may not be a UUID)
|
||||
pub title: Option<String>,
|
||||
pub created_at: Option<DateTime<Utc>>,
|
||||
pub updated_at: Option<DateTime<Utc>>,
|
||||
pub messages: Vec<ParsedMessage>,
|
||||
#[serde(default)]
|
||||
pub metadata: serde_json::Value,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
pub struct ParsedMessage {
|
||||
pub id: String,
|
||||
pub role: String, // "user" | "assistant" | "system" | "tool"
|
||||
pub ts: Option<DateTime<Utc>>,
|
||||
pub content: Vec<ContentPart>,
|
||||
#[serde(default)]
|
||||
pub metadata: serde_json::Value,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "type", rename_all = "snake_case")]
|
||||
pub enum ContentPart {
|
||||
Text { text: String },
|
||||
Code { language: Option<String>, text: String },
|
||||
Tool { name: String, input: serde_json::Value, output: Option<serde_json::Value> },
|
||||
}
|
||||
@@ -0,0 +1,31 @@
|
||||
[
|
||||
{
|
||||
"id": "c1",
|
||||
"title": "Hello",
|
||||
"create_time": 1700000000.0,
|
||||
"update_time": 1700000100.0,
|
||||
"mapping": {
|
||||
"root": { "id": "root", "message": null, "children": ["m1"] },
|
||||
"m1": {
|
||||
"id": "m1",
|
||||
"message": {
|
||||
"id": "m1",
|
||||
"author": { "role": "user" },
|
||||
"create_time": 1700000010.0,
|
||||
"content": { "content_type": "text", "parts": ["Hello, world"] }
|
||||
},
|
||||
"children": ["m2"]
|
||||
},
|
||||
"m2": {
|
||||
"id": "m2",
|
||||
"message": {
|
||||
"id": "m2",
|
||||
"author": { "role": "assistant" },
|
||||
"create_time": 1700000020.0,
|
||||
"content": { "content_type": "text", "parts": ["Hi!"] }
|
||||
},
|
||||
"children": []
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
Reference in New Issue
Block a user