sync from monorepo @ 2452e92e

This commit is contained in:
2026-05-08 01:59:04 +02:00
commit b03dc15371
459 changed files with 129586 additions and 0 deletions
@@ -0,0 +1,361 @@
//! ChatGPT importer: takes a path to a conversations.json file.
use std::path::PathBuf;
use async_trait::async_trait;
use chrono::Utc;
use uuid::Uuid;
use dirigent_chatgpt::{ContentPart, ParsedConversation, ParsedMessage};
use super::super::progress::ImportProgressSink;
use super::super::trait_def::{
ConfigField, ConfigFieldKind, ImportConfig, ImportConfigShape, ImportError, ImportTarget,
Importer,
};
use super::super::{
import_sessions, DiscoveredSession, ImportDiscovery, ImportProject, ImportStats,
};
use crate::coordinator::Archivist;
use crate::error::{ArchivistError, Result};
use crate::types::{MessageRecord, RegisterConnectorRequest};
/// Connector type string used for imported ChatGPT sessions.
///
/// Sent as the `type` of the `RegisterConnectorRequest` built during import.
pub const CHATGPT_CONNECTOR_TYPE: &str = "ChatGPT";
/// Fingerprint prefix for locally-imported ChatGPT exports.
///
/// The full fingerprint is `<prefix>:<canonical export path>`.
pub const CHATGPT_FINGERPRINT_PREFIX: &str = "import/local:chatgpt";
/// Namespace UUID for deterministic UUIDv5 derivations on ChatGPT message ids
/// that are not already valid UUIDs.
///
/// UUIDv5 values are a function of (namespace, name), so changing this
/// constant changes every derived message id.
const CHATGPT_MESSAGE_NS: Uuid = Uuid::from_u128(0x4e58_a7cb_bf1c_4de2_b7c9_8c31_11b3_1112);
/// [`Importer`] implementation for ChatGPT data exports.
///
/// A stateless unit struct: everything it needs (the `path` of the
/// `conversations.json` file) is read from the `ImportConfig` passed to each
/// call.
pub struct ChatGptImporter;
#[async_trait]
impl Importer for ChatGptImporter {
    /// Identifier of this import source.
    fn source_name(&self) -> &'static str {
        "chatgpt"
    }

    /// Describe the accepted configuration: one required `path` file field
    /// (restricted to the `json` extension) plus an example config.
    fn config_shape(&self) -> ImportConfigShape {
        let path_field = ConfigField {
            key: "path".into(),
            label: "conversations.json path".into(),
            kind: ConfigFieldKind::File {
                extension: Some("json".into()),
            },
            required: true,
            help: Some("Unzipped OpenAI data export \u{2192} conversations.json".into()),
        };

        let mut example_params = std::collections::BTreeMap::new();
        example_params.insert(
            "path".into(),
            serde_json::json!("~/Downloads/chatgpt-export/conversations.json"),
        );

        ImportConfigShape {
            fields: vec![path_field],
            example: ImportConfig {
                source: "chatgpt".into(),
                params: example_params,
            },
        }
    }

    /// Parse the export and report how many sessions and messages it holds,
    /// without writing anything.
    ///
    /// # Errors
    ///
    /// Propagates the error from `require_path`, and maps parser failures to
    /// `ImportError::Discovery`.
    async fn discover(
        &self,
        cfg: &ImportConfig,
    ) -> std::result::Result<ImportDiscovery, ImportError> {
        let export_path = require_path(cfg)?;
        let conversations = dirigent_chatgpt::parse_export(&export_path)
            .map_err(|err| ImportError::Discovery(err.to_string()))?;

        let session_total = conversations.len();
        let message_total: usize = conversations
            .iter()
            .map(|conv| conv.messages.len())
            .sum();

        // ChatGPT exports carry no per-project information, so every session
        // lands in one synthetic project named after the export file.
        let project_name = match export_path.file_name().and_then(|name| name.to_str()) {
            Some(name) => name.to_string(),
            None => "ChatGPT export".to_string(),
        };
        let single_project = ImportProject {
            name: project_name,
            session_count: session_total,
        };

        Ok(ImportDiscovery {
            source_name: "ChatGPT".to_string(),
            source_path: export_path.display().to_string(),
            projects: vec![single_project],
            total_sessions: session_total,
            total_estimated_messages: message_total,
        })
    }

    /// Parse the export and hand every conversation to `import_sessions`.
    ///
    /// # Errors
    ///
    /// Propagates the error from `require_path`, maps parser failures to
    /// `ImportError::Parser`, and maps failures reported by `import_sessions`
    /// to `ImportError::Archivist`.
    async fn import(
        &self,
        cfg: &ImportConfig,
        archivist: &Archivist,
        target: ImportTarget,
        progress: ImportProgressSink,
    ) -> std::result::Result<ImportStats, ImportError> {
        let export_path = require_path(cfg)?;
        let conversations = dirigent_chatgpt::parse_export(&export_path)
            .map_err(|err| ImportError::Parser(err.to_string()))?;

        // One session descriptor per parsed conversation, in export order.
        let discovered: Vec<DiscoveredSession> = conversations
            .iter()
            .map(|conv| DiscoveredSession {
                native_session_id: conv.id.clone(),
                title: conv.title.clone(),
                created_at: conv.created_at,
                updated_at: conv.updated_at,
                message_count: conv.messages.len(),
                metadata: serde_json::json!({
                    "source": "chatgpt",
                    "conversation_id": conv.id,
                    "parser_metadata": conv.metadata.clone(),
                }),
                project_path: None,
                file_size: None,
            })
            .collect();

        // Index the parsed conversations by native id so the conversion
        // callback can look each one up directly. A later duplicate id
        // replaces an earlier one.
        let mut by_native_id: std::collections::HashMap<String, ParsedConversation> =
            std::collections::HashMap::with_capacity(conversations.len());
        for conv in conversations {
            by_native_id.insert(conv.id.clone(), conv);
        }

        // The fingerprint is derived from the canonical path (falling back to
        // the path as given when canonicalization fails), so re-running
        // against the same file aliases onto the same connector.
        let canonical_path = match export_path.canonicalize() {
            Ok(resolved) => resolved,
            Err(_) => export_path.clone(),
        };
        let fingerprint = format!("{}:{}", CHATGPT_FINGERPRINT_PREFIX, canonical_path.display());
        let connector_req = RegisterConnectorRequest {
            r#type: CHATGPT_CONNECTOR_TYPE.to_string(),
            title: format!("ChatGPT ({})", canonical_path.display()),
            client_native_id: fingerprint.clone(),
            custom_uid: None,
            metadata: serde_json::json!({}),
            fingerprint: Some(fingerprint),
        };

        // Conversion callback: native session id -> message records.
        let convert = |native_id: &str| -> Result<Vec<MessageRecord>> {
            let found = by_native_id.get(native_id);
            let conv = found.ok_or_else(|| {
                ArchivistError::InvalidRequest(format!(
                    "Parsed conversation not found for native_id: {}",
                    native_id
                ))
            })?;
            Ok(convert_conversation_to_records(conv))
        };

        let outcome = import_sessions(
            archivist,
            connector_req,
            discovered,
            convert,
            target.archive,
            &progress,
            // Flag whose meaning is defined by `import_sessions` (not visible
            // from this module).
            false,
            &target.project_map,
        )
        .await;

        outcome.map_err(|err| ImportError::Archivist(err.to_string()))
    }
}
// ---------------------------------------------------------------------------
// Conversion helpers
// ---------------------------------------------------------------------------
/// Extract the required `path` parameter from an import config.
///
/// A leading `~` (alone, or followed by a path separator) is expanded to the
/// user's home directory, so the `~/Downloads/...` form shown in the example
/// config resolves to a real file. `PathBuf::from` alone would keep the
/// literal `~`.
///
/// # Errors
///
/// Returns `ImportError::Config` when `path` is absent, is not a JSON string,
/// or is blank.
fn require_path(cfg: &ImportConfig) -> std::result::Result<PathBuf, ImportError> {
    let raw = cfg
        .params
        .get("path")
        .and_then(|v| v.as_str())
        .ok_or_else(|| ImportError::Config("missing `path`".into()))?;
    // A blank value cannot name a file; report it as a config problem here
    // instead of letting it surface later as a parser error.
    if raw.trim().is_empty() {
        return Err(ImportError::Config("`path` is empty".into()));
    }
    Ok(expand_home_prefix(raw))
}

/// Expand a leading `~` or `~/...` using `HOME` (falling back to
/// `USERPROFILE`); any other input, including `~user/...`, is returned as-is.
fn expand_home_prefix(raw: &str) -> PathBuf {
    let rest = match raw.strip_prefix('~') {
        Some(rest) if rest.is_empty() || rest.starts_with(std::path::is_separator) => rest,
        _ => return PathBuf::from(raw),
    };
    let home = std::env::var_os("HOME")
        .or_else(|| std::env::var_os("USERPROFILE"))
        .filter(|dir| !dir.is_empty());
    match home {
        Some(home) => {
            let mut expanded = PathBuf::from(home);
            // Drop the separator(s) after `~`: pushing an absolute component
            // would replace the home directory instead of extending it.
            let relative = rest.trim_start_matches(std::path::is_separator);
            if !relative.is_empty() {
                expanded.push(relative);
            }
            expanded
        }
        // No home directory known: keep the path exactly as the user wrote it.
        None => PathBuf::from(raw),
    }
}
/// Turn a native ChatGPT message id into a [`Uuid`].
///
/// Ids that already parse as UUIDs are used verbatim; anything else is
/// hashed into a stable UUIDv5 under [`CHATGPT_MESSAGE_NS`], so the same
/// native id always yields the same result.
fn parse_or_derive_uuid(native_id: &str) -> Uuid {
    match Uuid::parse_str(native_id) {
        Ok(parsed) => parsed,
        Err(_) => Uuid::new_v5(&CHATGPT_MESSAGE_NS, native_id.as_bytes()),
    }
}
/// Convert parsed `ContentPart`s into `dirigent_protocol::MessagePart`s.
fn parts_to_message_parts(parts: &[ContentPart]) -> Vec<dirigent_protocol::MessagePart> {
parts
.iter()
.map(|p| match p {
ContentPart::Text { text } => dirigent_protocol::MessagePart::Text {
text: text.clone(),
},
ContentPart::Code { language, text } => dirigent_protocol::MessagePart::Code {
language: language.clone().unwrap_or_default(),
code: text.clone(),
},
ContentPart::Tool { name, input, output } => dirigent_protocol::MessagePart::Tool {
tool: name.clone(),
tool_call_id: None,
input: input.clone(),
output: output.clone(),
},
})
.collect()
}
/// Render parsed content parts as one markdown-ish string for the
/// `content_md` fallback surface.
///
/// Text is emitted verbatim, code is wrapped in a fenced block tagged with
/// its language (if any), and a tool part is reduced to a `[Tool: name]`
/// marker. Parts are separated by a blank line.
fn parts_to_markdown(parts: &[ContentPart]) -> String {
    let mut rendered = String::new();
    for (index, part) in parts.iter().enumerate() {
        if index > 0 {
            rendered.push_str("\n\n");
        }
        match part {
            ContentPart::Text { text } => rendered.push_str(text),
            ContentPart::Code { language, text } => {
                let lang = language.clone().unwrap_or_default();
                rendered.push_str(&format!("```{}\n{}\n```", lang, text));
            }
            ContentPart::Tool { name, .. } => {
                rendered.push_str(&format!("[Tool: {}]", name));
            }
        }
    }
    rendered
}
/// Convert a parsed ChatGPT conversation into a vector of `MessageRecord`s.
///
/// Each message's `session` field is left as `Uuid::nil()`; the generic
/// `import_sessions` orchestrator patches it to the real scroll id.
fn convert_conversation_to_records(conv: &ParsedConversation) -> Vec<MessageRecord> {
conv.messages
.iter()
.filter_map(convert_parsed_message)
.collect()
}
/// Convert one parsed ChatGPT message into a [`MessageRecord`].
///
/// Returns `None` when the message has nothing to archive (its rendered
/// markdown is blank and every part is empty).
///
/// The record's `message_id` is deterministic for a given export: a native id
/// is parsed or hashed by `parse_or_derive_uuid`, and a message without a
/// native id gets a UUIDv5 derived from its role, its exported timestamp (if
/// any) and its rendered content. `session` is left as `Uuid::nil()` and
/// `parent_id` as `None`.
fn convert_parsed_message(msg: &ParsedMessage) -> Option<MessageRecord> {
    // Skip messages with entirely empty text payloads (nothing to archive).
    let content_md = parts_to_markdown(&msg.content);
    if content_md.trim().is_empty() && msg.content.iter().all(is_part_empty) {
        return None;
    }
    let parts = parts_to_message_parts(&msg.content);
    // Best-effort: if the structured parts fail to serialize, the record is
    // still produced with `content_md` only.
    let content_parts = serde_json::to_value(&parts).ok();
    let message_id = if msg.id.is_empty() {
        // Fallback: derive from role + exported timestamp + rendered content.
        // Only the timestamp stored in the export may enter the key: using
        // the `Utc::now()` substitute below would give the same message a
        // different id on every import.
        let exported_ts = msg.ts.map(|t| t.to_rfc3339()).unwrap_or_default();
        let key = format!("{}:{}:{}", msg.role, exported_ts, content_md);
        Uuid::new_v5(&CHATGPT_MESSAGE_NS, key.as_bytes())
    } else {
        parse_or_derive_uuid(&msg.id)
    };
    // The record itself needs a timestamp; with none in the export, the
    // import time is used.
    let ts = msg.ts.unwrap_or_else(Utc::now);
    Some(MessageRecord {
        version: 1,
        message_id,
        session: Uuid::nil(),
        parent_id: None,
        ts,
        role: msg.role.clone(),
        author: None,
        content_md,
        content_parts,
        attachments: Vec::new(),
        metadata: msg.metadata.clone(),
    })
}
/// Report whether a content part carries no payload worth archiving.
///
/// Text and code parts are empty when their text is blank after trimming;
/// a tool part is never considered empty.
fn is_part_empty(p: &ContentPart) -> bool {
    let is_blank = |payload: &str| payload.trim().is_empty();
    match p {
        ContentPart::Tool { .. } => false,
        ContentPart::Text { text } => is_blank(text),
        ContentPart::Code { text, .. } => is_blank(text),
    }
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a message with id `m1`, no timestamp, null metadata and a single
    /// text part.
    fn text_message(role: &str, text: &str) -> ParsedMessage {
        ParsedMessage {
            id: "m1".into(),
            role: role.into(),
            ts: None,
            content: vec![ContentPart::Text { text: text.into() }],
            metadata: serde_json::Value::Null,
        }
    }

    /// A well-formed UUID string is used verbatim.
    #[test]
    fn parse_or_derive_uuid_parses_real_uuid() {
        let real = "12345678-1234-5678-1234-567812345678";
        assert_eq!(parse_or_derive_uuid(real).to_string(), real);
    }

    /// Non-UUID ids hash to a stable value that differs per input.
    #[test]
    fn parse_or_derive_uuid_falls_back_to_v5() {
        let first = parse_or_derive_uuid("not-a-uuid");
        let second = parse_or_derive_uuid("not-a-uuid");
        let other = parse_or_derive_uuid("different");
        assert_eq!(first, second, "deterministic UUIDv5 derivation");
        assert_ne!(first, other);
    }

    /// Every `ContentPart` variant maps to the matching `MessagePart` variant.
    #[test]
    fn parts_to_message_parts_covers_all_variants() {
        let text_part = ContentPart::Text { text: "hi".into() };
        let code_part = ContentPart::Code {
            language: Some("rust".into()),
            text: "fn main() {}".into(),
        };
        let tool_part = ContentPart::Tool {
            name: "browser".into(),
            input: serde_json::json!({"url": "https://example.com"}),
            output: Some(serde_json::json!({"status": 200})),
        };

        let mp = parts_to_message_parts(&[text_part, code_part, tool_part]);

        assert_eq!(mp.len(), 3);
        assert!(matches!(
            mp.as_slice(),
            [
                dirigent_protocol::MessagePart::Text { .. },
                dirigent_protocol::MessagePart::Code { .. },
                dirigent_protocol::MessagePart::Tool { .. },
            ]
        ));
    }

    /// A message whose only part is whitespace text produces no record.
    #[test]
    fn empty_parsed_message_is_skipped() {
        let msg = text_message("system", " ");
        assert!(convert_parsed_message(&msg).is_none());
    }

    /// A message with real text keeps its role and content, and has a nil
    /// session plus serialized structured parts.
    #[test]
    fn non_empty_parsed_message_round_trips() {
        let msg = text_message("user", "hello");
        let record = convert_parsed_message(&msg).expect("should convert");
        assert_eq!(record.role, "user");
        assert_eq!(record.content_md, "hello");
        assert_eq!(record.session, Uuid::nil());
        assert!(record.content_parts.is_some());
    }
}