//! ChatGPT importer: takes a path to a conversations.json file. use std::path::PathBuf; use async_trait::async_trait; use chrono::Utc; use uuid::Uuid; use dirigent_chatgpt::{ContentPart, ParsedConversation, ParsedMessage}; use super::super::progress::ImportProgressSink; use super::super::trait_def::{ ConfigField, ConfigFieldKind, ImportConfig, ImportConfigShape, ImportError, ImportTarget, Importer, }; use super::super::{ import_sessions, DiscoveredSession, ImportDiscovery, ImportProject, ImportStats, }; use crate::coordinator::Archivist; use crate::error::{ArchivistError, Result}; use crate::types::{MessageRecord, RegisterConnectorRequest}; /// Connector type string used for imported ChatGPT sessions. pub const CHATGPT_CONNECTOR_TYPE: &str = "ChatGPT"; /// Fingerprint prefix for locally-imported ChatGPT exports. pub const CHATGPT_FINGERPRINT_PREFIX: &str = "import/local:chatgpt"; /// Namespace UUID for deterministic UUIDv5 derivations on ChatGPT message ids /// that are not already valid UUIDs. const CHATGPT_MESSAGE_NS: Uuid = Uuid::from_u128(0x4e58_a7cb_bf1c_4de2_b7c9_8c31_11b3_1112); pub struct ChatGptImporter; #[async_trait] impl Importer for ChatGptImporter { fn source_name(&self) -> &'static str { "chatgpt" } fn config_shape(&self) -> ImportConfigShape { ImportConfigShape { fields: vec![ConfigField { key: "path".into(), label: "conversations.json path".into(), kind: ConfigFieldKind::File { extension: Some("json".into()), }, required: true, help: Some( "Unzipped OpenAI data export \u{2192} conversations.json".into(), ), }], example: ImportConfig { source: "chatgpt".into(), params: { let mut m = std::collections::BTreeMap::new(); m.insert( "path".into(), serde_json::json!("~/Downloads/chatgpt-export/conversations.json"), ); m }, }, } } async fn discover( &self, cfg: &ImportConfig, ) -> std::result::Result { let path = require_path(cfg)?; let convs = dirigent_chatgpt::parse_export(&path) .map_err(|e| ImportError::Discovery(e.to_string()))?; let total_sessions = convs.len(); let total_estimated_messages: usize = convs.iter().map(|c| c.messages.len()).sum(); // ChatGPT exports don't carry per-project information, so we bucket // everything into a single synthetic project named after the file. let project_name = path .file_name() .and_then(|s| s.to_str()) .unwrap_or("ChatGPT export") .to_string(); Ok(ImportDiscovery { source_name: "ChatGPT".to_string(), source_path: path.display().to_string(), projects: vec![ImportProject { name: project_name, session_count: total_sessions, }], total_sessions, total_estimated_messages, }) } async fn import( &self, cfg: &ImportConfig, archivist: &Archivist, target: ImportTarget, progress: ImportProgressSink, ) -> std::result::Result { let path = require_path(cfg)?; let convs = dirigent_chatgpt::parse_export(&path) .map_err(|e| ImportError::Parser(e.to_string()))?; // Build discovered-session list + keep the parsed convs handy for // message conversion inside the closure. let mut discovered: Vec = Vec::with_capacity(convs.len()); for c in &convs { let metadata = serde_json::json!({ "source": "chatgpt", "conversation_id": c.id, "parser_metadata": c.metadata.clone(), }); discovered.push(DiscoveredSession { native_session_id: c.id.clone(), title: c.title.clone(), created_at: c.created_at, updated_at: c.updated_at, message_count: c.messages.len(), metadata, project_path: None, file_size: None, }); } // Map native_id -> parsed conversation for O(1) lookup in `convert`. let conv_lookup: std::collections::HashMap = convs .into_iter() .map(|c| (c.id.clone(), c)) .collect(); // Fingerprint the import by the canonical path. Re-running against the // same file aliases onto the same connector. let canonical_path = path.canonicalize().unwrap_or_else(|_| path.clone()); let fingerprint = format!("{}:{}", CHATGPT_FINGERPRINT_PREFIX, canonical_path.display()); let connector_req = RegisterConnectorRequest { r#type: CHATGPT_CONNECTOR_TYPE.to_string(), title: format!("ChatGPT ({})", canonical_path.display()), client_native_id: fingerprint.clone(), custom_uid: None, metadata: serde_json::json!({}), fingerprint: Some(fingerprint), }; let convert = |native_id: &str| -> Result> { let conv = conv_lookup.get(native_id).ok_or_else(|| { ArchivistError::InvalidRequest(format!( "Parsed conversation not found for native_id: {}", native_id )) })?; Ok(convert_conversation_to_records(conv)) }; import_sessions( archivist, connector_req, discovered, convert, target.archive, &progress, false, &target.project_map, ) .await .map_err(|e| ImportError::Archivist(e.to_string())) } } // --------------------------------------------------------------------------- // Conversion helpers // --------------------------------------------------------------------------- fn require_path(cfg: &ImportConfig) -> std::result::Result { cfg.params .get("path") .and_then(|v| v.as_str()) .map(PathBuf::from) .ok_or_else(|| ImportError::Config("missing `path`".into())) } /// Prefer to parse the native id as a UUID if possible; otherwise derive a /// stable UUIDv5 under [`CHATGPT_MESSAGE_NS`]. fn parse_or_derive_uuid(native_id: &str) -> Uuid { Uuid::parse_str(native_id) .unwrap_or_else(|_| Uuid::new_v5(&CHATGPT_MESSAGE_NS, native_id.as_bytes())) } /// Convert parsed `ContentPart`s into `dirigent_protocol::MessagePart`s. fn parts_to_message_parts(parts: &[ContentPart]) -> Vec { parts .iter() .map(|p| match p { ContentPart::Text { text } => dirigent_protocol::MessagePart::Text { text: text.clone(), }, ContentPart::Code { language, text } => dirigent_protocol::MessagePart::Code { language: language.clone().unwrap_or_default(), code: text.clone(), }, ContentPart::Tool { name, input, output } => dirigent_protocol::MessagePart::Tool { tool: name.clone(), tool_call_id: None, input: input.clone(), output: output.clone(), }, }) .collect() } /// Flatten a list of parsed content parts into a markdown-y string for the /// `content_md` fallback surface. fn parts_to_markdown(parts: &[ContentPart]) -> String { parts .iter() .map(|p| match p { ContentPart::Text { text } => text.clone(), ContentPart::Code { language, text } => { let lang = language.clone().unwrap_or_default(); format!("```{}\n{}\n```", lang, text) } ContentPart::Tool { name, .. } => format!("[Tool: {}]", name), }) .collect::>() .join("\n\n") } /// Convert a parsed ChatGPT conversation into a vector of `MessageRecord`s. /// /// Each message's `session` field is left as `Uuid::nil()`; the generic /// `import_sessions` orchestrator patches it to the real scroll id. fn convert_conversation_to_records(conv: &ParsedConversation) -> Vec { conv.messages .iter() .filter_map(convert_parsed_message) .collect() } fn convert_parsed_message(msg: &ParsedMessage) -> Option { // Skip messages with entirely empty text payloads (nothing to archive). let content_md = parts_to_markdown(&msg.content); if content_md.trim().is_empty() && msg.content.iter().all(is_part_empty) { return None; } let parts = parts_to_message_parts(&msg.content); let content_parts = serde_json::to_value(&parts).ok(); let ts = msg.ts.unwrap_or_else(Utc::now); let message_id = if msg.id.is_empty() { // Fallback: derive from role + timestamp + a hash of content. let key = format!("{}:{}:{}", msg.role, ts.to_rfc3339(), content_md); Uuid::new_v5(&CHATGPT_MESSAGE_NS, key.as_bytes()) } else { parse_or_derive_uuid(&msg.id) }; Some(MessageRecord { version: 1, message_id, session: Uuid::nil(), parent_id: None, ts, role: msg.role.clone(), author: None, content_md, content_parts, attachments: Vec::new(), metadata: msg.metadata.clone(), }) } fn is_part_empty(p: &ContentPart) -> bool { match p { ContentPart::Text { text } => text.trim().is_empty(), ContentPart::Code { text, .. } => text.trim().is_empty(), ContentPart::Tool { .. } => false, } } // --------------------------------------------------------------------------- // Tests // --------------------------------------------------------------------------- #[cfg(test)] mod tests { use super::*; #[test] fn parse_or_derive_uuid_parses_real_uuid() { let real = "12345678-1234-5678-1234-567812345678"; let u = parse_or_derive_uuid(real); assert_eq!(u.to_string(), real); } #[test] fn parse_or_derive_uuid_falls_back_to_v5() { let a = parse_or_derive_uuid("not-a-uuid"); let b = parse_or_derive_uuid("not-a-uuid"); assert_eq!(a, b, "deterministic UUIDv5 derivation"); let c = parse_or_derive_uuid("different"); assert_ne!(a, c); } #[test] fn parts_to_message_parts_covers_all_variants() { let parts = vec![ ContentPart::Text { text: "hi".into() }, ContentPart::Code { language: Some("rust".into()), text: "fn main() {}".into(), }, ContentPart::Tool { name: "browser".into(), input: serde_json::json!({"url": "https://example.com"}), output: Some(serde_json::json!({"status": 200})), }, ]; let mp = parts_to_message_parts(&parts); assert_eq!(mp.len(), 3); assert!(matches!(&mp[0], dirigent_protocol::MessagePart::Text { .. })); assert!(matches!(&mp[1], dirigent_protocol::MessagePart::Code { .. })); assert!(matches!(&mp[2], dirigent_protocol::MessagePart::Tool { .. })); } #[test] fn empty_parsed_message_is_skipped() { let msg = ParsedMessage { id: "m1".into(), role: "system".into(), ts: None, content: vec![ContentPart::Text { text: " ".into() }], metadata: serde_json::Value::Null, }; assert!(convert_parsed_message(&msg).is_none()); } #[test] fn non_empty_parsed_message_round_trips() { let msg = ParsedMessage { id: "m1".into(), role: "user".into(), ts: None, content: vec![ContentPart::Text { text: "hello".into(), }], metadata: serde_json::Value::Null, }; let record = convert_parsed_message(&msg).expect("should convert"); assert_eq!(record.role, "user"); assert_eq!(record.content_md, "hello"); assert_eq!(record.session, Uuid::nil()); assert!(record.content_parts.is_some()); } }