//! ChatGPT importer: takes a path to the `conversations.json` file of an
//! unzipped OpenAI data export and imports its conversations.
|
|
|
|
use std::path::PathBuf;
|
|
|
|
use async_trait::async_trait;
|
|
use chrono::Utc;
|
|
use uuid::Uuid;
|
|
|
|
use dirigent_chatgpt::{ContentPart, ParsedConversation, ParsedMessage};
|
|
|
|
use super::super::progress::ImportProgressSink;
|
|
use super::super::trait_def::{
|
|
ConfigField, ConfigFieldKind, ImportConfig, ImportConfigShape, ImportError, ImportTarget,
|
|
Importer,
|
|
};
|
|
use super::super::{
|
|
import_sessions, DiscoveredSession, ImportDiscovery, ImportProject, ImportStats,
|
|
};
|
|
use crate::coordinator::Archivist;
|
|
use crate::error::{ArchivistError, Result};
|
|
use crate::types::{MessageRecord, RegisterConnectorRequest};
|
|
|
|
/// Connector type string used for imported ChatGPT sessions.
|
|
pub const CHATGPT_CONNECTOR_TYPE: &str = "ChatGPT";
|
|
|
|
/// Fingerprint prefix for locally-imported ChatGPT exports.
|
|
pub const CHATGPT_FINGERPRINT_PREFIX: &str = "import/local:chatgpt";
|
|
|
|
/// Namespace UUID for deterministic UUIDv5 derivations on ChatGPT message ids
|
|
/// that are not already valid UUIDs.
|
|
const CHATGPT_MESSAGE_NS: Uuid = Uuid::from_u128(0x4e58_a7cb_bf1c_4de2_b7c9_8c31_11b3_1112);
|
|
|
|
pub struct ChatGptImporter;
|
|
|
|
#[async_trait]
|
|
impl Importer for ChatGptImporter {
|
|
fn source_name(&self) -> &'static str {
|
|
"chatgpt"
|
|
}
|
|
|
|
fn config_shape(&self) -> ImportConfigShape {
|
|
ImportConfigShape {
|
|
fields: vec![ConfigField {
|
|
key: "path".into(),
|
|
label: "conversations.json path".into(),
|
|
kind: ConfigFieldKind::File {
|
|
extension: Some("json".into()),
|
|
},
|
|
required: true,
|
|
help: Some(
|
|
"Unzipped OpenAI data export \u{2192} conversations.json".into(),
|
|
),
|
|
}],
|
|
example: ImportConfig {
|
|
source: "chatgpt".into(),
|
|
params: {
|
|
let mut m = std::collections::BTreeMap::new();
|
|
m.insert(
|
|
"path".into(),
|
|
serde_json::json!("~/Downloads/chatgpt-export/conversations.json"),
|
|
);
|
|
m
|
|
},
|
|
},
|
|
}
|
|
}
|
|
|
|
async fn discover(
|
|
&self,
|
|
cfg: &ImportConfig,
|
|
) -> std::result::Result<ImportDiscovery, ImportError> {
|
|
let path = require_path(cfg)?;
|
|
let convs = dirigent_chatgpt::parse_export(&path)
|
|
.map_err(|e| ImportError::Discovery(e.to_string()))?;
|
|
|
|
let total_sessions = convs.len();
|
|
let total_estimated_messages: usize = convs.iter().map(|c| c.messages.len()).sum();
|
|
|
|
// ChatGPT exports don't carry per-project information, so we bucket
|
|
// everything into a single synthetic project named after the file.
|
|
let project_name = path
|
|
.file_name()
|
|
.and_then(|s| s.to_str())
|
|
.unwrap_or("ChatGPT export")
|
|
.to_string();
|
|
|
|
Ok(ImportDiscovery {
|
|
source_name: "ChatGPT".to_string(),
|
|
source_path: path.display().to_string(),
|
|
projects: vec![ImportProject {
|
|
name: project_name,
|
|
session_count: total_sessions,
|
|
}],
|
|
total_sessions,
|
|
total_estimated_messages,
|
|
})
|
|
}
|
|
|
|
async fn import(
|
|
&self,
|
|
cfg: &ImportConfig,
|
|
archivist: &Archivist,
|
|
target: ImportTarget,
|
|
progress: ImportProgressSink,
|
|
) -> std::result::Result<ImportStats, ImportError> {
|
|
let path = require_path(cfg)?;
|
|
let convs = dirigent_chatgpt::parse_export(&path)
|
|
.map_err(|e| ImportError::Parser(e.to_string()))?;
|
|
|
|
// Build discovered-session list + keep the parsed convs handy for
|
|
// message conversion inside the closure.
|
|
let mut discovered: Vec<DiscoveredSession> = Vec::with_capacity(convs.len());
|
|
for c in &convs {
|
|
let metadata = serde_json::json!({
|
|
"source": "chatgpt",
|
|
"conversation_id": c.id,
|
|
"parser_metadata": c.metadata.clone(),
|
|
});
|
|
discovered.push(DiscoveredSession {
|
|
native_session_id: c.id.clone(),
|
|
title: c.title.clone(),
|
|
created_at: c.created_at,
|
|
updated_at: c.updated_at,
|
|
message_count: c.messages.len(),
|
|
metadata,
|
|
project_path: None,
|
|
file_size: None,
|
|
});
|
|
}
|
|
|
|
// Map native_id -> parsed conversation for O(1) lookup in `convert`.
|
|
let conv_lookup: std::collections::HashMap<String, ParsedConversation> = convs
|
|
.into_iter()
|
|
.map(|c| (c.id.clone(), c))
|
|
.collect();
|
|
|
|
// Fingerprint the import by the canonical path. Re-running against the
|
|
// same file aliases onto the same connector.
|
|
let canonical_path = path.canonicalize().unwrap_or_else(|_| path.clone());
|
|
let fingerprint = format!("{}:{}", CHATGPT_FINGERPRINT_PREFIX, canonical_path.display());
|
|
|
|
let connector_req = RegisterConnectorRequest {
|
|
r#type: CHATGPT_CONNECTOR_TYPE.to_string(),
|
|
title: format!("ChatGPT ({})", canonical_path.display()),
|
|
client_native_id: fingerprint.clone(),
|
|
custom_uid: None,
|
|
metadata: serde_json::json!({}),
|
|
fingerprint: Some(fingerprint),
|
|
};
|
|
|
|
let convert = |native_id: &str| -> Result<Vec<MessageRecord>> {
|
|
let conv = conv_lookup.get(native_id).ok_or_else(|| {
|
|
ArchivistError::InvalidRequest(format!(
|
|
"Parsed conversation not found for native_id: {}",
|
|
native_id
|
|
))
|
|
})?;
|
|
Ok(convert_conversation_to_records(conv))
|
|
};
|
|
|
|
import_sessions(
|
|
archivist,
|
|
connector_req,
|
|
discovered,
|
|
convert,
|
|
target.archive,
|
|
&progress,
|
|
false,
|
|
&target.project_map,
|
|
)
|
|
.await
|
|
.map_err(|e| ImportError::Archivist(e.to_string()))
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Conversion helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
fn require_path(cfg: &ImportConfig) -> std::result::Result<PathBuf, ImportError> {
|
|
cfg.params
|
|
.get("path")
|
|
.and_then(|v| v.as_str())
|
|
.map(PathBuf::from)
|
|
.ok_or_else(|| ImportError::Config("missing `path`".into()))
|
|
}
|
|
|
|
/// Prefer to parse the native id as a UUID if possible; otherwise derive a
|
|
/// stable UUIDv5 under [`CHATGPT_MESSAGE_NS`].
|
|
fn parse_or_derive_uuid(native_id: &str) -> Uuid {
|
|
Uuid::parse_str(native_id)
|
|
.unwrap_or_else(|_| Uuid::new_v5(&CHATGPT_MESSAGE_NS, native_id.as_bytes()))
|
|
}
|
|
|
|
/// Convert parsed `ContentPart`s into `dirigent_protocol::MessagePart`s.
|
|
fn parts_to_message_parts(parts: &[ContentPart]) -> Vec<dirigent_protocol::MessagePart> {
|
|
parts
|
|
.iter()
|
|
.map(|p| match p {
|
|
ContentPart::Text { text } => dirigent_protocol::MessagePart::Text {
|
|
text: text.clone(),
|
|
},
|
|
ContentPart::Code { language, text } => dirigent_protocol::MessagePart::Code {
|
|
language: language.clone().unwrap_or_default(),
|
|
code: text.clone(),
|
|
},
|
|
ContentPart::Tool { name, input, output } => dirigent_protocol::MessagePart::Tool {
|
|
tool: name.clone(),
|
|
tool_call_id: None,
|
|
input: input.clone(),
|
|
output: output.clone(),
|
|
},
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
/// Flatten a list of parsed content parts into a markdown-y string for the
|
|
/// `content_md` fallback surface.
|
|
fn parts_to_markdown(parts: &[ContentPart]) -> String {
|
|
parts
|
|
.iter()
|
|
.map(|p| match p {
|
|
ContentPart::Text { text } => text.clone(),
|
|
ContentPart::Code { language, text } => {
|
|
let lang = language.clone().unwrap_or_default();
|
|
format!("```{}\n{}\n```", lang, text)
|
|
}
|
|
ContentPart::Tool { name, .. } => format!("[Tool: {}]", name),
|
|
})
|
|
.collect::<Vec<_>>()
|
|
.join("\n\n")
|
|
}
|
|
|
|
/// Convert a parsed ChatGPT conversation into a vector of `MessageRecord`s.
|
|
///
|
|
/// Each message's `session` field is left as `Uuid::nil()`; the generic
|
|
/// `import_sessions` orchestrator patches it to the real scroll id.
|
|
fn convert_conversation_to_records(conv: &ParsedConversation) -> Vec<MessageRecord> {
|
|
conv.messages
|
|
.iter()
|
|
.filter_map(convert_parsed_message)
|
|
.collect()
|
|
}
|
|
|
|
fn convert_parsed_message(msg: &ParsedMessage) -> Option<MessageRecord> {
|
|
// Skip messages with entirely empty text payloads (nothing to archive).
|
|
let content_md = parts_to_markdown(&msg.content);
|
|
if content_md.trim().is_empty() && msg.content.iter().all(is_part_empty) {
|
|
return None;
|
|
}
|
|
|
|
let parts = parts_to_message_parts(&msg.content);
|
|
let content_parts = serde_json::to_value(&parts).ok();
|
|
|
|
let ts = msg.ts.unwrap_or_else(Utc::now);
|
|
let message_id = if msg.id.is_empty() {
|
|
// Fallback: derive from role + timestamp + a hash of content.
|
|
let key = format!("{}:{}:{}", msg.role, ts.to_rfc3339(), content_md);
|
|
Uuid::new_v5(&CHATGPT_MESSAGE_NS, key.as_bytes())
|
|
} else {
|
|
parse_or_derive_uuid(&msg.id)
|
|
};
|
|
|
|
Some(MessageRecord {
|
|
version: 1,
|
|
message_id,
|
|
session: Uuid::nil(),
|
|
parent_id: None,
|
|
ts,
|
|
role: msg.role.clone(),
|
|
author: None,
|
|
content_md,
|
|
content_parts,
|
|
attachments: Vec::new(),
|
|
metadata: msg.metadata.clone(),
|
|
})
|
|
}
|
|
|
|
fn is_part_empty(p: &ContentPart) -> bool {
|
|
match p {
|
|
ContentPart::Text { text } => text.trim().is_empty(),
|
|
ContentPart::Code { text, .. } => text.trim().is_empty(),
|
|
ContentPart::Tool { .. } => false,
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Tests
|
|
// ---------------------------------------------------------------------------
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// A well-formed UUID string is taken as-is rather than re-hashed.
    #[test]
    fn parse_or_derive_uuid_parses_real_uuid() {
        let real = "12345678-1234-5678-1234-567812345678";
        let parsed = parse_or_derive_uuid(real);
        assert_eq!(parsed.to_string(), real);
    }

    /// Non-UUID ids hash to a stable value that differs between inputs.
    #[test]
    fn parse_or_derive_uuid_falls_back_to_v5() {
        let first = parse_or_derive_uuid("not-a-uuid");
        let second = parse_or_derive_uuid("not-a-uuid");
        assert_eq!(first, second, "deterministic UUIDv5 derivation");

        let other = parse_or_derive_uuid("different");
        assert_ne!(first, other);
    }

    /// Each `ContentPart` variant maps onto the matching `MessagePart`.
    #[test]
    fn parts_to_message_parts_covers_all_variants() {
        let text = ContentPart::Text { text: "hi".into() };
        let code = ContentPart::Code {
            language: Some("rust".into()),
            text: "fn main() {}".into(),
        };
        let tool = ContentPart::Tool {
            name: "browser".into(),
            input: serde_json::json!({"url": "https://example.com"}),
            output: Some(serde_json::json!({"status": 200})),
        };

        let mp = parts_to_message_parts(&[text, code, tool]);

        assert_eq!(mp.len(), 3);
        assert!(matches!(
            mp.as_slice(),
            [
                dirigent_protocol::MessagePart::Text { .. },
                dirigent_protocol::MessagePart::Code { .. },
                dirigent_protocol::MessagePart::Tool { .. },
            ]
        ));
    }

    /// A message whose only part is blank text produces no record.
    #[test]
    fn empty_parsed_message_is_skipped() {
        let blank = ParsedMessage {
            id: "m1".into(),
            role: "system".into(),
            ts: None,
            content: vec![ContentPart::Text { text: " ".into() }],
            metadata: serde_json::Value::Null,
        };
        assert!(convert_parsed_message(&blank).is_none());
    }

    /// A plain text message converts into a record with the expected fields.
    #[test]
    fn non_empty_parsed_message_round_trips() {
        let greeting = ContentPart::Text {
            text: "hello".into(),
        };
        let msg = ParsedMessage {
            id: "m1".into(),
            role: "user".into(),
            ts: None,
            content: vec![greeting],
            metadata: serde_json::Value::Null,
        };

        let record = convert_parsed_message(&msg).expect("should convert");

        assert_eq!(record.role, "user");
        assert_eq!(record.content_md, "hello");
        assert_eq!(record.session, Uuid::nil());
        assert!(record.content_parts.is_some());
    }
}
|