sync from monorepo @ 2452e92e
This commit is contained in:
@@ -0,0 +1,361 @@
|
||||
//! ChatGPT importer: takes a path to a conversations.json file.
|
||||
|
||||
use std::path::PathBuf;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use chrono::Utc;
|
||||
use uuid::Uuid;
|
||||
|
||||
use dirigent_chatgpt::{ContentPart, ParsedConversation, ParsedMessage};
|
||||
|
||||
use super::super::progress::ImportProgressSink;
|
||||
use super::super::trait_def::{
|
||||
ConfigField, ConfigFieldKind, ImportConfig, ImportConfigShape, ImportError, ImportTarget,
|
||||
Importer,
|
||||
};
|
||||
use super::super::{
|
||||
import_sessions, DiscoveredSession, ImportDiscovery, ImportProject, ImportStats,
|
||||
};
|
||||
use crate::coordinator::Archivist;
|
||||
use crate::error::{ArchivistError, Result};
|
||||
use crate::types::{MessageRecord, RegisterConnectorRequest};
|
||||
|
||||
/// Connector type string used for imported ChatGPT sessions.
|
||||
pub const CHATGPT_CONNECTOR_TYPE: &str = "ChatGPT";
|
||||
|
||||
/// Fingerprint prefix for locally-imported ChatGPT exports.
|
||||
pub const CHATGPT_FINGERPRINT_PREFIX: &str = "import/local:chatgpt";
|
||||
|
||||
/// Namespace UUID for deterministic UUIDv5 derivations on ChatGPT message ids
|
||||
/// that are not already valid UUIDs.
|
||||
const CHATGPT_MESSAGE_NS: Uuid = Uuid::from_u128(0x4e58_a7cb_bf1c_4de2_b7c9_8c31_11b3_1112);
|
||||
|
||||
pub struct ChatGptImporter;
|
||||
|
||||
#[async_trait]
|
||||
impl Importer for ChatGptImporter {
|
||||
fn source_name(&self) -> &'static str {
|
||||
"chatgpt"
|
||||
}
|
||||
|
||||
fn config_shape(&self) -> ImportConfigShape {
|
||||
ImportConfigShape {
|
||||
fields: vec![ConfigField {
|
||||
key: "path".into(),
|
||||
label: "conversations.json path".into(),
|
||||
kind: ConfigFieldKind::File {
|
||||
extension: Some("json".into()),
|
||||
},
|
||||
required: true,
|
||||
help: Some(
|
||||
"Unzipped OpenAI data export \u{2192} conversations.json".into(),
|
||||
),
|
||||
}],
|
||||
example: ImportConfig {
|
||||
source: "chatgpt".into(),
|
||||
params: {
|
||||
let mut m = std::collections::BTreeMap::new();
|
||||
m.insert(
|
||||
"path".into(),
|
||||
serde_json::json!("~/Downloads/chatgpt-export/conversations.json"),
|
||||
);
|
||||
m
|
||||
},
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
async fn discover(
|
||||
&self,
|
||||
cfg: &ImportConfig,
|
||||
) -> std::result::Result<ImportDiscovery, ImportError> {
|
||||
let path = require_path(cfg)?;
|
||||
let convs = dirigent_chatgpt::parse_export(&path)
|
||||
.map_err(|e| ImportError::Discovery(e.to_string()))?;
|
||||
|
||||
let total_sessions = convs.len();
|
||||
let total_estimated_messages: usize = convs.iter().map(|c| c.messages.len()).sum();
|
||||
|
||||
// ChatGPT exports don't carry per-project information, so we bucket
|
||||
// everything into a single synthetic project named after the file.
|
||||
let project_name = path
|
||||
.file_name()
|
||||
.and_then(|s| s.to_str())
|
||||
.unwrap_or("ChatGPT export")
|
||||
.to_string();
|
||||
|
||||
Ok(ImportDiscovery {
|
||||
source_name: "ChatGPT".to_string(),
|
||||
source_path: path.display().to_string(),
|
||||
projects: vec![ImportProject {
|
||||
name: project_name,
|
||||
session_count: total_sessions,
|
||||
}],
|
||||
total_sessions,
|
||||
total_estimated_messages,
|
||||
})
|
||||
}
|
||||
|
||||
async fn import(
|
||||
&self,
|
||||
cfg: &ImportConfig,
|
||||
archivist: &Archivist,
|
||||
target: ImportTarget,
|
||||
progress: ImportProgressSink,
|
||||
) -> std::result::Result<ImportStats, ImportError> {
|
||||
let path = require_path(cfg)?;
|
||||
let convs = dirigent_chatgpt::parse_export(&path)
|
||||
.map_err(|e| ImportError::Parser(e.to_string()))?;
|
||||
|
||||
// Build discovered-session list + keep the parsed convs handy for
|
||||
// message conversion inside the closure.
|
||||
let mut discovered: Vec<DiscoveredSession> = Vec::with_capacity(convs.len());
|
||||
for c in &convs {
|
||||
let metadata = serde_json::json!({
|
||||
"source": "chatgpt",
|
||||
"conversation_id": c.id,
|
||||
"parser_metadata": c.metadata.clone(),
|
||||
});
|
||||
discovered.push(DiscoveredSession {
|
||||
native_session_id: c.id.clone(),
|
||||
title: c.title.clone(),
|
||||
created_at: c.created_at,
|
||||
updated_at: c.updated_at,
|
||||
message_count: c.messages.len(),
|
||||
metadata,
|
||||
project_path: None,
|
||||
file_size: None,
|
||||
});
|
||||
}
|
||||
|
||||
// Map native_id -> parsed conversation for O(1) lookup in `convert`.
|
||||
let conv_lookup: std::collections::HashMap<String, ParsedConversation> = convs
|
||||
.into_iter()
|
||||
.map(|c| (c.id.clone(), c))
|
||||
.collect();
|
||||
|
||||
// Fingerprint the import by the canonical path. Re-running against the
|
||||
// same file aliases onto the same connector.
|
||||
let canonical_path = path.canonicalize().unwrap_or_else(|_| path.clone());
|
||||
let fingerprint = format!("{}:{}", CHATGPT_FINGERPRINT_PREFIX, canonical_path.display());
|
||||
|
||||
let connector_req = RegisterConnectorRequest {
|
||||
r#type: CHATGPT_CONNECTOR_TYPE.to_string(),
|
||||
title: format!("ChatGPT ({})", canonical_path.display()),
|
||||
client_native_id: fingerprint.clone(),
|
||||
custom_uid: None,
|
||||
metadata: serde_json::json!({}),
|
||||
fingerprint: Some(fingerprint),
|
||||
};
|
||||
|
||||
let convert = |native_id: &str| -> Result<Vec<MessageRecord>> {
|
||||
let conv = conv_lookup.get(native_id).ok_or_else(|| {
|
||||
ArchivistError::InvalidRequest(format!(
|
||||
"Parsed conversation not found for native_id: {}",
|
||||
native_id
|
||||
))
|
||||
})?;
|
||||
Ok(convert_conversation_to_records(conv))
|
||||
};
|
||||
|
||||
import_sessions(
|
||||
archivist,
|
||||
connector_req,
|
||||
discovered,
|
||||
convert,
|
||||
target.archive,
|
||||
&progress,
|
||||
false,
|
||||
&target.project_map,
|
||||
)
|
||||
.await
|
||||
.map_err(|e| ImportError::Archivist(e.to_string()))
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Conversion helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn require_path(cfg: &ImportConfig) -> std::result::Result<PathBuf, ImportError> {
|
||||
cfg.params
|
||||
.get("path")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(PathBuf::from)
|
||||
.ok_or_else(|| ImportError::Config("missing `path`".into()))
|
||||
}
|
||||
|
||||
/// Prefer to parse the native id as a UUID if possible; otherwise derive a
|
||||
/// stable UUIDv5 under [`CHATGPT_MESSAGE_NS`].
|
||||
fn parse_or_derive_uuid(native_id: &str) -> Uuid {
|
||||
Uuid::parse_str(native_id)
|
||||
.unwrap_or_else(|_| Uuid::new_v5(&CHATGPT_MESSAGE_NS, native_id.as_bytes()))
|
||||
}
|
||||
|
||||
/// Convert parsed `ContentPart`s into `dirigent_protocol::MessagePart`s.
|
||||
fn parts_to_message_parts(parts: &[ContentPart]) -> Vec<dirigent_protocol::MessagePart> {
|
||||
parts
|
||||
.iter()
|
||||
.map(|p| match p {
|
||||
ContentPart::Text { text } => dirigent_protocol::MessagePart::Text {
|
||||
text: text.clone(),
|
||||
},
|
||||
ContentPart::Code { language, text } => dirigent_protocol::MessagePart::Code {
|
||||
language: language.clone().unwrap_or_default(),
|
||||
code: text.clone(),
|
||||
},
|
||||
ContentPart::Tool { name, input, output } => dirigent_protocol::MessagePart::Tool {
|
||||
tool: name.clone(),
|
||||
tool_call_id: None,
|
||||
input: input.clone(),
|
||||
output: output.clone(),
|
||||
},
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Flatten a list of parsed content parts into a markdown-y string for the
|
||||
/// `content_md` fallback surface.
|
||||
fn parts_to_markdown(parts: &[ContentPart]) -> String {
|
||||
parts
|
||||
.iter()
|
||||
.map(|p| match p {
|
||||
ContentPart::Text { text } => text.clone(),
|
||||
ContentPart::Code { language, text } => {
|
||||
let lang = language.clone().unwrap_or_default();
|
||||
format!("```{}\n{}\n```", lang, text)
|
||||
}
|
||||
ContentPart::Tool { name, .. } => format!("[Tool: {}]", name),
|
||||
})
|
||||
.collect::<Vec<_>>()
|
||||
.join("\n\n")
|
||||
}
|
||||
|
||||
/// Convert a parsed ChatGPT conversation into a vector of `MessageRecord`s.
|
||||
///
|
||||
/// Each message's `session` field is left as `Uuid::nil()`; the generic
|
||||
/// `import_sessions` orchestrator patches it to the real scroll id.
|
||||
fn convert_conversation_to_records(conv: &ParsedConversation) -> Vec<MessageRecord> {
|
||||
conv.messages
|
||||
.iter()
|
||||
.filter_map(convert_parsed_message)
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn convert_parsed_message(msg: &ParsedMessage) -> Option<MessageRecord> {
|
||||
// Skip messages with entirely empty text payloads (nothing to archive).
|
||||
let content_md = parts_to_markdown(&msg.content);
|
||||
if content_md.trim().is_empty() && msg.content.iter().all(is_part_empty) {
|
||||
return None;
|
||||
}
|
||||
|
||||
let parts = parts_to_message_parts(&msg.content);
|
||||
let content_parts = serde_json::to_value(&parts).ok();
|
||||
|
||||
let ts = msg.ts.unwrap_or_else(Utc::now);
|
||||
let message_id = if msg.id.is_empty() {
|
||||
// Fallback: derive from role + timestamp + a hash of content.
|
||||
let key = format!("{}:{}:{}", msg.role, ts.to_rfc3339(), content_md);
|
||||
Uuid::new_v5(&CHATGPT_MESSAGE_NS, key.as_bytes())
|
||||
} else {
|
||||
parse_or_derive_uuid(&msg.id)
|
||||
};
|
||||
|
||||
Some(MessageRecord {
|
||||
version: 1,
|
||||
message_id,
|
||||
session: Uuid::nil(),
|
||||
parent_id: None,
|
||||
ts,
|
||||
role: msg.role.clone(),
|
||||
author: None,
|
||||
content_md,
|
||||
content_parts,
|
||||
attachments: Vec::new(),
|
||||
metadata: msg.metadata.clone(),
|
||||
})
|
||||
}
|
||||
|
||||
fn is_part_empty(p: &ContentPart) -> bool {
|
||||
match p {
|
||||
ContentPart::Text { text } => text.trim().is_empty(),
|
||||
ContentPart::Code { text, .. } => text.trim().is_empty(),
|
||||
ContentPart::Tool { .. } => false,
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Tests
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a message with id `m1`, no timestamp, null metadata and a single
    /// text part.
    fn text_message(role: &str, text: &str) -> ParsedMessage {
        ParsedMessage {
            id: "m1".into(),
            role: role.into(),
            ts: None,
            content: vec![ContentPart::Text { text: text.into() }],
            metadata: serde_json::Value::Null,
        }
    }

    /// An id that is already a UUID is returned unchanged.
    #[test]
    fn parse_or_derive_uuid_parses_real_uuid() {
        let native = "12345678-1234-5678-1234-567812345678";
        assert_eq!(parse_or_derive_uuid(native).to_string(), native);
    }

    /// A non-UUID id maps to a stable value, and distinct ids map to distinct
    /// values.
    #[test]
    fn parse_or_derive_uuid_falls_back_to_v5() {
        let first = parse_or_derive_uuid("not-a-uuid");
        let second = parse_or_derive_uuid("not-a-uuid");
        let other = parse_or_derive_uuid("different");

        assert_eq!(first, second, "deterministic UUIDv5 derivation");
        assert_ne!(first, other);
    }

    /// Each `ContentPart` variant maps to the matching `MessagePart` variant.
    #[test]
    fn parts_to_message_parts_covers_all_variants() {
        let text_part = ContentPart::Text { text: "hi".into() };
        let code_part = ContentPart::Code {
            language: Some("rust".into()),
            text: "fn main() {}".into(),
        };
        let tool_part = ContentPart::Tool {
            name: "browser".into(),
            input: serde_json::json!({"url": "https://example.com"}),
            output: Some(serde_json::json!({"status": 200})),
        };

        let converted = parts_to_message_parts(&[text_part, code_part, tool_part]);

        assert_eq!(converted.len(), 3);
        assert!(matches!(
            &converted[0],
            dirigent_protocol::MessagePart::Text { .. }
        ));
        assert!(matches!(
            &converted[1],
            dirigent_protocol::MessagePart::Code { .. }
        ));
        assert!(matches!(
            &converted[2],
            dirigent_protocol::MessagePart::Tool { .. }
        ));
    }

    /// A message whose only part is whitespace text is dropped.
    #[test]
    fn empty_parsed_message_is_skipped() {
        let blank = text_message("system", "   ");
        assert!(convert_parsed_message(&blank).is_none());
    }

    /// A message with real text is converted, keeping role and content and
    /// leaving the session as the nil placeholder.
    #[test]
    fn non_empty_parsed_message_round_trips() {
        let hello = text_message("user", "hello");

        let record = convert_parsed_message(&hello).expect("should convert");

        assert_eq!(record.role, "user");
        assert_eq!(record.content_md, "hello");
        assert_eq!(record.session, Uuid::nil());
        assert!(record.content_parts.is_some());
    }
}
|
||||
Reference in New Issue
Block a user