sync from monorepo @ 2452e92e
This commit is contained in:
@@ -0,0 +1,214 @@
//! File storage example for dirigent_archivist
//!
//! This example demonstrates:
//! - Storing files with content-addressing
//! - Retrieving files by file_id
//! - Automatic deduplication of identical content
//! - Session tracking for file references
|
||||
|
||||
use dirigent_archivist::storage::{files, ndjson, paths::ArchivePaths};
|
||||
use dirigent_archivist::types::FileRecord;
|
||||
use dirigent_archivist::Result;
|
||||
use uuid::Uuid;
|
||||
|
||||
#[tokio::main]
|
||||
async fn main() -> Result<()> {
|
||||
// Create a temporary archive directory for this example
|
||||
let temp_dir = std::env::temp_dir().join(format!("dirigent_files_example_{}", Uuid::now_v7()));
|
||||
println!("Creating archive at: {}", temp_dir.display());
|
||||
|
||||
let paths = ArchivePaths::new(temp_dir.clone());
|
||||
|
||||
// Example 1: Store a file
|
||||
println!("\n--- Example 1: Store a File ---");
|
||||
let content1 = b"This is a sample document with some text content.";
|
||||
let session1 = Uuid::now_v7();
|
||||
|
||||
let file_id1 = files::store_file(
|
||||
&paths,
|
||||
content1,
|
||||
"document.txt".to_string(),
|
||||
Some("text/plain".to_string()),
|
||||
session1,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Stored file with ID: {}", file_id1);
|
||||
println!("Session: {}", session1);
|
||||
|
||||
// Example 2: Retrieve the file
|
||||
println!("\n--- Example 2: Retrieve the File ---");
|
||||
let retrieved1 = files::get_file(&paths, &file_id1).await?;
|
||||
println!("Retrieved {} bytes", retrieved1.len());
|
||||
println!("Content: {}", String::from_utf8_lossy(&retrieved1));
|
||||
|
||||
// Example 3: Store the same content from a different session (deduplication)
|
||||
println!("\n--- Example 3: Deduplication Demo ---");
|
||||
let session2 = Uuid::now_v7();
|
||||
|
||||
let file_id2 = files::store_file(
|
||||
&paths,
|
||||
content1, // Same content as before
|
||||
"duplicate.txt".to_string(), // Different name
|
||||
Some("text/plain".to_string()),
|
||||
session2,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Stored same content with different name");
|
||||
println!("File ID 1: {}", file_id1);
|
||||
println!("File ID 2: {}", file_id2);
|
||||
println!("Same file_id? {}", file_id1 == file_id2);
|
||||
println!("\nDeduplication: Same content produces same file_id, stored only once!");
|
||||
|
||||
// Example 4: Check the file index
|
||||
println!("\n--- Example 4: File Index ---");
|
||||
let index_path = paths.root().join(".files").join("file_index.jsonl");
|
||||
let records: Vec<FileRecord> = ndjson::read_ndjson(&index_path).await?;
|
||||
|
||||
println!("File index contains {} record(s)", records.len());
|
||||
for record in &records {
|
||||
println!("\nFile: {}", record.file_id);
|
||||
println!(" Original name: {}", record.original_name);
|
||||
println!(" MIME type: {:?}", record.mime);
|
||||
println!(" Size: {} bytes", record.size);
|
||||
println!(" Referenced by {} session(s):", record.sessions.len());
|
||||
for session_id in &record.sessions {
|
||||
println!(" - {}", session_id);
|
||||
}
|
||||
}
|
||||
|
||||
// Example 5: Store different content
|
||||
println!("\n--- Example 5: Store Different Content ---");
|
||||
let content2 = b"This is completely different content with more data!";
|
||||
let session3 = Uuid::now_v7();
|
||||
|
||||
let file_id3 = files::store_file(
|
||||
&paths,
|
||||
content2,
|
||||
"different.txt".to_string(),
|
||||
Some("text/plain".to_string()),
|
||||
session3,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Stored different content");
|
||||
println!("File ID 3: {}", file_id3);
|
||||
println!("Different from file_id1? {}", file_id1 != file_id3);
|
||||
|
||||
// Example 6: Store binary content
|
||||
println!("\n--- Example 6: Binary Content ---");
|
||||
let binary_content: Vec<u8> = (0..256).map(|i| i as u8).collect();
|
||||
let session4 = Uuid::now_v7();
|
||||
|
||||
let file_id4 = files::store_file(
|
||||
&paths,
|
||||
&binary_content,
|
||||
"binary.dat".to_string(),
|
||||
Some("application/octet-stream".to_string()),
|
||||
session4,
|
||||
)
|
||||
.await?;
|
||||
|
||||
println!("Stored binary content (256 bytes)");
|
||||
println!("File ID: {}", file_id4);
|
||||
|
||||
// Retrieve and verify
|
||||
let retrieved_binary = files::get_file(&paths, &file_id4).await?;
|
||||
println!("Retrieved {} bytes", retrieved_binary.len());
|
||||
println!(
|
||||
"Binary content verified: {}",
|
||||
retrieved_binary == binary_content
|
||||
);
|
||||
|
||||
// Example 7: Show final archive structure
|
||||
println!("\n--- Example 7: Archive Structure ---");
|
||||
println!("Archive root: {}", temp_dir.display());
|
||||
show_files_directory(&paths)?;
|
||||
|
||||
// Example 8: Final statistics
|
||||
println!("\n--- Final Statistics ---");
|
||||
let final_records: Vec<FileRecord> = ndjson::read_ndjson(&index_path).await?;
|
||||
println!("Total unique files stored: {}", final_records.len());
|
||||
|
||||
let total_sessions: usize = final_records.iter().map(|r| r.sessions.len()).sum();
|
||||
println!("Total session references: {}", total_sessions);
|
||||
|
||||
let total_size: u64 = final_records.iter().map(|r| r.size).sum();
|
||||
println!("Total storage used: {} bytes", total_size);
|
||||
|
||||
// Content-addressing means if we had stored content1 1000 times,
|
||||
// we'd still only use storage for it once!
|
||||
println!("\nContent-addressing benefit:");
|
||||
println!(" File '{}' is referenced by {} sessions", file_id1, 2);
|
||||
println!(" But stored only once on disk!");
|
||||
|
||||
// Cleanup
|
||||
println!("\n--- Cleanup ---");
|
||||
std::fs::remove_dir_all(&temp_dir)?;
|
||||
println!("Removed temporary archive");
|
||||
|
||||
println!("\nExample completed successfully!");
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Helper function to show .files directory structure
|
||||
fn show_files_directory(paths: &ArchivePaths) -> Result<()> {
|
||||
let files_dir = paths.root().join(".files");
|
||||
|
||||
if !files_dir.exists() {
|
||||
println!("No files directory found");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
println!("\n.files/ directory:");
|
||||
|
||||
// Show index file
|
||||
let index_path = files_dir.join("file_index.jsonl");
|
||||
if index_path.exists() {
|
||||
let metadata = std::fs::metadata(&index_path)?;
|
||||
println!(" file_index.jsonl ({} bytes)", metadata.len());
|
||||
}
|
||||
|
||||
// Show shard directories
|
||||
for entry in std::fs::read_dir(&files_dir)? {
|
||||
let entry = entry?;
|
||||
let path = entry.path();
|
||||
|
||||
if path.is_dir() {
|
||||
println!(" {}/", path.file_name().unwrap().to_string_lossy());
|
||||
|
||||
// Show files in shard
|
||||
for sub_entry in std::fs::read_dir(&path)? {
|
||||
let sub_entry = sub_entry?;
|
||||
let sub_path = sub_entry.path();
|
||||
|
||||
if sub_path.is_dir() {
|
||||
println!(" {}/", sub_path.file_name().unwrap().to_string_lossy());
|
||||
|
||||
// Show files in sub-shard
|
||||
for file_entry in std::fs::read_dir(&sub_path)? {
|
||||
let file_entry = file_entry?;
|
||||
let file_path = file_entry.path();
|
||||
let metadata = std::fs::metadata(&file_path)?;
|
||||
println!(
|
||||
" {} ({} bytes)",
|
||||
file_path.file_name().unwrap().to_string_lossy(),
|
||||
metadata.len()
|
||||
);
|
||||
}
|
||||
} else {
|
||||
let metadata = std::fs::metadata(&sub_path)?;
|
||||
println!(
|
||||
" {} ({} bytes)",
|
||||
sub_path.file_name().unwrap().to_string_lossy(),
|
||||
metadata.len()
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
Reference in New Issue
Block a user