dirigent/crates/dirigent_archivist/tests/multi_backend_writer_test.rs

#![cfg(feature = "test-utils")]

//! Integration tests for Task 17's per-backend queued writer task.
//!
//! These exercise the full enqueue → batch → coalesce → dispatch pipeline
//! end-to-end by constructing real writer tasks against `MockBackend`
//! instances and driving them through the `Archivist` coordinator.
//!
//! The tests are timing-sensitive: registrations built by `queued_reg` use a
//! 25ms batch window, while the backpressure test uses a 0ms window and
//! artificially slows the backend. Assertions use tolerant margins so they
//! survive CI jitter.

use std::sync::Arc;
use std::time::Duration;

use dirigent_archivist::backend::mock::MockBackend;
use dirigent_archivist::backend::{ArchiveBackend, HealthStatus};
use dirigent_archivist::coordinator::Archivist;
use dirigent_archivist::registry::writer::spawn_writer;
use dirigent_archivist::registry::{
    ArchiveRegistration, FailureMode, OverflowPolicy, WritePolicy,
};
use uuid::Uuid;

/// Builds a minimal `MessageRecord` whose `session` is `scroll`.
///
/// Each call generates a fresh v7 message id and stamps the record with the
/// current UTC time. The role is `"user"`, the markdown body is `"hi"`, and
/// every optional field is left empty.
fn sample_message(scroll: Uuid) -> dirigent_archivist::types::MessageRecord {
    use dirigent_archivist::types::MessageRecord;

    // Generated up front, id before timestamp, matching the field order below.
    let message_id = Uuid::now_v7();
    let ts = chrono::Utc::now();

    MessageRecord {
        version: 1,
        message_id,
        session: scroll,
        parent_id: None,
        ts,
        role: "user".into(),
        author: None,
        content_md: "hi".into(),
        content_parts: None,
        attachments: Vec::new(),
        metadata: serde_json::Value::Null,
    }
}

/// Builds a queued-write `ArchiveRegistration` named `name` around `backend`.
///
/// A writer task is spawned for the backend with a queue capacity of 8, a
/// 25ms batch window and the supplied `overflow` policy. The registration is
/// given the same health / last-error / consecutive-counter handles that were
/// passed to the writer, with health starting out as `Healthy`.
fn queued_reg(
    name: &str,
    backend: Arc<MockBackend>,
    priority: u32,
    overflow: OverflowPolicy,
) -> Arc<ArchiveRegistration> {
    // Single source for the queue size handed to both the writer and the policy.
    const QUEUE_CAPACITY: usize = 8;

    // State cells shared between the writer task and the registration.
    let health = Arc::new(tokio::sync::RwLock::new(HealthStatus::Healthy));
    let last_error = Arc::new(tokio::sync::RwLock::new(None));
    let consecutive = Arc::new(tokio::sync::RwLock::new(0u32));

    // The writer gets its own handle to the backend as a trait object.
    let writer_backend: Arc<dyn ArchiveBackend> = backend.clone();
    let writer = spawn_writer(
        writer_backend,
        name.into(),
        QUEUE_CAPACITY,
        Duration::from_millis(25),
        overflow,
        health.clone(),
        last_error.clone(),
        consecutive.clone(),
    );

    let registered_backend: Arc<dyn ArchiveBackend> = backend;
    let registration = ArchiveRegistration::new_with_shared_state(
        name.into(),
        "mock",
        registered_backend,
        true,
        FailureMode::Required,
        priority,
        true,
        WritePolicy::Queued {
            batch_window_ms: 25,
            capacity: QUEUE_CAPACITY,
            overflow,
        },
        Some(writer),
        health,
        last_error,
        consecutive,
    );

    Arc::new(registration)
}

/// Appends one message through a queued registration, then polls the mock
/// backend until the message has been recorded.
#[tokio::test]
async fn queued_write_returns_immediately_then_eventually_lands() {
    let mock = Arc::new(MockBackend::new());
    let registration = queued_reg("queued", mock.clone(), 0, OverflowPolicy::Block);
    let archivist = Archivist::from_registrations(vec![registration]);

    let scroll = Uuid::new_v4();
    let batch = vec![sample_message(scroll)];
    archivist
        .append_messages(scroll, batch, None)
        .await
        .unwrap();

    // Poll in 10ms steps, at most 50 times (~500ms), for the writer to drain.
    let mut polls_left = 50u32;
    let landed = loop {
        if polls_left == 0 {
            break false;
        }
        if mock.appended_count(scroll) > 0 {
            break true;
        }
        tokio::time::sleep(Duration::from_millis(10)).await;
        polls_left -= 1;
    };

    assert!(landed, "writer task did not drain within 500ms");
    assert_eq!(mock.appended_count(scroll), 1);

    archivist.shutdown().await.unwrap();
}

/// Enqueues five single-message appends for one scroll and checks that all
/// five messages land using at most five backend calls.
#[tokio::test]
async fn coalescing_merges_consecutive_appends_for_same_scroll() {
    let mock = Arc::new(MockBackend::new());
    let registration = queued_reg("queued", mock.clone(), 0, OverflowPolicy::Block);
    let archivist = Archivist::from_registrations(vec![registration]);

    let scroll = Uuid::new_v4();
    let mut enqueued = 0;
    while enqueued < 5 {
        let batch = vec![sample_message(scroll)];
        archivist
            .append_messages(scroll, batch, None)
            .await
            .unwrap();
        enqueued += 1;
    }

    // Let the writer drain and coalesce, then shut down so anything still
    // sitting in the queue is flushed before the assertions run.
    tokio::time::sleep(Duration::from_millis(200)).await;
    archivist.shutdown().await.unwrap();

    // How many ops get merged depends on timing, so the only invariant that
    // can be asserted reliably is an upper bound: the backend never sees more
    // `append_messages` invocations than the five ops that were enqueued.
    let backend_calls = mock.append_call_count(scroll);
    assert!(
        backend_calls <= 5,
        "expected <= 5 backend calls, got {}",
        backend_calls
    );

    let landed = mock.appended_count(scroll);
    assert_eq!(landed, 5, "all 5 messages should land");
}

/// Checks that `OverflowPolicy::Block` makes the sender wait when the queue
/// is full and the backend is slow.
#[tokio::test]
async fn overflow_block_applies_backpressure() {
    // Four ingredients are needed for the stall to be observable:
    //   1. A tight queue (capacity=2), so the channel really does fill up.
    //   2. A slow backend (50ms per op), so the writer sits in dispatch long
    //      enough for that to happen.
    //   3. A zero batch window, so the writer dispatches each op immediately
    //      and spends (almost) all of its time in the 50ms per-op sleep
    //      rather than draining quickly during batch collection.
    //   4. A distinct scroll ID per send, so same-scroll coalescing cannot
    //      fold the whole batch into one dispatch (and one 50ms sleep).
    // Together these force N serial 50ms dispatches; while the writer sleeps,
    // the sender cannot place its next op into the full channel and has to
    // wait for a drain.
    let mock = Arc::new(MockBackend::new());
    mock.set_per_op_delay(Duration::from_millis(50));

    let capacity = 2usize;
    let overflow = OverflowPolicy::Block;

    // State cells shared between the writer task and the registration.
    let health = Arc::new(tokio::sync::RwLock::new(HealthStatus::Healthy));
    let last_error = Arc::new(tokio::sync::RwLock::new(None));
    let consecutive = Arc::new(tokio::sync::RwLock::new(0u32));

    let writer_backend: Arc<dyn ArchiveBackend> = mock.clone();
    let writer = spawn_writer(
        writer_backend,
        "queued".into(),
        capacity,
        // Zero batch window, mirroring `batch_window_ms: 0` in the policy below.
        Duration::ZERO,
        overflow,
        health.clone(),
        last_error.clone(),
        consecutive.clone(),
    );

    let registered_backend: Arc<dyn ArchiveBackend> = mock.clone();
    let registration = ArchiveRegistration::new_with_shared_state(
        "queued".into(),
        "mock",
        registered_backend,
        true,
        FailureMode::Required,
        0,
        true,
        WritePolicy::Queued {
            batch_window_ms: 0,
            capacity,
            overflow,
        },
        Some(writer),
        health,
        last_error,
        consecutive,
    );

    let archivist = Archivist::from_registrations(vec![Arc::new(registration)]);

    // Prime the writer with a single op, then pause just long enough for it
    // to enter its first 50ms dispatch sleep. From then on the writer is not
    // receiving, so the capacity=2 channel fills and later sends have to wait
    // for a drain.
    let primer = Uuid::new_v4();
    archivist
        .append_messages(primer, vec![sample_message(primer)], None)
        .await
        .unwrap();
    tokio::time::sleep(Duration::from_millis(10)).await;

    // Time a burst of sends, each on its own scroll so nothing can be
    // coalesced. Every dispatch costs 50ms and the queue holds only two ops,
    // so the sender keeps having to wait for the writer.
    let started = std::time::Instant::now();
    let mut sent = 0;
    while sent < 24 {
        let scroll = Uuid::new_v4();
        let batch = vec![sample_message(scroll)];
        archivist
            .append_messages(scroll, batch, None)
            .await
            .unwrap();
        sent += 1;
    }
    let elapsed = started.elapsed();

    // A non-blocking run would finish in microseconds. With 24 sends, a
    // two-slot queue and 50ms per op, the sender must have waited on many
    // drain cycles; the 100ms floor keeps the check meaningful while staying
    // lenient about CI jitter.
    let floor = Duration::from_millis(100);
    assert!(
        elapsed >= floor,
        "block policy did not apply backpressure (elapsed: {:?})",
        elapsed
    );

    archivist.shutdown().await.unwrap();
}