sync from monorepo @ 2452e92e
This commit is contained in:
@@ -0,0 +1,477 @@
|
||||
//! SharingBus: single-producer, many-subscriber event multiplexer with
|
||||
//! subscriber-side filtering performed by a worker task. See
|
||||
//! docs/plans/2026-04-21-archivist-phase4-design.md §1.
|
||||
//!
|
||||
//! Architecture:
|
||||
//! - One internal `tokio::sync::broadcast::Sender<BusEvent>` feeds a single
|
||||
//! worker task. The worker iterates `Vec<SubscriberSlot>` (behind `RwLock`),
|
||||
//! filter-matches each slot, and `try_send`s the event onto each slot's
|
||||
//! `mpsc::Sender<BusEvent>`.
|
||||
//! - Slow subscribers drop their own events at their mpsc (counted in the
|
||||
//! slot's `lagged` atomic). The bus-internal broadcast channel never drops
|
||||
//! due to a slow subscriber — only due to the broadcast lag contract, which
|
||||
//! we log and continue.
|
||||
//! - `SessionRegistered` events late-bind `(connector_id, native_session_id) ->
|
||||
//! scroll_id` via a small cache consulted on every publish.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use tokio::sync::{broadcast, mpsc, RwLock};
|
||||
use tokio::task::JoinHandle;
|
||||
use tracing::{debug, warn};
|
||||
use uuid::Uuid;
|
||||
|
||||
use dirigent_protocol::streaming::{BusEvent, EventFilter};
|
||||
pub use dirigent_protocol::streaming::BusReceiver;
|
||||
use dirigent_protocol::Event;
|
||||
|
||||
const BUS_INTERNAL_CAPACITY: usize = 1024;
|
||||
const SUBSCRIBER_QUEUE_DEFAULT: usize = 256;
|
||||
|
||||
/// Single-producer, many-subscriber event multiplexer.
|
||||
///
|
||||
/// Subscribers see a `mpsc::Receiver<BusEvent>` that only yields events
|
||||
/// matching their `EventFilter`. Filtering happens inside a single worker
|
||||
/// task, so the cost per event is O(n_subscribers) regardless of publisher
|
||||
/// count. Slow subscribers lose events at their own mpsc, not at the bus.
|
||||
pub struct SharingBus {
|
||||
publish_tx: broadcast::Sender<BusEvent>,
|
||||
subscribers: Arc<RwLock<Vec<SubscriberSlot>>>,
|
||||
scroll_id_cache: Arc<RwLock<HashMap<(String, String), Uuid>>>,
|
||||
next_id: Arc<AtomicU64>,
|
||||
_worker: JoinHandle<()>,
|
||||
}
|
||||
|
||||
struct SubscriberSlot {
|
||||
id: u64,
|
||||
filter: EventFilter,
|
||||
sender: mpsc::Sender<BusEvent>,
|
||||
lagged: Arc<AtomicU64>,
|
||||
}
|
||||
|
||||
impl SharingBus {
|
||||
/// Construct a new bus and spawn its dispatch worker.
|
||||
pub fn new() -> Arc<Self> {
|
||||
let (publish_tx, publish_rx) = broadcast::channel(BUS_INTERNAL_CAPACITY);
|
||||
let subscribers: Arc<RwLock<Vec<SubscriberSlot>>> = Arc::new(RwLock::new(Vec::new()));
|
||||
let scroll_id_cache: Arc<RwLock<HashMap<(String, String), Uuid>>> =
|
||||
Arc::new(RwLock::new(HashMap::new()));
|
||||
let next_id = Arc::new(AtomicU64::new(0));
|
||||
|
||||
let worker = tokio::spawn(run_worker(publish_rx, Arc::clone(&subscribers)));
|
||||
|
||||
Arc::new(Self {
|
||||
publish_tx,
|
||||
subscribers,
|
||||
scroll_id_cache,
|
||||
next_id,
|
||||
_worker: worker,
|
||||
})
|
||||
}
|
||||
|
||||
/// Publish a `BusEvent` to all matching subscribers.
|
||||
///
|
||||
/// This method also performs two side-effects on the scroll-id cache:
|
||||
///
|
||||
/// 1. If the wrapped event is `Event::SessionRegistered`, the binding
|
||||
/// `(connector_id, session_id) -> scroll_id` is inserted into the
|
||||
/// cache, and the current event's `routing.scroll_id` is set so the
|
||||
/// binding event itself carries its own scroll_id downstream.
|
||||
/// 2. If the event's `routing.scroll_id` is absent but it carries both a
|
||||
/// `connector_id` and `native_session_id`, the cache is consulted to
|
||||
/// late-bind `scroll_id` before broadcasting.
|
||||
pub async fn publish(&self, mut bus_event: BusEvent) {
|
||||
// (2) Late-bind scroll_id from cache if we can, BEFORE the possibly
|
||||
// more specific (1) handling overrides it. This is a no-op for
|
||||
// SessionRegistered (its scroll_id is always populated in (1)).
|
||||
if bus_event.routing.scroll_id.is_none() {
|
||||
if let (Some(cid), Some(nsid)) = (
|
||||
bus_event.routing.connector_id.as_ref(),
|
||||
bus_event.routing.native_session_id.as_ref(),
|
||||
) {
|
||||
let cache = self.scroll_id_cache.read().await;
|
||||
if let Some(uuid) = cache.get(&(cid.clone(), nsid.clone())) {
|
||||
bus_event.routing.scroll_id = Some(*uuid);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// (1) If the wrapped event is SessionRegistered, populate the cache
|
||||
// and set scroll_id on the event itself.
|
||||
if let Event::SessionRegistered {
|
||||
connector_id,
|
||||
session_id,
|
||||
scroll_id,
|
||||
} = bus_event.event.as_ref()
|
||||
{
|
||||
match Uuid::parse_str(scroll_id) {
|
||||
Ok(uuid) => {
|
||||
self.scroll_id_cache
|
||||
.write()
|
||||
.await
|
||||
.insert((connector_id.clone(), session_id.clone()), uuid);
|
||||
bus_event.routing.scroll_id = Some(uuid);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
connector_id = %connector_id,
|
||||
session_id = %session_id,
|
||||
scroll_id = %scroll_id,
|
||||
error = %e,
|
||||
"SessionRegistered carried an unparseable scroll_id; skipping late-bind cache insert",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// No subscribers is not an error — ignore the Result.
|
||||
let _ = self.publish_tx.send(bus_event);
|
||||
}
|
||||
|
||||
/// Subscribe to every event on the bus.
|
||||
pub async fn subscribe_all(&self) -> BusReceiver {
|
||||
self.subscribe_filtered(EventFilter::All, SUBSCRIBER_QUEUE_DEFAULT)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Subscribe to events that match `filter`. `queue_capacity` caps the
|
||||
/// buffered events between the worker and the caller's `recv()`.
|
||||
pub async fn subscribe_filtered(
|
||||
&self,
|
||||
filter: EventFilter,
|
||||
queue_capacity: usize,
|
||||
) -> BusReceiver {
|
||||
let (tx, rx) = mpsc::channel(queue_capacity);
|
||||
let lagged = Arc::new(AtomicU64::new(0));
|
||||
// Relaxed ordering is sufficient: subscriber IDs are only compared for
|
||||
// equality with other IDs issued by this same bus; there is no
|
||||
// cross-thread ordering dependency on this counter.
|
||||
let id = self.next_id.fetch_add(1, Ordering::Relaxed);
|
||||
self.subscribers.write().await.push(SubscriberSlot {
|
||||
id,
|
||||
filter,
|
||||
sender: tx,
|
||||
lagged: Arc::clone(&lagged),
|
||||
});
|
||||
BusReceiver { id, rx, lagged }
|
||||
}
|
||||
|
||||
/// Remove a subscriber by id. Idempotent.
|
||||
pub async fn unsubscribe(&self, id: u64) {
|
||||
self.subscribers.write().await.retain(|s| s.id != id);
|
||||
}
|
||||
}
|
||||
|
||||
async fn run_worker(
|
||||
mut rx: broadcast::Receiver<BusEvent>,
|
||||
subscribers: Arc<RwLock<Vec<SubscriberSlot>>>,
|
||||
) {
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(evt) => {
|
||||
let mut closed_ids: Vec<u64> = Vec::new();
|
||||
{
|
||||
let subs = subscribers.read().await;
|
||||
for slot in subs.iter() {
|
||||
if !slot.filter.matches(&evt) {
|
||||
continue;
|
||||
}
|
||||
match slot.sender.try_send(evt.clone()) {
|
||||
Ok(()) => {}
|
||||
Err(mpsc::error::TrySendError::Full(_)) => {
|
||||
slot.lagged.fetch_add(1, Ordering::Relaxed);
|
||||
warn!(
|
||||
subscriber_id = slot.id,
|
||||
"bus subscriber queue full; dropping event"
|
||||
);
|
||||
}
|
||||
Err(mpsc::error::TrySendError::Closed(_)) => {
|
||||
closed_ids.push(slot.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if !closed_ids.is_empty() {
|
||||
subscribers
|
||||
.write()
|
||||
.await
|
||||
.retain(|s| !closed_ids.contains(&s.id));
|
||||
debug!(removed = closed_ids.len(), "GC'd closed subscriber slots");
|
||||
}
|
||||
}
|
||||
Err(broadcast::error::RecvError::Lagged(n)) => {
|
||||
warn!(skipped = n, "SharingBus internal broadcast lagged");
|
||||
}
|
||||
Err(broadcast::error::RecvError::Closed) => {
|
||||
debug!("SharingBus worker exiting (sender closed)");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::time::Duration;
|
||||
|
||||
use tokio::time::timeout;
|
||||
use uuid::Uuid;
|
||||
|
||||
use super::*;
|
||||
use dirigent_protocol::streaming::{BusEvent, EventKind, EventOrigin, EventRouting};
|
||||
use dirigent_protocol::Event;
|
||||
|
||||
/// Build a minimal `BusEvent` for tests. Uses `Event::Connected` as payload
|
||||
/// unless a specific event is needed for late-bind checks.
|
||||
fn make_event(
|
||||
scroll_id: Option<Uuid>,
|
||||
connector_uid: Option<Uuid>,
|
||||
connector_id: Option<String>,
|
||||
native_session_id: Option<String>,
|
||||
kind: EventKind,
|
||||
event: Event,
|
||||
) -> BusEvent {
|
||||
BusEvent {
|
||||
routing: EventRouting {
|
||||
scroll_id,
|
||||
connector_uid,
|
||||
connector_id,
|
||||
native_session_id,
|
||||
kind,
|
||||
},
|
||||
origin: EventOrigin::Runtime,
|
||||
event: Arc::new(event),
|
||||
}
|
||||
}
|
||||
|
||||
// 1. subscribe_all + publish: one event round-trips to receiver.
|
||||
#[tokio::test]
|
||||
async fn subscribe_all_receives_published_event() {
|
||||
let bus = SharingBus::new();
|
||||
let mut recv = bus.subscribe_all().await;
|
||||
|
||||
let ev = make_event(
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
EventKind::System,
|
||||
Event::Connected,
|
||||
);
|
||||
bus.publish(ev).await;
|
||||
|
||||
let got = timeout(Duration::from_millis(200), recv.rx.recv())
|
||||
.await
|
||||
.expect("timed out waiting for event")
|
||||
.expect("channel closed unexpectedly");
|
||||
|
||||
match got.event.as_ref() {
|
||||
Event::Connected => {}
|
||||
other => panic!("expected Event::Connected, got {:?}", other),
|
||||
}
|
||||
}
|
||||
|
||||
// 2. ConnectorUid filter: matching UID passes, other UID skipped.
|
||||
#[tokio::test]
|
||||
async fn connector_uid_filter_only_forwards_matching_events() {
|
||||
let bus = SharingBus::new();
|
||||
let target = Uuid::new_v4();
|
||||
let other = Uuid::new_v4();
|
||||
|
||||
let mut recv = bus
|
||||
.subscribe_filtered(EventFilter::ConnectorUid(target), 16)
|
||||
.await;
|
||||
|
||||
// Publish one matching and one non-matching event.
|
||||
let ev_match = make_event(
|
||||
None,
|
||||
Some(target),
|
||||
None,
|
||||
None,
|
||||
EventKind::System,
|
||||
Event::Connected,
|
||||
);
|
||||
let ev_other = make_event(
|
||||
None,
|
||||
Some(other),
|
||||
None,
|
||||
None,
|
||||
EventKind::System,
|
||||
Event::Connected,
|
||||
);
|
||||
bus.publish(ev_match).await;
|
||||
bus.publish(ev_other).await;
|
||||
|
||||
// First recv returns the matching event.
|
||||
let got = timeout(Duration::from_millis(200), recv.rx.recv())
|
||||
.await
|
||||
.expect("timed out waiting for first event")
|
||||
.expect("channel closed unexpectedly");
|
||||
assert_eq!(got.routing.connector_uid, Some(target));
|
||||
|
||||
// Second recv must time out — no other matching event was published.
|
||||
let result = timeout(Duration::from_millis(100), recv.rx.recv()).await;
|
||||
assert!(
|
||||
result.is_err(),
|
||||
"expected no further events, got: {:?}",
|
||||
result.ok().flatten().map(|e| e.routing.connector_uid)
|
||||
);
|
||||
}
|
||||
|
||||
// 3. Queue full = lagged counter increments, first event still delivered.
|
||||
#[tokio::test]
|
||||
async fn full_queue_increments_lagged_counter() {
|
||||
let bus = SharingBus::new();
|
||||
// Capacity 1 — only one event can be buffered before try_send fails.
|
||||
let mut recv = bus.subscribe_filtered(EventFilter::All, 1).await;
|
||||
|
||||
// Publish 5 events without draining.
|
||||
for _ in 0..5 {
|
||||
let ev = make_event(
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
EventKind::System,
|
||||
Event::Connected,
|
||||
);
|
||||
bus.publish(ev).await;
|
||||
}
|
||||
|
||||
// Give the worker a chance to process all 5.
|
||||
for _ in 0..10 {
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(20)).await;
|
||||
|
||||
// First event is still in the queue.
|
||||
let first = timeout(Duration::from_millis(200), recv.rx.recv())
|
||||
.await
|
||||
.expect("timed out waiting for first event")
|
||||
.expect("channel closed unexpectedly");
|
||||
match first.event.as_ref() {
|
||||
Event::Connected => {}
|
||||
other => panic!("expected Event::Connected, got {:?}", other),
|
||||
}
|
||||
|
||||
// At minimum 4 events were dropped (5 published, 1 fit).
|
||||
let lagged = recv.lagged.load(Ordering::Relaxed);
|
||||
assert!(
|
||||
lagged >= 4,
|
||||
"expected lagged >= 4 after publishing 5 events to a capacity-1 queue, got {}",
|
||||
lagged
|
||||
);
|
||||
}
|
||||
|
||||
// 4. scroll_id late-bind: SessionRegistered populates cache; subsequent
|
||||
// events with matching (connector_id, native_session_id) get their
|
||||
// scroll_id filled in before dispatch.
|
||||
#[tokio::test]
|
||||
async fn session_registered_populates_cache_and_late_binds_subsequent_events() {
|
||||
let bus = SharingBus::new();
|
||||
let scroll = Uuid::new_v4();
|
||||
|
||||
// Subscriber filters on ScrollId(scroll). It should see:
|
||||
// - the SessionRegistered event (bus sets its own scroll_id at publish)
|
||||
// - a follow-up event with (connector_id="c", native_session_id="s")
|
||||
// that had no scroll_id on entry (late-bound from the cache).
|
||||
let mut recv = bus
|
||||
.subscribe_filtered(EventFilter::ScrollId(scroll), 16)
|
||||
.await;
|
||||
|
||||
// --- publish SessionRegistered (binding event) ---
|
||||
let reg_event = Event::SessionRegistered {
|
||||
connector_id: "c".to_string(),
|
||||
session_id: "s".to_string(),
|
||||
scroll_id: scroll.to_string(),
|
||||
};
|
||||
// We pass through the routing fields the producer would populate.
|
||||
// `scroll_id` starts as None; publish() sets it from the event payload.
|
||||
let reg_bus = make_event(
|
||||
None,
|
||||
None,
|
||||
Some("c".to_string()),
|
||||
Some("s".to_string()),
|
||||
EventKind::SessionLifecycle,
|
||||
reg_event,
|
||||
);
|
||||
bus.publish(reg_bus).await;
|
||||
|
||||
let got1 = timeout(Duration::from_millis(200), recv.rx.recv())
|
||||
.await
|
||||
.expect("timed out waiting for SessionRegistered")
|
||||
.expect("channel closed unexpectedly");
|
||||
assert!(matches!(
|
||||
got1.event.as_ref(),
|
||||
Event::SessionRegistered { .. }
|
||||
));
|
||||
assert_eq!(got1.routing.scroll_id, Some(scroll));
|
||||
|
||||
// --- publish a follow-up event with no scroll_id but matching
|
||||
// connector_id + native_session_id ---
|
||||
let follow_up = make_event(
|
||||
None,
|
||||
None,
|
||||
Some("c".to_string()),
|
||||
Some("s".to_string()),
|
||||
EventKind::System,
|
||||
Event::Connected,
|
||||
);
|
||||
bus.publish(follow_up).await;
|
||||
|
||||
let got2 = timeout(Duration::from_millis(200), recv.rx.recv())
|
||||
.await
|
||||
.expect("timed out waiting for late-bound follow-up")
|
||||
.expect("channel closed unexpectedly");
|
||||
assert_eq!(
|
||||
got2.routing.scroll_id,
|
||||
Some(scroll),
|
||||
"follow-up event should have had scroll_id late-bound from the cache"
|
||||
);
|
||||
assert!(matches!(got2.event.as_ref(), Event::Connected));
|
||||
}
|
||||
|
||||
// 5. Dropped receiver is GC'd after the next publish.
|
||||
#[tokio::test]
|
||||
async fn closed_receiver_slot_is_reaped_on_next_publish() {
|
||||
let bus = SharingBus::new();
|
||||
|
||||
// Subscribe, then immediately drop the receiver — simulates a caller
|
||||
// that forgets (or skips) `unsubscribe()`.
|
||||
let recv = bus.subscribe_all().await;
|
||||
drop(recv);
|
||||
|
||||
// Sanity check: slot is present before GC.
|
||||
assert_eq!(bus.subscribers.read().await.len(), 1);
|
||||
|
||||
// Publish one event; the worker encounters TrySendError::Closed and
|
||||
// schedules the slot for removal.
|
||||
let ev = make_event(
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
EventKind::System,
|
||||
Event::Connected,
|
||||
);
|
||||
bus.publish(ev).await;
|
||||
|
||||
// Give the worker a moment to process and GC.
|
||||
for _ in 0..10 {
|
||||
tokio::task::yield_now().await;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(10)).await;
|
||||
|
||||
assert_eq!(
|
||||
bus.subscribers.read().await.len(),
|
||||
0,
|
||||
"closed subscriber slot should have been GC'd after publish"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,101 @@
|
||||
//! `[[streams]]` TOML config block parsed from `dirigent.toml`.
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
use dirigent_protocol::streaming::StreamScope;
|
||||
|
||||
#[derive(Debug, Clone, Default, Deserialize, Serialize)]
|
||||
pub struct StreamsConfig {
|
||||
#[serde(default, rename = "streams")]
|
||||
pub entries: Vec<StreamConfig>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Deserialize, Serialize)]
|
||||
pub struct StreamConfig {
|
||||
pub name: String,
|
||||
#[serde(rename = "type")]
|
||||
pub kind: String, // "matrix" | "langfuse" | ...
|
||||
pub scope: StreamScope,
|
||||
#[serde(default = "default_enabled")]
|
||||
pub enabled: bool,
|
||||
#[serde(default = "default_params")]
|
||||
pub params: toml::Value, // type-specific
|
||||
}
|
||||
|
||||
fn default_enabled() -> bool { true }
|
||||
|
||||
fn default_params() -> toml::Value { toml::Value::Table(toml::map::Map::new()) }
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use uuid::Uuid;
|
||||
|
||||
const FULL_TOML: &str = r#"
|
||||
[[streams]]
|
||||
name = "matrix-main"
|
||||
type = "matrix"
|
||||
enabled = true
|
||||
|
||||
[streams.scope]
|
||||
kind = "session"
|
||||
scroll_id = "01985d00-0000-7000-8000-000000000000"
|
||||
|
||||
[streams.params]
|
||||
homeserver_url = "https://matrix.org"
|
||||
room_id = "!abc:matrix.org"
|
||||
"#;
|
||||
|
||||
const MINIMAL_TOML: &str = r#"
|
||||
[[streams]]
|
||||
name = "minimal"
|
||||
type = "langfuse"
|
||||
|
||||
[streams.scope]
|
||||
kind = "archive_wide"
|
||||
acknowledged = false
|
||||
"#;
|
||||
|
||||
#[test]
|
||||
fn round_trip_full() {
|
||||
let cfg: StreamsConfig = toml::from_str(FULL_TOML).expect("parse failed");
|
||||
|
||||
assert_eq!(cfg.entries.len(), 1);
|
||||
let entry = &cfg.entries[0];
|
||||
assert_eq!(entry.name, "matrix-main");
|
||||
assert_eq!(entry.kind, "matrix");
|
||||
assert!(entry.enabled);
|
||||
|
||||
let expected_id = Uuid::parse_str("01985d00-0000-7000-8000-000000000000").unwrap();
|
||||
match &entry.scope {
|
||||
StreamScope::Session { scroll_id } => {
|
||||
assert_eq!(*scroll_id, expected_id);
|
||||
}
|
||||
other => panic!("expected Session scope, got {:?}", other),
|
||||
}
|
||||
|
||||
let params = entry.params.as_table().expect("params should be a table");
|
||||
assert_eq!(
|
||||
params.get("homeserver_url").and_then(|v| v.as_str()),
|
||||
Some("https://matrix.org")
|
||||
);
|
||||
assert_eq!(
|
||||
params.get("room_id").and_then(|v| v.as_str()),
|
||||
Some("!abc:matrix.org")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn default_enabled_when_omitted() {
|
||||
let cfg: StreamsConfig = toml::from_str(MINIMAL_TOML).expect("parse failed");
|
||||
assert_eq!(cfg.entries.len(), 1);
|
||||
let entry = &cfg.entries[0];
|
||||
assert_eq!(entry.name, "minimal");
|
||||
assert!(entry.enabled, "enabled should default to true");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn empty_config_is_valid() {
|
||||
let cfg: StreamsConfig = toml::from_str("").expect("empty parse failed");
|
||||
assert!(cfg.entries.is_empty());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,143 @@
|
||||
//! Maps a `StreamConfig`'s `type` string to a concrete [`SessionStream`].
|
||||
//!
|
||||
//! Stream implementations register themselves with a
|
||||
//! [`StreamFactoryRegistry`] at boot so the runtime can construct streams
|
||||
//! from `[[streams]]` config blocks without knowing every concrete type up
|
||||
//! front. This is the stream-side analogue of the archivist's
|
||||
//! `BackendRegistry` / `BackendFactory` pattern (Phase 3).
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use thiserror::Error;
|
||||
|
||||
use dirigent_protocol::streaming::SessionStream;
|
||||
|
||||
use super::config::StreamConfig;
|
||||
|
||||
/// Builds a concrete [`SessionStream`] from a `StreamConfig`.
|
||||
///
|
||||
/// Each implementation advertises a single `kind` (matching the `type`
|
||||
/// string in the TOML config). The registry routes config blocks to the
|
||||
/// matching factory at boot.
|
||||
#[async_trait]
|
||||
pub trait StreamFactory: Send + Sync {
|
||||
/// The `type` discriminator in `[[streams]]` — e.g. `"matrix"`,
|
||||
/// `"langfuse"`. Must be unique across all registered factories.
|
||||
fn kind(&self) -> &'static str;
|
||||
|
||||
/// Build a running stream from its config. Implementations are
|
||||
/// expected to read `cfg.params` (type-specific TOML table), establish
|
||||
/// any transport, and return an [`Arc<dyn SessionStream>`] ready to
|
||||
/// receive events.
|
||||
async fn build(&self, cfg: &StreamConfig) -> Result<Arc<dyn SessionStream>, StreamBuildError>;
|
||||
}
|
||||
|
||||
/// Lookup table of `StreamFactory` implementations keyed by `kind()`.
|
||||
///
|
||||
/// Populate this at startup (typically once, on the main thread) before
|
||||
/// handing it off to the runtime. The registry is cheap to clone via
|
||||
/// `Arc` at the call site.
|
||||
#[derive(Default)]
|
||||
pub struct StreamFactoryRegistry {
|
||||
factories: HashMap<&'static str, Arc<dyn StreamFactory>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for StreamFactoryRegistry {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("StreamFactoryRegistry")
|
||||
.field("kinds", &self.factories.keys().collect::<Vec<_>>())
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl StreamFactoryRegistry {
|
||||
/// Construct an empty registry.
|
||||
pub fn new() -> Self {
|
||||
Self::default()
|
||||
}
|
||||
|
||||
/// Register a factory. Consumes and returns `self` so callers can
|
||||
/// chain `.register(...)` calls at startup.
|
||||
pub fn register<F: StreamFactory + 'static>(mut self, f: F) -> Self {
|
||||
self.factories.insert(f.kind(), Arc::new(f));
|
||||
self
|
||||
}
|
||||
|
||||
/// Look up a factory by its `kind` discriminator, or `None` if no
|
||||
/// factory for that kind has been registered.
|
||||
pub fn get(&self, kind: &str) -> Option<&Arc<dyn StreamFactory>> {
|
||||
self.factories.get(kind)
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors raised while building a `SessionStream` from its config.
|
||||
#[derive(Debug, Error)]
|
||||
pub enum StreamBuildError {
|
||||
/// No factory is registered for `cfg.kind`.
|
||||
#[error("unknown kind: {0}")]
|
||||
UnknownKind(String),
|
||||
/// Config was structurally valid TOML but semantically invalid
|
||||
/// (missing required field, enum out of range, etc).
|
||||
#[error("config: {0}")]
|
||||
Config(String),
|
||||
/// The factory accepted the config but the transport refused to come
|
||||
/// up (network error, auth failure, …).
|
||||
#[error("transport: {0}")]
|
||||
Transport(String),
|
||||
}
|
||||
|
||||
// ─── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use async_trait::async_trait;
|
||||
use dirigent_protocol::streaming::{
|
||||
BusEvent, SessionStream, StreamKind, StreamOutcome, StreamScope, StreamSummary,
|
||||
};
|
||||
|
||||
struct DummyStream;
|
||||
|
||||
#[async_trait]
|
||||
impl SessionStream for DummyStream {
|
||||
fn summary(&self) -> StreamSummary {
|
||||
StreamSummary {
|
||||
name: "dummy".to_string(),
|
||||
kind: StreamKind::Custom,
|
||||
target: "dummy".to_string(),
|
||||
active_since: chrono::Utc::now(),
|
||||
}
|
||||
}
|
||||
fn scope(&self) -> StreamScope {
|
||||
StreamScope::ArchiveWide { acknowledged: false }
|
||||
}
|
||||
async fn on_event(&self, _event: &BusEvent) -> StreamOutcome {
|
||||
StreamOutcome::Ok
|
||||
}
|
||||
async fn shutdown(&self) {}
|
||||
}
|
||||
|
||||
struct DummyFactory;
|
||||
|
||||
#[async_trait]
|
||||
impl StreamFactory for DummyFactory {
|
||||
fn kind(&self) -> &'static str {
|
||||
"dummy"
|
||||
}
|
||||
async fn build(
|
||||
&self,
|
||||
_cfg: &StreamConfig,
|
||||
) -> Result<Arc<dyn SessionStream>, StreamBuildError> {
|
||||
Ok(Arc::new(DummyStream))
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn register_and_lookup() {
|
||||
let reg = StreamFactoryRegistry::new().register(DummyFactory);
|
||||
assert!(reg.get("dummy").is_some());
|
||||
assert!(reg.get("missing").is_none());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,118 @@
|
||||
//! Consecutive-failure health drift for streams (K=5 threshold).
|
||||
//!
|
||||
//! Mirrors the archivist's drift logic but tracks a single stream's
|
||||
//! outcomes. Re-exports the shared `HealthStatus` enum from the
|
||||
//! archivist's backend module to avoid duplication.
|
||||
|
||||
pub use dirigent_archivist::backend::HealthStatus;
|
||||
|
||||
/// Number of consecutive failures before a stream drifts from `Degraded`
|
||||
/// to `Unavailable`. Matches the archivist's backend drift threshold.
|
||||
pub const FAILURE_THRESHOLD: u32 = 5;
|
||||
|
||||
/// Update health state after a successful event delivery.
|
||||
/// Resets consecutive-failure counter; lifts Degraded → Healthy.
|
||||
pub fn record_success(health: &mut HealthStatus, consecutive_failures: &mut u32) {
|
||||
*consecutive_failures = 0;
|
||||
if matches!(health, HealthStatus::Degraded { .. }) {
|
||||
*health = HealthStatus::Healthy;
|
||||
}
|
||||
}
|
||||
|
||||
/// Update health state after a failed event delivery.
|
||||
/// Increments counter; drifts Healthy → Degraded → Unavailable at K=5.
|
||||
pub fn record_failure(
|
||||
health: &mut HealthStatus,
|
||||
consecutive_failures: &mut u32,
|
||||
reason: String,
|
||||
) {
|
||||
*consecutive_failures += 1;
|
||||
if *consecutive_failures >= FAILURE_THRESHOLD {
|
||||
*health = HealthStatus::Unavailable {
|
||||
reason: format!("{} failures: {}", *consecutive_failures, reason),
|
||||
};
|
||||
} else {
|
||||
*health = HealthStatus::Degraded { reason };
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn record_success_promotes_degraded_to_healthy() {
|
||||
let mut health = HealthStatus::Degraded {
|
||||
reason: "earlier hiccup".to_string(),
|
||||
};
|
||||
let mut counter: u32 = 3;
|
||||
record_success(&mut health, &mut counter);
|
||||
assert_eq!(counter, 0);
|
||||
assert_eq!(health, HealthStatus::Healthy);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn record_success_keeps_healthy_and_resets_counter() {
|
||||
let mut health = HealthStatus::Healthy;
|
||||
let mut counter: u32 = 2;
|
||||
record_success(&mut health, &mut counter);
|
||||
assert_eq!(counter, 0);
|
||||
assert_eq!(health, HealthStatus::Healthy);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn record_success_does_not_rescue_unavailable() {
|
||||
// The archivist rule is: once Unavailable, only operational events
|
||||
// rescue. Success on a stream does clear the counter but we leave
|
||||
// the final clearing decision to the caller; document the current
|
||||
// behaviour: we only downgrade Degraded, not Unavailable.
|
||||
let mut health = HealthStatus::Unavailable {
|
||||
reason: "still broken".to_string(),
|
||||
};
|
||||
let mut counter: u32 = 7;
|
||||
record_success(&mut health, &mut counter);
|
||||
assert_eq!(counter, 0);
|
||||
assert!(matches!(health, HealthStatus::Unavailable { .. }));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn record_failure_once_moves_healthy_to_degraded() {
|
||||
let mut health = HealthStatus::Healthy;
|
||||
let mut counter: u32 = 0;
|
||||
record_failure(&mut health, &mut counter, "boom".to_string());
|
||||
assert_eq!(counter, 1);
|
||||
match health {
|
||||
HealthStatus::Degraded { reason } => assert_eq!(reason, "boom"),
|
||||
other => panic!("expected Degraded, got {:?}", other),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn record_failure_five_times_drifts_to_unavailable() {
|
||||
let mut health = HealthStatus::Healthy;
|
||||
let mut counter: u32 = 0;
|
||||
for i in 0..5 {
|
||||
record_failure(&mut health, &mut counter, format!("err-{i}"));
|
||||
}
|
||||
assert_eq!(counter, 5);
|
||||
match health {
|
||||
HealthStatus::Unavailable { reason } => {
|
||||
assert!(reason.contains("5 failures"), "reason: {reason}");
|
||||
}
|
||||
other => panic!("expected Unavailable after 5 failures, got {:?}", other),
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn record_failure_from_degraded_drifts_to_unavailable_at_threshold() {
|
||||
let mut health = HealthStatus::Degraded {
|
||||
reason: "early".to_string(),
|
||||
};
|
||||
let mut counter: u32 = 4;
|
||||
record_failure(&mut health, &mut counter, "final".to_string());
|
||||
assert_eq!(counter, 5);
|
||||
assert!(matches!(health, HealthStatus::Unavailable { .. }));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,217 @@
|
||||
//! `MatrixFactory`: build a Matrix [`SessionStream`] from a `[[streams]]`
|
||||
//! config block.
|
||||
//!
|
||||
//! This is the first stream-side factory wired for the Phase 4 migration
|
||||
//! (Task 18). The factory lives in `dirigent_core` rather than
|
||||
//! `dirigent_matrix` because `StreamFactory` is defined here and
|
||||
//! `dirigent_core` already depends on `dirigent_matrix` — putting it in
|
||||
//! `dirigent_matrix` would create a cycle.
|
||||
//!
|
||||
//! ## Scope
|
||||
//!
|
||||
//! The factory's responsibility is narrow: parse `cfg.params`, resolve
|
||||
//! the target Matrix room via a running `MatrixService`, and construct a
|
||||
//! `MatrixSessionShare` configured for the stream path (no legacy
|
||||
//! forwarder task). Command-proxy wiring (Matrix → Dirigent
|
||||
//! `ConnectorCommand::SendMessage`) remains in
|
||||
//! `CoreRuntime::create_matrix_share` for now; a follow-up will extend
|
||||
//! the factory to cover that path.
|
||||
//!
|
||||
//! ## Config shape
|
||||
//!
|
||||
//! ```toml
|
||||
//! [[streams]]
|
||||
//! name = "matrix-main"
|
||||
//! type = "matrix"
|
||||
//!
|
||||
//! [streams.scope]
|
||||
//! kind = "session"
|
||||
//! scroll_id = "01985d00-..."
|
||||
//!
|
||||
//! [streams.params]
|
||||
//! connector_id = "opencode-1" # dirigent connector key
|
||||
//! session_id = "native-abc123" # native connector session id
|
||||
//! room_id = "!abc:matrix.org" # pre-existing room to attach to
|
||||
//! homeserver_url = "https://matrix.org" # informational (service already knows)
|
||||
//! ```
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use serde::Deserialize;
|
||||
|
||||
use dirigent_protocol::streaming::{SessionStream, StreamScope};
|
||||
|
||||
use super::config::StreamConfig;
|
||||
use super::factory::{StreamBuildError, StreamFactory};
|
||||
|
||||
/// Stream-side factory for Matrix. See module docs for the expected
|
||||
/// TOML shape.
|
||||
pub struct MatrixFactory {
|
||||
service: Arc<dirigent_matrix::MatrixService>,
|
||||
}
|
||||
|
||||
impl MatrixFactory {
|
||||
/// Build a factory bound to a running `MatrixService`. The service
|
||||
/// is expected to be logged in and sync-started by the time
|
||||
/// `build()` is called; if it isn't, `build()` returns
|
||||
/// `StreamBuildError::Transport`.
|
||||
pub fn new(service: Arc<dirigent_matrix::MatrixService>) -> Self {
|
||||
Self { service }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
struct MatrixStreamParams {
|
||||
/// Dirigent connector id that owns the session being bridged.
|
||||
connector_id: String,
|
||||
/// Native connector session id.
|
||||
session_id: String,
|
||||
/// Matrix room id — must be a pre-existing room the bot can access.
|
||||
/// Room creation is still handled by
|
||||
/// `CoreRuntime::create_matrix_share` until the factory path is
|
||||
/// expanded to cover it.
|
||||
room_id: String,
|
||||
/// Informational; the logged-in `MatrixService` is the authority on
|
||||
/// which homeserver to talk to. Accepted so configs can be
|
||||
/// self-documenting and round-trip through TOML.
|
||||
#[serde(default)]
|
||||
#[allow(dead_code)]
|
||||
homeserver_url: Option<String>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl StreamFactory for MatrixFactory {
|
||||
fn kind(&self) -> &'static str {
|
||||
"matrix"
|
||||
}
|
||||
|
||||
async fn build(
|
||||
&self,
|
||||
cfg: &StreamConfig,
|
||||
) -> Result<Arc<dyn SessionStream>, StreamBuildError> {
|
||||
// Scope must be Session; Matrix shares are intrinsically
|
||||
// per-session bi-directional bridges.
|
||||
let scroll_id = match &cfg.scope {
|
||||
StreamScope::Session { scroll_id } => *scroll_id,
|
||||
other => {
|
||||
return Err(StreamBuildError::Config(format!(
|
||||
"matrix stream requires scope.kind = \"session\", got {:?}",
|
||||
other
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
// Parse type-specific params.
|
||||
let params: MatrixStreamParams = cfg
|
||||
.params
|
||||
.clone()
|
||||
.try_into()
|
||||
.map_err(|e: toml::de::Error| {
|
||||
StreamBuildError::Config(format!(
|
||||
"matrix stream '{}': invalid params: {}",
|
||||
cfg.name, e
|
||||
))
|
||||
})?;
|
||||
|
||||
// Look up the room via the service. We intentionally don't
|
||||
// create or join rooms here — the room must already exist.
|
||||
// Creation remains the responsibility of
|
||||
// `CoreRuntime::create_matrix_share`.
|
||||
let room = match self.service.room_by_id(¶ms.room_id).await {
|
||||
Ok(Some(room)) => room,
|
||||
Ok(None) => {
|
||||
return Err(StreamBuildError::Transport(format!(
|
||||
"matrix stream '{}': room '{}' not found on client \
|
||||
— ensure the bot has joined it",
|
||||
cfg.name, params.room_id
|
||||
)));
|
||||
}
|
||||
Err(dirigent_matrix::MatrixError::NotLoggedIn) => {
|
||||
return Err(StreamBuildError::Transport(
|
||||
"matrix service is not logged in; cannot build stream"
|
||||
.to_string(),
|
||||
));
|
||||
}
|
||||
Err(dirigent_matrix::MatrixError::Config(msg)) => {
|
||||
return Err(StreamBuildError::Config(format!(
|
||||
"matrix stream '{}': {}",
|
||||
cfg.name, msg
|
||||
)));
|
||||
}
|
||||
Err(other) => {
|
||||
return Err(StreamBuildError::Transport(format!(
|
||||
"matrix stream '{}': {}",
|
||||
cfg.name, other
|
||||
)));
|
||||
}
|
||||
};
|
||||
|
||||
// Construct the share for the stream path (no legacy forwarder
|
||||
// task). We drop the command receiver on the floor here — the
|
||||
// Matrix → Dirigent direction is not covered by this factory
|
||||
// yet; see the follow-up TODO in the module docs.
|
||||
let (share, _command_rx) = dirigent_matrix::MatrixSessionShare::new_for_stream(
|
||||
params.connector_id,
|
||||
params.session_id,
|
||||
scroll_id,
|
||||
params.room_id,
|
||||
room,
|
||||
);
|
||||
|
||||
Ok(Arc::new(share) as Arc<dyn SessionStream>)
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn factory_kind_is_matrix() {
|
||||
// The factory's `kind()` is static and doesn't require a running
|
||||
// MatrixService to read — covered by a minimal construction
|
||||
// check in the integration test suite.
|
||||
fn assert_is_factory<F: StreamFactory>(_: &F) {}
|
||||
|
||||
// We can't easily build a MatrixService in a unit test (it needs
|
||||
// an Account + data dir + SQLite store). The full smoke test
|
||||
// lives in `packages/dirigent_matrix/tests/factory_test.rs` and
|
||||
// the cross-crate registry test in
|
||||
// `packages/dirigent_core/tests/matrix_migration_test.rs`.
|
||||
//
|
||||
// This module-local test exists only to assert that the impl
|
||||
// block type-checks against the `StreamFactory` trait bound.
|
||||
fn _compile_check(f: &MatrixFactory) {
|
||||
assert_is_factory(f);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn matrix_stream_params_deserialise_ok() {
|
||||
let toml_str = r#"
|
||||
connector_id = "opencode-1"
|
||||
session_id = "native-abc"
|
||||
room_id = "!foo:example.com"
|
||||
homeserver_url = "https://matrix.org"
|
||||
"#;
|
||||
let p: MatrixStreamParams = toml::from_str(toml_str).expect("parse");
|
||||
assert_eq!(p.connector_id, "opencode-1");
|
||||
assert_eq!(p.session_id, "native-abc");
|
||||
assert_eq!(p.room_id, "!foo:example.com");
|
||||
assert_eq!(p.homeserver_url.as_deref(), Some("https://matrix.org"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn matrix_stream_params_reject_missing_required() {
|
||||
// Missing room_id should fail.
|
||||
let toml_str = r#"
|
||||
connector_id = "opencode-1"
|
||||
session_id = "native-abc"
|
||||
"#;
|
||||
let err: Result<MatrixStreamParams, _> = toml::from_str(toml_str);
|
||||
assert!(err.is_err());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,80 @@
|
||||
//! Test-only `SessionStream` implementation for integration tests.
|
||||
//!
|
||||
//! Records every received `BusEvent` into an in-memory buffer; can be
|
||||
//! configured to fail the next N events to exercise health drift paths.
|
||||
|
||||
#![cfg(any(test, feature = "test-utils"))]
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::sync::Mutex;
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
|
||||
use async_trait::async_trait;
|
||||
use chrono::Utc;
|
||||
|
||||
use dirigent_protocol::streaming::{
|
||||
BusEvent, SessionStream, StreamError, StreamKind, StreamOutcome, StreamScope, StreamSummary,
|
||||
};
|
||||
|
||||
/// In-memory `SessionStream` used in integration tests.
|
||||
pub struct MockStream {
|
||||
scope: StreamScope,
|
||||
name: String,
|
||||
pub received: Arc<Mutex<Vec<BusEvent>>>,
|
||||
pub fail_remaining: Arc<AtomicU32>,
|
||||
pub shutdown_called: Arc<Mutex<bool>>,
|
||||
}
|
||||
|
||||
impl MockStream {
|
||||
pub fn new(name: impl Into<String>, scope: StreamScope) -> Arc<Self> {
|
||||
Arc::new(Self {
|
||||
scope,
|
||||
name: name.into(),
|
||||
received: Arc::new(Mutex::new(Vec::new())),
|
||||
fail_remaining: Arc::new(AtomicU32::new(0)),
|
||||
shutdown_called: Arc::new(Mutex::new(false)),
|
||||
})
|
||||
}
|
||||
|
||||
/// Configure the next N `on_event` calls to return `StreamOutcome::Failed`.
|
||||
pub fn fail_next(&self, n: u32) {
|
||||
self.fail_remaining.store(n, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
pub fn received_count(&self) -> usize {
|
||||
self.received.lock().unwrap().len()
|
||||
}
|
||||
|
||||
pub fn was_shutdown(&self) -> bool {
|
||||
*self.shutdown_called.lock().unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl SessionStream for MockStream {
|
||||
fn summary(&self) -> StreamSummary {
|
||||
StreamSummary {
|
||||
name: self.name.clone(),
|
||||
kind: StreamKind::Custom,
|
||||
target: "mock".into(),
|
||||
active_since: Utc::now(),
|
||||
}
|
||||
}
|
||||
|
||||
fn scope(&self) -> StreamScope {
|
||||
self.scope.clone()
|
||||
}
|
||||
|
||||
async fn on_event(&self, event: &BusEvent) -> StreamOutcome {
|
||||
if self.fail_remaining.load(Ordering::Relaxed) > 0 {
|
||||
self.fail_remaining.fetch_sub(1, Ordering::Relaxed);
|
||||
return StreamOutcome::Failed(StreamError::Rejected("mock fail".into()));
|
||||
}
|
||||
self.received.lock().unwrap().push(event.clone());
|
||||
StreamOutcome::Ok
|
||||
}
|
||||
|
||||
async fn shutdown(&self) {
|
||||
*self.shutdown_called.lock().unwrap() = true;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,23 @@
|
||||
//! SharingBus, StreamRegistry, and replay. See docs/plans/2026-04-21-archivist-phase4-design.md.
|
||||
|
||||
pub mod bus;
|
||||
pub mod config;
|
||||
pub mod factory;
|
||||
pub mod health;
|
||||
#[cfg(feature = "server")]
|
||||
pub mod matrix;
|
||||
#[cfg(any(test, feature = "test-utils"))]
|
||||
pub mod mock;
|
||||
pub mod registry;
|
||||
pub mod replay;
|
||||
|
||||
pub use bus::{BusReceiver, SharingBus};
|
||||
pub use config::{StreamConfig, StreamsConfig};
|
||||
pub use factory::{StreamBuildError, StreamFactory, StreamFactoryRegistry};
|
||||
pub use health::HealthStatus;
|
||||
#[cfg(feature = "server")]
|
||||
pub use matrix::MatrixFactory;
|
||||
pub use registry::{StreamId, StreamInfo, StreamRegistration, StreamRegistry};
|
||||
pub use replay::{ReplayError, ReplayOptions, ReplayReport, ReplaySpeed};
|
||||
#[cfg(any(test, feature = "test-utils"))]
|
||||
pub use mock::MockStream;
|
||||
@@ -0,0 +1,236 @@
|
||||
//! Owns all active streams.
|
||||
//!
|
||||
//! Populated at boot from `[[streams]]` config and at runtime via
|
||||
//! [`StreamRegistry::attach`]. Each attached stream gets:
|
||||
//!
|
||||
//! - a bus subscription with an [`EventFilter`] derived from its scope,
|
||||
//! - a dedicated worker task that drives `SessionStream::on_event`,
|
||||
//! - a per-stream [`HealthStatus`] that drifts on consecutive failures
|
||||
//! (see [`super::health`]).
|
||||
//!
|
||||
//! The worker is cancellable via a one-shot `mpsc::Sender<()>` on the
|
||||
//! registration so [`detach`](StreamRegistry::detach) can stop delivery
|
||||
//! deterministically before invoking `SessionStream::shutdown`.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicU32, Ordering};
|
||||
|
||||
use tokio::sync::{RwLock, mpsc};
|
||||
use tokio::task::JoinHandle;
|
||||
use tracing::warn;
|
||||
use uuid::Uuid;
|
||||
|
||||
use dirigent_protocol::streaming::{
|
||||
BusReceiver, EventFilter, SessionStream, StreamOutcome, StreamScope, StreamSummary,
|
||||
};
|
||||
|
||||
use super::bus::SharingBus;
|
||||
use super::health::{HealthStatus, record_failure, record_success};
|
||||
|
||||
/// Per-subscriber queue capacity for a stream's bus subscription. Matches
|
||||
/// the default used by `SharingBus::subscribe_all`.
|
||||
const STREAM_QUEUE_CAPACITY: usize = 256;
|
||||
|
||||
/// Identifier of a registered stream. Opaque wrapper around a `Uuid` so
|
||||
/// that callers can't confuse stream ids with scroll/connector ids.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||
pub struct StreamId(pub Uuid);
|
||||
|
||||
/// Full registration record for a live stream.
|
||||
///
|
||||
/// Held inside the registry behind an `Arc`; `detach` returns the Arc so
|
||||
/// callers can inspect the final health state or await the worker handle
|
||||
/// if they wish. Most fields are `Arc`s so the worker task and the
|
||||
/// registry share state without serialising on a single lock.
|
||||
pub struct StreamRegistration {
|
||||
pub id: StreamId,
|
||||
pub name: String,
|
||||
pub stream: Arc<dyn SessionStream>,
|
||||
pub scope: StreamScope,
|
||||
pub enabled: bool,
|
||||
pub health: Arc<RwLock<HealthStatus>>,
|
||||
/// Number of consecutive delivery failures; drives the K=5 drift to
|
||||
/// `Unavailable`. Stored atomic so the worker can update without
|
||||
/// taking the health lock on every success.
|
||||
pub consecutive_failures: Arc<AtomicU32>,
|
||||
pub worker: JoinHandle<()>,
|
||||
pub stop_tx: mpsc::Sender<()>,
|
||||
}
|
||||
|
||||
/// Snapshot view of a registered stream. Returned by
|
||||
/// [`StreamRegistry::list`] for telemetry / UI.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct StreamInfo {
|
||||
pub id: StreamId,
|
||||
pub name: String,
|
||||
pub summary: StreamSummary,
|
||||
pub scope: StreamScope,
|
||||
pub enabled: bool,
|
||||
pub health: HealthStatus,
|
||||
/// Current consecutive-failure count (mirrors
|
||||
/// `StreamRegistration::consecutive_failures` at read time).
|
||||
pub lagged_count: u64,
|
||||
}
|
||||
|
||||
/// The live registry of all streams wired to a [`SharingBus`].
|
||||
pub struct StreamRegistry {
|
||||
bus: Arc<SharingBus>,
|
||||
regs: RwLock<Vec<Arc<StreamRegistration>>>,
|
||||
}
|
||||
|
||||
impl StreamRegistry {
|
||||
/// Build an empty registry bound to `bus`.
|
||||
pub fn new(bus: Arc<SharingBus>) -> Self {
|
||||
Self {
|
||||
bus,
|
||||
regs: RwLock::new(Vec::new()),
|
||||
}
|
||||
}
|
||||
|
||||
/// Attach a running stream.
|
||||
///
|
||||
/// Subscribes to the bus with a filter derived from `stream.scope()`,
|
||||
/// spawns a worker task that ferries events into `on_event`, and
|
||||
/// stores a registration with fresh health state.
|
||||
pub async fn attach(&self, name: String, stream: Arc<dyn SessionStream>) -> StreamId {
|
||||
let id = StreamId(Uuid::now_v7());
|
||||
let scope = stream.scope();
|
||||
let filter = scope_to_filter(&scope);
|
||||
let bus_rx = self
|
||||
.bus
|
||||
.subscribe_filtered(filter, STREAM_QUEUE_CAPACITY)
|
||||
.await;
|
||||
|
||||
let (stop_tx, stop_rx) = mpsc::channel(1);
|
||||
let health = Arc::new(RwLock::new(HealthStatus::Healthy));
|
||||
let failures = Arc::new(AtomicU32::new(0));
|
||||
|
||||
let stream_for_worker = Arc::clone(&stream);
|
||||
let health_for_worker = Arc::clone(&health);
|
||||
let failures_for_worker = Arc::clone(&failures);
|
||||
let name_for_worker = name.clone();
|
||||
|
||||
let worker = tokio::spawn(run_stream_worker(
|
||||
name_for_worker,
|
||||
bus_rx,
|
||||
stream_for_worker,
|
||||
health_for_worker,
|
||||
failures_for_worker,
|
||||
stop_rx,
|
||||
));
|
||||
|
||||
let reg = Arc::new(StreamRegistration {
|
||||
id,
|
||||
name,
|
||||
stream,
|
||||
scope,
|
||||
enabled: true,
|
||||
health,
|
||||
consecutive_failures: failures,
|
||||
worker,
|
||||
stop_tx,
|
||||
});
|
||||
self.regs.write().await.push(reg);
|
||||
id
|
||||
}
|
||||
|
||||
/// Detach a stream. Signals the worker to exit, then invokes
|
||||
/// `SessionStream::shutdown`. Returns the registration if the stream
|
||||
/// was found, or `None` if the id was already detached.
|
||||
pub async fn detach(&self, id: StreamId) -> Option<Arc<StreamRegistration>> {
|
||||
let mut regs = self.regs.write().await;
|
||||
let idx = regs.iter().position(|r| r.id == id)?;
|
||||
let reg = regs.remove(idx);
|
||||
drop(regs);
|
||||
|
||||
// Best-effort stop: if the channel is already closed (worker panicked)
|
||||
// we still want to run shutdown.
|
||||
let _ = reg.stop_tx.send(()).await;
|
||||
reg.stream.shutdown().await;
|
||||
Some(reg)
|
||||
}
|
||||
|
||||
/// Look up a live stream by id.
|
||||
pub async fn get_stream(&self, id: StreamId) -> Option<Arc<dyn SessionStream>> {
|
||||
self.regs
|
||||
.read()
|
||||
.await
|
||||
.iter()
|
||||
.find(|r| r.id == id)
|
||||
.map(|r| Arc::clone(&r.stream))
|
||||
}
|
||||
|
||||
/// Snapshot every registered stream. Clones the underlying health
|
||||
/// value so the returned `Vec` is safe to hand across async tasks
|
||||
/// without holding any locks.
|
||||
pub async fn list(&self) -> Vec<StreamInfo> {
|
||||
let regs = self.regs.read().await;
|
||||
let mut out = Vec::with_capacity(regs.len());
|
||||
for r in regs.iter() {
|
||||
let health = r.health.read().await.clone();
|
||||
out.push(StreamInfo {
|
||||
id: r.id,
|
||||
name: r.name.clone(),
|
||||
summary: r.stream.summary(),
|
||||
scope: r.scope.clone(),
|
||||
enabled: r.enabled,
|
||||
health,
|
||||
lagged_count: r.consecutive_failures.load(Ordering::Relaxed) as u64,
|
||||
});
|
||||
}
|
||||
out
|
||||
}
|
||||
}
|
||||
|
||||
/// Translate a declarative [`StreamScope`] into the subscriber-side
|
||||
/// [`EventFilter`] applied on the bus.
|
||||
fn scope_to_filter(scope: &StreamScope) -> EventFilter {
|
||||
match scope {
|
||||
StreamScope::Session { scroll_id } => EventFilter::ScrollId(*scroll_id),
|
||||
StreamScope::Connector { connector_uid } => EventFilter::ConnectorUid(*connector_uid),
|
||||
StreamScope::ArchiveWide { .. } => EventFilter::All,
|
||||
}
|
||||
}
|
||||
|
||||
/// Worker loop: pulls events from the bus subscription, forwards them to
|
||||
/// the stream, and updates health state on every outcome.
|
||||
async fn run_stream_worker(
|
||||
name: String,
|
||||
mut rx: BusReceiver,
|
||||
stream: Arc<dyn SessionStream>,
|
||||
health: Arc<RwLock<HealthStatus>>,
|
||||
failures: Arc<AtomicU32>,
|
||||
mut stop_rx: mpsc::Receiver<()>,
|
||||
) {
|
||||
loop {
|
||||
tokio::select! {
|
||||
biased;
|
||||
_ = stop_rx.recv() => {
|
||||
return;
|
||||
}
|
||||
maybe_evt = rx.rx.recv() => {
|
||||
let Some(evt) = maybe_evt else {
|
||||
// Bus hung up — registry should detach but we can exit
|
||||
// here regardless.
|
||||
return;
|
||||
};
|
||||
match stream.on_event(&evt).await {
|
||||
StreamOutcome::Ok | StreamOutcome::Skipped => {
|
||||
let mut h = health.write().await;
|
||||
let mut counter = failures.load(Ordering::Relaxed);
|
||||
record_success(&mut h, &mut counter);
|
||||
failures.store(counter, Ordering::Relaxed);
|
||||
}
|
||||
StreamOutcome::Failed(err) => {
|
||||
let reason = err.to_string();
|
||||
warn!(stream = %name, error = %reason, "stream rejected event");
|
||||
let mut h = health.write().await;
|
||||
let mut counter = failures.load(Ordering::Relaxed);
|
||||
record_failure(&mut h, &mut counter, reason);
|
||||
failures.store(counter, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,226 @@
|
||||
//! Replay: reads a session from the archive and dispatches synthetic
|
||||
//! `BusEvent`s with `EventOrigin::Replay` directly to a target stream,
|
||||
//! bypassing the `SharingBus`.
|
||||
//!
|
||||
//! Consumed by `CoreRuntime::replay_session_to_stream` (task 16). This
|
||||
//! module intentionally exposes a free function that takes
|
||||
//! `&Archivist`, `scroll_id`, `Arc<dyn SessionStream>`, and `ReplayOptions`
|
||||
//! so it can be unit-tested without a full runtime.
|
||||
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
|
||||
use uuid::Uuid;
|
||||
|
||||
use dirigent_archivist::coordinator::Archivist;
|
||||
use dirigent_archivist::error::ArchivistError;
|
||||
use dirigent_archivist::types::MessageRecord;
|
||||
use dirigent_protocol::{
|
||||
Event, Message, MessagePart, MessageRole, MessageStatus,
|
||||
streaming::{BusEvent, EventOrigin, EventRouting, SessionStream, StreamOutcome},
|
||||
};
|
||||
|
||||
/// Options controlling a replay pass.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct ReplayOptions {
|
||||
/// When true and the session is an AcpConnection, meta-events are read from
|
||||
/// the archive (currently only counted — rendering meta events as
|
||||
/// `BusEvent`s is out of scope for Phase 4).
|
||||
pub include_meta_events: bool,
|
||||
/// Pace events in real time (sleep between consecutive timestamps) or emit
|
||||
/// as fast as the target stream can consume.
|
||||
pub speed: ReplaySpeed,
|
||||
}
|
||||
|
||||
impl Default for ReplayOptions {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
include_meta_events: false,
|
||||
speed: ReplaySpeed::AsFastAsPossible,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Controls inter-event pacing during replay.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum ReplaySpeed {
|
||||
/// Sleep the wall-clock delta between consecutive message timestamps.
|
||||
Realtime,
|
||||
/// Emit events as fast as the stream can consume.
|
||||
AsFastAsPossible,
|
||||
}
|
||||
|
||||
/// Outcome of a replay pass.
|
||||
#[derive(Debug, Default, Clone)]
|
||||
pub struct ReplayReport {
|
||||
/// Total events dispatched to the stream (includes failed attempts).
|
||||
pub events_sent: usize,
|
||||
/// Events the stream rejected (`StreamOutcome::Failed`).
|
||||
pub failures: usize,
|
||||
/// Wall-clock duration of the replay in milliseconds.
|
||||
pub duration_ms: u64,
|
||||
}
|
||||
|
||||
/// Errors raised by `replay_session_to_stream` itself. Stream-side failures are
|
||||
/// counted in `ReplayReport::failures` rather than propagated, so one bad event
|
||||
/// doesn't abort the replay.
|
||||
#[derive(Debug, thiserror::Error)]
|
||||
pub enum ReplayError {
|
||||
/// The archive has no session with the given scroll id.
|
||||
#[error("session not found: {0}")]
|
||||
SessionNotFound(Uuid),
|
||||
/// Archivist returned a non-SessionUnknown error (I/O, decoding, etc).
|
||||
#[error("archivist: {0}")]
|
||||
Archivist(String),
|
||||
}
|
||||
|
||||
/// Replay a session's archived messages to a single `SessionStream`.
|
||||
///
|
||||
/// Reads metadata + messages from `archivist`, synthesises a `BusEvent` per
|
||||
/// message with `EventOrigin::Replay { replay_id }`, and dispatches directly
|
||||
/// to the target stream. The `SharingBus` is not involved; live events remain
|
||||
/// unaffected.
|
||||
///
|
||||
/// The function continues on stream failures and records the count in the
|
||||
/// returned `ReplayReport`; only unrecoverable archive errors propagate.
|
||||
pub async fn replay_session_to_stream(
|
||||
archivist: &Archivist,
|
||||
scroll_id: Uuid,
|
||||
stream: Arc<dyn SessionStream>,
|
||||
opts: ReplayOptions,
|
||||
) -> Result<ReplayReport, ReplayError> {
|
||||
let start = std::time::Instant::now();
|
||||
let replay_id = Uuid::new_v4();
|
||||
|
||||
// Load metadata. Translate the archivist's typed `SessionUnknown` into
|
||||
// the replay-level `SessionNotFound` variant; everything else becomes
|
||||
// `Archivist(_)` so callers can distinguish "missing" from "broken".
|
||||
let metadata = archivist
|
||||
.get_session_metadata(scroll_id, None)
|
||||
.await
|
||||
.map_err(|e| match e {
|
||||
ArchivistError::SessionUnknown(id) => ReplayError::SessionNotFound(id),
|
||||
other => ReplayError::Archivist(other.to_string()),
|
||||
})?;
|
||||
|
||||
let messages = archivist
|
||||
.get_messages(scroll_id, None)
|
||||
.await
|
||||
.map_err(|e| ReplayError::Archivist(e.to_string()))?;
|
||||
|
||||
let connector_uid = Some(metadata.connector_uid);
|
||||
let native_session_id = metadata.native_session_id.clone();
|
||||
// We do not persist the orchestrator-side `connector_id` string in session
|
||||
// metadata; the native session id is the best reversible handle we have.
|
||||
let connector_id = native_session_id.clone().unwrap_or_default();
|
||||
|
||||
let mut events_sent = 0usize;
|
||||
let mut failures = 0usize;
|
||||
let mut prev_ts: Option<chrono::DateTime<chrono::Utc>> = None;
|
||||
|
||||
for record in messages {
|
||||
if matches!(opts.speed, ReplaySpeed::Realtime) {
|
||||
if let Some(prev) = prev_ts {
|
||||
let delta = record.ts.signed_duration_since(prev);
|
||||
if let Ok(d) = delta.to_std() {
|
||||
// Cap per-step sleep at 1h to avoid pathological archives
|
||||
// where a session sat idle for days.
|
||||
if d > Duration::from_millis(0) && d < Duration::from_secs(3600) {
|
||||
tokio::time::sleep(d).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
prev_ts = Some(record.ts);
|
||||
}
|
||||
|
||||
let message = message_from_record(&record, native_session_id.as_deref());
|
||||
let event = Event::MessageCompleted {
|
||||
connector_id: connector_id.clone(),
|
||||
message,
|
||||
};
|
||||
|
||||
let mut routing = EventRouting::derive(&event, connector_uid, &connector_id);
|
||||
// `derive()` leaves scroll_id=None (the bus cache normally fills it in).
|
||||
// During replay we have the authoritative scroll_id up front.
|
||||
routing.scroll_id = Some(scroll_id);
|
||||
|
||||
let bus_event = BusEvent {
|
||||
routing,
|
||||
origin: EventOrigin::Replay { replay_id },
|
||||
event: Arc::new(event),
|
||||
};
|
||||
|
||||
match stream.on_event(&bus_event).await {
|
||||
StreamOutcome::Ok | StreamOutcome::Skipped => {
|
||||
events_sent += 1;
|
||||
}
|
||||
StreamOutcome::Failed(_err) => {
|
||||
failures += 1;
|
||||
events_sent += 1; // count attempted regardless
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if opts.include_meta_events {
|
||||
// Meta-events exist only on AcpConnection sessions; the read is
|
||||
// cheap and idempotent, so we don't gate on `metadata.kind`. Render-
|
||||
// as-BusEvent is out of scope for Phase 4 — we just probe the
|
||||
// archive so missing meta-event storage surfaces as a log line
|
||||
// here rather than later in the call chain.
|
||||
let _ = archivist.get_meta_events(scroll_id, None).await;
|
||||
}
|
||||
|
||||
Ok(ReplayReport {
|
||||
events_sent,
|
||||
failures,
|
||||
duration_ms: start.elapsed().as_millis() as u64,
|
||||
})
|
||||
}
|
||||
|
||||
/// Synthesize a protocol `Message` from an archived `MessageRecord`.
|
||||
///
|
||||
/// The session_id we emit is the connector's native session id when known,
|
||||
/// falling back to the stringified scroll_id so downstream routing at least
|
||||
/// has a stable handle.
|
||||
fn message_from_record(record: &MessageRecord, native_session_id: Option<&str>) -> Message {
|
||||
Message {
|
||||
id: record.message_id.to_string(),
|
||||
session_id: native_session_id
|
||||
.map(str::to_string)
|
||||
.unwrap_or_else(|| record.session.to_string()),
|
||||
role: parse_role(&record.role),
|
||||
created_at: record.ts,
|
||||
content: content_parts_from_record(record),
|
||||
status: MessageStatus::Completed,
|
||||
metadata: None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse the archivist's stringly-typed role into the protocol enum.
|
||||
///
|
||||
/// `MessageRole` only has `User` and `Assistant` today; archived "system" /
|
||||
/// "tool" rows (which the protocol layer does not support) fall back to
|
||||
/// `User` rather than drop the message entirely. Lossy but preserves content.
|
||||
fn parse_role(role: &str) -> MessageRole {
|
||||
match role {
|
||||
"assistant" => MessageRole::Assistant,
|
||||
"user" => MessageRole::User,
|
||||
// Protocol has no System/Tool variant; surface these as user messages
|
||||
// so their content still reaches the stream.
|
||||
_ => MessageRole::User,
|
||||
}
|
||||
}
|
||||
|
||||
/// Prefer the archived structured `content_parts` (round-trips tool calls,
|
||||
/// code blocks, etc). Fall back to a single `Text` part built from the
|
||||
/// markdown rendering when parts are missing or fail to parse.
|
||||
fn content_parts_from_record(record: &MessageRecord) -> Vec<MessagePart> {
|
||||
if let Some(parts) = &record.content_parts {
|
||||
if let Ok(parsed) = serde_json::from_value::<Vec<MessagePart>>(parts.clone()) {
|
||||
return parsed;
|
||||
}
|
||||
}
|
||||
vec![MessagePart::Text {
|
||||
text: record.content_md.clone(),
|
||||
}]
|
||||
}
|
||||
Reference in New Issue
Block a user