sync from monorepo @ 2452e92e

This commit is contained in:
2026-05-08 01:59:04 +02:00
commit b03dc15371
459 changed files with 129586 additions and 0 deletions
+477
View File
@@ -0,0 +1,477 @@
//! SharingBus: single-producer, many-subscriber event multiplexer with
//! subscriber-side filtering performed by a worker task. See
//! docs/plans/2026-04-21-archivist-phase4-design.md §1.
//!
//! Architecture:
//! - One internal `tokio::sync::broadcast::Sender<BusEvent>` feeds a single
//! worker task. The worker iterates `Vec<SubscriberSlot>` (behind `RwLock`),
//! filter-matches each slot, and `try_send`s the event onto each slot's
//! `mpsc::Sender<BusEvent>`.
//! - Slow subscribers drop their own events at their mpsc (counted in the
//! slot's `lagged` atomic). The bus-internal broadcast channel never drops
//! due to a slow subscriber — only when the worker itself falls behind the
//! broadcast capacity, in which case the skipped count is logged and the
//! worker continues.
//! - `SessionRegistered` events bind `(connector_id, native_session_id) ->
//! scroll_id` in a cache; events later published without a `scroll_id` are
//! late-bound from that cache.
use std::collections::HashMap;
use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering};
use tokio::sync::{broadcast, mpsc, RwLock};
use tokio::task::JoinHandle;
use tracing::{debug, warn};
use uuid::Uuid;
use dirigent_protocol::streaming::{BusEvent, EventFilter};
pub use dirigent_protocol::streaming::BusReceiver;
use dirigent_protocol::Event;
/// Capacity of the internal broadcast channel that carries published events
/// to the dispatch worker.
const BUS_INTERNAL_CAPACITY: usize = 1024;
/// Per-subscriber queue capacity used by `subscribe_all`.
const SUBSCRIBER_QUEUE_DEFAULT: usize = 256;
/// Single-producer, many-subscriber event multiplexer.
///
/// Subscribers see a `mpsc::Receiver<BusEvent>` that only yields events
/// matching their `EventFilter`. Filtering happens inside a single worker
/// task, so the cost per event is O(n_subscribers) regardless of publisher
/// count. Slow subscribers lose events at their own mpsc, not at the bus.
pub struct SharingBus {
/// Sending half of the internal broadcast channel. `publish` sends every
/// event here; the dispatch worker owns the paired receiver.
publish_tx: broadcast::Sender<BusEvent>,
/// Registered subscriber slots, shared with the dispatch worker.
subscribers: Arc<RwLock<Vec<SubscriberSlot>>>,
/// Bindings `(connector_id, session_id) -> scroll_id` learned from
/// `Event::SessionRegistered`. `publish` reads it to late-bind events that
/// arrive without a `scroll_id`.
scroll_id_cache: Arc<RwLock<HashMap<(String, String), Uuid>>>,
/// Counter from which subscriber ids are allocated.
next_id: Arc<AtomicU64>,
/// Handle of the spawned dispatch worker task. It is stored but not
/// otherwise used in the code shown here.
_worker: JoinHandle<()>,
}
/// Worker-side record of one subscription.
struct SubscriberSlot {
/// Id handed back to the subscriber in its `BusReceiver`; used to remove
/// the slot on `unsubscribe` and when the worker reaps closed receivers.
id: u64,
/// The worker forwards an event to this slot only if `filter.matches`
/// accepts it.
filter: EventFilter,
/// Sending half of the subscriber's bounded queue; the worker uses
/// `try_send` on it.
sender: mpsc::Sender<BusEvent>,
/// Count of events dropped because the queue was full. The same `Arc` is
/// stored in the subscriber's `BusReceiver`.
lagged: Arc<AtomicU64>,
}
impl SharingBus {
    /// Construct a new bus and spawn its dispatch worker.
    ///
    /// The worker receives from the internal broadcast channel and fans each
    /// event out to the subscriber queues. It exits once that channel reports
    /// that its sender is closed.
    ///
    /// # Panics
    ///
    /// The worker is started with `tokio::spawn`, so this must be called from
    /// within a Tokio runtime.
    pub fn new() -> Arc<Self> {
        let (publish_tx, publish_rx) = broadcast::channel(BUS_INTERNAL_CAPACITY);
        let subscribers: Arc<RwLock<Vec<SubscriberSlot>>> = Arc::new(RwLock::new(Vec::new()));
        let scroll_id_cache: Arc<RwLock<HashMap<(String, String), Uuid>>> =
            Arc::new(RwLock::new(HashMap::new()));
        let next_id = Arc::new(AtomicU64::new(0));
        let worker = tokio::spawn(run_worker(publish_rx, Arc::clone(&subscribers)));
        Arc::new(Self {
            publish_tx,
            subscribers,
            scroll_id_cache,
            next_id,
            _worker: worker,
        })
    }

    /// Publish a `BusEvent` to all matching subscribers.
    ///
    /// This method also performs two side-effects on the scroll-id cache:
    ///
    /// 1. If the wrapped event is `Event::SessionRegistered`, the binding
    ///    `(connector_id, session_id) -> scroll_id` is inserted into the
    ///    cache, and the current event's `routing.scroll_id` is set so the
    ///    binding event itself carries its own scroll_id downstream. If the
    ///    payload's `scroll_id` does not parse as a UUID, a warning is logged
    ///    and the cache is left untouched.
    /// 2. If the event's `routing.scroll_id` is absent but it carries both a
    ///    `connector_id` and `native_session_id`, the cache is consulted to
    ///    late-bind `scroll_id` before broadcasting.
    pub async fn publish(&self, mut bus_event: BusEvent) {
        // (2) Late-bind scroll_id from cache if we can, BEFORE the possibly
        //     more specific (1) handling overrides it.
        if bus_event.routing.scroll_id.is_none() {
            if let (Some(cid), Some(nsid)) = (
                bus_event.routing.connector_id.as_ref(),
                bus_event.routing.native_session_id.as_ref(),
            ) {
                let key = (cid.clone(), nsid.clone());
                // Copy the id out of the map so the read guard is released at
                // the end of this statement instead of spanning the update.
                let cached = self.scroll_id_cache.read().await.get(&key).copied();
                if let Some(uuid) = cached {
                    bus_event.routing.scroll_id = Some(uuid);
                }
            }
        }

        // (1) If the wrapped event is SessionRegistered, populate the cache
        //     and set scroll_id on the event itself.
        if let Event::SessionRegistered {
            connector_id,
            session_id,
            scroll_id,
        } = bus_event.event.as_ref()
        {
            match Uuid::parse_str(scroll_id) {
                Ok(uuid) => {
                    self.scroll_id_cache
                        .write()
                        .await
                        .insert((connector_id.clone(), session_id.clone()), uuid);
                    bus_event.routing.scroll_id = Some(uuid);
                }
                Err(e) => {
                    warn!(
                        connector_id = %connector_id,
                        session_id = %session_id,
                        scroll_id = %scroll_id,
                        error = %e,
                        "SessionRegistered carried an unparseable scroll_id; skipping late-bind cache insert",
                    );
                }
            }
        }

        // A failed send means the broadcast channel has no receiver left;
        // there is nothing useful to do about that here, so ignore the Result.
        let _ = self.publish_tx.send(bus_event);
    }

    /// Subscribe to every event on the bus, using the default queue capacity.
    pub async fn subscribe_all(&self) -> BusReceiver {
        self.subscribe_filtered(EventFilter::All, SUBSCRIBER_QUEUE_DEFAULT)
            .await
    }

    /// Subscribe to events that match `filter`. `queue_capacity` caps the
    /// buffered events between the worker and the caller's `recv()`.
    ///
    /// A `queue_capacity` of 0 is treated as 1: `tokio::sync::mpsc::channel`
    /// panics on a zero-sized buffer, and a caller-supplied size should not
    /// be able to crash the bus.
    ///
    /// The returned `BusReceiver` carries the subscriber id (for
    /// `unsubscribe`), the receiving half of the queue, and the shared
    /// `lagged` counter of events dropped because the queue was full.
    pub async fn subscribe_filtered(
        &self,
        filter: EventFilter,
        queue_capacity: usize,
    ) -> BusReceiver {
        let (tx, rx) = mpsc::channel(queue_capacity.max(1));
        let lagged = Arc::new(AtomicU64::new(0));
        // Relaxed ordering is sufficient: subscriber IDs are only compared for
        // equality with other IDs issued by this same bus; there is no
        // cross-thread ordering dependency on this counter.
        let id = self.next_id.fetch_add(1, Ordering::Relaxed);
        self.subscribers.write().await.push(SubscriberSlot {
            id,
            filter,
            sender: tx,
            lagged: Arc::clone(&lagged),
        });
        BusReceiver { id, rx, lagged }
    }

    /// Remove a subscriber by id. Idempotent: an unknown id is a no-op.
    pub async fn unsubscribe(&self, id: u64) {
        self.subscribers.write().await.retain(|s| s.id != id);
    }
}
async fn run_worker(
mut rx: broadcast::Receiver<BusEvent>,
subscribers: Arc<RwLock<Vec<SubscriberSlot>>>,
) {
loop {
match rx.recv().await {
Ok(evt) => {
let mut closed_ids: Vec<u64> = Vec::new();
{
let subs = subscribers.read().await;
for slot in subs.iter() {
if !slot.filter.matches(&evt) {
continue;
}
match slot.sender.try_send(evt.clone()) {
Ok(()) => {}
Err(mpsc::error::TrySendError::Full(_)) => {
slot.lagged.fetch_add(1, Ordering::Relaxed);
warn!(
subscriber_id = slot.id,
"bus subscriber queue full; dropping event"
);
}
Err(mpsc::error::TrySendError::Closed(_)) => {
closed_ids.push(slot.id);
}
}
}
}
if !closed_ids.is_empty() {
subscribers
.write()
.await
.retain(|s| !closed_ids.contains(&s.id));
debug!(removed = closed_ids.len(), "GC'd closed subscriber slots");
}
}
Err(broadcast::error::RecvError::Lagged(n)) => {
warn!(skipped = n, "SharingBus internal broadcast lagged");
}
Err(broadcast::error::RecvError::Closed) => {
debug!("SharingBus worker exiting (sender closed)");
return;
}
}
}
}
// ─── Tests ───────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use std::sync::Arc;
    use std::sync::atomic::Ordering;
    use std::time::Duration;
    use tokio::time::timeout;
    use uuid::Uuid;
    use super::*;
    use dirigent_protocol::streaming::{BusEvent, EventKind, EventOrigin, EventRouting};
    use dirigent_protocol::Event;

    /// Assemble a `BusEvent` with `EventOrigin::Runtime` from explicit routing
    /// fields and a payload.
    fn make_event(
        scroll_id: Option<Uuid>,
        connector_uid: Option<Uuid>,
        connector_id: Option<String>,
        native_session_id: Option<String>,
        kind: EventKind,
        event: Event,
    ) -> BusEvent {
        let routing = EventRouting {
            scroll_id,
            connector_uid,
            connector_id,
            native_session_id,
            kind,
        };
        BusEvent {
            routing,
            origin: EventOrigin::Runtime,
            event: Arc::new(event),
        }
    }

    /// A `System`-kind `Event::Connected` with every routing field unset.
    fn bare_connected() -> BusEvent {
        make_event(
            None,
            None,
            None,
            None,
            EventKind::System,
            Event::Connected,
        )
    }

    /// Wait up to 200 ms for the next event on `recv`; panic on timeout or on
    /// a closed channel. `what` names the awaited event in the timeout message.
    async fn next_event(recv: &mut BusReceiver, what: &str) -> BusEvent {
        match timeout(Duration::from_millis(200), recv.rx.recv()).await {
            Ok(Some(event)) => event,
            Ok(None) => panic!("channel closed unexpectedly"),
            Err(elapsed) => panic!("timed out waiting for {}: {:?}", what, elapsed),
        }
    }

    /// Panic unless the payload of `bus_event` is `Event::Connected`.
    fn assert_connected(bus_event: &BusEvent) {
        let payload: &Event = bus_event.event.as_ref();
        if !matches!(payload, Event::Connected) {
            panic!("expected Event::Connected, got {:?}", payload);
        }
    }

    /// Yield to the scheduler ten times, then sleep for `pause`, so the worker
    /// task gets a chance to drain what has been published.
    async fn let_worker_run(pause: Duration) {
        for _ in 0..10 {
            tokio::task::yield_now().await;
        }
        tokio::time::sleep(pause).await;
    }

    // 1. subscribe_all + publish: one event round-trips to receiver.
    #[tokio::test]
    async fn subscribe_all_receives_published_event() {
        let bus = SharingBus::new();
        let mut recv = bus.subscribe_all().await;

        bus.publish(bare_connected()).await;

        let got = next_event(&mut recv, "event").await;
        assert_connected(&got);
    }

    // 2. ConnectorUid filter: matching UID passes, other UID skipped.
    #[tokio::test]
    async fn connector_uid_filter_only_forwards_matching_events() {
        let bus = SharingBus::new();
        let target = Uuid::new_v4();
        let other = Uuid::new_v4();
        let mut recv = bus
            .subscribe_filtered(EventFilter::ConnectorUid(target), 16)
            .await;

        // One event for the subscribed UID, then one for an unrelated UID.
        let ev_match = make_event(
            None,
            Some(target),
            None,
            None,
            EventKind::System,
            Event::Connected,
        );
        bus.publish(ev_match).await;
        let ev_other = make_event(
            None,
            Some(other),
            None,
            None,
            EventKind::System,
            Event::Connected,
        );
        bus.publish(ev_other).await;

        // The matching event arrives first.
        let got = next_event(&mut recv, "first event").await;
        assert_eq!(got.routing.connector_uid, Some(target));

        // Nothing else may arrive: the second recv has to time out.
        let silence = timeout(Duration::from_millis(100), recv.rx.recv()).await;
        assert!(
            silence.is_err(),
            "expected no further events, got: {:?}",
            silence.ok().flatten().map(|e| e.routing.connector_uid)
        );
    }

    // 3. Queue full = lagged counter increments, first event still delivered.
    #[tokio::test]
    async fn full_queue_increments_lagged_counter() {
        let bus = SharingBus::new();
        // With capacity 1 a single undrained event fills the queue.
        let mut recv = bus.subscribe_filtered(EventFilter::All, 1).await;

        // Five publishes, no draining in between.
        for _ in 0..5 {
            bus.publish(bare_connected()).await;
        }

        // Let the worker work through all five.
        let_worker_run(Duration::from_millis(20)).await;

        // The first event is still sitting in the queue.
        let first = next_event(&mut recv, "first event").await;
        assert_connected(&first);

        // 5 published, 1 fit, so at least 4 must have been counted as dropped.
        let lagged = recv.lagged.load(Ordering::Relaxed);
        assert!(
            lagged >= 4,
            "expected lagged >= 4 after publishing 5 events to a capacity-1 queue, got {}",
            lagged
        );
    }

    // 4. scroll_id late-bind: SessionRegistered populates cache; subsequent
    //    events with matching (connector_id, native_session_id) get their
    //    scroll_id filled in before dispatch.
    #[tokio::test]
    async fn session_registered_populates_cache_and_late_binds_subsequent_events() {
        let bus = SharingBus::new();
        let scroll = Uuid::new_v4();

        // The subscriber filters on ScrollId(scroll) and is expected to see
        // both the binding event (scroll_id set by publish) and a follow-up
        // that entered the bus without a scroll_id.
        let mut recv = bus
            .subscribe_filtered(EventFilter::ScrollId(scroll), 16)
            .await;

        // --- binding event ---
        // Routing carries the ids a producer would fill in; scroll_id starts
        // as None and is set by publish() from the payload.
        let reg_bus = make_event(
            None,
            None,
            Some("c".to_string()),
            Some("s".to_string()),
            EventKind::SessionLifecycle,
            Event::SessionRegistered {
                connector_id: "c".to_string(),
                session_id: "s".to_string(),
                scroll_id: scroll.to_string(),
            },
        );
        bus.publish(reg_bus).await;

        let got1 = next_event(&mut recv, "SessionRegistered").await;
        assert!(matches!(
            got1.event.as_ref(),
            Event::SessionRegistered { .. }
        ));
        assert_eq!(got1.routing.scroll_id, Some(scroll));

        // --- follow-up without scroll_id, same connector_id + session ---
        let follow_up = make_event(
            None,
            None,
            Some("c".to_string()),
            Some("s".to_string()),
            EventKind::System,
            Event::Connected,
        );
        bus.publish(follow_up).await;

        let got2 = next_event(&mut recv, "late-bound follow-up").await;
        assert_eq!(
            got2.routing.scroll_id,
            Some(scroll),
            "follow-up event should have had scroll_id late-bound from the cache"
        );
        assert!(matches!(got2.event.as_ref(), Event::Connected));
    }

    // 5. Dropped receiver is GC'd after the next publish.
    #[tokio::test]
    async fn closed_receiver_slot_is_reaped_on_next_publish() {
        let bus = SharingBus::new();

        // Subscribing and dropping the receiver right away mimics a caller
        // that never calls `unsubscribe()`.
        let recv = bus.subscribe_all().await;
        drop(recv);

        // The slot is still registered until the worker notices.
        assert_eq!(bus.subscribers.read().await.len(), 1);

        // One publish makes the worker hit TrySendError::Closed and reap it.
        bus.publish(bare_connected()).await;

        // Let the worker process the event and run its GC.
        let_worker_run(Duration::from_millis(10)).await;

        assert_eq!(
            bus.subscribers.read().await.len(),
            0,
            "closed subscriber slot should have been GC'd after publish"
        );
    }
}