478 lines
17 KiB
Rust
478 lines
17 KiB
Rust
//! SharingBus: single-producer, many-subscriber event multiplexer with
|
|
//! subscriber-side filtering performed by a worker task. See
|
|
//! docs/plans/2026-04-21-archivist-phase4-design.md §1.
|
|
//!
|
|
//! Architecture:
|
|
//! - One internal `tokio::sync::broadcast::Sender<BusEvent>` feeds a single
|
|
//! worker task. The worker iterates `Vec<SubscriberSlot>` (behind `RwLock`),
|
|
//! filter-matches each slot, and `try_send`s the event onto each slot's
|
|
//! `mpsc::Sender<BusEvent>`.
|
|
//! - Slow subscribers drop their own events at their mpsc (counted in the
|
|
//! slot's `lagged` atomic). The bus-internal broadcast channel never drops
|
|
//! due to a slow subscriber — only due to the broadcast lag contract, which
|
|
//! we log and continue.
|
|
//! - `SessionRegistered` events late-bind `(connector_id, native_session_id) ->
|
|
//! scroll_id` via a small cache consulted on every publish.
|
|
|
|
use std::collections::HashMap;
|
|
use std::sync::Arc;
|
|
use std::sync::atomic::{AtomicU64, Ordering};
|
|
|
|
use tokio::sync::{broadcast, mpsc, RwLock};
|
|
use tokio::task::JoinHandle;
|
|
use tracing::{debug, warn};
|
|
use uuid::Uuid;
|
|
|
|
use dirigent_protocol::streaming::{BusEvent, EventFilter};
|
|
pub use dirigent_protocol::streaming::BusReceiver;
|
|
use dirigent_protocol::Event;
|
|
|
|
/// Capacity of the internal `broadcast` channel that connects `publish` to the
/// dispatch worker (1024 events).
const BUS_INTERNAL_CAPACITY: usize = 1 << 10;

/// Per-subscriber queue capacity used by `subscribe_all` (256 events).
const SUBSCRIBER_QUEUE_DEFAULT: usize = 1 << 8;
|
|
|
|
/// Single-producer, many-subscriber event multiplexer.
|
|
///
|
|
/// Subscribers see a `mpsc::Receiver<BusEvent>` that only yields events
|
|
/// matching their `EventFilter`. Filtering happens inside a single worker
|
|
/// task, so the cost per event is O(n_subscribers) regardless of publisher
|
|
/// count. Slow subscribers lose events at their own mpsc, not at the bus.
|
|
pub struct SharingBus {
|
|
publish_tx: broadcast::Sender<BusEvent>,
|
|
subscribers: Arc<RwLock<Vec<SubscriberSlot>>>,
|
|
scroll_id_cache: Arc<RwLock<HashMap<(String, String), Uuid>>>,
|
|
next_id: Arc<AtomicU64>,
|
|
_worker: JoinHandle<()>,
|
|
}
|
|
|
|
struct SubscriberSlot {
|
|
id: u64,
|
|
filter: EventFilter,
|
|
sender: mpsc::Sender<BusEvent>,
|
|
lagged: Arc<AtomicU64>,
|
|
}
|
|
|
|
impl SharingBus {
|
|
/// Construct a new bus and spawn its dispatch worker.
|
|
pub fn new() -> Arc<Self> {
|
|
let (publish_tx, publish_rx) = broadcast::channel(BUS_INTERNAL_CAPACITY);
|
|
let subscribers: Arc<RwLock<Vec<SubscriberSlot>>> = Arc::new(RwLock::new(Vec::new()));
|
|
let scroll_id_cache: Arc<RwLock<HashMap<(String, String), Uuid>>> =
|
|
Arc::new(RwLock::new(HashMap::new()));
|
|
let next_id = Arc::new(AtomicU64::new(0));
|
|
|
|
let worker = tokio::spawn(run_worker(publish_rx, Arc::clone(&subscribers)));
|
|
|
|
Arc::new(Self {
|
|
publish_tx,
|
|
subscribers,
|
|
scroll_id_cache,
|
|
next_id,
|
|
_worker: worker,
|
|
})
|
|
}
|
|
|
|
/// Publish a `BusEvent` to all matching subscribers.
|
|
///
|
|
/// This method also performs two side-effects on the scroll-id cache:
|
|
///
|
|
/// 1. If the wrapped event is `Event::SessionRegistered`, the binding
|
|
/// `(connector_id, session_id) -> scroll_id` is inserted into the
|
|
/// cache, and the current event's `routing.scroll_id` is set so the
|
|
/// binding event itself carries its own scroll_id downstream.
|
|
/// 2. If the event's `routing.scroll_id` is absent but it carries both a
|
|
/// `connector_id` and `native_session_id`, the cache is consulted to
|
|
/// late-bind `scroll_id` before broadcasting.
|
|
pub async fn publish(&self, mut bus_event: BusEvent) {
|
|
// (2) Late-bind scroll_id from cache if we can, BEFORE the possibly
|
|
// more specific (1) handling overrides it. This is a no-op for
|
|
// SessionRegistered (its scroll_id is always populated in (1)).
|
|
if bus_event.routing.scroll_id.is_none() {
|
|
if let (Some(cid), Some(nsid)) = (
|
|
bus_event.routing.connector_id.as_ref(),
|
|
bus_event.routing.native_session_id.as_ref(),
|
|
) {
|
|
let cache = self.scroll_id_cache.read().await;
|
|
if let Some(uuid) = cache.get(&(cid.clone(), nsid.clone())) {
|
|
bus_event.routing.scroll_id = Some(*uuid);
|
|
}
|
|
}
|
|
}
|
|
|
|
// (1) If the wrapped event is SessionRegistered, populate the cache
|
|
// and set scroll_id on the event itself.
|
|
if let Event::SessionRegistered {
|
|
connector_id,
|
|
session_id,
|
|
scroll_id,
|
|
} = bus_event.event.as_ref()
|
|
{
|
|
match Uuid::parse_str(scroll_id) {
|
|
Ok(uuid) => {
|
|
self.scroll_id_cache
|
|
.write()
|
|
.await
|
|
.insert((connector_id.clone(), session_id.clone()), uuid);
|
|
bus_event.routing.scroll_id = Some(uuid);
|
|
}
|
|
Err(e) => {
|
|
warn!(
|
|
connector_id = %connector_id,
|
|
session_id = %session_id,
|
|
scroll_id = %scroll_id,
|
|
error = %e,
|
|
"SessionRegistered carried an unparseable scroll_id; skipping late-bind cache insert",
|
|
);
|
|
}
|
|
}
|
|
}
|
|
|
|
// No subscribers is not an error — ignore the Result.
|
|
let _ = self.publish_tx.send(bus_event);
|
|
}
|
|
|
|
/// Subscribe to every event on the bus.
|
|
pub async fn subscribe_all(&self) -> BusReceiver {
|
|
self.subscribe_filtered(EventFilter::All, SUBSCRIBER_QUEUE_DEFAULT)
|
|
.await
|
|
}
|
|
|
|
/// Subscribe to events that match `filter`. `queue_capacity` caps the
|
|
/// buffered events between the worker and the caller's `recv()`.
|
|
pub async fn subscribe_filtered(
|
|
&self,
|
|
filter: EventFilter,
|
|
queue_capacity: usize,
|
|
) -> BusReceiver {
|
|
let (tx, rx) = mpsc::channel(queue_capacity);
|
|
let lagged = Arc::new(AtomicU64::new(0));
|
|
// Relaxed ordering is sufficient: subscriber IDs are only compared for
|
|
// equality with other IDs issued by this same bus; there is no
|
|
// cross-thread ordering dependency on this counter.
|
|
let id = self.next_id.fetch_add(1, Ordering::Relaxed);
|
|
self.subscribers.write().await.push(SubscriberSlot {
|
|
id,
|
|
filter,
|
|
sender: tx,
|
|
lagged: Arc::clone(&lagged),
|
|
});
|
|
BusReceiver { id, rx, lagged }
|
|
}
|
|
|
|
/// Remove a subscriber by id. Idempotent.
|
|
pub async fn unsubscribe(&self, id: u64) {
|
|
self.subscribers.write().await.retain(|s| s.id != id);
|
|
}
|
|
}
|
|
|
|
async fn run_worker(
|
|
mut rx: broadcast::Receiver<BusEvent>,
|
|
subscribers: Arc<RwLock<Vec<SubscriberSlot>>>,
|
|
) {
|
|
loop {
|
|
match rx.recv().await {
|
|
Ok(evt) => {
|
|
let mut closed_ids: Vec<u64> = Vec::new();
|
|
{
|
|
let subs = subscribers.read().await;
|
|
for slot in subs.iter() {
|
|
if !slot.filter.matches(&evt) {
|
|
continue;
|
|
}
|
|
match slot.sender.try_send(evt.clone()) {
|
|
Ok(()) => {}
|
|
Err(mpsc::error::TrySendError::Full(_)) => {
|
|
slot.lagged.fetch_add(1, Ordering::Relaxed);
|
|
warn!(
|
|
subscriber_id = slot.id,
|
|
"bus subscriber queue full; dropping event"
|
|
);
|
|
}
|
|
Err(mpsc::error::TrySendError::Closed(_)) => {
|
|
closed_ids.push(slot.id);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
if !closed_ids.is_empty() {
|
|
subscribers
|
|
.write()
|
|
.await
|
|
.retain(|s| !closed_ids.contains(&s.id));
|
|
debug!(removed = closed_ids.len(), "GC'd closed subscriber slots");
|
|
}
|
|
}
|
|
Err(broadcast::error::RecvError::Lagged(n)) => {
|
|
warn!(skipped = n, "SharingBus internal broadcast lagged");
|
|
}
|
|
Err(broadcast::error::RecvError::Closed) => {
|
|
debug!("SharingBus worker exiting (sender closed)");
|
|
return;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// ─── Tests ───────────────────────────────────────────────────────────────────
|
|
|
|
#[cfg(test)]
mod tests {
    use std::sync::Arc;
    use std::sync::atomic::Ordering;
    use std::time::Duration;

    use tokio::time::timeout;
    use uuid::Uuid;

    use super::*;
    use dirigent_protocol::streaming::{BusEvent, EventKind, EventOrigin, EventRouting};
    use dirigent_protocol::Event;

    /// Assemble a `BusEvent` with the given routing fields and payload. The
    /// origin is always `EventOrigin::Runtime`.
    fn make_event(
        scroll_id: Option<Uuid>,
        connector_uid: Option<Uuid>,
        connector_id: Option<String>,
        native_session_id: Option<String>,
        kind: EventKind,
        event: Event,
    ) -> BusEvent {
        let routing = EventRouting {
            scroll_id,
            connector_uid,
            connector_id,
            native_session_id,
            kind,
        };
        BusEvent {
            routing,
            origin: EventOrigin::Runtime,
            event: Arc::new(event),
        }
    }

    /// `Event::Connected` of kind `System` with every routing field unset.
    fn bare_connected() -> BusEvent {
        make_event(
            None,
            None,
            None,
            None,
            EventKind::System,
            Event::Connected,
        )
    }

    /// Wait up to 200 ms for the next event; panic with `on_timeout` if none
    /// arrives, or with a fixed message if the channel is closed.
    async fn recv_or_panic(recv: &mut BusReceiver, on_timeout: &str) -> BusEvent {
        timeout(Duration::from_millis(200), recv.rx.recv())
            .await
            .expect(on_timeout)
            .expect("channel closed unexpectedly")
    }

    /// Yield to the scheduler ten times, then sleep for `pause`, so the
    /// dispatch worker gets a chance to process what was published.
    async fn let_worker_run(pause: Duration) {
        for _ in 0..10 {
            tokio::task::yield_now().await;
        }
        tokio::time::sleep(pause).await;
    }

    /// Panic unless `event` is `Event::Connected`.
    fn expect_connected(event: &Event) {
        if !matches!(event, Event::Connected) {
            panic!("expected Event::Connected, got {:?}", event);
        }
    }

    // 1. An unfiltered subscriber receives a published event.
    #[tokio::test]
    async fn subscribe_all_receives_published_event() {
        let bus = SharingBus::new();
        let mut recv = bus.subscribe_all().await;

        bus.publish(bare_connected()).await;

        let got = recv_or_panic(&mut recv, "timed out waiting for event").await;
        expect_connected(got.event.as_ref());
    }

    // 2. A ConnectorUid filter lets the matching UID through and nothing else.
    #[tokio::test]
    async fn connector_uid_filter_only_forwards_matching_events() {
        let bus = SharingBus::new();
        let target = Uuid::new_v4();
        let other = Uuid::new_v4();

        let mut recv = bus
            .subscribe_filtered(EventFilter::ConnectorUid(target), 16)
            .await;

        // One event for the subscribed UID, followed by one for a different UID.
        for uid in [target, other] {
            let ev = make_event(
                None,
                Some(uid),
                None,
                None,
                EventKind::System,
                Event::Connected,
            );
            bus.publish(ev).await;
        }

        // Only the event for `target` is delivered ...
        let got = recv_or_panic(&mut recv, "timed out waiting for first event").await;
        assert_eq!(got.routing.connector_uid, Some(target));

        // ... and the queue then stays empty until the timeout fires.
        let result = timeout(Duration::from_millis(100), recv.rx.recv()).await;
        assert!(
            result.is_err(),
            "expected no further events, got: {:?}",
            result.ok().flatten().map(|e| e.routing.connector_uid)
        );
    }

    // 3. Overflowing a subscriber queue bumps `lagged` and keeps the first event.
    #[tokio::test]
    async fn full_queue_increments_lagged_counter() {
        let bus = SharingBus::new();
        // A queue of one slot: every event after the first finds it full.
        let mut recv = bus.subscribe_filtered(EventFilter::All, 1).await;

        // Five events are published while nothing is being received.
        for _ in 0..5 {
            bus.publish(bare_connected()).await;
        }

        // Let the worker work through all five.
        let_worker_run(Duration::from_millis(20)).await;

        // The event that fit into the queue is still there.
        let first = recv_or_panic(&mut recv, "timed out waiting for first event").await;
        expect_connected(first.event.as_ref());

        // Five published, one buffered: at least four must have been dropped.
        let lagged = recv.lagged.load(Ordering::Relaxed);
        assert!(
            lagged >= 4,
            "expected lagged >= 4 after publishing 5 events to a capacity-1 queue, got {}",
            lagged
        );
    }

    // 4. SessionRegistered fills the scroll-id cache, and a later event with
    //    the same (connector_id, native_session_id) gets its scroll_id from it.
    #[tokio::test]
    async fn session_registered_populates_cache_and_late_binds_subsequent_events() {
        let bus = SharingBus::new();
        let scroll = Uuid::new_v4();

        // The subscriber filters on ScrollId(scroll), so it can only see an
        // event once the bus has attached that scroll_id to its routing.
        let mut recv = bus
            .subscribe_filtered(EventFilter::ScrollId(scroll), 16)
            .await;

        // Binding event: routing.scroll_id is None on entry and is expected to
        // be filled in by publish() from the payload.
        let registration = make_event(
            None,
            None,
            Some("c".to_string()),
            Some("s".to_string()),
            EventKind::SessionLifecycle,
            Event::SessionRegistered {
                connector_id: "c".to_string(),
                session_id: "s".to_string(),
                scroll_id: scroll.to_string(),
            },
        );
        bus.publish(registration).await;

        let got1 = recv_or_panic(&mut recv, "timed out waiting for SessionRegistered").await;
        assert!(matches!(
            got1.event.as_ref(),
            Event::SessionRegistered { .. }
        ));
        assert_eq!(got1.routing.scroll_id, Some(scroll));

        // Follow-up event: same connector/session pair, still no scroll_id.
        let follow_up = make_event(
            None,
            None,
            Some("c".to_string()),
            Some("s".to_string()),
            EventKind::System,
            Event::Connected,
        );
        bus.publish(follow_up).await;

        let got2 = recv_or_panic(&mut recv, "timed out waiting for late-bound follow-up").await;
        assert_eq!(
            got2.routing.scroll_id,
            Some(scroll),
            "follow-up event should have had scroll_id late-bound from the cache"
        );
        assert!(matches!(got2.event.as_ref(), Event::Connected));
    }

    // 5. A slot whose receiver was dropped disappears after the next publish.
    #[tokio::test]
    async fn closed_receiver_slot_is_reaped_on_next_publish() {
        let bus = SharingBus::new();

        // Dropping the receiver without calling `unsubscribe()` leaves the
        // slot registered.
        drop(bus.subscribe_all().await);
        assert_eq!(bus.subscribers.read().await.len(), 1);

        // The next publish makes the worker notice the closed channel.
        bus.publish(bare_connected()).await;

        // Give the worker a moment to process the event and clean up.
        let_worker_run(Duration::from_millis(10)).await;

        assert_eq!(
            bus.subscribers.read().await.len(),
            0,
            "closed subscriber slot should have been GC'd after publish"
        );
    }
}
|