sync from monorepo @ 2452e92e
This commit is contained in:
@@ -0,0 +1,477 @@
|
||||
//! SharingBus: single-producer, many-subscriber event multiplexer with
|
||||
//! subscriber-side filtering performed by a worker task. See
|
||||
//! docs/plans/2026-04-21-archivist-phase4-design.md §1.
|
||||
//!
|
||||
//! Architecture:
|
||||
//! - One internal `tokio::sync::broadcast::Sender<BusEvent>` feeds a single
|
||||
//! worker task. The worker iterates `Vec<SubscriberSlot>` (behind `RwLock`),
|
||||
//! filter-matches each slot, and `try_send`s the event onto each slot's
|
||||
//! `mpsc::Sender<BusEvent>`.
|
||||
//! - Slow subscribers drop their own events at their mpsc (counted in the
|
||||
//! slot's `lagged` atomic). The bus-internal broadcast channel never drops
|
||||
//! due to a slow subscriber — only due to the broadcast lag contract, which
|
||||
//! we log and continue.
|
||||
//! - `SessionRegistered` events late-bind `(connector_id, native_session_id) ->
|
||||
//! scroll_id` via a small cache consulted on every publish.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
|
||||
use tokio::sync::{broadcast, mpsc, RwLock};
|
||||
use tokio::task::JoinHandle;
|
||||
use tracing::{debug, warn};
|
||||
use uuid::Uuid;
|
||||
|
||||
use dirigent_protocol::streaming::{BusEvent, EventFilter};
|
||||
pub use dirigent_protocol::streaming::BusReceiver;
|
||||
use dirigent_protocol::Event;
|
||||
|
||||
/// Capacity of the internal `broadcast` channel that feeds the dispatch worker.
const BUS_INTERNAL_CAPACITY: usize = 1024;
|
||||
/// Per-subscriber `mpsc` queue capacity used by `SharingBus::subscribe_all`.
const SUBSCRIBER_QUEUE_DEFAULT: usize = 256;
|
||||
|
||||
/// Single-producer, many-subscriber event multiplexer.
|
||||
///
|
||||
/// Subscribers see a `mpsc::Receiver<BusEvent>` that only yields events
|
||||
/// matching their `EventFilter`. Filtering happens inside a single worker
|
||||
/// task, so the cost per event is O(n_subscribers) regardless of publisher
|
||||
/// count. Slow subscribers lose events at their own mpsc, not at the bus.
|
||||
pub struct SharingBus {
|
||||
publish_tx: broadcast::Sender<BusEvent>,
|
||||
subscribers: Arc<RwLock<Vec<SubscriberSlot>>>,
|
||||
scroll_id_cache: Arc<RwLock<HashMap<(String, String), Uuid>>>,
|
||||
next_id: Arc<AtomicU64>,
|
||||
_worker: JoinHandle<()>,
|
||||
}
|
||||
|
||||
struct SubscriberSlot {
|
||||
id: u64,
|
||||
filter: EventFilter,
|
||||
sender: mpsc::Sender<BusEvent>,
|
||||
lagged: Arc<AtomicU64>,
|
||||
}
|
||||
|
||||
impl SharingBus {
|
||||
/// Construct a new bus and spawn its dispatch worker.
|
||||
pub fn new() -> Arc<Self> {
|
||||
let (publish_tx, publish_rx) = broadcast::channel(BUS_INTERNAL_CAPACITY);
|
||||
let subscribers: Arc<RwLock<Vec<SubscriberSlot>>> = Arc::new(RwLock::new(Vec::new()));
|
||||
let scroll_id_cache: Arc<RwLock<HashMap<(String, String), Uuid>>> =
|
||||
Arc::new(RwLock::new(HashMap::new()));
|
||||
let next_id = Arc::new(AtomicU64::new(0));
|
||||
|
||||
let worker = tokio::spawn(run_worker(publish_rx, Arc::clone(&subscribers)));
|
||||
|
||||
Arc::new(Self {
|
||||
publish_tx,
|
||||
subscribers,
|
||||
scroll_id_cache,
|
||||
next_id,
|
||||
_worker: worker,
|
||||
})
|
||||
}
|
||||
|
||||
/// Publish a `BusEvent` to all matching subscribers.
|
||||
///
|
||||
/// This method also performs two side-effects on the scroll-id cache:
|
||||
///
|
||||
/// 1. If the wrapped event is `Event::SessionRegistered`, the binding
|
||||
/// `(connector_id, session_id) -> scroll_id` is inserted into the
|
||||
/// cache, and the current event's `routing.scroll_id` is set so the
|
||||
/// binding event itself carries its own scroll_id downstream.
|
||||
/// 2. If the event's `routing.scroll_id` is absent but it carries both a
|
||||
/// `connector_id` and `native_session_id`, the cache is consulted to
|
||||
/// late-bind `scroll_id` before broadcasting.
|
||||
pub async fn publish(&self, mut bus_event: BusEvent) {
|
||||
// (2) Late-bind scroll_id from cache if we can, BEFORE the possibly
|
||||
// more specific (1) handling overrides it. This is a no-op for
|
||||
// SessionRegistered (its scroll_id is always populated in (1)).
|
||||
if bus_event.routing.scroll_id.is_none() {
|
||||
if let (Some(cid), Some(nsid)) = (
|
||||
bus_event.routing.connector_id.as_ref(),
|
||||
bus_event.routing.native_session_id.as_ref(),
|
||||
) {
|
||||
let cache = self.scroll_id_cache.read().await;
|
||||
if let Some(uuid) = cache.get(&(cid.clone(), nsid.clone())) {
|
||||
bus_event.routing.scroll_id = Some(*uuid);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// (1) If the wrapped event is SessionRegistered, populate the cache
|
||||
// and set scroll_id on the event itself.
|
||||
if let Event::SessionRegistered {
|
||||
connector_id,
|
||||
session_id,
|
||||
scroll_id,
|
||||
} = bus_event.event.as_ref()
|
||||
{
|
||||
match Uuid::parse_str(scroll_id) {
|
||||
Ok(uuid) => {
|
||||
self.scroll_id_cache
|
||||
.write()
|
||||
.await
|
||||
.insert((connector_id.clone(), session_id.clone()), uuid);
|
||||
bus_event.routing.scroll_id = Some(uuid);
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
connector_id = %connector_id,
|
||||
session_id = %session_id,
|
||||
scroll_id = %scroll_id,
|
||||
error = %e,
|
||||
"SessionRegistered carried an unparseable scroll_id; skipping late-bind cache insert",
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// No subscribers is not an error — ignore the Result.
|
||||
let _ = self.publish_tx.send(bus_event);
|
||||
}
|
||||
|
||||
/// Subscribe to every event on the bus.
|
||||
pub async fn subscribe_all(&self) -> BusReceiver {
|
||||
self.subscribe_filtered(EventFilter::All, SUBSCRIBER_QUEUE_DEFAULT)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Subscribe to events that match `filter`. `queue_capacity` caps the
|
||||
/// buffered events between the worker and the caller's `recv()`.
|
||||
pub async fn subscribe_filtered(
|
||||
&self,
|
||||
filter: EventFilter,
|
||||
queue_capacity: usize,
|
||||
) -> BusReceiver {
|
||||
let (tx, rx) = mpsc::channel(queue_capacity);
|
||||
let lagged = Arc::new(AtomicU64::new(0));
|
||||
// Relaxed ordering is sufficient: subscriber IDs are only compared for
|
||||
// equality with other IDs issued by this same bus; there is no
|
||||
// cross-thread ordering dependency on this counter.
|
||||
let id = self.next_id.fetch_add(1, Ordering::Relaxed);
|
||||
self.subscribers.write().await.push(SubscriberSlot {
|
||||
id,
|
||||
filter,
|
||||
sender: tx,
|
||||
lagged: Arc::clone(&lagged),
|
||||
});
|
||||
BusReceiver { id, rx, lagged }
|
||||
}
|
||||
|
||||
/// Remove a subscriber by id. Idempotent.
|
||||
pub async fn unsubscribe(&self, id: u64) {
|
||||
self.subscribers.write().await.retain(|s| s.id != id);
|
||||
}
|
||||
}
|
||||
|
||||
async fn run_worker(
|
||||
mut rx: broadcast::Receiver<BusEvent>,
|
||||
subscribers: Arc<RwLock<Vec<SubscriberSlot>>>,
|
||||
) {
|
||||
loop {
|
||||
match rx.recv().await {
|
||||
Ok(evt) => {
|
||||
let mut closed_ids: Vec<u64> = Vec::new();
|
||||
{
|
||||
let subs = subscribers.read().await;
|
||||
for slot in subs.iter() {
|
||||
if !slot.filter.matches(&evt) {
|
||||
continue;
|
||||
}
|
||||
match slot.sender.try_send(evt.clone()) {
|
||||
Ok(()) => {}
|
||||
Err(mpsc::error::TrySendError::Full(_)) => {
|
||||
slot.lagged.fetch_add(1, Ordering::Relaxed);
|
||||
warn!(
|
||||
subscriber_id = slot.id,
|
||||
"bus subscriber queue full; dropping event"
|
||||
);
|
||||
}
|
||||
Err(mpsc::error::TrySendError::Closed(_)) => {
|
||||
closed_ids.push(slot.id);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if !closed_ids.is_empty() {
|
||||
subscribers
|
||||
.write()
|
||||
.await
|
||||
.retain(|s| !closed_ids.contains(&s.id));
|
||||
debug!(removed = closed_ids.len(), "GC'd closed subscriber slots");
|
||||
}
|
||||
}
|
||||
Err(broadcast::error::RecvError::Lagged(n)) => {
|
||||
warn!(skipped = n, "SharingBus internal broadcast lagged");
|
||||
}
|
||||
Err(broadcast::error::RecvError::Closed) => {
|
||||
debug!("SharingBus worker exiting (sender closed)");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Tests ───────────────────────────────────────────────────────────────────
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::sync::Arc;
    use std::sync::atomic::Ordering;
    use std::time::Duration;

    use tokio::time::timeout;
    use uuid::Uuid;

    use super::*;
    use dirigent_protocol::streaming::{BusEvent, EventKind, EventOrigin, EventRouting};
    use dirigent_protocol::Event;

    /// Assemble a `BusEvent` with the given routing fields and payload. The
    /// origin is always `EventOrigin::Runtime`.
    fn make_event(
        scroll_id: Option<Uuid>,
        connector_uid: Option<Uuid>,
        connector_id: Option<String>,
        native_session_id: Option<String>,
        kind: EventKind,
        event: Event,
    ) -> BusEvent {
        let routing = EventRouting {
            scroll_id,
            connector_uid,
            connector_id,
            native_session_id,
            kind,
        };
        BusEvent {
            routing,
            origin: EventOrigin::Runtime,
            event: Arc::new(event),
        }
    }

    /// A `System` / `Event::Connected` event with every routing id unset.
    fn bare_connected_event() -> BusEvent {
        make_event(None, None, None, None, EventKind::System, Event::Connected)
    }

    /// Panic unless the payload of `received` is `Event::Connected`.
    fn expect_connected(received: &BusEvent) {
        let payload = received.event.as_ref();
        if !matches!(payload, Event::Connected) {
            panic!("expected Event::Connected, got {:?}", payload);
        }
    }

    // 1. subscribe_all + publish: a single event reaches the receiver.
    #[tokio::test]
    async fn subscribe_all_receives_published_event() {
        let bus = SharingBus::new();
        let mut recv = bus.subscribe_all().await;

        bus.publish(bare_connected_event()).await;

        let got = timeout(Duration::from_millis(200), recv.rx.recv())
            .await
            .expect("timed out waiting for event")
            .expect("channel closed unexpectedly");

        expect_connected(&got);
    }

    // 2. ConnectorUid filter: the matching UID is delivered, any other is not.
    #[tokio::test]
    async fn connector_uid_filter_only_forwards_matching_events() {
        let bus = SharingBus::new();
        let target = Uuid::new_v4();
        let other = Uuid::new_v4();

        let mut recv = bus
            .subscribe_filtered(EventFilter::ConnectorUid(target), 16)
            .await;

        // One event for the subscribed UID, then one for an unrelated UID.
        for uid in [target, other] {
            let ev = make_event(
                None,
                Some(uid),
                None,
                None,
                EventKind::System,
                Event::Connected,
            );
            bus.publish(ev).await;
        }

        // The first delivery is the matching event.
        let got = timeout(Duration::from_millis(200), recv.rx.recv())
            .await
            .expect("timed out waiting for first event")
            .expect("channel closed unexpectedly");
        assert_eq!(got.routing.connector_uid, Some(target));

        // Nothing else matched, so a second recv has to time out.
        let result = timeout(Duration::from_millis(100), recv.rx.recv()).await;
        assert!(
            result.is_err(),
            "expected no further events, got: {:?}",
            result.ok().flatten().map(|e| e.routing.connector_uid)
        );
    }

    // 3. A full queue bumps the lagged counter; the first event still arrives.
    #[tokio::test]
    async fn full_queue_increments_lagged_counter() {
        let bus = SharingBus::new();
        // With capacity 1, only a single event fits before try_send fails.
        let mut recv = bus.subscribe_filtered(EventFilter::All, 1).await;

        // Five publishes with nobody draining the queue.
        for _ in 0..5 {
            bus.publish(bare_connected_event()).await;
        }

        // Let the worker get through all five.
        for _ in 0..10 {
            tokio::task::yield_now().await;
        }
        tokio::time::sleep(Duration::from_millis(20)).await;

        // The very first event is the one sitting in the queue.
        let first = timeout(Duration::from_millis(200), recv.rx.recv())
            .await
            .expect("timed out waiting for first event")
            .expect("channel closed unexpectedly");
        expect_connected(&first);

        // 5 published, 1 buffered: at least 4 must have been dropped.
        let lagged = recv.lagged.load(Ordering::Relaxed);
        assert!(
            lagged >= 4,
            "expected lagged >= 4 after publishing 5 events to a capacity-1 queue, got {}",
            lagged
        );
    }

    // 4. scroll_id late-bind: SessionRegistered fills the cache, and later
    //    events with the same (connector_id, native_session_id) get their
    //    scroll_id set before dispatch.
    #[tokio::test]
    async fn session_registered_populates_cache_and_late_binds_subsequent_events() {
        let bus = SharingBus::new();
        let scroll = Uuid::new_v4();

        // The subscriber filters on ScrollId(scroll) and should observe both
        //   - the SessionRegistered event (publish stamps its scroll_id), and
        //   - a follow-up for (connector_id="c", native_session_id="s") that
        //     arrives without a scroll_id and is late-bound from the cache.
        let mut recv = bus
            .subscribe_filtered(EventFilter::ScrollId(scroll), 16)
            .await;

        // --- binding event ---
        // Routing carries what a producer would set; scroll_id is left None
        // and publish() derives it from the payload.
        let reg_bus = make_event(
            None,
            None,
            Some("c".to_string()),
            Some("s".to_string()),
            EventKind::SessionLifecycle,
            Event::SessionRegistered {
                connector_id: "c".to_string(),
                session_id: "s".to_string(),
                scroll_id: scroll.to_string(),
            },
        );
        bus.publish(reg_bus).await;

        let got1 = timeout(Duration::from_millis(200), recv.rx.recv())
            .await
            .expect("timed out waiting for SessionRegistered")
            .expect("channel closed unexpectedly");
        assert!(matches!(
            got1.event.as_ref(),
            Event::SessionRegistered { .. }
        ));
        assert_eq!(got1.routing.scroll_id, Some(scroll));

        // --- follow-up without scroll_id but with the same connector_id and
        //     native_session_id ---
        let follow_up = make_event(
            None,
            None,
            Some("c".to_string()),
            Some("s".to_string()),
            EventKind::System,
            Event::Connected,
        );
        bus.publish(follow_up).await;

        let got2 = timeout(Duration::from_millis(200), recv.rx.recv())
            .await
            .expect("timed out waiting for late-bound follow-up")
            .expect("channel closed unexpectedly");
        assert_eq!(
            got2.routing.scroll_id,
            Some(scroll),
            "follow-up event should have had scroll_id late-bound from the cache"
        );
        assert!(matches!(got2.event.as_ref(), Event::Connected));
    }

    // 5. A dropped receiver is removed after the next publish.
    #[tokio::test]
    async fn closed_receiver_slot_is_reaped_on_next_publish() {
        let bus = SharingBus::new();

        // Subscribe and drop the receiver right away, as a caller that never
        // calls `unsubscribe()` would.
        drop(bus.subscribe_all().await);

        // The slot is still registered until the worker notices.
        assert_eq!(bus.subscribers.read().await.len(), 1);

        // One publish makes the worker hit TrySendError::Closed and schedule
        // the slot for removal.
        bus.publish(bare_connected_event()).await;

        // Give the worker time to dispatch and clean up.
        for _ in 0..10 {
            tokio::task::yield_now().await;
        }
        tokio::time::sleep(Duration::from_millis(10)).await;

        assert_eq!(
            bus.subscribers.read().await.len(),
            0,
            "closed subscriber slot should have been GC'd after publish"
        );
    }
}
|
||||
Reference in New Issue
Block a user