From 41d786cc11c791a13c897e1b42489804c00b8dff Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 16:29:35 +0300 Subject: [PATCH 01/17] Safety Gates Invariants + HybridAsyncPersistent + Watch + Runtime Snapshots + ME Writer Ping Tracker + Parallel Recovery + Backpressure Guardrails Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/config/hot_reload.rs | 3 + src/maestro/me_startup.rs | 2 + src/metrics.rs | 14 ++ src/stats/mod.rs | 10 + src/transport/middle_proxy/config_updater.rs | 47 ---- src/transport/middle_proxy/health.rs | 33 ++- src/transport/middle_proxy/pool.rs | 201 +++++++++++------- src/transport/middle_proxy/pool_config.rs | 2 +- src/transport/middle_proxy/pool_refill.rs | 23 ++ src/transport/middle_proxy/pool_writer.rs | 45 ++-- src/transport/middle_proxy/reader.rs | 12 +- src/transport/middle_proxy/registry.rs | 42 +++- src/transport/middle_proxy/send.rs | 122 ++++++++++- .../tests/health_adversarial_tests.rs | 2 + .../tests/health_integration_tests.rs | 2 + .../tests/health_regression_tests.rs | 2 + .../tests/pool_refill_security_tests.rs | 2 + .../tests/pool_writer_security_tests.rs | 2 + .../tests/send_adversarial_tests.rs | 2 + 19 files changed, 384 insertions(+), 184 deletions(-) diff --git a/src/config/hot_reload.rs b/src/config/hot_reload.rs index 7f7499e..f8064dd 100644 --- a/src/config/hot_reload.rs +++ b/src/config/hot_reload.rs @@ -651,6 +651,9 @@ fn warn_non_hot_changes(old: &ProxyConfig, new: &ProxyConfig, non_hot_changed: b } if old.general.me_route_no_writer_mode != new.general.me_route_no_writer_mode || old.general.me_route_no_writer_wait_ms != new.general.me_route_no_writer_wait_ms + || old.general.me_route_hybrid_max_wait_ms != new.general.me_route_hybrid_max_wait_ms + || old.general.me_route_blocking_send_timeout_ms + != new.general.me_route_blocking_send_timeout_ms || old.general.me_route_inline_recovery_attempts != 
new.general.me_route_inline_recovery_attempts || old.general.me_route_inline_recovery_wait_ms diff --git a/src/maestro/me_startup.rs b/src/maestro/me_startup.rs index b1e605c..4e49e9e 100644 --- a/src/maestro/me_startup.rs +++ b/src/maestro/me_startup.rs @@ -277,6 +277,8 @@ pub(crate) async fn initialize_me_pool( config.general.me_warn_rate_limit_ms, config.general.me_route_no_writer_mode, config.general.me_route_no_writer_wait_ms, + config.general.me_route_hybrid_max_wait_ms, + config.general.me_route_blocking_send_timeout_ms, config.general.me_route_inline_recovery_attempts, config.general.me_route_inline_recovery_wait_ms, ); diff --git a/src/metrics.rs b/src/metrics.rs index f9475f6..c125ef5 100644 --- a/src/metrics.rs +++ b/src/metrics.rs @@ -2318,6 +2318,20 @@ async fn render_metrics(stats: &Stats, config: &ProxyConfig, ip_tracker: &UserIp 0 } ); + let _ = writeln!( + out, + "# HELP telemt_me_hybrid_timeout_total ME hybrid route timeouts after bounded retry window" + ); + let _ = writeln!(out, "# TYPE telemt_me_hybrid_timeout_total counter"); + let _ = writeln!( + out, + "telemt_me_hybrid_timeout_total {}", + if me_allows_normal { + stats.get_me_hybrid_timeout_total() + } else { + 0 + } + ); let _ = writeln!( out, "# HELP telemt_me_async_recovery_trigger_total Async ME recovery trigger attempts from route path" diff --git a/src/stats/mod.rs b/src/stats/mod.rs index ff15d4f..9cba3e8 100644 --- a/src/stats/mod.rs +++ b/src/stats/mod.rs @@ -234,6 +234,7 @@ pub struct Stats { me_writer_restored_same_endpoint_total: AtomicU64, me_writer_restored_fallback_total: AtomicU64, me_no_writer_failfast_total: AtomicU64, + me_hybrid_timeout_total: AtomicU64, me_async_recovery_trigger_total: AtomicU64, me_inline_recovery_total: AtomicU64, ip_reservation_rollback_tcp_limit_total: AtomicU64, @@ -1203,6 +1204,12 @@ impl Stats { .fetch_add(1, Ordering::Relaxed); } } + pub fn increment_me_hybrid_timeout_total(&self) { + if self.telemetry_me_allows_normal() { + 
self.me_hybrid_timeout_total + .fetch_add(1, Ordering::Relaxed); + } + } pub fn increment_me_async_recovery_trigger_total(&self) { if self.telemetry_me_allows_normal() { self.me_async_recovery_trigger_total @@ -1876,6 +1883,9 @@ impl Stats { pub fn get_me_no_writer_failfast_total(&self) -> u64 { self.me_no_writer_failfast_total.load(Ordering::Relaxed) } + pub fn get_me_hybrid_timeout_total(&self) -> u64 { + self.me_hybrid_timeout_total.load(Ordering::Relaxed) + } pub fn get_me_async_recovery_trigger_total(&self) -> u64 { self.me_async_recovery_trigger_total.load(Ordering::Relaxed) } diff --git a/src/transport/middle_proxy/config_updater.rs b/src/transport/middle_proxy/config_updater.rs index ba90c1a..ebe45fc 100644 --- a/src/transport/middle_proxy/config_updater.rs +++ b/src/transport/middle_proxy/config_updater.rs @@ -314,53 +314,6 @@ async fn run_update_cycle( reinit_tx: &mpsc::Sender, ) { let upstream = pool.upstream.clone(); - pool.update_runtime_reinit_policy( - cfg.general.hardswap, - cfg.general.me_pool_drain_ttl_secs, - cfg.general.me_instadrain, - cfg.general.me_pool_drain_threshold, - cfg.general.me_pool_drain_soft_evict_enabled, - cfg.general.me_pool_drain_soft_evict_grace_secs, - cfg.general.me_pool_drain_soft_evict_per_writer, - cfg.general.me_pool_drain_soft_evict_budget_per_core, - cfg.general.me_pool_drain_soft_evict_cooldown_ms, - cfg.general.effective_me_pool_force_close_secs(), - cfg.general.me_pool_min_fresh_ratio, - cfg.general.me_hardswap_warmup_delay_min_ms, - cfg.general.me_hardswap_warmup_delay_max_ms, - cfg.general.me_hardswap_warmup_extra_passes, - cfg.general.me_hardswap_warmup_pass_backoff_base_ms, - cfg.general.me_bind_stale_mode, - cfg.general.me_bind_stale_ttl_secs, - cfg.general.me_secret_atomic_snapshot, - cfg.general.me_deterministic_writer_sort, - cfg.general.me_writer_pick_mode, - cfg.general.me_writer_pick_sample_size, - cfg.general.me_single_endpoint_shadow_writers, - cfg.general.me_single_endpoint_outage_mode_enabled, - 
cfg.general.me_single_endpoint_outage_disable_quarantine, - cfg.general.me_single_endpoint_outage_backoff_min_ms, - cfg.general.me_single_endpoint_outage_backoff_max_ms, - cfg.general.me_single_endpoint_shadow_rotate_every_secs, - cfg.general.me_floor_mode, - cfg.general.me_adaptive_floor_idle_secs, - cfg.general.me_adaptive_floor_min_writers_single_endpoint, - cfg.general.me_adaptive_floor_min_writers_multi_endpoint, - cfg.general.me_adaptive_floor_recover_grace_secs, - cfg.general.me_adaptive_floor_writers_per_core_total, - cfg.general.me_adaptive_floor_cpu_cores_override, - cfg.general - .me_adaptive_floor_max_extra_writers_single_per_core, - cfg.general - .me_adaptive_floor_max_extra_writers_multi_per_core, - cfg.general.me_adaptive_floor_max_active_writers_per_core, - cfg.general.me_adaptive_floor_max_warm_writers_per_core, - cfg.general.me_adaptive_floor_max_active_writers_global, - cfg.general.me_adaptive_floor_max_warm_writers_global, - cfg.general.me_health_interval_ms_unhealthy, - cfg.general.me_health_interval_ms_healthy, - cfg.general.me_warn_rate_limit_ms, - ); let required_cfg_snapshots = cfg.general.me_config_stable_snapshots.max(1); let required_secret_snapshots = cfg.general.proxy_secret_stable_snapshots.max(1); diff --git a/src/transport/middle_proxy/health.rs b/src/transport/middle_proxy/health.rs index 3e53f38..9e0933f 100644 --- a/src/transport/middle_proxy/health.rs +++ b/src/transport/middle_proxy/health.rs @@ -7,6 +7,7 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use rand::RngExt; +use tokio::sync::Semaphore; use tracing::{debug, info, warn}; use crate::config::MeFloorMode; @@ -78,6 +79,7 @@ pub async fn me_health_monitor(pool: Arc, rng: Arc, _min_c }; tokio::time::sleep(interval).await; pool.prune_closed_writers().await; + pool.sweep_endpoint_quarantine().await; reap_draining_writers(&pool, &mut drain_warn_next_allowed).await; let v4_degraded = check_family( IpFamily::V4, @@ -365,7 +367,8 @@ async fn check_family( 
endpoints.sort_unstable(); endpoints.dedup(); } - let mut reconnect_budget = health_reconnect_budget(pool, dc_endpoints.len()); + let reconnect_budget = health_reconnect_budget(pool, dc_endpoints.len()); + let reconnect_sem = Arc::new(Semaphore::new(reconnect_budget)); if pool.floor_mode() == MeFloorMode::Static { adaptive_idle_since.clear(); @@ -461,7 +464,7 @@ async fn check_family( required, outage_backoff, outage_next_attempt, - &mut reconnect_budget, + &reconnect_sem, ) .await; continue; @@ -521,7 +524,7 @@ async fn check_family( family_degraded = true; let now = Instant::now(); - if reconnect_budget == 0 { + if reconnect_sem.available_permits() == 0 { let base_ms = pool.me_reconnect_backoff_base.as_millis() as u64; let next_ms = (*backoff.get(&key).unwrap_or(&base_ms)).max(base_ms); let jitter = next_ms / JITTER_FRAC_NUM; @@ -567,10 +570,9 @@ async fn check_family( let mut restored = 0usize; for _ in 0..missing { - if reconnect_budget == 0 { + let Ok(reconnect_permit) = reconnect_sem.clone().try_acquire_owned() else { break; - } - reconnect_budget = reconnect_budget.saturating_sub(1); + }; if pool.active_contour_writer_count_total().await >= floor_plan.active_cap_effective_total { @@ -621,6 +623,7 @@ async fn check_family( debug!(dc = %dc, ?family, "ME reconnect timed out"); } } + drop(reconnect_permit); } let now_alive = alive + restored; @@ -1188,7 +1191,7 @@ async fn recover_single_endpoint_outage( required: usize, outage_backoff: &mut HashMap<(i32, IpFamily), u64>, outage_next_attempt: &mut HashMap<(i32, IpFamily), Instant>, - reconnect_budget: &mut usize, + reconnect_sem: &Arc, ) { let now = Instant::now(); if let Some(ts) = outage_next_attempt.get(&key) @@ -1198,7 +1201,7 @@ async fn recover_single_endpoint_outage( } let (min_backoff_ms, max_backoff_ms) = pool.single_endpoint_outage_backoff_bounds_ms(); - if *reconnect_budget == 0 { + if reconnect_sem.available_permits() == 0 { outage_next_attempt.insert(key, now + 
Duration::from_millis(min_backoff_ms.max(250))); debug!( dc = %key.0, @@ -1209,7 +1212,17 @@ async fn recover_single_endpoint_outage( ); return; } - *reconnect_budget = (*reconnect_budget).saturating_sub(1); + let Ok(_reconnect_permit) = reconnect_sem.clone().try_acquire_owned() else { + outage_next_attempt.insert(key, now + Duration::from_millis(min_backoff_ms.max(250))); + debug!( + dc = %key.0, + family = ?key.1, + %endpoint, + required, + "Single-endpoint outage reconnect deferred by semaphore saturation" + ); + return; + }; pool.stats .increment_me_single_endpoint_outage_reconnect_attempt_total(); @@ -1687,6 +1700,8 @@ mod tests { general.me_warn_rate_limit_ms, MeRouteNoWriterMode::default(), general.me_route_no_writer_wait_ms, + general.me_route_hybrid_max_wait_ms, + general.me_route_blocking_send_timeout_ms, general.me_route_inline_recovery_attempts, general.me_route_inline_recovery_wait_ms, ) diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index 71ab257..5faa76d 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -8,7 +8,8 @@ use std::sync::atomic::{ }; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use tokio::sync::{Mutex, Notify, RwLock, mpsc}; +use arc_swap::ArcSwap; +use tokio::sync::{Mutex, RwLock, mpsc, watch}; use tokio_util::sync::CancellationToken; use crate::config::{ @@ -69,6 +70,10 @@ impl WriterContour { } pub(super) fn from_u8(value: u8) -> Self { + debug_assert!( + value <= Self::Draining as u8, + "Unexpected WriterContour discriminant: {value}" + ); match value { 0 => Self::Warm, 1 => Self::Active, @@ -87,6 +92,33 @@ pub(crate) enum MeFamilyRuntimeState { Recovering = 3, } +#[derive(Debug, Clone)] +pub(crate) struct FamilyHealthSnapshot { + pub(crate) state: MeFamilyRuntimeState, + pub(crate) state_since_epoch_secs: u64, + pub(crate) suppressed_until_epoch_secs: u64, + pub(crate) fail_streak: u32, + pub(crate) recover_success_streak: u32, +} + +impl 
FamilyHealthSnapshot { + fn new( + state: MeFamilyRuntimeState, + state_since_epoch_secs: u64, + suppressed_until_epoch_secs: u64, + fail_streak: u32, + recover_success_streak: u32, + ) -> Self { + Self { + state, + state_since_epoch_secs, + suppressed_until_epoch_secs, + fail_streak, + recover_success_streak, + } + } +} + impl MeFamilyRuntimeState { pub(crate) fn from_u8(value: u8) -> Self { match value { @@ -214,13 +246,11 @@ pub struct MePool { pub(super) endpoint_dc_map: Arc>>>, pub(super) default_dc: AtomicI32, pub(super) next_writer_id: AtomicU64, - pub(super) ping_tracker: Arc>>, - pub(super) ping_tracker_last_cleanup_epoch_ms: AtomicU64, pub(super) rtt_stats: Arc>>, pub(super) nat_reflection_cache: Arc>, pub(super) nat_reflection_singleflight_v4: Arc>, pub(super) nat_reflection_singleflight_v6: Arc>, - pub(super) writer_available: Arc, + pub(super) writer_epoch: watch::Sender, pub(super) refill_inflight: Arc>>, pub(super) refill_inflight_dc: Arc>>, pub(super) conn_count: AtomicUsize, @@ -259,21 +289,18 @@ pub struct MePool { pub(super) me_reader_route_data_wait_ms: Arc, pub(super) me_route_no_writer_mode: AtomicU8, pub(super) me_route_no_writer_wait: Duration, + pub(super) me_route_hybrid_max_wait: Duration, + pub(super) me_route_blocking_send_timeout: Option, + pub(super) me_route_last_success_epoch_ms: AtomicU64, + pub(super) me_route_hybrid_timeout_warn_epoch_ms: AtomicU64, + pub(super) me_async_recovery_last_trigger_epoch_ms: AtomicU64, pub(super) me_route_inline_recovery_attempts: u32, pub(super) me_route_inline_recovery_wait: Duration, pub(super) me_health_interval_ms_unhealthy: AtomicU64, pub(super) me_health_interval_ms_healthy: AtomicU64, pub(super) me_warn_rate_limit_ms: AtomicU64, - pub(super) me_family_v4_runtime_state: AtomicU8, - pub(super) me_family_v6_runtime_state: AtomicU8, - pub(super) me_family_v4_state_since_epoch_secs: AtomicU64, - pub(super) me_family_v6_state_since_epoch_secs: AtomicU64, - pub(super) 
me_family_v4_suppressed_until_epoch_secs: AtomicU64, - pub(super) me_family_v6_suppressed_until_epoch_secs: AtomicU64, - pub(super) me_family_v4_fail_streak: AtomicU32, - pub(super) me_family_v6_fail_streak: AtomicU32, - pub(super) me_family_v4_recover_success_streak: AtomicU32, - pub(super) me_family_v6_recover_success_streak: AtomicU32, + pub(super) family_health_v4: ArcSwap, + pub(super) family_health_v6: ArcSwap, pub(super) me_last_drain_gate_route_quorum_ok: AtomicBool, pub(super) me_last_drain_gate_redundancy_ok: AtomicBool, pub(super) me_last_drain_gate_block_reason: AtomicU8, @@ -396,6 +423,8 @@ impl MePool { me_warn_rate_limit_ms: u64, me_route_no_writer_mode: MeRouteNoWriterMode, me_route_no_writer_wait_ms: u64, + me_route_hybrid_max_wait_ms: u64, + me_route_blocking_send_timeout_ms: u64, me_route_inline_recovery_attempts: u32, me_route_inline_recovery_wait_ms: u64, ) -> Arc { @@ -410,6 +439,8 @@ impl MePool { me_route_backpressure_high_timeout_ms, me_route_backpressure_high_watermark_pct, ); + let (writer_epoch, _) = watch::channel(0u64); + let now_epoch_secs = Self::now_epoch_secs(); Arc::new(Self { registry, writers: Arc::new(RwLock::new(Vec::new())), @@ -527,13 +558,11 @@ impl MePool { endpoint_dc_map: Arc::new(RwLock::new(endpoint_dc_map)), default_dc: AtomicI32::new(default_dc.unwrap_or(2)), next_writer_id: AtomicU64::new(1), - ping_tracker: Arc::new(Mutex::new(HashMap::new())), - ping_tracker_last_cleanup_epoch_ms: AtomicU64::new(0), rtt_stats: Arc::new(Mutex::new(HashMap::new())), nat_reflection_cache: Arc::new(Mutex::new(NatReflectionCache::default())), nat_reflection_singleflight_v4: Arc::new(Mutex::new(())), nat_reflection_singleflight_v6: Arc::new(Mutex::new(())), - writer_available: Arc::new(Notify::new()), + writer_epoch, refill_inflight: Arc::new(Mutex::new(HashSet::new())), refill_inflight_dc: Arc::new(Mutex::new(HashSet::new())), conn_count: AtomicUsize::new(0), @@ -585,25 +614,40 @@ impl MePool { me_reader_route_data_wait_ms: 
Arc::new(AtomicU64::new(me_reader_route_data_wait_ms)), me_route_no_writer_mode: AtomicU8::new(me_route_no_writer_mode.as_u8()), me_route_no_writer_wait: Duration::from_millis(me_route_no_writer_wait_ms), + me_route_hybrid_max_wait: Duration::from_millis(me_route_hybrid_max_wait_ms.max(50)), + me_route_blocking_send_timeout: if me_route_blocking_send_timeout_ms == 0 { + None + } else { + Some(Duration::from_millis( + me_route_blocking_send_timeout_ms.min(5_000), + )) + }, + me_route_last_success_epoch_ms: AtomicU64::new(0), + me_route_hybrid_timeout_warn_epoch_ms: AtomicU64::new(0), + me_async_recovery_last_trigger_epoch_ms: AtomicU64::new(0), me_route_inline_recovery_attempts, me_route_inline_recovery_wait: Duration::from_millis(me_route_inline_recovery_wait_ms), me_health_interval_ms_unhealthy: AtomicU64::new(me_health_interval_ms_unhealthy.max(1)), me_health_interval_ms_healthy: AtomicU64::new(me_health_interval_ms_healthy.max(1)), me_warn_rate_limit_ms: AtomicU64::new(me_warn_rate_limit_ms.max(1)), - me_family_v4_runtime_state: AtomicU8::new(MeFamilyRuntimeState::Healthy as u8), - me_family_v6_runtime_state: AtomicU8::new(MeFamilyRuntimeState::Healthy as u8), - me_family_v4_state_since_epoch_secs: AtomicU64::new(Self::now_epoch_secs()), - me_family_v6_state_since_epoch_secs: AtomicU64::new(Self::now_epoch_secs()), - me_family_v4_suppressed_until_epoch_secs: AtomicU64::new(0), - me_family_v6_suppressed_until_epoch_secs: AtomicU64::new(0), - me_family_v4_fail_streak: AtomicU32::new(0), - me_family_v6_fail_streak: AtomicU32::new(0), - me_family_v4_recover_success_streak: AtomicU32::new(0), - me_family_v6_recover_success_streak: AtomicU32::new(0), + family_health_v4: ArcSwap::from_pointee(FamilyHealthSnapshot::new( + MeFamilyRuntimeState::Healthy, + now_epoch_secs, + 0, + 0, + 0, + )), + family_health_v6: ArcSwap::from_pointee(FamilyHealthSnapshot::new( + MeFamilyRuntimeState::Healthy, + now_epoch_secs, + 0, + 0, + 0, + )), me_last_drain_gate_route_quorum_ok: 
AtomicBool::new(false), me_last_drain_gate_redundancy_ok: AtomicBool::new(false), me_last_drain_gate_block_reason: AtomicU8::new(MeDrainGateReason::Open as u8), - me_last_drain_gate_updated_at_epoch_secs: AtomicU64::new(Self::now_epoch_secs()), + me_last_drain_gate_updated_at_epoch_secs: AtomicU64::new(now_epoch_secs), runtime_ready: AtomicBool::new(false), preferred_endpoints_by_dc: Arc::new(RwLock::new(preferred_endpoints_by_dc)), }) @@ -621,6 +665,19 @@ impl MePool { self.runtime_ready.load(Ordering::Relaxed) } + pub(super) fn now_epoch_millis() -> u64 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_millis() as u64 + } + + pub(super) fn notify_writer_epoch(&self) { + let _ = self.writer_epoch.send_modify(|epoch| { + *epoch = epoch.wrapping_add(1); + }); + } + #[allow(dead_code)] pub(super) fn set_family_runtime_state( &self, @@ -631,82 +688,51 @@ impl MePool { fail_streak: u32, recover_success_streak: u32, ) { + let snapshot = Arc::new(FamilyHealthSnapshot::new( + state, + state_since_epoch_secs, + suppressed_until_epoch_secs, + fail_streak, + recover_success_streak, + )); match family { - IpFamily::V4 => { - self.me_family_v4_runtime_state - .store(state as u8, Ordering::Relaxed); - self.me_family_v4_state_since_epoch_secs - .store(state_since_epoch_secs, Ordering::Relaxed); - self.me_family_v4_suppressed_until_epoch_secs - .store(suppressed_until_epoch_secs, Ordering::Relaxed); - self.me_family_v4_fail_streak - .store(fail_streak, Ordering::Relaxed); - self.me_family_v4_recover_success_streak - .store(recover_success_streak, Ordering::Relaxed); - } - IpFamily::V6 => { - self.me_family_v6_runtime_state - .store(state as u8, Ordering::Relaxed); - self.me_family_v6_state_since_epoch_secs - .store(state_since_epoch_secs, Ordering::Relaxed); - self.me_family_v6_suppressed_until_epoch_secs - .store(suppressed_until_epoch_secs, Ordering::Relaxed); - self.me_family_v6_fail_streak - .store(fail_streak, Ordering::Relaxed); - 
self.me_family_v6_recover_success_streak - .store(recover_success_streak, Ordering::Relaxed); - } + IpFamily::V4 => self.family_health_v4.store(snapshot), + IpFamily::V6 => self.family_health_v6.store(snapshot), } } pub(crate) fn family_runtime_state(&self, family: IpFamily) -> MeFamilyRuntimeState { match family { - IpFamily::V4 => MeFamilyRuntimeState::from_u8( - self.me_family_v4_runtime_state.load(Ordering::Relaxed), - ), - IpFamily::V6 => MeFamilyRuntimeState::from_u8( - self.me_family_v6_runtime_state.load(Ordering::Relaxed), - ), + IpFamily::V4 => self.family_health_v4.load().state, + IpFamily::V6 => self.family_health_v6.load().state, } } pub(crate) fn family_runtime_state_since_epoch_secs(&self, family: IpFamily) -> u64 { match family { - IpFamily::V4 => self - .me_family_v4_state_since_epoch_secs - .load(Ordering::Relaxed), - IpFamily::V6 => self - .me_family_v6_state_since_epoch_secs - .load(Ordering::Relaxed), + IpFamily::V4 => self.family_health_v4.load().state_since_epoch_secs, + IpFamily::V6 => self.family_health_v6.load().state_since_epoch_secs, } } pub(crate) fn family_suppressed_until_epoch_secs(&self, family: IpFamily) -> u64 { match family { - IpFamily::V4 => self - .me_family_v4_suppressed_until_epoch_secs - .load(Ordering::Relaxed), - IpFamily::V6 => self - .me_family_v6_suppressed_until_epoch_secs - .load(Ordering::Relaxed), + IpFamily::V4 => self.family_health_v4.load().suppressed_until_epoch_secs, + IpFamily::V6 => self.family_health_v6.load().suppressed_until_epoch_secs, } } pub(crate) fn family_fail_streak(&self, family: IpFamily) -> u32 { match family { - IpFamily::V4 => self.me_family_v4_fail_streak.load(Ordering::Relaxed), - IpFamily::V6 => self.me_family_v6_fail_streak.load(Ordering::Relaxed), + IpFamily::V4 => self.family_health_v4.load().fail_streak, + IpFamily::V6 => self.family_health_v6.load().fail_streak, } } pub(crate) fn family_recover_success_streak(&self, family: IpFamily) -> u32 { match family { - IpFamily::V4 => self - 
.me_family_v4_recover_success_streak - .load(Ordering::Relaxed), - IpFamily::V6 => self - .me_family_v6_recover_success_streak - .load(Ordering::Relaxed), + IpFamily::V4 => self.family_health_v4.load().recover_success_streak, + IpFamily::V6 => self.family_health_v6.load().recover_success_streak, } } @@ -818,6 +844,9 @@ impl MePool { self.me_instadrain.store(instadrain, Ordering::Relaxed); self.me_pool_drain_threshold .store(pool_drain_threshold, Ordering::Relaxed); + // Runtime soft-evict knobs are updated lock-free to keep control-plane + // writes non-blocking; readers observe a short eventual-consistency + // window by design. self.me_pool_drain_soft_evict_enabled .store(pool_drain_soft_evict_enabled, Ordering::Relaxed); self.me_pool_drain_soft_evict_grace_secs @@ -1574,6 +1603,22 @@ impl MePool { let preferred = Self::build_preferred_endpoints_by_dc(&self.decision, &map_v4, &map_v6); *self.endpoint_dc_map.write().await = rebuilt; *self.preferred_endpoints_by_dc.write().await = preferred; + let configured_endpoints = self + .endpoint_dc_map + .read() + .await + .keys() + .copied() + .collect::>(); + { + let mut quarantine = self.endpoint_quarantine.lock().await; + let now = Instant::now(); + quarantine.retain(|addr, expiry| *expiry > now && configured_endpoints.contains(addr)); + } + { + let mut kdf_fp = self.kdf_material_fingerprint.write().await; + kdf_fp.retain(|addr, _| configured_endpoints.contains(addr)); + } } pub(super) async fn preferred_endpoints_for_dc(&self, dc: i32) -> Vec { diff --git a/src/transport/middle_proxy/pool_config.rs b/src/transport/middle_proxy/pool_config.rs index 486fad0..ebbadd2 100644 --- a/src/transport/middle_proxy/pool_config.rs +++ b/src/transport/middle_proxy/pool_config.rs @@ -72,7 +72,7 @@ impl MePool { } if changed { self.rebuild_endpoint_dc_map().await; - self.writer_available.notify_waiters(); + self.notify_writer_epoch(); } if changed { SnapshotApplyOutcome::AppliedChanged diff --git 
a/src/transport/middle_proxy/pool_refill.rs b/src/transport/middle_proxy/pool_refill.rs index d93bcfe..fc5c996 100644 --- a/src/transport/middle_proxy/pool_refill.rs +++ b/src/transport/middle_proxy/pool_refill.rs @@ -13,8 +13,22 @@ use super::pool::{MePool, RefillDcKey, RefillEndpointKey, WriterContour}; const ME_FLAP_UPTIME_THRESHOLD_SECS: u64 = 20; const ME_FLAP_QUARANTINE_SECS: u64 = 25; +const ME_REFILL_TOTAL_ATTEMPT_CAP: u32 = 20; impl MePool { + pub(super) async fn sweep_endpoint_quarantine(&self) { + let configured = self + .endpoint_dc_map + .read() + .await + .keys() + .copied() + .collect::>(); + let now = Instant::now(); + let mut guard = self.endpoint_quarantine.lock().await; + guard.retain(|addr, expiry| *expiry > now && configured.contains(addr)); + } + pub(super) async fn maybe_quarantine_flapping_endpoint( &self, addr: SocketAddr, @@ -206,10 +220,15 @@ impl MePool { async fn refill_writer_after_loss(self: &Arc, addr: SocketAddr, writer_dc: i32) -> bool { let fast_retries = self.me_reconnect_fast_retry_count.max(1); + let mut total_attempts = 0u32; let same_endpoint_quarantined = self.is_endpoint_quarantined(addr).await; if !same_endpoint_quarantined { for attempt in 0..fast_retries { + if total_attempts >= ME_REFILL_TOTAL_ATTEMPT_CAP { + break; + } + total_attempts = total_attempts.saturating_add(1); self.stats.increment_me_reconnect_attempt(); match self .connect_one_for_dc(addr, writer_dc, self.rng.as_ref()) @@ -250,6 +269,10 @@ impl MePool { } for attempt in 0..fast_retries { + if total_attempts >= ME_REFILL_TOTAL_ATTEMPT_CAP { + break; + } + total_attempts = total_attempts.saturating_add(1); self.stats.increment_me_reconnect_attempt(); if self .connect_endpoints_round_robin(writer_dc, &dc_endpoints, self.rng.as_ref()) diff --git a/src/transport/middle_proxy/pool_writer.rs b/src/transport/middle_proxy/pool_writer.rs index 22fb909..908b113 100644 --- a/src/transport/middle_proxy/pool_writer.rs +++ b/src/transport/middle_proxy/pool_writer.rs @@ 
-1,5 +1,6 @@ use std::io::ErrorKind; use std::net::SocketAddr; +use std::collections::HashMap; use std::sync::Arc; use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU32, AtomicU64, Ordering}; use std::time::{Duration, Instant}; @@ -25,6 +26,7 @@ const ME_ACTIVE_PING_SECS: u64 = 25; const ME_ACTIVE_PING_JITTER_SECS: i64 = 5; const ME_IDLE_KEEPALIVE_MAX_SECS: u64 = 5; const ME_RPC_PROXY_REQ_RESPONSE_WAIT_MS: u64 = 700; +const ME_PING_TRACKER_CLEANUP_EVERY: u32 = 32; #[derive(Clone, Copy)] enum WriterTeardownMode { @@ -197,11 +199,11 @@ impl MePool { self.registry.register_writer(writer_id, tx.clone()).await; self.registry.mark_writer_idle(writer_id).await; self.conn_count.fetch_add(1, Ordering::Relaxed); - self.writer_available.notify_one(); + self.notify_writer_epoch(); let reg = self.registry.clone(); let writers_arc = self.writers_arc(); - let ping_tracker = self.ping_tracker.clone(); + let ping_tracker = Arc::new(tokio::sync::Mutex::new(HashMap::::new())); let ping_tracker_reader = ping_tracker.clone(); let rtt_stats = self.rtt_stats.clone(); let stats_reader = self.stats.clone(); @@ -280,6 +282,7 @@ impl MePool { let pool_ping = Arc::downgrade(self); tokio::spawn(async move { let mut ping_id: i64 = rand::random::(); + let mut cleanup_tick: u32 = 0; let idle_interval_cap = Duration::from_secs(ME_IDLE_KEEPALIVE_MAX_SECS); // Per-writer jittered start to avoid phase sync. 
let startup_jitter = if keepalive_enabled { @@ -339,39 +342,16 @@ impl MePool { p.extend_from_slice(&sent_id.to_le_bytes()); { let mut tracker = ping_tracker_ping.lock().await; - let now_epoch_ms = std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_millis() as u64; - let mut run_cleanup = false; - if let Some(pool) = pool_ping.upgrade() { - let last_cleanup_ms = pool - .ping_tracker_last_cleanup_epoch_ms - .load(Ordering::Relaxed); - if now_epoch_ms.saturating_sub(last_cleanup_ms) >= 30_000 - && pool - .ping_tracker_last_cleanup_epoch_ms - .compare_exchange( - last_cleanup_ms, - now_epoch_ms, - Ordering::AcqRel, - Ordering::Relaxed, - ) - .is_ok() - { - run_cleanup = true; - } - } - - if run_cleanup { + cleanup_tick = cleanup_tick.wrapping_add(1); + if cleanup_tick.is_multiple_of(ME_PING_TRACKER_CLEANUP_EVERY) { let before = tracker.len(); - tracker.retain(|_, (ts, _)| ts.elapsed() < Duration::from_secs(120)); + tracker.retain(|_, ts| ts.elapsed() < Duration::from_secs(120)); let expired = before.saturating_sub(tracker.len()); if expired > 0 { stats_ping.increment_me_keepalive_timeout_by(expired as u64); } } - tracker.insert(sent_id, (std::time::Instant::now(), writer_id)); + tracker.insert(sent_id, std::time::Instant::now()); } ping_id = ping_id.wrapping_add(1); stats_ping.increment_me_keepalive_sent(); @@ -594,10 +574,6 @@ impl MePool { // The close command below is only a best-effort accelerator for task shutdown. // Cleanup progress must never depend on command-channel availability. 
let _ = self.registry.writer_lost(writer_id).await; - { - let mut tracker = self.ping_tracker.lock().await; - tracker.retain(|_, (_, wid)| *wid != writer_id); - } self.rtt_stats.lock().await.remove(&writer_id); if let Some(tx) = close_tx { let _ = tx.send(WriterCommand::Close).await; @@ -611,6 +587,9 @@ impl MePool { self.trigger_immediate_refill_for_dc(addr, writer_dc); } } + if removed { + self.notify_writer_epoch(); + } removed } diff --git a/src/transport/middle_proxy/reader.rs b/src/transport/middle_proxy/reader.rs index 46acd7e..aec55cd 100644 --- a/src/transport/middle_proxy/reader.rs +++ b/src/transport/middle_proxy/reader.rs @@ -32,10 +32,10 @@ pub(crate) async fn reader_loop( enc_leftover: BytesMut, mut dec: BytesMut, tx: mpsc::Sender, - ping_tracker: Arc>>, + ping_tracker: Arc>>, rtt_stats: Arc>>, stats: Arc, - _writer_id: u64, + writer_id: u64, degraded: Arc, writer_rtt_ema_ms_x10: Arc, reader_route_data_wait_ms: Arc, @@ -45,7 +45,7 @@ pub(crate) async fn reader_loop( let mut expected_seq: i32 = 0; loop { - let mut tmp = [0u8; 16_384]; + let mut tmp = [0u8; 65_536]; let n = tokio::select! 
{ res = rd.read(&mut tmp) => res.map_err(ProxyError::Io)?, _ = cancel.cancelled() => return Ok(()), @@ -203,13 +203,13 @@ pub(crate) async fn reader_loop( } else if pt == RPC_PONG_U32 && body.len() >= 8 { let ping_id = i64::from_le_bytes(body[0..8].try_into().unwrap()); stats.increment_me_keepalive_pong(); - if let Some((sent, wid)) = { + if let Some(sent) = { let mut guard = ping_tracker.lock().await; guard.remove(&ping_id) } { let rtt = sent.elapsed().as_secs_f64() * 1000.0; let mut stats = rtt_stats.lock().await; - let entry = stats.entry(wid).or_insert((rtt, rtt)); + let entry = stats.entry(writer_id).or_insert((rtt, rtt)); entry.1 = entry.1 * 0.8 + rtt * 0.2; if rtt < entry.0 { entry.0 = rtt; @@ -224,7 +224,7 @@ pub(crate) async fn reader_loop( Ordering::Relaxed, ); trace!( - writer_id = wid, + writer_id, rtt_ms = rtt, ema_ms = entry.1, base_ms = entry.0, diff --git a/src/transport/middle_proxy/registry.rs b/src/transport/middle_proxy/registry.rs index 0a95e18..becd8c4 100644 --- a/src/transport/middle_proxy/registry.rs +++ b/src/transport/middle_proxy/registry.rs @@ -292,6 +292,12 @@ impl ConnRegistry { pub async fn bind_writer(&self, conn_id: u64, writer_id: u64, meta: ConnMeta) -> bool { let mut inner = self.inner.write().await; + // ROUTING IS THE SOURCE OF TRUTH: + // never keep/attach writer binding for a connection that is already + // absent from the routing table. + if !inner.map.contains_key(&conn_id) { + return false; + } if !inner.writers.contains_key(&writer_id) { return false; } @@ -382,9 +388,39 @@ impl ConnRegistry { } pub async fn get_writer(&self, conn_id: u64) -> Option { - let inner = self.inner.read().await; - let writer_id = inner.writer_for_conn.get(&conn_id).cloned()?; - let writer = inner.writers.get(&writer_id).cloned()?; + let mut inner = self.inner.write().await; + // ROUTING IS THE SOURCE OF TRUTH: + // stale bindings are ignored and lazily cleaned when routing no longer + // contains the connection. 
+ if !inner.map.contains_key(&conn_id) { + inner.meta.remove(&conn_id); + if let Some(stale_writer_id) = inner.writer_for_conn.remove(&conn_id) + && let Some(conns) = inner.conns_for_writer.get_mut(&stale_writer_id) + { + conns.remove(&conn_id); + if conns.is_empty() { + inner + .writer_idle_since_epoch_secs + .insert(stale_writer_id, Self::now_epoch_secs()); + } + } + return None; + } + + let writer_id = inner.writer_for_conn.get(&conn_id).copied()?; + let Some(writer) = inner.writers.get(&writer_id).cloned() else { + inner.writer_for_conn.remove(&conn_id); + inner.meta.remove(&conn_id); + if let Some(conns) = inner.conns_for_writer.get_mut(&writer_id) { + conns.remove(&conn_id); + if conns.is_empty() { + inner + .writer_idle_since_epoch_secs + .insert(writer_id, Self::now_epoch_secs()); + } + } + return None; + }; Some(ConnWriter { writer_id, tx: writer, diff --git a/src/transport/middle_proxy/send.rs b/src/transport/middle_proxy/send.rs index b1cf54e..d38775f 100644 --- a/src/transport/middle_proxy/send.rs +++ b/src/transport/middle_proxy/send.rs @@ -26,6 +26,9 @@ use rand::seq::SliceRandom; const IDLE_WRITER_PENALTY_MID_SECS: u64 = 45; const IDLE_WRITER_PENALTY_HIGH_SECS: u64 = 55; const HYBRID_GLOBAL_BURST_PERIOD_ROUNDS: u32 = 4; +const HYBRID_RECENT_SUCCESS_WINDOW_MS: u64 = 120_000; +const HYBRID_TIMEOUT_WARN_RATE_LIMIT_MS: u64 = 5_000; +const HYBRID_RECOVERY_TRIGGER_MIN_INTERVAL_MS: u64 = 5_000; const PICK_PENALTY_WARM: u64 = 200; const PICK_PENALTY_DRAINING: u64 = 600; const PICK_PENALTY_STALE: u64 = 300; @@ -77,6 +80,7 @@ impl MePool { let mut async_recovery_triggered = false; let mut hybrid_recovery_round = 0u32; let mut hybrid_last_recovery_at: Option = None; + let mut hybrid_total_deadline: Option = None; let hybrid_wait_step = self.me_route_no_writer_wait.max(Duration::from_millis(50)); let mut hybrid_wait_current = hybrid_wait_step; @@ -92,9 +96,13 @@ impl MePool { .tx .try_send(WriterCommand::Data(current_payload.clone())) { - Ok(()) => return 
Ok(()), + Ok(()) => { + self.note_hybrid_route_success(); + return Ok(()); + } Err(TrySendError::Full(cmd)) => { if current.tx.send(cmd).await.is_ok() { + self.note_hybrid_route_success(); return Ok(()); } warn!(writer_id = current.writer_id, "ME writer channel closed"); @@ -182,6 +190,15 @@ impl MePool { continue; } MeRouteNoWriterMode::HybridAsyncPersistent => { + let total_deadline = *hybrid_total_deadline.get_or_insert_with(|| { + Instant::now() + self.hybrid_total_wait_budget() + }); + if Instant::now() >= total_deadline { + self.on_hybrid_timeout(total_deadline, routed_dc); + return Err(ProxyError::Proxy( + "ME writer not available within hybrid timeout".into(), + )); + } if !unknown_target_dc { self.maybe_trigger_hybrid_recovery( routed_dc, @@ -292,6 +309,15 @@ impl MePool { } } MeRouteNoWriterMode::HybridAsyncPersistent => { + let total_deadline = *hybrid_total_deadline + .get_or_insert_with(|| Instant::now() + self.hybrid_total_wait_budget()); + if Instant::now() >= total_deadline { + self.on_hybrid_timeout(total_deadline, routed_dc); + return Err(ProxyError::Proxy( + "No ME writers available for target DC within hybrid timeout" + .into(), + )); + } if !unknown_target_dc { self.maybe_trigger_hybrid_recovery( routed_dc, @@ -423,6 +449,7 @@ impl MePool { "Selected stale ME writer for fallback bind" ); } + self.note_hybrid_route_success(); return Ok(()); } Err(TrySendError::Full(_)) => { @@ -453,7 +480,18 @@ impl MePool { .increment_me_writer_pick_blocking_fallback_total(); let effective_our_addr = SocketAddr::new(w.source_ip, our_addr.port()); let (payload, meta) = build_routed_payload(effective_our_addr); - match w.tx.clone().reserve_owned().await { + let reserve_result = if let Some(timeout) = self.me_route_blocking_send_timeout { + match tokio::time::timeout(timeout, w.tx.clone().reserve_owned()).await { + Ok(result) => result, + Err(_) => { + self.stats.increment_me_writer_pick_full_total(pick_mode); + continue; + } + } + } else { + 
w.tx.clone().reserve_owned().await + }; + match reserve_result { Ok(permit) => { if !self.registry.bind_writer(conn_id, w.id, meta).await { debug!( @@ -471,6 +509,7 @@ impl MePool { if w.generation < self.current_generation() { self.stats.increment_pool_stale_pick_total(); } + self.note_hybrid_route_success(); return Ok(()); } Err(_) => { @@ -483,7 +522,7 @@ impl MePool { } async fn wait_for_writer_until(&self, deadline: Instant) -> bool { - let waiter = self.writer_available.notified(); + let mut rx = self.writer_epoch.subscribe(); if !self.writers.read().await.is_empty() { return true; } @@ -492,13 +531,14 @@ impl MePool { return !self.writers.read().await.is_empty(); } let timeout = deadline.saturating_duration_since(now); - if tokio::time::timeout(timeout, waiter).await.is_ok() { - return true; + if tokio::time::timeout(timeout, rx.changed()).await.is_ok() { + return !self.writers.read().await.is_empty(); } !self.writers.read().await.is_empty() } async fn wait_for_candidate_until(&self, routed_dc: i32, deadline: Instant) -> bool { + let mut rx = self.writer_epoch.subscribe(); loop { if self.has_candidate_for_target_dc(routed_dc).await { return true; @@ -509,7 +549,6 @@ impl MePool { return self.has_candidate_for_target_dc(routed_dc).await; } - let waiter = self.writer_available.notified(); if self.has_candidate_for_target_dc(routed_dc).await { return true; } @@ -517,7 +556,7 @@ impl MePool { if remaining.is_zero() { return self.has_candidate_for_target_dc(routed_dc).await; } - if tokio::time::timeout(remaining, waiter).await.is_err() { + if tokio::time::timeout(remaining, rx.changed()).await.is_err() { return self.has_candidate_for_target_dc(routed_dc).await; } } @@ -587,6 +626,10 @@ impl MePool { hybrid_last_recovery_at: &mut Option, hybrid_wait_step: Duration, ) { + if !self.try_consume_hybrid_recovery_trigger_slot(HYBRID_RECOVERY_TRIGGER_MIN_INTERVAL_MS) + { + return; + } if let Some(last) = *hybrid_last_recovery_at && last.elapsed() < hybrid_wait_step { @@ 
-602,6 +645,71 @@ impl MePool { *hybrid_last_recovery_at = Some(Instant::now()); } + fn hybrid_total_wait_budget(&self) -> Duration { + let base = self.me_route_hybrid_max_wait.max(Duration::from_millis(50)); + let now_ms = Self::now_epoch_millis(); + let last_success_ms = self.me_route_last_success_epoch_ms.load(Ordering::Relaxed); + if last_success_ms != 0 + && now_ms.saturating_sub(last_success_ms) <= HYBRID_RECENT_SUCCESS_WINDOW_MS + { + return base.saturating_mul(2); + } + base + } + + fn note_hybrid_route_success(&self) { + self.me_route_last_success_epoch_ms + .store(Self::now_epoch_millis(), Ordering::Relaxed); + } + + fn on_hybrid_timeout(&self, deadline: Instant, routed_dc: i32) { + self.stats.increment_me_hybrid_timeout_total(); + let now_ms = Self::now_epoch_millis(); + let mut last_warn_ms = self + .me_route_hybrid_timeout_warn_epoch_ms + .load(Ordering::Relaxed); + while now_ms.saturating_sub(last_warn_ms) >= HYBRID_TIMEOUT_WARN_RATE_LIMIT_MS { + match self.me_route_hybrid_timeout_warn_epoch_ms.compare_exchange_weak( + last_warn_ms, + now_ms, + Ordering::AcqRel, + Ordering::Relaxed, + ) { + Ok(_) => { + warn!( + routed_dc, + budget_ms = self.hybrid_total_wait_budget().as_millis() as u64, + elapsed_ms = deadline.elapsed().as_millis() as u64, + "ME hybrid route timeout reached" + ); + break; + } + Err(actual) => last_warn_ms = actual, + } + } + } + + fn try_consume_hybrid_recovery_trigger_slot(&self, min_interval_ms: u64) -> bool { + let now_ms = Self::now_epoch_millis(); + let mut last_trigger_ms = self + .me_async_recovery_last_trigger_epoch_ms + .load(Ordering::Relaxed); + loop { + if now_ms.saturating_sub(last_trigger_ms) < min_interval_ms { + return false; + } + match self.me_async_recovery_last_trigger_epoch_ms.compare_exchange_weak( + last_trigger_ms, + now_ms, + Ordering::AcqRel, + Ordering::Relaxed, + ) { + Ok(_) => return true, + Err(actual) => last_trigger_ms = actual, + } + } + } + pub async fn send_close(self: &Arc, conn_id: u64) -> 
Result<()> { if let Some(w) = self.registry.get_writer(conn_id).await { let mut p = Vec::with_capacity(12); diff --git a/src/transport/middle_proxy/tests/health_adversarial_tests.rs b/src/transport/middle_proxy/tests/health_adversarial_tests.rs index 3444120..4bee91c 100644 --- a/src/transport/middle_proxy/tests/health_adversarial_tests.rs +++ b/src/transport/middle_proxy/tests/health_adversarial_tests.rs @@ -113,6 +113,8 @@ async fn make_pool( general.me_warn_rate_limit_ms, MeRouteNoWriterMode::default(), general.me_route_no_writer_wait_ms, + general.me_route_hybrid_max_wait_ms, + general.me_route_blocking_send_timeout_ms, general.me_route_inline_recovery_attempts, general.me_route_inline_recovery_wait_ms, ); diff --git a/src/transport/middle_proxy/tests/health_integration_tests.rs b/src/transport/middle_proxy/tests/health_integration_tests.rs index b0d3a2a..0a6e110 100644 --- a/src/transport/middle_proxy/tests/health_integration_tests.rs +++ b/src/transport/middle_proxy/tests/health_integration_tests.rs @@ -111,6 +111,8 @@ async fn make_pool( general.me_warn_rate_limit_ms, MeRouteNoWriterMode::default(), general.me_route_no_writer_wait_ms, + general.me_route_hybrid_max_wait_ms, + general.me_route_blocking_send_timeout_ms, general.me_route_inline_recovery_attempts, general.me_route_inline_recovery_wait_ms, ); diff --git a/src/transport/middle_proxy/tests/health_regression_tests.rs b/src/transport/middle_proxy/tests/health_regression_tests.rs index 55bf8f6..92398b4 100644 --- a/src/transport/middle_proxy/tests/health_regression_tests.rs +++ b/src/transport/middle_proxy/tests/health_regression_tests.rs @@ -106,6 +106,8 @@ async fn make_pool(me_pool_drain_threshold: u64) -> Arc { general.me_warn_rate_limit_ms, MeRouteNoWriterMode::default(), general.me_route_no_writer_wait_ms, + general.me_route_hybrid_max_wait_ms, + general.me_route_blocking_send_timeout_ms, general.me_route_inline_recovery_attempts, general.me_route_inline_recovery_wait_ms, ) diff --git 
a/src/transport/middle_proxy/tests/pool_refill_security_tests.rs b/src/transport/middle_proxy/tests/pool_refill_security_tests.rs index 2d1e23a..90c8382 100644 --- a/src/transport/middle_proxy/tests/pool_refill_security_tests.rs +++ b/src/transport/middle_proxy/tests/pool_refill_security_tests.rs @@ -95,6 +95,8 @@ async fn make_pool() -> Arc { general.me_warn_rate_limit_ms, MeRouteNoWriterMode::default(), general.me_route_no_writer_wait_ms, + general.me_route_hybrid_max_wait_ms, + general.me_route_blocking_send_timeout_ms, general.me_route_inline_recovery_attempts, general.me_route_inline_recovery_wait_ms, ) diff --git a/src/transport/middle_proxy/tests/pool_writer_security_tests.rs b/src/transport/middle_proxy/tests/pool_writer_security_tests.rs index 7bfc061..fc5135b 100644 --- a/src/transport/middle_proxy/tests/pool_writer_security_tests.rs +++ b/src/transport/middle_proxy/tests/pool_writer_security_tests.rs @@ -100,6 +100,8 @@ async fn make_pool() -> Arc { general.me_warn_rate_limit_ms, MeRouteNoWriterMode::default(), general.me_route_no_writer_wait_ms, + general.me_route_hybrid_max_wait_ms, + general.me_route_blocking_send_timeout_ms, general.me_route_inline_recovery_attempts, general.me_route_inline_recovery_wait_ms, ) diff --git a/src/transport/middle_proxy/tests/send_adversarial_tests.rs b/src/transport/middle_proxy/tests/send_adversarial_tests.rs index 80379a5..de52d18 100644 --- a/src/transport/middle_proxy/tests/send_adversarial_tests.rs +++ b/src/transport/middle_proxy/tests/send_adversarial_tests.rs @@ -106,6 +106,8 @@ async fn make_pool() -> (Arc, Arc) { general.me_warn_rate_limit_ms, general.me_route_no_writer_mode, general.me_route_no_writer_wait_ms, + general.me_route_hybrid_max_wait_ms, + general.me_route_blocking_send_timeout_ms, general.me_route_inline_recovery_attempts, general.me_route_inline_recovery_wait_ms, ); From 8bab3f70e100cec7026f5f4499e4c82162f514eb Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> 
Date: Wed, 25 Mar 2026 17:25:35 +0300 Subject: [PATCH 02/17] WritersState on ArcSwap + Preferred Endpoint on ArcSwap + Two-map Rotation for Desync Dedup Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/proxy/middle_relay.rs | 143 +++++++++++++--------- src/transport/middle_proxy/pool.rs | 96 +++++++++++++-- src/transport/middle_proxy/pool_config.rs | 2 +- src/transport/middle_proxy/pool_status.rs | 6 +- src/transport/middle_proxy/pool_writer.rs | 4 +- 5 files changed, 182 insertions(+), 69 deletions(-) diff --git a/src/proxy/middle_relay.rs b/src/proxy/middle_relay.rs index 3259597..ca32e6f 100644 --- a/src/proxy/middle_relay.rs +++ b/src/proxy/middle_relay.rs @@ -4,7 +4,7 @@ use std::collections::{BTreeSet, HashMap}; use std::future::Future; use std::hash::{BuildHasher, Hash}; use std::net::{IpAddr, SocketAddr}; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::{Arc, Mutex, OnceLock}; use std::time::{Duration, Instant}; @@ -36,7 +36,6 @@ enum C2MeCommand { const DESYNC_DEDUP_WINDOW: Duration = Duration::from_secs(60); const DESYNC_DEDUP_MAX_ENTRIES: usize = 65_536; -const DESYNC_DEDUP_PRUNE_SCAN_LIMIT: usize = 1024; const DESYNC_FULL_CACHE_EMIT_MIN_INTERVAL: Duration = Duration::from_millis(1000); const DESYNC_ERROR_CLASS: &str = "frame_too_large_crypto_desync"; const C2ME_CHANNEL_CAPACITY_FALLBACK: usize = 128; @@ -57,12 +56,18 @@ const ME_D2C_FRAME_BUF_SHRINK_HYSTERESIS_FACTOR: usize = 2; const ME_D2C_SINGLE_WRITE_COALESCE_MAX_BYTES: usize = 128 * 1024; const QUOTA_RESERVE_SPIN_RETRIES: usize = 32; static DESYNC_DEDUP: OnceLock> = OnceLock::new(); +static DESYNC_DEDUP_PREVIOUS: OnceLock> = OnceLock::new(); static DESYNC_HASHER: OnceLock = OnceLock::new(); static DESYNC_FULL_CACHE_LAST_EMIT_AT: OnceLock>> = OnceLock::new(); -static DESYNC_DEDUP_EVER_SATURATED: OnceLock = OnceLock::new(); +static DESYNC_DEDUP_ROTATION_STATE: OnceLock> = OnceLock::new(); static
RELAY_IDLE_CANDIDATE_REGISTRY: OnceLock> = OnceLock::new(); static RELAY_IDLE_MARK_SEQ: AtomicU64 = AtomicU64::new(0); +#[derive(Default)] +struct DesyncDedupRotationState { + current_started_at: Option, +} + struct RelayForensicsState { trace_id: u64, conn_id: u64, @@ -312,64 +317,76 @@ fn should_emit_full_desync(key: u64, all_full: bool, now: Instant) -> bool { return true; } - let dedup = DESYNC_DEDUP.get_or_init(DashMap::new); - let saturated_before = dedup.len() >= DESYNC_DEDUP_MAX_ENTRIES; - let ever_saturated = DESYNC_DEDUP_EVER_SATURATED.get_or_init(|| AtomicBool::new(false)); - if saturated_before { - ever_saturated.store(true, Ordering::Relaxed); - } + let dedup_current = DESYNC_DEDUP.get_or_init(DashMap::new); + let dedup_previous = DESYNC_DEDUP_PREVIOUS.get_or_init(DashMap::new); + let rotation_state = DESYNC_DEDUP_ROTATION_STATE + .get_or_init(|| Mutex::new(DesyncDedupRotationState::default())); - if let Some(mut seen_at) = dedup.get_mut(&key) { - if now.duration_since(*seen_at) >= DESYNC_DEDUP_WINDOW { - *seen_at = now; - return true; + let mut state = match rotation_state.lock() { + Ok(guard) => guard, + Err(poisoned) => { + let mut guard = poisoned.into_inner(); + *guard = DesyncDedupRotationState::default(); + rotation_state.clear_poison(); + guard } - return false; - } - - if dedup.len() >= DESYNC_DEDUP_MAX_ENTRIES { - let mut stale_keys = Vec::new(); - let mut oldest_candidate: Option<(u64, Instant)> = None; - for entry in dedup.iter().take(DESYNC_DEDUP_PRUNE_SCAN_LIMIT) { - let key = *entry.key(); - let seen_at = *entry.value(); - - match oldest_candidate { - Some((_, oldest_seen)) if seen_at >= oldest_seen => {} - _ => oldest_candidate = Some((key, seen_at)), - } - - if now.duration_since(seen_at) >= DESYNC_DEDUP_WINDOW { - stale_keys.push(*entry.key()); - } - } - for stale_key in stale_keys { - dedup.remove(&stale_key); - } - if dedup.len() >= DESYNC_DEDUP_MAX_ENTRIES { - let Some((evict_key, _)) = oldest_candidate else { - return false; - }; 
- dedup.remove(&evict_key); - dedup.insert(key, now); - return should_emit_full_desync_full_cache(now); - } - } - - dedup.insert(key, now); - let saturated_after = dedup.len() >= DESYNC_DEDUP_MAX_ENTRIES; - // Preserve the first sequential insert that reaches capacity as a normal - // emit, while still gating concurrent newcomer churn after the cache has - // ever been observed at saturation. - let was_ever_saturated = if saturated_after { - ever_saturated.swap(true, Ordering::Relaxed) - } else { - ever_saturated.load(Ordering::Relaxed) }; - if saturated_before || (saturated_after && was_ever_saturated) { + let rotate_now = match state.current_started_at { + Some(current_started_at) => match now.checked_duration_since(current_started_at) { + Some(elapsed) => elapsed >= DESYNC_DEDUP_WINDOW, + None => true, + }, + None => true, + }; + if rotate_now { + dedup_previous.clear(); + for entry in dedup_current.iter() { + dedup_previous.insert(*entry.key(), *entry.value()); + } + dedup_current.clear(); + state.current_started_at = Some(now); + } + + if let Some(seen_at) = dedup_current.get(&key).map(|entry| *entry.value()) { + let within_window = match now.checked_duration_since(seen_at) { + Some(elapsed) => elapsed < DESYNC_DEDUP_WINDOW, + None => true, + }; + if within_window { + return false; + } + dedup_current.insert(key, now); + return true; + } + + if let Some(seen_at) = dedup_previous.get(&key).map(|entry| *entry.value()) { + let within_window = match now.checked_duration_since(seen_at) { + Some(elapsed) => elapsed < DESYNC_DEDUP_WINDOW, + None => true, + }; + if within_window { + // Keep the original timestamp when promoting from previous bucket, + // so dedup expiry remains tied to first-seen time. 
+ dedup_current.insert(key, seen_at); + return false; + } + dedup_previous.remove(&key); + } + + if dedup_current.len() >= DESYNC_DEDUP_MAX_ENTRIES { + // Bounded eviction path: rotate buckets instead of scanning/evicting + // arbitrary entries from a saturated single map. + dedup_previous.clear(); + for entry in dedup_current.iter() { + dedup_previous.insert(*entry.key(), *entry.value()); + } + dedup_current.clear(); + state.current_started_at = Some(now); + dedup_current.insert(key, now); should_emit_full_desync_full_cache(now) } else { + dedup_current.insert(key, now); true } } @@ -405,8 +422,20 @@ fn clear_desync_dedup_for_testing() { if let Some(dedup) = DESYNC_DEDUP.get() { dedup.clear(); } - if let Some(ever_saturated) = DESYNC_DEDUP_EVER_SATURATED.get() { - ever_saturated.store(false, Ordering::Relaxed); + if let Some(dedup_previous) = DESYNC_DEDUP_PREVIOUS.get() { + dedup_previous.clear(); + } + if let Some(rotation_state) = DESYNC_DEDUP_ROTATION_STATE.get() { + match rotation_state.lock() { + Ok(mut guard) => { + *guard = DesyncDedupRotationState::default(); + } + Err(poisoned) => { + let mut guard = poisoned.into_inner(); + *guard = DesyncDedupRotationState::default(); + rotation_state.clear_poison(); + } + } } if let Some(last_emit_at) = DESYNC_FULL_CACHE_LAST_EMIT_AT.get() { match last_emit_at.lock() { diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index 5faa76d..07d4d19 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -2,6 +2,7 @@ use std::collections::{HashMap, HashSet}; use std::net::{IpAddr, Ipv6Addr, SocketAddr}; +use std::ops::{Deref, DerefMut}; use std::sync::Arc; use std::sync::atomic::{ AtomicBool, AtomicI32, AtomicU8, AtomicU32, AtomicU64, AtomicUsize, Ordering, @@ -56,6 +57,87 @@ pub struct MeWriter { pub allow_drain_fallback: Arc, } +pub(super) struct WritersState { + // HARD INVARIANT: + // All writers.store() calls MUST be guarded by writers_write_guard. 
+ writers: ArcSwap>, + writers_write_guard: Mutex<()>, +} + +impl WritersState { + pub(super) fn new() -> Self { + Self { + writers: ArcSwap::from_pointee(Vec::new()), + writers_write_guard: Mutex::new(()), + } + } + + pub(super) fn snapshot(&self) -> Arc> { + self.writers.load_full() + } + + pub(super) async fn read(&self) -> Arc> { + self.snapshot() + } + + pub(super) async fn write(&self) -> WritersWriteGuard<'_> { + let guard = self.writers_write_guard.lock().await; + let writers = (*self.writers.load_full()).clone(); + WritersWriteGuard { + state: self, + _guard: guard, + writers, + } + } + + pub(super) async fn update(&self, f: F) -> R + where + F: FnOnce(&mut Vec) -> R, + { + let mut guard = self.write().await; + f(&mut guard) + } + + fn debug_assert_store_guarded(&self) { + debug_assert!( + self.writers_write_guard.try_lock().is_err(), + "HARD INVARIANT violated: writers.store() without writers_write_guard" + ); + } + + fn store_guarded(&self, writers: Vec) { + self.debug_assert_store_guarded(); + self.writers.store(Arc::new(writers)); + } +} + +pub(super) struct WritersWriteGuard<'a> { + state: &'a WritersState, + _guard: tokio::sync::MutexGuard<'a, ()>, + writers: Vec, +} + +impl Deref for WritersWriteGuard<'_> { + type Target = Vec; + + fn deref(&self) -> &Self::Target { + &self.writers + } +} + +impl DerefMut for WritersWriteGuard<'_> { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.writers + } +} + +impl Drop for WritersWriteGuard<'_> { + fn drop(&mut self) { + let writers = std::mem::take(&mut self.writers); + self.state.store_guarded(writers); + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[repr(u8)] pub(super) enum WriterContour { @@ -178,7 +260,7 @@ pub struct SecretSnapshot { #[allow(dead_code)] pub struct MePool { pub(super) registry: Arc, - pub(super) writers: Arc>>, + pub(super) writers: Arc, pub(super) rr: AtomicU64, pub(super) decision: NetworkDecision, pub(super) upstream: Option>, @@ -307,7 +389,7 @@ pub struct MePool 
{ pub(super) me_last_drain_gate_updated_at_epoch_secs: AtomicU64, pub(super) runtime_ready: AtomicBool, pool_size: usize, - pub(super) preferred_endpoints_by_dc: Arc>>>, + pub(super) preferred_endpoints_by_dc: ArcSwap>>, } #[derive(Debug, Default)] @@ -443,7 +525,7 @@ impl MePool { let now_epoch_secs = Self::now_epoch_secs(); Arc::new(Self { registry, - writers: Arc::new(RwLock::new(Vec::new())), + writers: Arc::new(WritersState::new()), rr: AtomicU64::new(0), decision, upstream, @@ -649,7 +731,7 @@ impl MePool { me_last_drain_gate_block_reason: AtomicU8::new(MeDrainGateReason::Open as u8), me_last_drain_gate_updated_at_epoch_secs: AtomicU64::new(now_epoch_secs), runtime_ready: AtomicBool::new(false), - preferred_endpoints_by_dc: Arc::new(RwLock::new(preferred_endpoints_by_dc)), + preferred_endpoints_by_dc: ArcSwap::from_pointee(preferred_endpoints_by_dc), }) } @@ -1004,7 +1086,7 @@ impl MePool { MeSocksKdfPolicy::from_u8(self.me_socks_kdf_policy.load(Ordering::Relaxed)) } - pub(super) fn writers_arc(&self) -> Arc>> { + pub(super) fn writers_arc(&self) -> Arc { self.writers.clone() } @@ -1602,7 +1684,7 @@ impl MePool { let rebuilt = Self::build_endpoint_dc_map_from_maps(&map_v4, &map_v6); let preferred = Self::build_preferred_endpoints_by_dc(&self.decision, &map_v4, &map_v6); *self.endpoint_dc_map.write().await = rebuilt; - *self.preferred_endpoints_by_dc.write().await = preferred; + self.preferred_endpoints_by_dc.store(Arc::new(preferred)); let configured_endpoints = self .endpoint_dc_map .read() @@ -1622,7 +1704,7 @@ impl MePool { } pub(super) async fn preferred_endpoints_for_dc(&self, dc: i32) -> Vec { - let guard = self.preferred_endpoints_by_dc.read().await; + let guard = self.preferred_endpoints_by_dc.load(); guard.get(&dc).cloned().unwrap_or_default() } diff --git a/src/transport/middle_proxy/pool_config.rs b/src/transport/middle_proxy/pool_config.rs index ebbadd2..6e29918 100644 --- a/src/transport/middle_proxy/pool_config.rs +++ 
b/src/transport/middle_proxy/pool_config.rs @@ -112,7 +112,7 @@ impl MePool { pub async fn reconnect_all(self: &Arc) { let ws = self.writers.read().await.clone(); - for w in ws { + for w in ws.iter() { if let Ok(()) = self .connect_one_for_dc(w.addr, w.writer_dc, self.rng.as_ref()) .await diff --git a/src/transport/middle_proxy/pool_status.rs b/src/transport/middle_proxy/pool_status.rs index 1ef59e1..afb8efe 100644 --- a/src/transport/middle_proxy/pool_status.rs +++ b/src/transport/middle_proxy/pool_status.rs @@ -160,7 +160,7 @@ impl MePool { let writers = self.writers.read().await.clone(); let mut live_writers_by_dc = HashMap::::new(); - for writer in writers { + for writer in writers.iter() { if writer.draining.load(Ordering::Relaxed) { continue; } @@ -197,7 +197,7 @@ impl MePool { let writers = self.writers.read().await.clone(); let mut live_writers_by_dc = HashMap::::new(); - for writer in writers { + for writer in writers.iter() { if writer.draining.load(Ordering::Relaxed) { continue; } @@ -255,7 +255,7 @@ impl MePool { let mut dc_rtt_agg = HashMap::::new(); let mut writer_rows = Vec::::with_capacity(writers.len()); - for writer in writers { + for writer in writers.iter() { let endpoint = writer.addr; let dc = i16::try_from(writer.writer_dc).ok(); let draining = writer.draining.load(Ordering::Relaxed); diff --git a/src/transport/middle_proxy/pool_writer.rs b/src/transport/middle_proxy/pool_writer.rs index 908b113..ef5a766 100644 --- a/src/transport/middle_proxy/pool_writer.rs +++ b/src/transport/middle_proxy/pool_writer.rs @@ -195,7 +195,9 @@ impl MePool { drain_deadline_epoch_secs: drain_deadline_epoch_secs.clone(), allow_drain_fallback: allow_drain_fallback.clone(), }; - self.writers.write().await.push(writer.clone()); + self.writers + .update(|writers| writers.push(writer.clone())) + .await; self.registry.register_writer(writer_id, tx.clone()).await; self.registry.mark_writer_idle(writer_id).await; self.conn_count.fetch_add(1, Ordering::Relaxed); From 
70c2f0f0457e7b16afaa77ef8e82a6f92ee5e559 Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 17:50:44 +0300 Subject: [PATCH 03/17] RoutingTable + BindingState Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/transport/middle_proxy/pool_writer.rs | 11 +- src/transport/middle_proxy/registry.rs | 124 +++++++++++++--------- 2 files changed, 75 insertions(+), 60 deletions(-) diff --git a/src/transport/middle_proxy/pool_writer.rs b/src/transport/middle_proxy/pool_writer.rs index ef5a766..6f19789 100644 --- a/src/transport/middle_proxy/pool_writer.rs +++ b/src/transport/middle_proxy/pool_writer.rs @@ -425,15 +425,8 @@ impl MePool { }; let (conn_id, mut service_rx) = pool.registry.register().await; - if !pool - .registry - .bind_writer(conn_id, writer_id, meta.clone()) - .await - { - let _ = pool.registry.unregister(conn_id).await; - stats_signal.increment_me_rpc_proxy_req_signal_skipped_no_meta_total(); - continue; - } + // Service RPC_PROXY_REQ signal path is intentionally route-only: + // do not bind synthetic conn_id into regular writer/client accounting. 
let payload = build_proxy_req_payload( conn_id, diff --git a/src/transport/middle_proxy/registry.rs b/src/transport/middle_proxy/registry.rs index becd8c4..8277e7f 100644 --- a/src/transport/middle_proxy/registry.rs +++ b/src/transport/middle_proxy/registry.rs @@ -51,7 +51,15 @@ pub(super) struct WriterActivitySnapshot { } struct RegistryInner { + routing: RoutingTable, + binding: BindingState, +} + +struct RoutingTable { map: HashMap>, +} + +struct BindingState { writers: HashMap>, writer_for_conn: HashMap, conns_for_writer: HashMap>, @@ -63,13 +71,17 @@ struct RegistryInner { impl RegistryInner { fn new() -> Self { Self { - map: HashMap::new(), - writers: HashMap::new(), - writer_for_conn: HashMap::new(), - conns_for_writer: HashMap::new(), - meta: HashMap::new(), - last_meta_for_writer: HashMap::new(), - writer_idle_since_epoch_secs: HashMap::new(), + routing: RoutingTable { + map: HashMap::new(), + }, + binding: BindingState { + writers: HashMap::new(), + writer_for_conn: HashMap::new(), + conns_for_writer: HashMap::new(), + meta: HashMap::new(), + last_meta_for_writer: HashMap::new(), + writer_idle_since_epoch_secs: HashMap::new(), + }, } } } @@ -130,14 +142,15 @@ impl ConnRegistry { pub async fn register(&self) -> (u64, mpsc::Receiver) { let id = self.next_id.fetch_add(1, Ordering::Relaxed); let (tx, rx) = mpsc::channel(self.route_channel_capacity); - self.inner.write().await.map.insert(id, tx); + self.inner.write().await.routing.map.insert(id, tx); (id, rx) } pub async fn register_writer(&self, writer_id: u64, tx: mpsc::Sender) { let mut inner = self.inner.write().await; - inner.writers.insert(writer_id, tx); + inner.binding.writers.insert(writer_id, tx); inner + .binding .conns_for_writer .entry(writer_id) .or_insert_with(HashSet::new); @@ -146,10 +159,10 @@ impl ConnRegistry { /// Unregister connection, returning associated writer_id if any. 
pub async fn unregister(&self, id: u64) -> Option { let mut inner = self.inner.write().await; - inner.map.remove(&id); - inner.meta.remove(&id); - if let Some(writer_id) = inner.writer_for_conn.remove(&id) { - let became_empty = if let Some(set) = inner.conns_for_writer.get_mut(&writer_id) { + inner.routing.map.remove(&id); + inner.binding.meta.remove(&id); + if let Some(writer_id) = inner.binding.writer_for_conn.remove(&id) { + let became_empty = if let Some(set) = inner.binding.conns_for_writer.get_mut(&writer_id) { set.remove(&id); set.is_empty() } else { @@ -157,6 +170,7 @@ impl ConnRegistry { }; if became_empty { inner + .binding .writer_idle_since_epoch_secs .insert(writer_id, Self::now_epoch_secs()); } @@ -169,7 +183,7 @@ impl ConnRegistry { pub async fn route(&self, id: u64, resp: MeResponse) -> RouteResult { let tx = { let inner = self.inner.read().await; - inner.map.get(&id).cloned() + inner.routing.map.get(&id).cloned() }; let Some(tx) = tx else { @@ -225,7 +239,7 @@ impl ConnRegistry { pub async fn route_nowait(&self, id: u64, resp: MeResponse) -> RouteResult { let tx = { let inner = self.inner.read().await; - inner.map.get(&id).cloned() + inner.routing.map.get(&id).cloned() }; let Some(tx) = tx else { @@ -251,7 +265,7 @@ impl ConnRegistry { let tx = { let inner = self.inner.read().await; - inner.map.get(&id).cloned() + inner.routing.map.get(&id).cloned() }; let Some(tx) = tx else { @@ -295,19 +309,19 @@ impl ConnRegistry { // ROUTING IS THE SOURCE OF TRUTH: // never keep/attach writer binding for a connection that is already // absent from the routing table. 
- if !inner.map.contains_key(&conn_id) { + if !inner.routing.map.contains_key(&conn_id) { return false; } - if !inner.writers.contains_key(&writer_id) { + if !inner.binding.writers.contains_key(&writer_id) { return false; } - let previous_writer_id = inner.writer_for_conn.insert(conn_id, writer_id); + let previous_writer_id = inner.binding.writer_for_conn.insert(conn_id, writer_id); if let Some(previous_writer_id) = previous_writer_id && previous_writer_id != writer_id { let became_empty = - if let Some(set) = inner.conns_for_writer.get_mut(&previous_writer_id) { + if let Some(set) = inner.binding.conns_for_writer.get_mut(&previous_writer_id) { set.remove(&conn_id); set.is_empty() } else { @@ -315,15 +329,17 @@ impl ConnRegistry { }; if became_empty { inner + .binding .writer_idle_since_epoch_secs .insert(previous_writer_id, Self::now_epoch_secs()); } } - inner.meta.insert(conn_id, meta.clone()); - inner.last_meta_for_writer.insert(writer_id, meta); - inner.writer_idle_since_epoch_secs.remove(&writer_id); + inner.binding.meta.insert(conn_id, meta.clone()); + inner.binding.last_meta_for_writer.insert(writer_id, meta); + inner.binding.writer_idle_since_epoch_secs.remove(&writer_id); inner + .binding .conns_for_writer .entry(writer_id) .or_insert_with(HashSet::new) @@ -334,10 +350,12 @@ impl ConnRegistry { pub async fn mark_writer_idle(&self, writer_id: u64) { let mut inner = self.inner.write().await; inner + .binding .conns_for_writer .entry(writer_id) .or_insert_with(HashSet::new); inner + .binding .writer_idle_since_epoch_secs .entry(writer_id) .or_insert(Self::now_epoch_secs()); @@ -345,19 +363,19 @@ impl ConnRegistry { pub async fn get_last_writer_meta(&self, writer_id: u64) -> Option { let inner = self.inner.read().await; - inner.last_meta_for_writer.get(&writer_id).cloned() + inner.binding.last_meta_for_writer.get(&writer_id).cloned() } pub async fn writer_idle_since_snapshot(&self) -> HashMap { let inner = self.inner.read().await; - 
inner.writer_idle_since_epoch_secs.clone() + inner.binding.writer_idle_since_epoch_secs.clone() } pub async fn writer_idle_since_for_writer_ids(&self, writer_ids: &[u64]) -> HashMap { let inner = self.inner.read().await; let mut out = HashMap::::with_capacity(writer_ids.len()); for writer_id in writer_ids { - if let Some(idle_since) = inner.writer_idle_since_epoch_secs.get(writer_id).copied() { + if let Some(idle_since) = inner.binding.writer_idle_since_epoch_secs.get(writer_id).copied() { out.insert(*writer_id, idle_since); } } @@ -369,10 +387,10 @@ impl ConnRegistry { let mut bound_clients_by_writer = HashMap::::new(); let mut active_sessions_by_target_dc = HashMap::::new(); - for (writer_id, conn_ids) in &inner.conns_for_writer { + for (writer_id, conn_ids) in &inner.binding.conns_for_writer { bound_clients_by_writer.insert(*writer_id, conn_ids.len()); } - for conn_meta in inner.meta.values() { + for conn_meta in inner.binding.meta.values() { if conn_meta.target_dc == 0 { continue; } @@ -392,14 +410,15 @@ impl ConnRegistry { // ROUTING IS THE SOURCE OF TRUTH: // stale bindings are ignored and lazily cleaned when routing no longer // contains the connection. 
- if !inner.map.contains_key(&conn_id) { - inner.meta.remove(&conn_id); - if let Some(stale_writer_id) = inner.writer_for_conn.remove(&conn_id) - && let Some(conns) = inner.conns_for_writer.get_mut(&stale_writer_id) + if !inner.routing.map.contains_key(&conn_id) { + inner.binding.meta.remove(&conn_id); + if let Some(stale_writer_id) = inner.binding.writer_for_conn.remove(&conn_id) + && let Some(conns) = inner.binding.conns_for_writer.get_mut(&stale_writer_id) { conns.remove(&conn_id); if conns.is_empty() { inner + .binding .writer_idle_since_epoch_secs .insert(stale_writer_id, Self::now_epoch_secs()); } @@ -407,14 +426,15 @@ impl ConnRegistry { return None; } - let writer_id = inner.writer_for_conn.get(&conn_id).copied()?; - let Some(writer) = inner.writers.get(&writer_id).cloned() else { - inner.writer_for_conn.remove(&conn_id); - inner.meta.remove(&conn_id); - if let Some(conns) = inner.conns_for_writer.get_mut(&writer_id) { + let writer_id = inner.binding.writer_for_conn.get(&conn_id).copied()?; + let Some(writer) = inner.binding.writers.get(&writer_id).cloned() else { + inner.binding.writer_for_conn.remove(&conn_id); + inner.binding.meta.remove(&conn_id); + if let Some(conns) = inner.binding.conns_for_writer.get_mut(&writer_id) { conns.remove(&conn_id); if conns.is_empty() { inner + .binding .writer_idle_since_epoch_secs .insert(writer_id, Self::now_epoch_secs()); } @@ -429,15 +449,16 @@ impl ConnRegistry { pub async fn active_conn_ids(&self) -> Vec { let inner = self.inner.read().await; - inner.writer_for_conn.keys().copied().collect() + inner.binding.writer_for_conn.keys().copied().collect() } pub async fn writer_lost(&self, writer_id: u64) -> Vec { let mut inner = self.inner.write().await; - inner.writers.remove(&writer_id); - inner.last_meta_for_writer.remove(&writer_id); - inner.writer_idle_since_epoch_secs.remove(&writer_id); + inner.binding.writers.remove(&writer_id); + inner.binding.last_meta_for_writer.remove(&writer_id); + 
inner.binding.writer_idle_since_epoch_secs.remove(&writer_id); let conns = inner + .binding .conns_for_writer .remove(&writer_id) .unwrap_or_default() @@ -446,11 +467,11 @@ impl ConnRegistry { let mut out = Vec::new(); for conn_id in conns { - if inner.writer_for_conn.get(&conn_id).copied() != Some(writer_id) { + if inner.binding.writer_for_conn.get(&conn_id).copied() != Some(writer_id) { continue; } - inner.writer_for_conn.remove(&conn_id); - if let Some(m) = inner.meta.get(&conn_id) { + inner.binding.writer_for_conn.remove(&conn_id); + if let Some(m) = inner.binding.meta.get(&conn_id) { out.push(BoundConn { conn_id, meta: m.clone(), @@ -463,12 +484,13 @@ impl ConnRegistry { #[allow(dead_code)] pub async fn get_meta(&self, conn_id: u64) -> Option { let inner = self.inner.read().await; - inner.meta.get(&conn_id).cloned() + inner.binding.meta.get(&conn_id).cloned() } pub async fn is_writer_empty(&self, writer_id: u64) -> bool { let inner = self.inner.read().await; inner + .binding .conns_for_writer .get(&writer_id) .map(|s| s.is_empty()) @@ -478,7 +500,7 @@ impl ConnRegistry { #[allow(dead_code)] pub async fn unregister_writer_if_empty(&self, writer_id: u64) -> bool { let mut inner = self.inner.write().await; - let Some(conn_ids) = inner.conns_for_writer.get(&writer_id) else { + let Some(conn_ids) = inner.binding.conns_for_writer.get(&writer_id) else { // Writer is already absent from the registry. 
return true; }; @@ -486,10 +508,10 @@ impl ConnRegistry { return false; } - inner.writers.remove(&writer_id); - inner.last_meta_for_writer.remove(&writer_id); - inner.writer_idle_since_epoch_secs.remove(&writer_id); - inner.conns_for_writer.remove(&writer_id); + inner.binding.writers.remove(&writer_id); + inner.binding.last_meta_for_writer.remove(&writer_id); + inner.binding.writer_idle_since_epoch_secs.remove(&writer_id); + inner.binding.conns_for_writer.remove(&writer_id); true } @@ -498,7 +520,7 @@ impl ConnRegistry { let inner = self.inner.read().await; let mut out = HashSet::::with_capacity(writer_ids.len()); for writer_id in writer_ids { - if let Some(conns) = inner.conns_for_writer.get(writer_id) + if let Some(conns) = inner.binding.conns_for_writer.get(writer_id) && !conns.is_empty() { out.insert(*writer_id); From 28d318d72468265de369fdec771e1b40b9249f4e Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 17:59:54 +0300 Subject: [PATCH 04/17] ME Writer Task Consolidation Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/transport/middle_proxy/pool_writer.rs | 611 +++++++++++----------- 1 file changed, 318 insertions(+), 293 deletions(-) diff --git a/src/transport/middle_proxy/pool_writer.rs b/src/transport/middle_proxy/pool_writer.rs index 6f19789..506c354 100644 --- a/src/transport/middle_proxy/pool_writer.rs +++ b/src/transport/middle_proxy/pool_writer.rs @@ -38,6 +38,233 @@ fn is_me_peer_closed_error(error: &ProxyError) -> bool { matches!(error, ProxyError::Io(ioe) if ioe.kind() == ErrorKind::UnexpectedEof) } +enum WriterLifecycleExit { + Reader(Result<()>), + Writer(Result<()>), + Ping, + Signal, + Cancelled, +} + +async fn writer_command_loop( + mut rx: mpsc::Receiver, + mut rpc_writer: RpcWriter, + cancel: CancellationToken, +) -> Result<()> { + loop { + tokio::select! 
{ + cmd = rx.recv() => { + match cmd { + Some(WriterCommand::Data(payload)) => { + rpc_writer.send(&payload).await?; + } + Some(WriterCommand::DataAndFlush(payload)) => { + rpc_writer.send_and_flush(&payload).await?; + } + Some(WriterCommand::Close) | None => return Ok(()), + } + } + _ = cancel.cancelled() => return Ok(()), + } + } +} + +#[allow(clippy::too_many_arguments)] +async fn ping_loop( + pool_ping: std::sync::Weak, + writer_id: u64, + tx_ping: mpsc::Sender, + ping_tracker_ping: Arc>>, + stats_ping: Arc, + keepalive_enabled: bool, + keepalive_interval: Duration, + keepalive_jitter: Duration, + cancel_ping_token: CancellationToken, +) { + let mut ping_id: i64 = rand::random::(); + let mut cleanup_tick: u32 = 0; + let idle_interval_cap = Duration::from_secs(ME_IDLE_KEEPALIVE_MAX_SECS); + // Per-writer jittered start to avoid phase sync. + let startup_jitter = if keepalive_enabled { + let mut interval = keepalive_interval; + let Some(pool) = pool_ping.upgrade() else { + return; + }; + if pool.registry.is_writer_empty(writer_id).await { + interval = interval.min(idle_interval_cap); + } + let jitter_cap_ms = interval.as_millis() / 2; + let effective_jitter_ms = keepalive_jitter.as_millis().min(jitter_cap_ms).max(1); + Duration::from_millis(rand::rng().random_range(0..=effective_jitter_ms as u64)) + } else { + let jitter = rand::rng().random_range(-ME_ACTIVE_PING_JITTER_SECS..=ME_ACTIVE_PING_JITTER_SECS); + let wait = (ME_ACTIVE_PING_SECS as i64 + jitter).max(5) as u64; + Duration::from_secs(wait) + }; + tokio::select! 
{ + _ = cancel_ping_token.cancelled() => return, + _ = tokio::time::sleep(startup_jitter) => {} + } + loop { + let wait = if keepalive_enabled { + let mut interval = keepalive_interval; + let Some(pool) = pool_ping.upgrade() else { + return; + }; + if pool.registry.is_writer_empty(writer_id).await { + interval = interval.min(idle_interval_cap); + } + let jitter_cap_ms = interval.as_millis() / 2; + let effective_jitter_ms = keepalive_jitter.as_millis().min(jitter_cap_ms).max(1); + interval + Duration::from_millis(rand::rng().random_range(0..=effective_jitter_ms as u64)) + } else { + let jitter = rand::rng().random_range(-ME_ACTIVE_PING_JITTER_SECS..=ME_ACTIVE_PING_JITTER_SECS); + let secs = (ME_ACTIVE_PING_SECS as i64 + jitter).max(5) as u64; + Duration::from_secs(secs) + }; + tokio::select! { + _ = cancel_ping_token.cancelled() => return, + _ = tokio::time::sleep(wait) => {} + } + let sent_id = ping_id; + let mut p = Vec::with_capacity(12); + p.extend_from_slice(&RPC_PING_U32.to_le_bytes()); + p.extend_from_slice(&sent_id.to_le_bytes()); + { + let mut tracker = ping_tracker_ping.lock().await; + cleanup_tick = cleanup_tick.wrapping_add(1); + if cleanup_tick.is_multiple_of(ME_PING_TRACKER_CLEANUP_EVERY) { + let before = tracker.len(); + tracker.retain(|_, ts| ts.elapsed() < Duration::from_secs(120)); + let expired = before.saturating_sub(tracker.len()); + if expired > 0 { + stats_ping.increment_me_keepalive_timeout_by(expired as u64); + } + } + tracker.insert(sent_id, std::time::Instant::now()); + } + ping_id = ping_id.wrapping_add(1); + stats_ping.increment_me_keepalive_sent(); + if tx_ping + .send(WriterCommand::DataAndFlush(Bytes::from(p))) + .await + .is_err() + { + stats_ping.increment_me_keepalive_failed(); + debug!("ME ping failed, removing dead writer"); + return; + } + } +} + +#[allow(clippy::too_many_arguments)] +async fn rpc_proxy_req_signal_loop( + pool_signal: std::sync::Weak, + writer_id: u64, + tx_signal: mpsc::Sender, + stats_signal: Arc, + 
cancel_signal: CancellationToken, + keepalive_jitter_signal: Duration, + rpc_proxy_req_every_secs: u64, +) { + if rpc_proxy_req_every_secs == 0 { + return; + } + + let interval = Duration::from_secs(rpc_proxy_req_every_secs); + let startup_jitter_ms = { + let jitter_cap_ms = interval.as_millis() / 2; + let effective_jitter_ms = keepalive_jitter_signal + .as_millis() + .min(jitter_cap_ms) + .max(1); + rand::rng().random_range(0..=effective_jitter_ms as u64) + }; + + tokio::select! { + _ = cancel_signal.cancelled() => return, + _ = tokio::time::sleep(Duration::from_millis(startup_jitter_ms)) => {} + } + + loop { + let wait = { + let jitter_cap_ms = interval.as_millis() / 2; + let effective_jitter_ms = keepalive_jitter_signal + .as_millis() + .min(jitter_cap_ms) + .max(1); + interval + Duration::from_millis(rand::rng().random_range(0..=effective_jitter_ms as u64)) + }; + + tokio::select! { + _ = cancel_signal.cancelled() => return, + _ = tokio::time::sleep(wait) => {} + } + + let Some(pool) = pool_signal.upgrade() else { + return; + }; + + let Some(meta) = pool.registry.get_last_writer_meta(writer_id).await else { + stats_signal.increment_me_rpc_proxy_req_signal_skipped_no_meta_total(); + continue; + }; + + let (conn_id, mut service_rx) = pool.registry.register().await; + // Service RPC_PROXY_REQ signal path is intentionally route-only: + // do not bind synthetic conn_id into regular writer/client accounting. 
+ + let payload = build_proxy_req_payload( + conn_id, + meta.client_addr, + meta.our_addr, + &[], + pool.proxy_tag.as_deref(), + meta.proto_flags, + ); + + if tx_signal + .send(WriterCommand::DataAndFlush(payload)) + .await + .is_err() + { + stats_signal.increment_me_rpc_proxy_req_signal_failed_total(); + let _ = pool.registry.unregister(conn_id).await; + return; + } + + stats_signal.increment_me_rpc_proxy_req_signal_sent_total(); + + if matches!( + tokio::time::timeout( + Duration::from_millis(ME_RPC_PROXY_REQ_RESPONSE_WAIT_MS), + service_rx.recv(), + ) + .await, + Ok(Some(_)) + ) { + stats_signal.increment_me_rpc_proxy_req_signal_response_total(); + } + + let mut close_payload = Vec::with_capacity(12); + close_payload.extend_from_slice(&RPC_CLOSE_EXT_U32.to_le_bytes()); + close_payload.extend_from_slice(&conn_id.to_le_bytes()); + + if tx_signal + .send(WriterCommand::DataAndFlush(Bytes::from(close_payload))) + .await + .is_err() + { + stats_signal.increment_me_rpc_proxy_req_signal_failed_total(); + let _ = pool.registry.unregister(conn_id).await; + return; + } + + stats_signal.increment_me_rpc_proxy_req_signal_close_sent_total(); + let _ = pool.registry.unregister(conn_id).await; + } +} + impl MePool { pub(crate) async fn prune_closed_writers(self: &Arc) { let closed_writer_ids: Vec = { @@ -138,46 +365,14 @@ impl MePool { let draining_started_at_epoch_secs = Arc::new(AtomicU64::new(0)); let drain_deadline_epoch_secs = Arc::new(AtomicU64::new(0)); let allow_drain_fallback = Arc::new(AtomicBool::new(false)); - let (tx, mut rx) = mpsc::channel::(self.writer_cmd_channel_capacity); - let mut rpc_writer = RpcWriter { + let (tx, rx) = mpsc::channel::(self.writer_cmd_channel_capacity); + let rpc_writer = RpcWriter { writer: hs.wr, key: hs.write_key, iv: hs.write_iv, seq_no: 0, crc_mode: hs.crc_mode, }; - let cancel_wr = cancel.clone(); - let cleanup_done = Arc::new(AtomicBool::new(false)); - let cleanup_for_writer = cleanup_done.clone(); - let pool_writer_task = 
Arc::downgrade(self); - tokio::spawn(async move { - loop { - tokio::select! { - cmd = rx.recv() => { - match cmd { - Some(WriterCommand::Data(payload)) => { - if rpc_writer.send(&payload).await.is_err() { break; } - } - Some(WriterCommand::DataAndFlush(payload)) => { - if rpc_writer.send_and_flush(&payload).await.is_err() { break; } - } - Some(WriterCommand::Close) | None => break, - } - } - _ = cancel_wr.cancelled() => break, - } - } - if cleanup_for_writer - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Relaxed) - .is_ok() - { - if let Some(pool) = pool_writer_task.upgrade() { - pool.remove_writer_and_close_clients(writer_id).await; - } else { - cancel_wr.cancel(); - } - } - }); let writer = MeWriter { id: writer_id, addr, @@ -207,290 +402,120 @@ impl MePool { let writers_arc = self.writers_arc(); let ping_tracker = Arc::new(tokio::sync::Mutex::new(HashMap::::new())); let ping_tracker_reader = ping_tracker.clone(); + let ping_tracker_ping = ping_tracker.clone(); let rtt_stats = self.rtt_stats.clone(); let stats_reader = self.stats.clone(); let stats_reader_close = self.stats.clone(); let stats_ping = self.stats.clone(); - let pool = Arc::downgrade(self); - let cancel_ping = cancel.clone(); + let stats_signal = self.stats.clone(); + let pool_lifecycle = Arc::downgrade(self); + let pool_ping = Arc::downgrade(self); + let pool_signal = Arc::downgrade(self); + let tx_reader = tx.clone(); let tx_ping = tx.clone(); - let ping_tracker_ping = ping_tracker.clone(); - let cleanup_for_reader = cleanup_done.clone(); - let cleanup_for_ping = cleanup_done.clone(); + let tx_signal = tx.clone(); let keepalive_enabled = self.me_keepalive_enabled; let keepalive_interval = self.me_keepalive_interval; let keepalive_jitter = self.me_keepalive_jitter; - let rpc_proxy_req_every_secs = self.rpc_proxy_req_every_secs.load(Ordering::Relaxed); - let tx_signal = tx.clone(); - let stats_signal = self.stats.clone(); - let cancel_signal = cancel.clone(); - let cleanup_for_signal = 
cleanup_done.clone(); - let pool_signal = Arc::downgrade(self); let keepalive_jitter_signal = self.me_keepalive_jitter; - let cancel_reader_token = cancel.clone(); - let cancel_ping_token = cancel_ping.clone(); + let rpc_proxy_req_every_secs = self.rpc_proxy_req_every_secs.load(Ordering::Relaxed); + let cancel_reader = cancel.clone(); + let cancel_writer = cancel.clone(); + let cancel_ping = cancel.clone(); + let cancel_signal = cancel.clone(); + let cancel_select = cancel.clone(); + let cancel_cleanup = cancel.clone(); let reader_route_data_wait_ms = self.me_reader_route_data_wait_ms.clone(); tokio::spawn(async move { - let res = reader_loop( - hs.rd, - hs.read_key, - hs.read_iv, - hs.crc_mode, - reg.clone(), - BytesMut::new(), - BytesMut::new(), - tx.clone(), - ping_tracker_reader, - rtt_stats.clone(), - stats_reader, - writer_id, - degraded.clone(), - rtt_ema_ms_x10.clone(), - reader_route_data_wait_ms, - cancel_reader_token.clone(), - ) - .await; - let idle_close_by_peer = if let Err(e) = res.as_ref() { - is_me_peer_closed_error(e) && reg.is_writer_empty(writer_id).await - } else { - false - }; - if idle_close_by_peer { - stats_reader_close.increment_me_idle_close_by_peer_total(); - info!(writer_id, "ME socket closed by peer on idle writer"); - } - if cleanup_for_reader - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Relaxed) - .is_ok() - { - if let Some(pool) = pool.upgrade() { - pool.remove_writer_and_close_clients(writer_id).await; - } else { - // Fallback for shutdown races: make writer task exit quickly so stale - // channels are observable by periodic prune. - cancel_reader_token.cancel(); - } - } - if let Err(e) = res - && !idle_close_by_peer - { - warn!(error = %e, "ME reader ended"); - } - let remaining = writers_arc.read().await.len(); - debug!(writer_id, remaining, "ME reader task finished"); - }); + // Reader MUST be the first branch in biased select! to avoid read starvation. + let exit = tokio::select! 
{ + biased; - let pool_ping = Arc::downgrade(self); - tokio::spawn(async move { - let mut ping_id: i64 = rand::random::(); - let mut cleanup_tick: u32 = 0; - let idle_interval_cap = Duration::from_secs(ME_IDLE_KEEPALIVE_MAX_SECS); - // Per-writer jittered start to avoid phase sync. - let startup_jitter = if keepalive_enabled { - let mut interval = keepalive_interval; - if let Some(pool) = pool_ping.upgrade() { - if pool.registry.is_writer_empty(writer_id).await { - interval = interval.min(idle_interval_cap); - } - } else { - return; + reader_res = reader_loop( + hs.rd, + hs.read_key, + hs.read_iv, + hs.crc_mode, + reg.clone(), + BytesMut::new(), + BytesMut::new(), + tx_reader, + ping_tracker_reader, + rtt_stats, + stats_reader, + writer_id, + degraded, + rtt_ema_ms_x10, + reader_route_data_wait_ms, + cancel_reader, + ) => WriterLifecycleExit::Reader(reader_res), + writer_res = writer_command_loop(rx, rpc_writer, cancel_writer) => { + WriterLifecycleExit::Writer(writer_res) } - let jitter_cap_ms = interval.as_millis() / 2; - let effective_jitter_ms = keepalive_jitter.as_millis().min(jitter_cap_ms).max(1); - Duration::from_millis(rand::rng().random_range(0..=effective_jitter_ms as u64)) - } else { - let jitter = rand::rng() - .random_range(-ME_ACTIVE_PING_JITTER_SECS..=ME_ACTIVE_PING_JITTER_SECS); - let wait = (ME_ACTIVE_PING_SECS as i64 + jitter).max(5) as u64; - Duration::from_secs(wait) + _ = ping_loop( + pool_ping, + writer_id, + tx_ping, + ping_tracker_ping, + stats_ping, + keepalive_enabled, + keepalive_interval, + keepalive_jitter, + cancel_ping, + ) => WriterLifecycleExit::Ping, + _ = rpc_proxy_req_signal_loop( + pool_signal, + writer_id, + tx_signal, + stats_signal, + cancel_signal, + keepalive_jitter_signal, + rpc_proxy_req_every_secs, + ) => WriterLifecycleExit::Signal, + _ = cancel_select.cancelled() => WriterLifecycleExit::Cancelled, }; - tokio::select! 
{ - _ = cancel_ping_token.cancelled() => return, - _ = tokio::time::sleep(startup_jitter) => {} - } - loop { - let wait = if keepalive_enabled { - let mut interval = keepalive_interval; - if let Some(pool) = pool_ping.upgrade() { - if pool.registry.is_writer_empty(writer_id).await { - interval = interval.min(idle_interval_cap); - } + + match exit { + WriterLifecycleExit::Reader(res) => { + let idle_close_by_peer = if let Err(e) = res.as_ref() { + is_me_peer_closed_error(e) && reg.is_writer_empty(writer_id).await } else { - break; + false + }; + if idle_close_by_peer { + stats_reader_close.increment_me_idle_close_by_peer_total(); + info!(writer_id, "ME socket closed by peer on idle writer"); } - let jitter_cap_ms = interval.as_millis() / 2; - let effective_jitter_ms = - keepalive_jitter.as_millis().min(jitter_cap_ms).max(1); - interval - + Duration::from_millis( - rand::rng().random_range(0..=effective_jitter_ms as u64), - ) - } else { - let jitter = rand::rng() - .random_range(-ME_ACTIVE_PING_JITTER_SECS..=ME_ACTIVE_PING_JITTER_SECS); - let secs = (ME_ACTIVE_PING_SECS as i64 + jitter).max(5) as u64; - Duration::from_secs(secs) - }; - tokio::select! 
{ - _ = cancel_ping_token.cancelled() => { - break; - } - _ = tokio::time::sleep(wait) => {} - } - let sent_id = ping_id; - let mut p = Vec::with_capacity(12); - p.extend_from_slice(&RPC_PING_U32.to_le_bytes()); - p.extend_from_slice(&sent_id.to_le_bytes()); - { - let mut tracker = ping_tracker_ping.lock().await; - cleanup_tick = cleanup_tick.wrapping_add(1); - if cleanup_tick.is_multiple_of(ME_PING_TRACKER_CLEANUP_EVERY) { - let before = tracker.len(); - tracker.retain(|_, ts| ts.elapsed() < Duration::from_secs(120)); - let expired = before.saturating_sub(tracker.len()); - if expired > 0 { - stats_ping.increment_me_keepalive_timeout_by(expired as u64); - } - } - tracker.insert(sent_id, std::time::Instant::now()); - } - ping_id = ping_id.wrapping_add(1); - stats_ping.increment_me_keepalive_sent(); - if tx_ping - .send(WriterCommand::DataAndFlush(Bytes::from(p))) - .await - .is_err() - { - stats_ping.increment_me_keepalive_failed(); - debug!("ME ping failed, removing dead writer"); - cancel_ping.cancel(); - if cleanup_for_ping - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Relaxed) - .is_ok() - && let Some(pool) = pool_ping.upgrade() + if let Err(e) = res + && !idle_close_by_peer { - pool.remove_writer_and_close_clients(writer_id).await; + warn!(error = %e, "ME reader ended"); } - break; } - } - }); - - tokio::spawn(async move { - if rpc_proxy_req_every_secs == 0 { - return; - } - - let interval = Duration::from_secs(rpc_proxy_req_every_secs); - let startup_jitter_ms = { - let jitter_cap_ms = interval.as_millis() / 2; - let effective_jitter_ms = keepalive_jitter_signal - .as_millis() - .min(jitter_cap_ms) - .max(1); - rand::rng().random_range(0..=effective_jitter_ms as u64) - }; - - tokio::select! 
{ - _ = cancel_signal.cancelled() => return, - _ = tokio::time::sleep(Duration::from_millis(startup_jitter_ms)) => {} - } - - loop { - let wait = { - let jitter_cap_ms = interval.as_millis() / 2; - let effective_jitter_ms = keepalive_jitter_signal - .as_millis() - .min(jitter_cap_ms) - .max(1); - interval - + Duration::from_millis( - rand::rng().random_range(0..=effective_jitter_ms as u64), - ) - }; - - tokio::select! { - _ = cancel_signal.cancelled() => break, - _ = tokio::time::sleep(wait) => {} - } - - let Some(pool) = pool_signal.upgrade() else { - break; - }; - - let Some(meta) = pool.registry.get_last_writer_meta(writer_id).await else { - stats_signal.increment_me_rpc_proxy_req_signal_skipped_no_meta_total(); - continue; - }; - - let (conn_id, mut service_rx) = pool.registry.register().await; - // Service RPC_PROXY_REQ signal path is intentionally route-only: - // do not bind synthetic conn_id into regular writer/client accounting. - - let payload = build_proxy_req_payload( - conn_id, - meta.client_addr, - meta.our_addr, - &[], - pool.proxy_tag.as_deref(), - meta.proto_flags, - ); - - if tx_signal - .send(WriterCommand::DataAndFlush(payload)) - .await - .is_err() - { - stats_signal.increment_me_rpc_proxy_req_signal_failed_total(); - let _ = pool.registry.unregister(conn_id).await; - cancel_signal.cancel(); - if cleanup_for_signal - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Relaxed) - .is_ok() - { - pool.remove_writer_and_close_clients(writer_id).await; + WriterLifecycleExit::Writer(res) => { + if let Err(e) = res { + warn!(error = %e, "ME writer command loop ended"); } - break; } - - stats_signal.increment_me_rpc_proxy_req_signal_sent_total(); - - if matches!( - tokio::time::timeout( - Duration::from_millis(ME_RPC_PROXY_REQ_RESPONSE_WAIT_MS), - service_rx.recv(), - ) - .await, - Ok(Some(_)) - ) { - stats_signal.increment_me_rpc_proxy_req_signal_response_total(); + WriterLifecycleExit::Ping => { + debug!(writer_id, "ME ping loop finished"); } 
- - let mut close_payload = Vec::with_capacity(12); - close_payload.extend_from_slice(&RPC_CLOSE_EXT_U32.to_le_bytes()); - close_payload.extend_from_slice(&conn_id.to_le_bytes()); - - if tx_signal - .send(WriterCommand::DataAndFlush(Bytes::from(close_payload))) - .await - .is_err() - { - stats_signal.increment_me_rpc_proxy_req_signal_failed_total(); - let _ = pool.registry.unregister(conn_id).await; - cancel_signal.cancel(); - if cleanup_for_signal - .compare_exchange(false, true, Ordering::AcqRel, Ordering::Relaxed) - .is_ok() - { - pool.remove_writer_and_close_clients(writer_id).await; - } - break; + WriterLifecycleExit::Signal => { + debug!(writer_id, "ME rpc_proxy_req signal loop finished"); } - - stats_signal.increment_me_rpc_proxy_req_signal_close_sent_total(); - let _ = pool.registry.unregister(conn_id).await; + WriterLifecycleExit::Cancelled => {} } + + if let Some(pool) = pool_lifecycle.upgrade() { + pool.remove_writer_and_close_clients(writer_id).await; + } else { + // Fallback for shutdown races: make lifecycle exit observable by prune. 
+ cancel_cleanup.cancel(); + } + + let remaining = writers_arc.read().await.len(); + debug!(writer_id, remaining, "ME writer lifecycle task finished"); }); Ok(()) From 0b78583cf58a14fb4ef1fd2e37eec064f2c0ff77 Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 18:18:06 +0300 Subject: [PATCH 05/17] ME Routing Core Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/proxy/middle_relay.rs | 4 +++ src/transport/middle_proxy/pool.rs | 32 ++++++++++++++++------- src/transport/middle_proxy/pool_writer.rs | 3 ++- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/src/proxy/middle_relay.rs b/src/proxy/middle_relay.rs index ca32e6f..6c7aef9 100644 --- a/src/proxy/middle_relay.rs +++ b/src/proxy/middle_relay.rs @@ -60,6 +60,9 @@ static DESYNC_DEDUP_PREVIOUS: OnceLock> = OnceLock::new(); static DESYNC_HASHER: OnceLock = OnceLock::new(); static DESYNC_FULL_CACHE_LAST_EMIT_AT: OnceLock>> = OnceLock::new(); static DESYNC_DEDUP_ROTATION_STATE: OnceLock> = OnceLock::new(); +// Invariant for async callers: +// this std::sync::Mutex is allowed only because critical sections are short, +// synchronous, and MUST never cross an `.await`. static RELAY_IDLE_CANDIDATE_REGISTRY: OnceLock> = OnceLock::new(); static RELAY_IDLE_MARK_SEQ: AtomicU64 = AtomicU64::new(0); @@ -100,6 +103,7 @@ fn relay_idle_candidate_registry() -> &'static Mutex fn relay_idle_candidate_registry_lock() -> std::sync::MutexGuard<'static, RelayIdleCandidateRegistry> { + // Keep lock scope narrow and synchronous: callers must drop guard before any `.await`. 
let registry = relay_idle_candidate_registry(); match registry.lock() { Ok(guard) => guard, diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index 07d4d19..fdda988 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -257,11 +257,17 @@ pub struct SecretSnapshot { pub secret: Vec, } -#[allow(dead_code)] -pub struct MePool { +pub struct RoutingCore { pub(super) registry: Arc, pub(super) writers: Arc, pub(super) rr: AtomicU64, + pub(super) writer_epoch: watch::Sender, + pub(super) preferred_endpoints_by_dc: ArcSwap>>, +} + +#[allow(dead_code)] +pub struct MePool { + pub(super) routing: Arc, pub(super) decision: NetworkDecision, pub(super) upstream: Option>, pub(super) rng: Arc, @@ -332,7 +338,6 @@ pub struct MePool { pub(super) nat_reflection_cache: Arc>, pub(super) nat_reflection_singleflight_v4: Arc>, pub(super) nat_reflection_singleflight_v6: Arc>, - pub(super) writer_epoch: watch::Sender, pub(super) refill_inflight: Arc>>, pub(super) refill_inflight_dc: Arc>>, pub(super) conn_count: AtomicUsize, @@ -389,7 +394,14 @@ pub struct MePool { pub(super) me_last_drain_gate_updated_at_epoch_secs: AtomicU64, pub(super) runtime_ready: AtomicBool, pool_size: usize, - pub(super) preferred_endpoints_by_dc: ArcSwap>>, +} + +impl Deref for MePool { + type Target = RoutingCore; + + fn deref(&self) -> &Self::Target { + self.routing.as_ref() + } } #[derive(Debug, Default)] @@ -524,9 +536,13 @@ impl MePool { let (writer_epoch, _) = watch::channel(0u64); let now_epoch_secs = Self::now_epoch_secs(); Arc::new(Self { - registry, - writers: Arc::new(WritersState::new()), - rr: AtomicU64::new(0), + routing: Arc::new(RoutingCore { + registry, + writers: Arc::new(WritersState::new()), + rr: AtomicU64::new(0), + writer_epoch, + preferred_endpoints_by_dc: ArcSwap::from_pointee(preferred_endpoints_by_dc), + }), decision, upstream, rng, @@ -644,7 +660,6 @@ impl MePool { nat_reflection_cache: 
Arc::new(Mutex::new(NatReflectionCache::default())), nat_reflection_singleflight_v4: Arc::new(Mutex::new(())), nat_reflection_singleflight_v6: Arc::new(Mutex::new(())), - writer_epoch, refill_inflight: Arc::new(Mutex::new(HashSet::new())), refill_inflight_dc: Arc::new(Mutex::new(HashSet::new())), conn_count: AtomicUsize::new(0), @@ -731,7 +746,6 @@ impl MePool { me_last_drain_gate_block_reason: AtomicU8::new(MeDrainGateReason::Open as u8), me_last_drain_gate_updated_at_epoch_secs: AtomicU64::new(now_epoch_secs), runtime_ready: AtomicBool::new(false), - preferred_endpoints_by_dc: ArcSwap::from_pointee(preferred_endpoints_by_dc), }) } diff --git a/src/transport/middle_proxy/pool_writer.rs b/src/transport/middle_proxy/pool_writer.rs index 506c354..d2d7420 100644 --- a/src/transport/middle_proxy/pool_writer.rs +++ b/src/transport/middle_proxy/pool_writer.rs @@ -596,7 +596,8 @@ impl MePool { let _ = self.registry.writer_lost(writer_id).await; self.rtt_stats.lock().await.remove(&writer_id); if let Some(tx) = close_tx { - let _ = tx.send(WriterCommand::Close).await; + // Keep teardown critical path non-blocking: close is best-effort only. 
+ let _ = tx.try_send(WriterCommand::Close); } if let Some(addr) = removed_addr { if let Some(uptime) = removed_uptime { From 1c3e0d4e46b1809ecaabcf80bb2442ee908dea5d Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 19:43:02 +0300 Subject: [PATCH 06/17] ME Reinit Core Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/transport/middle_proxy/pool.rs | 38 +++++++++++-------- src/transport/middle_proxy/pool_reinit.rs | 45 +++++++++++++++-------- src/transport/middle_proxy/pool_status.rs | 12 ++++-- 3 files changed, 59 insertions(+), 36 deletions(-) diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index fdda988..9ebc412 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -265,9 +265,20 @@ pub struct RoutingCore { pub(super) preferred_endpoints_by_dc: ArcSwap>>, } +pub(super) struct ReinitCore { + pub(super) generation: AtomicU64, + pub(super) active_generation: AtomicU64, + pub(super) warm_generation: AtomicU64, + pub(super) pending_hardswap_generation: AtomicU64, + pub(super) pending_hardswap_started_at_epoch_secs: AtomicU64, + pub(super) pending_hardswap_map_hash: AtomicU64, + pub(super) hardswap: AtomicBool, +} + #[allow(dead_code)] pub struct MePool { pub(super) routing: Arc, + pub(super) reinit: Arc, pub(super) decision: NetworkDecision, pub(super) upstream: Option>, pub(super) rng: Arc, @@ -343,13 +354,6 @@ pub struct MePool { pub(super) conn_count: AtomicUsize, pub(super) draining_active_runtime: AtomicU64, pub(super) stats: Arc, - pub(super) generation: AtomicU64, - pub(super) active_generation: AtomicU64, - pub(super) warm_generation: AtomicU64, - pub(super) pending_hardswap_generation: AtomicU64, - pub(super) pending_hardswap_started_at_epoch_secs: AtomicU64, - pub(super) pending_hardswap_map_hash: AtomicU64, - pub(super) hardswap: AtomicBool, pub(super) endpoint_quarantine: Arc>>, pub(super) 
kdf_material_fingerprint: Arc>>, pub(super) me_pool_drain_ttl_secs: AtomicU64, @@ -543,6 +547,15 @@ impl MePool { writer_epoch, preferred_endpoints_by_dc: ArcSwap::from_pointee(preferred_endpoints_by_dc), }), + reinit: Arc::new(ReinitCore { + generation: AtomicU64::new(1), + active_generation: AtomicU64::new(1), + warm_generation: AtomicU64::new(0), + pending_hardswap_generation: AtomicU64::new(0), + pending_hardswap_started_at_epoch_secs: AtomicU64::new(0), + pending_hardswap_map_hash: AtomicU64::new(0), + hardswap: AtomicBool::new(hardswap), + }), decision, upstream, rng, @@ -664,13 +677,6 @@ impl MePool { refill_inflight_dc: Arc::new(Mutex::new(HashSet::new())), conn_count: AtomicUsize::new(0), draining_active_runtime: AtomicU64::new(0), - generation: AtomicU64::new(1), - active_generation: AtomicU64::new(1), - warm_generation: AtomicU64::new(0), - pending_hardswap_generation: AtomicU64::new(0), - pending_hardswap_started_at_epoch_secs: AtomicU64::new(0), - pending_hardswap_map_hash: AtomicU64::new(0), - hardswap: AtomicBool::new(hardswap), endpoint_quarantine: Arc::new(Mutex::new(HashMap::new())), kdf_material_fingerprint: Arc::new(RwLock::new(HashMap::new())), me_pool_drain_ttl_secs: AtomicU64::new(me_pool_drain_ttl_secs), @@ -750,7 +756,7 @@ impl MePool { } pub fn current_generation(&self) -> u64 { - self.active_generation.load(Ordering::Relaxed) + self.reinit.active_generation.load(Ordering::Relaxed) } pub fn set_runtime_ready(&self, ready: bool) { @@ -934,7 +940,7 @@ impl MePool { me_health_interval_ms_healthy: u64, me_warn_rate_limit_ms: u64, ) { - self.hardswap.store(hardswap, Ordering::Relaxed); + self.reinit.hardswap.store(hardswap, Ordering::Relaxed); self.me_pool_drain_ttl_secs .store(drain_ttl_secs, Ordering::Relaxed); self.me_instadrain.store(instadrain, Ordering::Relaxed); diff --git a/src/transport/middle_proxy/pool_reinit.rs b/src/transport/middle_proxy/pool_reinit.rs index 663007b..be08649 100644 --- a/src/transport/middle_proxy/pool_reinit.rs 
+++ b/src/transport/middle_proxy/pool_reinit.rs @@ -37,16 +37,21 @@ impl MePool { } fn clear_pending_hardswap_state(&self) { - self.pending_hardswap_generation.store(0, Ordering::Relaxed); - self.pending_hardswap_started_at_epoch_secs + self.reinit.pending_hardswap_generation.store(0, Ordering::Relaxed); + self.reinit + .pending_hardswap_started_at_epoch_secs .store(0, Ordering::Relaxed); - self.pending_hardswap_map_hash.store(0, Ordering::Relaxed); - self.warm_generation.store(0, Ordering::Relaxed); + self.reinit + .pending_hardswap_map_hash + .store(0, Ordering::Relaxed); + self.reinit.warm_generation.store(0, Ordering::Relaxed); } async fn promote_warm_generation_to_active(&self, generation: u64) { - self.active_generation.store(generation, Ordering::Relaxed); - self.warm_generation.store(0, Ordering::Relaxed); + self.reinit + .active_generation + .store(generation, Ordering::Relaxed); + self.reinit.warm_generation.store(0, Ordering::Relaxed); let ws = self.writers.read().await; for writer in ws.iter() { @@ -369,13 +374,17 @@ impl MePool { let desired_map_hash = Self::desired_map_hash(&desired_by_dc); let previous_generation = self.current_generation(); - let hardswap = self.hardswap.load(Ordering::Relaxed); + let hardswap = self.reinit.hardswap.load(Ordering::Relaxed); let generation = if hardswap { - let pending_generation = self.pending_hardswap_generation.load(Ordering::Relaxed); + let pending_generation = self + .reinit + .pending_hardswap_generation + .load(Ordering::Relaxed); let pending_started_at = self + .reinit .pending_hardswap_started_at_epoch_secs .load(Ordering::Relaxed); - let pending_map_hash = self.pending_hardswap_map_hash.load(Ordering::Relaxed); + let pending_map_hash = self.reinit.pending_hardswap_map_hash.load(Ordering::Relaxed); let pending_age_secs = now_epoch_secs.saturating_sub(pending_started_at); let pending_ttl_expired = pending_started_at > 0 && pending_age_secs > ME_HARDSWAP_PENDING_TTL_SECS; @@ -405,24 +414,28 @@ impl MePool { 
"ME hardswap pending generation expired by TTL; starting fresh generation" ); } - let next_generation = self.generation.fetch_add(1, Ordering::Relaxed) + 1; - self.pending_hardswap_generation + let next_generation = self.reinit.generation.fetch_add(1, Ordering::Relaxed) + 1; + self.reinit + .pending_hardswap_generation .store(next_generation, Ordering::Relaxed); - self.pending_hardswap_started_at_epoch_secs + self.reinit + .pending_hardswap_started_at_epoch_secs .store(now_epoch_secs, Ordering::Relaxed); - self.pending_hardswap_map_hash + self.reinit + .pending_hardswap_map_hash .store(desired_map_hash, Ordering::Relaxed); - self.warm_generation + self.reinit + .warm_generation .store(next_generation, Ordering::Relaxed); next_generation } } else { self.clear_pending_hardswap_state(); - self.generation.fetch_add(1, Ordering::Relaxed) + 1 + self.reinit.generation.fetch_add(1, Ordering::Relaxed) + 1 }; if hardswap { - self.warm_generation.store(generation, Ordering::Relaxed); + self.reinit.warm_generation.store(generation, Ordering::Relaxed); self.warmup_generation_for_all_dcs(rng, generation, &desired_by_dc) .await; } else { diff --git a/src/transport/middle_proxy/pool_status.rs b/src/transport/middle_proxy/pool_status.rs index afb8efe..e636b34 100644 --- a/src/transport/middle_proxy/pool_status.rs +++ b/src/transport/middle_proxy/pool_status.rs @@ -436,6 +436,7 @@ impl MePool { let now = Instant::now(); let now_epoch_secs = Self::now_epoch_secs(); let pending_started_at = self + .reinit .pending_hardswap_started_at_epoch_secs .load(Ordering::Relaxed); let pending_hardswap_age_secs = @@ -477,11 +478,14 @@ impl MePool { } MeApiRuntimeSnapshot { - active_generation: self.active_generation.load(Ordering::Relaxed), - warm_generation: self.warm_generation.load(Ordering::Relaxed), - pending_hardswap_generation: self.pending_hardswap_generation.load(Ordering::Relaxed), + active_generation: self.reinit.active_generation.load(Ordering::Relaxed), + warm_generation: 
self.reinit.warm_generation.load(Ordering::Relaxed), + pending_hardswap_generation: self + .reinit + .pending_hardswap_generation + .load(Ordering::Relaxed), pending_hardswap_age_secs, - hardswap_enabled: self.hardswap.load(Ordering::Relaxed), + hardswap_enabled: self.reinit.hardswap.load(Ordering::Relaxed), floor_mode: floor_mode_label(self.floor_mode()), adaptive_floor_idle_secs: self.me_adaptive_floor_idle_secs.load(Ordering::Relaxed), adaptive_floor_min_writers_single_endpoint: self From dc6b6d3f9dc39788bd4345b66cf0f3cf34c795b8 Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 19:47:41 +0300 Subject: [PATCH 07/17] ME Writer Lifecycle Core Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/transport/middle_proxy/pool.rs | 30 ++++++++++++++--------- src/transport/middle_proxy/pool_status.rs | 13 ++++++---- src/transport/middle_proxy/pool_writer.rs | 17 ++++++++----- src/transport/middle_proxy/send.rs | 2 +- 4 files changed, 38 insertions(+), 24 deletions(-) diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index 9ebc412..e617a43 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -275,10 +275,20 @@ pub(super) struct ReinitCore { pub(super) hardswap: AtomicBool, } +pub(super) struct WriterLifecycleCore { + pub(super) me_keepalive_enabled: bool, + pub(super) me_keepalive_interval: Duration, + pub(super) me_keepalive_jitter: Duration, + pub(super) me_keepalive_payload_random: bool, + pub(super) rpc_proxy_req_every_secs: AtomicU64, + pub(super) writer_cmd_channel_capacity: usize, +} + #[allow(dead_code)] pub struct MePool { pub(super) routing: Arc, pub(super) reinit: Arc, + pub(super) writer_lifecycle: Arc, pub(super) decision: NetworkDecision, pub(super) upstream: Option>, pub(super) rng: Arc, @@ -297,12 +307,6 @@ pub struct MePool { pub(super) stun_backoff_until: Arc>>, pub(super) me_one_retry: u8, pub(super) 
me_one_timeout: Duration, - pub(super) me_keepalive_enabled: bool, - pub(super) me_keepalive_interval: Duration, - pub(super) me_keepalive_jitter: Duration, - pub(super) me_keepalive_payload_random: bool, - pub(super) rpc_proxy_req_every_secs: AtomicU64, - pub(super) writer_cmd_channel_capacity: usize, pub(super) me_warmup_stagger_enabled: bool, pub(super) me_warmup_step_delay: Duration, pub(super) me_warmup_step_jitter: Duration, @@ -556,6 +560,14 @@ impl MePool { pending_hardswap_map_hash: AtomicU64::new(0), hardswap: AtomicBool::new(hardswap), }), + writer_lifecycle: Arc::new(WriterLifecycleCore { + me_keepalive_enabled, + me_keepalive_interval: Duration::from_secs(me_keepalive_interval_secs), + me_keepalive_jitter: Duration::from_secs(me_keepalive_jitter_secs), + me_keepalive_payload_random, + rpc_proxy_req_every_secs: AtomicU64::new(rpc_proxy_req_every_secs), + writer_cmd_channel_capacity: me_writer_cmd_channel_capacity.max(1), + }), decision, upstream, rng, @@ -588,12 +600,6 @@ impl MePool { me_one_retry, me_one_timeout: Duration::from_millis(me_one_timeout_ms), stats, - me_keepalive_enabled, - me_keepalive_interval: Duration::from_secs(me_keepalive_interval_secs), - me_keepalive_jitter: Duration::from_secs(me_keepalive_jitter_secs), - me_keepalive_payload_random, - rpc_proxy_req_every_secs: AtomicU64::new(rpc_proxy_req_every_secs), - writer_cmd_channel_capacity: me_writer_cmd_channel_capacity.max(1), me_warmup_stagger_enabled, me_warmup_step_delay: Duration::from_millis(me_warmup_step_delay_ms), me_warmup_step_jitter: Duration::from_millis(me_warmup_step_jitter_ms), diff --git a/src/transport/middle_proxy/pool_status.rs b/src/transport/middle_proxy/pool_status.rs index e636b34..7ee4422 100644 --- a/src/transport/middle_proxy/pool_status.rs +++ b/src/transport/middle_proxy/pool_status.rs @@ -558,11 +558,14 @@ impl MePool { adaptive_floor_warm_writers_current: self .me_adaptive_floor_warm_writers_current .load(Ordering::Relaxed), - me_keepalive_enabled: 
self.me_keepalive_enabled, - me_keepalive_interval_secs: self.me_keepalive_interval.as_secs(), - me_keepalive_jitter_secs: self.me_keepalive_jitter.as_secs(), - me_keepalive_payload_random: self.me_keepalive_payload_random, - rpc_proxy_req_every_secs: self.rpc_proxy_req_every_secs.load(Ordering::Relaxed), + me_keepalive_enabled: self.writer_lifecycle.me_keepalive_enabled, + me_keepalive_interval_secs: self.writer_lifecycle.me_keepalive_interval.as_secs(), + me_keepalive_jitter_secs: self.writer_lifecycle.me_keepalive_jitter.as_secs(), + me_keepalive_payload_random: self.writer_lifecycle.me_keepalive_payload_random, + rpc_proxy_req_every_secs: self + .writer_lifecycle + .rpc_proxy_req_every_secs + .load(Ordering::Relaxed), me_reconnect_max_concurrent_per_dc: self.me_reconnect_max_concurrent_per_dc, me_reconnect_backoff_base_ms: self.me_reconnect_backoff_base.as_millis() as u64, me_reconnect_backoff_cap_ms: self.me_reconnect_backoff_cap.as_millis() as u64, diff --git a/src/transport/middle_proxy/pool_writer.rs b/src/transport/middle_proxy/pool_writer.rs index d2d7420..f697257 100644 --- a/src/transport/middle_proxy/pool_writer.rs +++ b/src/transport/middle_proxy/pool_writer.rs @@ -365,7 +365,9 @@ impl MePool { let draining_started_at_epoch_secs = Arc::new(AtomicU64::new(0)); let drain_deadline_epoch_secs = Arc::new(AtomicU64::new(0)); let allow_drain_fallback = Arc::new(AtomicBool::new(false)); - let (tx, rx) = mpsc::channel::(self.writer_cmd_channel_capacity); + let (tx, rx) = mpsc::channel::( + self.writer_lifecycle.writer_cmd_channel_capacity, + ); let rpc_writer = RpcWriter { writer: hs.wr, key: hs.write_key, @@ -414,11 +416,14 @@ impl MePool { let tx_reader = tx.clone(); let tx_ping = tx.clone(); let tx_signal = tx.clone(); - let keepalive_enabled = self.me_keepalive_enabled; - let keepalive_interval = self.me_keepalive_interval; - let keepalive_jitter = self.me_keepalive_jitter; - let keepalive_jitter_signal = self.me_keepalive_jitter; - let 
rpc_proxy_req_every_secs = self.rpc_proxy_req_every_secs.load(Ordering::Relaxed); + let keepalive_enabled = self.writer_lifecycle.me_keepalive_enabled; + let keepalive_interval = self.writer_lifecycle.me_keepalive_interval; + let keepalive_jitter = self.writer_lifecycle.me_keepalive_jitter; + let keepalive_jitter_signal = self.writer_lifecycle.me_keepalive_jitter; + let rpc_proxy_req_every_secs = self + .writer_lifecycle + .rpc_proxy_req_every_secs + .load(Ordering::Relaxed); let cancel_reader = cancel.clone(); let cancel_writer = cancel.clone(); let cancel_ping = cancel.clone(); diff --git a/src/transport/middle_proxy/send.rs b/src/transport/middle_proxy/send.rs index d38775f..80fd2ea 100644 --- a/src/transport/middle_proxy/send.rs +++ b/src/transport/middle_proxy/send.rs @@ -857,7 +857,7 @@ impl MePool { (self.writer_idle_rank_for_selection(writer, idle_since_by_writer, now_epoch_secs) as u64) * 100; - let queue_cap = self.writer_cmd_channel_capacity.max(1) as u64; + let queue_cap = self.writer_lifecycle.writer_cmd_channel_capacity.max(1) as u64; let queue_remaining = writer.tx.capacity() as u64; let queue_used = queue_cap.saturating_sub(queue_remaining.min(queue_cap)); let queue_util_pct = queue_used.saturating_mul(100) / queue_cap; From 97f66495848a4a6bc85259a3ab05c0409855c182 Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 19:56:25 +0300 Subject: [PATCH 08/17] ME Route Runtime Core Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/transport/middle_proxy/pool.rs | 54 +++++++++++++++------------- src/transport/middle_proxy/send.rs | 58 +++++++++++++++++++++--------- 2 files changed, 72 insertions(+), 40 deletions(-) diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index e617a43..d4fa0aa 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -284,11 +284,24 @@ pub(super) struct WriterLifecycleCore { 
pub(super) writer_cmd_channel_capacity: usize, } +pub(super) struct RouteRuntimeCore { + pub(super) me_route_no_writer_mode: AtomicU8, + pub(super) me_route_no_writer_wait: Duration, + pub(super) me_route_hybrid_max_wait: Duration, + pub(super) me_route_blocking_send_timeout: Option, + pub(super) me_route_last_success_epoch_ms: AtomicU64, + pub(super) me_route_hybrid_timeout_warn_epoch_ms: AtomicU64, + pub(super) me_async_recovery_last_trigger_epoch_ms: AtomicU64, + pub(super) me_route_inline_recovery_attempts: u32, + pub(super) me_route_inline_recovery_wait: Duration, +} + #[allow(dead_code)] pub struct MePool { pub(super) routing: Arc, pub(super) reinit: Arc, pub(super) writer_lifecycle: Arc, + pub(super) route_runtime: Arc, pub(super) decision: NetworkDecision, pub(super) upstream: Option>, pub(super) rng: Arc, @@ -382,15 +395,6 @@ pub struct MePool { pub(super) me_writer_pick_sample_size: AtomicU8, pub(super) me_socks_kdf_policy: AtomicU8, pub(super) me_reader_route_data_wait_ms: Arc, - pub(super) me_route_no_writer_mode: AtomicU8, - pub(super) me_route_no_writer_wait: Duration, - pub(super) me_route_hybrid_max_wait: Duration, - pub(super) me_route_blocking_send_timeout: Option, - pub(super) me_route_last_success_epoch_ms: AtomicU64, - pub(super) me_route_hybrid_timeout_warn_epoch_ms: AtomicU64, - pub(super) me_async_recovery_last_trigger_epoch_ms: AtomicU64, - pub(super) me_route_inline_recovery_attempts: u32, - pub(super) me_route_inline_recovery_wait: Duration, pub(super) me_health_interval_ms_unhealthy: AtomicU64, pub(super) me_health_interval_ms_healthy: AtomicU64, pub(super) me_warn_rate_limit_ms: AtomicU64, @@ -568,6 +572,23 @@ impl MePool { rpc_proxy_req_every_secs: AtomicU64::new(rpc_proxy_req_every_secs), writer_cmd_channel_capacity: me_writer_cmd_channel_capacity.max(1), }), + route_runtime: Arc::new(RouteRuntimeCore { + me_route_no_writer_mode: AtomicU8::new(me_route_no_writer_mode.as_u8()), + me_route_no_writer_wait: 
Duration::from_millis(me_route_no_writer_wait_ms), + me_route_hybrid_max_wait: Duration::from_millis(me_route_hybrid_max_wait_ms.max(50)), + me_route_blocking_send_timeout: if me_route_blocking_send_timeout_ms == 0 { + None + } else { + Some(Duration::from_millis( + me_route_blocking_send_timeout_ms.min(5_000), + )) + }, + me_route_last_success_epoch_ms: AtomicU64::new(0), + me_route_hybrid_timeout_warn_epoch_ms: AtomicU64::new(0), + me_async_recovery_last_trigger_epoch_ms: AtomicU64::new(0), + me_route_inline_recovery_attempts, + me_route_inline_recovery_wait: Duration::from_millis(me_route_inline_recovery_wait_ms), + }), decision, upstream, rng, @@ -721,21 +742,6 @@ impl MePool { me_writer_pick_sample_size: AtomicU8::new(me_writer_pick_sample_size.clamp(2, 4)), me_socks_kdf_policy: AtomicU8::new(me_socks_kdf_policy.as_u8()), me_reader_route_data_wait_ms: Arc::new(AtomicU64::new(me_reader_route_data_wait_ms)), - me_route_no_writer_mode: AtomicU8::new(me_route_no_writer_mode.as_u8()), - me_route_no_writer_wait: Duration::from_millis(me_route_no_writer_wait_ms), - me_route_hybrid_max_wait: Duration::from_millis(me_route_hybrid_max_wait_ms.max(50)), - me_route_blocking_send_timeout: if me_route_blocking_send_timeout_ms == 0 { - None - } else { - Some(Duration::from_millis( - me_route_blocking_send_timeout_ms.min(5_000), - )) - }, - me_route_last_success_epoch_ms: AtomicU64::new(0), - me_route_hybrid_timeout_warn_epoch_ms: AtomicU64::new(0), - me_async_recovery_last_trigger_epoch_ms: AtomicU64::new(0), - me_route_inline_recovery_attempts, - me_route_inline_recovery_wait: Duration::from_millis(me_route_inline_recovery_wait_ms), me_health_interval_ms_unhealthy: AtomicU64::new(me_health_interval_ms_unhealthy.max(1)), me_health_interval_ms_healthy: AtomicU64::new(me_health_interval_ms_healthy.max(1)), me_warn_rate_limit_ms: AtomicU64::new(me_warn_rate_limit_ms.max(1)), diff --git a/src/transport/middle_proxy/send.rs b/src/transport/middle_proxy/send.rs index 
80fd2ea..faec2ec 100644 --- a/src/transport/middle_proxy/send.rs +++ b/src/transport/middle_proxy/send.rs @@ -71,8 +71,11 @@ impl MePool { }, ) }; - let no_writer_mode = - MeRouteNoWriterMode::from_u8(self.me_route_no_writer_mode.load(Ordering::Relaxed)); + let no_writer_mode = MeRouteNoWriterMode::from_u8( + self.route_runtime + .me_route_no_writer_mode + .load(Ordering::Relaxed), + ); let (routed_dc, unknown_target_dc) = self.resolve_target_dc_for_routing(target_dc as i32).await; let mut no_writer_deadline: Option = None; @@ -81,7 +84,10 @@ impl MePool { let mut hybrid_recovery_round = 0u32; let mut hybrid_last_recovery_at: Option = None; let mut hybrid_total_deadline: Option = None; - let hybrid_wait_step = self.me_route_no_writer_wait.max(Duration::from_millis(50)); + let hybrid_wait_step = self + .route_runtime + .me_route_no_writer_wait + .max(Duration::from_millis(50)); let mut hybrid_wait_current = hybrid_wait_step; loop { @@ -126,7 +132,7 @@ impl MePool { match no_writer_mode { MeRouteNoWriterMode::AsyncRecoveryFailfast => { let deadline = *no_writer_deadline.get_or_insert_with(|| { - Instant::now() + self.me_route_no_writer_wait + Instant::now() + self.route_runtime.me_route_no_writer_wait }); if !async_recovery_triggered && !unknown_target_dc { let triggered = @@ -147,7 +153,8 @@ impl MePool { MeRouteNoWriterMode::InlineRecoveryLegacy => { self.stats.increment_me_inline_recovery_total(); if !unknown_target_dc { - for _ in 0..self.me_route_inline_recovery_attempts.max(1) { + for _ in 0..self.route_runtime.me_route_inline_recovery_attempts.max(1) + { for family in self.family_order() { let map = match family { IpFamily::V4 => self.proxy_map_v4.read().await.clone(), @@ -176,7 +183,7 @@ impl MePool { continue; } let deadline = *no_writer_deadline.get_or_insert_with(|| { - Instant::now() + self.me_route_inline_recovery_wait + Instant::now() + self.route_runtime.me_route_inline_recovery_wait }); if !self.wait_for_writer_until(deadline).await { if 
!self.writers.read().await.is_empty() { @@ -231,8 +238,9 @@ impl MePool { let pick_mode = self.writer_pick_mode(); match no_writer_mode { MeRouteNoWriterMode::AsyncRecoveryFailfast => { - let deadline = *no_writer_deadline - .get_or_insert_with(|| Instant::now() + self.me_route_no_writer_wait); + let deadline = *no_writer_deadline.get_or_insert_with(|| { + Instant::now() + self.route_runtime.me_route_no_writer_wait + }); if !async_recovery_triggered && !unknown_target_dc { let triggered = self.trigger_async_recovery_for_target_dc(routed_dc).await; @@ -255,7 +263,7 @@ impl MePool { self.stats.increment_me_inline_recovery_total(); if unknown_target_dc { let deadline = *no_writer_deadline.get_or_insert_with(|| { - Instant::now() + self.me_route_inline_recovery_wait + Instant::now() + self.route_runtime.me_route_inline_recovery_wait }); if self.wait_for_candidate_until(routed_dc, deadline).await { continue; @@ -267,7 +275,9 @@ impl MePool { "No ME writers available for target DC".into(), )); } - if emergency_attempts >= self.me_route_inline_recovery_attempts.max(1) { + if emergency_attempts + >= self.route_runtime.me_route_inline_recovery_attempts.max(1) + { self.stats .increment_me_writer_pick_no_candidate_total(pick_mode); self.stats.increment_me_no_writer_failfast_total(); @@ -480,7 +490,8 @@ impl MePool { .increment_me_writer_pick_blocking_fallback_total(); let effective_our_addr = SocketAddr::new(w.source_ip, our_addr.port()); let (payload, meta) = build_routed_payload(effective_our_addr); - let reserve_result = if let Some(timeout) = self.me_route_blocking_send_timeout { + let reserve_result = if let Some(timeout) = self.route_runtime.me_route_blocking_send_timeout + { match tokio::time::timeout(timeout, w.tx.clone().reserve_owned()).await { Ok(result) => result, Err(_) => { @@ -646,9 +657,15 @@ impl MePool { } fn hybrid_total_wait_budget(&self) -> Duration { - let base = self.me_route_hybrid_max_wait.max(Duration::from_millis(50)); + let base = self + 
.route_runtime + .me_route_hybrid_max_wait + .max(Duration::from_millis(50)); let now_ms = Self::now_epoch_millis(); - let last_success_ms = self.me_route_last_success_epoch_ms.load(Ordering::Relaxed); + let last_success_ms = self + .route_runtime + .me_route_last_success_epoch_ms + .load(Ordering::Relaxed); if last_success_ms != 0 && now_ms.saturating_sub(last_success_ms) <= HYBRID_RECENT_SUCCESS_WINDOW_MS { @@ -658,7 +675,8 @@ impl MePool { } fn note_hybrid_route_success(&self) { - self.me_route_last_success_epoch_ms + self.route_runtime + .me_route_last_success_epoch_ms .store(Self::now_epoch_millis(), Ordering::Relaxed); } @@ -666,10 +684,14 @@ impl MePool { self.stats.increment_me_hybrid_timeout_total(); let now_ms = Self::now_epoch_millis(); let mut last_warn_ms = self + .route_runtime .me_route_hybrid_timeout_warn_epoch_ms .load(Ordering::Relaxed); while now_ms.saturating_sub(last_warn_ms) >= HYBRID_TIMEOUT_WARN_RATE_LIMIT_MS { - match self.me_route_hybrid_timeout_warn_epoch_ms.compare_exchange_weak( + match self + .route_runtime + .me_route_hybrid_timeout_warn_epoch_ms + .compare_exchange_weak( last_warn_ms, now_ms, Ordering::AcqRel, @@ -692,13 +714,17 @@ impl MePool { fn try_consume_hybrid_recovery_trigger_slot(&self, min_interval_ms: u64) -> bool { let now_ms = Self::now_epoch_millis(); let mut last_trigger_ms = self + .route_runtime .me_async_recovery_last_trigger_epoch_ms .load(Ordering::Relaxed); loop { if now_ms.saturating_sub(last_trigger_ms) < min_interval_ms { return false; } - match self.me_async_recovery_last_trigger_epoch_ms.compare_exchange_weak( + match self + .route_runtime + .me_async_recovery_last_trigger_epoch_ms + .compare_exchange_weak( last_trigger_ms, now_ms, Ordering::AcqRel, From 6ee4d4648cf4099cfb2291efccedea8c72ff041b Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:01:44 +0300 Subject: [PATCH 09/17] ME Health Core Co-Authored-By: brekotis 
<93345790+brekotis@users.noreply.github.com> --- src/transport/middle_proxy/pool.rs | 112 ++++++++++++++++++----------- 1 file changed, 72 insertions(+), 40 deletions(-) diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index d4fa0aa..0146e78 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -296,12 +296,21 @@ pub(super) struct RouteRuntimeCore { pub(super) me_route_inline_recovery_wait: Duration, } +pub(super) struct HealthRuntimeCore { + pub(super) me_health_interval_ms_unhealthy: AtomicU64, + pub(super) me_health_interval_ms_healthy: AtomicU64, + pub(super) me_warn_rate_limit_ms: AtomicU64, + pub(super) family_health_v4: ArcSwap, + pub(super) family_health_v6: ArcSwap, +} + #[allow(dead_code)] pub struct MePool { pub(super) routing: Arc, pub(super) reinit: Arc, pub(super) writer_lifecycle: Arc, pub(super) route_runtime: Arc, + pub(super) health_runtime: Arc, pub(super) decision: NetworkDecision, pub(super) upstream: Option>, pub(super) rng: Arc, @@ -395,11 +404,6 @@ pub struct MePool { pub(super) me_writer_pick_sample_size: AtomicU8, pub(super) me_socks_kdf_policy: AtomicU8, pub(super) me_reader_route_data_wait_ms: Arc, - pub(super) me_health_interval_ms_unhealthy: AtomicU64, - pub(super) me_health_interval_ms_healthy: AtomicU64, - pub(super) me_warn_rate_limit_ms: AtomicU64, - pub(super) family_health_v4: ArcSwap, - pub(super) family_health_v6: ArcSwap, pub(super) me_last_drain_gate_route_quorum_ok: AtomicBool, pub(super) me_last_drain_gate_redundancy_ok: AtomicBool, pub(super) me_last_drain_gate_block_reason: AtomicU8, @@ -589,6 +593,25 @@ impl MePool { me_route_inline_recovery_attempts, me_route_inline_recovery_wait: Duration::from_millis(me_route_inline_recovery_wait_ms), }), + health_runtime: Arc::new(HealthRuntimeCore { + me_health_interval_ms_unhealthy: AtomicU64::new(me_health_interval_ms_unhealthy.max(1)), + me_health_interval_ms_healthy: 
AtomicU64::new(me_health_interval_ms_healthy.max(1)), + me_warn_rate_limit_ms: AtomicU64::new(me_warn_rate_limit_ms.max(1)), + family_health_v4: ArcSwap::from_pointee(FamilyHealthSnapshot::new( + MeFamilyRuntimeState::Healthy, + now_epoch_secs, + 0, + 0, + 0, + )), + family_health_v6: ArcSwap::from_pointee(FamilyHealthSnapshot::new( + MeFamilyRuntimeState::Healthy, + now_epoch_secs, + 0, + 0, + 0, + )), + }), decision, upstream, rng, @@ -742,23 +765,6 @@ impl MePool { me_writer_pick_sample_size: AtomicU8::new(me_writer_pick_sample_size.clamp(2, 4)), me_socks_kdf_policy: AtomicU8::new(me_socks_kdf_policy.as_u8()), me_reader_route_data_wait_ms: Arc::new(AtomicU64::new(me_reader_route_data_wait_ms)), - me_health_interval_ms_unhealthy: AtomicU64::new(me_health_interval_ms_unhealthy.max(1)), - me_health_interval_ms_healthy: AtomicU64::new(me_health_interval_ms_healthy.max(1)), - me_warn_rate_limit_ms: AtomicU64::new(me_warn_rate_limit_ms.max(1)), - family_health_v4: ArcSwap::from_pointee(FamilyHealthSnapshot::new( - MeFamilyRuntimeState::Healthy, - now_epoch_secs, - 0, - 0, - 0, - )), - family_health_v6: ArcSwap::from_pointee(FamilyHealthSnapshot::new( - MeFamilyRuntimeState::Healthy, - now_epoch_secs, - 0, - 0, - 0, - )), me_last_drain_gate_route_quorum_ok: AtomicBool::new(false), me_last_drain_gate_redundancy_ok: AtomicBool::new(false), me_last_drain_gate_block_reason: AtomicU8::new(MeDrainGateReason::Open as u8), @@ -810,43 +816,59 @@ impl MePool { recover_success_streak, )); match family { - IpFamily::V4 => self.family_health_v4.store(snapshot), - IpFamily::V6 => self.family_health_v6.store(snapshot), + IpFamily::V4 => self.health_runtime.family_health_v4.store(snapshot), + IpFamily::V6 => self.health_runtime.family_health_v6.store(snapshot), } } pub(crate) fn family_runtime_state(&self, family: IpFamily) -> MeFamilyRuntimeState { match family { - IpFamily::V4 => self.family_health_v4.load().state, - IpFamily::V6 => self.family_health_v6.load().state, + IpFamily::V4 
=> self.health_runtime.family_health_v4.load().state, + IpFamily::V6 => self.health_runtime.family_health_v6.load().state, } } pub(crate) fn family_runtime_state_since_epoch_secs(&self, family: IpFamily) -> u64 { match family { - IpFamily::V4 => self.family_health_v4.load().state_since_epoch_secs, - IpFamily::V6 => self.family_health_v6.load().state_since_epoch_secs, + IpFamily::V4 => self.health_runtime.family_health_v4.load().state_since_epoch_secs, + IpFamily::V6 => self.health_runtime.family_health_v6.load().state_since_epoch_secs, } } pub(crate) fn family_suppressed_until_epoch_secs(&self, family: IpFamily) -> u64 { match family { - IpFamily::V4 => self.family_health_v4.load().suppressed_until_epoch_secs, - IpFamily::V6 => self.family_health_v6.load().suppressed_until_epoch_secs, + IpFamily::V4 => self + .health_runtime + .family_health_v4 + .load() + .suppressed_until_epoch_secs, + IpFamily::V6 => self + .health_runtime + .family_health_v6 + .load() + .suppressed_until_epoch_secs, } } pub(crate) fn family_fail_streak(&self, family: IpFamily) -> u32 { match family { - IpFamily::V4 => self.family_health_v4.load().fail_streak, - IpFamily::V6 => self.family_health_v6.load().fail_streak, + IpFamily::V4 => self.health_runtime.family_health_v4.load().fail_streak, + IpFamily::V6 => self.health_runtime.family_health_v6.load().fail_streak, } } pub(crate) fn family_recover_success_streak(&self, family: IpFamily) -> u32 { match family { - IpFamily::V4 => self.family_health_v4.load().recover_success_streak, - IpFamily::V6 => self.family_health_v6.load().recover_success_streak, + IpFamily::V4 => self + .health_runtime + .family_health_v4 + .load() + .recover_success_streak, + IpFamily::V6 => self + .health_runtime + .family_health_v6 + .load() + .recover_success_streak, } } @@ -1056,11 +1078,14 @@ impl MePool { .store(adaptive_floor_max_active_writers_global, Ordering::Relaxed); self.me_adaptive_floor_max_warm_writers_global .store(adaptive_floor_max_warm_writers_global, 
Ordering::Relaxed); - self.me_health_interval_ms_unhealthy + self.health_runtime + .me_health_interval_ms_unhealthy .store(me_health_interval_ms_unhealthy.max(1), Ordering::Relaxed); - self.me_health_interval_ms_healthy + self.health_runtime + .me_health_interval_ms_healthy .store(me_health_interval_ms_healthy.max(1), Ordering::Relaxed); - self.me_warn_rate_limit_ms + self.health_runtime + .me_warn_rate_limit_ms .store(me_warn_rate_limit_ms.max(1), Ordering::Relaxed); if previous_floor_mode != floor_mode { self.stats.increment_me_floor_mode_switch_total(); @@ -1742,7 +1767,8 @@ impl MePool { pub(super) fn health_interval_unhealthy(&self) -> Duration { Duration::from_millis( - self.me_health_interval_ms_unhealthy + self.health_runtime + .me_health_interval_ms_unhealthy .load(Ordering::Relaxed) .max(1), ) @@ -1750,13 +1776,19 @@ impl MePool { pub(super) fn health_interval_healthy(&self) -> Duration { Duration::from_millis( - self.me_health_interval_ms_healthy + self.health_runtime + .me_health_interval_ms_healthy .load(Ordering::Relaxed) .max(1), ) } pub(super) fn warn_rate_limit_duration(&self) -> Duration { - Duration::from_millis(self.me_warn_rate_limit_ms.load(Ordering::Relaxed).max(1)) + Duration::from_millis( + self.health_runtime + .me_warn_rate_limit_ms + .load(Ordering::Relaxed) + .max(1), + ) } } From 41493462a131cbb19133986780f0c62e8504647f Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:29:22 +0300 Subject: [PATCH 10/17] Drain + Single-Endpoint Runtime Cores Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/transport/middle_proxy/health.rs | 7 +- src/transport/middle_proxy/pool.rs | 250 +++++++++++++--------- src/transport/middle_proxy/pool_reinit.rs | 3 +- src/transport/middle_proxy/pool_status.rs | 24 ++- 4 files changed, 180 insertions(+), 104 deletions(-) diff --git a/src/transport/middle_proxy/health.rs b/src/transport/middle_proxy/health.rs index 
9e0933f..29a6f7d 100644 --- a/src/transport/middle_proxy/health.rs +++ b/src/transport/middle_proxy/health.rs @@ -137,9 +137,11 @@ pub(super) async fn reap_draining_writers( let now_epoch_secs = MePool::now_epoch_secs(); let now = Instant::now(); let drain_ttl_secs = pool + .drain_runtime .me_pool_drain_ttl_secs .load(std::sync::atomic::Ordering::Relaxed); let drain_threshold = pool + .drain_runtime .me_pool_drain_threshold .load(std::sync::atomic::Ordering::Relaxed); let activity = pool.registry.writer_activity_snapshot().await; @@ -223,7 +225,10 @@ pub(super) async fn reap_draining_writers( endpoint = %writer.addr, generation = writer.generation, drain_ttl_secs, - force_close_secs = pool.me_pool_force_close_secs.load(std::sync::atomic::Ordering::Relaxed), + force_close_secs = pool + .drain_runtime + .me_pool_force_close_secs + .load(std::sync::atomic::Ordering::Relaxed), allow_drain_fallback = writer.allow_drain_fallback, "ME draining writer remains non-empty past drain TTL" ); diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index 0146e78..6d51bc8 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -304,6 +304,32 @@ pub(super) struct HealthRuntimeCore { pub(super) family_health_v6: ArcSwap, } +pub(super) struct DrainRuntimeCore { + pub(super) me_pool_drain_ttl_secs: AtomicU64, + pub(super) me_instadrain: AtomicBool, + pub(super) me_pool_drain_threshold: AtomicU64, + pub(super) me_pool_drain_soft_evict_enabled: AtomicBool, + pub(super) me_pool_drain_soft_evict_grace_secs: AtomicU64, + pub(super) me_pool_drain_soft_evict_per_writer: AtomicU8, + pub(super) me_pool_drain_soft_evict_budget_per_core: AtomicU32, + pub(super) me_pool_drain_soft_evict_cooldown_ms: AtomicU64, + pub(super) me_pool_force_close_secs: AtomicU64, + pub(super) me_pool_min_fresh_ratio_permille: AtomicU32, + pub(super) me_last_drain_gate_route_quorum_ok: AtomicBool, + pub(super) me_last_drain_gate_redundancy_ok: 
AtomicBool, + pub(super) me_last_drain_gate_block_reason: AtomicU8, + pub(super) me_last_drain_gate_updated_at_epoch_secs: AtomicU64, +} + +pub(super) struct SingleEndpointRuntimeCore { + pub(super) me_single_endpoint_shadow_writers: AtomicU8, + pub(super) me_single_endpoint_outage_mode_enabled: AtomicBool, + pub(super) me_single_endpoint_outage_disable_quarantine: AtomicBool, + pub(super) me_single_endpoint_outage_backoff_min_ms: AtomicU64, + pub(super) me_single_endpoint_outage_backoff_max_ms: AtomicU64, + pub(super) me_single_endpoint_shadow_rotate_every_secs: AtomicU64, +} + #[allow(dead_code)] pub struct MePool { pub(super) routing: Arc, @@ -311,6 +337,8 @@ pub struct MePool { pub(super) writer_lifecycle: Arc, pub(super) route_runtime: Arc, pub(super) health_runtime: Arc, + pub(super) drain_runtime: Arc, + pub(super) single_endpoint_runtime: Arc, pub(super) decision: NetworkDecision, pub(super) upstream: Option>, pub(super) rng: Arc, @@ -336,12 +364,6 @@ pub struct MePool { pub(super) me_reconnect_backoff_base: Duration, pub(super) me_reconnect_backoff_cap: Duration, pub(super) me_reconnect_fast_retry_count: u32, - pub(super) me_single_endpoint_shadow_writers: AtomicU8, - pub(super) me_single_endpoint_outage_mode_enabled: AtomicBool, - pub(super) me_single_endpoint_outage_disable_quarantine: AtomicBool, - pub(super) me_single_endpoint_outage_backoff_min_ms: AtomicU64, - pub(super) me_single_endpoint_outage_backoff_max_ms: AtomicU64, - pub(super) me_single_endpoint_shadow_rotate_every_secs: AtomicU64, pub(super) me_floor_mode: AtomicU8, pub(super) me_adaptive_floor_idle_secs: AtomicU64, pub(super) me_adaptive_floor_min_writers_single_endpoint: AtomicU8, @@ -382,16 +404,6 @@ pub struct MePool { pub(super) stats: Arc, pub(super) endpoint_quarantine: Arc>>, pub(super) kdf_material_fingerprint: Arc>>, - pub(super) me_pool_drain_ttl_secs: AtomicU64, - pub(super) me_instadrain: AtomicBool, - pub(super) me_pool_drain_threshold: AtomicU64, - pub(super) 
me_pool_drain_soft_evict_enabled: AtomicBool, - pub(super) me_pool_drain_soft_evict_grace_secs: AtomicU64, - pub(super) me_pool_drain_soft_evict_per_writer: AtomicU8, - pub(super) me_pool_drain_soft_evict_budget_per_core: AtomicU32, - pub(super) me_pool_drain_soft_evict_cooldown_ms: AtomicU64, - pub(super) me_pool_force_close_secs: AtomicU64, - pub(super) me_pool_min_fresh_ratio_permille: AtomicU32, pub(super) me_hardswap_warmup_delay_min_ms: AtomicU64, pub(super) me_hardswap_warmup_delay_max_ms: AtomicU64, pub(super) me_hardswap_warmup_extra_passes: AtomicU32, @@ -404,10 +416,6 @@ pub struct MePool { pub(super) me_writer_pick_sample_size: AtomicU8, pub(super) me_socks_kdf_policy: AtomicU8, pub(super) me_reader_route_data_wait_ms: Arc, - pub(super) me_last_drain_gate_route_quorum_ok: AtomicBool, - pub(super) me_last_drain_gate_redundancy_ok: AtomicBool, - pub(super) me_last_drain_gate_block_reason: AtomicU8, - pub(super) me_last_drain_gate_updated_at_epoch_secs: AtomicU64, pub(super) runtime_ready: AtomicBool, pool_size: usize, } @@ -612,6 +620,52 @@ impl MePool { 0, )), }), + drain_runtime: Arc::new(DrainRuntimeCore { + me_pool_drain_ttl_secs: AtomicU64::new(me_pool_drain_ttl_secs), + me_instadrain: AtomicBool::new(me_instadrain), + me_pool_drain_threshold: AtomicU64::new(me_pool_drain_threshold), + me_pool_drain_soft_evict_enabled: AtomicBool::new(me_pool_drain_soft_evict_enabled), + me_pool_drain_soft_evict_grace_secs: AtomicU64::new( + me_pool_drain_soft_evict_grace_secs, + ), + me_pool_drain_soft_evict_per_writer: AtomicU8::new( + me_pool_drain_soft_evict_per_writer.max(1), + ), + me_pool_drain_soft_evict_budget_per_core: AtomicU32::new( + me_pool_drain_soft_evict_budget_per_core.max(1) as u32, + ), + me_pool_drain_soft_evict_cooldown_ms: AtomicU64::new( + me_pool_drain_soft_evict_cooldown_ms.max(1), + ), + me_pool_force_close_secs: AtomicU64::new(Self::normalize_force_close_secs( + me_pool_force_close_secs, + )), + me_pool_min_fresh_ratio_permille: 
AtomicU32::new(Self::ratio_to_permille( + me_pool_min_fresh_ratio, + )), + me_last_drain_gate_route_quorum_ok: AtomicBool::new(false), + me_last_drain_gate_redundancy_ok: AtomicBool::new(false), + me_last_drain_gate_block_reason: AtomicU8::new(MeDrainGateReason::Open as u8), + me_last_drain_gate_updated_at_epoch_secs: AtomicU64::new(now_epoch_secs), + }), + single_endpoint_runtime: Arc::new(SingleEndpointRuntimeCore { + me_single_endpoint_shadow_writers: AtomicU8::new(me_single_endpoint_shadow_writers), + me_single_endpoint_outage_mode_enabled: AtomicBool::new( + me_single_endpoint_outage_mode_enabled, + ), + me_single_endpoint_outage_disable_quarantine: AtomicBool::new( + me_single_endpoint_outage_disable_quarantine, + ), + me_single_endpoint_outage_backoff_min_ms: AtomicU64::new( + me_single_endpoint_outage_backoff_min_ms, + ), + me_single_endpoint_outage_backoff_max_ms: AtomicU64::new( + me_single_endpoint_outage_backoff_max_ms, + ), + me_single_endpoint_shadow_rotate_every_secs: AtomicU64::new( + me_single_endpoint_shadow_rotate_every_secs, + ), + }), decision, upstream, rng, @@ -651,22 +705,6 @@ impl MePool { me_reconnect_backoff_base: Duration::from_millis(me_reconnect_backoff_base_ms), me_reconnect_backoff_cap: Duration::from_millis(me_reconnect_backoff_cap_ms), me_reconnect_fast_retry_count, - me_single_endpoint_shadow_writers: AtomicU8::new(me_single_endpoint_shadow_writers), - me_single_endpoint_outage_mode_enabled: AtomicBool::new( - me_single_endpoint_outage_mode_enabled, - ), - me_single_endpoint_outage_disable_quarantine: AtomicBool::new( - me_single_endpoint_outage_disable_quarantine, - ), - me_single_endpoint_outage_backoff_min_ms: AtomicU64::new( - me_single_endpoint_outage_backoff_min_ms, - ), - me_single_endpoint_outage_backoff_max_ms: AtomicU64::new( - me_single_endpoint_outage_backoff_max_ms, - ), - me_single_endpoint_shadow_rotate_every_secs: AtomicU64::new( - me_single_endpoint_shadow_rotate_every_secs, - ), me_floor_mode: 
AtomicU8::new(me_floor_mode.as_u8()), me_adaptive_floor_idle_secs: AtomicU64::new(me_adaptive_floor_idle_secs), me_adaptive_floor_min_writers_single_endpoint: AtomicU8::new( @@ -729,28 +767,6 @@ impl MePool { draining_active_runtime: AtomicU64::new(0), endpoint_quarantine: Arc::new(Mutex::new(HashMap::new())), kdf_material_fingerprint: Arc::new(RwLock::new(HashMap::new())), - me_pool_drain_ttl_secs: AtomicU64::new(me_pool_drain_ttl_secs), - me_instadrain: AtomicBool::new(me_instadrain), - me_pool_drain_threshold: AtomicU64::new(me_pool_drain_threshold), - me_pool_drain_soft_evict_enabled: AtomicBool::new(me_pool_drain_soft_evict_enabled), - me_pool_drain_soft_evict_grace_secs: AtomicU64::new( - me_pool_drain_soft_evict_grace_secs, - ), - me_pool_drain_soft_evict_per_writer: AtomicU8::new( - me_pool_drain_soft_evict_per_writer.max(1), - ), - me_pool_drain_soft_evict_budget_per_core: AtomicU32::new( - me_pool_drain_soft_evict_budget_per_core.max(1) as u32, - ), - me_pool_drain_soft_evict_cooldown_ms: AtomicU64::new( - me_pool_drain_soft_evict_cooldown_ms.max(1), - ), - me_pool_force_close_secs: AtomicU64::new(Self::normalize_force_close_secs( - me_pool_force_close_secs, - )), - me_pool_min_fresh_ratio_permille: AtomicU32::new(Self::ratio_to_permille( - me_pool_min_fresh_ratio, - )), me_hardswap_warmup_delay_min_ms: AtomicU64::new(me_hardswap_warmup_delay_min_ms), me_hardswap_warmup_delay_max_ms: AtomicU64::new(me_hardswap_warmup_delay_max_ms), me_hardswap_warmup_extra_passes: AtomicU32::new(me_hardswap_warmup_extra_passes as u32), @@ -765,10 +781,6 @@ impl MePool { me_writer_pick_sample_size: AtomicU8::new(me_writer_pick_sample_size.clamp(2, 4)), me_socks_kdf_policy: AtomicU8::new(me_socks_kdf_policy.as_u8()), me_reader_route_data_wait_ms: Arc::new(AtomicU64::new(me_reader_route_data_wait_ms)), - me_last_drain_gate_route_quorum_ok: AtomicBool::new(false), - me_last_drain_gate_redundancy_ok: AtomicBool::new(false), - me_last_drain_gate_block_reason: 
AtomicU8::new(MeDrainGateReason::Open as u8), - me_last_drain_gate_updated_at_epoch_secs: AtomicU64::new(now_epoch_secs), runtime_ready: AtomicBool::new(false), }) } @@ -899,32 +911,43 @@ impl MePool { block_reason: MeDrainGateReason, updated_at_epoch_secs: u64, ) { - self.me_last_drain_gate_route_quorum_ok + self.drain_runtime + .me_last_drain_gate_route_quorum_ok .store(route_quorum_ok, Ordering::Relaxed); - self.me_last_drain_gate_redundancy_ok + self.drain_runtime + .me_last_drain_gate_redundancy_ok .store(redundancy_ok, Ordering::Relaxed); - self.me_last_drain_gate_block_reason + self.drain_runtime + .me_last_drain_gate_block_reason .store(block_reason as u8, Ordering::Relaxed); - self.me_last_drain_gate_updated_at_epoch_secs + self.drain_runtime + .me_last_drain_gate_updated_at_epoch_secs .store(updated_at_epoch_secs, Ordering::Relaxed); } pub(crate) fn last_drain_gate_route_quorum_ok(&self) -> bool { - self.me_last_drain_gate_route_quorum_ok + self.drain_runtime + .me_last_drain_gate_route_quorum_ok .load(Ordering::Relaxed) } pub(crate) fn last_drain_gate_redundancy_ok(&self) -> bool { - self.me_last_drain_gate_redundancy_ok + self.drain_runtime + .me_last_drain_gate_redundancy_ok .load(Ordering::Relaxed) } pub(crate) fn last_drain_gate_block_reason(&self) -> MeDrainGateReason { - MeDrainGateReason::from_u8(self.me_last_drain_gate_block_reason.load(Ordering::Relaxed)) + MeDrainGateReason::from_u8( + self.drain_runtime + .me_last_drain_gate_block_reason + .load(Ordering::Relaxed), + ) } pub(crate) fn last_drain_gate_updated_at_epoch_secs(&self) -> u64 { - self.me_last_drain_gate_updated_at_epoch_secs + self.drain_runtime + .me_last_drain_gate_updated_at_epoch_secs .load(Ordering::Relaxed) } @@ -975,31 +998,42 @@ impl MePool { me_warn_rate_limit_ms: u64, ) { self.reinit.hardswap.store(hardswap, Ordering::Relaxed); - self.me_pool_drain_ttl_secs + self.drain_runtime + .me_pool_drain_ttl_secs .store(drain_ttl_secs, Ordering::Relaxed); - 
self.me_instadrain.store(instadrain, Ordering::Relaxed); - self.me_pool_drain_threshold + self.drain_runtime + .me_instadrain + .store(instadrain, Ordering::Relaxed); + self.drain_runtime + .me_pool_drain_threshold .store(pool_drain_threshold, Ordering::Relaxed); // Runtime soft-evict knobs are updated lock-free to keep control-plane // writes non-blocking; readers observe a short eventual-consistency // window by design. - self.me_pool_drain_soft_evict_enabled + self.drain_runtime + .me_pool_drain_soft_evict_enabled .store(pool_drain_soft_evict_enabled, Ordering::Relaxed); - self.me_pool_drain_soft_evict_grace_secs + self.drain_runtime + .me_pool_drain_soft_evict_grace_secs .store(pool_drain_soft_evict_grace_secs, Ordering::Relaxed); - self.me_pool_drain_soft_evict_per_writer + self.drain_runtime + .me_pool_drain_soft_evict_per_writer .store(pool_drain_soft_evict_per_writer.max(1), Ordering::Relaxed); - self.me_pool_drain_soft_evict_budget_per_core.store( - pool_drain_soft_evict_budget_per_core.max(1) as u32, - Ordering::Relaxed, - ); - self.me_pool_drain_soft_evict_cooldown_ms + self.drain_runtime + .me_pool_drain_soft_evict_budget_per_core + .store( + pool_drain_soft_evict_budget_per_core.max(1) as u32, + Ordering::Relaxed, + ); + self.drain_runtime + .me_pool_drain_soft_evict_cooldown_ms .store(pool_drain_soft_evict_cooldown_ms.max(1), Ordering::Relaxed); - self.me_pool_force_close_secs.store( + self.drain_runtime.me_pool_force_close_secs.store( Self::normalize_force_close_secs(force_close_secs), Ordering::Relaxed, ); - self.me_pool_min_fresh_ratio_permille + self.drain_runtime + .me_pool_min_fresh_ratio_permille .store(Self::ratio_to_permille(min_fresh_ratio), Ordering::Relaxed); self.me_hardswap_warmup_delay_min_ms .store(hardswap_warmup_delay_min_ms, Ordering::Relaxed); @@ -1025,17 +1059,23 @@ impl MePool { if previous_writer_pick_mode != writer_pick_mode { self.stats.increment_me_writer_pick_mode_switch_total(); } - self.me_single_endpoint_shadow_writers + 
self.single_endpoint_runtime + .me_single_endpoint_shadow_writers .store(single_endpoint_shadow_writers, Ordering::Relaxed); - self.me_single_endpoint_outage_mode_enabled + self.single_endpoint_runtime + .me_single_endpoint_outage_mode_enabled .store(single_endpoint_outage_mode_enabled, Ordering::Relaxed); - self.me_single_endpoint_outage_disable_quarantine + self.single_endpoint_runtime + .me_single_endpoint_outage_disable_quarantine .store(single_endpoint_outage_disable_quarantine, Ordering::Relaxed); - self.me_single_endpoint_outage_backoff_min_ms + self.single_endpoint_runtime + .me_single_endpoint_outage_backoff_min_ms .store(single_endpoint_outage_backoff_min_ms, Ordering::Relaxed); - self.me_single_endpoint_outage_backoff_max_ms + self.single_endpoint_runtime + .me_single_endpoint_outage_backoff_max_ms .store(single_endpoint_outage_backoff_max_ms, Ordering::Relaxed); - self.me_single_endpoint_shadow_rotate_every_secs + self.single_endpoint_runtime + .me_single_endpoint_shadow_rotate_every_secs .store(single_endpoint_shadow_rotate_every_secs, Ordering::Relaxed); let previous_floor_mode = self.floor_mode(); self.me_floor_mode @@ -1148,33 +1188,40 @@ impl MePool { } pub(super) fn force_close_timeout(&self) -> Option { - let secs = - Self::normalize_force_close_secs(self.me_pool_force_close_secs.load(Ordering::Relaxed)); + let secs = Self::normalize_force_close_secs( + self.drain_runtime + .me_pool_force_close_secs + .load(Ordering::Relaxed), + ); Some(Duration::from_secs(secs)) } #[allow(dead_code)] pub(super) fn drain_soft_evict_enabled(&self) -> bool { - self.me_pool_drain_soft_evict_enabled + self.drain_runtime + .me_pool_drain_soft_evict_enabled .load(Ordering::Relaxed) } #[allow(dead_code)] pub(super) fn drain_soft_evict_grace_secs(&self) -> u64 { - self.me_pool_drain_soft_evict_grace_secs + self.drain_runtime + .me_pool_drain_soft_evict_grace_secs .load(Ordering::Relaxed) } #[allow(dead_code)] pub(super) fn drain_soft_evict_per_writer(&self) -> usize { - 
self.me_pool_drain_soft_evict_per_writer + self.drain_runtime + .me_pool_drain_soft_evict_per_writer .load(Ordering::Relaxed) .max(1) as usize } #[allow(dead_code)] pub(super) fn drain_soft_evict_budget_per_core(&self) -> usize { - self.me_pool_drain_soft_evict_budget_per_core + self.drain_runtime + .me_pool_drain_soft_evict_budget_per_core .load(Ordering::Relaxed) .max(1) as usize } @@ -1182,7 +1229,8 @@ impl MePool { #[allow(dead_code)] pub(super) fn drain_soft_evict_cooldown(&self) -> Duration { Duration::from_millis( - self.me_pool_drain_soft_evict_cooldown_ms + self.drain_runtime + .me_pool_drain_soft_evict_cooldown_ms .load(Ordering::Relaxed) .max(1), ) @@ -1265,6 +1313,7 @@ impl MePool { } if endpoint_count == 1 { let shadow = self + .single_endpoint_runtime .me_single_endpoint_shadow_writers .load(Ordering::Relaxed) as usize; return (1 + shadow).max(3); @@ -1533,20 +1582,24 @@ impl MePool { } pub(super) fn single_endpoint_outage_mode_enabled(&self) -> bool { - self.me_single_endpoint_outage_mode_enabled + self.single_endpoint_runtime + .me_single_endpoint_outage_mode_enabled .load(Ordering::Relaxed) } pub(super) fn single_endpoint_outage_disable_quarantine(&self) -> bool { - self.me_single_endpoint_outage_disable_quarantine + self.single_endpoint_runtime + .me_single_endpoint_outage_disable_quarantine .load(Ordering::Relaxed) } pub(super) fn single_endpoint_outage_backoff_bounds_ms(&self) -> (u64, u64) { let min_ms = self + .single_endpoint_runtime .me_single_endpoint_outage_backoff_min_ms .load(Ordering::Relaxed); let max_ms = self + .single_endpoint_runtime .me_single_endpoint_outage_backoff_max_ms .load(Ordering::Relaxed); if min_ms <= max_ms { @@ -1558,6 +1611,7 @@ impl MePool { pub(super) fn single_endpoint_shadow_rotate_interval(&self) -> Option { let secs = self + .single_endpoint_runtime .me_single_endpoint_shadow_rotate_every_secs .load(Ordering::Relaxed); if secs == 0 { diff --git a/src/transport/middle_proxy/pool_reinit.rs 
b/src/transport/middle_proxy/pool_reinit.rs index be08649..2bf2e5b 100644 --- a/src/transport/middle_proxy/pool_reinit.rs +++ b/src/transport/middle_proxy/pool_reinit.rs @@ -449,7 +449,8 @@ impl MePool { .map(|w| (w.writer_dc, w.addr)) .collect(); let min_ratio = Self::permille_to_ratio( - self.me_pool_min_fresh_ratio_permille + self.drain_runtime + .me_pool_min_fresh_ratio_permille .load(Ordering::Relaxed), ); let (coverage_ratio, missing_dc) = diff --git a/src/transport/middle_proxy/pool_status.rs b/src/transport/middle_proxy/pool_status.rs index 7ee4422..d8d4a39 100644 --- a/src/transport/middle_proxy/pool_status.rs +++ b/src/transport/middle_proxy/pool_status.rs @@ -224,7 +224,10 @@ impl MePool { pub(crate) async fn api_status_snapshot(&self) -> MeApiStatusSnapshot { let now_epoch_secs = Self::now_epoch_secs(); let active_generation = self.current_generation(); - let drain_ttl_secs = self.me_pool_drain_ttl_secs.load(Ordering::Relaxed); + let drain_ttl_secs = self + .drain_runtime + .me_pool_drain_ttl_secs + .load(Ordering::Relaxed); let mut endpoints_by_dc = BTreeMap::>::new(); if self.decision.ipv4_me { @@ -570,30 +573,43 @@ impl MePool { me_reconnect_backoff_base_ms: self.me_reconnect_backoff_base.as_millis() as u64, me_reconnect_backoff_cap_ms: self.me_reconnect_backoff_cap.as_millis() as u64, me_reconnect_fast_retry_count: self.me_reconnect_fast_retry_count, - me_pool_drain_ttl_secs: self.me_pool_drain_ttl_secs.load(Ordering::Relaxed), - me_pool_force_close_secs: self.me_pool_force_close_secs.load(Ordering::Relaxed), + me_pool_drain_ttl_secs: self + .drain_runtime + .me_pool_drain_ttl_secs + .load(Ordering::Relaxed), + me_pool_force_close_secs: self + .drain_runtime + .me_pool_force_close_secs + .load(Ordering::Relaxed), me_pool_min_fresh_ratio: Self::permille_to_ratio( - self.me_pool_min_fresh_ratio_permille + self.drain_runtime + .me_pool_min_fresh_ratio_permille .load(Ordering::Relaxed), ), me_bind_stale_mode: 
bind_stale_mode_label(self.bind_stale_mode()), me_bind_stale_ttl_secs: self.me_bind_stale_ttl_secs.load(Ordering::Relaxed), me_single_endpoint_shadow_writers: self + .single_endpoint_runtime .me_single_endpoint_shadow_writers .load(Ordering::Relaxed), me_single_endpoint_outage_mode_enabled: self + .single_endpoint_runtime .me_single_endpoint_outage_mode_enabled .load(Ordering::Relaxed), me_single_endpoint_outage_disable_quarantine: self + .single_endpoint_runtime .me_single_endpoint_outage_disable_quarantine .load(Ordering::Relaxed), me_single_endpoint_outage_backoff_min_ms: self + .single_endpoint_runtime .me_single_endpoint_outage_backoff_min_ms .load(Ordering::Relaxed), me_single_endpoint_outage_backoff_max_ms: self + .single_endpoint_runtime .me_single_endpoint_outage_backoff_max_ms .load(Ordering::Relaxed), me_single_endpoint_shadow_rotate_every_secs: self + .single_endpoint_runtime .me_single_endpoint_shadow_rotate_every_secs .load(Ordering::Relaxed), me_deterministic_writer_sort: self.me_deterministic_writer_sort.load(Ordering::Relaxed), From 7ce5fc66db4900fe25b1251554c80b2095a6b9c6 Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:35:57 +0300 Subject: [PATCH 11/17] ME Reinit Core advancing + Binding Policy Core Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/transport/middle_proxy/pool.rs | 60 +++++++++++++++-------- src/transport/middle_proxy/pool_reinit.rs | 12 ++++- src/transport/middle_proxy/pool_status.rs | 5 +- src/transport/middle_proxy/pool_writer.rs | 5 +- 4 files changed, 57 insertions(+), 25 deletions(-) diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index 6d51bc8..916f39c 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -273,6 +273,10 @@ pub(super) struct ReinitCore { pub(super) pending_hardswap_started_at_epoch_secs: AtomicU64, pub(super) pending_hardswap_map_hash: AtomicU64, 
pub(super) hardswap: AtomicBool, + pub(super) me_hardswap_warmup_delay_min_ms: AtomicU64, + pub(super) me_hardswap_warmup_delay_max_ms: AtomicU64, + pub(super) me_hardswap_warmup_extra_passes: AtomicU32, + pub(super) me_hardswap_warmup_pass_backoff_base_ms: AtomicU64, } pub(super) struct WriterLifecycleCore { @@ -330,6 +334,11 @@ pub(super) struct SingleEndpointRuntimeCore { pub(super) me_single_endpoint_shadow_rotate_every_secs: AtomicU64, } +pub(super) struct BindingPolicyCore { + pub(super) me_bind_stale_mode: AtomicU8, + pub(super) me_bind_stale_ttl_secs: AtomicU64, +} + #[allow(dead_code)] pub struct MePool { pub(super) routing: Arc, @@ -339,6 +348,7 @@ pub struct MePool { pub(super) health_runtime: Arc, pub(super) drain_runtime: Arc, pub(super) single_endpoint_runtime: Arc, + pub(super) binding_policy: Arc, pub(super) decision: NetworkDecision, pub(super) upstream: Option>, pub(super) rng: Arc, @@ -404,12 +414,6 @@ pub struct MePool { pub(super) stats: Arc, pub(super) endpoint_quarantine: Arc>>, pub(super) kdf_material_fingerprint: Arc>>, - pub(super) me_hardswap_warmup_delay_min_ms: AtomicU64, - pub(super) me_hardswap_warmup_delay_max_ms: AtomicU64, - pub(super) me_hardswap_warmup_extra_passes: AtomicU32, - pub(super) me_hardswap_warmup_pass_backoff_base_ms: AtomicU64, - pub(super) me_bind_stale_mode: AtomicU8, - pub(super) me_bind_stale_ttl_secs: AtomicU64, pub(super) secret_atomic_snapshot: AtomicBool, pub(super) me_deterministic_writer_sort: AtomicBool, pub(super) me_writer_pick_mode: AtomicU8, @@ -575,6 +579,14 @@ impl MePool { pending_hardswap_started_at_epoch_secs: AtomicU64::new(0), pending_hardswap_map_hash: AtomicU64::new(0), hardswap: AtomicBool::new(hardswap), + me_hardswap_warmup_delay_min_ms: AtomicU64::new(me_hardswap_warmup_delay_min_ms), + me_hardswap_warmup_delay_max_ms: AtomicU64::new(me_hardswap_warmup_delay_max_ms), + me_hardswap_warmup_extra_passes: AtomicU32::new( + me_hardswap_warmup_extra_passes as u32, + ), + 
me_hardswap_warmup_pass_backoff_base_ms: AtomicU64::new( + me_hardswap_warmup_pass_backoff_base_ms, + ), }), writer_lifecycle: Arc::new(WriterLifecycleCore { me_keepalive_enabled, @@ -666,6 +678,10 @@ impl MePool { me_single_endpoint_shadow_rotate_every_secs, ), }), + binding_policy: Arc::new(BindingPolicyCore { + me_bind_stale_mode: AtomicU8::new(me_bind_stale_mode.as_u8()), + me_bind_stale_ttl_secs: AtomicU64::new(me_bind_stale_ttl_secs), + }), decision, upstream, rng, @@ -767,14 +783,6 @@ impl MePool { draining_active_runtime: AtomicU64::new(0), endpoint_quarantine: Arc::new(Mutex::new(HashMap::new())), kdf_material_fingerprint: Arc::new(RwLock::new(HashMap::new())), - me_hardswap_warmup_delay_min_ms: AtomicU64::new(me_hardswap_warmup_delay_min_ms), - me_hardswap_warmup_delay_max_ms: AtomicU64::new(me_hardswap_warmup_delay_max_ms), - me_hardswap_warmup_extra_passes: AtomicU32::new(me_hardswap_warmup_extra_passes as u32), - me_hardswap_warmup_pass_backoff_base_ms: AtomicU64::new( - me_hardswap_warmup_pass_backoff_base_ms, - ), - me_bind_stale_mode: AtomicU8::new(me_bind_stale_mode.as_u8()), - me_bind_stale_ttl_secs: AtomicU64::new(me_bind_stale_ttl_secs), secret_atomic_snapshot: AtomicBool::new(me_secret_atomic_snapshot), me_deterministic_writer_sort: AtomicBool::new(me_deterministic_writer_sort), me_writer_pick_mode: AtomicU8::new(me_writer_pick_mode.as_u8()), @@ -1035,17 +1043,23 @@ impl MePool { self.drain_runtime .me_pool_min_fresh_ratio_permille .store(Self::ratio_to_permille(min_fresh_ratio), Ordering::Relaxed); - self.me_hardswap_warmup_delay_min_ms + self.reinit + .me_hardswap_warmup_delay_min_ms .store(hardswap_warmup_delay_min_ms, Ordering::Relaxed); - self.me_hardswap_warmup_delay_max_ms + self.reinit + .me_hardswap_warmup_delay_max_ms .store(hardswap_warmup_delay_max_ms, Ordering::Relaxed); - self.me_hardswap_warmup_extra_passes + self.reinit + .me_hardswap_warmup_extra_passes .store(hardswap_warmup_extra_passes as u32, Ordering::Relaxed); - 
self.me_hardswap_warmup_pass_backoff_base_ms + self.reinit + .me_hardswap_warmup_pass_backoff_base_ms .store(hardswap_warmup_pass_backoff_base_ms, Ordering::Relaxed); - self.me_bind_stale_mode + self.binding_policy + .me_bind_stale_mode .store(bind_stale_mode.as_u8(), Ordering::Relaxed); - self.me_bind_stale_ttl_secs + self.binding_policy + .me_bind_stale_ttl_secs .store(bind_stale_ttl_secs, Ordering::Relaxed); self.secret_atomic_snapshot .store(secret_atomic_snapshot, Ordering::Relaxed); @@ -1294,7 +1308,11 @@ impl MePool { } pub(super) fn bind_stale_mode(&self) -> MeBindStaleMode { - MeBindStaleMode::from_u8(self.me_bind_stale_mode.load(Ordering::Relaxed)) + MeBindStaleMode::from_u8( + self.binding_policy + .me_bind_stale_mode + .load(Ordering::Relaxed), + ) } pub(super) fn writer_pick_mode(&self) -> MeWriterPickMode { diff --git a/src/transport/middle_proxy/pool_reinit.rs b/src/transport/middle_proxy/pool_reinit.rs index 2bf2e5b..009f850 100644 --- a/src/transport/middle_proxy/pool_reinit.rs +++ b/src/transport/middle_proxy/pool_reinit.rs @@ -189,8 +189,14 @@ impl MePool { } fn hardswap_warmup_connect_delay_ms(&self) -> u64 { - let min_ms = self.me_hardswap_warmup_delay_min_ms.load(Ordering::Relaxed); - let max_ms = self.me_hardswap_warmup_delay_max_ms.load(Ordering::Relaxed); + let min_ms = self + .reinit + .me_hardswap_warmup_delay_min_ms + .load(Ordering::Relaxed); + let max_ms = self + .reinit + .me_hardswap_warmup_delay_max_ms + .load(Ordering::Relaxed); let (min_ms, max_ms) = if min_ms <= max_ms { (min_ms, max_ms) } else { @@ -204,6 +210,7 @@ impl MePool { fn hardswap_warmup_backoff_ms(&self, pass_idx: usize) -> u64 { let base_ms = self + .reinit .me_hardswap_warmup_pass_backoff_base_ms .load(Ordering::Relaxed); let cap_ms = (self.me_reconnect_backoff_cap.as_millis() as u64).max(base_ms); @@ -249,6 +256,7 @@ impl MePool { desired_by_dc: &HashMap>, ) { let extra_passes = self + .reinit .me_hardswap_warmup_extra_passes .load(Ordering::Relaxed) .min(10) as 
usize; diff --git a/src/transport/middle_proxy/pool_status.rs b/src/transport/middle_proxy/pool_status.rs index d8d4a39..6cd5cec 100644 --- a/src/transport/middle_proxy/pool_status.rs +++ b/src/transport/middle_proxy/pool_status.rs @@ -587,7 +587,10 @@ impl MePool { .load(Ordering::Relaxed), ), me_bind_stale_mode: bind_stale_mode_label(self.bind_stale_mode()), - me_bind_stale_ttl_secs: self.me_bind_stale_ttl_secs.load(Ordering::Relaxed), + me_bind_stale_ttl_secs: self + .binding_policy + .me_bind_stale_ttl_secs + .load(Ordering::Relaxed), me_single_endpoint_shadow_writers: self .single_endpoint_runtime .me_single_endpoint_shadow_writers diff --git a/src/transport/middle_proxy/pool_writer.rs b/src/transport/middle_proxy/pool_writer.rs index f697257..c1f3de9 100644 --- a/src/transport/middle_proxy/pool_writer.rs +++ b/src/transport/middle_proxy/pool_writer.rs @@ -681,7 +681,10 @@ impl MePool { MeBindStaleMode::Never => false, MeBindStaleMode::Always => true, MeBindStaleMode::Ttl => { - let ttl_secs = self.me_bind_stale_ttl_secs.load(Ordering::Relaxed); + let ttl_secs = self + .binding_policy + .me_bind_stale_ttl_secs + .load(Ordering::Relaxed); if ttl_secs == 0 { return true; } From ceae1564afe154cb713b35cd6c14475eb0f28225 Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 20:55:20 +0300 Subject: [PATCH 12/17] Floor Runtime + Writer Selection Policy + Reconnect/Warmup + TransportPolicy + NAT Runtime Cores Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/proxy/middle_relay.rs | 4 +- src/stats/mod.rs | 3 +- src/transport/middle_proxy/handshake.rs | 11 +- src/transport/middle_proxy/health.rs | 36 +- src/transport/middle_proxy/pool.rs | 475 +++++++++++------- src/transport/middle_proxy/pool_init.rs | 15 +- src/transport/middle_proxy/pool_nat.rs | 64 ++- src/transport/middle_proxy/pool_refill.rs | 2 +- src/transport/middle_proxy/pool_reinit.rs | 16 +- .../middle_proxy/pool_runtime_api.rs 
| 16 +- src/transport/middle_proxy/pool_status.rs | 55 +- src/transport/middle_proxy/pool_writer.rs | 21 +- src/transport/middle_proxy/registry.rs | 25 +- src/transport/middle_proxy/send.rs | 55 +- 14 files changed, 502 insertions(+), 296 deletions(-) diff --git a/src/proxy/middle_relay.rs b/src/proxy/middle_relay.rs index 6c7aef9..e8fc52a 100644 --- a/src/proxy/middle_relay.rs +++ b/src/proxy/middle_relay.rs @@ -323,8 +323,8 @@ fn should_emit_full_desync(key: u64, all_full: bool, now: Instant) -> bool { let dedup_current = DESYNC_DEDUP.get_or_init(DashMap::new); let dedup_previous = DESYNC_DEDUP_PREVIOUS.get_or_init(DashMap::new); - let rotation_state = DESYNC_DEDUP_ROTATION_STATE - .get_or_init(|| Mutex::new(DesyncDedupRotationState::default())); + let rotation_state = + DESYNC_DEDUP_ROTATION_STATE.get_or_init(|| Mutex::new(DesyncDedupRotationState::default())); let mut state = match rotation_state.lock() { Ok(guard) => guard, diff --git a/src/stats/mod.rs b/src/stats/mod.rs index 9cba3e8..4144f82 100644 --- a/src/stats/mod.rs +++ b/src/stats/mod.rs @@ -1206,8 +1206,7 @@ impl Stats { } pub fn increment_me_hybrid_timeout_total(&self) { if self.telemetry_me_allows_normal() { - self.me_hybrid_timeout_total - .fetch_add(1, Ordering::Relaxed); + self.me_hybrid_timeout_total.fetch_add(1, Ordering::Relaxed); } } pub fn increment_me_async_recovery_trigger_total(&self) { diff --git a/src/transport/middle_proxy/handshake.rs b/src/transport/middle_proxy/handshake.rs index b6eff37..01206e2 100644 --- a/src/transport/middle_proxy/handshake.rs +++ b/src/transport/middle_proxy/handshake.rs @@ -161,7 +161,7 @@ impl MePool { } else { let connect_fut = async { if addr.is_ipv6() - && let Some(v6) = self.detected_ipv6 + && let Some(v6) = self.nat_runtime.detected_ipv6 { match TcpSocket::new_v6() { Ok(sock) => { @@ -305,7 +305,7 @@ impl MePool { } MeSocksKdfPolicy::Compat => { self.stats.increment_me_socks_kdf_compat_fallback(); - if self.nat_probe { + if self.nat_runtime.nat_probe { 
let bind_ip = Self::direct_bind_ip_for_stun(family, upstream_egress); self.maybe_reflect_public_addr(family, bind_ip).await } else { @@ -313,7 +313,7 @@ impl MePool { } } } - } else if self.nat_probe { + } else if self.nat_runtime.nat_probe { let bind_ip = Self::direct_bind_ip_for_stun(family, upstream_egress); self.maybe_reflect_public_addr(family, bind_ip).await } else { @@ -343,7 +343,10 @@ impl MePool { .unwrap_or_default() .as_secs() as u32; - let secret_atomic_snapshot = self.secret_atomic_snapshot.load(Ordering::Relaxed); + let secret_atomic_snapshot = self + .writer_selection_policy + .secret_atomic_snapshot + .load(Ordering::Relaxed); let (ks, secret) = if secret_atomic_snapshot { let snapshot = self.secret_snapshot().await; (snapshot.key_selector, snapshot.secret) diff --git a/src/transport/middle_proxy/health.rs b/src/transport/middle_proxy/health.rs index 29a6f7d..ca6e681 100644 --- a/src/transport/middle_proxy/health.rs +++ b/src/transport/middle_proxy/health.rs @@ -530,7 +530,7 @@ async fn check_family( let now = Instant::now(); if reconnect_sem.available_permits() == 0 { - let base_ms = pool.me_reconnect_backoff_base.as_millis() as u64; + let base_ms = pool.reconnect_runtime.me_reconnect_backoff_base.as_millis() as u64; let next_ms = (*backoff.get(&key).unwrap_or(&base_ms)).max(base_ms); let jitter = next_ms / JITTER_FRAC_NUM; let wait = Duration::from_millis(next_ms) @@ -553,7 +553,10 @@ async fn check_family( continue; } - let max_concurrent = pool.me_reconnect_max_concurrent_per_dc.max(1) as usize; + let max_concurrent = pool + .reconnect_runtime + .me_reconnect_max_concurrent_per_dc + .max(1) as usize; if *inflight.get(&key).unwrap_or(&0) >= max_concurrent { continue; } @@ -610,7 +613,7 @@ async fn check_family( break; } let res = tokio::time::timeout( - pool.me_one_timeout, + pool.reconnect_runtime.me_one_timeout, pool.connect_endpoints_round_robin(dc, &endpoints, rng.as_ref()), ) .await; @@ -641,17 +644,21 @@ async fn check_family( 
endpoint_count = endpoints.len(), "ME writer floor restored for DC" ); - backoff.insert(key, pool.me_reconnect_backoff_base.as_millis() as u64); - let jitter = pool.me_reconnect_backoff_base.as_millis() as u64 / JITTER_FRAC_NUM; - let wait = pool.me_reconnect_backoff_base + backoff.insert( + key, + pool.reconnect_runtime.me_reconnect_backoff_base.as_millis() as u64, + ); + let jitter = pool.reconnect_runtime.me_reconnect_backoff_base.as_millis() as u64 + / JITTER_FRAC_NUM; + let wait = pool.reconnect_runtime.me_reconnect_backoff_base + Duration::from_millis(rand::rng().random_range(0..=jitter.max(1))); next_attempt.insert(key, now + wait); } else { let curr = *backoff .get(&key) - .unwrap_or(&(pool.me_reconnect_backoff_base.as_millis() as u64)); - let next_ms = - (curr.saturating_mul(2)).min(pool.me_reconnect_backoff_cap.as_millis() as u64); + .unwrap_or(&(pool.reconnect_runtime.me_reconnect_backoff_base.as_millis() as u64)); + let next_ms = (curr.saturating_mul(2)) + .min(pool.reconnect_runtime.me_reconnect_backoff_cap.as_millis() as u64); backoff.insert(key, next_ms); let jitter = next_ms / JITTER_FRAC_NUM; let wait = Duration::from_millis(next_ms) @@ -723,6 +730,7 @@ fn adaptive_floor_class_min( ) -> usize { if endpoint_count <= 1 { let min_single = (pool + .floor_runtime .me_adaptive_floor_min_writers_single_endpoint .load(std::sync::atomic::Ordering::Relaxed) as usize) .max(1); @@ -979,7 +987,7 @@ async fn maybe_swap_idle_writer_for_cap( }; let connected = match tokio::time::timeout( - pool.me_one_timeout, + pool.reconnect_runtime.me_one_timeout, pool.connect_one_for_dc(endpoint, dc, rng.as_ref()), ) .await @@ -1085,7 +1093,7 @@ async fn maybe_refresh_idle_writer_for_dc( }; let rotate_ok = match tokio::time::timeout( - pool.me_one_timeout, + pool.reconnect_runtime.me_one_timeout, pool.connect_one_for_dc(endpoint, dc, rng.as_ref()), ) .await @@ -1236,7 +1244,7 @@ async fn recover_single_endpoint_outage( pool.stats 
.increment_me_single_endpoint_quarantine_bypass_total(); match tokio::time::timeout( - pool.me_one_timeout, + pool.reconnect_runtime.me_one_timeout, pool.connect_one_for_dc(endpoint, key.0, rng.as_ref()), ) .await @@ -1265,7 +1273,7 @@ async fn recover_single_endpoint_outage( } else { let one_endpoint = [endpoint]; match tokio::time::timeout( - pool.me_one_timeout, + pool.reconnect_runtime.me_one_timeout, pool.connect_endpoints_round_robin(key.0, &one_endpoint, rng.as_ref()), ) .await @@ -1390,7 +1398,7 @@ async fn maybe_rotate_single_endpoint_shadow( }; let rotate_ok = match tokio::time::timeout( - pool.me_one_timeout, + pool.reconnect_runtime.me_one_timeout, pool.connect_one_for_dc(endpoint, dc, rng.as_ref()), ) .await diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index 916f39c..af37a0b 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -339,21 +339,7 @@ pub(super) struct BindingPolicyCore { pub(super) me_bind_stale_ttl_secs: AtomicU64, } -#[allow(dead_code)] -pub struct MePool { - pub(super) routing: Arc, - pub(super) reinit: Arc, - pub(super) writer_lifecycle: Arc, - pub(super) route_runtime: Arc, - pub(super) health_runtime: Arc, - pub(super) drain_runtime: Arc, - pub(super) single_endpoint_runtime: Arc, - pub(super) binding_policy: Arc, - pub(super) decision: NetworkDecision, - pub(super) upstream: Option>, - pub(super) rng: Arc, - pub(super) proxy_tag: Option>, - pub(super) proxy_secret: Arc>, +pub(super) struct NatRuntimeCore { pub(super) nat_ip_cfg: Option, pub(super) nat_ip_detected: Arc>>, pub(super) nat_probe: bool, @@ -365,6 +351,13 @@ pub struct MePool { pub(super) nat_probe_attempts: std::sync::atomic::AtomicU8, pub(super) nat_probe_disabled: std::sync::atomic::AtomicBool, pub(super) stun_backoff_until: Arc>>, + pub(super) nat_reflection_cache: Arc>, + pub(super) nat_reflection_singleflight_v4: Arc>, + pub(super) nat_reflection_singleflight_v6: Arc>, +} + +pub(super) 
struct ReconnectRuntimeCore { + #[allow(dead_code)] pub(super) me_one_retry: u8, pub(super) me_one_timeout: Duration, pub(super) me_warmup_stagger_enabled: bool, @@ -374,6 +367,9 @@ pub struct MePool { pub(super) me_reconnect_backoff_base: Duration, pub(super) me_reconnect_backoff_cap: Duration, pub(super) me_reconnect_fast_retry_count: u32, +} + +pub(super) struct FloorRuntimeCore { pub(super) me_floor_mode: AtomicU8, pub(super) me_adaptive_floor_idle_secs: AtomicU64, pub(super) me_adaptive_floor_min_writers_single_endpoint: AtomicU8, @@ -398,15 +394,46 @@ pub struct MePool { pub(super) me_adaptive_floor_warm_cap_effective: AtomicU64, pub(super) me_adaptive_floor_active_writers_current: AtomicU64, pub(super) me_adaptive_floor_warm_writers_current: AtomicU64, +} + +pub(super) struct WriterSelectionPolicyCore { + pub(super) secret_atomic_snapshot: AtomicBool, + pub(super) me_deterministic_writer_sort: AtomicBool, + pub(super) me_writer_pick_mode: AtomicU8, + pub(super) me_writer_pick_sample_size: AtomicU8, +} + +pub(super) struct TransportPolicyCore { + pub(super) me_socks_kdf_policy: AtomicU8, + pub(super) me_reader_route_data_wait_ms: Arc, +} + +#[allow(dead_code)] +pub struct MePool { + pub(super) routing: Arc, + pub(super) reinit: Arc, + pub(super) writer_lifecycle: Arc, + pub(super) route_runtime: Arc, + pub(super) health_runtime: Arc, + pub(super) drain_runtime: Arc, + pub(super) single_endpoint_runtime: Arc, + pub(super) binding_policy: Arc, + pub(super) nat_runtime: Arc, + pub(super) reconnect_runtime: Arc, + pub(super) floor_runtime: Arc, + pub(super) writer_selection_policy: Arc, + pub(super) transport_policy: Arc, + pub(super) decision: NetworkDecision, + pub(super) upstream: Option>, + pub(super) rng: Arc, + pub(super) proxy_tag: Option>, + pub(super) proxy_secret: Arc>, pub(super) proxy_map_v4: Arc>>>, pub(super) proxy_map_v6: Arc>>>, pub(super) endpoint_dc_map: Arc>>>, pub(super) default_dc: AtomicI32, pub(super) next_writer_id: AtomicU64, pub(super) 
rtt_stats: Arc>>, - pub(super) nat_reflection_cache: Arc>, - pub(super) nat_reflection_singleflight_v4: Arc>, - pub(super) nat_reflection_singleflight_v6: Arc>, pub(super) refill_inflight: Arc>>, pub(super) refill_inflight_dc: Arc>>, pub(super) conn_count: AtomicUsize, @@ -414,12 +441,6 @@ pub struct MePool { pub(super) stats: Arc, pub(super) endpoint_quarantine: Arc>>, pub(super) kdf_material_fingerprint: Arc>>, - pub(super) secret_atomic_snapshot: AtomicBool, - pub(super) me_deterministic_writer_sort: AtomicBool, - pub(super) me_writer_pick_mode: AtomicU8, - pub(super) me_writer_pick_sample_size: AtomicU8, - pub(super) me_socks_kdf_policy: AtomicU8, - pub(super) me_reader_route_data_wait_ms: Arc, pub(super) runtime_ready: AtomicBool, pool_size: usize, } @@ -599,7 +620,9 @@ impl MePool { route_runtime: Arc::new(RouteRuntimeCore { me_route_no_writer_mode: AtomicU8::new(me_route_no_writer_mode.as_u8()), me_route_no_writer_wait: Duration::from_millis(me_route_no_writer_wait_ms), - me_route_hybrid_max_wait: Duration::from_millis(me_route_hybrid_max_wait_ms.max(50)), + me_route_hybrid_max_wait: Duration::from_millis( + me_route_hybrid_max_wait_ms.max(50), + ), me_route_blocking_send_timeout: if me_route_blocking_send_timeout_ms == 0 { None } else { @@ -611,10 +634,14 @@ impl MePool { me_route_hybrid_timeout_warn_epoch_ms: AtomicU64::new(0), me_async_recovery_last_trigger_epoch_ms: AtomicU64::new(0), me_route_inline_recovery_attempts, - me_route_inline_recovery_wait: Duration::from_millis(me_route_inline_recovery_wait_ms), + me_route_inline_recovery_wait: Duration::from_millis( + me_route_inline_recovery_wait_ms, + ), }), health_runtime: Arc::new(HealthRuntimeCore { - me_health_interval_ms_unhealthy: AtomicU64::new(me_health_interval_ms_unhealthy.max(1)), + me_health_interval_ms_unhealthy: AtomicU64::new( + me_health_interval_ms_unhealthy.max(1), + ), me_health_interval_ms_healthy: AtomicU64::new(me_health_interval_ms_healthy.max(1)), me_warn_rate_limit_ms: 
AtomicU64::new(me_warn_rate_limit_ms.max(1)), family_health_v4: ArcSwap::from_pointee(FamilyHealthSnapshot::new( @@ -682,6 +709,93 @@ impl MePool { me_bind_stale_mode: AtomicU8::new(me_bind_stale_mode.as_u8()), me_bind_stale_ttl_secs: AtomicU64::new(me_bind_stale_ttl_secs), }), + nat_runtime: Arc::new(NatRuntimeCore { + nat_ip_cfg: nat_ip, + nat_ip_detected: Arc::new(RwLock::new(None)), + nat_probe, + nat_stun, + nat_stun_servers, + nat_stun_live_servers: Arc::new(RwLock::new(Vec::new())), + nat_probe_concurrency: nat_probe_concurrency.max(1), + detected_ipv6, + nat_probe_attempts: std::sync::atomic::AtomicU8::new(0), + nat_probe_disabled: std::sync::atomic::AtomicBool::new(false), + stun_backoff_until: Arc::new(RwLock::new(None)), + nat_reflection_cache: Arc::new(Mutex::new(NatReflectionCache::default())), + nat_reflection_singleflight_v4: Arc::new(Mutex::new(())), + nat_reflection_singleflight_v6: Arc::new(Mutex::new(())), + }), + reconnect_runtime: Arc::new(ReconnectRuntimeCore { + me_one_retry, + me_one_timeout: Duration::from_millis(me_one_timeout_ms), + me_warmup_stagger_enabled, + me_warmup_step_delay: Duration::from_millis(me_warmup_step_delay_ms), + me_warmup_step_jitter: Duration::from_millis(me_warmup_step_jitter_ms), + me_reconnect_max_concurrent_per_dc, + me_reconnect_backoff_base: Duration::from_millis(me_reconnect_backoff_base_ms), + me_reconnect_backoff_cap: Duration::from_millis(me_reconnect_backoff_cap_ms), + me_reconnect_fast_retry_count, + }), + floor_runtime: Arc::new(FloorRuntimeCore { + me_floor_mode: AtomicU8::new(me_floor_mode.as_u8()), + me_adaptive_floor_idle_secs: AtomicU64::new(me_adaptive_floor_idle_secs), + me_adaptive_floor_min_writers_single_endpoint: AtomicU8::new( + me_adaptive_floor_min_writers_single_endpoint, + ), + me_adaptive_floor_min_writers_multi_endpoint: AtomicU8::new( + me_adaptive_floor_min_writers_multi_endpoint, + ), + me_adaptive_floor_recover_grace_secs: AtomicU64::new( + me_adaptive_floor_recover_grace_secs, + ), 
+ me_adaptive_floor_writers_per_core_total: AtomicU32::new( + me_adaptive_floor_writers_per_core_total as u32, + ), + me_adaptive_floor_cpu_cores_override: AtomicU32::new( + me_adaptive_floor_cpu_cores_override as u32, + ), + me_adaptive_floor_max_extra_writers_single_per_core: AtomicU32::new( + me_adaptive_floor_max_extra_writers_single_per_core as u32, + ), + me_adaptive_floor_max_extra_writers_multi_per_core: AtomicU32::new( + me_adaptive_floor_max_extra_writers_multi_per_core as u32, + ), + me_adaptive_floor_max_active_writers_per_core: AtomicU32::new( + me_adaptive_floor_max_active_writers_per_core as u32, + ), + me_adaptive_floor_max_warm_writers_per_core: AtomicU32::new( + me_adaptive_floor_max_warm_writers_per_core as u32, + ), + me_adaptive_floor_max_active_writers_global: AtomicU32::new( + me_adaptive_floor_max_active_writers_global, + ), + me_adaptive_floor_max_warm_writers_global: AtomicU32::new( + me_adaptive_floor_max_warm_writers_global, + ), + me_adaptive_floor_cpu_cores_detected: AtomicU32::new(1), + me_adaptive_floor_cpu_cores_effective: AtomicU32::new(1), + me_adaptive_floor_global_cap_raw: AtomicU64::new(0), + me_adaptive_floor_global_cap_effective: AtomicU64::new(0), + me_adaptive_floor_target_writers_total: AtomicU64::new(0), + me_adaptive_floor_active_cap_configured: AtomicU64::new(0), + me_adaptive_floor_active_cap_effective: AtomicU64::new(0), + me_adaptive_floor_warm_cap_configured: AtomicU64::new(0), + me_adaptive_floor_warm_cap_effective: AtomicU64::new(0), + me_adaptive_floor_active_writers_current: AtomicU64::new(0), + me_adaptive_floor_warm_writers_current: AtomicU64::new(0), + }), + writer_selection_policy: Arc::new(WriterSelectionPolicyCore { + secret_atomic_snapshot: AtomicBool::new(me_secret_atomic_snapshot), + me_deterministic_writer_sort: AtomicBool::new(me_deterministic_writer_sort), + me_writer_pick_mode: AtomicU8::new(me_writer_pick_mode.as_u8()), + me_writer_pick_sample_size: AtomicU8::new(me_writer_pick_sample_size.clamp(2, 
4)), + }), + transport_policy: Arc::new(TransportPolicyCore { + me_socks_kdf_policy: AtomicU8::new(me_socks_kdf_policy.as_u8()), + me_reader_route_data_wait_ms: Arc::new(AtomicU64::new( + me_reader_route_data_wait_ms, + )), + }), decision, upstream, rng, @@ -700,73 +814,7 @@ impl MePool { }, secret: proxy_secret, })), - nat_ip_cfg: nat_ip, - nat_ip_detected: Arc::new(RwLock::new(None)), - nat_probe, - nat_stun, - nat_stun_servers, - nat_stun_live_servers: Arc::new(RwLock::new(Vec::new())), - nat_probe_concurrency: nat_probe_concurrency.max(1), - detected_ipv6, - nat_probe_attempts: std::sync::atomic::AtomicU8::new(0), - nat_probe_disabled: std::sync::atomic::AtomicBool::new(false), - stun_backoff_until: Arc::new(RwLock::new(None)), - me_one_retry, - me_one_timeout: Duration::from_millis(me_one_timeout_ms), stats, - me_warmup_stagger_enabled, - me_warmup_step_delay: Duration::from_millis(me_warmup_step_delay_ms), - me_warmup_step_jitter: Duration::from_millis(me_warmup_step_jitter_ms), - me_reconnect_max_concurrent_per_dc, - me_reconnect_backoff_base: Duration::from_millis(me_reconnect_backoff_base_ms), - me_reconnect_backoff_cap: Duration::from_millis(me_reconnect_backoff_cap_ms), - me_reconnect_fast_retry_count, - me_floor_mode: AtomicU8::new(me_floor_mode.as_u8()), - me_adaptive_floor_idle_secs: AtomicU64::new(me_adaptive_floor_idle_secs), - me_adaptive_floor_min_writers_single_endpoint: AtomicU8::new( - me_adaptive_floor_min_writers_single_endpoint, - ), - me_adaptive_floor_min_writers_multi_endpoint: AtomicU8::new( - me_adaptive_floor_min_writers_multi_endpoint, - ), - me_adaptive_floor_recover_grace_secs: AtomicU64::new( - me_adaptive_floor_recover_grace_secs, - ), - me_adaptive_floor_writers_per_core_total: AtomicU32::new( - me_adaptive_floor_writers_per_core_total as u32, - ), - me_adaptive_floor_cpu_cores_override: AtomicU32::new( - me_adaptive_floor_cpu_cores_override as u32, - ), - me_adaptive_floor_max_extra_writers_single_per_core: AtomicU32::new( - 
me_adaptive_floor_max_extra_writers_single_per_core as u32, - ), - me_adaptive_floor_max_extra_writers_multi_per_core: AtomicU32::new( - me_adaptive_floor_max_extra_writers_multi_per_core as u32, - ), - me_adaptive_floor_max_active_writers_per_core: AtomicU32::new( - me_adaptive_floor_max_active_writers_per_core as u32, - ), - me_adaptive_floor_max_warm_writers_per_core: AtomicU32::new( - me_adaptive_floor_max_warm_writers_per_core as u32, - ), - me_adaptive_floor_max_active_writers_global: AtomicU32::new( - me_adaptive_floor_max_active_writers_global, - ), - me_adaptive_floor_max_warm_writers_global: AtomicU32::new( - me_adaptive_floor_max_warm_writers_global, - ), - me_adaptive_floor_cpu_cores_detected: AtomicU32::new(1), - me_adaptive_floor_cpu_cores_effective: AtomicU32::new(1), - me_adaptive_floor_global_cap_raw: AtomicU64::new(0), - me_adaptive_floor_global_cap_effective: AtomicU64::new(0), - me_adaptive_floor_target_writers_total: AtomicU64::new(0), - me_adaptive_floor_active_cap_configured: AtomicU64::new(0), - me_adaptive_floor_active_cap_effective: AtomicU64::new(0), - me_adaptive_floor_warm_cap_configured: AtomicU64::new(0), - me_adaptive_floor_warm_cap_effective: AtomicU64::new(0), - me_adaptive_floor_active_writers_current: AtomicU64::new(0), - me_adaptive_floor_warm_writers_current: AtomicU64::new(0), pool_size: 2, proxy_map_v4: Arc::new(RwLock::new(proxy_map_v4)), proxy_map_v6: Arc::new(RwLock::new(proxy_map_v6)), @@ -774,21 +822,12 @@ impl MePool { default_dc: AtomicI32::new(default_dc.unwrap_or(2)), next_writer_id: AtomicU64::new(1), rtt_stats: Arc::new(Mutex::new(HashMap::new())), - nat_reflection_cache: Arc::new(Mutex::new(NatReflectionCache::default())), - nat_reflection_singleflight_v4: Arc::new(Mutex::new(())), - nat_reflection_singleflight_v6: Arc::new(Mutex::new(())), refill_inflight: Arc::new(Mutex::new(HashSet::new())), refill_inflight_dc: Arc::new(Mutex::new(HashSet::new())), conn_count: AtomicUsize::new(0), draining_active_runtime: 
AtomicU64::new(0), endpoint_quarantine: Arc::new(Mutex::new(HashMap::new())), kdf_material_fingerprint: Arc::new(RwLock::new(HashMap::new())), - secret_atomic_snapshot: AtomicBool::new(me_secret_atomic_snapshot), - me_deterministic_writer_sort: AtomicBool::new(me_deterministic_writer_sort), - me_writer_pick_mode: AtomicU8::new(me_writer_pick_mode.as_u8()), - me_writer_pick_sample_size: AtomicU8::new(me_writer_pick_sample_size.clamp(2, 4)), - me_socks_kdf_policy: AtomicU8::new(me_socks_kdf_policy.as_u8()), - me_reader_route_data_wait_ms: Arc::new(AtomicU64::new(me_reader_route_data_wait_ms)), runtime_ready: AtomicBool::new(false), }) } @@ -850,23 +889,35 @@ impl MePool { pub(crate) fn family_runtime_state_since_epoch_secs(&self, family: IpFamily) -> u64 { match family { - IpFamily::V4 => self.health_runtime.family_health_v4.load().state_since_epoch_secs, - IpFamily::V6 => self.health_runtime.family_health_v6.load().state_since_epoch_secs, + IpFamily::V4 => { + self.health_runtime + .family_health_v4 + .load() + .state_since_epoch_secs + } + IpFamily::V6 => { + self.health_runtime + .family_health_v6 + .load() + .state_since_epoch_secs + } } } pub(crate) fn family_suppressed_until_epoch_secs(&self, family: IpFamily) -> u64 { match family { - IpFamily::V4 => self - .health_runtime - .family_health_v4 - .load() - .suppressed_until_epoch_secs, - IpFamily::V6 => self - .health_runtime - .family_health_v6 - .load() - .suppressed_until_epoch_secs, + IpFamily::V4 => { + self.health_runtime + .family_health_v4 + .load() + .suppressed_until_epoch_secs + } + IpFamily::V6 => { + self.health_runtime + .family_health_v6 + .load() + .suppressed_until_epoch_secs + } } } @@ -879,16 +930,18 @@ impl MePool { pub(crate) fn family_recover_success_streak(&self, family: IpFamily) -> u32 { match family { - IpFamily::V4 => self - .health_runtime - .family_health_v4 - .load() - .recover_success_streak, - IpFamily::V6 => self - .health_runtime - .family_health_v6 - .load() - 
.recover_success_streak, + IpFamily::V4 => { + self.health_runtime + .family_health_v4 + .load() + .recover_success_streak + } + IpFamily::V6 => { + self.health_runtime + .family_health_v6 + .load() + .recover_success_streak + } } } @@ -1061,14 +1114,18 @@ impl MePool { self.binding_policy .me_bind_stale_ttl_secs .store(bind_stale_ttl_secs, Ordering::Relaxed); - self.secret_atomic_snapshot + self.writer_selection_policy + .secret_atomic_snapshot .store(secret_atomic_snapshot, Ordering::Relaxed); - self.me_deterministic_writer_sort + self.writer_selection_policy + .me_deterministic_writer_sort .store(deterministic_writer_sort, Ordering::Relaxed); let previous_writer_pick_mode = self.writer_pick_mode(); - self.me_writer_pick_mode + self.writer_selection_policy + .me_writer_pick_mode .store(writer_pick_mode.as_u8(), Ordering::Relaxed); - self.me_writer_pick_sample_size + self.writer_selection_policy + .me_writer_pick_sample_size .store(writer_pick_sample_size.clamp(2, 4), Ordering::Relaxed); if previous_writer_pick_mode != writer_pick_mode { self.stats.increment_me_writer_pick_mode_switch_total(); @@ -1092,45 +1149,62 @@ impl MePool { .me_single_endpoint_shadow_rotate_every_secs .store(single_endpoint_shadow_rotate_every_secs, Ordering::Relaxed); let previous_floor_mode = self.floor_mode(); - self.me_floor_mode + self.floor_runtime + .me_floor_mode .store(floor_mode.as_u8(), Ordering::Relaxed); - self.me_adaptive_floor_idle_secs + self.floor_runtime + .me_adaptive_floor_idle_secs .store(adaptive_floor_idle_secs, Ordering::Relaxed); - self.me_adaptive_floor_min_writers_single_endpoint.store( - adaptive_floor_min_writers_single_endpoint, - Ordering::Relaxed, - ); - self.me_adaptive_floor_min_writers_multi_endpoint + self.floor_runtime + .me_adaptive_floor_min_writers_single_endpoint + .store( + adaptive_floor_min_writers_single_endpoint, + Ordering::Relaxed, + ); + self.floor_runtime + .me_adaptive_floor_min_writers_multi_endpoint 
.store(adaptive_floor_min_writers_multi_endpoint, Ordering::Relaxed); - self.me_adaptive_floor_recover_grace_secs + self.floor_runtime + .me_adaptive_floor_recover_grace_secs .store(adaptive_floor_recover_grace_secs, Ordering::Relaxed); - self.me_adaptive_floor_writers_per_core_total.store( - adaptive_floor_writers_per_core_total as u32, - Ordering::Relaxed, - ); - self.me_adaptive_floor_cpu_cores_override + self.floor_runtime + .me_adaptive_floor_writers_per_core_total + .store( + adaptive_floor_writers_per_core_total as u32, + Ordering::Relaxed, + ); + self.floor_runtime + .me_adaptive_floor_cpu_cores_override .store(adaptive_floor_cpu_cores_override as u32, Ordering::Relaxed); - self.me_adaptive_floor_max_extra_writers_single_per_core + self.floor_runtime + .me_adaptive_floor_max_extra_writers_single_per_core .store( adaptive_floor_max_extra_writers_single_per_core as u32, Ordering::Relaxed, ); - self.me_adaptive_floor_max_extra_writers_multi_per_core + self.floor_runtime + .me_adaptive_floor_max_extra_writers_multi_per_core .store( adaptive_floor_max_extra_writers_multi_per_core as u32, Ordering::Relaxed, ); - self.me_adaptive_floor_max_active_writers_per_core.store( - adaptive_floor_max_active_writers_per_core as u32, - Ordering::Relaxed, - ); - self.me_adaptive_floor_max_warm_writers_per_core.store( - adaptive_floor_max_warm_writers_per_core as u32, - Ordering::Relaxed, - ); - self.me_adaptive_floor_max_active_writers_global + self.floor_runtime + .me_adaptive_floor_max_active_writers_per_core + .store( + adaptive_floor_max_active_writers_per_core as u32, + Ordering::Relaxed, + ); + self.floor_runtime + .me_adaptive_floor_max_warm_writers_per_core + .store( + adaptive_floor_max_warm_writers_per_core as u32, + Ordering::Relaxed, + ); + self.floor_runtime + .me_adaptive_floor_max_active_writers_global .store(adaptive_floor_max_active_writers_global, Ordering::Relaxed); - self.me_adaptive_floor_max_warm_writers_global + self.floor_runtime + 
.me_adaptive_floor_max_warm_writers_global .store(adaptive_floor_max_warm_writers_global, Ordering::Relaxed); self.health_runtime .me_health_interval_ms_unhealthy @@ -1158,9 +1232,13 @@ impl MePool { } pub fn reset_stun_state(&self) { - self.nat_probe_attempts.store(0, Ordering::Relaxed); - self.nat_probe_disabled.store(false, Ordering::Relaxed); - if let Ok(mut live) = self.nat_stun_live_servers.try_write() { + self.nat_runtime + .nat_probe_attempts + .store(0, Ordering::Relaxed); + self.nat_runtime + .nat_probe_disabled + .store(false, Ordering::Relaxed); + if let Ok(mut live) = self.nat_runtime.nat_stun_live_servers.try_write() { live.clear(); } } @@ -1182,9 +1260,11 @@ impl MePool { route_backpressure_high_watermark_pct: u8, reader_route_data_wait_ms: u64, ) { - self.me_socks_kdf_policy + self.transport_policy + .me_socks_kdf_policy .store(socks_kdf_policy.as_u8(), Ordering::Relaxed); - self.me_reader_route_data_wait_ms + self.transport_policy + .me_reader_route_data_wait_ms .store(reader_route_data_wait_ms, Ordering::Relaxed); self.registry.update_route_backpressure_policy( route_backpressure_base_timeout_ms, @@ -1194,7 +1274,11 @@ impl MePool { } pub(super) fn socks_kdf_policy(&self) -> MeSocksKdfPolicy { - MeSocksKdfPolicy::from_u8(self.me_socks_kdf_policy.load(Ordering::Relaxed)) + MeSocksKdfPolicy::from_u8( + self.transport_policy + .me_socks_kdf_policy + .load(Ordering::Relaxed), + ) } pub(super) fn writers_arc(&self) -> Arc { @@ -1316,11 +1400,16 @@ impl MePool { } pub(super) fn writer_pick_mode(&self) -> MeWriterPickMode { - MeWriterPickMode::from_u8(self.me_writer_pick_mode.load(Ordering::Relaxed)) + MeWriterPickMode::from_u8( + self.writer_selection_policy + .me_writer_pick_mode + .load(Ordering::Relaxed), + ) } pub(super) fn writer_pick_sample_size(&self) -> usize { - self.me_writer_pick_sample_size + self.writer_selection_policy + .me_writer_pick_sample_size .load(Ordering::Relaxed) .clamp(2, 4) as usize } @@ -1340,39 +1429,48 @@ impl MePool { } 
pub(super) fn floor_mode(&self) -> MeFloorMode { - MeFloorMode::from_u8(self.me_floor_mode.load(Ordering::Relaxed)) + MeFloorMode::from_u8(self.floor_runtime.me_floor_mode.load(Ordering::Relaxed)) } pub(super) fn adaptive_floor_idle_duration(&self) -> Duration { - Duration::from_secs(self.me_adaptive_floor_idle_secs.load(Ordering::Relaxed)) + Duration::from_secs( + self.floor_runtime + .me_adaptive_floor_idle_secs + .load(Ordering::Relaxed), + ) } pub(super) fn adaptive_floor_recover_grace_duration(&self) -> Duration { Duration::from_secs( - self.me_adaptive_floor_recover_grace_secs + self.floor_runtime + .me_adaptive_floor_recover_grace_secs .load(Ordering::Relaxed), ) } pub(super) fn adaptive_floor_min_writers_multi_endpoint(&self) -> usize { (self + .floor_runtime .me_adaptive_floor_min_writers_multi_endpoint .load(Ordering::Relaxed) as usize) .max(1) } pub(super) fn adaptive_floor_max_extra_single_per_core(&self) -> usize { - self.me_adaptive_floor_max_extra_writers_single_per_core + self.floor_runtime + .me_adaptive_floor_max_extra_writers_single_per_core .load(Ordering::Relaxed) as usize } pub(super) fn adaptive_floor_max_extra_multi_per_core(&self) -> usize { - self.me_adaptive_floor_max_extra_writers_multi_per_core + self.floor_runtime + .me_adaptive_floor_max_extra_writers_multi_per_core .load(Ordering::Relaxed) as usize } pub(super) fn adaptive_floor_max_active_writers_per_core(&self) -> usize { (self + .floor_runtime .me_adaptive_floor_max_active_writers_per_core .load(Ordering::Relaxed) as usize) .max(1) @@ -1380,6 +1478,7 @@ impl MePool { pub(super) fn adaptive_floor_max_warm_writers_per_core(&self) -> usize { (self + .floor_runtime .me_adaptive_floor_max_warm_writers_per_core .load(Ordering::Relaxed) as usize) .max(1) @@ -1387,6 +1486,7 @@ impl MePool { pub(super) fn adaptive_floor_max_active_writers_global(&self) -> usize { (self + .floor_runtime .me_adaptive_floor_max_active_writers_global .load(Ordering::Relaxed) as usize) .max(1) @@ -1394,6 
+1494,7 @@ impl MePool { pub(super) fn adaptive_floor_max_warm_writers_global(&self) -> usize { (self + .floor_runtime .me_adaptive_floor_max_warm_writers_global .load(Ordering::Relaxed) as usize) .max(1) @@ -1409,6 +1510,7 @@ impl MePool { pub(super) fn adaptive_floor_effective_cpu_cores(&self) -> usize { let detected = self.adaptive_floor_detected_cpu_cores(); let override_cores = self + .floor_runtime .me_adaptive_floor_cpu_cores_override .load(Ordering::Relaxed) as usize; let effective = if override_cores == 0 { @@ -1416,9 +1518,11 @@ impl MePool { } else { override_cores.max(1) }; - self.me_adaptive_floor_cpu_cores_detected + self.floor_runtime + .me_adaptive_floor_cpu_cores_detected .store(detected as u32, Ordering::Relaxed); - self.me_adaptive_floor_cpu_cores_effective + self.floor_runtime + .me_adaptive_floor_cpu_cores_effective .store(effective as u32, Ordering::Relaxed); self.stats .set_me_floor_cpu_cores_detected_gauge(detected as u64); @@ -1450,7 +1554,8 @@ impl MePool { .min(self.adaptive_floor_max_active_writers_global()) .min(per_contour_budget) .max(1); - self.me_adaptive_floor_active_cap_configured + self.floor_runtime + .me_adaptive_floor_active_cap_configured .store(configured as u64, Ordering::Relaxed); self.stats .set_me_floor_active_cap_configured_gauge(configured as u64); @@ -1465,7 +1570,8 @@ impl MePool { .min(self.adaptive_floor_max_warm_writers_global()) .min(per_contour_budget) .max(1); - self.me_adaptive_floor_warm_cap_configured + self.floor_runtime + .me_adaptive_floor_warm_cap_configured .store(configured as u64, Ordering::Relaxed); self.stats .set_me_floor_warm_cap_configured_gauge(configured as u64); @@ -1482,23 +1588,32 @@ impl MePool { active_writers_current: usize, warm_writers_current: usize, ) { - self.me_adaptive_floor_global_cap_raw + self.floor_runtime + .me_adaptive_floor_global_cap_raw .store(active_cap_configured as u64, Ordering::Relaxed); - self.me_adaptive_floor_global_cap_effective + self.floor_runtime + 
.me_adaptive_floor_global_cap_effective .store(active_cap_effective as u64, Ordering::Relaxed); - self.me_adaptive_floor_target_writers_total + self.floor_runtime + .me_adaptive_floor_target_writers_total .store(target_writers_total as u64, Ordering::Relaxed); - self.me_adaptive_floor_active_cap_configured + self.floor_runtime + .me_adaptive_floor_active_cap_configured .store(active_cap_configured as u64, Ordering::Relaxed); - self.me_adaptive_floor_active_cap_effective + self.floor_runtime + .me_adaptive_floor_active_cap_effective .store(active_cap_effective as u64, Ordering::Relaxed); - self.me_adaptive_floor_warm_cap_configured + self.floor_runtime + .me_adaptive_floor_warm_cap_configured .store(warm_cap_configured as u64, Ordering::Relaxed); - self.me_adaptive_floor_warm_cap_effective + self.floor_runtime + .me_adaptive_floor_warm_cap_effective .store(warm_cap_effective as u64, Ordering::Relaxed); - self.me_adaptive_floor_active_writers_current + self.floor_runtime + .me_adaptive_floor_active_writers_current .store(active_writers_current as u64, Ordering::Relaxed); - self.me_adaptive_floor_warm_writers_current + self.floor_runtime + .me_adaptive_floor_warm_writers_current .store(warm_writers_current as u64, Ordering::Relaxed); self.stats .set_me_floor_global_cap_raw_gauge(active_cap_configured as u64); @@ -1587,11 +1702,13 @@ impl MePool { } let min_writers = if endpoint_count == 1 { (self + .floor_runtime .me_adaptive_floor_min_writers_single_endpoint .load(Ordering::Relaxed) as usize) .max(1) } else { (self + .floor_runtime .me_adaptive_floor_min_writers_multi_endpoint .load(Ordering::Relaxed) as usize) .max(1) diff --git a/src/transport/middle_proxy/pool_init.rs b/src/transport/middle_proxy/pool_init.rs index 2e3bc1d..3f7cad7 100644 --- a/src/transport/middle_proxy/pool_init.rs +++ b/src/transport/middle_proxy/pool_init.rs @@ -14,7 +14,10 @@ use super::pool::MePool; impl MePool { pub async fn init(self: &Arc, pool_size: usize, rng: &Arc) -> Result<()> { let 
family_order = self.family_order(); - let connect_concurrency = self.me_reconnect_max_concurrent_per_dc.max(1) as usize; + let connect_concurrency = self + .reconnect_runtime + .me_reconnect_max_concurrent_per_dc + .max(1) as usize; let ks = self.key_selector().await; info!( me_servers = self.proxy_map_v4.read().await.len(), @@ -250,10 +253,12 @@ impl MePool { return false; } - if self.me_warmup_stagger_enabled { - let jitter = - rand::rng().random_range(0..=self.me_warmup_step_jitter.as_millis() as u64); - let delay_ms = self.me_warmup_step_delay.as_millis() as u64 + jitter; + if self.reconnect_runtime.me_warmup_stagger_enabled { + let jitter = rand::rng().random_range( + 0..=self.reconnect_runtime.me_warmup_step_jitter.as_millis() as u64, + ); + let delay_ms = + self.reconnect_runtime.me_warmup_step_delay.as_millis() as u64 + jitter; tokio::time::sleep(std::time::Duration::from_millis(delay_ms)).await; } } diff --git a/src/transport/middle_proxy/pool_nat.rs b/src/transport/middle_proxy/pool_nat.rs index f382fd4..be2d9df 100644 --- a/src/transport/middle_proxy/pool_nat.rs +++ b/src/transport/middle_proxy/pool_nat.rs @@ -42,10 +42,10 @@ pub async fn detect_public_ip() -> Option { impl MePool { fn configured_stun_servers(&self) -> Vec { - if !self.nat_stun_servers.is_empty() { - return self.nat_stun_servers.clone(); + if !self.nat_runtime.nat_stun_servers.is_empty() { + return self.nat_runtime.nat_stun_servers.clone(); } - if let Some(s) = &self.nat_stun + if let Some(s) = &self.nat_runtime.nat_stun && !s.trim().is_empty() { return vec![s.clone()]; @@ -64,7 +64,7 @@ impl MePool { let mut next_idx = 0usize; let mut live_servers = Vec::new(); let mut best_by_ip: HashMap = HashMap::new(); - let concurrency = self.nat_probe_concurrency.max(1); + let concurrency = self.nat_runtime.nat_probe_concurrency.max(1); while next_idx < servers.len() || !join_set.is_empty() { while next_idx < servers.len() && join_set.len() < concurrency { @@ -137,9 +137,13 @@ impl MePool { } 
pub(super) fn translate_ip_for_nat(&self, ip: IpAddr) -> IpAddr { - let nat_ip = self - .nat_ip_cfg - .or_else(|| self.nat_ip_detected.try_read().ok().and_then(|g| *g)); + let nat_ip = self.nat_runtime.nat_ip_cfg.or_else(|| { + self.nat_runtime + .nat_ip_detected + .try_read() + .ok() + .and_then(|g| *g) + }); let Some(nat_ip) = nat_ip else { return ip; @@ -163,7 +167,7 @@ impl MePool { addr: std::net::SocketAddr, reflected: Option, ) -> std::net::SocketAddr { - let ip = if let Some(nat_ip) = self.nat_ip_cfg { + let ip = if let Some(nat_ip) = self.nat_runtime.nat_ip_cfg { match (addr.ip(), nat_ip) { (IpAddr::V4(_), IpAddr::V4(dst)) => IpAddr::V4(dst), (IpAddr::V6(_), IpAddr::V6(dst)) => IpAddr::V6(dst), @@ -185,22 +189,22 @@ impl MePool { } pub(super) async fn maybe_detect_nat_ip(&self, local_ip: IpAddr) -> Option { - if self.nat_ip_cfg.is_some() { - return self.nat_ip_cfg; + if self.nat_runtime.nat_ip_cfg.is_some() { + return self.nat_runtime.nat_ip_cfg; } if !(is_bogon(local_ip) || local_ip.is_loopback() || local_ip.is_unspecified()) { return None; } - if let Some(ip) = *self.nat_ip_detected.read().await { + if let Some(ip) = *self.nat_runtime.nat_ip_detected.read().await { return Some(ip); } match fetch_public_ipv4_with_retry().await { Ok(Some(ip)) => { { - let mut guard = self.nat_ip_detected.write().await; + let mut guard = self.nat_runtime.nat_ip_detected.write().await; *guard = Some(IpAddr::V4(ip)); } info!(public_ip = %ip, "Auto-detected public IP for NAT translation"); @@ -231,10 +235,10 @@ impl MePool { } // Backoff window if use_shared_cache - && let Some(until) = *self.stun_backoff_until.read().await + && let Some(until) = *self.nat_runtime.stun_backoff_until.read().await && Instant::now() < until { - if let Ok(cache) = self.nat_reflection_cache.try_lock() { + if let Ok(cache) = self.nat_runtime.nat_reflection_cache.try_lock() { let slot = match family { IpFamily::V4 => cache.v4, IpFamily::V6 => cache.v6, @@ -244,7 +248,8 @@ impl MePool { return None; } 
- if use_shared_cache && let Ok(mut cache) = self.nat_reflection_cache.try_lock() { + if use_shared_cache && let Ok(mut cache) = self.nat_runtime.nat_reflection_cache.try_lock() + { let slot = match family { IpFamily::V4 => &mut cache.v4, IpFamily::V6 => &mut cache.v6, @@ -258,18 +263,18 @@ impl MePool { let _singleflight_guard = if use_shared_cache { Some(match family { - IpFamily::V4 => self.nat_reflection_singleflight_v4.lock().await, - IpFamily::V6 => self.nat_reflection_singleflight_v6.lock().await, + IpFamily::V4 => self.nat_runtime.nat_reflection_singleflight_v4.lock().await, + IpFamily::V6 => self.nat_runtime.nat_reflection_singleflight_v6.lock().await, }) } else { None }; if use_shared_cache - && let Some(until) = *self.stun_backoff_until.read().await + && let Some(until) = *self.nat_runtime.stun_backoff_until.read().await && Instant::now() < until { - if let Ok(cache) = self.nat_reflection_cache.try_lock() { + if let Ok(cache) = self.nat_runtime.nat_reflection_cache.try_lock() { let slot = match family { IpFamily::V4 => cache.v4, IpFamily::V6 => cache.v6, @@ -279,7 +284,8 @@ impl MePool { return None; } - if use_shared_cache && let Ok(mut cache) = self.nat_reflection_cache.try_lock() { + if use_shared_cache && let Ok(mut cache) = self.nat_runtime.nat_reflection_cache.try_lock() + { let slot = match family { IpFamily::V4 => &mut cache.v4, IpFamily::V6 => &mut cache.v6, @@ -292,13 +298,14 @@ impl MePool { } let attempt = if use_shared_cache { - self.nat_probe_attempts + self.nat_runtime + .nat_probe_attempts .fetch_add(1, std::sync::atomic::Ordering::Relaxed) } else { 0 }; let configured_servers = self.configured_stun_servers(); - let live_snapshot = self.nat_stun_live_servers.read().await.clone(); + let live_snapshot = self.nat_runtime.nat_stun_live_servers.read().await.clone(); let primary_servers = if live_snapshot.is_empty() { configured_servers.clone() } else { @@ -322,14 +329,15 @@ impl MePool { let live_server_count = live_servers.len(); if 
!live_servers.is_empty() { - *self.nat_stun_live_servers.write().await = live_servers; + *self.nat_runtime.nat_stun_live_servers.write().await = live_servers; } else { - self.nat_stun_live_servers.write().await.clear(); + self.nat_runtime.nat_stun_live_servers.write().await.clear(); } if let Some(reflected_addr) = selected_reflected { if use_shared_cache { - self.nat_probe_attempts + self.nat_runtime + .nat_probe_attempts .store(0, std::sync::atomic::Ordering::Relaxed); } info!( @@ -338,7 +346,9 @@ impl MePool { "STUN-Quorum reached, IP: {}", reflected_addr.ip() ); - if use_shared_cache && let Ok(mut cache) = self.nat_reflection_cache.try_lock() { + if use_shared_cache + && let Ok(mut cache) = self.nat_runtime.nat_reflection_cache.try_lock() + { let slot = match family { IpFamily::V4 => &mut cache.v4, IpFamily::V6 => &mut cache.v6, @@ -350,7 +360,7 @@ impl MePool { if use_shared_cache { let backoff = Duration::from_secs(60 * 2u64.pow((attempt as u32).min(6))); - *self.stun_backoff_until.write().await = Some(Instant::now() + backoff); + *self.nat_runtime.stun_backoff_until.write().await = Some(Instant::now() + backoff); } None } diff --git a/src/transport/middle_proxy/pool_refill.rs b/src/transport/middle_proxy/pool_refill.rs index fc5c996..ddeb3e3 100644 --- a/src/transport/middle_proxy/pool_refill.rs +++ b/src/transport/middle_proxy/pool_refill.rs @@ -219,7 +219,7 @@ impl MePool { } async fn refill_writer_after_loss(self: &Arc, addr: SocketAddr, writer_dc: i32) -> bool { - let fast_retries = self.me_reconnect_fast_retry_count.max(1); + let fast_retries = self.reconnect_runtime.me_reconnect_fast_retry_count.max(1); let mut total_attempts = 0u32; let same_endpoint_quarantined = self.is_endpoint_quarantined(addr).await; diff --git a/src/transport/middle_proxy/pool_reinit.rs b/src/transport/middle_proxy/pool_reinit.rs index 009f850..db6411c 100644 --- a/src/transport/middle_proxy/pool_reinit.rs +++ b/src/transport/middle_proxy/pool_reinit.rs @@ -37,7 +37,9 @@ impl 
MePool { } fn clear_pending_hardswap_state(&self) { - self.reinit.pending_hardswap_generation.store(0, Ordering::Relaxed); + self.reinit + .pending_hardswap_generation + .store(0, Ordering::Relaxed); self.reinit .pending_hardswap_started_at_epoch_secs .store(0, Ordering::Relaxed); @@ -213,7 +215,8 @@ impl MePool { .reinit .me_hardswap_warmup_pass_backoff_base_ms .load(Ordering::Relaxed); - let cap_ms = (self.me_reconnect_backoff_cap.as_millis() as u64).max(base_ms); + let cap_ms = + (self.reconnect_runtime.me_reconnect_backoff_cap.as_millis() as u64).max(base_ms); let shift = (pass_idx as u32).min(20); let scaled = base_ms.saturating_mul(1u64 << shift); let core = scaled.min(cap_ms); @@ -392,7 +395,10 @@ impl MePool { .reinit .pending_hardswap_started_at_epoch_secs .load(Ordering::Relaxed); - let pending_map_hash = self.reinit.pending_hardswap_map_hash.load(Ordering::Relaxed); + let pending_map_hash = self + .reinit + .pending_hardswap_map_hash + .load(Ordering::Relaxed); let pending_age_secs = now_epoch_secs.saturating_sub(pending_started_at); let pending_ttl_expired = pending_started_at > 0 && pending_age_secs > ME_HARDSWAP_PENDING_TTL_SECS; @@ -443,7 +449,9 @@ impl MePool { }; if hardswap { - self.reinit.warm_generation.store(generation, Ordering::Relaxed); + self.reinit + .warm_generation + .store(generation, Ordering::Relaxed); self.warmup_generation_for_all_dcs(rng, generation, &desired_by_dc) .await; } else { diff --git a/src/transport/middle_proxy/pool_runtime_api.rs b/src/transport/middle_proxy/pool_runtime_api.rs index 7c15216..539f397 100644 --- a/src/transport/middle_proxy/pool_runtime_api.rs +++ b/src/transport/middle_proxy/pool_runtime_api.rs @@ -94,9 +94,9 @@ impl MePool { pub(crate) async fn api_nat_stun_snapshot(&self) -> MeApiNatStunSnapshot { let now = Instant::now(); - let mut configured_servers = if !self.nat_stun_servers.is_empty() { - self.nat_stun_servers.clone() - } else if let Some(stun) = &self.nat_stun { + let mut configured_servers = if 
!self.nat_runtime.nat_stun_servers.is_empty() { + self.nat_runtime.nat_stun_servers.clone() + } else if let Some(stun) = &self.nat_runtime.nat_stun { if stun.trim().is_empty() { Vec::new() } else { @@ -108,11 +108,11 @@ impl MePool { configured_servers.sort(); configured_servers.dedup(); - let mut live_servers = self.nat_stun_live_servers.read().await.clone(); + let mut live_servers = self.nat_runtime.nat_stun_live_servers.read().await.clone(); live_servers.sort(); live_servers.dedup(); - let reflection = self.nat_reflection_cache.lock().await; + let reflection = self.nat_runtime.nat_reflection_cache.lock().await; let reflection_v4 = reflection.v4.map(|(ts, addr)| MeApiNatReflectionSnapshot { addr, age_secs: now.saturating_duration_since(ts).as_secs(), @@ -123,17 +123,19 @@ impl MePool { }); drop(reflection); - let backoff_until = *self.stun_backoff_until.read().await; + let backoff_until = *self.nat_runtime.stun_backoff_until.read().await; let stun_backoff_remaining_ms = backoff_until.and_then(|until| { (until > now).then_some(until.duration_since(now).as_millis() as u64) }); MeApiNatStunSnapshot { - nat_probe_enabled: self.nat_probe, + nat_probe_enabled: self.nat_runtime.nat_probe, nat_probe_disabled_runtime: self + .nat_runtime .nat_probe_disabled .load(std::sync::atomic::Ordering::Relaxed), nat_probe_attempts: self + .nat_runtime .nat_probe_attempts .load(std::sync::atomic::Ordering::Relaxed), configured_servers, diff --git a/src/transport/middle_proxy/pool_status.rs b/src/transport/middle_proxy/pool_status.rs index 6cd5cec..ae9038b 100644 --- a/src/transport/middle_proxy/pool_status.rs +++ b/src/transport/middle_proxy/pool_status.rs @@ -339,6 +339,7 @@ impl MePool { let mut fresh_alive_writers = 0usize; let floor_mode = self.floor_mode(); let adaptive_cpu_cores = (self + .floor_runtime .me_adaptive_floor_cpu_cores_effective .load(Ordering::Relaxed) as usize) .max(1); @@ -353,22 +354,26 @@ impl MePool { 
self.required_writers_for_dc_with_floor_mode(endpoint_count, false); let floor_min = if endpoint_count <= 1 { (self + .floor_runtime .me_adaptive_floor_min_writers_single_endpoint .load(Ordering::Relaxed) as usize) .max(1) .min(base_required.max(1)) } else { (self + .floor_runtime .me_adaptive_floor_min_writers_multi_endpoint .load(Ordering::Relaxed) as usize) .max(1) .min(base_required.max(1)) }; let extra_per_core = if endpoint_count <= 1 { - self.me_adaptive_floor_max_extra_writers_single_per_core + self.floor_runtime + .me_adaptive_floor_max_extra_writers_single_per_core .load(Ordering::Relaxed) as usize } else { - self.me_adaptive_floor_max_extra_writers_multi_per_core + self.floor_runtime + .me_adaptive_floor_max_extra_writers_multi_per_core .load(Ordering::Relaxed) as usize }; let floor_max = @@ -490,75 +495,100 @@ impl MePool { pending_hardswap_age_secs, hardswap_enabled: self.reinit.hardswap.load(Ordering::Relaxed), floor_mode: floor_mode_label(self.floor_mode()), - adaptive_floor_idle_secs: self.me_adaptive_floor_idle_secs.load(Ordering::Relaxed), + adaptive_floor_idle_secs: self + .floor_runtime + .me_adaptive_floor_idle_secs + .load(Ordering::Relaxed), adaptive_floor_min_writers_single_endpoint: self + .floor_runtime .me_adaptive_floor_min_writers_single_endpoint .load(Ordering::Relaxed), adaptive_floor_min_writers_multi_endpoint: self + .floor_runtime .me_adaptive_floor_min_writers_multi_endpoint .load(Ordering::Relaxed), adaptive_floor_recover_grace_secs: self + .floor_runtime .me_adaptive_floor_recover_grace_secs .load(Ordering::Relaxed), adaptive_floor_writers_per_core_total: self + .floor_runtime .me_adaptive_floor_writers_per_core_total .load(Ordering::Relaxed) as u16, adaptive_floor_cpu_cores_override: self + .floor_runtime .me_adaptive_floor_cpu_cores_override .load(Ordering::Relaxed) as u16, adaptive_floor_max_extra_writers_single_per_core: self + .floor_runtime .me_adaptive_floor_max_extra_writers_single_per_core .load(Ordering::Relaxed) as 
u16, adaptive_floor_max_extra_writers_multi_per_core: self + .floor_runtime .me_adaptive_floor_max_extra_writers_multi_per_core .load(Ordering::Relaxed) as u16, adaptive_floor_max_active_writers_per_core: self + .floor_runtime .me_adaptive_floor_max_active_writers_per_core .load(Ordering::Relaxed) as u16, adaptive_floor_max_warm_writers_per_core: self + .floor_runtime .me_adaptive_floor_max_warm_writers_per_core .load(Ordering::Relaxed) as u16, adaptive_floor_max_active_writers_global: self + .floor_runtime .me_adaptive_floor_max_active_writers_global .load(Ordering::Relaxed), adaptive_floor_max_warm_writers_global: self + .floor_runtime .me_adaptive_floor_max_warm_writers_global .load(Ordering::Relaxed), adaptive_floor_cpu_cores_detected: self + .floor_runtime .me_adaptive_floor_cpu_cores_detected .load(Ordering::Relaxed), adaptive_floor_cpu_cores_effective: self + .floor_runtime .me_adaptive_floor_cpu_cores_effective .load(Ordering::Relaxed), adaptive_floor_global_cap_raw: self + .floor_runtime .me_adaptive_floor_global_cap_raw .load(Ordering::Relaxed), adaptive_floor_global_cap_effective: self + .floor_runtime .me_adaptive_floor_global_cap_effective .load(Ordering::Relaxed), adaptive_floor_target_writers_total: self + .floor_runtime .me_adaptive_floor_target_writers_total .load(Ordering::Relaxed), adaptive_floor_active_cap_configured: self + .floor_runtime .me_adaptive_floor_active_cap_configured .load(Ordering::Relaxed), adaptive_floor_active_cap_effective: self + .floor_runtime .me_adaptive_floor_active_cap_effective .load(Ordering::Relaxed), adaptive_floor_warm_cap_configured: self + .floor_runtime .me_adaptive_floor_warm_cap_configured .load(Ordering::Relaxed), adaptive_floor_warm_cap_effective: self + .floor_runtime .me_adaptive_floor_warm_cap_effective .load(Ordering::Relaxed), adaptive_floor_active_writers_current: self + .floor_runtime .me_adaptive_floor_active_writers_current .load(Ordering::Relaxed), adaptive_floor_warm_writers_current: self + 
.floor_runtime .me_adaptive_floor_warm_writers_current .load(Ordering::Relaxed), me_keepalive_enabled: self.writer_lifecycle.me_keepalive_enabled, @@ -569,10 +599,16 @@ impl MePool { .writer_lifecycle .rpc_proxy_req_every_secs .load(Ordering::Relaxed), - me_reconnect_max_concurrent_per_dc: self.me_reconnect_max_concurrent_per_dc, - me_reconnect_backoff_base_ms: self.me_reconnect_backoff_base.as_millis() as u64, - me_reconnect_backoff_cap_ms: self.me_reconnect_backoff_cap.as_millis() as u64, - me_reconnect_fast_retry_count: self.me_reconnect_fast_retry_count, + me_reconnect_max_concurrent_per_dc: self + .reconnect_runtime + .me_reconnect_max_concurrent_per_dc, + me_reconnect_backoff_base_ms: self + .reconnect_runtime + .me_reconnect_backoff_base + .as_millis() as u64, + me_reconnect_backoff_cap_ms: self.reconnect_runtime.me_reconnect_backoff_cap.as_millis() + as u64, + me_reconnect_fast_retry_count: self.reconnect_runtime.me_reconnect_fast_retry_count, me_pool_drain_ttl_secs: self .drain_runtime .me_pool_drain_ttl_secs @@ -615,7 +651,10 @@ impl MePool { .single_endpoint_runtime .me_single_endpoint_shadow_rotate_every_secs .load(Ordering::Relaxed), - me_deterministic_writer_sort: self.me_deterministic_writer_sort.load(Ordering::Relaxed), + me_deterministic_writer_sort: self + .writer_selection_policy + .me_deterministic_writer_sort + .load(Ordering::Relaxed), me_writer_pick_mode: writer_pick_mode_label(self.writer_pick_mode()), me_writer_pick_sample_size: self.writer_pick_sample_size() as u8, me_socks_kdf_policy: socks_kdf_policy_label(self.socks_kdf_policy()), diff --git a/src/transport/middle_proxy/pool_writer.rs b/src/transport/middle_proxy/pool_writer.rs index c1f3de9..75f2d65 100644 --- a/src/transport/middle_proxy/pool_writer.rs +++ b/src/transport/middle_proxy/pool_writer.rs @@ -1,6 +1,6 @@ +use std::collections::HashMap; use std::io::ErrorKind; use std::net::SocketAddr; -use std::collections::HashMap; use std::sync::Arc; use std::sync::atomic::{AtomicBool, 
AtomicU8, AtomicU32, AtomicU64, Ordering}; use std::time::{Duration, Instant}; @@ -97,7 +97,8 @@ async fn ping_loop( let effective_jitter_ms = keepalive_jitter.as_millis().min(jitter_cap_ms).max(1); Duration::from_millis(rand::rng().random_range(0..=effective_jitter_ms as u64)) } else { - let jitter = rand::rng().random_range(-ME_ACTIVE_PING_JITTER_SECS..=ME_ACTIVE_PING_JITTER_SECS); + let jitter = + rand::rng().random_range(-ME_ACTIVE_PING_JITTER_SECS..=ME_ACTIVE_PING_JITTER_SECS); let wait = (ME_ACTIVE_PING_SECS as i64 + jitter).max(5) as u64; Duration::from_secs(wait) }; @@ -116,9 +117,11 @@ async fn ping_loop( } let jitter_cap_ms = interval.as_millis() / 2; let effective_jitter_ms = keepalive_jitter.as_millis().min(jitter_cap_ms).max(1); - interval + Duration::from_millis(rand::rng().random_range(0..=effective_jitter_ms as u64)) + interval + + Duration::from_millis(rand::rng().random_range(0..=effective_jitter_ms as u64)) } else { - let jitter = rand::rng().random_range(-ME_ACTIVE_PING_JITTER_SECS..=ME_ACTIVE_PING_JITTER_SECS); + let jitter = + rand::rng().random_range(-ME_ACTIVE_PING_JITTER_SECS..=ME_ACTIVE_PING_JITTER_SECS); let secs = (ME_ACTIVE_PING_SECS as i64 + jitter).max(5) as u64; Duration::from_secs(secs) }; @@ -193,7 +196,8 @@ async fn rpc_proxy_req_signal_loop( .as_millis() .min(jitter_cap_ms) .max(1); - interval + Duration::from_millis(rand::rng().random_range(0..=effective_jitter_ms as u64)) + interval + + Duration::from_millis(rand::rng().random_range(0..=effective_jitter_ms as u64)) }; tokio::select! 
{ @@ -365,9 +369,8 @@ impl MePool { let draining_started_at_epoch_secs = Arc::new(AtomicU64::new(0)); let drain_deadline_epoch_secs = Arc::new(AtomicU64::new(0)); let allow_drain_fallback = Arc::new(AtomicBool::new(false)); - let (tx, rx) = mpsc::channel::( - self.writer_lifecycle.writer_cmd_channel_capacity, - ); + let (tx, rx) = + mpsc::channel::(self.writer_lifecycle.writer_cmd_channel_capacity); let rpc_writer = RpcWriter { writer: hs.wr, key: hs.write_key, @@ -430,7 +433,7 @@ impl MePool { let cancel_signal = cancel.clone(); let cancel_select = cancel.clone(); let cancel_cleanup = cancel.clone(); - let reader_route_data_wait_ms = self.me_reader_route_data_wait_ms.clone(); + let reader_route_data_wait_ms = self.transport_policy.me_reader_route_data_wait_ms.clone(); tokio::spawn(async move { // Reader MUST be the first branch in biased select! to avoid read starvation. diff --git a/src/transport/middle_proxy/registry.rs b/src/transport/middle_proxy/registry.rs index 8277e7f..6d830a1 100644 --- a/src/transport/middle_proxy/registry.rs +++ b/src/transport/middle_proxy/registry.rs @@ -162,7 +162,8 @@ impl ConnRegistry { inner.routing.map.remove(&id); inner.binding.meta.remove(&id); if let Some(writer_id) = inner.binding.writer_for_conn.remove(&id) { - let became_empty = if let Some(set) = inner.binding.conns_for_writer.get_mut(&writer_id) { + let became_empty = if let Some(set) = inner.binding.conns_for_writer.get_mut(&writer_id) + { set.remove(&id); set.is_empty() } else { @@ -337,7 +338,10 @@ impl ConnRegistry { inner.binding.meta.insert(conn_id, meta.clone()); inner.binding.last_meta_for_writer.insert(writer_id, meta); - inner.binding.writer_idle_since_epoch_secs.remove(&writer_id); + inner + .binding + .writer_idle_since_epoch_secs + .remove(&writer_id); inner .binding .conns_for_writer @@ -375,7 +379,12 @@ impl ConnRegistry { let inner = self.inner.read().await; let mut out = HashMap::::with_capacity(writer_ids.len()); for writer_id in writer_ids { - if let 
Some(idle_since) = inner.binding.writer_idle_since_epoch_secs.get(writer_id).copied() { + if let Some(idle_since) = inner + .binding + .writer_idle_since_epoch_secs + .get(writer_id) + .copied() + { out.insert(*writer_id, idle_since); } } @@ -456,7 +465,10 @@ impl ConnRegistry { let mut inner = self.inner.write().await; inner.binding.writers.remove(&writer_id); inner.binding.last_meta_for_writer.remove(&writer_id); - inner.binding.writer_idle_since_epoch_secs.remove(&writer_id); + inner + .binding + .writer_idle_since_epoch_secs + .remove(&writer_id); let conns = inner .binding .conns_for_writer @@ -510,7 +522,10 @@ impl ConnRegistry { inner.binding.writers.remove(&writer_id); inner.binding.last_meta_for_writer.remove(&writer_id); - inner.binding.writer_idle_since_epoch_secs.remove(&writer_id); + inner + .binding + .writer_idle_since_epoch_secs + .remove(&writer_id); inner.binding.conns_for_writer.remove(&writer_id); true } diff --git a/src/transport/middle_proxy/send.rs b/src/transport/middle_proxy/send.rs index faec2ec..9a5c828 100644 --- a/src/transport/middle_proxy/send.rs +++ b/src/transport/middle_proxy/send.rs @@ -153,7 +153,8 @@ impl MePool { MeRouteNoWriterMode::InlineRecoveryLegacy => { self.stats.increment_me_inline_recovery_total(); if !unknown_target_dc { - for _ in 0..self.route_runtime.me_route_inline_recovery_attempts.max(1) + for _ in + 0..self.route_runtime.me_route_inline_recovery_attempts.max(1) { for family in self.family_order() { let map = match family { @@ -319,8 +320,9 @@ impl MePool { } } MeRouteNoWriterMode::HybridAsyncPersistent => { - let total_deadline = *hybrid_total_deadline - .get_or_insert_with(|| Instant::now() + self.hybrid_total_wait_budget()); + let total_deadline = *hybrid_total_deadline.get_or_insert_with(|| { + Instant::now() + self.hybrid_total_wait_budget() + }); if Instant::now() >= total_deadline { self.on_hybrid_timeout(total_deadline, routed_dc); return Err(ProxyError::Proxy( @@ -368,7 +370,11 @@ impl MePool { 
pick_sample_size, ) } else { - if self.me_deterministic_writer_sort.load(Ordering::Relaxed) { + if self + .writer_selection_policy + .me_deterministic_writer_sort + .load(Ordering::Relaxed) + { candidate_indices.sort_by(|lhs, rhs| { let left = &writers_snapshot[*lhs]; let right = &writers_snapshot[*rhs]; @@ -490,18 +496,18 @@ impl MePool { .increment_me_writer_pick_blocking_fallback_total(); let effective_our_addr = SocketAddr::new(w.source_ip, our_addr.port()); let (payload, meta) = build_routed_payload(effective_our_addr); - let reserve_result = if let Some(timeout) = self.route_runtime.me_route_blocking_send_timeout - { - match tokio::time::timeout(timeout, w.tx.clone().reserve_owned()).await { - Ok(result) => result, - Err(_) => { - self.stats.increment_me_writer_pick_full_total(pick_mode); - continue; + let reserve_result = + if let Some(timeout) = self.route_runtime.me_route_blocking_send_timeout { + match tokio::time::timeout(timeout, w.tx.clone().reserve_owned()).await { + Ok(result) => result, + Err(_) => { + self.stats.increment_me_writer_pick_full_total(pick_mode); + continue; + } } - } - } else { - w.tx.clone().reserve_owned().await - }; + } else { + w.tx.clone().reserve_owned().await + }; match reserve_result { Ok(permit) => { if !self.registry.bind_writer(conn_id, w.id, meta).await { @@ -637,8 +643,7 @@ impl MePool { hybrid_last_recovery_at: &mut Option, hybrid_wait_step: Duration, ) { - if !self.try_consume_hybrid_recovery_trigger_slot(HYBRID_RECOVERY_TRIGGER_MIN_INTERVAL_MS) - { + if !self.try_consume_hybrid_recovery_trigger_slot(HYBRID_RECOVERY_TRIGGER_MIN_INTERVAL_MS) { return; } if let Some(last) = *hybrid_last_recovery_at @@ -691,12 +696,8 @@ impl MePool { match self .route_runtime .me_route_hybrid_timeout_warn_epoch_ms - .compare_exchange_weak( - last_warn_ms, - now_ms, - Ordering::AcqRel, - Ordering::Relaxed, - ) { + .compare_exchange_weak(last_warn_ms, now_ms, Ordering::AcqRel, Ordering::Relaxed) + { Ok(_) => { warn!( routed_dc, @@ -724,12 
+725,8 @@ impl MePool { match self .route_runtime .me_async_recovery_last_trigger_epoch_ms - .compare_exchange_weak( - last_trigger_ms, - now_ms, - Ordering::AcqRel, - Ordering::Relaxed, - ) { + .compare_exchange_weak(last_trigger_ms, now_ms, Ordering::AcqRel, Ordering::Relaxed) + { Ok(_) => return true, Err(actual) => last_trigger_ms = actual, } From b94746a6e0b34d90333dca54aeaae356b709e378 Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:26:20 +0300 Subject: [PATCH 13/17] Dashmap-driven Routing + Health Parallel + Family Runtime State Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/maestro/mod.rs | 16 +- src/tls_front/fetcher.rs | 5 +- src/transport/middle_proxy/health.rs | 290 ++++++++++++++++++------- src/transport/middle_proxy/pool.rs | 12 +- src/transport/middle_proxy/registry.rs | 215 ++++++++---------- 5 files changed, 310 insertions(+), 228 deletions(-) diff --git a/src/maestro/mod.rs b/src/maestro/mod.rs index 7d3b168..5f3fd3a 100644 --- a/src/maestro/mod.rs +++ b/src/maestro/mod.rs @@ -115,15 +115,13 @@ pub async fn run() -> std::result::Result<(), Box> { ); std::process::exit(1); } - } else { - if let Err(e) = std::fs::create_dir_all(data_path) { - eprintln!( - "[telemt] Can't create data_path {}: {}", - data_path.display(), - e - ); - std::process::exit(1); - } + } else if let Err(e) = std::fs::create_dir_all(data_path) { + eprintln!( + "[telemt] Can't create data_path {}: {}", + data_path.display(), + e + ); + std::process::exit(1); } if let Err(e) = std::env::set_current_dir(data_path) { diff --git a/src/tls_front/fetcher.rs b/src/tls_front/fetcher.rs index bbfc336..45d56ce 100644 --- a/src/tls_front/fetcher.rs +++ b/src/tls_front/fetcher.rs @@ -244,10 +244,9 @@ fn order_profiles( if let Some(pos) = ordered .iter() .position(|profile| *profile == cached.profile) + && pos != 0 { - if pos != 0 { - ordered.swap(0, pos); - } + ordered.swap(0, pos); } } diff --git 
a/src/transport/middle_proxy/health.rs b/src/transport/middle_proxy/health.rs index ca6e681..257d8f3 100644 --- a/src/transport/middle_proxy/health.rs +++ b/src/transport/middle_proxy/health.rs @@ -8,6 +8,7 @@ use std::time::{Duration, Instant}; use rand::RngExt; use tokio::sync::Semaphore; +use tokio::task::JoinSet; use tracing::{debug, info, warn}; use crate::config::MeFloorMode; @@ -15,6 +16,7 @@ use crate::crypto::SecureRandom; use crate::network::IpFamily; use super::MePool; +use super::pool::MeFamilyRuntimeState; const JITTER_FRAC_NUM: u64 = 2; // jitter up to 50% of backoff #[allow(dead_code)] @@ -28,6 +30,9 @@ const HEALTH_RECONNECT_BUDGET_PER_CORE: usize = 2; const HEALTH_RECONNECT_BUDGET_PER_DC: usize = 1; const HEALTH_RECONNECT_BUDGET_MIN: usize = 4; const HEALTH_RECONNECT_BUDGET_MAX: usize = 128; +const FAMILY_SUPPRESS_FAIL_STREAK_THRESHOLD: u32 = 5; +const FAMILY_SUPPRESS_DURATION_SECS: u64 = 60; +const FAMILY_RECOVER_SUCCESS_STREAK_TARGET: u32 = 2; const HEALTH_DRAIN_CLOSE_BUDGET_PER_CORE: usize = 16; const HEALTH_DRAIN_CLOSE_BUDGET_MIN: usize = 16; const HEALTH_DRAIN_CLOSE_BUDGET_MAX: usize = 256; @@ -57,6 +62,17 @@ struct FamilyFloorPlan { target_writers_total: usize, } +#[derive(Debug)] +struct FamilyReconnectOutcome { + key: (i32, IpFamily), + dc: i32, + family: IpFamily, + alive: usize, + required: usize, + endpoint_count: usize, + restored: usize, +} + pub async fn me_health_monitor(pool: Arc, rng: Arc, _min_connections: usize) { let mut backoff: HashMap<(i32, IpFamily), u64> = HashMap::new(); let mut next_attempt: HashMap<(i32, IpFamily), Instant> = HashMap::new(); @@ -115,6 +131,8 @@ pub async fn me_health_monitor(pool: Arc, rng: Arc, _min_c &mut floor_warn_next_allowed, ) .await; + update_family_runtime_state(&pool, IpFamily::V4, v4_degraded); + update_family_runtime_state(&pool, IpFamily::V6, v6_degraded); degraded_interval = v4_degraded || v6_degraded; } } @@ -430,6 +448,10 @@ async fn check_family( floor_plan.active_writers_current, 
floor_plan.warm_writers_current, ); + let live_writer_ids_by_addr = Arc::new(live_writer_ids_by_addr); + let writer_idle_since = Arc::new(writer_idle_since); + let bound_clients_by_writer = Arc::new(bound_clients_by_writer); + let mut reconnect_set = JoinSet::::new(); for (dc, endpoints) in dc_endpoints { if endpoints.is_empty() { @@ -503,9 +525,9 @@ async fn check_family( &endpoints, alive, required, - &live_writer_ids_by_addr, - &writer_idle_since, - &bound_clients_by_writer, + live_writer_ids_by_addr.as_ref(), + writer_idle_since.as_ref(), + bound_clients_by_writer.as_ref(), idle_refresh_next_attempt, ) .await; @@ -518,8 +540,8 @@ async fn check_family( &endpoints, alive, required, - &live_writer_ids_by_addr, - &bound_clients_by_writer, + live_writer_ids_by_addr.as_ref(), + bound_clients_by_writer.as_ref(), shadow_rotate_deadline, ) .await; @@ -575,121 +597,165 @@ async fn check_family( continue; } *inflight.entry(key).or_insert(0) += 1; - - let mut restored = 0usize; - for _ in 0..missing { - let Ok(reconnect_permit) = reconnect_sem.clone().try_acquire_owned() else { - break; - }; - if pool.active_contour_writer_count_total().await - >= floor_plan.active_cap_effective_total - { - let swapped = maybe_swap_idle_writer_for_cap( - pool, - rng, - dc, - family, - &endpoints, - &live_writer_ids_by_addr, - &writer_idle_since, - &bound_clients_by_writer, + let pool_for_reconnect = pool.clone(); + let rng_for_reconnect = rng.clone(); + let reconnect_sem_for_dc = reconnect_sem.clone(); + let endpoints_for_dc = endpoints.clone(); + let live_writer_ids_by_addr_for_dc = live_writer_ids_by_addr.clone(); + let writer_idle_since_for_dc = writer_idle_since.clone(); + let bound_clients_by_writer_for_dc = bound_clients_by_writer.clone(); + let active_cap_effective_total = floor_plan.active_cap_effective_total; + reconnect_set.spawn(async move { + let mut restored = 0usize; + for _ in 0..missing { + let Ok(reconnect_permit) = reconnect_sem_for_dc.clone().try_acquire_owned() else { 
+ break; + }; + if pool_for_reconnect.active_contour_writer_count_total().await + >= active_cap_effective_total + { + let swapped = maybe_swap_idle_writer_for_cap( + &pool_for_reconnect, + &rng_for_reconnect, + dc, + family, + &endpoints_for_dc, + live_writer_ids_by_addr_for_dc.as_ref(), + writer_idle_since_for_dc.as_ref(), + bound_clients_by_writer_for_dc.as_ref(), + ) + .await; + if swapped { + pool_for_reconnect + .stats + .increment_me_floor_swap_idle_total(); + restored += 1; + continue; + } + pool_for_reconnect + .stats + .increment_me_floor_cap_block_total(); + pool_for_reconnect + .stats + .increment_me_floor_swap_idle_failed_total(); + debug!( + dc = %dc, + ?family, + alive, + required, + active_cap_effective_total, + "Adaptive floor cap reached, reconnect attempt blocked" + ); + break; + } + let res = tokio::time::timeout( + pool_for_reconnect.reconnect_runtime.me_one_timeout, + pool_for_reconnect.connect_endpoints_round_robin( + dc, + &endpoints_for_dc, + rng_for_reconnect.as_ref(), + ), ) .await; - if swapped { - pool.stats.increment_me_floor_swap_idle_total(); - restored += 1; - continue; + match res { + Ok(true) => { + restored += 1; + pool_for_reconnect.stats.increment_me_reconnect_success(); + } + Ok(false) => { + pool_for_reconnect.stats.increment_me_reconnect_attempt(); + debug!(dc = %dc, ?family, "ME round-robin reconnect failed") + } + Err(_) => { + pool_for_reconnect.stats.increment_me_reconnect_attempt(); + debug!(dc = %dc, ?family, "ME reconnect timed out"); + } } - pool.stats.increment_me_floor_cap_block_total(); - pool.stats.increment_me_floor_swap_idle_failed_total(); - debug!( - dc = %dc, - ?family, - alive, - required, - active_cap_effective_total = floor_plan.active_cap_effective_total, - "Adaptive floor cap reached, reconnect attempt blocked" - ); - break; + drop(reconnect_permit); } - let res = tokio::time::timeout( - pool.reconnect_runtime.me_one_timeout, - pool.connect_endpoints_round_robin(dc, &endpoints, rng.as_ref()), - ) - 
.await; - match res { - Ok(true) => { - restored += 1; - pool.stats.increment_me_reconnect_success(); - } - Ok(false) => { - pool.stats.increment_me_reconnect_attempt(); - debug!(dc = %dc, ?family, "ME round-robin reconnect failed") - } - Err(_) => { - pool.stats.increment_me_reconnect_attempt(); - debug!(dc = %dc, ?family, "ME reconnect timed out"); - } - } - drop(reconnect_permit); - } - let now_alive = alive + restored; - if now_alive >= required { - info!( - dc = %dc, - ?family, - alive = now_alive, + FamilyReconnectOutcome { + key, + dc, + family, + alive, required, - endpoint_count = endpoints.len(), + endpoint_count: endpoints_for_dc.len(), + restored, + } + }); + } + + while let Some(joined) = reconnect_set.join_next().await { + let outcome = match joined { + Ok(outcome) => outcome, + Err(join_error) => { + debug!(error = %join_error, "Health reconnect task failed"); + continue; + } + }; + let now = Instant::now(); + let now_alive = outcome.alive + outcome.restored; + if now_alive >= outcome.required { + info!( + dc = %outcome.dc, + family = ?outcome.family, + alive = now_alive, + required = outcome.required, + endpoint_count = outcome.endpoint_count, "ME writer floor restored for DC" ); backoff.insert( - key, + outcome.key, pool.reconnect_runtime.me_reconnect_backoff_base.as_millis() as u64, ); let jitter = pool.reconnect_runtime.me_reconnect_backoff_base.as_millis() as u64 / JITTER_FRAC_NUM; let wait = pool.reconnect_runtime.me_reconnect_backoff_base + Duration::from_millis(rand::rng().random_range(0..=jitter.max(1))); - next_attempt.insert(key, now + wait); + next_attempt.insert(outcome.key, now + wait); } else { let curr = *backoff - .get(&key) + .get(&outcome.key) .unwrap_or(&(pool.reconnect_runtime.me_reconnect_backoff_base.as_millis() as u64)); let next_ms = (curr.saturating_mul(2)) .min(pool.reconnect_runtime.me_reconnect_backoff_cap.as_millis() as u64); - backoff.insert(key, next_ms); + backoff.insert(outcome.key, next_ms); let jitter = next_ms / 
JITTER_FRAC_NUM; let wait = Duration::from_millis(next_ms) + Duration::from_millis(rand::rng().random_range(0..=jitter.max(1))); - next_attempt.insert(key, now + wait); + next_attempt.insert(outcome.key, now + wait); if pool.is_runtime_ready() { let warn_cooldown = pool.warn_rate_limit_duration(); - if should_emit_rate_limited_warn(floor_warn_next_allowed, key, now, warn_cooldown) { + if should_emit_rate_limited_warn( + floor_warn_next_allowed, + outcome.key, + now, + warn_cooldown, + ) { warn!( - dc = %dc, - ?family, + dc = %outcome.dc, + family = ?outcome.family, alive = now_alive, - required, - endpoint_count = endpoints.len(), + required = outcome.required, + endpoint_count = outcome.endpoint_count, backoff_ms = next_ms, "DC writer floor is below required level, scheduled reconnect" ); } } else { info!( - dc = %dc, - ?family, + dc = %outcome.dc, + family = ?outcome.family, alive = now_alive, - required, - endpoint_count = endpoints.len(), + required = outcome.required, + endpoint_count = outcome.endpoint_count, backoff_ms = next_ms, "DC writer floor is below required level during startup, scheduled reconnect" ); } } - if let Some(v) = inflight.get_mut(&key) { + if let Some(v) = inflight.get_mut(&outcome.key) { *v = v.saturating_sub(1); } } @@ -706,6 +772,68 @@ fn health_reconnect_budget(pool: &Arc, dc_groups: usize) -> usize { .clamp(HEALTH_RECONNECT_BUDGET_MIN, HEALTH_RECONNECT_BUDGET_MAX) } +fn update_family_runtime_state(pool: &Arc, family: IpFamily, degraded: bool) { + let now_epoch_secs = MePool::now_epoch_secs(); + let previous_state = pool.family_runtime_state(family); + let mut state_since_epoch_secs = pool.family_runtime_state_since_epoch_secs(family); + let previous_suppressed_until_epoch_secs = pool.family_suppressed_until_epoch_secs(family); + let previous_fail_streak = pool.family_fail_streak(family); + let previous_recover_success_streak = pool.family_recover_success_streak(family); + + let (next_state, suppressed_until_epoch_secs, fail_streak, 
recover_success_streak) = + if previous_suppressed_until_epoch_secs > now_epoch_secs { + let fail_streak = if degraded { + previous_fail_streak.saturating_add(1) + } else { + previous_fail_streak + }; + ( + MeFamilyRuntimeState::Suppressed, + previous_suppressed_until_epoch_secs, + fail_streak, + 0, + ) + } else if degraded { + let fail_streak = previous_fail_streak.saturating_add(1); + if fail_streak >= FAMILY_SUPPRESS_FAIL_STREAK_THRESHOLD { + ( + MeFamilyRuntimeState::Suppressed, + now_epoch_secs.saturating_add(FAMILY_SUPPRESS_DURATION_SECS), + fail_streak, + 0, + ) + } else { + (MeFamilyRuntimeState::Degraded, 0, fail_streak, 0) + } + } else if matches!(previous_state, MeFamilyRuntimeState::Healthy) { + (MeFamilyRuntimeState::Healthy, 0, 0, 0) + } else { + let recover_success_streak = previous_recover_success_streak.saturating_add(1); + if recover_success_streak >= FAMILY_RECOVER_SUCCESS_STREAK_TARGET { + (MeFamilyRuntimeState::Healthy, 0, 0, 0) + } else { + ( + MeFamilyRuntimeState::Recovering, + 0, + 0, + recover_success_streak, + ) + } + }; + + if next_state != previous_state || state_since_epoch_secs == 0 { + state_since_epoch_secs = now_epoch_secs; + } + pool.set_family_runtime_state( + family, + next_state, + state_since_epoch_secs, + suppressed_until_epoch_secs, + fail_streak, + recover_success_streak, + ); +} + fn should_emit_rate_limited_warn( next_allowed: &mut HashMap<(i32, IpFamily), Instant>, key: (i32, IpFamily), diff --git a/src/transport/middle_proxy/pool.rs b/src/transport/middle_proxy/pool.rs index af37a0b..249d387 100644 --- a/src/transport/middle_proxy/pool.rs +++ b/src/transport/middle_proxy/pool.rs @@ -202,15 +202,6 @@ impl FamilyHealthSnapshot { } impl MeFamilyRuntimeState { - pub(crate) fn from_u8(value: u8) -> Self { - match value { - 1 => Self::Degraded, - 2 => Self::Suppressed, - 3 => Self::Recovering, - _ => Self::Healthy, - } - } - pub(crate) fn as_str(self) -> &'static str { match self { Self::Healthy => "healthy", @@ -852,12 
+843,11 @@ impl MePool { } pub(super) fn notify_writer_epoch(&self) { - let _ = self.writer_epoch.send_modify(|epoch| { + self.writer_epoch.send_modify(|epoch| { *epoch = epoch.wrapping_add(1); }); } - #[allow(dead_code)] pub(super) fn set_family_runtime_state( &self, family: IpFamily, diff --git a/src/transport/middle_proxy/registry.rs b/src/transport/middle_proxy/registry.rs index 6d830a1..ff4a68b 100644 --- a/src/transport/middle_proxy/registry.rs +++ b/src/transport/middle_proxy/registry.rs @@ -3,8 +3,9 @@ use std::net::SocketAddr; use std::sync::atomic::{AtomicU8, AtomicU64, Ordering}; use std::time::{Duration, SystemTime, UNIX_EPOCH}; +use dashmap::DashMap; use tokio::sync::mpsc::error::TrySendError; -use tokio::sync::{RwLock, mpsc}; +use tokio::sync::{Mutex, mpsc}; use super::MeResponse; use super::codec::WriterCommand; @@ -50,16 +51,15 @@ pub(super) struct WriterActivitySnapshot { pub active_sessions_by_target_dc: HashMap, } -struct RegistryInner { - routing: RoutingTable, - binding: BindingState, -} - struct RoutingTable { - map: HashMap>, + map: DashMap>, } struct BindingState { + inner: Mutex, +} + +struct BindingInner { writers: HashMap>, writer_for_conn: HashMap, conns_for_writer: HashMap>, @@ -68,26 +68,22 @@ struct BindingState { writer_idle_since_epoch_secs: HashMap, } -impl RegistryInner { +impl BindingInner { fn new() -> Self { Self { - routing: RoutingTable { - map: HashMap::new(), - }, - binding: BindingState { - writers: HashMap::new(), - writer_for_conn: HashMap::new(), - conns_for_writer: HashMap::new(), - meta: HashMap::new(), - last_meta_for_writer: HashMap::new(), - writer_idle_since_epoch_secs: HashMap::new(), - }, + writers: HashMap::new(), + writer_for_conn: HashMap::new(), + conns_for_writer: HashMap::new(), + meta: HashMap::new(), + last_meta_for_writer: HashMap::new(), + writer_idle_since_epoch_secs: HashMap::new(), } } } pub struct ConnRegistry { - inner: RwLock, + routing: RoutingTable, + binding: BindingState, next_id: AtomicU64, 
route_channel_capacity: usize, route_backpressure_base_timeout_ms: AtomicU64, @@ -106,7 +102,12 @@ impl ConnRegistry { pub fn with_route_channel_capacity(route_channel_capacity: usize) -> Self { let start = rand::random::() | 1; Self { - inner: RwLock::new(RegistryInner::new()), + routing: RoutingTable { + map: DashMap::new(), + }, + binding: BindingState { + inner: Mutex::new(BindingInner::new()), + }, next_id: AtomicU64::new(start), route_channel_capacity: route_channel_capacity.max(1), route_backpressure_base_timeout_ms: AtomicU64::new(ROUTE_BACKPRESSURE_BASE_TIMEOUT_MS), @@ -142,15 +143,14 @@ impl ConnRegistry { pub async fn register(&self) -> (u64, mpsc::Receiver) { let id = self.next_id.fetch_add(1, Ordering::Relaxed); let (tx, rx) = mpsc::channel(self.route_channel_capacity); - self.inner.write().await.routing.map.insert(id, tx); + self.routing.map.insert(id, tx); (id, rx) } pub async fn register_writer(&self, writer_id: u64, tx: mpsc::Sender) { - let mut inner = self.inner.write().await; - inner.binding.writers.insert(writer_id, tx); - inner - .binding + let mut binding = self.binding.inner.lock().await; + binding.writers.insert(writer_id, tx); + binding .conns_for_writer .entry(writer_id) .or_insert_with(HashSet::new); @@ -158,20 +158,18 @@ impl ConnRegistry { /// Unregister connection, returning associated writer_id if any. 
pub async fn unregister(&self, id: u64) -> Option { - let mut inner = self.inner.write().await; - inner.routing.map.remove(&id); - inner.binding.meta.remove(&id); - if let Some(writer_id) = inner.binding.writer_for_conn.remove(&id) { - let became_empty = if let Some(set) = inner.binding.conns_for_writer.get_mut(&writer_id) - { + self.routing.map.remove(&id); + let mut binding = self.binding.inner.lock().await; + binding.meta.remove(&id); + if let Some(writer_id) = binding.writer_for_conn.remove(&id) { + let became_empty = if let Some(set) = binding.conns_for_writer.get_mut(&writer_id) { set.remove(&id); set.is_empty() } else { false }; if became_empty { - inner - .binding + binding .writer_idle_since_epoch_secs .insert(writer_id, Self::now_epoch_secs()); } @@ -182,10 +180,7 @@ impl ConnRegistry { #[allow(dead_code)] pub async fn route(&self, id: u64, resp: MeResponse) -> RouteResult { - let tx = { - let inner = self.inner.read().await; - inner.routing.map.get(&id).cloned() - }; + let tx = self.routing.map.get(&id).map(|entry| entry.value().clone()); let Some(tx) = tx else { return RouteResult::NoConn; @@ -238,10 +233,7 @@ impl ConnRegistry { } pub async fn route_nowait(&self, id: u64, resp: MeResponse) -> RouteResult { - let tx = { - let inner = self.inner.read().await; - inner.routing.map.get(&id).cloned() - }; + let tx = self.routing.map.get(&id).map(|entry| entry.value().clone()); let Some(tx) = tx else { return RouteResult::NoConn; @@ -264,10 +256,7 @@ impl ConnRegistry { return self.route_nowait(id, resp).await; } - let tx = { - let inner = self.inner.read().await; - inner.routing.map.get(&id).cloned() - }; + let tx = self.routing.map.get(&id).map(|entry| entry.value().clone()); let Some(tx) = tx else { return RouteResult::NoConn; @@ -306,44 +295,39 @@ impl ConnRegistry { } pub async fn bind_writer(&self, conn_id: u64, writer_id: u64, meta: ConnMeta) -> bool { - let mut inner = self.inner.write().await; + let mut binding = self.binding.inner.lock().await; // 
ROUTING IS THE SOURCE OF TRUTH: // never keep/attach writer binding for a connection that is already // absent from the routing table. - if !inner.routing.map.contains_key(&conn_id) { + if !self.routing.map.contains_key(&conn_id) { return false; } - if !inner.binding.writers.contains_key(&writer_id) { + if !binding.writers.contains_key(&writer_id) { return false; } - let previous_writer_id = inner.binding.writer_for_conn.insert(conn_id, writer_id); + let previous_writer_id = binding.writer_for_conn.insert(conn_id, writer_id); if let Some(previous_writer_id) = previous_writer_id && previous_writer_id != writer_id { let became_empty = - if let Some(set) = inner.binding.conns_for_writer.get_mut(&previous_writer_id) { + if let Some(set) = binding.conns_for_writer.get_mut(&previous_writer_id) { set.remove(&conn_id); set.is_empty() } else { false }; if became_empty { - inner - .binding + binding .writer_idle_since_epoch_secs .insert(previous_writer_id, Self::now_epoch_secs()); } } - inner.binding.meta.insert(conn_id, meta.clone()); - inner.binding.last_meta_for_writer.insert(writer_id, meta); - inner - .binding - .writer_idle_since_epoch_secs - .remove(&writer_id); - inner - .binding + binding.meta.insert(conn_id, meta.clone()); + binding.last_meta_for_writer.insert(writer_id, meta); + binding.writer_idle_since_epoch_secs.remove(&writer_id); + binding .conns_for_writer .entry(writer_id) .or_insert_with(HashSet::new) @@ -352,39 +336,32 @@ impl ConnRegistry { } pub async fn mark_writer_idle(&self, writer_id: u64) { - let mut inner = self.inner.write().await; - inner - .binding + let mut binding = self.binding.inner.lock().await; + binding .conns_for_writer .entry(writer_id) .or_insert_with(HashSet::new); - inner - .binding + binding .writer_idle_since_epoch_secs .entry(writer_id) .or_insert(Self::now_epoch_secs()); } pub async fn get_last_writer_meta(&self, writer_id: u64) -> Option { - let inner = self.inner.read().await; - 
inner.binding.last_meta_for_writer.get(&writer_id).cloned() + let binding = self.binding.inner.lock().await; + binding.last_meta_for_writer.get(&writer_id).cloned() } pub async fn writer_idle_since_snapshot(&self) -> HashMap { - let inner = self.inner.read().await; - inner.binding.writer_idle_since_epoch_secs.clone() + let binding = self.binding.inner.lock().await; + binding.writer_idle_since_epoch_secs.clone() } pub async fn writer_idle_since_for_writer_ids(&self, writer_ids: &[u64]) -> HashMap { - let inner = self.inner.read().await; + let binding = self.binding.inner.lock().await; let mut out = HashMap::::with_capacity(writer_ids.len()); for writer_id in writer_ids { - if let Some(idle_since) = inner - .binding - .writer_idle_since_epoch_secs - .get(writer_id) - .copied() - { + if let Some(idle_since) = binding.writer_idle_since_epoch_secs.get(writer_id).copied() { out.insert(*writer_id, idle_since); } } @@ -392,14 +369,14 @@ impl ConnRegistry { } pub(super) async fn writer_activity_snapshot(&self) -> WriterActivitySnapshot { - let inner = self.inner.read().await; + let binding = self.binding.inner.lock().await; let mut bound_clients_by_writer = HashMap::::new(); let mut active_sessions_by_target_dc = HashMap::::new(); - for (writer_id, conn_ids) in &inner.binding.conns_for_writer { + for (writer_id, conn_ids) in &binding.conns_for_writer { bound_clients_by_writer.insert(*writer_id, conn_ids.len()); } - for conn_meta in inner.binding.meta.values() { + for conn_meta in binding.meta.values() { if conn_meta.target_dc == 0 { continue; } @@ -415,19 +392,18 @@ impl ConnRegistry { } pub async fn get_writer(&self, conn_id: u64) -> Option { - let mut inner = self.inner.write().await; + let mut binding = self.binding.inner.lock().await; // ROUTING IS THE SOURCE OF TRUTH: // stale bindings are ignored and lazily cleaned when routing no longer // contains the connection. 
- if !inner.routing.map.contains_key(&conn_id) { - inner.binding.meta.remove(&conn_id); - if let Some(stale_writer_id) = inner.binding.writer_for_conn.remove(&conn_id) - && let Some(conns) = inner.binding.conns_for_writer.get_mut(&stale_writer_id) + if !self.routing.map.contains_key(&conn_id) { + binding.meta.remove(&conn_id); + if let Some(stale_writer_id) = binding.writer_for_conn.remove(&conn_id) + && let Some(conns) = binding.conns_for_writer.get_mut(&stale_writer_id) { conns.remove(&conn_id); if conns.is_empty() { - inner - .binding + binding .writer_idle_since_epoch_secs .insert(stale_writer_id, Self::now_epoch_secs()); } @@ -435,15 +411,14 @@ impl ConnRegistry { return None; } - let writer_id = inner.binding.writer_for_conn.get(&conn_id).copied()?; - let Some(writer) = inner.binding.writers.get(&writer_id).cloned() else { - inner.binding.writer_for_conn.remove(&conn_id); - inner.binding.meta.remove(&conn_id); - if let Some(conns) = inner.binding.conns_for_writer.get_mut(&writer_id) { + let writer_id = binding.writer_for_conn.get(&conn_id).copied()?; + let Some(writer) = binding.writers.get(&writer_id).cloned() else { + binding.writer_for_conn.remove(&conn_id); + binding.meta.remove(&conn_id); + if let Some(conns) = binding.conns_for_writer.get_mut(&writer_id) { conns.remove(&conn_id); if conns.is_empty() { - inner - .binding + binding .writer_idle_since_epoch_secs .insert(writer_id, Self::now_epoch_secs()); } @@ -457,20 +432,16 @@ impl ConnRegistry { } pub async fn active_conn_ids(&self) -> Vec { - let inner = self.inner.read().await; - inner.binding.writer_for_conn.keys().copied().collect() + let binding = self.binding.inner.lock().await; + binding.writer_for_conn.keys().copied().collect() } pub async fn writer_lost(&self, writer_id: u64) -> Vec { - let mut inner = self.inner.write().await; - inner.binding.writers.remove(&writer_id); - inner.binding.last_meta_for_writer.remove(&writer_id); - inner - .binding - .writer_idle_since_epoch_secs - 
.remove(&writer_id); - let conns = inner - .binding + let mut binding = self.binding.inner.lock().await; + binding.writers.remove(&writer_id); + binding.last_meta_for_writer.remove(&writer_id); + binding.writer_idle_since_epoch_secs.remove(&writer_id); + let conns = binding .conns_for_writer .remove(&writer_id) .unwrap_or_default() @@ -479,11 +450,11 @@ impl ConnRegistry { let mut out = Vec::new(); for conn_id in conns { - if inner.binding.writer_for_conn.get(&conn_id).copied() != Some(writer_id) { + if binding.writer_for_conn.get(&conn_id).copied() != Some(writer_id) { continue; } - inner.binding.writer_for_conn.remove(&conn_id); - if let Some(m) = inner.binding.meta.get(&conn_id) { + binding.writer_for_conn.remove(&conn_id); + if let Some(m) = binding.meta.get(&conn_id) { out.push(BoundConn { conn_id, meta: m.clone(), @@ -495,14 +466,13 @@ impl ConnRegistry { #[allow(dead_code)] pub async fn get_meta(&self, conn_id: u64) -> Option { - let inner = self.inner.read().await; - inner.binding.meta.get(&conn_id).cloned() + let binding = self.binding.inner.lock().await; + binding.meta.get(&conn_id).cloned() } pub async fn is_writer_empty(&self, writer_id: u64) -> bool { - let inner = self.inner.read().await; - inner - .binding + let binding = self.binding.inner.lock().await; + binding .conns_for_writer .get(&writer_id) .map(|s| s.is_empty()) @@ -511,8 +481,8 @@ impl ConnRegistry { #[allow(dead_code)] pub async fn unregister_writer_if_empty(&self, writer_id: u64) -> bool { - let mut inner = self.inner.write().await; - let Some(conn_ids) = inner.binding.conns_for_writer.get(&writer_id) else { + let mut binding = self.binding.inner.lock().await; + let Some(conn_ids) = binding.conns_for_writer.get(&writer_id) else { // Writer is already absent from the registry. 
return true; }; @@ -520,22 +490,19 @@ impl ConnRegistry { return false; } - inner.binding.writers.remove(&writer_id); - inner.binding.last_meta_for_writer.remove(&writer_id); - inner - .binding - .writer_idle_since_epoch_secs - .remove(&writer_id); - inner.binding.conns_for_writer.remove(&writer_id); + binding.writers.remove(&writer_id); + binding.last_meta_for_writer.remove(&writer_id); + binding.writer_idle_since_epoch_secs.remove(&writer_id); + binding.conns_for_writer.remove(&writer_id); true } #[allow(dead_code)] pub(super) async fn non_empty_writer_ids(&self, writer_ids: &[u64]) -> HashSet { - let inner = self.inner.read().await; + let binding = self.binding.inner.lock().await; let mut out = HashSet::::with_capacity(writer_ids.len()); for writer_id in writer_ids { - if let Some(conns) = inner.binding.conns_for_writer.get(writer_id) + if let Some(conns) = binding.conns_for_writer.get(writer_id) && !conns.is_empty() { out.insert(*writer_id); From 70479c4094a262c6770996d85adacb2e00e475fe Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:25:39 +0300 Subject: [PATCH 14/17] Unexpected-only Quarantine Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/metrics.rs | 46 +++++++++++++++++++ src/stats/mod.rs | 22 +++++++++ src/transport/middle_proxy/pool_refill.rs | 2 + src/transport/middle_proxy/pool_writer.rs | 17 ++++++- .../tests/pool_writer_security_tests.rs | 43 +++++++++++++++-- 7 files changed, 125 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f79a2ea..cf52770 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2793,7 +2793,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" [[package]] name = "telemt" -version = "3.3.31" +version = "3.3.32" dependencies = [ "aes", "anyhow", diff --git a/Cargo.toml b/Cargo.toml index ebadbb5..62b3b13 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ 
[package] name = "telemt" -version = "3.3.31" +version = "3.3.32" edition = "2024" [features] diff --git a/src/metrics.rs b/src/metrics.rs index c125ef5..2c87ed6 100644 --- a/src/metrics.rs +++ b/src/metrics.rs @@ -1558,6 +1558,40 @@ async fn render_metrics(stats: &Stats, config: &ProxyConfig, ip_tracker: &UserIp 0 } ); + let _ = writeln!( + out, + "# HELP telemt_me_endpoint_quarantine_unexpected_total ME endpoint quarantines caused by unexpected writer removals" + ); + let _ = writeln!( + out, + "# TYPE telemt_me_endpoint_quarantine_unexpected_total counter" + ); + let _ = writeln!( + out, + "telemt_me_endpoint_quarantine_unexpected_total {}", + if me_allows_normal { + stats.get_me_endpoint_quarantine_unexpected_total() + } else { + 0 + } + ); + let _ = writeln!( + out, + "# HELP telemt_me_endpoint_quarantine_draining_suppressed_total Draining writer removals that skipped endpoint quarantine" + ); + let _ = writeln!( + out, + "# TYPE telemt_me_endpoint_quarantine_draining_suppressed_total counter" + ); + let _ = writeln!( + out, + "telemt_me_endpoint_quarantine_draining_suppressed_total {}", + if me_allows_normal { + stats.get_me_endpoint_quarantine_draining_suppressed_total() + } else { + 0 + } + ); let _ = writeln!( out, @@ -2622,6 +2656,9 @@ mod tests { stats.increment_me_d2c_write_mode(crate::stats::MeD2cWriteMode::Coalesced); stats.increment_me_d2c_quota_reject_total(crate::stats::MeD2cQuotaRejectStage::PostWrite); stats.observe_me_d2c_frame_buf_shrink(4096); + stats.increment_me_endpoint_quarantine_total(); + stats.increment_me_endpoint_quarantine_unexpected_total(); + stats.increment_me_endpoint_quarantine_draining_suppressed_total(); stats.increment_user_connects("alice"); stats.increment_user_curr_connects("alice"); stats.add_user_octets_from("alice", 1024); @@ -2672,6 +2709,9 @@ mod tests { assert!(output.contains("telemt_me_d2c_quota_reject_total{stage=\"post_write\"} 1")); assert!(output.contains("telemt_me_d2c_frame_buf_shrink_total 1")); 
assert!(output.contains("telemt_me_d2c_frame_buf_shrink_bytes_total 4096")); + assert!(output.contains("telemt_me_endpoint_quarantine_total 1")); + assert!(output.contains("telemt_me_endpoint_quarantine_unexpected_total 1")); + assert!(output.contains("telemt_me_endpoint_quarantine_draining_suppressed_total 1")); assert!(output.contains("telemt_user_connections_total{user=\"alice\"} 1")); assert!(output.contains("telemt_user_connections_current{user=\"alice\"} 1")); assert!(output.contains("telemt_user_octets_from_client{user=\"alice\"} 1024")); @@ -2738,6 +2778,12 @@ mod tests { assert!(output.contains("# TYPE telemt_me_d2c_write_mode_total counter")); assert!(output.contains("# TYPE telemt_me_d2c_batch_frames_bucket_total counter")); assert!(output.contains("# TYPE telemt_me_d2c_flush_duration_us_bucket_total counter")); + assert!(output.contains("# TYPE telemt_me_endpoint_quarantine_total counter")); + assert!(output.contains("# TYPE telemt_me_endpoint_quarantine_unexpected_total counter")); + assert!( + output + .contains("# TYPE telemt_me_endpoint_quarantine_draining_suppressed_total counter") + ); assert!(output.contains("# TYPE telemt_me_writer_removed_total counter")); assert!( output diff --git a/src/stats/mod.rs b/src/stats/mod.rs index 4144f82..2d1f413 100644 --- a/src/stats/mod.rs +++ b/src/stats/mod.rs @@ -128,6 +128,8 @@ pub struct Stats { me_crc_mismatch: AtomicU64, me_seq_mismatch: AtomicU64, me_endpoint_quarantine_total: AtomicU64, + me_endpoint_quarantine_unexpected_total: AtomicU64, + me_endpoint_quarantine_draining_suppressed_total: AtomicU64, me_kdf_drift_total: AtomicU64, me_kdf_port_only_drift_total: AtomicU64, me_hardswap_pending_reuse_total: AtomicU64, @@ -1251,6 +1253,18 @@ impl Stats { .fetch_add(1, Ordering::Relaxed); } } + pub fn increment_me_endpoint_quarantine_unexpected_total(&self) { + if self.telemetry_me_allows_normal() { + self.me_endpoint_quarantine_unexpected_total + .fetch_add(1, Ordering::Relaxed); + } + } + pub fn 
increment_me_endpoint_quarantine_draining_suppressed_total(&self) { + if self.telemetry_me_allows_normal() { + self.me_endpoint_quarantine_draining_suppressed_total + .fetch_add(1, Ordering::Relaxed); + } + } pub fn increment_me_kdf_drift_total(&self) { if self.telemetry_me_allows_normal() { self.me_kdf_drift_total.fetch_add(1, Ordering::Relaxed); @@ -1503,6 +1517,14 @@ impl Stats { pub fn get_me_endpoint_quarantine_total(&self) -> u64 { self.me_endpoint_quarantine_total.load(Ordering::Relaxed) } + pub fn get_me_endpoint_quarantine_unexpected_total(&self) -> u64 { + self.me_endpoint_quarantine_unexpected_total + .load(Ordering::Relaxed) + } + pub fn get_me_endpoint_quarantine_draining_suppressed_total(&self) -> u64 { + self.me_endpoint_quarantine_draining_suppressed_total + .load(Ordering::Relaxed) + } pub fn get_me_kdf_drift_total(&self) -> u64 { self.me_kdf_drift_total.load(Ordering::Relaxed) } diff --git a/src/transport/middle_proxy/pool_refill.rs b/src/transport/middle_proxy/pool_refill.rs index ddeb3e3..06e450b 100644 --- a/src/transport/middle_proxy/pool_refill.rs +++ b/src/transport/middle_proxy/pool_refill.rs @@ -33,6 +33,7 @@ impl MePool { &self, addr: SocketAddr, uptime: Duration, + reason: &'static str, ) { if uptime > Duration::from_secs(ME_FLAP_UPTIME_THRESHOLD_SECS) { return; @@ -45,6 +46,7 @@ impl MePool { self.stats.increment_me_endpoint_quarantine_total(); warn!( %addr, + reason, uptime_ms = uptime.as_millis(), quarantine_secs = ME_FLAP_QUARANTINE_SECS, "ME endpoint temporarily quarantined due to rapid writer flap" diff --git a/src/transport/middle_proxy/pool_writer.rs b/src/transport/middle_proxy/pool_writer.rs index 75f2d65..0c31ecb 100644 --- a/src/transport/middle_proxy/pool_writer.rs +++ b/src/transport/middle_proxy/pool_writer.rs @@ -609,8 +609,21 @@ impl MePool { } if let Some(addr) = removed_addr { if let Some(uptime) = removed_uptime { - // Quarantine flapping endpoints regardless of draining state. 
- self.maybe_quarantine_flapping_endpoint(addr, uptime).await; + // Quarantine contract: only unexpected removals are considered endpoint flap. + if trigger_refill { + self.stats + .increment_me_endpoint_quarantine_unexpected_total(); + self.maybe_quarantine_flapping_endpoint(addr, uptime, "unexpected") + .await; + } else { + self.stats + .increment_me_endpoint_quarantine_draining_suppressed_total(); + debug!( + %addr, + uptime_ms = uptime.as_millis(), + "Skipping endpoint quarantine for draining writer removal" + ); + } } if trigger_refill && let Some(writer_dc) = removed_dc { self.trigger_immediate_refill_for_dc(addr, writer_dc); diff --git a/src/transport/middle_proxy/tests/pool_writer_security_tests.rs b/src/transport/middle_proxy/tests/pool_writer_security_tests.rs index fc5135b..d6ab8d6 100644 --- a/src/transport/middle_proxy/tests/pool_writer_security_tests.rs +++ b/src/transport/middle_proxy/tests/pool_writer_security_tests.rs @@ -173,10 +173,15 @@ async fn bind_conn_to_writer(pool: &Arc, writer_id: u64, port: u16) -> u } #[tokio::test] -async fn remove_draining_writer_still_quarantines_flapping_endpoint() { +async fn remove_draining_writer_does_not_quarantine_flapping_endpoint() { let pool = make_pool().await; let writer_id = 77; let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 12, 0, 77)), 443); + let before_total = pool.stats.get_me_endpoint_quarantine_total(); + let before_unexpected = pool.stats.get_me_endpoint_quarantine_unexpected_total(); + let before_suppressed = pool + .stats + .get_me_endpoint_quarantine_draining_suppressed_total(); insert_writer( &pool, writer_id, @@ -200,8 +205,18 @@ async fn remove_draining_writer_still_quarantines_flapping_endpoint() { "writer must be removed from pool after cleanup" ); assert!( - pool.is_endpoint_quarantined(addr).await, - "draining removals must still quarantine flapping endpoints" + !pool.is_endpoint_quarantined(addr).await, + "draining removals must not quarantine endpoint" + ); + 
assert_eq!(pool.stats.get_me_endpoint_quarantine_total(), before_total); + assert_eq!( + pool.stats.get_me_endpoint_quarantine_unexpected_total(), + before_unexpected + ); + assert_eq!( + pool.stats + .get_me_endpoint_quarantine_draining_suppressed_total(), + before_suppressed + 1 ); assert_eq!(pool.conn_count.load(Ordering::Relaxed), 0); } @@ -257,16 +272,21 @@ async fn edge_draining_only_detach_rejects_active_writer() { } #[tokio::test] -async fn adversarial_blackhat_single_remove_establishes_single_quarantine_entry() { +async fn adversarial_blackhat_single_unexpected_remove_establishes_single_quarantine_entry() { let pool = make_pool().await; let writer_id = 93; let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 12, 0, 93)), 443); + let before_total = pool.stats.get_me_endpoint_quarantine_total(); + let before_unexpected = pool.stats.get_me_endpoint_quarantine_unexpected_total(); + let before_suppressed = pool + .stats + .get_me_endpoint_quarantine_draining_suppressed_total(); insert_writer( &pool, writer_id, 2, addr, - true, + false, Instant::now() - Duration::from_secs(1), ) .await; @@ -274,6 +294,19 @@ async fn adversarial_blackhat_single_remove_establishes_single_quarantine_entry( pool.remove_writer_and_close_clients(writer_id).await; assert!(pool.is_endpoint_quarantined(addr).await); assert_eq!(pool.endpoint_quarantine.lock().await.len(), 1); + assert_eq!( + pool.stats.get_me_endpoint_quarantine_total(), + before_total + 1 + ); + assert_eq!( + pool.stats.get_me_endpoint_quarantine_unexpected_total(), + before_unexpected + 1 + ); + assert_eq!( + pool.stats + .get_me_endpoint_quarantine_draining_suppressed_total(), + before_suppressed + ); } #[tokio::test] From 7de822dd153aa5f197601e25e60c6ae562bea603 Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Wed, 25 Mar 2026 22:51:00 +0300 Subject: [PATCH 15/17] RPC Proxy-req fixes Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- 
src/transport/middle_proxy/pool_writer.rs | 3 +++ src/transport/middle_proxy/tests/pool_writer_security_tests.rs | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transport/middle_proxy/pool_writer.rs b/src/transport/middle_proxy/pool_writer.rs index 0c31ecb..fae68b9 100644 --- a/src/transport/middle_proxy/pool_writer.rs +++ b/src/transport/middle_proxy/pool_writer.rs @@ -171,6 +171,9 @@ async fn rpc_proxy_req_signal_loop( rpc_proxy_req_every_secs: u64, ) { if rpc_proxy_req_every_secs == 0 { + // Disabled service signal loop must stay parked until writer cancellation. + // Returning immediately here would complete `select!` and tear down writer lifecycle. + cancel_signal.cancelled().await; return; } diff --git a/src/transport/middle_proxy/tests/pool_writer_security_tests.rs b/src/transport/middle_proxy/tests/pool_writer_security_tests.rs index d6ab8d6..e287624 100644 --- a/src/transport/middle_proxy/tests/pool_writer_security_tests.rs +++ b/src/transport/middle_proxy/tests/pool_writer_security_tests.rs @@ -35,7 +35,7 @@ async fn make_pool() -> Arc { NetworkDecision::default(), None, Arc::new(SecureRandom::new()), - Arc::new(Stats::default()), + Arc::new(Stats::new()), general.me_keepalive_enabled, general.me_keepalive_interval_secs, general.me_keepalive_jitter_secs, From 7a075b2ffec948379d07515576fd71e13afdfc32 Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Thu, 26 Mar 2026 02:18:39 +0300 Subject: [PATCH 16/17] Middle Relay fixes Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/api/runtime_zero.rs | 18 +++++ src/config/defaults.rs | 4 ++ src/config/hot_reload.rs | 6 +- src/config/load.rs | 2 + src/config/types.rs | 6 ++ src/maestro/admission.rs | 71 +++++++++++++------ src/proxy/middle_relay.rs | 37 ++++++---- ...le_relay_stub_completion_security_tests.rs | 1 + src/transport/middle_proxy/pool_refill.rs | 12 ++++ .../tests/pool_writer_security_tests.rs | 30 ++++++++ 10 
files changed, 153 insertions(+), 34 deletions(-) diff --git a/src/api/runtime_zero.rs b/src/api/runtime_zero.rs index a6eb163..0ed84a8 100644 --- a/src/api/runtime_zero.rs +++ b/src/api/runtime_zero.rs @@ -35,11 +35,14 @@ pub(super) struct RuntimeGatesData { pub(super) conditional_cast_enabled: bool, pub(super) me_runtime_ready: bool, pub(super) me2dc_fallback_enabled: bool, + pub(super) me2dc_fast_enabled: bool, pub(super) use_middle_proxy: bool, pub(super) route_mode: &'static str, pub(super) reroute_active: bool, #[serde(skip_serializing_if = "Option::is_none")] pub(super) reroute_to_direct_at_epoch_secs: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(super) reroute_reason: Option<&'static str>, pub(super) startup_status: &'static str, pub(super) startup_stage: String, pub(super) startup_progress_pct: f64, @@ -86,6 +89,7 @@ pub(super) struct EffectiveMiddleProxyLimits { pub(super) writer_pick_mode: &'static str, pub(super) writer_pick_sample_size: u8, pub(super) me2dc_fallback: bool, + pub(super) me2dc_fast: bool, } #[derive(Serialize)] @@ -169,6 +173,8 @@ pub(super) async fn build_runtime_gates_data( let startup_summary = build_runtime_startup_summary(shared).await; let route_state = shared.route_runtime.snapshot(); let route_mode = route_state.mode.as_str(); + let fast_fallback_enabled = + cfg.general.use_middle_proxy && cfg.general.me2dc_fallback && cfg.general.me2dc_fast; let reroute_active = cfg.general.use_middle_proxy && cfg.general.me2dc_fallback && matches!(route_state.mode, RelayRouteMode::Direct); @@ -177,6 +183,15 @@ pub(super) async fn build_runtime_gates_data( } else { None }; + let reroute_reason = if reroute_active { + if fast_fallback_enabled { + Some("fast_not_ready_fallback") + } else { + Some("strict_grace_fallback") + } + } else { + None + }; let me_runtime_ready = if !cfg.general.use_middle_proxy { true } else { @@ -194,10 +209,12 @@ pub(super) async fn build_runtime_gates_data( conditional_cast_enabled: 
cfg.general.use_middle_proxy, me_runtime_ready, me2dc_fallback_enabled: cfg.general.me2dc_fallback, + me2dc_fast_enabled: fast_fallback_enabled, use_middle_proxy: cfg.general.use_middle_proxy, route_mode, reroute_active, reroute_to_direct_at_epoch_secs, + reroute_reason, startup_status: startup_summary.status, startup_stage: startup_summary.stage, startup_progress_pct: startup_summary.progress_pct, @@ -263,6 +280,7 @@ pub(super) fn build_limits_effective_data(cfg: &ProxyConfig) -> EffectiveLimitsD writer_pick_mode: me_writer_pick_mode_label(cfg.general.me_writer_pick_mode), writer_pick_sample_size: cfg.general.me_writer_pick_sample_size, me2dc_fallback: cfg.general.me2dc_fallback, + me2dc_fast: cfg.general.me2dc_fast, }, user_ip_policy: EffectiveUserIpPolicyLimits { global_each: cfg.access.user_max_unique_ips_global_each, diff --git a/src/config/defaults.rs b/src/config/defaults.rs index b0aaf5b..608e1b8 100644 --- a/src/config/defaults.rs +++ b/src/config/defaults.rs @@ -273,6 +273,10 @@ pub(crate) fn default_me2dc_fallback() -> bool { true } +pub(crate) fn default_me2dc_fast() -> bool { + false +} + pub(crate) fn default_keepalive_interval() -> u64 { 8 } diff --git a/src/config/hot_reload.rs b/src/config/hot_reload.rs index f8064dd..9bd2927 100644 --- a/src/config/hot_reload.rs +++ b/src/config/hot_reload.rs @@ -672,9 +672,11 @@ fn warn_non_hot_changes(old: &ProxyConfig, new: &ProxyConfig, non_hot_changed: b warned = true; warn!("config reload: general.me_init_retry_attempts changed; restart required"); } - if old.general.me2dc_fallback != new.general.me2dc_fallback { + if old.general.me2dc_fallback != new.general.me2dc_fallback + || old.general.me2dc_fast != new.general.me2dc_fast + { warned = true; - warn!("config reload: general.me2dc_fallback changed; restart required"); + warn!("config reload: general.me2dc_fallback/me2dc_fast changed; restart required"); } if old.general.proxy_config_v4_cache_path != new.general.proxy_config_v4_cache_path || 
old.general.proxy_config_v6_cache_path != new.general.proxy_config_v6_cache_path diff --git a/src/config/load.rs b/src/config/load.rs index 3cb6627..7892e2c 100644 --- a/src/config/load.rs +++ b/src/config/load.rs @@ -1217,6 +1217,7 @@ mod tests { default_me_init_retry_attempts() ); assert_eq!(cfg.general.me2dc_fallback, default_me2dc_fallback()); + assert_eq!(cfg.general.me2dc_fast, default_me2dc_fast()); assert_eq!( cfg.general.proxy_config_v4_cache_path, default_proxy_config_v4_cache_path() @@ -1356,6 +1357,7 @@ mod tests { default_me_init_retry_attempts() ); assert_eq!(general.me2dc_fallback, default_me2dc_fallback()); + assert_eq!(general.me2dc_fast, default_me2dc_fast()); assert_eq!( general.proxy_config_v4_cache_path, default_proxy_config_v4_cache_path() diff --git a/src/config/types.rs b/src/config/types.rs index 3939664..cb14747 100644 --- a/src/config/types.rs +++ b/src/config/types.rs @@ -429,6 +429,11 @@ pub struct GeneralConfig { #[serde(default = "default_me2dc_fallback")] pub me2dc_fallback: bool, + /// Fast ME->Direct fallback mode for new sessions. + /// Active only when both `use_middle_proxy=true` and `me2dc_fallback=true`. + #[serde(default = "default_me2dc_fast")] + pub me2dc_fast: bool, + /// Enable ME keepalive padding frames. 
#[serde(default = "default_true")] pub me_keepalive_enabled: bool, @@ -939,6 +944,7 @@ impl Default for GeneralConfig { middle_proxy_warm_standby: default_middle_proxy_warm_standby(), me_init_retry_attempts: default_me_init_retry_attempts(), me2dc_fallback: default_me2dc_fallback(), + me2dc_fast: default_me2dc_fast(), me_keepalive_enabled: default_true(), me_keepalive_interval_secs: default_keepalive_interval(), me_keepalive_jitter_secs: default_keepalive_jitter(), diff --git a/src/maestro/admission.rs b/src/maestro/admission.rs index 69a9c9f..c781078 100644 --- a/src/maestro/admission.rs +++ b/src/maestro/admission.rs @@ -21,10 +21,24 @@ pub(crate) async fn configure_admission_gate( if config.general.use_middle_proxy { if let Some(pool) = me_pool.as_ref() { let initial_ready = pool.admission_ready_conditional_cast().await; - admission_tx.send_replace(initial_ready); - let _ = route_runtime.set_mode(RelayRouteMode::Middle); + let mut fallback_enabled = config.general.me2dc_fallback; + let mut fast_fallback_enabled = fallback_enabled && config.general.me2dc_fast; + let (initial_gate_open, initial_route_mode, initial_fallback_reason) = if initial_ready { + (true, RelayRouteMode::Middle, None) + } else if fast_fallback_enabled { + (true, RelayRouteMode::Direct, Some("fast_not_ready_fallback")) + } else { + (false, RelayRouteMode::Middle, None) + }; + admission_tx.send_replace(initial_gate_open); + let _ = route_runtime.set_mode(initial_route_mode); if initial_ready { info!("Conditional-admission gate: open / ME pool READY"); + } else if let Some(reason) = initial_fallback_reason { + warn!( + fallback_reason = reason, + "Conditional-admission gate opened in ME fast fallback mode" + ); } else { warn!("Conditional-admission gate: closed / ME pool is NOT ready)"); } @@ -34,10 +48,9 @@ pub(crate) async fn configure_admission_gate( let route_runtime_gate = route_runtime.clone(); let mut config_rx_gate = config_rx.clone(); let mut admission_poll_ms = 
config.general.me_admission_poll_ms.max(1); - let mut fallback_enabled = config.general.me2dc_fallback; tokio::spawn(async move { - let mut gate_open = initial_ready; - let mut route_mode = RelayRouteMode::Middle; + let mut gate_open = initial_gate_open; + let mut route_mode = initial_route_mode; let mut ready_observed = initial_ready; let mut not_ready_since = if initial_ready { None @@ -53,16 +66,19 @@ pub(crate) async fn configure_admission_gate( let cfg = config_rx_gate.borrow_and_update().clone(); admission_poll_ms = cfg.general.me_admission_poll_ms.max(1); fallback_enabled = cfg.general.me2dc_fallback; + fast_fallback_enabled = cfg.general.me2dc_fallback && cfg.general.me2dc_fast; continue; } _ = tokio::time::sleep(Duration::from_millis(admission_poll_ms)) => {} } let ready = pool_for_gate.admission_ready_conditional_cast().await; let now = Instant::now(); - let (next_gate_open, next_route_mode, next_fallback_active) = if ready { + let (next_gate_open, next_route_mode, next_fallback_reason) = if ready { ready_observed = true; not_ready_since = None; - (true, RelayRouteMode::Middle, false) + (true, RelayRouteMode::Middle, None) + } else if fast_fallback_enabled { + (true, RelayRouteMode::Direct, Some("fast_not_ready_fallback")) } else { let not_ready_started_at = *not_ready_since.get_or_insert(now); let not_ready_for = now.saturating_duration_since(not_ready_started_at); @@ -72,11 +88,12 @@ pub(crate) async fn configure_admission_gate( STARTUP_FALLBACK_AFTER }; if fallback_enabled && not_ready_for > fallback_after { - (true, RelayRouteMode::Direct, true) + (true, RelayRouteMode::Direct, Some("strict_grace_fallback")) } else { - (false, RelayRouteMode::Middle, false) + (false, RelayRouteMode::Middle, None) } }; + let next_fallback_active = next_fallback_reason.is_some(); if next_route_mode != route_mode { route_mode = next_route_mode; @@ -88,17 +105,28 @@ pub(crate) async fn configure_admission_gate( "Middle-End routing restored for new sessions" ); } else { - 
let fallback_after = if ready_observed { - RUNTIME_FALLBACK_AFTER + let fallback_reason = next_fallback_reason.unwrap_or("unknown"); + if fallback_reason == "strict_grace_fallback" { + let fallback_after = if ready_observed { + RUNTIME_FALLBACK_AFTER + } else { + STARTUP_FALLBACK_AFTER + }; + warn!( + target_mode = route_mode.as_str(), + cutover_generation = snapshot.generation, + grace_secs = fallback_after.as_secs(), + fallback_reason, + "ME pool stayed not-ready beyond grace; routing new sessions via Direct-DC" + ); } else { - STARTUP_FALLBACK_AFTER - }; - warn!( - target_mode = route_mode.as_str(), - cutover_generation = snapshot.generation, - grace_secs = fallback_after.as_secs(), - "ME pool stayed not-ready beyond grace; routing new sessions via Direct-DC" - ); + warn!( + target_mode = route_mode.as_str(), + cutover_generation = snapshot.generation, + fallback_reason, + "ME pool not-ready; routing new sessions via Direct-DC (fast mode)" + ); + } } } } @@ -108,7 +136,10 @@ pub(crate) async fn configure_admission_gate( admission_tx_gate.send_replace(gate_open); if gate_open { if next_fallback_active { - warn!("Conditional-admission gate opened in ME fallback mode"); + warn!( + fallback_reason = next_fallback_reason.unwrap_or("unknown"), + "Conditional-admission gate opened in ME fallback mode" + ); } else { info!("Conditional-admission gate opened / ME pool READY"); } diff --git a/src/proxy/middle_relay.rs b/src/proxy/middle_relay.rs index e8fc52a..d2f37a6 100644 --- a/src/proxy/middle_relay.rs +++ b/src/proxy/middle_relay.rs @@ -45,10 +45,6 @@ const RELAY_IDLE_IO_POLL_MAX: Duration = Duration::from_secs(1); const TINY_FRAME_DEBT_PER_TINY: u32 = 8; const TINY_FRAME_DEBT_LIMIT: u32 = 512; #[cfg(test)] -const C2ME_SEND_TIMEOUT: Duration = Duration::from_millis(50); -#[cfg(not(test))] -const C2ME_SEND_TIMEOUT: Duration = Duration::from_secs(5); -#[cfg(test)] const RELAY_TEST_STEP_TIMEOUT: Duration = Duration::from_secs(1); const ME_D2C_FLUSH_BATCH_MAX_FRAMES_MIN: 
usize = 1; const ME_D2C_FLUSH_BATCH_MAX_BYTES_MIN: usize = 4096; @@ -648,6 +644,7 @@ pub(crate) fn relay_idle_pressure_test_scope() -> std::sync::MutexGuard<'static, async fn enqueue_c2me_command( tx: &mpsc::Sender, cmd: C2MeCommand, + send_timeout: Option, ) -> std::result::Result<(), mpsc::error::SendError> { match tx.try_send(cmd) { Ok(()) => Ok(()), @@ -658,12 +655,18 @@ async fn enqueue_c2me_command( if tx.capacity() <= C2ME_SOFT_PRESSURE_MIN_FREE_SLOTS { tokio::task::yield_now().await; } - match timeout(C2ME_SEND_TIMEOUT, tx.reserve()).await { - Ok(Ok(permit)) => { + let reserve_result = match send_timeout { + Some(send_timeout) => match timeout(send_timeout, tx.reserve()).await { + Ok(result) => result, + Err(_) => return Err(mpsc::error::SendError(cmd)), + }, + None => tx.reserve().await, + }; + match reserve_result { + Ok(permit) => { permit.send(cmd); Ok(()) } - Ok(Err(_)) => Err(mpsc::error::SendError(cmd)), Err(_) => Err(mpsc::error::SendError(cmd)), } } @@ -789,6 +792,10 @@ where .general .me_c2me_channel_capacity .max(C2ME_CHANNEL_CAPACITY_FALLBACK); + let c2me_send_timeout = match config.general.me_c2me_send_timeout_ms { + 0 => None, + timeout_ms => Some(Duration::from_millis(timeout_ms)), + }; let (c2me_tx, mut c2me_rx) = mpsc::channel::(c2me_channel_capacity); let me_pool_c2me = me_pool.clone(); let c2me_sender = tokio::spawn(async move { @@ -1165,7 +1172,7 @@ where user = %user, "Middle-relay pressure eviction for idle-candidate session" ); - let _ = enqueue_c2me_command(&c2me_tx, C2MeCommand::Close).await; + let _ = enqueue_c2me_command(&c2me_tx, C2MeCommand::Close, c2me_send_timeout).await; main_result = Err(ProxyError::Proxy( "middle-relay session evicted under pressure (idle-candidate)".to_string(), )); @@ -1184,7 +1191,7 @@ where "Cutover affected middle session, closing client connection" ); tokio::time::sleep(delay).await; - let _ = enqueue_c2me_command(&c2me_tx, C2MeCommand::Close).await; + let _ = enqueue_c2me_command(&c2me_tx, 
C2MeCommand::Close, c2me_send_timeout).await; main_result = Err(ProxyError::Proxy(ROUTE_SWITCH_ERROR_MSG.to_string())); break; } @@ -1242,8 +1249,12 @@ where flags |= RPC_FLAG_NOT_ENCRYPTED; } // Keep client read loop lightweight: route heavy ME send path via a dedicated task. - if enqueue_c2me_command(&c2me_tx, C2MeCommand::Data { payload, flags }) - .await + if enqueue_c2me_command( + &c2me_tx, + C2MeCommand::Data { payload, flags }, + c2me_send_timeout, + ) + .await .is_err() { main_result = Err(ProxyError::Proxy("ME sender channel closed".into())); @@ -1253,7 +1264,9 @@ where Ok(None) => { debug!(conn_id, "Client EOF"); client_closed = true; - let _ = enqueue_c2me_command(&c2me_tx, C2MeCommand::Close).await; + let _ = + enqueue_c2me_command(&c2me_tx, C2MeCommand::Close, c2me_send_timeout) + .await; break; } Err(e) => { diff --git a/src/proxy/tests/middle_relay_stub_completion_security_tests.rs b/src/proxy/tests/middle_relay_stub_completion_security_tests.rs index 2635a28..fbb9081 100644 --- a/src/proxy/tests/middle_relay_stub_completion_security_tests.rs +++ b/src/proxy/tests/middle_relay_stub_completion_security_tests.rs @@ -126,6 +126,7 @@ async fn c2me_channel_full_path_yields_then_sends() { payload: make_pooled_payload(&[0xBB, 0xCC]), flags: 2, }, + None, ) .await }); diff --git a/src/transport/middle_proxy/pool_refill.rs b/src/transport/middle_proxy/pool_refill.rs index 06e450b..69d8aa0 100644 --- a/src/transport/middle_proxy/pool_refill.rs +++ b/src/transport/middle_proxy/pool_refill.rs @@ -13,6 +13,7 @@ use super::pool::{MePool, RefillDcKey, RefillEndpointKey, WriterContour}; const ME_FLAP_UPTIME_THRESHOLD_SECS: u64 = 20; const ME_FLAP_QUARANTINE_SECS: u64 = 25; +const ME_FLAP_MIN_UPTIME_MILLIS: u64 = 500; const ME_REFILL_TOTAL_ATTEMPT_CAP: u32 = 20; impl MePool { @@ -35,6 +36,17 @@ impl MePool { uptime: Duration, reason: &'static str, ) { + if uptime < Duration::from_millis(ME_FLAP_MIN_UPTIME_MILLIS) { + debug!( + %addr, + reason, + uptime_ms = 
uptime.as_millis(), + min_uptime_ms = ME_FLAP_MIN_UPTIME_MILLIS, + "Skipping flap quarantine for ultra-short writer lifetime" + ); + return; + } + if uptime > Duration::from_secs(ME_FLAP_UPTIME_THRESHOLD_SECS) { return; } diff --git a/src/transport/middle_proxy/tests/pool_writer_security_tests.rs b/src/transport/middle_proxy/tests/pool_writer_security_tests.rs index e287624..0184e11 100644 --- a/src/transport/middle_proxy/tests/pool_writer_security_tests.rs +++ b/src/transport/middle_proxy/tests/pool_writer_security_tests.rs @@ -309,6 +309,36 @@ async fn adversarial_blackhat_single_unexpected_remove_establishes_single_quaran ); } +#[tokio::test] +async fn remove_ultra_short_uptime_writer_skips_flap_quarantine() { + let pool = make_pool().await; + let writer_id = 931; + let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 12, 0, 131)), 443); + let before_total = pool.stats.get_me_endpoint_quarantine_total(); + let before_unexpected = pool.stats.get_me_endpoint_quarantine_unexpected_total(); + insert_writer( + &pool, + writer_id, + 2, + addr, + false, + Instant::now() - Duration::from_millis(50), + ) + .await; + + pool.remove_writer_and_close_clients(writer_id).await; + + assert!( + !pool.is_endpoint_quarantined(addr).await, + "ultra-short unexpected lifetime must not quarantine endpoint" + ); + assert_eq!(pool.stats.get_me_endpoint_quarantine_total(), before_total); + assert_eq!( + pool.stats.get_me_endpoint_quarantine_unexpected_total(), + before_unexpected + 1 + ); +} + #[tokio::test] async fn integration_old_uptime_writer_does_not_trigger_flap_quarantine() { let pool = make_pool().await; From cb5753f77c9d7cb12fc68dc069ca70dad111e596 Mon Sep 17 00:00:00 2001 From: Alexey <247128645+axkurcom@users.noreply.github.com> Date: Thu, 26 Mar 2026 02:19:35 +0300 Subject: [PATCH 17/17] Update admission.rs Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com> --- src/maestro/admission.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 
deletions(-) diff --git a/src/maestro/admission.rs b/src/maestro/admission.rs index c781078..82484ad 100644 --- a/src/maestro/admission.rs +++ b/src/maestro/admission.rs @@ -23,10 +23,15 @@ pub(crate) async fn configure_admission_gate( let initial_ready = pool.admission_ready_conditional_cast().await; let mut fallback_enabled = config.general.me2dc_fallback; let mut fast_fallback_enabled = fallback_enabled && config.general.me2dc_fast; - let (initial_gate_open, initial_route_mode, initial_fallback_reason) = if initial_ready { + let (initial_gate_open, initial_route_mode, initial_fallback_reason) = if initial_ready + { (true, RelayRouteMode::Middle, None) } else if fast_fallback_enabled { - (true, RelayRouteMode::Direct, Some("fast_not_ready_fallback")) + ( + true, + RelayRouteMode::Direct, + Some("fast_not_ready_fallback"), + ) } else { (false, RelayRouteMode::Middle, None) }; @@ -78,7 +83,11 @@ pub(crate) async fn configure_admission_gate( not_ready_since = None; (true, RelayRouteMode::Middle, None) } else if fast_fallback_enabled { - (true, RelayRouteMode::Direct, Some("fast_not_ready_fallback")) + ( + true, + RelayRouteMode::Direct, + Some("fast_not_ready_fallback"), + ) } else { let not_ready_started_at = *not_ready_since.get_or_insert(now); let not_ready_for = now.saturating_duration_since(not_ready_started_at);