ME Writer Draining-state fixes

This commit is contained in:
Alexey 2026-03-17 13:58:01 +03:00
parent d78360982c
commit 2e8be87ccf
No known key found for this signature in database
5 changed files with 309 additions and 52 deletions

View File

@ -115,59 +115,109 @@ async fn reap_draining_writers(
pool: &Arc<MePool>,
warn_next_allowed: &mut HashMap<u64, Instant>,
) {
if pool.draining_active_runtime() == 0 {
return;
}
let now_epoch_secs = MePool::now_epoch_secs();
let now = Instant::now();
let drain_ttl_secs = pool.me_pool_drain_ttl_secs.load(std::sync::atomic::Ordering::Relaxed);
let drain_threshold = pool
.me_pool_drain_threshold
.load(std::sync::atomic::Ordering::Relaxed);
let writers = pool.writers.read().await.clone();
let mut draining_writers = Vec::new();
for writer in writers {
if !writer.draining.load(std::sync::atomic::Ordering::Relaxed) {
continue;
let mut draining_writers = {
let writers = pool.writers.read().await;
let mut draining_writers = Vec::<DrainingWriterSnapshot>::new();
for writer in writers.iter() {
if !writer.draining.load(std::sync::atomic::Ordering::Relaxed) {
continue;
}
draining_writers.push(DrainingWriterSnapshot {
id: writer.id,
writer_dc: writer.writer_dc,
addr: writer.addr,
generation: writer.generation,
created_at: writer.created_at,
draining_started_at_epoch_secs: writer
.draining_started_at_epoch_secs
.load(std::sync::atomic::Ordering::Relaxed),
drain_deadline_epoch_secs: writer
.drain_deadline_epoch_secs
.load(std::sync::atomic::Ordering::Relaxed),
allow_drain_fallback: writer
.allow_drain_fallback
.load(std::sync::atomic::Ordering::Relaxed),
});
}
let is_empty = pool.registry.is_writer_empty(writer.id).await;
if is_empty {
pool.remove_writer_and_close_clients(writer.id).await;
continue;
}
draining_writers.push(writer);
draining_writers
};
if draining_writers.is_empty() {
return;
}
if drain_threshold > 0 && draining_writers.len() > drain_threshold as usize {
draining_writers.sort_by(|left, right| {
let left_started = left
.draining_started_at_epoch_secs
.load(std::sync::atomic::Ordering::Relaxed);
let right_started = right
.draining_started_at_epoch_secs
.load(std::sync::atomic::Ordering::Relaxed);
left_started
.cmp(&right_started)
.then_with(|| left.created_at.cmp(&right.created_at))
.then_with(|| left.id.cmp(&right.id))
});
let overflow = draining_writers.len().saturating_sub(drain_threshold as usize);
warn!(
draining_writers = draining_writers.len(),
me_pool_drain_threshold = drain_threshold,
removing_writers = overflow,
"ME draining writer threshold exceeded, force-closing oldest draining writers"
);
for writer in draining_writers.drain(..overflow) {
pool.stats.increment_pool_force_close_total();
let draining_ids: Vec<u64> = draining_writers.iter().map(|writer| writer.id).collect();
let non_empty_writer_ids = pool.registry.non_empty_writer_ids(&draining_ids).await;
let mut non_empty_draining_writers =
Vec::<DrainingWriterSnapshot>::with_capacity(draining_writers.len());
for writer in draining_writers.drain(..) {
if non_empty_writer_ids.contains(&writer.id) {
non_empty_draining_writers.push(writer);
} else {
pool.remove_writer_and_close_clients(writer.id).await;
}
}
draining_writers = non_empty_draining_writers;
if draining_writers.is_empty() {
return;
}
let overflow = if drain_threshold > 0 && draining_writers.len() > drain_threshold as usize {
draining_writers.len().saturating_sub(drain_threshold as usize)
} else {
0
};
let has_deadline_expired = draining_writers.iter().any(|writer| {
writer.drain_deadline_epoch_secs != 0 && now_epoch_secs >= writer.drain_deadline_epoch_secs
});
let can_drop_with_replacement = if overflow > 0 || has_deadline_expired {
pool.has_non_draining_writer_per_desired_dc_group().await
} else {
false
};
if overflow > 0 {
if can_drop_with_replacement {
draining_writers.sort_by(|left, right| {
left.draining_started_at_epoch_secs
.cmp(&right.draining_started_at_epoch_secs)
.then_with(|| left.created_at.cmp(&right.created_at))
.then_with(|| left.id.cmp(&right.id))
});
warn!(
draining_writers = draining_writers.len(),
me_pool_drain_threshold = drain_threshold,
removing_writers = overflow,
"ME draining writer threshold exceeded, force-closing oldest draining writers"
);
for writer in draining_writers.drain(..overflow) {
pool.stats.increment_pool_force_close_total();
pool.remove_writer_and_close_clients(writer.id).await;
}
} else {
warn!(
draining_writers = draining_writers.len(),
me_pool_drain_threshold = drain_threshold,
overflow,
"ME draining threshold exceeded, but replacement coverage is incomplete; keeping draining writers"
);
}
}
for writer in draining_writers {
let drain_started_at_epoch_secs = writer
.draining_started_at_epoch_secs
.load(std::sync::atomic::Ordering::Relaxed);
if drain_ttl_secs > 0
&& drain_started_at_epoch_secs != 0
&& now_epoch_secs.saturating_sub(drain_started_at_epoch_secs) > drain_ttl_secs
&& writer.draining_started_at_epoch_secs != 0
&& now_epoch_secs.saturating_sub(writer.draining_started_at_epoch_secs) > drain_ttl_secs
&& should_emit_writer_warn(
warn_next_allowed,
writer.id,
@ -182,21 +232,45 @@ async fn reap_draining_writers(
generation = writer.generation,
drain_ttl_secs,
force_close_secs = pool.me_pool_force_close_secs.load(std::sync::atomic::Ordering::Relaxed),
allow_drain_fallback = writer.allow_drain_fallback.load(std::sync::atomic::Ordering::Relaxed),
allow_drain_fallback = writer.allow_drain_fallback,
"ME draining writer remains non-empty past drain TTL"
);
}
let deadline_epoch_secs = writer
.drain_deadline_epoch_secs
.load(std::sync::atomic::Ordering::Relaxed);
if deadline_epoch_secs != 0 && now_epoch_secs >= deadline_epoch_secs {
warn!(writer_id = writer.id, "Drain timeout, force-closing");
pool.stats.increment_pool_force_close_total();
pool.remove_writer_and_close_clients(writer.id).await;
if writer.drain_deadline_epoch_secs != 0 && now_epoch_secs >= writer.drain_deadline_epoch_secs
{
if can_drop_with_replacement {
warn!(writer_id = writer.id, "Drain timeout, force-closing");
pool.stats.increment_pool_force_close_total();
pool.remove_writer_and_close_clients(writer.id).await;
} else if should_emit_writer_warn(
warn_next_allowed,
writer.id,
now,
pool.warn_rate_limit_duration(),
) {
warn!(
writer_id = writer.id,
writer_dc = writer.writer_dc,
endpoint = %writer.addr,
"Drain timeout reached, but replacement coverage is incomplete; keeping draining writer"
);
}
}
}
}
/// Point-in-time copy of a draining `MeWriter`'s reap-relevant fields.
///
/// Built while holding the `pool.writers` read lock (atomics loaded once
/// with `Relaxed` ordering), so the reaper can sort, filter, and log
/// without re-locking or re-loading per field.
#[derive(Debug, Clone)]
struct DrainingWriterSnapshot {
    // Unique writer id; used for registry lookups and force-close calls.
    id: u64,
    // Datacenter this writer serves; used in warn logs.
    writer_dc: i32,
    // Remote endpoint address; used in warn logs.
    addr: SocketAddr,
    // Pool generation the writer was created under.
    generation: u64,
    // Creation time; tie-breaker when sorting draining writers by age.
    created_at: Instant,
    // Epoch seconds when draining began; 0 means "not recorded".
    draining_started_at_epoch_secs: u64,
    // Epoch seconds of the hard drain deadline; 0 means "no deadline".
    drain_deadline_epoch_secs: u64,
    // Whether this writer may still serve as a drain fallback.
    allow_drain_fallback: bool,
}
fn should_emit_writer_warn(
next_allowed: &mut HashMap<u64, Instant>,
writer_id: u64,
@ -1330,6 +1404,15 @@ mod tests {
me_pool_drain_threshold,
..GeneralConfig::default()
};
let mut proxy_map_v4 = HashMap::new();
proxy_map_v4.insert(
2,
vec![(IpAddr::V4(Ipv4Addr::new(203, 0, 113, 10)), 443)],
);
let decision = NetworkDecision {
ipv4_me: true,
..NetworkDecision::default()
};
MePool::new(
None,
vec![1u8; 32],
@ -1341,10 +1424,10 @@ mod tests {
None,
12,
1200,
HashMap::new(),
proxy_map_v4,
HashMap::new(),
None,
NetworkDecision::default(),
decision,
None,
Arc::new(SecureRandom::new()),
Arc::new(Stats::default()),
@ -1438,6 +1521,7 @@ mod tests {
pool.writers.write().await.push(writer);
pool.registry.register_writer(writer_id, tx).await;
pool.conn_count.fetch_add(1, Ordering::Relaxed);
pool.increment_draining_active_runtime();
assert!(
pool.registry
.bind_writer(
@ -1455,8 +1539,56 @@ mod tests {
conn_id
}
/// Test helper: registers a non-draining ("live") writer in the pool so
/// coverage checks such as `has_non_draining_writer_per_desired_dc_group`
/// can find an active writer for `writer_dc`.
///
/// The address uses the 203.0.113.0/24 documentation range with a port and
/// last octet derived from `writer_id` so each writer gets a distinct,
/// deterministic endpoint.
async fn insert_live_writer(pool: &Arc<MePool>, writer_id: u64, writer_dc: i32) {
    let (tx, _writer_rx) = mpsc::channel::<WriterCommand>(8);
    let writer = MeWriter {
        id: writer_id,
        addr: SocketAddr::new(
            IpAddr::V4(Ipv4Addr::new(203, 0, 113, (writer_id as u8).saturating_add(1))),
            4000 + writer_id as u16,
        ),
        source_ip: IpAddr::V4(Ipv4Addr::LOCALHOST),
        writer_dc,
        generation: 2,
        contour: Arc::new(AtomicU8::new(WriterContour::Active.as_u8())),
        created_at: Instant::now(),
        tx: tx.clone(),
        cancel: CancellationToken::new(),
        degraded: Arc::new(AtomicBool::new(false)),
        rtt_ema_ms_x10: Arc::new(AtomicU32::new(0)),
        // `draining: false` is the point of this helper — the writer counts
        // as replacement coverage for its DC.
        draining: Arc::new(AtomicBool::new(false)),
        draining_started_at_epoch_secs: Arc::new(AtomicU64::new(0)),
        drain_deadline_epoch_secs: Arc::new(AtomicU64::new(0)),
        allow_drain_fallback: Arc::new(AtomicBool::new(false)),
    };
    // Registration order mirrors production: writer list, then registry,
    // then the connection counter.
    pool.writers.write().await.push(writer);
    pool.registry.register_writer(writer_id, tx).await;
    pool.conn_count.fetch_add(1, Ordering::Relaxed);
}
#[tokio::test]
async fn reap_draining_writers_force_closes_oldest_over_threshold() {
    // Pool with a drain threshold of 2 (presumably make_pool's argument is
    // me_pool_drain_threshold — TODO confirm against make_pool).
    let pool = make_pool(2).await;
    // A live writer in DC 2 provides replacement coverage, which is required
    // before the reaper is allowed to force-close over-threshold writers.
    insert_live_writer(&pool, 1, 2).await;
    assert!(pool.has_non_draining_writer_per_desired_dc_group().await);
    let now_epoch_secs = MePool::now_epoch_secs();
    // Three draining writers; id 10 started draining earliest (30s ago) and
    // is therefore the one the threshold overflow should force-close.
    let conn_a = insert_draining_writer(&pool, 10, now_epoch_secs.saturating_sub(30)).await;
    let conn_b = insert_draining_writer(&pool, 20, now_epoch_secs.saturating_sub(20)).await;
    let conn_c = insert_draining_writer(&pool, 30, now_epoch_secs.saturating_sub(10)).await;
    let mut warn_next_allowed = HashMap::new();
    reap_draining_writers(&pool, &mut warn_next_allowed).await;
    // Only the oldest draining writer (10) is removed; the live writer (1)
    // and the two newer draining writers (20, 30) remain.
    let mut writer_ids: Vec<u64> = pool.writers.read().await.iter().map(|writer| writer.id).collect();
    writer_ids.sort_unstable();
    assert_eq!(writer_ids, vec![1, 20, 30]);
    // Writer 10's client binding was torn down along with the writer.
    assert!(pool.registry.get_writer(conn_a).await.is_none());
    assert_eq!(pool.registry.get_writer(conn_b).await.unwrap().writer_id, 20);
    assert_eq!(pool.registry.get_writer(conn_c).await.unwrap().writer_id, 30);
}
#[tokio::test]
async fn reap_draining_writers_does_not_force_close_overflow_without_replacement() {
let pool = make_pool(2).await;
let now_epoch_secs = MePool::now_epoch_secs();
let conn_a = insert_draining_writer(&pool, 10, now_epoch_secs.saturating_sub(30)).await;
@ -1466,9 +1598,10 @@ mod tests {
reap_draining_writers(&pool, &mut warn_next_allowed).await;
let writer_ids: Vec<u64> = pool.writers.read().await.iter().map(|writer| writer.id).collect();
assert_eq!(writer_ids, vec![20, 30]);
assert!(pool.registry.get_writer(conn_a).await.is_none());
let mut writer_ids: Vec<u64> = pool.writers.read().await.iter().map(|writer| writer.id).collect();
writer_ids.sort_unstable();
assert_eq!(writer_ids, vec![10, 20, 30]);
assert_eq!(pool.registry.get_writer(conn_a).await.unwrap().writer_id, 10);
assert_eq!(pool.registry.get_writer(conn_b).await.unwrap().writer_id, 20);
assert_eq!(pool.registry.get_writer(conn_c).await.unwrap().writer_id, 30);
}

View File

@ -160,6 +160,7 @@ pub struct MePool {
pub(super) refill_inflight: Arc<Mutex<HashSet<RefillEndpointKey>>>,
pub(super) refill_inflight_dc: Arc<Mutex<HashSet<RefillDcKey>>>,
pub(super) conn_count: AtomicUsize,
pub(super) draining_active_runtime: AtomicU64,
pub(super) stats: Arc<crate::stats::Stats>,
pub(super) generation: AtomicU64,
pub(super) active_generation: AtomicU64,
@ -438,6 +439,7 @@ impl MePool {
refill_inflight: Arc::new(Mutex::new(HashSet::new())),
refill_inflight_dc: Arc::new(Mutex::new(HashSet::new())),
conn_count: AtomicUsize::new(0),
draining_active_runtime: AtomicU64::new(0),
generation: AtomicU64::new(1),
active_generation: AtomicU64::new(1),
warm_generation: AtomicU64::new(0),
@ -690,6 +692,32 @@ impl MePool {
}
}
/// Current count of writers in the draining state.
///
/// Kept as a plain atomic so callers (e.g. the reaper's early-return gate)
/// can check "is anything draining?" without taking the `writers` lock.
pub(super) fn draining_active_runtime(&self) -> u64 {
    self.draining_active_runtime.load(Ordering::Relaxed)
}
/// Bumps the draining-writer counter; called when a writer first enters
/// the draining state. Paired with `decrement_draining_active_runtime`
/// when a draining writer is removed from the pool.
pub(super) fn increment_draining_active_runtime(&self) {
    self.draining_active_runtime.fetch_add(1, Ordering::Relaxed);
}
/// Decrements the draining-writer counter, saturating at zero: a call
/// against an already-zero counter is a no-op rather than a wraparound.
///
/// Uses `fetch_update`, which retries the compare-exchange internally;
/// `checked_sub(1)` returns `None` at zero, aborting the update and giving
/// the saturating behavior of the previous hand-rolled CAS loop.
pub(super) fn decrement_draining_active_runtime(&self) {
    // Err(current) simply means the counter was already 0 — ignore it.
    let _ = self.draining_active_runtime.fetch_update(
        Ordering::Relaxed,
        Ordering::Relaxed,
        |current| current.checked_sub(1),
    );
}
/// Returns the `key_selector` of the current proxy secret.
///
/// Acquires the `proxy_secret` read lock for the duration of the copy.
pub(super) async fn key_selector(&self) -> u32 {
    self.proxy_secret.read().await.key_selector
}

View File

@ -141,6 +141,38 @@ impl MePool {
out
}
/// Reports whether every desired DC group (a DC with at least one desired
/// endpoint) is covered by at least one non-draining writer.
///
/// Returns `true` trivially when no DC has desired endpoints. Used as the
/// "replacement coverage" gate before force-closing draining writers.
pub(super) async fn has_non_draining_writer_per_desired_dc_group(&self) -> bool {
    let desired_by_dc = self.desired_dc_endpoints().await;
    // A DC only needs coverage if it has at least one desired endpoint.
    let required_dcs: HashSet<i32> = desired_by_dc
        .iter()
        .filter(|(_, endpoints)| !endpoints.is_empty())
        .map(|(dc, _)| *dc)
        .collect();
    if required_dcs.is_empty() {
        return true;
    }
    let ws = self.writers.read().await;
    // Collect the DCs that still have an active (non-draining) writer.
    let covered_dcs: HashSet<i32> = ws
        .iter()
        .filter(|writer| !writer.draining.load(Ordering::Relaxed))
        .map(|writer| writer.writer_dc)
        .filter(|dc| required_dcs.contains(dc))
        .collect();
    covered_dcs.len() == required_dcs.len()
}
fn hardswap_warmup_connect_delay_ms(&self) -> u64 {
let min_ms = self.me_hardswap_warmup_delay_min_ms.load(Ordering::Relaxed);
let max_ms = self.me_hardswap_warmup_delay_max_ms.load(Ordering::Relaxed);
@ -475,12 +507,30 @@ impl MePool {
coverage_ratio = format_args!("{coverage_ratio:.3}"),
min_ratio = format_args!("{min_ratio:.3}"),
drain_timeout_secs,
"ME reinit cycle covered; draining stale writers"
"ME reinit cycle covered; processing stale writers"
);
self.stats.increment_pool_swap_total();
let can_drop_with_replacement = self
.has_non_draining_writer_per_desired_dc_group()
.await;
if can_drop_with_replacement {
info!(
stale_writers = stale_writer_ids.len(),
"ME reinit stale writers: replacement coverage ready, force-closing clients for fast rebind"
);
} else {
warn!(
stale_writers = stale_writer_ids.len(),
"ME reinit stale writers: replacement coverage incomplete, keeping draining fallback"
);
}
for writer_id in stale_writer_ids {
self.mark_writer_draining_with_timeout(writer_id, drain_timeout, !hardswap)
.await;
if can_drop_with_replacement {
self.stats.increment_pool_force_close_total();
self.remove_writer_and_close_clients(writer_id).await;
}
}
if hardswap {
self.clear_pending_hardswap_state();

View File

@ -514,6 +514,7 @@ impl MePool {
let was_draining = w.draining.load(Ordering::Relaxed);
if was_draining {
self.stats.decrement_pool_drain_active();
self.decrement_draining_active_runtime();
}
self.stats.increment_me_writer_removed_total();
w.cancel.cancel();
@ -572,6 +573,7 @@ impl MePool {
.store(drain_deadline_epoch_secs, Ordering::Relaxed);
if !already_draining {
self.stats.increment_pool_drain_active();
self.increment_draining_active_runtime();
}
w.contour
.store(WriterContour::Draining.as_u8(), Ordering::Relaxed);

View File

@ -436,6 +436,19 @@ impl ConnRegistry {
.map(|s| s.is_empty())
.unwrap_or(true)
}
pub(super) async fn non_empty_writer_ids(&self, writer_ids: &[u64]) -> HashSet<u64> {
let inner = self.inner.read().await;
let mut out = HashSet::<u64>::with_capacity(writer_ids.len());
for writer_id in writer_ids {
if let Some(conns) = inner.conns_for_writer.get(writer_id)
&& !conns.is_empty()
{
out.insert(*writer_id);
}
}
out
}
}
#[cfg(test)]
@ -634,4 +647,35 @@ mod tests {
);
assert!(registry.get_writer(conn_id).await.is_none());
}
#[tokio::test]
async fn non_empty_writer_ids_returns_only_writers_with_bound_clients() {
    let registry = ConnRegistry::new();
    let (conn_id, _rx) = registry.register().await;
    let (writer_tx_a, _writer_rx_a) = tokio::sync::mpsc::channel(8);
    let (writer_tx_b, _writer_rx_b) = tokio::sync::mpsc::channel(8);
    // Two registered writers, but only writer 10 gets a client bound below.
    registry.register_writer(10, writer_tx_a).await;
    registry.register_writer(20, writer_tx_b).await;
    let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::LOCALHOST), 443);
    assert!(
        registry
            .bind_writer(
                conn_id,
                10,
                ConnMeta {
                    target_dc: 2,
                    client_addr: addr,
                    our_addr: addr,
                    proto_flags: 0,
                },
            )
            .await
    );
    // 10 has a bound client; 20 is registered but empty; 30 is unknown.
    let non_empty = registry.non_empty_writer_ids(&[10, 20, 30]).await;
    assert!(non_empty.contains(&10));
    assert!(!non_empty.contains(&20));
    assert!(!non_empty.contains(&30));
}
}