Teardown Monitoring in API

Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com>
This commit is contained in:
Alexey
2026-03-20 12:45:53 +03:00
parent b55b264345
commit ef9b7b1492
9 changed files with 496 additions and 35 deletions

View File

@@ -10,6 +10,7 @@ use tracing::{debug, info, warn};
use crate::config::MeFloorMode;
use crate::crypto::SecureRandom;
use crate::network::IpFamily;
use crate::stats::MeWriterTeardownReason;
use super::MePool;
use super::pool::MeWriter;
@@ -358,7 +359,8 @@ pub(super) async fn reap_draining_writers(
continue;
}
pool.stats.increment_pool_force_close_total();
pool.remove_writer_and_close_clients(writer_id).await;
pool.remove_writer_and_close_clients(writer_id, MeWriterTeardownReason::ReapTimeoutExpired)
.await;
pool.stats
.increment_me_draining_writers_reap_progress_total();
}
@@ -376,7 +378,8 @@ pub(super) async fn reap_draining_writers(
continue;
}
pool.stats.increment_pool_force_close_total();
pool.remove_writer_and_close_clients(writer_id).await;
pool.remove_writer_and_close_clients(writer_id, MeWriterTeardownReason::ReapThresholdForce)
.await;
pool.stats
.increment_me_draining_writers_reap_progress_total();
closed_total = closed_total.saturating_add(1);
@@ -388,7 +391,8 @@ pub(super) async fn reap_draining_writers(
if !closed_writer_ids.insert(writer_id) {
continue;
}
pool.remove_writer_and_close_clients(writer_id).await;
pool.remove_writer_and_close_clients(writer_id, MeWriterTeardownReason::ReapEmpty)
.await;
pool.stats
.increment_me_draining_writers_reap_progress_total();
closed_total = closed_total.saturating_add(1);
@@ -1646,11 +1650,14 @@ pub async fn me_zombie_writer_watchdog(pool: Arc<MePool>) {
for (writer_id, had_clients) in &zombie_ids_with_meta {
let result = tokio::time::timeout(
Duration::from_secs(REMOVE_TIMEOUT_SECS),
pool.remove_writer_and_close_clients(*writer_id),
pool.remove_writer_and_close_clients(
*writer_id,
MeWriterTeardownReason::WatchdogStuckDraining,
),
)
.await;
match result {
Ok(()) => {
Ok(true) => {
removal_timeout_streak.remove(writer_id);
pool.stats.increment_pool_force_close_total();
pool.stats
@@ -1661,7 +1668,16 @@ pub async fn me_zombie_writer_watchdog(pool: Arc<MePool>) {
"Zombie writer removed by watchdog"
);
}
Ok(false) => {
removal_timeout_streak.remove(writer_id);
debug!(
writer_id,
had_clients,
"Zombie writer watchdog removal became no-op"
);
}
Err(_) => {
pool.stats.increment_me_writer_teardown_timeout_total();
let streak = removal_timeout_streak
.entry(*writer_id)
.and_modify(|value| *value = value.saturating_add(1))
@@ -1675,10 +1691,14 @@ pub async fn me_zombie_writer_watchdog(pool: Arc<MePool>) {
if *streak < HARD_DETACH_TIMEOUT_STREAK {
continue;
}
pool.stats.increment_me_writer_teardown_escalation_total();
let hard_detach = tokio::time::timeout(
Duration::from_secs(REMOVE_TIMEOUT_SECS),
pool.remove_draining_writer_hard_detach(*writer_id),
pool.remove_draining_writer_hard_detach(
*writer_id,
MeWriterTeardownReason::WatchdogStuckDraining,
),
)
.await;
match hard_detach {
@@ -1702,6 +1722,7 @@ pub async fn me_zombie_writer_watchdog(pool: Arc<MePool>) {
);
}
Err(_) => {
pool.stats.increment_me_writer_teardown_timeout_total();
warn!(
writer_id,
had_clients,

View File

@@ -316,7 +316,12 @@ async fn reap_draining_writers_maintains_warn_state_subset_property_under_bulk_c
let ids = sorted_writer_ids(&pool).await;
for writer_id in ids.into_iter().take(3) {
let _ = pool.remove_writer_and_close_clients(writer_id).await;
let _ = pool
.remove_writer_and_close_clients(
writer_id,
crate::stats::MeWriterTeardownReason::ReapEmpty,
)
.await;
}
reap_draining_writers(&pool, &mut warn_next_allowed, &mut soft_evict_next_allowed).await;

View File

@@ -197,7 +197,9 @@ async fn reap_draining_writers_drops_warn_state_for_removed_writer() {
reap_draining_writers(&pool, &mut warn_next_allowed, &mut soft_evict_next_allowed).await;
assert!(warn_next_allowed.contains_key(&7));
let _ = pool.remove_writer_and_close_clients(7).await;
let _ = pool
.remove_writer_and_close_clients(7, crate::stats::MeWriterTeardownReason::ReapEmpty)
.await;
assert!(pool.registry.get_writer(conn_ids[0]).await.is_none());
reap_draining_writers(&pool, &mut warn_next_allowed, &mut soft_evict_next_allowed).await;
@@ -527,7 +529,12 @@ async fn reap_draining_writers_warn_state_never_exceeds_live_draining_population
let existing_writer_ids = current_writer_ids(&pool).await;
for writer_id in existing_writer_ids.into_iter().take(4) {
let _ = pool.remove_writer_and_close_clients(writer_id).await;
let _ = pool
.remove_writer_and_close_clients(
writer_id,
crate::stats::MeWriterTeardownReason::ReapEmpty,
)
.await;
}
reap_draining_writers(&pool, &mut warn_next_allowed, &mut soft_evict_next_allowed).await;
assert!(warn_next_allowed.len() <= pool.writers.read().await.len());

View File

@@ -16,6 +16,9 @@ use crate::config::MeBindStaleMode;
use crate::crypto::SecureRandom;
use crate::error::{ProxyError, Result};
use crate::protocol::constants::{RPC_CLOSE_EXT_U32, RPC_PING_U32};
use crate::stats::{
MeWriterCleanupSideEffectStep, MeWriterTeardownMode, MeWriterTeardownReason,
};
use super::codec::{RpcWriter, WriterCommand};
use super::pool::{MePool, MeWriter, WriterContour};
@@ -28,7 +31,7 @@ const ME_IDLE_KEEPALIVE_MAX_SECS: u64 = 5;
const ME_RPC_PROXY_REQ_RESPONSE_WAIT_MS: u64 = 700;
#[derive(Clone, Copy)]
enum WriterTeardownMode {
enum WriterRemoveGuardMode {
Any,
DrainingOnly,
}
@@ -49,9 +52,16 @@ impl MePool {
for writer_id in closed_writer_ids {
if self.registry.is_writer_empty(writer_id).await {
let _ = self.remove_writer_only(writer_id).await;
let _ = self
.remove_writer_only(writer_id, MeWriterTeardownReason::PruneClosedWriter)
.await;
} else {
let _ = self.remove_writer_and_close_clients(writer_id).await;
let _ = self
.remove_writer_and_close_clients(
writer_id,
MeWriterTeardownReason::PruneClosedWriter,
)
.await;
}
}
}
@@ -173,7 +183,11 @@ impl MePool {
.is_ok()
{
if let Some(pool) = pool_writer_task.upgrade() {
pool.remove_writer_and_close_clients(writer_id).await;
pool.remove_writer_and_close_clients(
writer_id,
MeWriterTeardownReason::WriterTaskExit,
)
.await;
} else {
cancel_wr.cancel();
}
@@ -264,7 +278,11 @@ impl MePool {
.is_ok()
{
if let Some(pool) = pool.upgrade() {
pool.remove_writer_and_close_clients(writer_id).await;
pool.remove_writer_and_close_clients(
writer_id,
MeWriterTeardownReason::ReaderExit,
)
.await;
} else {
// Fallback for shutdown races: make writer task exit quickly so stale
// channels are observable by periodic prune.
@@ -372,7 +390,11 @@ impl MePool {
.compare_exchange(false, true, Ordering::AcqRel, Ordering::Relaxed)
.is_ok()
{
pool.remove_writer_and_close_clients(writer_id).await;
pool.remove_writer_and_close_clients(
writer_id,
MeWriterTeardownReason::PingSendFail,
)
.await;
}
break;
}
@@ -465,7 +487,11 @@ impl MePool {
.compare_exchange(false, true, Ordering::AcqRel, Ordering::Relaxed)
.is_ok()
{
pool.remove_writer_and_close_clients(writer_id).await;
pool.remove_writer_and_close_clients(
writer_id,
MeWriterTeardownReason::SignalSendFail,
)
.await;
}
break;
}
@@ -499,7 +525,11 @@ impl MePool {
.compare_exchange(false, true, Ordering::AcqRel, Ordering::Relaxed)
.is_ok()
{
pool.remove_writer_and_close_clients(writer_id).await;
pool.remove_writer_and_close_clients(
writer_id,
MeWriterTeardownReason::SignalSendFail,
)
.await;
}
break;
}
@@ -512,25 +542,48 @@ impl MePool {
Ok(())
}
pub(crate) async fn remove_writer_and_close_clients(self: &Arc<Self>, writer_id: u64) {
pub(crate) async fn remove_writer_and_close_clients(
self: &Arc<Self>,
writer_id: u64,
reason: MeWriterTeardownReason,
) -> bool {
// Full client cleanup now happens inside `registry.writer_lost` to keep
// writer reap/remove paths strictly non-blocking per connection.
let _ = self
.remove_writer_with_mode(writer_id, WriterTeardownMode::Any)
.await;
self.remove_writer_with_mode(
writer_id,
reason,
MeWriterTeardownMode::Normal,
WriterRemoveGuardMode::Any,
)
.await
}
pub(super) async fn remove_draining_writer_hard_detach(
self: &Arc<Self>,
writer_id: u64,
reason: MeWriterTeardownReason,
) -> bool {
self.remove_writer_with_mode(writer_id, WriterTeardownMode::DrainingOnly)
.await
self.remove_writer_with_mode(
writer_id,
reason,
MeWriterTeardownMode::HardDetach,
WriterRemoveGuardMode::DrainingOnly,
)
.await
}
async fn remove_writer_only(self: &Arc<Self>, writer_id: u64) -> bool {
self.remove_writer_with_mode(writer_id, WriterTeardownMode::Any)
.await
async fn remove_writer_only(
self: &Arc<Self>,
writer_id: u64,
reason: MeWriterTeardownReason,
) -> bool {
self.remove_writer_with_mode(
writer_id,
reason,
MeWriterTeardownMode::Normal,
WriterRemoveGuardMode::Any,
)
.await
}
// Authoritative teardown primitive shared by normal cleanup and watchdog path.
@@ -542,8 +595,13 @@ impl MePool {
async fn remove_writer_with_mode(
self: &Arc<Self>,
writer_id: u64,
mode: WriterTeardownMode,
reason: MeWriterTeardownReason,
mode: MeWriterTeardownMode,
guard_mode: WriterRemoveGuardMode,
) -> bool {
let started_at = Instant::now();
self.stats
.increment_me_writer_teardown_attempt_total(reason, mode);
let mut close_tx: Option<mpsc::Sender<WriterCommand>> = None;
let mut removed_addr: Option<SocketAddr> = None;
let mut removed_dc: Option<i32> = None;
@@ -553,9 +611,12 @@ impl MePool {
{
let mut ws = self.writers.write().await;
if let Some(pos) = ws.iter().position(|w| w.id == writer_id) {
if matches!(mode, WriterTeardownMode::DrainingOnly)
if matches!(guard_mode, WriterRemoveGuardMode::DrainingOnly)
&& !ws[pos].draining.load(Ordering::Relaxed)
{
self.stats.increment_me_writer_teardown_noop_total();
self.stats
.observe_me_writer_teardown_duration(mode, started_at.elapsed());
return false;
}
let w = ws.remove(pos);
@@ -595,6 +656,9 @@ impl MePool {
self.stats.increment_me_writer_close_signal_drop_total();
self.stats
.increment_me_writer_close_signal_channel_full_total();
self.stats.increment_me_writer_cleanup_side_effect_failures_total(
MeWriterCleanupSideEffectStep::CloseSignalChannelFull,
);
debug!(
writer_id,
"Skipping close signal for removed writer: command channel is full"
@@ -602,6 +666,9 @@ impl MePool {
}
Err(TrySendError::Closed(_)) => {
self.stats.increment_me_writer_close_signal_drop_total();
self.stats.increment_me_writer_cleanup_side_effect_failures_total(
MeWriterCleanupSideEffectStep::CloseSignalChannelClosed,
);
debug!(
writer_id,
"Skipping close signal for removed writer: command channel is closed"
@@ -619,6 +686,13 @@ impl MePool {
self.trigger_immediate_refill_for_dc(addr, writer_dc);
}
}
if removed {
self.stats.increment_me_writer_teardown_success_total(mode);
} else {
self.stats.increment_me_writer_teardown_noop_total();
}
self.stats
.observe_me_writer_teardown_duration(mode, started_at.elapsed());
removed
}

View File

@@ -14,6 +14,7 @@ use crate::config::{MeRouteNoWriterMode, MeWriterPickMode};
use crate::error::{ProxyError, Result};
use crate::network::IpFamily;
use crate::protocol::constants::{RPC_CLOSE_CONN_U32, RPC_CLOSE_EXT_U32};
use crate::stats::MeWriterTeardownReason;
use super::MePool;
use super::codec::WriterCommand;
@@ -134,7 +135,11 @@ impl MePool {
Ok(()) => return Ok(()),
Err(TimedSendError::Closed(_)) => {
warn!(writer_id = current.writer_id, "ME writer channel closed");
self.remove_writer_and_close_clients(current.writer_id).await;
self.remove_writer_and_close_clients(
current.writer_id,
MeWriterTeardownReason::RouteChannelClosed,
)
.await;
continue;
}
Err(TimedSendError::Timeout(_)) => {
@@ -151,7 +156,11 @@ impl MePool {
}
Err(TrySendError::Closed(_)) => {
warn!(writer_id = current.writer_id, "ME writer channel closed");
self.remove_writer_and_close_clients(current.writer_id).await;
self.remove_writer_and_close_clients(
current.writer_id,
MeWriterTeardownReason::RouteChannelClosed,
)
.await;
continue;
}
}
@@ -458,7 +467,11 @@ impl MePool {
Err(TrySendError::Closed(_)) => {
self.stats.increment_me_writer_pick_closed_total(pick_mode);
warn!(writer_id = w.id, "ME writer channel closed");
self.remove_writer_and_close_clients(w.id).await;
self.remove_writer_and_close_clients(
w.id,
MeWriterTeardownReason::RouteChannelClosed,
)
.await;
continue;
}
}
@@ -503,7 +516,11 @@ impl MePool {
Err(TimedSendError::Closed(_)) => {
self.stats.increment_me_writer_pick_closed_total(pick_mode);
warn!(writer_id = w.id, "ME writer channel closed (blocking)");
self.remove_writer_and_close_clients(w.id).await;
self.remove_writer_and_close_clients(
w.id,
MeWriterTeardownReason::RouteChannelClosed,
)
.await;
}
Err(TimedSendError::Timeout(_)) => {
self.stats.increment_me_writer_pick_full_total(pick_mode);
@@ -654,7 +671,11 @@ impl MePool {
}
Err(TrySendError::Closed(_)) => {
debug!("ME close write failed");
self.remove_writer_and_close_clients(w.writer_id).await;
self.remove_writer_and_close_clients(
w.writer_id,
MeWriterTeardownReason::CloseRpcChannelClosed,
)
.await;
}
}
} else {