Unexpected-only Quarantine

Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com>
This commit is contained in:
Alexey 2026-03-25 22:25:39 +03:00
parent b94746a6e0
commit 70479c4094
No known key found for this signature in database
7 changed files with 125 additions and 9 deletions

2
Cargo.lock generated
View File

@ -2793,7 +2793,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
[[package]] [[package]]
name = "telemt" name = "telemt"
version = "3.3.31" version = "3.3.32"
dependencies = [ dependencies = [
"aes", "aes",
"anyhow", "anyhow",

View File

@ -1,6 +1,6 @@
[package] [package]
name = "telemt" name = "telemt"
version = "3.3.31" version = "3.3.32"
edition = "2024" edition = "2024"
[features] [features]

View File

@ -1558,6 +1558,40 @@ async fn render_metrics(stats: &Stats, config: &ProxyConfig, ip_tracker: &UserIp
0 0
} }
); );
let _ = writeln!(
out,
"# HELP telemt_me_endpoint_quarantine_unexpected_total ME endpoint quarantines caused by unexpected writer removals"
);
let _ = writeln!(
out,
"# TYPE telemt_me_endpoint_quarantine_unexpected_total counter"
);
let _ = writeln!(
out,
"telemt_me_endpoint_quarantine_unexpected_total {}",
if me_allows_normal {
stats.get_me_endpoint_quarantine_unexpected_total()
} else {
0
}
);
let _ = writeln!(
out,
"# HELP telemt_me_endpoint_quarantine_draining_suppressed_total Draining writer removals that skipped endpoint quarantine"
);
let _ = writeln!(
out,
"# TYPE telemt_me_endpoint_quarantine_draining_suppressed_total counter"
);
let _ = writeln!(
out,
"telemt_me_endpoint_quarantine_draining_suppressed_total {}",
if me_allows_normal {
stats.get_me_endpoint_quarantine_draining_suppressed_total()
} else {
0
}
);
let _ = writeln!( let _ = writeln!(
out, out,
@ -2622,6 +2656,9 @@ mod tests {
stats.increment_me_d2c_write_mode(crate::stats::MeD2cWriteMode::Coalesced); stats.increment_me_d2c_write_mode(crate::stats::MeD2cWriteMode::Coalesced);
stats.increment_me_d2c_quota_reject_total(crate::stats::MeD2cQuotaRejectStage::PostWrite); stats.increment_me_d2c_quota_reject_total(crate::stats::MeD2cQuotaRejectStage::PostWrite);
stats.observe_me_d2c_frame_buf_shrink(4096); stats.observe_me_d2c_frame_buf_shrink(4096);
stats.increment_me_endpoint_quarantine_total();
stats.increment_me_endpoint_quarantine_unexpected_total();
stats.increment_me_endpoint_quarantine_draining_suppressed_total();
stats.increment_user_connects("alice"); stats.increment_user_connects("alice");
stats.increment_user_curr_connects("alice"); stats.increment_user_curr_connects("alice");
stats.add_user_octets_from("alice", 1024); stats.add_user_octets_from("alice", 1024);
@ -2672,6 +2709,9 @@ mod tests {
assert!(output.contains("telemt_me_d2c_quota_reject_total{stage=\"post_write\"} 1")); assert!(output.contains("telemt_me_d2c_quota_reject_total{stage=\"post_write\"} 1"));
assert!(output.contains("telemt_me_d2c_frame_buf_shrink_total 1")); assert!(output.contains("telemt_me_d2c_frame_buf_shrink_total 1"));
assert!(output.contains("telemt_me_d2c_frame_buf_shrink_bytes_total 4096")); assert!(output.contains("telemt_me_d2c_frame_buf_shrink_bytes_total 4096"));
assert!(output.contains("telemt_me_endpoint_quarantine_total 1"));
assert!(output.contains("telemt_me_endpoint_quarantine_unexpected_total 1"));
assert!(output.contains("telemt_me_endpoint_quarantine_draining_suppressed_total 1"));
assert!(output.contains("telemt_user_connections_total{user=\"alice\"} 1")); assert!(output.contains("telemt_user_connections_total{user=\"alice\"} 1"));
assert!(output.contains("telemt_user_connections_current{user=\"alice\"} 1")); assert!(output.contains("telemt_user_connections_current{user=\"alice\"} 1"));
assert!(output.contains("telemt_user_octets_from_client{user=\"alice\"} 1024")); assert!(output.contains("telemt_user_octets_from_client{user=\"alice\"} 1024"));
@ -2738,6 +2778,12 @@ mod tests {
assert!(output.contains("# TYPE telemt_me_d2c_write_mode_total counter")); assert!(output.contains("# TYPE telemt_me_d2c_write_mode_total counter"));
assert!(output.contains("# TYPE telemt_me_d2c_batch_frames_bucket_total counter")); assert!(output.contains("# TYPE telemt_me_d2c_batch_frames_bucket_total counter"));
assert!(output.contains("# TYPE telemt_me_d2c_flush_duration_us_bucket_total counter")); assert!(output.contains("# TYPE telemt_me_d2c_flush_duration_us_bucket_total counter"));
assert!(output.contains("# TYPE telemt_me_endpoint_quarantine_total counter"));
assert!(output.contains("# TYPE telemt_me_endpoint_quarantine_unexpected_total counter"));
assert!(
output
.contains("# TYPE telemt_me_endpoint_quarantine_draining_suppressed_total counter")
);
assert!(output.contains("# TYPE telemt_me_writer_removed_total counter")); assert!(output.contains("# TYPE telemt_me_writer_removed_total counter"));
assert!( assert!(
output output

View File

@ -128,6 +128,8 @@ pub struct Stats {
me_crc_mismatch: AtomicU64, me_crc_mismatch: AtomicU64,
me_seq_mismatch: AtomicU64, me_seq_mismatch: AtomicU64,
me_endpoint_quarantine_total: AtomicU64, me_endpoint_quarantine_total: AtomicU64,
me_endpoint_quarantine_unexpected_total: AtomicU64,
me_endpoint_quarantine_draining_suppressed_total: AtomicU64,
me_kdf_drift_total: AtomicU64, me_kdf_drift_total: AtomicU64,
me_kdf_port_only_drift_total: AtomicU64, me_kdf_port_only_drift_total: AtomicU64,
me_hardswap_pending_reuse_total: AtomicU64, me_hardswap_pending_reuse_total: AtomicU64,
@ -1251,6 +1253,18 @@ impl Stats {
.fetch_add(1, Ordering::Relaxed); .fetch_add(1, Ordering::Relaxed);
} }
} }
pub fn increment_me_endpoint_quarantine_unexpected_total(&self) {
if self.telemetry_me_allows_normal() {
self.me_endpoint_quarantine_unexpected_total
.fetch_add(1, Ordering::Relaxed);
}
}
pub fn increment_me_endpoint_quarantine_draining_suppressed_total(&self) {
if self.telemetry_me_allows_normal() {
self.me_endpoint_quarantine_draining_suppressed_total
.fetch_add(1, Ordering::Relaxed);
}
}
pub fn increment_me_kdf_drift_total(&self) { pub fn increment_me_kdf_drift_total(&self) {
if self.telemetry_me_allows_normal() { if self.telemetry_me_allows_normal() {
self.me_kdf_drift_total.fetch_add(1, Ordering::Relaxed); self.me_kdf_drift_total.fetch_add(1, Ordering::Relaxed);
@ -1503,6 +1517,14 @@ impl Stats {
pub fn get_me_endpoint_quarantine_total(&self) -> u64 { pub fn get_me_endpoint_quarantine_total(&self) -> u64 {
self.me_endpoint_quarantine_total.load(Ordering::Relaxed) self.me_endpoint_quarantine_total.load(Ordering::Relaxed)
} }
pub fn get_me_endpoint_quarantine_unexpected_total(&self) -> u64 {
self.me_endpoint_quarantine_unexpected_total
.load(Ordering::Relaxed)
}
pub fn get_me_endpoint_quarantine_draining_suppressed_total(&self) -> u64 {
self.me_endpoint_quarantine_draining_suppressed_total
.load(Ordering::Relaxed)
}
pub fn get_me_kdf_drift_total(&self) -> u64 { pub fn get_me_kdf_drift_total(&self) -> u64 {
self.me_kdf_drift_total.load(Ordering::Relaxed) self.me_kdf_drift_total.load(Ordering::Relaxed)
} }

View File

@ -33,6 +33,7 @@ impl MePool {
&self, &self,
addr: SocketAddr, addr: SocketAddr,
uptime: Duration, uptime: Duration,
reason: &'static str,
) { ) {
if uptime > Duration::from_secs(ME_FLAP_UPTIME_THRESHOLD_SECS) { if uptime > Duration::from_secs(ME_FLAP_UPTIME_THRESHOLD_SECS) {
return; return;
@ -45,6 +46,7 @@ impl MePool {
self.stats.increment_me_endpoint_quarantine_total(); self.stats.increment_me_endpoint_quarantine_total();
warn!( warn!(
%addr, %addr,
reason,
uptime_ms = uptime.as_millis(), uptime_ms = uptime.as_millis(),
quarantine_secs = ME_FLAP_QUARANTINE_SECS, quarantine_secs = ME_FLAP_QUARANTINE_SECS,
"ME endpoint temporarily quarantined due to rapid writer flap" "ME endpoint temporarily quarantined due to rapid writer flap"

View File

@ -609,8 +609,21 @@ impl MePool {
} }
if let Some(addr) = removed_addr { if let Some(addr) = removed_addr {
if let Some(uptime) = removed_uptime { if let Some(uptime) = removed_uptime {
// Quarantine flapping endpoints regardless of draining state. // Quarantine contract: only unexpected removals are considered endpoint flap.
self.maybe_quarantine_flapping_endpoint(addr, uptime).await; if trigger_refill {
self.stats
.increment_me_endpoint_quarantine_unexpected_total();
self.maybe_quarantine_flapping_endpoint(addr, uptime, "unexpected")
.await;
} else {
self.stats
.increment_me_endpoint_quarantine_draining_suppressed_total();
debug!(
%addr,
uptime_ms = uptime.as_millis(),
"Skipping endpoint quarantine for draining writer removal"
);
}
} }
if trigger_refill && let Some(writer_dc) = removed_dc { if trigger_refill && let Some(writer_dc) = removed_dc {
self.trigger_immediate_refill_for_dc(addr, writer_dc); self.trigger_immediate_refill_for_dc(addr, writer_dc);

View File

@ -173,10 +173,15 @@ async fn bind_conn_to_writer(pool: &Arc<MePool>, writer_id: u64, port: u16) -> u
} }
#[tokio::test] #[tokio::test]
async fn remove_draining_writer_still_quarantines_flapping_endpoint() { async fn remove_draining_writer_does_not_quarantine_flapping_endpoint() {
let pool = make_pool().await; let pool = make_pool().await;
let writer_id = 77; let writer_id = 77;
let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 12, 0, 77)), 443); let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 12, 0, 77)), 443);
let before_total = pool.stats.get_me_endpoint_quarantine_total();
let before_unexpected = pool.stats.get_me_endpoint_quarantine_unexpected_total();
let before_suppressed = pool
.stats
.get_me_endpoint_quarantine_draining_suppressed_total();
insert_writer( insert_writer(
&pool, &pool,
writer_id, writer_id,
@ -200,8 +205,18 @@ async fn remove_draining_writer_still_quarantines_flapping_endpoint() {
"writer must be removed from pool after cleanup" "writer must be removed from pool after cleanup"
); );
assert!( assert!(
pool.is_endpoint_quarantined(addr).await, !pool.is_endpoint_quarantined(addr).await,
"draining removals must still quarantine flapping endpoints" "draining removals must not quarantine endpoint"
);
assert_eq!(pool.stats.get_me_endpoint_quarantine_total(), before_total);
assert_eq!(
pool.stats.get_me_endpoint_quarantine_unexpected_total(),
before_unexpected
);
assert_eq!(
pool.stats
.get_me_endpoint_quarantine_draining_suppressed_total(),
before_suppressed + 1
); );
assert_eq!(pool.conn_count.load(Ordering::Relaxed), 0); assert_eq!(pool.conn_count.load(Ordering::Relaxed), 0);
} }
@ -257,16 +272,21 @@ async fn edge_draining_only_detach_rejects_active_writer() {
} }
#[tokio::test] #[tokio::test]
async fn adversarial_blackhat_single_remove_establishes_single_quarantine_entry() { async fn adversarial_blackhat_single_unexpected_remove_establishes_single_quarantine_entry() {
let pool = make_pool().await; let pool = make_pool().await;
let writer_id = 93; let writer_id = 93;
let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 12, 0, 93)), 443); let addr = SocketAddr::new(IpAddr::V4(Ipv4Addr::new(127, 12, 0, 93)), 443);
let before_total = pool.stats.get_me_endpoint_quarantine_total();
let before_unexpected = pool.stats.get_me_endpoint_quarantine_unexpected_total();
let before_suppressed = pool
.stats
.get_me_endpoint_quarantine_draining_suppressed_total();
insert_writer( insert_writer(
&pool, &pool,
writer_id, writer_id,
2, 2,
addr, addr,
true, false,
Instant::now() - Duration::from_secs(1), Instant::now() - Duration::from_secs(1),
) )
.await; .await;
@ -274,6 +294,19 @@ async fn adversarial_blackhat_single_remove_establishes_single_quarantine_entry(
pool.remove_writer_and_close_clients(writer_id).await; pool.remove_writer_and_close_clients(writer_id).await;
assert!(pool.is_endpoint_quarantined(addr).await); assert!(pool.is_endpoint_quarantined(addr).await);
assert_eq!(pool.endpoint_quarantine.lock().await.len(), 1); assert_eq!(pool.endpoint_quarantine.lock().await.len(), 1);
assert_eq!(
pool.stats.get_me_endpoint_quarantine_total(),
before_total + 1
);
assert_eq!(
pool.stats.get_me_endpoint_quarantine_unexpected_total(),
before_unexpected + 1
);
assert_eq!(
pool.stats
.get_me_endpoint_quarantine_draining_suppressed_total(),
before_suppressed
);
} }
#[tokio::test] #[tokio::test]