mirror of
https://github.com/telemt/telemt.git
synced 2026-04-30 00:44:09 +03:00
Instadrain + Hard-remove for long draining-state
Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com>
This commit is contained in:
@@ -12,6 +12,7 @@ use crate::crypto::SecureRandom;
|
||||
use crate::network::IpFamily;
|
||||
|
||||
use super::MePool;
|
||||
use super::pool::MeWriter;
|
||||
|
||||
const JITTER_FRAC_NUM: u64 = 2; // jitter up to 50% of backoff
|
||||
#[allow(dead_code)]
|
||||
@@ -31,10 +32,7 @@ const HEALTH_DRAIN_CLOSE_BUDGET_MAX: usize = 256;
|
||||
const HEALTH_DRAIN_SOFT_EVICT_BUDGET_MIN: usize = 8;
|
||||
const HEALTH_DRAIN_SOFT_EVICT_BUDGET_MAX: usize = 256;
|
||||
const HEALTH_DRAIN_REAP_OPPORTUNISTIC_INTERVAL_SECS: u64 = 1;
|
||||
#[cfg(not(test))]
|
||||
const HEALTH_DRAIN_STRICT_IMMEDIATE_FORCE_CLOSE: bool = true;
|
||||
#[cfg(test)]
|
||||
const HEALTH_DRAIN_STRICT_IMMEDIATE_FORCE_CLOSE: bool = false;
|
||||
const HEALTH_DRAIN_TIMEOUT_ENFORCER_INTERVAL_SECS: u64 = 1;
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct DcFloorPlanEntry {
|
||||
@@ -131,6 +129,55 @@ pub async fn me_health_monitor(pool: Arc<MePool>, rng: Arc<SecureRandom>, _min_c
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn me_drain_timeout_enforcer(pool: Arc<MePool>) {
|
||||
let mut drain_warn_next_allowed: HashMap<u64, Instant> = HashMap::new();
|
||||
let mut drain_soft_evict_next_allowed: HashMap<u64, Instant> = HashMap::new();
|
||||
loop {
|
||||
tokio::time::sleep(Duration::from_secs(
|
||||
HEALTH_DRAIN_TIMEOUT_ENFORCER_INTERVAL_SECS,
|
||||
))
|
||||
.await;
|
||||
reap_draining_writers(
|
||||
&pool,
|
||||
&mut drain_warn_next_allowed,
|
||||
&mut drain_soft_evict_next_allowed,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
fn draining_writer_timeout_expired(
|
||||
pool: &MePool,
|
||||
writer: &MeWriter,
|
||||
now_epoch_secs: u64,
|
||||
drain_ttl_secs: u64,
|
||||
) -> bool {
|
||||
if pool
|
||||
.me_instadrain
|
||||
.load(std::sync::atomic::Ordering::Relaxed)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
let deadline_epoch_secs = writer
|
||||
.drain_deadline_epoch_secs
|
||||
.load(std::sync::atomic::Ordering::Relaxed);
|
||||
if deadline_epoch_secs != 0 {
|
||||
return now_epoch_secs >= deadline_epoch_secs;
|
||||
}
|
||||
|
||||
if drain_ttl_secs == 0 {
|
||||
return false;
|
||||
}
|
||||
let drain_started_at_epoch_secs = writer
|
||||
.draining_started_at_epoch_secs
|
||||
.load(std::sync::atomic::Ordering::Relaxed);
|
||||
if drain_started_at_epoch_secs == 0 {
|
||||
return false;
|
||||
}
|
||||
now_epoch_secs.saturating_sub(drain_started_at_epoch_secs) > drain_ttl_secs
|
||||
}
|
||||
|
||||
pub(super) async fn reap_draining_writers(
|
||||
pool: &Arc<MePool>,
|
||||
warn_next_allowed: &mut HashMap<u64, Instant>,
|
||||
@@ -146,11 +193,16 @@ pub(super) async fn reap_draining_writers(
|
||||
let activity = pool.registry.writer_activity_snapshot().await;
|
||||
let mut draining_writers = Vec::new();
|
||||
let mut empty_writer_ids = Vec::<u64>::new();
|
||||
let mut timeout_expired_writer_ids = Vec::<u64>::new();
|
||||
let mut force_close_writer_ids = Vec::<u64>::new();
|
||||
for writer in writers {
|
||||
if !writer.draining.load(std::sync::atomic::Ordering::Relaxed) {
|
||||
continue;
|
||||
}
|
||||
if draining_writer_timeout_expired(pool, &writer, now_epoch_secs, drain_ttl_secs) {
|
||||
timeout_expired_writer_ids.push(writer.id);
|
||||
continue;
|
||||
}
|
||||
if activity
|
||||
.bound_clients_by_writer
|
||||
.get(&writer.id)
|
||||
@@ -163,11 +215,6 @@ pub(super) async fn reap_draining_writers(
|
||||
}
|
||||
draining_writers.push(writer);
|
||||
}
|
||||
if HEALTH_DRAIN_STRICT_IMMEDIATE_FORCE_CLOSE {
|
||||
for writer in draining_writers.drain(..) {
|
||||
force_close_writer_ids.push(writer.id);
|
||||
}
|
||||
}
|
||||
|
||||
if drain_threshold > 0 && draining_writers.len() > drain_threshold as usize {
|
||||
draining_writers.sort_by(|left, right| {
|
||||
@@ -221,14 +268,6 @@ pub(super) async fn reap_draining_writers(
|
||||
"ME draining writer remains non-empty past drain TTL"
|
||||
);
|
||||
}
|
||||
let deadline_epoch_secs = writer
|
||||
.drain_deadline_epoch_secs
|
||||
.load(std::sync::atomic::Ordering::Relaxed);
|
||||
if deadline_epoch_secs != 0 && now_epoch_secs >= deadline_epoch_secs {
|
||||
warn!(writer_id = writer.id, "Drain timeout, force-closing");
|
||||
force_close_writer_ids.push(writer.id);
|
||||
active_draining_writer_ids.remove(&writer.id);
|
||||
}
|
||||
}
|
||||
|
||||
warn_next_allowed.retain(|writer_id, _| active_draining_writer_ids.contains(writer_id));
|
||||
@@ -313,15 +352,21 @@ pub(super) async fn reap_draining_writers(
|
||||
}
|
||||
}
|
||||
|
||||
let mut closed_writer_ids = HashSet::<u64>::new();
|
||||
for writer_id in timeout_expired_writer_ids {
|
||||
if !closed_writer_ids.insert(writer_id) {
|
||||
continue;
|
||||
}
|
||||
pool.stats.increment_pool_force_close_total();
|
||||
pool.remove_writer_and_close_clients(writer_id).await;
|
||||
pool.stats
|
||||
.increment_me_draining_writers_reap_progress_total();
|
||||
}
|
||||
|
||||
let requested_force_close = force_close_writer_ids.len();
|
||||
let requested_empty_close = empty_writer_ids.len();
|
||||
let requested_close_total = requested_force_close.saturating_add(requested_empty_close);
|
||||
let close_budget = if HEALTH_DRAIN_STRICT_IMMEDIATE_FORCE_CLOSE {
|
||||
requested_close_total
|
||||
} else {
|
||||
health_drain_close_budget()
|
||||
};
|
||||
let mut closed_writer_ids = HashSet::<u64>::new();
|
||||
let close_budget = health_drain_close_budget();
|
||||
let mut closed_total = 0usize;
|
||||
for writer_id in force_close_writer_ids {
|
||||
if closed_total >= close_budget {
|
||||
@@ -1581,6 +1626,7 @@ mod tests {
|
||||
general.me_adaptive_floor_max_warm_writers_global,
|
||||
general.hardswap,
|
||||
general.me_pool_drain_ttl_secs,
|
||||
general.me_instadrain,
|
||||
general.me_pool_drain_threshold,
|
||||
general.me_pool_drain_soft_evict_enabled,
|
||||
general.me_pool_drain_soft_evict_grace_secs,
|
||||
|
||||
Reference in New Issue
Block a user