mirror of
https://github.com/telemt/telemt.git
synced 2026-04-15 09:34:10 +03:00
Autofallback on ME-Init
This commit is contained in:
@@ -14,10 +14,27 @@ use super::MePool;
|
||||
pub async fn me_health_monitor(pool: Arc<MePool>, rng: Arc<SecureRandom>, _min_connections: usize) {
|
||||
let mut backoff: HashMap<(i32, IpFamily), u64> = HashMap::new();
|
||||
let mut last_attempt: HashMap<(i32, IpFamily), Instant> = HashMap::new();
|
||||
let mut inflight_single: HashSet<(i32, IpFamily)> = HashSet::new();
|
||||
loop {
|
||||
tokio::time::sleep(Duration::from_secs(30)).await;
|
||||
check_family(IpFamily::V4, &pool, &rng, &mut backoff, &mut last_attempt).await;
|
||||
check_family(IpFamily::V6, &pool, &rng, &mut backoff, &mut last_attempt).await;
|
||||
check_family(
|
||||
IpFamily::V4,
|
||||
&pool,
|
||||
&rng,
|
||||
&mut backoff,
|
||||
&mut last_attempt,
|
||||
&mut inflight_single,
|
||||
)
|
||||
.await;
|
||||
check_family(
|
||||
IpFamily::V6,
|
||||
&pool,
|
||||
&rng,
|
||||
&mut backoff,
|
||||
&mut last_attempt,
|
||||
&mut inflight_single,
|
||||
)
|
||||
.await;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -27,6 +44,7 @@ async fn check_family(
|
||||
rng: &Arc<SecureRandom>,
|
||||
backoff: &mut HashMap<(i32, IpFamily), u64>,
|
||||
last_attempt: &mut HashMap<(i32, IpFamily), Instant>,
|
||||
inflight_single: &mut HashSet<(i32, IpFamily)>,
|
||||
) {
|
||||
let enabled = match family {
|
||||
IpFamily::V4 => pool.decision.ipv4_me,
|
||||
@@ -48,16 +66,24 @@ async fn check_family(
|
||||
.map(|w| w.addr)
|
||||
.collect();
|
||||
|
||||
for (dc, addrs) in map.iter() {
|
||||
let dc_addrs: Vec<SocketAddr> = addrs
|
||||
.iter()
|
||||
.map(|(ip, port)| SocketAddr::new(*ip, *port))
|
||||
.collect();
|
||||
let entries: Vec<(i32, Vec<SocketAddr>)> = map
|
||||
.iter()
|
||||
.map(|(dc, addrs)| {
|
||||
let list = addrs
|
||||
.iter()
|
||||
.map(|(ip, port)| SocketAddr::new(*ip, *port))
|
||||
.collect::<Vec<_>>();
|
||||
(*dc, list)
|
||||
})
|
||||
.collect();
|
||||
|
||||
for (dc, dc_addrs) in entries {
|
||||
let has_coverage = dc_addrs.iter().any(|a| writer_addrs.contains(a));
|
||||
if has_coverage {
|
||||
inflight_single.remove(&(dc, family));
|
||||
continue;
|
||||
}
|
||||
let key = (*dc, family);
|
||||
let key = (dc, family);
|
||||
let delay = *backoff.get(&key).unwrap_or(&30);
|
||||
let now = Instant::now();
|
||||
if let Some(last) = last_attempt.get(&key) {
|
||||
@@ -65,6 +91,64 @@ async fn check_family(
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if dc_addrs.len() == 1 {
|
||||
// Single ME address: fast retries then slower background retries.
|
||||
if inflight_single.contains(&key) {
|
||||
continue;
|
||||
}
|
||||
inflight_single.insert(key);
|
||||
let addr = dc_addrs[0];
|
||||
let dc_id = dc;
|
||||
let pool_clone = pool.clone();
|
||||
let rng_clone = rng.clone();
|
||||
let timeout = pool.me_one_timeout;
|
||||
let quick_attempts = pool.me_one_retry.max(1);
|
||||
tokio::spawn(async move {
|
||||
let mut success = false;
|
||||
for _ in 0..quick_attempts {
|
||||
let res = tokio::time::timeout(timeout, pool_clone.connect_one(addr, rng_clone.as_ref())).await;
|
||||
match res {
|
||||
Ok(Ok(())) => {
|
||||
info!(%addr, dc = %dc_id, ?family, "ME reconnected for DC coverage");
|
||||
success = true;
|
||||
break;
|
||||
}
|
||||
Ok(Err(e)) => debug!(%addr, dc = %dc_id, error = %e, ?family, "ME reconnect failed"),
|
||||
Err(_) => debug!(%addr, dc = %dc_id, ?family, "ME reconnect timed out"),
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(1000)).await;
|
||||
}
|
||||
if success {
|
||||
return;
|
||||
}
|
||||
let timeout_ms = timeout.as_millis();
|
||||
warn!(
|
||||
dc = %dc_id,
|
||||
?family,
|
||||
attempts = quick_attempts,
|
||||
timeout_ms,
|
||||
"DC={} has no ME coverage: {} tries * {} ms... retry in 5 seconds...",
|
||||
dc_id,
|
||||
quick_attempts,
|
||||
timeout_ms
|
||||
);
|
||||
loop {
|
||||
tokio::time::sleep(Duration::from_secs(5)).await;
|
||||
let res = tokio::time::timeout(timeout, pool_clone.connect_one(addr, rng_clone.as_ref())).await;
|
||||
match res {
|
||||
Ok(Ok(())) => {
|
||||
info!(%addr, dc = %dc_id, ?family, "ME reconnected for DC coverage");
|
||||
break;
|
||||
}
|
||||
Ok(Err(e)) => debug!(%addr, dc = %dc_id, error = %e, ?family, "ME reconnect failed"),
|
||||
Err(_) => debug!(%addr, dc = %dc_id, ?family, "ME reconnect timed out"),
|
||||
}
|
||||
}
|
||||
// will drop inflight flag in outer loop when coverage detected
|
||||
});
|
||||
continue;
|
||||
}
|
||||
|
||||
warn!(dc = %dc, delay, ?family, "DC has no ME coverage, reconnecting...");
|
||||
let mut shuffled = dc_addrs.clone();
|
||||
shuffled.shuffle(&mut rand::rng());
|
||||
|
||||
Reference in New Issue
Block a user