mirror of https://github.com/telemt/telemt.git
UpstreamManager: Backoff Retries
This commit is contained in:
parent
6c12af2b94
commit
9afaa28add
|
|
@ -8,6 +8,8 @@ const DEFAULT_STUN_TCP_FALLBACK: bool = true;
|
|||
const DEFAULT_MIDDLE_PROXY_WARM_STANDBY: usize = 16;
|
||||
const DEFAULT_ME_RECONNECT_MAX_CONCURRENT_PER_DC: u32 = 8;
|
||||
const DEFAULT_ME_RECONNECT_FAST_RETRY_COUNT: u32 = 16;
|
||||
const DEFAULT_UPSTREAM_CONNECT_RETRY_ATTEMPTS: u32 = 3;
|
||||
const DEFAULT_UPSTREAM_UNHEALTHY_FAIL_THRESHOLD: u32 = 4;
|
||||
const DEFAULT_LISTEN_ADDR_IPV6: &str = "::";
|
||||
const DEFAULT_ACCESS_USER: &str = "default";
|
||||
const DEFAULT_ACCESS_SECRET: &str = "00000000000000000000000000000000";
|
||||
|
|
@ -158,6 +160,18 @@ pub(crate) fn default_me_reconnect_fast_retry_count() -> u32 {
|
|||
DEFAULT_ME_RECONNECT_FAST_RETRY_COUNT
|
||||
}
|
||||
|
||||
pub(crate) fn default_upstream_connect_retry_attempts() -> u32 {
|
||||
DEFAULT_UPSTREAM_CONNECT_RETRY_ATTEMPTS
|
||||
}
|
||||
|
||||
pub(crate) fn default_upstream_connect_retry_backoff_ms() -> u64 {
|
||||
250
|
||||
}
|
||||
|
||||
pub(crate) fn default_upstream_unhealthy_fail_threshold() -> u32 {
|
||||
DEFAULT_UPSTREAM_UNHEALTHY_FAIL_THRESHOLD
|
||||
}
|
||||
|
||||
pub(crate) fn default_crypto_pending_buffer() -> usize {
|
||||
256 * 1024
|
||||
}
|
||||
|
|
|
|||
|
|
@ -117,6 +117,14 @@ fn warn_non_hot_changes(old: &ProxyConfig, new: &ProxyConfig) {
|
|||
if old.general.stun_nat_probe_concurrency != new.general.stun_nat_probe_concurrency {
|
||||
warn!("config reload: general.stun_nat_probe_concurrency changed; restart required");
|
||||
}
|
||||
if old.general.upstream_connect_retry_attempts != new.general.upstream_connect_retry_attempts
|
||||
|| old.general.upstream_connect_retry_backoff_ms
|
||||
!= new.general.upstream_connect_retry_backoff_ms
|
||||
|| old.general.upstream_unhealthy_fail_threshold
|
||||
!= new.general.upstream_unhealthy_fail_threshold
|
||||
{
|
||||
warn!("config reload: general.upstream_* changed; restart required");
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve the public host for link generation — mirrors the logic in main.rs.
|
||||
|
|
|
|||
|
|
@ -237,6 +237,18 @@ impl ProxyConfig {
|
|||
));
|
||||
}
|
||||
|
||||
if config.general.upstream_connect_retry_attempts == 0 {
|
||||
return Err(ProxyError::Config(
|
||||
"general.upstream_connect_retry_attempts must be > 0".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if config.general.upstream_unhealthy_fail_threshold == 0 {
|
||||
return Err(ProxyError::Config(
|
||||
"general.upstream_unhealthy_fail_threshold must be > 0".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if config.general.me_reinit_every_secs == 0 {
|
||||
return Err(ProxyError::Config(
|
||||
"general.me_reinit_every_secs must be > 0".to_string(),
|
||||
|
|
@ -567,6 +579,18 @@ mod tests {
|
|||
cfg.general.me_reconnect_fast_retry_count,
|
||||
default_me_reconnect_fast_retry_count()
|
||||
);
|
||||
assert_eq!(
|
||||
cfg.general.upstream_connect_retry_attempts,
|
||||
default_upstream_connect_retry_attempts()
|
||||
);
|
||||
assert_eq!(
|
||||
cfg.general.upstream_connect_retry_backoff_ms,
|
||||
default_upstream_connect_retry_backoff_ms()
|
||||
);
|
||||
assert_eq!(
|
||||
cfg.general.upstream_unhealthy_fail_threshold,
|
||||
default_upstream_unhealthy_fail_threshold()
|
||||
);
|
||||
assert_eq!(cfg.general.update_every, default_update_every());
|
||||
assert_eq!(cfg.server.listen_addr_ipv4, default_listen_addr_ipv4());
|
||||
assert_eq!(cfg.server.listen_addr_ipv6, default_listen_addr_ipv6_opt());
|
||||
|
|
@ -593,6 +617,18 @@ mod tests {
|
|||
general.me_reconnect_fast_retry_count,
|
||||
default_me_reconnect_fast_retry_count()
|
||||
);
|
||||
assert_eq!(
|
||||
general.upstream_connect_retry_attempts,
|
||||
default_upstream_connect_retry_attempts()
|
||||
);
|
||||
assert_eq!(
|
||||
general.upstream_connect_retry_backoff_ms,
|
||||
default_upstream_connect_retry_backoff_ms()
|
||||
);
|
||||
assert_eq!(
|
||||
general.upstream_unhealthy_fail_threshold,
|
||||
default_upstream_unhealthy_fail_threshold()
|
||||
);
|
||||
assert_eq!(general.update_every, default_update_every());
|
||||
|
||||
let server = ServerConfig::default();
|
||||
|
|
@ -765,6 +801,46 @@ mod tests {
|
|||
let _ = std::fs::remove_file(path);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn upstream_connect_retry_attempts_zero_is_rejected() {
|
||||
let toml = r#"
|
||||
[general]
|
||||
upstream_connect_retry_attempts = 0
|
||||
|
||||
[censorship]
|
||||
tls_domain = "example.com"
|
||||
|
||||
[access.users]
|
||||
user = "00000000000000000000000000000000"
|
||||
"#;
|
||||
let dir = std::env::temp_dir();
|
||||
let path = dir.join("telemt_upstream_connect_retry_attempts_zero_test.toml");
|
||||
std::fs::write(&path, toml).unwrap();
|
||||
let err = ProxyConfig::load(&path).unwrap_err().to_string();
|
||||
assert!(err.contains("general.upstream_connect_retry_attempts must be > 0"));
|
||||
let _ = std::fs::remove_file(path);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn upstream_unhealthy_fail_threshold_zero_is_rejected() {
|
||||
let toml = r#"
|
||||
[general]
|
||||
upstream_unhealthy_fail_threshold = 0
|
||||
|
||||
[censorship]
|
||||
tls_domain = "example.com"
|
||||
|
||||
[access.users]
|
||||
user = "00000000000000000000000000000000"
|
||||
"#;
|
||||
let dir = std::env::temp_dir();
|
||||
let path = dir.join("telemt_upstream_unhealthy_fail_threshold_zero_test.toml");
|
||||
std::fs::write(&path, toml).unwrap();
|
||||
let err = ProxyConfig::load(&path).unwrap_err().to_string();
|
||||
assert!(err.contains("general.upstream_unhealthy_fail_threshold must be > 0"));
|
||||
let _ = std::fs::remove_file(path);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn me_hardswap_warmup_defaults_are_set() {
|
||||
let toml = r#"
|
||||
|
|
|
|||
|
|
@ -365,6 +365,18 @@ pub struct GeneralConfig {
|
|||
#[serde(default = "default_me_reconnect_fast_retry_count")]
|
||||
pub me_reconnect_fast_retry_count: u32,
|
||||
|
||||
/// Connect attempts for the selected upstream before returning error/fallback.
|
||||
#[serde(default = "default_upstream_connect_retry_attempts")]
|
||||
pub upstream_connect_retry_attempts: u32,
|
||||
|
||||
/// Delay in milliseconds between upstream connect attempts.
|
||||
#[serde(default = "default_upstream_connect_retry_backoff_ms")]
|
||||
pub upstream_connect_retry_backoff_ms: u64,
|
||||
|
||||
/// Consecutive failed requests before upstream is marked unhealthy.
|
||||
#[serde(default = "default_upstream_unhealthy_fail_threshold")]
|
||||
pub upstream_unhealthy_fail_threshold: u32,
|
||||
|
||||
/// Ignore STUN/interface IP mismatch (keep using Middle Proxy even if NAT detected).
|
||||
#[serde(default)]
|
||||
pub stun_iface_mismatch_ignore: bool,
|
||||
|
|
@ -522,6 +534,9 @@ impl Default for GeneralConfig {
|
|||
me_reconnect_backoff_base_ms: default_reconnect_backoff_base_ms(),
|
||||
me_reconnect_backoff_cap_ms: default_reconnect_backoff_cap_ms(),
|
||||
me_reconnect_fast_retry_count: default_me_reconnect_fast_retry_count(),
|
||||
upstream_connect_retry_attempts: default_upstream_connect_retry_attempts(),
|
||||
upstream_connect_retry_backoff_ms: default_upstream_connect_retry_backoff_ms(),
|
||||
upstream_unhealthy_fail_threshold: default_upstream_unhealthy_fail_threshold(),
|
||||
stun_iface_mismatch_ignore: false,
|
||||
unknown_dc_log_path: default_unknown_dc_log_path(),
|
||||
log_level: LogLevel::Normal,
|
||||
|
|
|
|||
|
|
@ -261,7 +261,12 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
|||
warn!("Using default tls_domain. Consider setting a custom domain.");
|
||||
}
|
||||
|
||||
let upstream_manager = Arc::new(UpstreamManager::new(config.upstreams.clone()));
|
||||
let upstream_manager = Arc::new(UpstreamManager::new(
|
||||
config.upstreams.clone(),
|
||||
config.general.upstream_connect_retry_attempts,
|
||||
config.general.upstream_connect_retry_backoff_ms,
|
||||
config.general.upstream_unhealthy_fail_threshold,
|
||||
));
|
||||
|
||||
let mut tls_domains = Vec::with_capacity(1 + config.censorship.tls_domains.len());
|
||||
tls_domains.push(config.censorship.tls_domain.clone());
|
||||
|
|
|
|||
|
|
@ -172,10 +172,18 @@ pub struct UpstreamEgressInfo {
|
|||
#[derive(Clone)]
|
||||
pub struct UpstreamManager {
|
||||
upstreams: Arc<RwLock<Vec<UpstreamState>>>,
|
||||
connect_retry_attempts: u32,
|
||||
connect_retry_backoff: Duration,
|
||||
unhealthy_fail_threshold: u32,
|
||||
}
|
||||
|
||||
impl UpstreamManager {
|
||||
pub fn new(configs: Vec<UpstreamConfig>) -> Self {
|
||||
pub fn new(
|
||||
configs: Vec<UpstreamConfig>,
|
||||
connect_retry_attempts: u32,
|
||||
connect_retry_backoff_ms: u64,
|
||||
unhealthy_fail_threshold: u32,
|
||||
) -> Self {
|
||||
let states = configs.into_iter()
|
||||
.filter(|c| c.enabled)
|
||||
.map(UpstreamState::new)
|
||||
|
|
@ -183,6 +191,9 @@ impl UpstreamManager {
|
|||
|
||||
Self {
|
||||
upstreams: Arc::new(RwLock::new(states)),
|
||||
connect_retry_attempts: connect_retry_attempts.max(1),
|
||||
connect_retry_backoff: Duration::from_millis(connect_retry_backoff_ms),
|
||||
unhealthy_fail_threshold: unhealthy_fail_threshold.max(1),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -430,43 +441,83 @@ impl UpstreamManager {
|
|||
upstream.selected_scope = s.to_string();
|
||||
}
|
||||
|
||||
let start = Instant::now();
|
||||
|
||||
let bind_rr = {
|
||||
let guard = self.upstreams.read().await;
|
||||
guard.get(idx).map(|u| u.bind_rr.clone())
|
||||
};
|
||||
|
||||
match self.connect_via_upstream(&upstream, target, bind_rr).await {
|
||||
Ok((stream, egress)) => {
|
||||
let rtt_ms = start.elapsed().as_secs_f64() * 1000.0;
|
||||
let mut guard = self.upstreams.write().await;
|
||||
if let Some(u) = guard.get_mut(idx) {
|
||||
if !u.healthy {
|
||||
debug!(rtt_ms = format!("{:.1}", rtt_ms), "Upstream recovered");
|
||||
}
|
||||
u.healthy = true;
|
||||
u.fails = 0;
|
||||
let mut last_error: Option<ProxyError> = None;
|
||||
for attempt in 1..=self.connect_retry_attempts {
|
||||
let start = Instant::now();
|
||||
match self
|
||||
.connect_via_upstream(&upstream, target, bind_rr.clone())
|
||||
.await
|
||||
{
|
||||
Ok((stream, egress)) => {
|
||||
let rtt_ms = start.elapsed().as_secs_f64() * 1000.0;
|
||||
let mut guard = self.upstreams.write().await;
|
||||
if let Some(u) = guard.get_mut(idx) {
|
||||
if !u.healthy {
|
||||
debug!(rtt_ms = format!("{:.1}", rtt_ms), "Upstream recovered");
|
||||
}
|
||||
if attempt > 1 {
|
||||
debug!(
|
||||
attempt,
|
||||
attempts = self.connect_retry_attempts,
|
||||
rtt_ms = format!("{:.1}", rtt_ms),
|
||||
"Upstream connect recovered after retry"
|
||||
);
|
||||
}
|
||||
u.healthy = true;
|
||||
u.fails = 0;
|
||||
|
||||
if let Some(di) = dc_idx.and_then(UpstreamState::dc_array_idx) {
|
||||
u.dc_latency[di].update(rtt_ms);
|
||||
if let Some(di) = dc_idx.and_then(UpstreamState::dc_array_idx) {
|
||||
u.dc_latency[di].update(rtt_ms);
|
||||
}
|
||||
}
|
||||
return Ok((stream, egress));
|
||||
}
|
||||
Ok((stream, egress))
|
||||
},
|
||||
Err(e) => {
|
||||
let mut guard = self.upstreams.write().await;
|
||||
if let Some(u) = guard.get_mut(idx) {
|
||||
u.fails += 1;
|
||||
warn!(fails = u.fails, "Upstream failed: {}", e);
|
||||
if u.fails > 3 {
|
||||
u.healthy = false;
|
||||
warn!("Upstream marked unhealthy");
|
||||
Err(e) => {
|
||||
if attempt < self.connect_retry_attempts {
|
||||
debug!(
|
||||
attempt,
|
||||
attempts = self.connect_retry_attempts,
|
||||
target = %target,
|
||||
error = %e,
|
||||
"Upstream connect attempt failed, retrying"
|
||||
);
|
||||
if !self.connect_retry_backoff.is_zero() {
|
||||
tokio::time::sleep(self.connect_retry_backoff).await;
|
||||
}
|
||||
}
|
||||
last_error = Some(e);
|
||||
}
|
||||
Err(e)
|
||||
}
|
||||
}
|
||||
|
||||
let error = last_error.unwrap_or_else(|| {
|
||||
ProxyError::Config("Upstream connect attempts exhausted".to_string())
|
||||
});
|
||||
|
||||
let mut guard = self.upstreams.write().await;
|
||||
if let Some(u) = guard.get_mut(idx) {
|
||||
u.fails += 1;
|
||||
warn!(
|
||||
fails = u.fails,
|
||||
attempts = self.connect_retry_attempts,
|
||||
"Upstream failed after retries: {}",
|
||||
error
|
||||
);
|
||||
if u.fails >= self.unhealthy_fail_threshold {
|
||||
u.healthy = false;
|
||||
warn!(
|
||||
fails = u.fails,
|
||||
threshold = self.unhealthy_fail_threshold,
|
||||
"Upstream marked unhealthy"
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(error)
|
||||
}
|
||||
|
||||
async fn connect_via_upstream(
|
||||
|
|
@ -1035,18 +1086,26 @@ impl UpstreamManager {
|
|||
u.fails += 1;
|
||||
debug!(dc = dc_zero_idx + 1, fails = u.fails,
|
||||
"Health check failed (both): {}", e);
|
||||
if u.fails > 3 {
|
||||
if u.fails >= self.unhealthy_fail_threshold {
|
||||
u.healthy = false;
|
||||
warn!("Upstream unhealthy (fails)");
|
||||
warn!(
|
||||
fails = u.fails,
|
||||
threshold = self.unhealthy_fail_threshold,
|
||||
"Upstream unhealthy (fails)"
|
||||
);
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
u.fails += 1;
|
||||
debug!(dc = dc_zero_idx + 1, fails = u.fails,
|
||||
"Health check timeout (both)");
|
||||
if u.fails > 3 {
|
||||
if u.fails >= self.unhealthy_fail_threshold {
|
||||
u.healthy = false;
|
||||
warn!("Upstream unhealthy (timeout)");
|
||||
warn!(
|
||||
fails = u.fails,
|
||||
threshold = self.unhealthy_fail_threshold,
|
||||
"Upstream unhealthy (timeout)"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1057,9 +1116,13 @@ impl UpstreamManager {
|
|||
let mut guard = self.upstreams.write().await;
|
||||
let u = &mut guard[i];
|
||||
u.fails += 1;
|
||||
if u.fails > 3 {
|
||||
if u.fails >= self.unhealthy_fail_threshold {
|
||||
u.healthy = false;
|
||||
warn!("Upstream unhealthy (no fallback family)");
|
||||
warn!(
|
||||
fails = u.fails,
|
||||
threshold = self.unhealthy_fail_threshold,
|
||||
"Upstream unhealthy (no fallback family)"
|
||||
);
|
||||
}
|
||||
u.last_check = std::time::Instant::now();
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue