mirror of https://github.com/telemt/telemt.git
ME Healthcheck + ME Keepalive + ME Pool in Metrics: merge pull request #297 from telemt/flow-drift
ME Healthcheck + ME Keepalive + ME Pool in Metrics
This commit is contained in:
commit
07ec84d071
|
|
@ -130,11 +130,11 @@ pub(crate) fn default_middle_proxy_warm_standby() -> usize {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn default_keepalive_interval() -> u64 {
|
pub(crate) fn default_keepalive_interval() -> u64 {
|
||||||
25
|
8
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn default_keepalive_jitter() -> u64 {
|
pub(crate) fn default_keepalive_jitter() -> u64 {
|
||||||
5
|
2
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn default_warmup_step_delay_ms() -> u64 {
|
pub(crate) fn default_warmup_step_delay_ms() -> u64 {
|
||||||
|
|
|
||||||
|
|
@ -449,6 +449,21 @@ async fn render_metrics(stats: &Stats, config: &ProxyConfig, ip_tracker: &UserIp
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let _ = writeln!(
|
||||||
|
out,
|
||||||
|
"# HELP telemt_me_kdf_port_only_drift_total ME KDF client-port changes with stable non-port material"
|
||||||
|
);
|
||||||
|
let _ = writeln!(out, "# TYPE telemt_me_kdf_port_only_drift_total counter");
|
||||||
|
let _ = writeln!(
|
||||||
|
out,
|
||||||
|
"telemt_me_kdf_port_only_drift_total {}",
|
||||||
|
if me_allows_debug {
|
||||||
|
stats.get_me_kdf_port_only_drift_total()
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
let _ = writeln!(
|
let _ = writeln!(
|
||||||
out,
|
out,
|
||||||
"# HELP telemt_me_hardswap_pending_reuse_total Hardswap cycles that reused an existing pending generation"
|
"# HELP telemt_me_hardswap_pending_reuse_total Hardswap cycles that reused an existing pending generation"
|
||||||
|
|
@ -587,6 +602,24 @@ async fn render_metrics(stats: &Stats, config: &ProxyConfig, ip_tracker: &UserIp
|
||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let _ = writeln!(
|
||||||
|
out,
|
||||||
|
"# HELP telemt_me_single_endpoint_shadow_rotate_skipped_quarantine_total Shadow rotations skipped because endpoint is quarantined"
|
||||||
|
);
|
||||||
|
let _ = writeln!(
|
||||||
|
out,
|
||||||
|
"# TYPE telemt_me_single_endpoint_shadow_rotate_skipped_quarantine_total counter"
|
||||||
|
);
|
||||||
|
let _ = writeln!(
|
||||||
|
out,
|
||||||
|
"telemt_me_single_endpoint_shadow_rotate_skipped_quarantine_total {}",
|
||||||
|
if me_allows_normal {
|
||||||
|
stats.get_me_single_endpoint_shadow_rotate_skipped_quarantine_total()
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
let _ = writeln!(out, "# HELP telemt_secure_padding_invalid_total Invalid secure frame lengths");
|
let _ = writeln!(out, "# HELP telemt_secure_padding_invalid_total Invalid secure frame lengths");
|
||||||
let _ = writeln!(out, "# TYPE telemt_secure_padding_invalid_total counter");
|
let _ = writeln!(out, "# TYPE telemt_secure_padding_invalid_total counter");
|
||||||
let _ = writeln!(
|
let _ = writeln!(
|
||||||
|
|
@ -679,7 +712,7 @@ async fn render_metrics(stats: &Stats, config: &ProxyConfig, ip_tracker: &UserIp
|
||||||
let _ = writeln!(
|
let _ = writeln!(
|
||||||
out,
|
out,
|
||||||
"telemt_pool_swap_total {}",
|
"telemt_pool_swap_total {}",
|
||||||
if me_allows_debug {
|
if me_allows_normal {
|
||||||
stats.get_pool_swap_total()
|
stats.get_pool_swap_total()
|
||||||
} else {
|
} else {
|
||||||
0
|
0
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,7 @@ pub struct Stats {
|
||||||
me_seq_mismatch: AtomicU64,
|
me_seq_mismatch: AtomicU64,
|
||||||
me_endpoint_quarantine_total: AtomicU64,
|
me_endpoint_quarantine_total: AtomicU64,
|
||||||
me_kdf_drift_total: AtomicU64,
|
me_kdf_drift_total: AtomicU64,
|
||||||
|
me_kdf_port_only_drift_total: AtomicU64,
|
||||||
me_hardswap_pending_reuse_total: AtomicU64,
|
me_hardswap_pending_reuse_total: AtomicU64,
|
||||||
me_hardswap_pending_ttl_expired_total: AtomicU64,
|
me_hardswap_pending_ttl_expired_total: AtomicU64,
|
||||||
me_single_endpoint_outage_enter_total: AtomicU64,
|
me_single_endpoint_outage_enter_total: AtomicU64,
|
||||||
|
|
@ -46,6 +47,7 @@ pub struct Stats {
|
||||||
me_single_endpoint_outage_reconnect_success_total: AtomicU64,
|
me_single_endpoint_outage_reconnect_success_total: AtomicU64,
|
||||||
me_single_endpoint_quarantine_bypass_total: AtomicU64,
|
me_single_endpoint_quarantine_bypass_total: AtomicU64,
|
||||||
me_single_endpoint_shadow_rotate_total: AtomicU64,
|
me_single_endpoint_shadow_rotate_total: AtomicU64,
|
||||||
|
me_single_endpoint_shadow_rotate_skipped_quarantine_total: AtomicU64,
|
||||||
me_handshake_error_codes: DashMap<i32, AtomicU64>,
|
me_handshake_error_codes: DashMap<i32, AtomicU64>,
|
||||||
me_route_drop_no_conn: AtomicU64,
|
me_route_drop_no_conn: AtomicU64,
|
||||||
me_route_drop_channel_closed: AtomicU64,
|
me_route_drop_channel_closed: AtomicU64,
|
||||||
|
|
@ -290,7 +292,7 @@ impl Stats {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
pub fn increment_pool_swap_total(&self) {
|
pub fn increment_pool_swap_total(&self) {
|
||||||
if self.telemetry_me_allows_debug() {
|
if self.telemetry_me_allows_normal() {
|
||||||
self.pool_swap_total.fetch_add(1, Ordering::Relaxed);
|
self.pool_swap_total.fetch_add(1, Ordering::Relaxed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -377,6 +379,12 @@ impl Stats {
|
||||||
self.me_kdf_drift_total.fetch_add(1, Ordering::Relaxed);
|
self.me_kdf_drift_total.fetch_add(1, Ordering::Relaxed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
pub fn increment_me_kdf_port_only_drift_total(&self) {
|
||||||
|
if self.telemetry_me_allows_debug() {
|
||||||
|
self.me_kdf_port_only_drift_total
|
||||||
|
.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
pub fn increment_me_hardswap_pending_reuse_total(&self) {
|
pub fn increment_me_hardswap_pending_reuse_total(&self) {
|
||||||
if self.telemetry_me_allows_debug() {
|
if self.telemetry_me_allows_debug() {
|
||||||
self.me_hardswap_pending_reuse_total
|
self.me_hardswap_pending_reuse_total
|
||||||
|
|
@ -425,6 +433,12 @@ impl Stats {
|
||||||
.fetch_add(1, Ordering::Relaxed);
|
.fetch_add(1, Ordering::Relaxed);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
pub fn increment_me_single_endpoint_shadow_rotate_skipped_quarantine_total(&self) {
|
||||||
|
if self.telemetry_me_allows_normal() {
|
||||||
|
self.me_single_endpoint_shadow_rotate_skipped_quarantine_total
|
||||||
|
.fetch_add(1, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
pub fn get_connects_all(&self) -> u64 { self.connects_all.load(Ordering::Relaxed) }
|
pub fn get_connects_all(&self) -> u64 { self.connects_all.load(Ordering::Relaxed) }
|
||||||
pub fn get_connects_bad(&self) -> u64 { self.connects_bad.load(Ordering::Relaxed) }
|
pub fn get_connects_bad(&self) -> u64 { self.connects_bad.load(Ordering::Relaxed) }
|
||||||
pub fn get_me_keepalive_sent(&self) -> u64 { self.me_keepalive_sent.load(Ordering::Relaxed) }
|
pub fn get_me_keepalive_sent(&self) -> u64 { self.me_keepalive_sent.load(Ordering::Relaxed) }
|
||||||
|
|
@ -447,6 +461,9 @@ impl Stats {
|
||||||
pub fn get_me_kdf_drift_total(&self) -> u64 {
|
pub fn get_me_kdf_drift_total(&self) -> u64 {
|
||||||
self.me_kdf_drift_total.load(Ordering::Relaxed)
|
self.me_kdf_drift_total.load(Ordering::Relaxed)
|
||||||
}
|
}
|
||||||
|
pub fn get_me_kdf_port_only_drift_total(&self) -> u64 {
|
||||||
|
self.me_kdf_port_only_drift_total.load(Ordering::Relaxed)
|
||||||
|
}
|
||||||
pub fn get_me_hardswap_pending_reuse_total(&self) -> u64 {
|
pub fn get_me_hardswap_pending_reuse_total(&self) -> u64 {
|
||||||
self.me_hardswap_pending_reuse_total
|
self.me_hardswap_pending_reuse_total
|
||||||
.load(Ordering::Relaxed)
|
.load(Ordering::Relaxed)
|
||||||
|
|
@ -479,6 +496,10 @@ impl Stats {
|
||||||
self.me_single_endpoint_shadow_rotate_total
|
self.me_single_endpoint_shadow_rotate_total
|
||||||
.load(Ordering::Relaxed)
|
.load(Ordering::Relaxed)
|
||||||
}
|
}
|
||||||
|
pub fn get_me_single_endpoint_shadow_rotate_skipped_quarantine_total(&self) -> u64 {
|
||||||
|
self.me_single_endpoint_shadow_rotate_skipped_quarantine_total
|
||||||
|
.load(Ordering::Relaxed)
|
||||||
|
}
|
||||||
pub fn get_me_handshake_error_code_counts(&self) -> Vec<(i32, u64)> {
|
pub fn get_me_handshake_error_code_counts(&self) -> Vec<(i32, u64)> {
|
||||||
let mut out: Vec<(i32, u64)> = self
|
let mut out: Vec<(i32, u64)> = self
|
||||||
.me_handshake_error_codes
|
.me_handshake_error_codes
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,22 @@ use super::MePool;
|
||||||
|
|
||||||
const ME_KDF_DRIFT_STRICT: bool = false;
|
const ME_KDF_DRIFT_STRICT: bool = false;
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
|
||||||
|
enum KdfClientPortSource {
|
||||||
|
LocalSocket = 0,
|
||||||
|
SocksBound = 1,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KdfClientPortSource {
|
||||||
|
fn from_socks_bound_port(socks_bound_port: Option<u16>) -> Self {
|
||||||
|
if socks_bound_port.is_some() {
|
||||||
|
Self::SocksBound
|
||||||
|
} else {
|
||||||
|
Self::LocalSocket
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Result of a successful ME handshake with timings.
|
/// Result of a successful ME handshake with timings.
|
||||||
pub(crate) struct HandshakeOutput {
|
pub(crate) struct HandshakeOutput {
|
||||||
pub rd: ReadHalf<TcpStream>,
|
pub rd: ReadHalf<TcpStream>,
|
||||||
|
|
@ -52,18 +68,18 @@ pub(crate) struct HandshakeOutput {
|
||||||
|
|
||||||
impl MePool {
|
impl MePool {
|
||||||
fn kdf_material_fingerprint(
|
fn kdf_material_fingerprint(
|
||||||
local_addr_nat: SocketAddr,
|
local_ip_nat: IpAddr,
|
||||||
peer_addr_nat: SocketAddr,
|
peer_addr_nat: SocketAddr,
|
||||||
client_port_for_kdf: u16,
|
reflected_ip: Option<IpAddr>,
|
||||||
reflected: Option<SocketAddr>,
|
socks_bound_ip: Option<IpAddr>,
|
||||||
socks_bound_addr: Option<SocketAddr>,
|
client_port_source: KdfClientPortSource,
|
||||||
) -> u64 {
|
) -> u64 {
|
||||||
let mut hasher = DefaultHasher::new();
|
let mut hasher = DefaultHasher::new();
|
||||||
local_addr_nat.hash(&mut hasher);
|
local_ip_nat.hash(&mut hasher);
|
||||||
peer_addr_nat.hash(&mut hasher);
|
peer_addr_nat.hash(&mut hasher);
|
||||||
client_port_for_kdf.hash(&mut hasher);
|
reflected_ip.hash(&mut hasher);
|
||||||
reflected.hash(&mut hasher);
|
socks_bound_ip.hash(&mut hasher);
|
||||||
socks_bound_addr.hash(&mut hasher);
|
client_port_source.hash(&mut hasher);
|
||||||
hasher.finish()
|
hasher.finish()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -359,35 +375,48 @@ impl MePool {
|
||||||
|
|
||||||
let ts_bytes = crypto_ts.to_le_bytes();
|
let ts_bytes = crypto_ts.to_le_bytes();
|
||||||
let server_port_bytes = peer_addr_nat.port().to_le_bytes();
|
let server_port_bytes = peer_addr_nat.port().to_le_bytes();
|
||||||
let client_port_for_kdf = socks_bound_addr
|
let socks_bound_port = socks_bound_addr
|
||||||
.map(|bound| bound.port())
|
.map(|bound| bound.port())
|
||||||
.filter(|port| *port != 0)
|
.filter(|port| *port != 0);
|
||||||
.unwrap_or(local_addr_nat.port());
|
let client_port_for_kdf = socks_bound_port.unwrap_or(local_addr_nat.port());
|
||||||
|
let client_port_source = KdfClientPortSource::from_socks_bound_port(socks_bound_port);
|
||||||
let kdf_fingerprint = Self::kdf_material_fingerprint(
|
let kdf_fingerprint = Self::kdf_material_fingerprint(
|
||||||
local_addr_nat,
|
local_addr_nat.ip(),
|
||||||
peer_addr_nat,
|
peer_addr_nat,
|
||||||
client_port_for_kdf,
|
reflected.map(|value| value.ip()),
|
||||||
reflected,
|
socks_bound_addr.map(|value| value.ip()),
|
||||||
socks_bound_addr,
|
client_port_source,
|
||||||
);
|
);
|
||||||
let mut kdf_fingerprint_guard = self.kdf_material_fingerprint.lock().await;
|
let mut kdf_fingerprint_guard = self.kdf_material_fingerprint.lock().await;
|
||||||
if let Some(prev_fingerprint) = kdf_fingerprint_guard.get(&peer_addr_nat).copied()
|
if let Some((prev_fingerprint, prev_client_port)) =
|
||||||
&& prev_fingerprint != kdf_fingerprint
|
kdf_fingerprint_guard.get(&peer_addr_nat).copied()
|
||||||
{
|
{
|
||||||
|
if prev_fingerprint != kdf_fingerprint {
|
||||||
self.stats.increment_me_kdf_drift_total();
|
self.stats.increment_me_kdf_drift_total();
|
||||||
warn!(
|
warn!(
|
||||||
%peer_addr_nat,
|
%peer_addr_nat,
|
||||||
%local_addr_nat,
|
%local_addr_nat,
|
||||||
client_port_for_kdf,
|
client_port_for_kdf,
|
||||||
"ME KDF input drift detected for endpoint"
|
client_port_source = ?client_port_source,
|
||||||
|
"ME KDF material drift detected for endpoint"
|
||||||
);
|
);
|
||||||
if ME_KDF_DRIFT_STRICT {
|
if ME_KDF_DRIFT_STRICT {
|
||||||
return Err(ProxyError::InvalidHandshake(
|
return Err(ProxyError::InvalidHandshake(
|
||||||
"ME KDF input drift detected (strict mode)".to_string(),
|
"ME KDF material drift detected (strict mode)".to_string(),
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
} else if prev_client_port != client_port_for_kdf {
|
||||||
|
self.stats.increment_me_kdf_port_only_drift_total();
|
||||||
|
debug!(
|
||||||
|
%peer_addr_nat,
|
||||||
|
previous_client_port_for_kdf = prev_client_port,
|
||||||
|
client_port_for_kdf,
|
||||||
|
client_port_source = ?client_port_source,
|
||||||
|
"ME KDF client port changed with stable material"
|
||||||
|
);
|
||||||
}
|
}
|
||||||
kdf_fingerprint_guard.insert(peer_addr_nat, kdf_fingerprint);
|
}
|
||||||
|
kdf_fingerprint_guard.insert(peer_addr_nat, (kdf_fingerprint, client_port_for_kdf));
|
||||||
drop(kdf_fingerprint_guard);
|
drop(kdf_fingerprint_guard);
|
||||||
|
|
||||||
let client_port_bytes = client_port_for_kdf.to_le_bytes();
|
let client_port_bytes = client_port_for_kdf.to_le_bytes();
|
||||||
|
|
|
||||||
|
|
@ -395,6 +395,19 @@ async fn maybe_rotate_single_endpoint_shadow(
|
||||||
}
|
}
|
||||||
|
|
||||||
let endpoint = endpoints[0];
|
let endpoint = endpoints[0];
|
||||||
|
if pool.is_endpoint_quarantined(endpoint).await {
|
||||||
|
pool.stats
|
||||||
|
.increment_me_single_endpoint_shadow_rotate_skipped_quarantine_total();
|
||||||
|
shadow_rotate_deadline.insert(key, now + Duration::from_secs(SHADOW_ROTATE_RETRY_SECS));
|
||||||
|
debug!(
|
||||||
|
dc = %dc,
|
||||||
|
?family,
|
||||||
|
%endpoint,
|
||||||
|
"Single-endpoint shadow rotation skipped: endpoint is quarantined"
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
let Some(writer_ids) = live_writer_ids_by_addr.get(&endpoint) else {
|
let Some(writer_ids) = live_writer_ids_by_addr.get(&endpoint) else {
|
||||||
shadow_rotate_deadline.insert(key, now + Duration::from_secs(SHADOW_ROTATE_RETRY_SECS));
|
shadow_rotate_deadline.insert(key, now + Duration::from_secs(SHADOW_ROTATE_RETRY_SECS));
|
||||||
return;
|
return;
|
||||||
|
|
|
||||||
|
|
@ -127,7 +127,7 @@ pub struct MePool {
|
||||||
pub(super) pending_hardswap_map_hash: AtomicU64,
|
pub(super) pending_hardswap_map_hash: AtomicU64,
|
||||||
pub(super) hardswap: AtomicBool,
|
pub(super) hardswap: AtomicBool,
|
||||||
pub(super) endpoint_quarantine: Arc<Mutex<HashMap<SocketAddr, Instant>>>,
|
pub(super) endpoint_quarantine: Arc<Mutex<HashMap<SocketAddr, Instant>>>,
|
||||||
pub(super) kdf_material_fingerprint: Arc<Mutex<HashMap<SocketAddr, u64>>>,
|
pub(super) kdf_material_fingerprint: Arc<Mutex<HashMap<SocketAddr, (u64, u16)>>>,
|
||||||
pub(super) me_pool_drain_ttl_secs: AtomicU64,
|
pub(super) me_pool_drain_ttl_secs: AtomicU64,
|
||||||
pub(super) me_pool_force_close_secs: AtomicU64,
|
pub(super) me_pool_force_close_secs: AtomicU64,
|
||||||
pub(super) me_pool_min_fresh_ratio_permille: AtomicU32,
|
pub(super) me_pool_min_fresh_ratio_permille: AtomicU32,
|
||||||
|
|
|
||||||
|
|
@ -37,7 +37,7 @@ impl MePool {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn is_endpoint_quarantined(&self, addr: SocketAddr) -> bool {
|
pub(super) async fn is_endpoint_quarantined(&self, addr: SocketAddr) -> bool {
|
||||||
let mut guard = self.endpoint_quarantine.lock().await;
|
let mut guard = self.endpoint_quarantine.lock().await;
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
guard.retain(|_, expiry| *expiry > now);
|
guard.retain(|_, expiry| *expiry > now);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue