Compare commits

...

6 Commits

Author SHA1 Message Date
Alexey
37b6f7b985 Weighted Fairness + 3-Leveled Pressure Model
Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com>
2026-04-18 00:37:04 +03:00
Alexey
50e9e5cf32 Active Ring and DRR Hardening
Co-Authored-By: brekotis <93345790+brekotis@users.noreply.github.com>
2026-04-18 00:34:35 +03:00
Alexey
1b25bada29 ServerHello fixes + Docker Health-Check + Conntrack Control for Docker: merge pull request #717 from telemt/flow
ServerHello fixes + Docker Health-Check + Conntrack Control for Docker
2026-04-17 19:43:59 +03:00
Alexey
bde30eaf05 Update emulator.rs 2026-04-17 19:20:06 +03:00
Alexey
b447f60a72 Rustfmt + Bump 2026-04-17 19:08:57 +03:00
Alexey
093faed0c2 Conntrack Control for Docker 2026-04-17 19:06:18 +03:00
16 changed files with 653 additions and 195 deletions

2
Cargo.lock generated
View File

@@ -2780,7 +2780,7 @@ checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417"
[[package]]
name = "telemt"
version = "3.4.2"
version = "3.4.3"
dependencies = [
"aes",
"anyhow",

View File

@@ -1,6 +1,6 @@
[package]
name = "telemt"
version = "3.4.2"
version = "3.4.3"
edition = "2024"
[features]

View File

@@ -82,6 +82,32 @@ HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 CMD ["/ap
ENTRYPOINT ["/app/telemt"]
CMD ["config.toml"]
# ==========================
# Production Netfilter Profile
# ==========================
FROM debian:12-slim AS prod-netfilter
RUN set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends \
ca-certificates \
conntrack \
nftables \
iptables; \
rm -rf /var/lib/apt/lists/*
WORKDIR /app
COPY --from=minimal /telemt /app/telemt
COPY config.toml /app/config.toml
EXPOSE 443 9090 9091
HEALTHCHECK --interval=30s --timeout=5s --start-period=20s --retries=3 CMD ["/app/telemt", "healthcheck", "/app/config.toml", "--mode", "liveness"]
ENTRYPOINT ["/app/telemt"]
CMD ["config.toml"]
# ==========================
# Production Distroless on MUSL
# ==========================

View File

@@ -0,0 +1,10 @@
services:
telemt:
build:
context: .
target: prod-netfilter
network_mode: host
ports: []
cap_add:
- NET_BIND_SERVICE
- NET_ADMIN

View File

@@ -0,0 +1,8 @@
services:
telemt:
build:
context: .
target: prod-netfilter
cap_add:
- NET_BIND_SERVICE
- NET_ADMIN

View File

@@ -1,7 +1,9 @@
services:
telemt:
image: ghcr.io/telemt/telemt:latest
build: .
build:
context: .
target: prod
container_name: telemt
restart: unless-stopped
ports:
@@ -28,7 +30,6 @@ services:
- ALL
cap_add:
- NET_BIND_SERVICE
- NET_ADMIN
read_only: true
security_opt:
- no-new-privileges:true

View File

@@ -41,8 +41,8 @@ use config_store::{current_revision, load_config_from_disk, parse_if_match};
use events::ApiEventStore;
use http_utils::{error_response, read_json, read_optional_json, success_response};
use model::{
ApiFailure, CreateUserRequest, DeleteUserResponse, HealthData, HealthReadyData, PatchUserRequest,
RotateSecretRequest, SummaryData, UserActiveIps,
ApiFailure, CreateUserRequest, DeleteUserResponse, HealthData, HealthReadyData,
PatchUserRequest, RotateSecretRequest, SummaryData, UserActiveIps,
};
use runtime_edge::{
EdgeConnectionsCacheEntry, build_runtime_connections_summary_data,

View File

@@ -343,6 +343,10 @@ impl ProxyConfig {
let network_table = parsed_toml
.get("network")
.and_then(|value| value.as_table());
let server_table = parsed_toml.get("server").and_then(|value| value.as_table());
let conntrack_control_table = server_table
.and_then(|table| table.get("conntrack_control"))
.and_then(|value| value.as_table());
let update_every_is_explicit = general_table
.map(|table| table.contains_key("update_every"))
.unwrap_or(false);
@@ -372,10 +376,17 @@ impl ProxyConfig {
let stun_servers_is_explicit = network_table
.map(|table| table.contains_key("stun_servers"))
.unwrap_or(false);
let inline_conntrack_control_is_explicit = conntrack_control_table
.map(|table| table.contains_key("inline_conntrack_control"))
.unwrap_or(false);
let mut config: ProxyConfig = parsed_toml
.try_into()
.map_err(|e| ProxyError::Config(e.to_string()))?;
config
.server
.conntrack_control
.inline_conntrack_control_explicit = inline_conntrack_control_is_explicit;
if !update_every_is_explicit && (legacy_secret_is_explicit || legacy_config_is_explicit) {
config.general.update_every = None;
@@ -1881,6 +1892,43 @@ mod tests {
);
}
#[test]
fn conntrack_inline_explicit_flag_is_false_when_omitted() {
let cfg = load_config_from_temp_toml(
r#"
[general]
[network]
[server]
[server.conntrack_control]
[access]
"#,
);
assert!(
!cfg.server
.conntrack_control
.inline_conntrack_control_explicit
);
}
#[test]
fn conntrack_inline_explicit_flag_is_true_when_present() {
let cfg = load_config_from_temp_toml(
r#"
[general]
[network]
[server]
[server.conntrack_control]
inline_conntrack_control = true
[access]
"#,
);
assert!(
cfg.server
.conntrack_control
.inline_conntrack_control_explicit
);
}
#[test]
fn unknown_sni_action_parses_and_defaults_to_drop() {
let cfg_default: ProxyConfig = toml::from_str(

View File

@@ -1329,6 +1329,10 @@ pub struct ConntrackControlConfig {
#[serde(default = "default_conntrack_control_enabled")]
pub inline_conntrack_control: bool,
/// Tracks whether inline_conntrack_control was explicitly set in config.
#[serde(skip)]
pub inline_conntrack_control_explicit: bool,
/// Conntrack mode for listener ingress traffic.
#[serde(default)]
pub mode: ConntrackMode,
@@ -1363,6 +1367,7 @@ impl Default for ConntrackControlConfig {
fn default() -> Self {
Self {
inline_conntrack_control: default_conntrack_control_enabled(),
inline_conntrack_control_explicit: false,
mode: ConntrackMode::default(),
backend: ConntrackBackend::default(),
profile: ConntrackPressureProfile::default(),

View File

@@ -24,6 +24,13 @@ enum NetfilterBackend {
Iptables,
}
#[derive(Clone, Copy)]
struct ConntrackRuntimeSupport {
netfilter_backend: Option<NetfilterBackend>,
has_cap_net_admin: bool,
has_conntrack_binary: bool,
}
#[derive(Clone, Copy)]
struct PressureSample {
conn_pct: Option<u8>,
@@ -56,11 +63,8 @@ pub(crate) fn spawn_conntrack_controller(
shared: Arc<ProxySharedState>,
) {
if !cfg!(target_os = "linux") {
let enabled = config_rx
.borrow()
.server
.conntrack_control
.inline_conntrack_control;
let cfg = config_rx.borrow();
let enabled = cfg.server.conntrack_control.inline_conntrack_control;
stats.set_conntrack_control_enabled(enabled);
stats.set_conntrack_control_available(false);
stats.set_conntrack_pressure_active(false);
@@ -68,9 +72,14 @@ pub(crate) fn spawn_conntrack_controller(
stats.set_conntrack_rule_apply_ok(false);
shared.disable_conntrack_close_sender();
shared.set_conntrack_pressure_active(false);
if enabled {
if enabled
&& cfg
.server
.conntrack_control
.inline_conntrack_control_explicit
{
warn!(
"conntrack control is configured but unsupported on this OS; disabling runtime worker"
"conntrack control explicitly enabled but unsupported on this OS; disabling runtime worker"
);
}
return;
@@ -92,16 +101,17 @@ async fn run_conntrack_controller(
let mut cfg = config_rx.borrow().clone();
let mut pressure_state = PressureState::new(stats.as_ref());
let mut delete_budget_tokens = cfg.server.conntrack_control.delete_budget_per_sec;
let mut backend = pick_backend(cfg.server.conntrack_control.backend);
let mut runtime_support = probe_runtime_support(cfg.server.conntrack_control.backend);
let mut effective_enabled = effective_conntrack_enabled(&cfg, runtime_support);
apply_runtime_state(
stats.as_ref(),
shared.as_ref(),
&cfg,
backend.is_some(),
runtime_support,
false,
);
reconcile_rules(&cfg, backend, stats.as_ref()).await;
reconcile_rules(&cfg, runtime_support, stats.as_ref()).await;
loop {
tokio::select! {
@@ -110,17 +120,18 @@ async fn run_conntrack_controller(
break;
}
cfg = config_rx.borrow_and_update().clone();
backend = pick_backend(cfg.server.conntrack_control.backend);
runtime_support = probe_runtime_support(cfg.server.conntrack_control.backend);
effective_enabled = effective_conntrack_enabled(&cfg, runtime_support);
delete_budget_tokens = cfg.server.conntrack_control.delete_budget_per_sec;
apply_runtime_state(stats.as_ref(), shared.as_ref(), &cfg, backend.is_some(), pressure_state.active);
reconcile_rules(&cfg, backend, stats.as_ref()).await;
apply_runtime_state(stats.as_ref(), shared.as_ref(), &cfg, runtime_support, pressure_state.active);
reconcile_rules(&cfg, runtime_support, stats.as_ref()).await;
}
event = close_rx.recv() => {
let Some(event) = event else {
break;
};
stats.set_conntrack_event_queue_depth(close_rx.len() as u64);
if !cfg.server.conntrack_control.inline_conntrack_control {
if !effective_enabled {
continue;
}
if !pressure_state.active {
@@ -156,6 +167,7 @@ async fn run_conntrack_controller(
stats.as_ref(),
shared.as_ref(),
&cfg,
effective_enabled,
&sample,
&mut pressure_state,
);
@@ -175,20 +187,30 @@ fn apply_runtime_state(
stats: &Stats,
shared: &ProxySharedState,
cfg: &ProxyConfig,
backend_available: bool,
runtime_support: ConntrackRuntimeSupport,
pressure_active: bool,
) {
let enabled = cfg.server.conntrack_control.inline_conntrack_control;
let available = enabled && backend_available && has_cap_net_admin();
if enabled && !available {
let available = effective_conntrack_enabled(cfg, runtime_support);
if enabled
&& !available
&& cfg
.server
.conntrack_control
.inline_conntrack_control_explicit
{
warn!(
"conntrack control enabled but unavailable (missing CAP_NET_ADMIN or backend binaries)"
has_cap_net_admin = runtime_support.has_cap_net_admin,
backend_available = runtime_support.netfilter_backend.is_some(),
conntrack_binary_available = runtime_support.has_conntrack_binary,
configured_backend = ?cfg.server.conntrack_control.backend,
"conntrack control explicitly enabled but unavailable; disabling runtime features"
);
}
stats.set_conntrack_control_enabled(enabled);
stats.set_conntrack_control_available(available);
shared.set_conntrack_pressure_active(enabled && pressure_active);
stats.set_conntrack_pressure_active(enabled && pressure_active);
shared.set_conntrack_pressure_active(available && pressure_active);
stats.set_conntrack_pressure_active(available && pressure_active);
}
fn collect_pressure_sample(
@@ -228,10 +250,11 @@ fn update_pressure_state(
stats: &Stats,
shared: &ProxySharedState,
cfg: &ProxyConfig,
effective_enabled: bool,
sample: &PressureSample,
state: &mut PressureState,
) {
if !cfg.server.conntrack_control.inline_conntrack_control {
if !effective_enabled {
if state.active {
state.active = false;
state.low_streak = 0;
@@ -285,22 +308,26 @@ fn update_pressure_state(
state.low_streak = 0;
}
async fn reconcile_rules(cfg: &ProxyConfig, backend: Option<NetfilterBackend>, stats: &Stats) {
async fn reconcile_rules(
cfg: &ProxyConfig,
runtime_support: ConntrackRuntimeSupport,
stats: &Stats,
) {
if !cfg.server.conntrack_control.inline_conntrack_control {
clear_notrack_rules_all_backends().await;
stats.set_conntrack_rule_apply_ok(true);
return;
}
if !has_cap_net_admin() {
if !effective_conntrack_enabled(cfg, runtime_support) {
clear_notrack_rules_all_backends().await;
stats.set_conntrack_rule_apply_ok(false);
return;
}
let Some(backend) = backend else {
stats.set_conntrack_rule_apply_ok(false);
return;
};
let backend = runtime_support
.netfilter_backend
.expect("netfilter backend must be available for effective conntrack control");
let apply_result = match backend {
NetfilterBackend::Nftables => apply_nft_rules(cfg).await,
@@ -315,6 +342,24 @@ async fn reconcile_rules(cfg: &ProxyConfig, backend: Option<NetfilterBackend>, s
}
}
fn probe_runtime_support(configured_backend: ConntrackBackend) -> ConntrackRuntimeSupport {
ConntrackRuntimeSupport {
netfilter_backend: pick_backend(configured_backend),
has_cap_net_admin: has_cap_net_admin(),
has_conntrack_binary: command_exists("conntrack"),
}
}
fn effective_conntrack_enabled(
cfg: &ProxyConfig,
runtime_support: ConntrackRuntimeSupport,
) -> bool {
cfg.server.conntrack_control.inline_conntrack_control
&& runtime_support.has_cap_net_admin
&& runtime_support.netfilter_backend.is_some()
&& runtime_support.has_conntrack_binary
}
fn pick_backend(configured: ConntrackBackend) -> Option<NetfilterBackend> {
match configured {
ConntrackBackend::Auto => {
@@ -710,7 +755,7 @@ mod tests {
me_queue_pressure_delta: 0,
};
update_pressure_state(&stats, shared.as_ref(), &cfg, &sample, &mut state);
update_pressure_state(&stats, shared.as_ref(), &cfg, true, &sample, &mut state);
assert!(state.active);
assert!(shared.conntrack_pressure_active());
@@ -731,7 +776,14 @@ mod tests {
accept_timeout_delta: 0,
me_queue_pressure_delta: 0,
};
update_pressure_state(&stats, shared.as_ref(), &cfg, &high_sample, &mut state);
update_pressure_state(
&stats,
shared.as_ref(),
&cfg,
true,
&high_sample,
&mut state,
);
assert!(state.active);
let low_sample = PressureSample {
@@ -740,11 +792,11 @@ mod tests {
accept_timeout_delta: 0,
me_queue_pressure_delta: 0,
};
update_pressure_state(&stats, shared.as_ref(), &cfg, &low_sample, &mut state);
update_pressure_state(&stats, shared.as_ref(), &cfg, true, &low_sample, &mut state);
assert!(state.active);
update_pressure_state(&stats, shared.as_ref(), &cfg, &low_sample, &mut state);
update_pressure_state(&stats, shared.as_ref(), &cfg, true, &low_sample, &mut state);
assert!(state.active);
update_pressure_state(&stats, shared.as_ref(), &cfg, &low_sample, &mut state);
update_pressure_state(&stats, shared.as_ref(), &cfg, true, &low_sample, &mut state);
assert!(!state.active);
assert!(!shared.conntrack_pressure_active());
@@ -765,7 +817,7 @@ mod tests {
me_queue_pressure_delta: 10,
};
update_pressure_state(&stats, shared.as_ref(), &cfg, &sample, &mut state);
update_pressure_state(&stats, shared.as_ref(), &cfg, false, &sample, &mut state);
assert!(!state.active);
assert!(!shared.conntrack_pressure_active());

View File

@@ -111,7 +111,10 @@ fn probe_target(listen: SocketAddr) -> SocketAddr {
}
fn build_request(target: SocketAddr, path: &str, auth_header: &str) -> String {
let mut request = format!("GET {path} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n", target);
let mut request = format!(
"GET {path} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n",
target
);
if !auth_header.is_empty() {
request.push_str("Authorization: ");
request.push_str(auth_header);

View File

@@ -253,7 +253,18 @@ pub fn build_emulated_server_hello(
}
// --- ApplicationData (fake encrypted records) ---
let mut sizes = jitter_and_clamp_sizes(&emulated_app_data_sizes(cached), rng);
let mut sizes = {
let base_sizes = emulated_app_data_sizes(cached);
match cached.behavior_profile.source {
TlsProfileSource::Raw | TlsProfileSource::Merged => base_sizes
.into_iter()
.map(|size| size.clamp(MIN_APP_DATA, MAX_APP_DATA))
.collect(),
TlsProfileSource::Default | TlsProfileSource::Rustls => {
jitter_and_clamp_sizes(&base_sizes, rng)
}
}
};
let compact_payload = cached
.cert_info
.as_ref()

View File

@@ -77,11 +77,12 @@ pub(crate) struct FlowFairnessState {
pub(crate) standing_state: StandingQueueState,
pub(crate) scheduler_state: FlowSchedulerState,
pub(crate) bucket_id: usize,
pub(crate) weight_quanta: u8,
pub(crate) in_active_ring: bool,
}
impl FlowFairnessState {
pub(crate) fn new(flow_id: u64, worker_id: u16, bucket_id: usize) -> Self {
pub(crate) fn new(flow_id: u64, worker_id: u16, bucket_id: usize, weight_quanta: u8) -> Self {
Self {
_flow_id: flow_id,
_worker_id: worker_id,
@@ -97,6 +98,7 @@ impl FlowFairnessState {
standing_state: StandingQueueState::Transient,
scheduler_state: FlowSchedulerState::Idle,
bucket_id,
weight_quanta: weight_quanta.max(1),
in_active_ring: false,
}
}

View File

@@ -146,59 +146,55 @@ impl PressureEvaluator {
((signals.standing_flows.saturating_mul(100)) / signals.active_flows).min(100) as u8
};
let mut pressure_score = 0u8;
let mut pressured = false;
let mut saturated = false;
let queue_saturated_pct = cfg.queue_ratio_shedding_pct.min(cfg.queue_ratio_saturated_pct);
if queue_ratio_pct >= cfg.queue_ratio_pressured_pct {
pressure_score = pressure_score.max(1);
pressured = true;
}
if queue_ratio_pct >= cfg.queue_ratio_shedding_pct {
pressure_score = pressure_score.max(2);
}
if queue_ratio_pct >= cfg.queue_ratio_saturated_pct {
pressure_score = pressure_score.max(3);
if queue_ratio_pct >= queue_saturated_pct {
saturated = true;
}
let standing_saturated_pct = cfg
.standing_ratio_shedding_pct
.min(cfg.standing_ratio_saturated_pct);
if standing_ratio_pct >= cfg.standing_ratio_pressured_pct {
pressure_score = pressure_score.max(1);
pressured = true;
}
if standing_ratio_pct >= cfg.standing_ratio_shedding_pct {
pressure_score = pressure_score.max(2);
}
if standing_ratio_pct >= cfg.standing_ratio_saturated_pct {
pressure_score = pressure_score.max(3);
if standing_ratio_pct >= standing_saturated_pct {
saturated = true;
}
let rejects_saturated = cfg.rejects_shedding.min(cfg.rejects_saturated);
if self.admission_rejects_window >= cfg.rejects_pressured {
pressure_score = pressure_score.max(1);
pressured = true;
}
if self.admission_rejects_window >= cfg.rejects_shedding {
pressure_score = pressure_score.max(2);
}
if self.admission_rejects_window >= cfg.rejects_saturated {
pressure_score = pressure_score.max(3);
if self.admission_rejects_window >= rejects_saturated {
saturated = true;
}
let stalls_saturated = cfg.stalls_shedding.min(cfg.stalls_saturated);
if self.route_stalls_window >= cfg.stalls_pressured {
pressure_score = pressure_score.max(1);
pressured = true;
}
if self.route_stalls_window >= cfg.stalls_shedding {
pressure_score = pressure_score.max(2);
}
if self.route_stalls_window >= cfg.stalls_saturated {
pressure_score = pressure_score.max(3);
if self.route_stalls_window >= stalls_saturated {
saturated = true;
}
if signals.backpressured_flows > signals.active_flows.saturating_div(2)
&& signals.active_flows > 0
{
pressure_score = pressure_score.max(2);
pressured = true;
}
match pressure_score {
0 => PressureState::Normal,
1 => PressureState::Pressured,
2 => PressureState::Shedding,
_ => PressureState::Saturated,
if saturated {
PressureState::Saturated
} else if pressured {
PressureState::Pressured
} else {
PressureState::Normal
}
}

View File

@@ -1,7 +1,8 @@
use std::collections::{HashMap, VecDeque};
use std::collections::{HashMap, HashSet, VecDeque};
use std::time::{Duration, Instant};
use bytes::Bytes;
use crate::protocol::constants::RPC_FLAG_QUICKACK;
use super::model::{
AdmissionDecision, DispatchAction, DispatchCandidate, DispatchFeedback, FlowFairnessState,
@@ -26,6 +27,8 @@ pub(crate) struct WorkerFairnessConfig {
pub(crate) max_consecutive_stalls_before_close: u8,
pub(crate) soft_bucket_count: usize,
pub(crate) soft_bucket_share_pct: u8,
pub(crate) default_flow_weight: u8,
pub(crate) quickack_flow_weight: u8,
pub(crate) pressure: PressureConfig,
}
@@ -46,6 +49,8 @@ impl Default for WorkerFairnessConfig {
max_consecutive_stalls_before_close: 16,
soft_bucket_count: 64,
soft_bucket_share_pct: 25,
default_flow_weight: 1,
quickack_flow_weight: 4,
pressure: PressureConfig::default(),
}
}
@@ -57,9 +62,9 @@ struct FlowEntry {
}
impl FlowEntry {
fn new(flow_id: u64, worker_id: u16, bucket_id: usize) -> Self {
fn new(flow_id: u64, worker_id: u16, bucket_id: usize, weight_quanta: u8) -> Self {
Self {
fairness: FlowFairnessState::new(flow_id, worker_id, bucket_id),
fairness: FlowFairnessState::new(flow_id, worker_id, bucket_id, weight_quanta),
queue: VecDeque::new(),
}
}
@@ -86,6 +91,7 @@ pub(crate) struct WorkerFairnessState {
pressure: PressureEvaluator,
flows: HashMap<u64, FlowEntry>,
active_ring: VecDeque<u64>,
active_ring_members: HashSet<u64>,
total_queued_bytes: u64,
bucket_queued_bytes: Vec<u64>,
bucket_active_flows: Vec<usize>,
@@ -108,6 +114,7 @@ impl WorkerFairnessState {
pressure: PressureEvaluator::new(now),
flows: HashMap::new(),
active_ring: VecDeque::new(),
active_ring_members: HashSet::new(),
total_queued_bytes: 0,
bucket_queued_bytes: vec![0; bucket_count],
bucket_active_flows: vec![0; bucket_count],
@@ -184,6 +191,7 @@ impl WorkerFairnessState {
}
let bucket_id = self.bucket_for(conn_id);
let frame_weight = Self::weight_for_flags(&self.config, flags);
let bucket_cap = self
.config
.max_total_queued_bytes
@@ -205,12 +213,13 @@ impl WorkerFairnessState {
self.bucket_active_flows[bucket_id].saturating_add(1);
self.flows.insert(
conn_id,
FlowEntry::new(conn_id, self.config.worker_id, bucket_id),
FlowEntry::new(conn_id, self.config.worker_id, bucket_id, frame_weight),
);
self.flows
.get_mut(&conn_id)
.expect("flow inserted must be retrievable")
};
entry.fairness.weight_quanta = entry.fairness.weight_quanta.max(frame_weight);
if entry.fairness.pending_bytes.saturating_add(frame_bytes)
> self.config.max_flow_queued_bytes
@@ -242,11 +251,24 @@ impl WorkerFairnessState {
self.bucket_queued_bytes[bucket_id] =
self.bucket_queued_bytes[bucket_id].saturating_add(frame_bytes);
let mut enqueue_active = false;
if !entry.fairness.in_active_ring {
entry.fairness.in_active_ring = true;
self.active_ring.push_back(conn_id);
enqueue_active = true;
}
let pressure_state = self.pressure.state();
let (before_membership, after_membership) = {
let before = Self::flow_membership(&entry.fairness);
Self::classify_flow(&self.config, pressure_state, now, &mut entry.fairness);
let after = Self::flow_membership(&entry.fairness);
(before, after)
};
if enqueue_active {
self.enqueue_active_conn(conn_id);
}
self.apply_flow_membership_delta(before_membership, after_membership);
self.evaluate_pressure(now, true);
AdmissionDecision::Admit
}
@@ -260,32 +282,44 @@ impl WorkerFairnessState {
let Some(conn_id) = self.active_ring.pop_front() else {
break;
};
if !self.active_ring_members.remove(&conn_id) {
continue;
}
let mut candidate = None;
let mut requeue_active = false;
let mut drained_bytes = 0u64;
let mut bucket_id = 0usize;
let mut should_continue = false;
let mut enqueue_active = false;
let mut membership_delta = None;
let pressure_state = self.pressure.state();
if let Some(flow) = self.flows.get_mut(&conn_id) {
bucket_id = flow.fairness.bucket_id;
flow.fairness.in_active_ring = false;
let before_membership = Self::flow_membership(&flow.fairness);
if flow.queue.is_empty() {
flow.fairness.in_active_ring = false;
flow.fairness.scheduler_state = FlowSchedulerState::Idle;
flow.fairness.pending_bytes = 0;
flow.fairness.deficit_bytes = 0;
flow.fairness.queue_started_at = None;
continue;
}
should_continue = true;
} else {
Self::classify_flow(&self.config, pressure_state, now, &mut flow.fairness);
let quantum =
Self::effective_quantum_bytes(&self.config, pressure_state, &flow.fairness);
let quantum = Self::effective_quantum_bytes(
&self.config,
pressure_state,
&flow.fairness,
);
flow.fairness.deficit_bytes = flow
.fairness
.deficit_bytes
.saturating_add(i64::from(quantum));
Self::clamp_deficit_bytes(&self.config, &mut flow.fairness);
self.deficit_grants = self.deficit_grants.saturating_add(1);
let front_len = flow.queue.front().map_or(0, |front| front.queued_bytes());
@@ -294,6 +328,8 @@ impl WorkerFairnessState {
flow.fairness.consecutive_skips.saturating_add(1);
self.deficit_skips = self.deficit_skips.saturating_add(1);
requeue_active = true;
flow.fairness.in_active_ring = true;
enqueue_active = true;
} else if let Some(frame) = flow.queue.pop_front() {
drained_bytes = frame.queued_bytes();
flow.fairness.pending_bytes =
@@ -302,6 +338,7 @@ impl WorkerFairnessState {
.fairness
.deficit_bytes
.saturating_sub(drained_bytes as i64);
Self::clamp_deficit_bytes(&self.config, &mut flow.fairness);
flow.fairness.consecutive_skips = 0;
flow.fairness.queue_started_at =
flow.queue.front().map(|front| front.enqueued_at);
@@ -309,6 +346,10 @@ impl WorkerFairnessState {
if !requeue_active {
flow.fairness.scheduler_state = FlowSchedulerState::Idle;
flow.fairness.in_active_ring = false;
flow.fairness.deficit_bytes = 0;
} else {
flow.fairness.in_active_ring = true;
enqueue_active = true;
}
candidate = Some(DispatchCandidate {
pressure_state,
@@ -318,17 +359,25 @@ impl WorkerFairnessState {
}
}
membership_delta = Some((before_membership, Self::flow_membership(&flow.fairness)));
}
if let Some((before_membership, after_membership)) = membership_delta {
self.apply_flow_membership_delta(before_membership, after_membership);
}
if should_continue {
continue;
}
if drained_bytes > 0 {
self.total_queued_bytes = self.total_queued_bytes.saturating_sub(drained_bytes);
self.bucket_queued_bytes[bucket_id] =
self.bucket_queued_bytes[bucket_id].saturating_sub(drained_bytes);
}
if requeue_active {
if let Some(flow) = self.flows.get_mut(&conn_id) {
flow.fairness.in_active_ring = true;
}
self.active_ring.push_back(conn_id);
if requeue_active && enqueue_active {
self.enqueue_active_conn(conn_id);
}
if let Some(candidate) = candidate {
@@ -348,7 +397,9 @@ impl WorkerFairnessState {
) -> DispatchAction {
match feedback {
DispatchFeedback::Routed => {
let mut membership_delta = None;
if let Some(flow) = self.flows.get_mut(&conn_id) {
let before_membership = Self::flow_membership(&flow.fairness);
flow.fairness.last_drain_at = Some(now);
flow.fairness.recent_drain_bytes = flow
.fairness
@@ -358,6 +409,17 @@ impl WorkerFairnessState {
if flow.fairness.scheduler_state != FlowSchedulerState::Idle {
flow.fairness.scheduler_state = FlowSchedulerState::Active;
}
Self::classify_flow(
&self.config,
self.pressure.state(),
now,
&mut flow.fairness,
);
membership_delta =
Some((before_membership, Self::flow_membership(&flow.fairness)));
}
if let Some((before_membership, after_membership)) = membership_delta {
self.apply_flow_membership_delta(before_membership, after_membership);
}
self.evaluate_pressure(now, false);
DispatchAction::Continue
@@ -365,17 +427,20 @@ impl WorkerFairnessState {
DispatchFeedback::QueueFull => {
self.pressure.note_route_stall(now, &self.config.pressure);
self.downstream_stalls = self.downstream_stalls.saturating_add(1);
let state = self.pressure.state();
let Some(flow) = self.flows.get_mut(&conn_id) else {
self.evaluate_pressure(now, true);
return DispatchAction::Continue;
};
let (before_membership, after_membership, should_close_flow, enqueue_active) = {
let before_membership = Self::flow_membership(&flow.fairness);
let mut enqueue_active = false;
flow.fairness.consecutive_stalls =
flow.fairness.consecutive_stalls.saturating_add(1);
flow.fairness.scheduler_state = FlowSchedulerState::Backpressured;
flow.fairness.pressure_class = FlowPressureClass::Backpressured;
let state = self.pressure.state();
let should_shed_frame = matches!(state, PressureState::Saturated)
|| (matches!(state, PressureState::Shedding)
&& flow.fairness.standing_state == StandingQueueState::Standing
@@ -392,20 +457,35 @@ impl WorkerFairnessState {
flow.fairness.pending_bytes.saturating_add(frame_bytes);
flow.fairness.queue_started_at =
flow.queue.front().map(|front| front.enqueued_at);
self.total_queued_bytes = self.total_queued_bytes.saturating_add(frame_bytes);
self.bucket_queued_bytes[flow.fairness.bucket_id] = self.bucket_queued_bytes
[flow.fairness.bucket_id]
self.total_queued_bytes =
self.total_queued_bytes.saturating_add(frame_bytes);
self.bucket_queued_bytes[flow.fairness.bucket_id] = self
.bucket_queued_bytes[flow.fairness.bucket_id]
.saturating_add(frame_bytes);
if !flow.fairness.in_active_ring {
flow.fairness.in_active_ring = true;
self.active_ring.push_back(conn_id);
enqueue_active = true;
}
}
if flow.fairness.consecutive_stalls
Self::classify_flow(&self.config, state, now, &mut flow.fairness);
let after_membership = Self::flow_membership(&flow.fairness);
let should_close_flow = flow.fairness.consecutive_stalls
>= self.config.max_consecutive_stalls_before_close
&& self.pressure.state() == PressureState::Saturated
{
&& self.pressure.state() == PressureState::Saturated;
(
before_membership,
after_membership,
should_close_flow,
enqueue_active,
)
};
if enqueue_active {
self.enqueue_active_conn(conn_id);
}
self.apply_flow_membership_delta(before_membership, after_membership);
if should_close_flow {
self.remove_flow(conn_id);
self.evaluate_pressure(now, true);
return DispatchAction::CloseFlow;
@@ -426,6 +506,15 @@ impl WorkerFairnessState {
let Some(entry) = self.flows.remove(&conn_id) else {
return;
};
self.active_ring_members.remove(&conn_id);
self.active_ring.retain(|queued_conn_id| *queued_conn_id != conn_id);
let (was_standing, was_backpressured) = Self::flow_membership(&entry.fairness);
if was_standing {
self.standing_flow_count = self.standing_flow_count.saturating_sub(1);
}
if was_backpressured {
self.backpressured_flow_count = self.backpressured_flow_count.saturating_sub(1);
}
self.bucket_active_flows[entry.fairness.bucket_id] =
self.bucket_active_flows[entry.fairness.bucket_id].saturating_sub(1);
@@ -440,27 +529,6 @@ impl WorkerFairnessState {
}
fn evaluate_pressure(&mut self, now: Instant, force: bool) {
let mut standing = 0usize;
let mut backpressured = 0usize;
for flow in self.flows.values_mut() {
Self::classify_flow(&self.config, self.pressure.state(), now, &mut flow.fairness);
if flow.fairness.standing_state == StandingQueueState::Standing {
standing = standing.saturating_add(1);
}
if matches!(
flow.fairness.scheduler_state,
FlowSchedulerState::Backpressured
| FlowSchedulerState::Penalized
| FlowSchedulerState::SheddingCandidate
) {
backpressured = backpressured.saturating_add(1);
}
}
self.standing_flow_count = standing;
self.backpressured_flow_count = backpressured;
let _ = self.pressure.maybe_evaluate(
now,
&self.config.pressure,
@@ -468,8 +536,8 @@ impl WorkerFairnessState {
PressureSignals {
active_flows: self.flows.len(),
total_queued_bytes: self.total_queued_bytes,
standing_flows: standing,
backpressured_flows: backpressured,
standing_flows: self.standing_flow_count,
backpressured_flows: self.backpressured_flow_count,
},
force,
);
@@ -481,12 +549,39 @@ impl WorkerFairnessState {
now: Instant,
fairness: &mut FlowFairnessState,
) {
if fairness.pending_bytes == 0 {
fairness.pressure_class = FlowPressureClass::Healthy;
fairness.standing_state = StandingQueueState::Transient;
fairness.scheduler_state = FlowSchedulerState::Idle;
let (pressure_class, standing_state, scheduler_state, standing) =
Self::derive_flow_classification(config, pressure_state, now, fairness);
fairness.pressure_class = pressure_class;
fairness.standing_state = standing_state;
fairness.scheduler_state = scheduler_state;
if scheduler_state == FlowSchedulerState::Idle {
fairness.deficit_bytes = 0;
}
if standing {
fairness.penalty_score = fairness.penalty_score.saturating_add(1);
} else {
fairness.penalty_score = fairness.penalty_score.saturating_sub(1);
return;
}
}
fn derive_flow_classification(
config: &WorkerFairnessConfig,
pressure_state: PressureState,
now: Instant,
fairness: &FlowFairnessState,
) -> (
FlowPressureClass,
StandingQueueState,
FlowSchedulerState,
bool,
) {
if fairness.pending_bytes == 0 {
return (
FlowPressureClass::Healthy,
StandingQueueState::Transient,
FlowSchedulerState::Idle,
false,
);
}
let queue_age = fairness
@@ -503,29 +598,165 @@ impl WorkerFairnessState {
&& (fairness.consecutive_stalls >= config.standing_stall_threshold || drain_stalled);
if standing {
fairness.standing_state = StandingQueueState::Standing;
fairness.pressure_class = FlowPressureClass::Standing;
fairness.penalty_score = fairness.penalty_score.saturating_add(1);
fairness.scheduler_state = if pressure_state >= PressureState::Shedding {
let scheduler_state = if pressure_state >= PressureState::Shedding {
FlowSchedulerState::SheddingCandidate
} else {
FlowSchedulerState::Penalized
};
return;
return (
FlowPressureClass::Standing,
StandingQueueState::Standing,
scheduler_state,
true,
);
}
fairness.standing_state = StandingQueueState::Transient;
if fairness.consecutive_stalls > 0 {
fairness.pressure_class = FlowPressureClass::Backpressured;
fairness.scheduler_state = FlowSchedulerState::Backpressured;
} else if fairness.pending_bytes >= config.standing_queue_min_backlog_bytes {
fairness.pressure_class = FlowPressureClass::Bursty;
fairness.scheduler_state = FlowSchedulerState::Active;
} else {
fairness.pressure_class = FlowPressureClass::Healthy;
fairness.scheduler_state = FlowSchedulerState::Active;
return (
FlowPressureClass::Backpressured,
StandingQueueState::Transient,
FlowSchedulerState::Backpressured,
false,
);
}
fairness.penalty_score = fairness.penalty_score.saturating_sub(1);
if fairness.pending_bytes >= config.standing_queue_min_backlog_bytes {
return (
FlowPressureClass::Bursty,
StandingQueueState::Transient,
FlowSchedulerState::Active,
false,
);
}
(
FlowPressureClass::Healthy,
StandingQueueState::Transient,
FlowSchedulerState::Active,
false,
)
}
#[inline]
fn flow_membership(fairness: &FlowFairnessState) -> (bool, bool) {
(
fairness.standing_state == StandingQueueState::Standing,
Self::scheduler_state_is_backpressured(fairness.scheduler_state),
)
}
#[inline]
fn scheduler_state_is_backpressured(state: FlowSchedulerState) -> bool {
matches!(
state,
FlowSchedulerState::Backpressured
| FlowSchedulerState::Penalized
| FlowSchedulerState::SheddingCandidate
)
}
fn apply_flow_membership_delta(
&mut self,
before_membership: (bool, bool),
after_membership: (bool, bool),
) {
if before_membership.0 != after_membership.0 {
if after_membership.0 {
self.standing_flow_count = self.standing_flow_count.saturating_add(1);
} else {
self.standing_flow_count = self.standing_flow_count.saturating_sub(1);
}
}
if before_membership.1 != after_membership.1 {
if after_membership.1 {
self.backpressured_flow_count = self.backpressured_flow_count.saturating_add(1);
} else {
self.backpressured_flow_count = self.backpressured_flow_count.saturating_sub(1);
}
}
}
#[inline]
fn clamp_deficit_bytes(config: &WorkerFairnessConfig, fairness: &mut FlowFairnessState) {
let max_deficit = config.max_flow_queued_bytes.min(i64::MAX as u64) as i64;
fairness.deficit_bytes = fairness.deficit_bytes.clamp(0, max_deficit);
}
#[inline]
fn enqueue_active_conn(&mut self, conn_id: u64) {
if self.active_ring_members.insert(conn_id) {
self.active_ring.push_back(conn_id);
}
}
#[inline]
fn weight_for_flags(config: &WorkerFairnessConfig, flags: u32) -> u8 {
if (flags & RPC_FLAG_QUICKACK) != 0 {
return config.quickack_flow_weight.max(1);
}
config.default_flow_weight.max(1)
}
#[cfg(test)]
pub(crate) fn debug_recompute_flow_counters(&self, now: Instant) -> (usize, usize) {
let pressure_state = self.pressure.state();
let mut standing = 0usize;
let mut backpressured = 0usize;
for flow in self.flows.values() {
let (_, standing_state, scheduler_state, _) =
Self::derive_flow_classification(&self.config, pressure_state, now, &flow.fairness);
if standing_state == StandingQueueState::Standing {
standing = standing.saturating_add(1);
}
if Self::scheduler_state_is_backpressured(scheduler_state) {
backpressured = backpressured.saturating_add(1);
}
}
(standing, backpressured)
}
#[cfg(test)]
pub(crate) fn debug_check_active_ring_consistency(&self) -> bool {
if self.active_ring.len() != self.active_ring_members.len() {
return false;
}
let mut seen = HashSet::with_capacity(self.active_ring.len());
for conn_id in self.active_ring.iter().copied() {
if !seen.insert(conn_id) {
return false;
}
if !self.active_ring_members.contains(&conn_id) {
return false;
}
let Some(flow) = self.flows.get(&conn_id) else {
return false;
};
if !flow.fairness.in_active_ring || flow.queue.is_empty() {
return false;
}
}
for (conn_id, flow) in self.flows.iter() {
let in_ring = self.active_ring_members.contains(conn_id);
if flow.fairness.in_active_ring != in_ring {
return false;
}
if in_ring && flow.queue.is_empty() {
return false;
}
}
true
}
#[cfg(test)]
pub(crate) fn debug_max_deficit_bytes(&self) -> i64 {
self.flows
.values()
.map(|entry| entry.fairness.deficit_bytes)
.max()
.unwrap_or(0)
}
fn effective_quantum_bytes(
@@ -542,12 +773,14 @@ impl WorkerFairnessState {
return config.penalized_quantum_bytes.max(1);
}
match pressure_state {
let base_quantum = match pressure_state {
PressureState::Normal => config.base_quantum_bytes.max(1),
PressureState::Pressured => config.pressured_quantum_bytes.max(1),
PressureState::Shedding => config.pressured_quantum_bytes.max(1),
PressureState::Saturated => config.penalized_quantum_bytes.max(1),
}
};
let weighted_quantum = base_quantum.saturating_mul(fairness.weight_quanta.max(1) as u32);
weighted_quantum.max(1)
}
fn bucket_for(&self, conn_id: u64) -> usize {

View File

@@ -2,6 +2,7 @@ use std::time::{Duration, Instant};
use bytes::Bytes;
use crate::protocol::constants::RPC_FLAG_QUICKACK;
use crate::transport::middle_proxy::fairness::{
AdmissionDecision, DispatchAction, DispatchFeedback, PressureState, SchedulerDecision,
WorkerFairnessConfig, WorkerFairnessState,
@@ -114,6 +115,62 @@ fn fairness_keeps_fast_flow_progress_under_slow_neighbor() {
assert!(snapshot.total_queued_bytes <= 64 * 1024);
}
#[test]
fn fairness_prioritizes_quickack_flow_when_weights_enabled() {
let mut now = Instant::now();
let mut fairness = WorkerFairnessState::new(
WorkerFairnessConfig {
max_total_queued_bytes: 256 * 1024,
max_flow_queued_bytes: 64 * 1024,
base_quantum_bytes: 8 * 1024,
pressured_quantum_bytes: 8 * 1024,
penalized_quantum_bytes: 8 * 1024,
default_flow_weight: 1,
quickack_flow_weight: 4,
..WorkerFairnessConfig::default()
},
now,
);
for _ in 0..8 {
assert_eq!(
fairness.enqueue_data(10, RPC_FLAG_QUICKACK, enqueue_payload(16 * 1024), now),
AdmissionDecision::Admit
);
assert_eq!(
fairness.enqueue_data(20, 0, enqueue_payload(16 * 1024), now),
AdmissionDecision::Admit
);
}
let mut quickack_dispatched = 0u64;
let mut bulk_dispatched = 0u64;
for _ in 0..64 {
now += Duration::from_millis(1);
let SchedulerDecision::Dispatch(candidate) = fairness.next_decision(now) else {
break;
};
if candidate.frame.conn_id == 10 {
quickack_dispatched = quickack_dispatched.saturating_add(1);
} else if candidate.frame.conn_id == 20 {
bulk_dispatched = bulk_dispatched.saturating_add(1);
}
let _ = fairness.apply_dispatch_feedback(
candidate.frame.conn_id,
candidate,
DispatchFeedback::Routed,
now,
);
}
assert!(
quickack_dispatched > bulk_dispatched,
"quickack flow must receive higher dispatch rate with larger weight"
);
}
#[test]
fn fairness_pressure_hysteresis_prevents_instant_flapping() {
let mut now = Instant::now();
@@ -180,6 +237,12 @@ fn fairness_randomized_sequence_preserves_memory_bounds() {
}
let snapshot = fairness.snapshot();
let (standing_recomputed, backpressured_recomputed) =
fairness.debug_recompute_flow_counters(now);
assert!(snapshot.total_queued_bytes <= 32 * 1024);
assert_eq!(snapshot.standing_flows, standing_recomputed);
assert_eq!(snapshot.backpressured_flows, backpressured_recomputed);
assert!(fairness.debug_check_active_ring_consistency());
assert!(fairness.debug_max_deficit_bytes() <= 4 * 1024);
}
}