mirror of
https://github.com/telemt/telemt.git
synced 2026-04-15 01:24:09 +03:00
Compare commits
11 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dd12997744 | ||
|
|
fc160913bf | ||
|
|
92c22ef16d | ||
|
|
aff22d0855 | ||
|
|
b3d3bca15a | ||
|
|
92f38392eb | ||
|
|
30ef8df1b3 | ||
|
|
2e174adf16 | ||
|
|
4e803b1412 | ||
|
|
9b174318ce | ||
|
|
99edcbe818 |
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "telemt"
|
||||
version = "3.3.3"
|
||||
version = "3.3.4"
|
||||
edition = "2024"
|
||||
|
||||
[dependencies]
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
***Löst Probleme, bevor andere überhaupt wissen, dass sie existieren*** / ***It solves problems before others even realize they exist***
|
||||
|
||||
**Telemt** is a fast, secure, and feature-rich server written in Rust: it fully implements the official Telegram proxy algo and adds many production-ready improvements such as:
|
||||
- ME Pool + Reader/Writer + Registry + Refill + Adaptive Floor + Trio-State + Generation Lifecycle
|
||||
- [ME Pool + Reader/Writer + Registry + Refill + Adaptive Floor + Trio-State + Generation Lifecycle](https://github.com/telemt/telemt/blob/main/docs/model/MODEL.en.md)
|
||||
- [Full-covered API w/ management](https://github.com/telemt/telemt/blob/main/docs/API.md)
|
||||
- Anti-Replay on Sliding Window
|
||||
- Prometheus-format Metrics
|
||||
|
||||
92
docs/API.md
92
docs/API.md
@@ -76,6 +76,10 @@ Notes:
|
||||
| Method | Path | Body | Success | `data` contract |
|
||||
| --- | --- | --- | --- | --- |
|
||||
| `GET` | `/v1/health` | none | `200` | `HealthData` |
|
||||
| `GET` | `/v1/system/info` | none | `200` | `SystemInfoData` |
|
||||
| `GET` | `/v1/runtime/gates` | none | `200` | `RuntimeGatesData` |
|
||||
| `GET` | `/v1/limits/effective` | none | `200` | `EffectiveLimitsData` |
|
||||
| `GET` | `/v1/security/posture` | none | `200` | `SecurityPostureData` |
|
||||
| `GET` | `/v1/stats/summary` | none | `200` | `SummaryData` |
|
||||
| `GET` | `/v1/stats/zero/all` | none | `200` | `ZeroAllData` |
|
||||
| `GET` | `/v1/stats/upstreams` | none | `200` | `UpstreamsData` |
|
||||
@@ -176,6 +180,94 @@ Note: the request contract is defined, but the corresponding route currently ret
|
||||
| `handshake_timeouts_total` | `u64` | Handshake timeout count. |
|
||||
| `configured_users` | `usize` | Number of configured users in config. |
|
||||
|
||||
### `SystemInfoData`
|
||||
| Field | Type | Description |
|
||||
| --- | --- | --- |
|
||||
| `version` | `string` | Binary version (`CARGO_PKG_VERSION`). |
|
||||
| `target_arch` | `string` | Target architecture (`std::env::consts::ARCH`). |
|
||||
| `target_os` | `string` | Target OS (`std::env::consts::OS`). |
|
||||
| `build_profile` | `string` | Build profile (`PROFILE` env when available). |
|
||||
| `git_commit` | `string?` | Optional commit hash from build env metadata. |
|
||||
| `build_time_utc` | `string?` | Optional build timestamp from build env metadata. |
|
||||
| `rustc_version` | `string?` | Optional compiler version from build env metadata. |
|
||||
| `process_started_at_epoch_secs` | `u64` | Process start time as Unix epoch seconds. |
|
||||
| `uptime_seconds` | `f64` | Process uptime in seconds. |
|
||||
| `config_path` | `string` | Active config file path used by runtime. |
|
||||
| `config_hash` | `string` | SHA-256 hash of current config content (same value as envelope `revision`). |
|
||||
| `config_reload_count` | `u64` | Number of successfully observed config updates since process start. |
|
||||
| `last_config_reload_epoch_secs` | `u64?` | Unix epoch seconds of the latest observed config reload; null/absent before first reload. |
|
||||
|
||||
### `RuntimeGatesData`
|
||||
| Field | Type | Description |
|
||||
| --- | --- | --- |
|
||||
| `accepting_new_connections` | `bool` | Current admission-gate state for new listener accepts. |
|
||||
| `conditional_cast_enabled` | `bool` | Whether conditional ME admission logic is enabled (`general.use_middle_proxy`). |
|
||||
| `me_runtime_ready` | `bool` | Current ME runtime readiness status used for conditional gate decisions. |
|
||||
| `me2dc_fallback_enabled` | `bool` | Whether ME -> direct fallback is enabled. |
|
||||
| `use_middle_proxy` | `bool` | Current transport mode preference. |
|
||||
|
||||
### `EffectiveLimitsData`
|
||||
| Field | Type | Description |
|
||||
| --- | --- | --- |
|
||||
| `update_every_secs` | `u64` | Effective unified updater interval. |
|
||||
| `me_reinit_every_secs` | `u64` | Effective ME periodic reinit interval. |
|
||||
| `me_pool_force_close_secs` | `u64` | Effective stale-writer force-close timeout. |
|
||||
| `timeouts` | `EffectiveTimeoutLimits` | Effective timeout policy snapshot. |
|
||||
| `upstream` | `EffectiveUpstreamLimits` | Effective upstream connect/retry limits. |
|
||||
| `middle_proxy` | `EffectiveMiddleProxyLimits` | Effective ME pool/floor/reconnect limits. |
|
||||
| `user_ip_policy` | `EffectiveUserIpPolicyLimits` | Effective unique-IP policy mode/window. |
|
||||
|
||||
#### `EffectiveTimeoutLimits`
|
||||
| Field | Type | Description |
|
||||
| --- | --- | --- |
|
||||
| `client_handshake_secs` | `u64` | Client handshake timeout. |
|
||||
| `tg_connect_secs` | `u64` | Upstream Telegram connect timeout. |
|
||||
| `client_keepalive_secs` | `u64` | Client keepalive interval. |
|
||||
| `client_ack_secs` | `u64` | ACK timeout. |
|
||||
| `me_one_retry` | `u8` | Fast retry count for single-endpoint ME DC. |
|
||||
| `me_one_timeout_ms` | `u64` | Fast retry timeout per attempt for single-endpoint ME DC. |
|
||||
|
||||
#### `EffectiveUpstreamLimits`
|
||||
| Field | Type | Description |
|
||||
| --- | --- | --- |
|
||||
| `connect_retry_attempts` | `u32` | Upstream connect retry attempts. |
|
||||
| `connect_retry_backoff_ms` | `u64` | Upstream retry backoff delay. |
|
||||
| `connect_budget_ms` | `u64` | Total connect wall-clock budget across retries. |
|
||||
| `unhealthy_fail_threshold` | `u32` | Consecutive fail threshold for unhealthy marking. |
|
||||
| `connect_failfast_hard_errors` | `bool` | Whether hard errors skip additional retries. |
|
||||
|
||||
#### `EffectiveMiddleProxyLimits`
|
||||
| Field | Type | Description |
|
||||
| --- | --- | --- |
|
||||
| `floor_mode` | `string` | Effective floor mode (`static` or `adaptive`). |
|
||||
| `adaptive_floor_idle_secs` | `u64` | Adaptive floor idle threshold. |
|
||||
| `adaptive_floor_min_writers_single_endpoint` | `u8` | Adaptive floor minimum for single-endpoint DCs. |
|
||||
| `adaptive_floor_recover_grace_secs` | `u64` | Adaptive floor recovery grace period. |
|
||||
| `reconnect_max_concurrent_per_dc` | `u32` | Max concurrent reconnects per DC. |
|
||||
| `reconnect_backoff_base_ms` | `u64` | Reconnect base backoff. |
|
||||
| `reconnect_backoff_cap_ms` | `u64` | Reconnect backoff cap. |
|
||||
| `reconnect_fast_retry_count` | `u32` | Number of fast retries before standard backoff strategy. |
|
||||
| `me2dc_fallback` | `bool` | Effective ME -> direct fallback flag. |
|
||||
|
||||
#### `EffectiveUserIpPolicyLimits`
|
||||
| Field | Type | Description |
|
||||
| --- | --- | --- |
|
||||
| `mode` | `string` | Unique-IP policy mode (`active_window`, `time_window`, `combined`). |
|
||||
| `window_secs` | `u64` | Time window length used by unique-IP policy. |
|
||||
|
||||
### `SecurityPostureData`
|
||||
| Field | Type | Description |
|
||||
| --- | --- | --- |
|
||||
| `api_read_only` | `bool` | Current API read-only state. |
|
||||
| `api_whitelist_enabled` | `bool` | Whether whitelist filtering is active. |
|
||||
| `api_whitelist_entries` | `usize` | Number of configured whitelist CIDRs. |
|
||||
| `api_auth_header_enabled` | `bool` | Whether `Authorization` header validation is active. |
|
||||
| `proxy_protocol_enabled` | `bool` | Global PROXY protocol accept setting. |
|
||||
| `log_level` | `string` | Effective log level (`debug`, `verbose`, `normal`, `silent`). |
|
||||
| `telemetry_core_enabled` | `bool` | Core telemetry toggle. |
|
||||
| `telemetry_user_enabled` | `bool` | Per-user telemetry toggle. |
|
||||
| `telemetry_me_level` | `string` | ME telemetry level (`silent`, `normal`, `debug`). |
|
||||
|
||||
### `ZeroAllData`
|
||||
| Field | Type | Description |
|
||||
| --- | --- | --- |
|
||||
|
||||
285
docs/model/MODEL.en.md
Normal file
285
docs/model/MODEL.en.md
Normal file
@@ -0,0 +1,285 @@
|
||||
# Telemt Runtime Model
|
||||
|
||||
## Scope
|
||||
This document defines runtime concepts used by the Middle-End (ME) transport pipeline and the orchestration logic around it.
|
||||
|
||||
It focuses on:
|
||||
- `ME Pool / Reader / Writer / Refill / Registry`
|
||||
- `Adaptive Floor`
|
||||
- `Trio-State`
|
||||
- `Generation Lifecycle`
|
||||
|
||||
## Core Entities
|
||||
|
||||
### ME Pool
|
||||
`ME Pool` is the runtime orchestrator for all Middle-End writers.
|
||||
|
||||
Responsibilities:
|
||||
- Holds writer inventory by DC/family/endpoint.
|
||||
- Maintains routing primitives and writer selection policy.
|
||||
- Tracks generation state (`active`, `warm`, `draining` context).
|
||||
- Applies runtime policies (floor mode, refill, reconnect, reinit, fallback behavior).
|
||||
- Exposes readiness gates used by admission logic (for conditional accept/cast behavior).
|
||||
|
||||
Non-goals:
|
||||
- It does not own client protocol decoding.
|
||||
- It does not own per-client business policy (quotas/limits).
|
||||
|
||||
### ME Writer
|
||||
`ME Writer` is a long-lived ME RPC tunnel bound to one concrete ME endpoint (`ip:port`), with:
|
||||
- Outbound command channel (send path).
|
||||
- Associated reader loop (inbound path).
|
||||
- Health/degraded flags.
|
||||
- Contour/state and generation metadata.
|
||||
|
||||
A writer is the actual data plane carrier for client sessions once bound.
|
||||
|
||||
### ME Reader
|
||||
`ME Reader` is the inbound parser/dispatcher for one writer:
|
||||
- Reads/decrypts ME RPC frames.
|
||||
- Validates sequence/checksum.
|
||||
- Routes payloads to client-connection channels via `Registry`.
|
||||
- Emits close/ack/data events and updates telemetry.
|
||||
|
||||
Design intent:
|
||||
- Reader must stay non-blocking as much as possible.
|
||||
- Backpressure on a single client route must not stall the whole writer stream.
|
||||
|
||||
### Refill
|
||||
`Refill` is the recovery mechanism that restores writer coverage when capacity drops:
|
||||
- Per-endpoint restore (same endpoint first).
|
||||
- Per-DC restore to satisfy required floor.
|
||||
- Optional outage-mode/shadow behavior for fragile single-endpoint DCs.
|
||||
|
||||
Refill works asynchronously and should not block hot routing paths.
|
||||
|
||||
### Registry
|
||||
`Registry` is the routing index between ME and client sessions:
|
||||
- `conn_id -> client response channel`
|
||||
- `conn_id <-> writer_id` binding map
|
||||
- writer activity snapshots and idle tracking
|
||||
|
||||
Main invariants:
|
||||
- A `conn_id` routes to at most one active response channel.
|
||||
- Writer loss triggers safe unbind/cleanup and close propagation.
|
||||
- Registry state is the source of truth for active ME-bound session mapping.
|
||||
|
||||
## Adaptive Floor
|
||||
|
||||
### What it is
|
||||
`Adaptive Floor` is a runtime policy that changes target writer count per DC based on observed activity, instead of always holding static peak floor.
|
||||
|
||||
### Why it exists
|
||||
Goals:
|
||||
- Reduce idle writer churn under low traffic.
|
||||
- Keep enough warm capacity to avoid client-visible stalls on burst recovery.
|
||||
- Limit needless reconnect storms on unstable endpoints.
|
||||
|
||||
### Behavioral model
|
||||
- Under activity: floor converges toward configured static requirement.
|
||||
- Under prolonged idle: floor can shrink to a safe minimum.
|
||||
- Recovery/grace windows prevent aggressive oscillation.
|
||||
|
||||
### Safety constraints
|
||||
- Never violate minimal survivability floor for a DC group.
|
||||
- Refill must still restore quickly on demand.
|
||||
- Floor adaptation must not force-drop already bound healthy sessions.
|
||||
|
||||
## Trio-State
|
||||
|
||||
`Trio-State` is writer contouring:
|
||||
- `Warm`
|
||||
- `Active`
|
||||
- `Draining`
|
||||
|
||||
### State semantics
|
||||
- `Warm`: connected and validated, not primary for new binds.
|
||||
- `Active`: preferred for new binds and normal traffic.
|
||||
- `Draining`: no new regular binds; existing sessions continue until graceful retirement rules apply.
|
||||
|
||||
### Transition intent
|
||||
- `Warm -> Active`: when coverage/readiness conditions are satisfied.
|
||||
- `Active -> Draining`: on generation swap, endpoint replacement, or controlled retirement.
|
||||
- `Draining -> removed`: after drain TTL/force-close policy (or when naturally empty).
|
||||
|
||||
This separation reduces SPOF and keeps cutovers predictable.
|
||||
|
||||
## Generation Lifecycle
|
||||
|
||||
Generation isolates pool epochs during reinit/reconfiguration.
|
||||
|
||||
### Lifecycle phases
|
||||
1. `Bootstrap`: initial writers are established.
|
||||
2. `Warmup`: next generation writers are created and validated.
|
||||
3. `Activation`: generation promoted to active when coverage gate passes.
|
||||
4. `Drain`: previous generation becomes draining, existing sessions are allowed to finish.
|
||||
5. `Retire`: old generation writers are removed after graceful rules.
|
||||
|
||||
### Operational guarantees
|
||||
- No partial generation activation without minimum coverage.
|
||||
- Existing healthy client sessions should not be dropped just because a new generation appears.
|
||||
- Draining generation exists to absorb in-flight traffic during swap.
|
||||
|
||||
### Readiness and admission
|
||||
Pool readiness is not equivalent to “all endpoints fully saturated”.
|
||||
Typical gating strategy:
|
||||
- Open admission when per-DC minimal alive coverage exists.
|
||||
- Continue background saturation for multi-endpoint DCs.
|
||||
|
||||
This keeps startup latency low while preserving eventual full capacity.
|
||||
|
||||
## Interactions Between Concepts
|
||||
|
||||
- `Generation` defines pool epochs.
|
||||
- `Trio-State` defines per-writer role inside/around those epochs.
|
||||
- `Adaptive Floor` defines how much capacity should be maintained right now.
|
||||
- `Refill` is the actuator that closes the gap between desired and current capacity.
|
||||
- `Registry` keeps per-session routing correctness while all of the above changes over time.
|
||||
|
||||
## Architectural Approach
|
||||
|
||||
### Layered Design
|
||||
The runtime is intentionally split into two planes:
|
||||
- `Control Plane`: decides desired topology and policy (`floor`, `generation swap`, `refill`, `fallback`).
|
||||
- `Data Plane`: executes packet/session transport (`reader`, `writer`, routing, acks, close propagation).
|
||||
|
||||
Architectural rule:
|
||||
- Control Plane may change writer inventory and policy.
|
||||
- Data Plane must remain stable and low-latency while those changes happen.
|
||||
|
||||
### Ownership Model
|
||||
Ownership is centered around explicit state domains:
|
||||
- `MePool` owns writer lifecycle and policy state.
|
||||
- `Registry` owns per-connection routing bindings.
|
||||
- `Writer task` owns outbound ME socket send progression.
|
||||
- `Reader task` owns inbound ME socket parsing and event dispatch.
|
||||
|
||||
This prevents accidental cross-layer mutation and keeps invariants local.
|
||||
|
||||
### Control Plane Responsibilities
|
||||
Control Plane is event-driven and policy-driven:
|
||||
- Startup initialization and readiness gates.
|
||||
- Runtime reinit (periodic or config-triggered).
|
||||
- Coverage checks per DC/family/endpoint group.
|
||||
- Floor enforcement (static/adaptive).
|
||||
- Refill scheduling and retry orchestration.
|
||||
- Generation transition (`warm -> active`, previous `active -> draining`).
|
||||
|
||||
Control Plane must prioritize determinism over short-term aggressiveness.
|
||||
|
||||
### Data Plane Responsibilities
|
||||
Data Plane is throughput-first and allocation-sensitive:
|
||||
- Session bind to writer.
|
||||
- Per-frame parsing/validation and dispatch.
|
||||
- Ack and close signal propagation.
|
||||
- Route drop behavior under missing connection or closed channel.
|
||||
- Minimal critical logging in hot path.
|
||||
|
||||
Data Plane should avoid waiting on operations that are not strictly required for frame correctness.
|
||||
|
||||
## Concurrency and Synchronization
|
||||
|
||||
### Concurrency Principles
|
||||
- Per-writer isolation: each writer has independent send/read task loops.
|
||||
- Per-connection isolation: client channel state is scoped by `conn_id`.
|
||||
- Asynchronous recovery: refill/reconnect runs outside the packet hot path.
|
||||
|
||||
### Synchronization Strategy
|
||||
- Shared maps use fine-grained, short-lived locking.
|
||||
- Read-mostly paths avoid broad write-lock windows.
|
||||
- Backpressure decisions are localized at route/channel boundary.
|
||||
|
||||
Design target:
|
||||
- A slow consumer should degrade only itself (or its route), not global writer progress.
|
||||
|
||||
### Cancellation and Shutdown
|
||||
Writer and reader loops are cancellation-aware:
|
||||
- explicit cancel token / close command support;
|
||||
- safe unbind and cleanup via registry;
|
||||
- deterministic order: stop admission -> drain/close -> release resources.
|
||||
|
||||
## Consistency Model
|
||||
|
||||
### Session Consistency
|
||||
For one `conn_id`:
|
||||
- exactly one active route target at a time;
|
||||
- close and unbind must be idempotent;
|
||||
- writer loss must not leave dangling bindings.
|
||||
|
||||
### Generation Consistency
|
||||
Generational consistency guarantees:
|
||||
- New generation is not promoted before minimum coverage gate.
|
||||
- Previous generation remains available in `draining` state during handover.
|
||||
- Forced retirement is policy-bound (`drain ttl`, optional force-close), not immediate.
|
||||
|
||||
### Policy Consistency
|
||||
Policy changes (`adaptive/static floor`, fallback mode, retries) should apply without violating established active-session routing invariants.
|
||||
|
||||
## Backpressure and Flow Control
|
||||
|
||||
### Route-Level Backpressure
|
||||
Route channels are bounded by design.
|
||||
When pressure increases:
|
||||
- short burst absorption is allowed;
|
||||
- prolonged congestion triggers controlled drop semantics;
|
||||
- drop accounting is explicit via metrics/counters.
|
||||
|
||||
### Reader Non-Blocking Priority
|
||||
Inbound ME reader path should never be serialized behind one congested client route.
|
||||
Practical implication:
|
||||
- prefer non-blocking route attempt in the parser loop;
|
||||
- move heavy recovery to async side paths.
|
||||
|
||||
## Failure Domain Strategy
|
||||
|
||||
### Endpoint-Level Failure
|
||||
Failure of one endpoint should trigger endpoint-scoped recovery first:
|
||||
- same endpoint reconnect;
|
||||
- endpoint replacement within same DC group if applicable.
|
||||
|
||||
### DC-Level Degradation
|
||||
If a DC group cannot satisfy floor:
|
||||
- keep service via remaining coverage if policy allows;
|
||||
- continue asynchronous refill saturation in background.
|
||||
|
||||
### Whole-Pool Readiness Loss
|
||||
If no sufficient ME coverage exists:
|
||||
- admission gate can hold new accepts (conditional policy);
|
||||
- existing sessions should continue when their path remains healthy.
|
||||
|
||||
## Performance Architecture Notes
|
||||
|
||||
### Hotpath Discipline
|
||||
Allowed in hotpath:
|
||||
- fixed-size parsing and cheap validation;
|
||||
- bounded channel operations;
|
||||
- precomputed or low-allocation access patterns.
|
||||
|
||||
Avoid in hotpath:
|
||||
- repeated expensive decoding;
|
||||
- broad locks with awaits inside critical sections;
|
||||
- verbose high-frequency logging.
|
||||
|
||||
### Throughput Stability Over Peak Spikes
|
||||
Architecture prefers stable throughput and predictable latency over short peak gains that increase churn or long-tail reconnect times.
|
||||
|
||||
## Evolution and Extension Rules
|
||||
|
||||
To evolve this model safely:
|
||||
- Add new policy knobs in Control Plane first.
|
||||
- Keep Data Plane contracts stable (`conn_id`, route semantics, close semantics).
|
||||
- Validate generation and registry invariants before enabling by default.
|
||||
- Introduce new retry/recovery strategies behind explicit config.
|
||||
|
||||
## Failure and Recovery Notes
|
||||
|
||||
- Single-endpoint DC failure is a normal degraded mode case; policy should prioritize fast reconnect and optional shadow/probing strategies.
|
||||
- Idle close by peer should be treated as expected when upstream enforces idle timeout.
|
||||
- Reconnect backoff must protect against synchronized churn while still allowing fast first retries.
|
||||
- Fallback (`ME -> direct DC`) is a policy switch, not a transport bug by itself.
|
||||
|
||||
## Terminology Summary
|
||||
- `Coverage`: enough live writers to satisfy per-DC acceptance policy.
|
||||
- `Floor`: target minimum writer count policy.
|
||||
- `Churn`: frequent writer reconnect/remove cycles.
|
||||
- `Hotpath`: per-packet/per-connection data path where extra waits/allocations are expensive.
|
||||
285
docs/model/MODEL.ru.md
Normal file
285
docs/model/MODEL.ru.md
Normal file
@@ -0,0 +1,285 @@
|
||||
# Runtime-модель Telemt
|
||||
|
||||
## Область описания
|
||||
Документ фиксирует ключевые runtime-понятия пайплайна Middle-End (ME) и оркестрации вокруг него.
|
||||
|
||||
Фокус:
|
||||
- `ME Pool / Reader / Writer / Refill / Registry`
|
||||
- `Adaptive Floor`
|
||||
- `Trio-State`
|
||||
- `Generation Lifecycle`
|
||||
|
||||
## Базовые сущности
|
||||
|
||||
### ME Pool
|
||||
`ME Pool` — центральный оркестратор всех Middle-End writer-ов.
|
||||
|
||||
Зона ответственности:
|
||||
- хранит инвентарь writer-ов по DC/family/endpoint;
|
||||
- управляет выбором writer-а и маршрутизацией;
|
||||
- ведёт состояние поколений (`active`, `warm`, `draining` контекст);
|
||||
- применяет runtime-политики (floor, refill, reconnect, reinit, fallback);
|
||||
- отдаёт сигналы готовности для admission-логики (conditional accept/cast).
|
||||
|
||||
Что не делает:
|
||||
- не декодирует клиентский протокол;
|
||||
- не реализует бизнес-политику пользователя (квоты/лимиты).
|
||||
|
||||
### ME Writer
|
||||
`ME Writer` — долгоживущий ME RPC-канал к конкретному endpoint (`ip:port`), у которого есть:
|
||||
- канал команд на отправку;
|
||||
- связанный reader loop для входящего потока;
|
||||
- флаги состояния/деградации;
|
||||
- метаданные contour/state и generation.
|
||||
|
||||
Writer — это фактический data-plane носитель клиентских сессий после бинда.
|
||||
|
||||
### ME Reader
|
||||
`ME Reader` — входной parser/dispatcher одного writer-а:
|
||||
- читает и расшифровывает ME RPC-фреймы;
|
||||
- проверяет sequence/checksum;
|
||||
- маршрутизирует payload в client-каналы через `Registry`;
|
||||
- обрабатывает close/ack/data и обновляет телеметрию.
|
||||
|
||||
Инженерный принцип:
|
||||
- Reader должен оставаться неблокирующим.
|
||||
- Backpressure одной клиентской сессии не должен останавливать весь поток writer-а.
|
||||
|
||||
### Refill
|
||||
`Refill` — механизм восстановления покрытия writer-ов при просадке:
|
||||
- восстановление на том же endpoint в первую очередь;
|
||||
- восстановление по DC до требуемого floor;
|
||||
- опциональные outage/shadow-режимы для хрупких single-endpoint DC.
|
||||
|
||||
Refill работает асинхронно и не должен блокировать hotpath.
|
||||
|
||||
### Registry
|
||||
`Registry` — маршрутизационный индекс между ME и клиентскими сессиями:
|
||||
- `conn_id -> канал ответа клиенту`;
|
||||
- map биндов `conn_id <-> writer_id`;
|
||||
- снимки активности writer-ов и idle-трекинг.
|
||||
|
||||
Ключевые инварианты:
|
||||
- один `conn_id` маршрутизируется максимум в один активный канал ответа;
|
||||
- потеря writer-а приводит к безопасному unbind/cleanup и отправке close;
|
||||
- именно `Registry` является источником истины по активным ME-биндам.
|
||||
|
||||
## Adaptive Floor
|
||||
|
||||
### Что это
|
||||
`Adaptive Floor` — runtime-политика, которая динамически меняет целевое число writer-ов на DC в зависимости от активности, а не держит всегда фиксированный статический floor.
|
||||
|
||||
### Зачем
|
||||
Цели:
|
||||
- уменьшить churn на idle-трафике;
|
||||
- сохранить достаточную прогретую ёмкость для быстрых всплесков;
|
||||
- снизить лишние reconnect-штормы на нестабильных endpoint.
|
||||
|
||||
### Модель поведения
|
||||
- при активности floor стремится к статическому требованию;
|
||||
- при длительном idle floor может снижаться до безопасного минимума;
|
||||
- grace/recovery окна не дают системе "флапать" слишком резко.
|
||||
|
||||
### Ограничения безопасности
|
||||
- нельзя нарушать минимальный floor выживаемости DC-группы;
|
||||
- refill обязан быстро нарастить покрытие по запросу;
|
||||
- адаптация не должна принудительно ронять уже привязанные healthy-сессии.
|
||||
|
||||
## Trio-State
|
||||
|
||||
`Trio-State` — контурная роль writer-а:
|
||||
- `Warm`
|
||||
- `Active`
|
||||
- `Draining`
|
||||
|
||||
### Семантика состояний
|
||||
- `Warm`: writer подключён и валиден, но не основной для новых биндов.
|
||||
- `Active`: приоритетный для новых биндов и обычного трафика.
|
||||
- `Draining`: новые обычные бинды не назначаются; текущие сессии живут до правил graceful-вывода.
|
||||
|
||||
### Логика переходов
|
||||
- `Warm -> Active`: когда достигнуты условия покрытия/готовности.
|
||||
- `Active -> Draining`: при swap поколения, замене endpoint или контролируемом выводе.
|
||||
- `Draining -> removed`: после drain TTL/force-close политики (или естественного опустошения).
|
||||
|
||||
Такое разделение снижает SPOF-риски и делает cutover предсказуемым.
|
||||
|
||||
## Generation Lifecycle
|
||||
|
||||
Generation изолирует эпохи пула при reinit/reconfiguration.
|
||||
|
||||
### Фазы жизненного цикла
|
||||
1. `Bootstrap`: поднимается начальный набор writer-ов.
|
||||
2. `Warmup`: создаётся и валидируется новое поколение.
|
||||
3. `Activation`: новое поколение становится active после прохождения coverage-gate.
|
||||
4. `Drain`: предыдущее поколение переводится в draining, текущим сессиям дают завершиться.
|
||||
5. `Retire`: старое поколение удаляется по graceful-правилам.
|
||||
|
||||
### Операционные гарантии
|
||||
- нельзя активировать поколение частично без минимального покрытия;
|
||||
- healthy-клиенты не должны теряться только из-за появления нового поколения;
|
||||
- draining-поколение служит буфером для in-flight трафика во время swap.
|
||||
|
||||
### Готовность и приём клиентов
|
||||
Готовность пула не равна "все endpoint полностью насыщены".
|
||||
Типичная стратегия:
|
||||
- открыть admission при минимально достаточном alive-покрытии по DC;
|
||||
- параллельно продолжать saturation для multi-endpoint DC.
|
||||
|
||||
Это уменьшает startup latency и сохраняет выход на полную ёмкость.
|
||||
|
||||
## Как понятия связаны между собой
|
||||
|
||||
- `Generation` задаёт эпохи пула.
|
||||
- `Trio-State` задаёт роль каждого writer-а внутри/между эпохами.
|
||||
- `Adaptive Floor` задаёт, сколько ёмкости нужно сейчас.
|
||||
- `Refill` — исполнитель, который закрывает разницу между desired и current capacity.
|
||||
- `Registry` гарантирует корректную маршрутизацию сессий, пока всё выше меняется.
|
||||
|
||||
## Архитектурный подход
|
||||
|
||||
### Слоистая модель
|
||||
Runtime специально разделён на две плоскости:
|
||||
- `Control Plane`: принимает решения о целевой топологии и политиках (`floor`, `generation swap`, `refill`, `fallback`).
|
||||
- `Data Plane`: исполняет транспорт сессий и пакетов (`reader`, `writer`, маршрутизация, ack, close).
|
||||
|
||||
Ключевое правило:
|
||||
- Control Plane может менять состав writer-ов и policy.
|
||||
- Data Plane должен оставаться стабильным и низколатентным в момент этих изменений.
|
||||
|
||||
### Модель владения состоянием
|
||||
Владение разделено по доменам:
|
||||
- `MePool` владеет жизненным циклом writer-ов и policy-state.
|
||||
- `Registry` владеет routing-биндами клиентских сессий.
|
||||
- `Writer task` владеет исходящей прогрессией ME-сокета.
|
||||
- `Reader task` владеет входящим парсингом и dispatch-событиями.
|
||||
|
||||
Это ограничивает побочные мутации и локализует инварианты.
|
||||
|
||||
### Обязанности Control Plane
|
||||
Control Plane работает событийно и policy-ориентированно:
|
||||
- стартовая инициализация и readiness-gate;
|
||||
- runtime reinit (периодический и/или по изменению конфигурации);
|
||||
- проверки покрытия по DC/family/endpoint group;
|
||||
- применение floor-политики (static/adaptive);
|
||||
- планирование refill и orchestration retry;
|
||||
- переходы поколений (`warm -> active`, прежний `active -> draining`).
|
||||
|
||||
Для него важнее детерминизм, чем агрессивная краткосрочная реакция.
|
||||
|
||||
### Обязанности Data Plane
|
||||
Data Plane ориентирован на пропускную способность и предсказуемую задержку:
|
||||
- bind клиентской сессии к writer-у;
|
||||
- per-frame parsing/validation/dispatch;
|
||||
- распространение ack/close;
|
||||
- корректная реакция на missing conn/closed channel;
|
||||
- минимальный лог-шум в hotpath.
|
||||
|
||||
Data Plane не должен ждать операций, не критичных для корректности текущего фрейма.
|
||||
|
||||
## Конкурентность и синхронизация
|
||||
|
||||
### Принципы конкурентности
|
||||
- Изоляция по writer-у: у каждого writer-а независимые send/read loop.
|
||||
- Изоляция по сессии: состояние канала локально для `conn_id`.
|
||||
- Асинхронное восстановление: refill/reconnect выполняются вне пакетного hotpath.
|
||||
|
||||
### Стратегия синхронизации
|
||||
- Для shared map используются короткие и узкие lock-секции.
|
||||
- Read-heavy пути избегают длительных write-lock окон.
|
||||
- Решения по backpressure локализованы на границе route/channel.
|
||||
|
||||
Цель:
|
||||
- медленный consumer должен деградировать локально, не останавливая глобальный прогресс writer-а.
|
||||
|
||||
### Cancellation и shutdown
|
||||
Reader/Writer loop должны быть cancellation-aware:
|
||||
- явные cancel token / close command;
|
||||
- безопасный unbind/cleanup через registry;
|
||||
- детерминированный порядок: stop admission -> drain/close -> release resources.
|
||||
|
||||
## Модель согласованности
|
||||
|
||||
### Согласованность сессии
|
||||
Для одного `conn_id`:
|
||||
- одновременно ровно один активный route-target;
|
||||
- close/unbind операции идемпотентны;
|
||||
- потеря writer-а не оставляет dangling-бинды.
|
||||
|
||||
### Согласованность поколения
|
||||
Гарантии generation:
|
||||
- новое поколение не активируется до прохождения минимального coverage-gate;
|
||||
- предыдущее поколение остаётся в `draining` на время handover;
|
||||
- принудительный вывод writer-ов ограничен policy (`drain ttl`, optional force-close), а не мгновенный.
|
||||
|
||||
### Согласованность политик
|
||||
Изменение policy (`adaptive/static floor`, fallback mode, retries) не должно ломать инварианты маршрутизации уже активных сессий.
|
||||
|
||||
## Backpressure и управление потоком
|
||||
|
||||
### Route-level backpressure
|
||||
Route-каналы намеренно bounded.
|
||||
При росте нагрузки:
|
||||
- кратковременный burst поглощается;
|
||||
- длительная перегрузка переходит в контролируемую drop-семантику;
|
||||
- все drop-сценарии должны быть прозрачно видны в метриках.
|
||||
|
||||
### Приоритет неблокирующего Reader
|
||||
Входящий ME-reader path не должен сериализоваться из-за одной перегруженной клиентской сессии.
|
||||
Практически это означает:
|
||||
- использовать неблокирующую попытку route в parser loop;
|
||||
- выносить тяжёлое восстановление в асинхронные side-path.
|
||||
|
||||
## Стратегия доменов отказа
|
||||
|
||||
### Отказ отдельного endpoint
|
||||
Сначала применяется endpoint-local recovery:
|
||||
- reconnect в тот же endpoint;
|
||||
- затем замена endpoint внутри той же DC-группы (если доступно).
|
||||
|
||||
### Деградация уровня DC
|
||||
Если DC-группа не набирает floor:
|
||||
- сервис сохраняется на остаточном покрытии (если policy разрешает);
|
||||
- saturation refill продолжается асинхронно в фоне.
|
||||
|
||||
### Потеря готовности всего пула
|
||||
Если достаточного ME-покрытия нет:
|
||||
- admission gate может временно закрыть приём новых подключений (conditional policy);
|
||||
- уже активные сессии продолжают работать, пока их маршрут остаётся healthy.
|
||||
|
||||
## Архитектурные заметки по производительности
|
||||
|
||||
### Дисциплина hotpath
|
||||
Допустимо в hotpath:
|
||||
- фиксированный и дешёвый parsing/validation;
|
||||
- bounded channel operations;
|
||||
- precomputed/low-allocation доступ к данным.
|
||||
|
||||
Нежелательно в hotpath:
|
||||
- повторные дорогие decode;
|
||||
- широкие lock-секции с `await` внутри;
|
||||
- высокочастотный подробный logging.
|
||||
|
||||
### Стабильность важнее пиков
|
||||
Архитектура приоритетно выбирает стабильную пропускную способность и предсказуемую latency, а не краткосрочные пики ценой churn и long-tail reconnect.
|
||||
|
||||
## Правила эволюции модели
|
||||
|
||||
Чтобы расширять модель безопасно:
|
||||
- новые policy knobs сначала внедрять в Control Plane;
|
||||
- контракты Data Plane (`conn_id`, route/close семантика) держать стабильными;
|
||||
- перед дефолтным включением проверять generation/registry инварианты;
|
||||
- новые recovery/retry стратегии вводить через явный config-флаг.
|
||||
|
||||
## Нюансы отказов и восстановления
|
||||
|
||||
- падение single-endpoint DC — штатный деградированный сценарий; приоритет: быстрый reconnect и, при необходимости, shadow/probing;
|
||||
- idle-close со стороны peer должен считаться нормальным событием при upstream idle-timeout;
|
||||
- backoff reconnect-логики должен ограничивать синхронный churn, но сохранять быстрые первые попытки;
|
||||
- fallback (`ME -> direct DC`) — это переключаемая policy-ветка, а не автоматический признак бага транспорта.
|
||||
|
||||
## Краткий словарь
|
||||
- `Coverage`: достаточное число живых writer-ов для политики приёма по DC.
|
||||
- `Floor`: целевая минимальная ёмкость writer-ов.
|
||||
- `Churn`: частые циклы reconnect/remove writer-ов.
|
||||
- `Hotpath`: пер-пакетный/пер-коннектный путь, где любые лишние ожидания и аллокации особенно дороги.
|
||||
@@ -2,7 +2,8 @@ use std::convert::Infallible;
|
||||
use std::net::{IpAddr, SocketAddr};
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::{AtomicU64, Ordering};
|
||||
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
|
||||
use std::time::{SystemTime, UNIX_EPOCH};
|
||||
|
||||
use http_body_util::{BodyExt, Full};
|
||||
use hyper::body::{Bytes, Incoming};
|
||||
@@ -25,6 +26,7 @@ use crate::transport::UpstreamManager;
|
||||
mod config_store;
|
||||
mod model;
|
||||
mod runtime_stats;
|
||||
mod runtime_zero;
|
||||
mod users;
|
||||
|
||||
use config_store::{current_revision, parse_if_match};
|
||||
@@ -36,8 +38,19 @@ use runtime_stats::{
|
||||
MinimalCacheEntry, build_dcs_data, build_me_writers_data, build_minimal_all_data,
|
||||
build_upstreams_data, build_zero_all_data,
|
||||
};
|
||||
use runtime_zero::{
|
||||
build_limits_effective_data, build_runtime_gates_data, build_security_posture_data,
|
||||
build_system_info_data,
|
||||
};
|
||||
use users::{create_user, delete_user, patch_user, rotate_secret, users_from_config};
|
||||
|
||||
pub(super) struct ApiRuntimeState {
|
||||
pub(super) process_started_at_epoch_secs: u64,
|
||||
pub(super) config_reload_count: AtomicU64,
|
||||
pub(super) last_config_reload_epoch_secs: AtomicU64,
|
||||
pub(super) admission_open: AtomicBool,
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(super) struct ApiShared {
|
||||
pub(super) stats: Arc<Stats>,
|
||||
@@ -50,6 +63,7 @@ pub(super) struct ApiShared {
|
||||
pub(super) mutation_lock: Arc<Mutex<()>>,
|
||||
pub(super) minimal_cache: Arc<Mutex<Option<MinimalCacheEntry>>>,
|
||||
pub(super) request_id: Arc<AtomicU64>,
|
||||
pub(super) runtime_state: Arc<ApiRuntimeState>,
|
||||
}
|
||||
|
||||
impl ApiShared {
|
||||
@@ -65,9 +79,11 @@ pub async fn serve(
|
||||
me_pool: Option<Arc<MePool>>,
|
||||
upstream_manager: Arc<UpstreamManager>,
|
||||
config_rx: watch::Receiver<Arc<ProxyConfig>>,
|
||||
admission_rx: watch::Receiver<bool>,
|
||||
config_path: PathBuf,
|
||||
startup_detected_ip_v4: Option<IpAddr>,
|
||||
startup_detected_ip_v6: Option<IpAddr>,
|
||||
process_started_at_epoch_secs: u64,
|
||||
) {
|
||||
let listener = match TcpListener::bind(listen).await {
|
||||
Ok(listener) => listener,
|
||||
@@ -83,6 +99,13 @@ pub async fn serve(
|
||||
|
||||
info!("API endpoint: http://{}/v1/*", listen);
|
||||
|
||||
let runtime_state = Arc::new(ApiRuntimeState {
|
||||
process_started_at_epoch_secs,
|
||||
config_reload_count: AtomicU64::new(0),
|
||||
last_config_reload_epoch_secs: AtomicU64::new(0),
|
||||
admission_open: AtomicBool::new(*admission_rx.borrow()),
|
||||
});
|
||||
|
||||
let shared = Arc::new(ApiShared {
|
||||
stats,
|
||||
ip_tracker,
|
||||
@@ -94,6 +117,38 @@ pub async fn serve(
|
||||
mutation_lock: Arc::new(Mutex::new(())),
|
||||
minimal_cache: Arc::new(Mutex::new(None)),
|
||||
request_id: Arc::new(AtomicU64::new(1)),
|
||||
runtime_state: runtime_state.clone(),
|
||||
});
|
||||
|
||||
let mut config_rx_reload = config_rx.clone();
|
||||
let runtime_state_reload = runtime_state.clone();
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
if config_rx_reload.changed().await.is_err() {
|
||||
break;
|
||||
}
|
||||
runtime_state_reload
|
||||
.config_reload_count
|
||||
.fetch_add(1, Ordering::Relaxed);
|
||||
runtime_state_reload
|
||||
.last_config_reload_epoch_secs
|
||||
.store(now_epoch_secs(), Ordering::Relaxed);
|
||||
}
|
||||
});
|
||||
|
||||
let mut admission_rx_watch = admission_rx.clone();
|
||||
tokio::spawn(async move {
|
||||
runtime_state
|
||||
.admission_open
|
||||
.store(*admission_rx_watch.borrow(), Ordering::Relaxed);
|
||||
loop {
|
||||
if admission_rx_watch.changed().await.is_err() {
|
||||
break;
|
||||
}
|
||||
runtime_state
|
||||
.admission_open
|
||||
.store(*admission_rx_watch.borrow(), Ordering::Relaxed);
|
||||
}
|
||||
});
|
||||
|
||||
loop {
|
||||
@@ -189,6 +244,26 @@ async fn handle(
|
||||
};
|
||||
Ok(success_response(StatusCode::OK, data, revision))
|
||||
}
|
||||
("GET", "/v1/system/info") => {
|
||||
let revision = current_revision(&shared.config_path).await?;
|
||||
let data = build_system_info_data(shared.as_ref(), cfg.as_ref(), &revision);
|
||||
Ok(success_response(StatusCode::OK, data, revision))
|
||||
}
|
||||
("GET", "/v1/runtime/gates") => {
|
||||
let revision = current_revision(&shared.config_path).await?;
|
||||
let data = build_runtime_gates_data(shared.as_ref(), cfg.as_ref());
|
||||
Ok(success_response(StatusCode::OK, data, revision))
|
||||
}
|
||||
("GET", "/v1/limits/effective") => {
|
||||
let revision = current_revision(&shared.config_path).await?;
|
||||
let data = build_limits_effective_data(cfg.as_ref());
|
||||
Ok(success_response(StatusCode::OK, data, revision))
|
||||
}
|
||||
("GET", "/v1/security/posture") => {
|
||||
let revision = current_revision(&shared.config_path).await?;
|
||||
let data = build_security_posture_data(cfg.as_ref());
|
||||
Ok(success_response(StatusCode::OK, data, revision))
|
||||
}
|
||||
("GET", "/v1/stats/summary") => {
|
||||
let revision = current_revision(&shared.config_path).await?;
|
||||
let data = SummaryData {
|
||||
@@ -441,3 +516,10 @@ async fn read_body_with_limit(body: Incoming, limit: usize) -> Result<Vec<u8>, A
|
||||
}
|
||||
Ok(collected)
|
||||
}
|
||||
|
||||
fn now_epoch_secs() -> u64 {
|
||||
SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs()
|
||||
}
|
||||
|
||||
227
src/api/runtime_zero.rs
Normal file
227
src/api/runtime_zero.rs
Normal file
@@ -0,0 +1,227 @@
|
||||
use std::sync::atomic::Ordering;
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::config::{MeFloorMode, ProxyConfig, UserMaxUniqueIpsMode};
|
||||
|
||||
use super::ApiShared;
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct SystemInfoData {
|
||||
pub(super) version: String,
|
||||
pub(super) target_arch: String,
|
||||
pub(super) target_os: String,
|
||||
pub(super) build_profile: String,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(super) git_commit: Option<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(super) build_time_utc: Option<String>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(super) rustc_version: Option<String>,
|
||||
pub(super) process_started_at_epoch_secs: u64,
|
||||
pub(super) uptime_seconds: f64,
|
||||
pub(super) config_path: String,
|
||||
pub(super) config_hash: String,
|
||||
pub(super) config_reload_count: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(super) last_config_reload_epoch_secs: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct RuntimeGatesData {
|
||||
pub(super) accepting_new_connections: bool,
|
||||
pub(super) conditional_cast_enabled: bool,
|
||||
pub(super) me_runtime_ready: bool,
|
||||
pub(super) me2dc_fallback_enabled: bool,
|
||||
pub(super) use_middle_proxy: bool,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct EffectiveTimeoutLimits {
|
||||
pub(super) client_handshake_secs: u64,
|
||||
pub(super) tg_connect_secs: u64,
|
||||
pub(super) client_keepalive_secs: u64,
|
||||
pub(super) client_ack_secs: u64,
|
||||
pub(super) me_one_retry: u8,
|
||||
pub(super) me_one_timeout_ms: u64,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct EffectiveUpstreamLimits {
|
||||
pub(super) connect_retry_attempts: u32,
|
||||
pub(super) connect_retry_backoff_ms: u64,
|
||||
pub(super) connect_budget_ms: u64,
|
||||
pub(super) unhealthy_fail_threshold: u32,
|
||||
pub(super) connect_failfast_hard_errors: bool,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct EffectiveMiddleProxyLimits {
|
||||
pub(super) floor_mode: &'static str,
|
||||
pub(super) adaptive_floor_idle_secs: u64,
|
||||
pub(super) adaptive_floor_min_writers_single_endpoint: u8,
|
||||
pub(super) adaptive_floor_recover_grace_secs: u64,
|
||||
pub(super) reconnect_max_concurrent_per_dc: u32,
|
||||
pub(super) reconnect_backoff_base_ms: u64,
|
||||
pub(super) reconnect_backoff_cap_ms: u64,
|
||||
pub(super) reconnect_fast_retry_count: u32,
|
||||
pub(super) me2dc_fallback: bool,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct EffectiveUserIpPolicyLimits {
|
||||
pub(super) mode: &'static str,
|
||||
pub(super) window_secs: u64,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct EffectiveLimitsData {
|
||||
pub(super) update_every_secs: u64,
|
||||
pub(super) me_reinit_every_secs: u64,
|
||||
pub(super) me_pool_force_close_secs: u64,
|
||||
pub(super) timeouts: EffectiveTimeoutLimits,
|
||||
pub(super) upstream: EffectiveUpstreamLimits,
|
||||
pub(super) middle_proxy: EffectiveMiddleProxyLimits,
|
||||
pub(super) user_ip_policy: EffectiveUserIpPolicyLimits,
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
pub(super) struct SecurityPostureData {
|
||||
pub(super) api_read_only: bool,
|
||||
pub(super) api_whitelist_enabled: bool,
|
||||
pub(super) api_whitelist_entries: usize,
|
||||
pub(super) api_auth_header_enabled: bool,
|
||||
pub(super) proxy_protocol_enabled: bool,
|
||||
pub(super) log_level: String,
|
||||
pub(super) telemetry_core_enabled: bool,
|
||||
pub(super) telemetry_user_enabled: bool,
|
||||
pub(super) telemetry_me_level: String,
|
||||
}
|
||||
|
||||
pub(super) fn build_system_info_data(
|
||||
shared: &ApiShared,
|
||||
_cfg: &ProxyConfig,
|
||||
revision: &str,
|
||||
) -> SystemInfoData {
|
||||
let last_reload_epoch_secs = shared
|
||||
.runtime_state
|
||||
.last_config_reload_epoch_secs
|
||||
.load(Ordering::Relaxed);
|
||||
let last_config_reload_epoch_secs = (last_reload_epoch_secs > 0).then_some(last_reload_epoch_secs);
|
||||
|
||||
let git_commit = option_env!("TELEMT_GIT_COMMIT")
|
||||
.or(option_env!("VERGEN_GIT_SHA"))
|
||||
.or(option_env!("GIT_COMMIT"))
|
||||
.map(ToString::to_string);
|
||||
let build_time_utc = option_env!("BUILD_TIME_UTC")
|
||||
.or(option_env!("VERGEN_BUILD_TIMESTAMP"))
|
||||
.map(ToString::to_string);
|
||||
let rustc_version = option_env!("RUSTC_VERSION")
|
||||
.or(option_env!("VERGEN_RUSTC_SEMVER"))
|
||||
.map(ToString::to_string);
|
||||
|
||||
SystemInfoData {
|
||||
version: env!("CARGO_PKG_VERSION").to_string(),
|
||||
target_arch: std::env::consts::ARCH.to_string(),
|
||||
target_os: std::env::consts::OS.to_string(),
|
||||
build_profile: option_env!("PROFILE").unwrap_or("unknown").to_string(),
|
||||
git_commit,
|
||||
build_time_utc,
|
||||
rustc_version,
|
||||
process_started_at_epoch_secs: shared.runtime_state.process_started_at_epoch_secs,
|
||||
uptime_seconds: shared.stats.uptime_secs(),
|
||||
config_path: shared.config_path.display().to_string(),
|
||||
config_hash: revision.to_string(),
|
||||
config_reload_count: shared.runtime_state.config_reload_count.load(Ordering::Relaxed),
|
||||
last_config_reload_epoch_secs,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn build_runtime_gates_data(shared: &ApiShared, cfg: &ProxyConfig) -> RuntimeGatesData {
|
||||
let me_runtime_ready = if !cfg.general.use_middle_proxy {
|
||||
true
|
||||
} else {
|
||||
shared
|
||||
.me_pool
|
||||
.as_ref()
|
||||
.map(|pool| pool.is_runtime_ready())
|
||||
.unwrap_or(false)
|
||||
};
|
||||
|
||||
RuntimeGatesData {
|
||||
accepting_new_connections: shared.runtime_state.admission_open.load(Ordering::Relaxed),
|
||||
conditional_cast_enabled: cfg.general.use_middle_proxy,
|
||||
me_runtime_ready,
|
||||
me2dc_fallback_enabled: cfg.general.me2dc_fallback,
|
||||
use_middle_proxy: cfg.general.use_middle_proxy,
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn build_limits_effective_data(cfg: &ProxyConfig) -> EffectiveLimitsData {
|
||||
EffectiveLimitsData {
|
||||
update_every_secs: cfg.general.effective_update_every_secs(),
|
||||
me_reinit_every_secs: cfg.general.effective_me_reinit_every_secs(),
|
||||
me_pool_force_close_secs: cfg.general.effective_me_pool_force_close_secs(),
|
||||
timeouts: EffectiveTimeoutLimits {
|
||||
client_handshake_secs: cfg.timeouts.client_handshake,
|
||||
tg_connect_secs: cfg.timeouts.tg_connect,
|
||||
client_keepalive_secs: cfg.timeouts.client_keepalive,
|
||||
client_ack_secs: cfg.timeouts.client_ack,
|
||||
me_one_retry: cfg.timeouts.me_one_retry,
|
||||
me_one_timeout_ms: cfg.timeouts.me_one_timeout_ms,
|
||||
},
|
||||
upstream: EffectiveUpstreamLimits {
|
||||
connect_retry_attempts: cfg.general.upstream_connect_retry_attempts,
|
||||
connect_retry_backoff_ms: cfg.general.upstream_connect_retry_backoff_ms,
|
||||
connect_budget_ms: cfg.general.upstream_connect_budget_ms,
|
||||
unhealthy_fail_threshold: cfg.general.upstream_unhealthy_fail_threshold,
|
||||
connect_failfast_hard_errors: cfg.general.upstream_connect_failfast_hard_errors,
|
||||
},
|
||||
middle_proxy: EffectiveMiddleProxyLimits {
|
||||
floor_mode: me_floor_mode_label(cfg.general.me_floor_mode),
|
||||
adaptive_floor_idle_secs: cfg.general.me_adaptive_floor_idle_secs,
|
||||
adaptive_floor_min_writers_single_endpoint: cfg
|
||||
.general
|
||||
.me_adaptive_floor_min_writers_single_endpoint,
|
||||
adaptive_floor_recover_grace_secs: cfg.general.me_adaptive_floor_recover_grace_secs,
|
||||
reconnect_max_concurrent_per_dc: cfg.general.me_reconnect_max_concurrent_per_dc,
|
||||
reconnect_backoff_base_ms: cfg.general.me_reconnect_backoff_base_ms,
|
||||
reconnect_backoff_cap_ms: cfg.general.me_reconnect_backoff_cap_ms,
|
||||
reconnect_fast_retry_count: cfg.general.me_reconnect_fast_retry_count,
|
||||
me2dc_fallback: cfg.general.me2dc_fallback,
|
||||
},
|
||||
user_ip_policy: EffectiveUserIpPolicyLimits {
|
||||
mode: user_max_unique_ips_mode_label(cfg.access.user_max_unique_ips_mode),
|
||||
window_secs: cfg.access.user_max_unique_ips_window_secs,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
pub(super) fn build_security_posture_data(cfg: &ProxyConfig) -> SecurityPostureData {
|
||||
SecurityPostureData {
|
||||
api_read_only: cfg.server.api.read_only,
|
||||
api_whitelist_enabled: !cfg.server.api.whitelist.is_empty(),
|
||||
api_whitelist_entries: cfg.server.api.whitelist.len(),
|
||||
api_auth_header_enabled: !cfg.server.api.auth_header.is_empty(),
|
||||
proxy_protocol_enabled: cfg.server.proxy_protocol,
|
||||
log_level: cfg.general.log_level.to_string(),
|
||||
telemetry_core_enabled: cfg.general.telemetry.core_enabled,
|
||||
telemetry_user_enabled: cfg.general.telemetry.user_enabled,
|
||||
telemetry_me_level: cfg.general.telemetry.me_level.to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
fn user_max_unique_ips_mode_label(mode: UserMaxUniqueIpsMode) -> &'static str {
|
||||
match mode {
|
||||
UserMaxUniqueIpsMode::ActiveWindow => "active_window",
|
||||
UserMaxUniqueIpsMode::TimeWindow => "time_window",
|
||||
UserMaxUniqueIpsMode::Combined => "combined",
|
||||
}
|
||||
}
|
||||
|
||||
fn me_floor_mode_label(mode: MeFloorMode) -> &'static str {
|
||||
match mode {
|
||||
MeFloorMode::Static => "static",
|
||||
MeFloorMode::Adaptive => "adaptive",
|
||||
}
|
||||
}
|
||||
@@ -555,11 +555,6 @@ impl ProxyConfig {
|
||||
warn!("prefer_ipv6 is deprecated, use [network].prefer = 6");
|
||||
}
|
||||
|
||||
// Auto-enable NAT probe when Middle Proxy is requested.
|
||||
if config.general.use_middle_proxy && !config.general.middle_proxy_nat_probe {
|
||||
config.general.middle_proxy_nat_probe = true;
|
||||
warn!("Auto-enabled middle_proxy_nat_probe for middle proxy mode");
|
||||
}
|
||||
if config.general.use_middle_proxy && !config.general.me_secret_atomic_snapshot {
|
||||
config.general.me_secret_atomic_snapshot = true;
|
||||
warn!(
|
||||
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
use std::net::SocketAddr;
|
||||
use std::sync::Arc;
|
||||
use std::time::{Duration, Instant};
|
||||
use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
|
||||
use rand::Rng;
|
||||
use tokio::net::TcpListener;
|
||||
use tokio::signal;
|
||||
@@ -369,6 +369,10 @@ async fn load_startup_proxy_config_snapshot(
|
||||
#[tokio::main]
|
||||
async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
let process_started_at = Instant::now();
|
||||
let process_started_at_epoch_secs = SystemTime::now()
|
||||
.duration_since(UNIX_EPOCH)
|
||||
.unwrap_or_default()
|
||||
.as_secs();
|
||||
let (config_path, cli_silent, cli_log_level) = parse_cli();
|
||||
|
||||
let mut config = match ProxyConfig::load(&config_path) {
|
||||
@@ -1556,6 +1560,7 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
let me_pool_api = me_pool.clone();
|
||||
let upstream_manager_api = upstream_manager.clone();
|
||||
let config_rx_api = config_rx.clone();
|
||||
let admission_rx_api = admission_rx.clone();
|
||||
let config_path_api = std::path::PathBuf::from(&config_path);
|
||||
let startup_detected_ip_v4 = detected_ip_v4;
|
||||
let startup_detected_ip_v6 = detected_ip_v6;
|
||||
@@ -1567,9 +1572,11 @@ async fn main() -> std::result::Result<(), Box<dyn std::error::Error>> {
|
||||
me_pool_api,
|
||||
upstream_manager_api,
|
||||
config_rx_api,
|
||||
admission_rx_api,
|
||||
config_path_api,
|
||||
startup_detected_ip_v4,
|
||||
startup_detected_ip_v6,
|
||||
process_started_at_epoch_secs,
|
||||
)
|
||||
.await;
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user