Files
telemt/src/transport/middle_proxy/health.rs
T

179 lines
5.7 KiB
Rust
Raw Normal View History

2026-02-24 22:59:59 +03:00
use std::collections::HashMap;
2026-02-14 01:36:14 +03:00
use std::net::SocketAddr;
use std::sync::Arc;
2026-02-17 04:16:16 +03:00
use std::time::{Duration, Instant};
2026-02-14 01:36:14 +03:00
use tracing::{debug, info, warn};
2026-02-19 15:39:30 +03:00
use rand::Rng;
2026-02-14 01:36:14 +03:00
use crate::crypto::SecureRandom;
2026-02-18 06:01:52 +03:00
use crate::network::IpFamily;
2026-02-14 01:36:14 +03:00
use super::MePool;
/// Seconds the health monitor sleeps before each pass over both IP families.
const HEALTH_INTERVAL_SECS: u64 = 1;
2026-02-19 15:39:30 +03:00
/// Divisor applied to the current backoff (in milliseconds) to obtain the
/// upper bound of the random jitter added to it. Despite the `NUM` suffix it is
/// used as a denominator: with a value of 2 the jitter is at most half the backoff.
const JITTER_FRAC_NUM: u64 = 2; // jitter up to 50% of backoff
2026-02-24 03:40:59 +03:00
#[allow(dead_code)]
2026-02-19 16:02:50 +03:00
/// Default per-DC reconnect concurrency limit.
/// NOTE(review): not referenced by the code visible in this file — the limit
/// actually applied is read from `pool.me_reconnect_max_concurrent_per_dc`;
/// confirm whether this constant is still needed.
const MAX_CONCURRENT_PER_DC_DEFAULT: usize = 1;
2026-02-15 14:02:00 +03:00
pub async fn me_health_monitor(pool: Arc<MePool>, rng: Arc<SecureRandom>, _min_connections: usize) {
2026-02-18 06:01:52 +03:00
let mut backoff: HashMap<(i32, IpFamily), u64> = HashMap::new();
2026-02-19 15:39:30 +03:00
let mut next_attempt: HashMap<(i32, IpFamily), Instant> = HashMap::new();
2026-02-19 16:02:50 +03:00
let mut inflight: HashMap<(i32, IpFamily), usize> = HashMap::new();
2026-02-14 01:36:14 +03:00
loop {
tokio::time::sleep(Duration::from_secs(HEALTH_INTERVAL_SECS)).await;
2026-02-18 19:50:16 +03:00
check_family(
IpFamily::V4,
&pool,
&rng,
&mut backoff,
2026-02-19 15:39:30 +03:00
&mut next_attempt,
2026-02-19 16:02:50 +03:00
&mut inflight,
2026-02-18 19:50:16 +03:00
)
.await;
check_family(
IpFamily::V6,
&pool,
&rng,
&mut backoff,
2026-02-19 15:39:30 +03:00
&mut next_attempt,
2026-02-19 16:02:50 +03:00
&mut inflight,
2026-02-18 19:50:16 +03:00
)
.await;
2026-02-18 06:01:52 +03:00
}
}
2026-02-15 14:02:00 +03:00
2026-02-18 06:01:52 +03:00
async fn check_family(
family: IpFamily,
pool: &Arc<MePool>,
rng: &Arc<SecureRandom>,
backoff: &mut HashMap<(i32, IpFamily), u64>,
2026-02-19 15:39:30 +03:00
next_attempt: &mut HashMap<(i32, IpFamily), Instant>,
2026-02-19 16:02:50 +03:00
inflight: &mut HashMap<(i32, IpFamily), usize>,
2026-02-18 06:01:52 +03:00
) {
let enabled = match family {
IpFamily::V4 => pool.decision.ipv4_me,
IpFamily::V6 => pool.decision.ipv6_me,
};
if !enabled {
return;
}
let map = match family {
IpFamily::V4 => pool.proxy_map_v4.read().await.clone(),
IpFamily::V6 => pool.proxy_map_v6.read().await.clone(),
};
2026-02-24 22:59:59 +03:00
let mut dc_endpoints = HashMap::<i32, Vec<SocketAddr>>::new();
for (dc, addrs) in map {
let entry = dc_endpoints.entry(dc.abs()).or_default();
for (ip, port) in addrs {
entry.push(SocketAddr::new(ip, port));
}
}
for endpoints in dc_endpoints.values_mut() {
endpoints.sort_unstable();
endpoints.dedup();
}
let mut live_addr_counts = HashMap::<SocketAddr, usize>::new();
for writer in pool
2026-02-18 06:01:52 +03:00
.writers
.read()
.await
.iter()
2026-02-24 00:04:12 +03:00
.filter(|w| !w.draining.load(std::sync::atomic::Ordering::Relaxed))
2026-02-24 22:59:59 +03:00
{
*live_addr_counts.entry(writer.addr).or_insert(0) += 1;
}
2026-02-18 19:50:16 +03:00
2026-02-24 22:59:59 +03:00
for (dc, endpoints) in dc_endpoints {
if endpoints.is_empty() {
2026-02-18 06:01:52 +03:00
continue;
}
2026-02-24 22:59:59 +03:00
let required = MePool::required_writers_for_dc(endpoints.len());
let alive = endpoints
.iter()
.map(|addr| *live_addr_counts.get(addr).unwrap_or(&0))
.sum::<usize>();
if alive >= required {
continue;
}
let missing = required - alive;
2026-02-18 19:50:16 +03:00
let key = (dc, family);
2026-02-18 06:01:52 +03:00
let now = Instant::now();
2026-02-24 05:57:53 +03:00
if let Some(ts) = next_attempt.get(&key)
&& now < *ts
{
continue;
2026-02-18 06:01:52 +03:00
}
2026-02-18 19:50:16 +03:00
2026-02-19 16:02:50 +03:00
let max_concurrent = pool.me_reconnect_max_concurrent_per_dc.max(1) as usize;
if *inflight.get(&key).unwrap_or(&0) >= max_concurrent {
return;
}
*inflight.entry(key).or_insert(0) += 1;
2026-02-24 22:59:59 +03:00
let mut restored = 0usize;
for _ in 0..missing {
let res = tokio::time::timeout(
pool.me_one_timeout,
pool.connect_endpoints_round_robin(&endpoints, rng.as_ref()),
)
.await;
2026-02-19 15:39:30 +03:00
match res {
2026-02-24 22:59:59 +03:00
Ok(true) => {
restored += 1;
2026-02-19 15:49:35 +03:00
pool.stats.increment_me_reconnect_success();
2026-02-17 04:16:16 +03:00
}
2026-02-24 22:59:59 +03:00
Ok(false) => {
pool.stats.increment_me_reconnect_attempt();
debug!(dc = %dc, ?family, "ME round-robin reconnect failed")
}
Err(_) => {
2026-02-19 15:49:35 +03:00
pool.stats.increment_me_reconnect_attempt();
2026-02-24 22:59:59 +03:00
debug!(dc = %dc, ?family, "ME reconnect timed out");
2026-02-19 15:49:35 +03:00
}
2026-02-14 01:36:14 +03:00
}
}
2026-02-24 22:59:59 +03:00
let now_alive = alive + restored;
if now_alive >= required {
info!(
dc = %dc,
?family,
alive = now_alive,
required,
endpoint_count = endpoints.len(),
"ME writer floor restored for DC"
);
backoff.insert(key, pool.me_reconnect_backoff_base.as_millis() as u64);
let jitter = pool.me_reconnect_backoff_base.as_millis() as u64 / JITTER_FRAC_NUM;
let wait = pool.me_reconnect_backoff_base
+ Duration::from_millis(rand::rng().random_range(0..=jitter.max(1)));
next_attempt.insert(key, now + wait);
} else {
2026-02-19 15:39:30 +03:00
let curr = *backoff.get(&key).unwrap_or(&(pool.me_reconnect_backoff_base.as_millis() as u64));
let next_ms = (curr.saturating_mul(2)).min(pool.me_reconnect_backoff_cap.as_millis() as u64);
backoff.insert(key, next_ms);
let jitter = next_ms / JITTER_FRAC_NUM;
let wait = Duration::from_millis(next_ms)
+ Duration::from_millis(rand::rng().random_range(0..=jitter.max(1)));
next_attempt.insert(key, now + wait);
2026-02-24 22:59:59 +03:00
warn!(
dc = %dc,
?family,
alive = now_alive,
required,
endpoint_count = endpoints.len(),
backoff_ms = next_ms,
"DC writer floor is below required level, scheduled reconnect"
);
2026-02-18 06:01:52 +03:00
}
2026-02-19 16:02:50 +03:00
if let Some(v) = inflight.get_mut(&key) {
*v = v.saturating_sub(1);
}
2026-02-14 01:36:14 +03:00
}
}