2026-02-18 06:01:52 +03:00
|
|
|
use std::collections::{HashMap, HashSet};
|
2026-02-14 01:36:14 +03:00
|
|
|
use std::net::SocketAddr;
|
|
|
|
|
use std::sync::Arc;
|
2026-02-17 04:16:16 +03:00
|
|
|
use std::time::{Duration, Instant};
|
2026-02-14 01:36:14 +03:00
|
|
|
|
|
|
|
|
use tracing::{debug, info, warn};
|
2026-02-15 14:02:00 +03:00
|
|
|
use rand::seq::SliceRandom;
|
2026-02-14 01:36:14 +03:00
|
|
|
|
|
|
|
|
use crate::crypto::SecureRandom;
|
2026-02-18 06:01:52 +03:00
|
|
|
use crate::network::IpFamily;
|
2026-02-14 01:36:14 +03:00
|
|
|
|
|
|
|
|
use super::MePool;
|
|
|
|
|
|
2026-02-19 14:25:39 +03:00
|
|
|
/// Seconds the health monitor sleeps between check passes; also used as the default
/// (and post-quick-burst reset) value of the per-DC reconnect backoff delay.
const HEALTH_INTERVAL_SECS: u64 = 1;
|
|
|
|
|
/// Upper bound on the number of quick-retry passes made for a DC that has no ME coverage.
const QUICK_RETRY_ATTEMPTS: u8 = 10;
|
|
|
|
|
/// Milliseconds slept after each failed quick-retry pass.
const QUICK_RETRY_DELAY_MS: u64 = 2500;
|
|
|
|
|
|
2026-02-15 14:02:00 +03:00
|
|
|
pub async fn me_health_monitor(pool: Arc<MePool>, rng: Arc<SecureRandom>, _min_connections: usize) {
|
2026-02-18 06:01:52 +03:00
|
|
|
let mut backoff: HashMap<(i32, IpFamily), u64> = HashMap::new();
|
|
|
|
|
let mut last_attempt: HashMap<(i32, IpFamily), Instant> = HashMap::new();
|
2026-02-18 19:50:16 +03:00
|
|
|
let mut inflight_single: HashSet<(i32, IpFamily)> = HashSet::new();
|
2026-02-14 01:36:14 +03:00
|
|
|
loop {
|
2026-02-19 14:25:39 +03:00
|
|
|
tokio::time::sleep(Duration::from_secs(HEALTH_INTERVAL_SECS)).await;
|
2026-02-18 19:50:16 +03:00
|
|
|
check_family(
|
|
|
|
|
IpFamily::V4,
|
|
|
|
|
&pool,
|
|
|
|
|
&rng,
|
|
|
|
|
&mut backoff,
|
|
|
|
|
&mut last_attempt,
|
|
|
|
|
&mut inflight_single,
|
|
|
|
|
)
|
|
|
|
|
.await;
|
|
|
|
|
check_family(
|
|
|
|
|
IpFamily::V6,
|
|
|
|
|
&pool,
|
|
|
|
|
&rng,
|
|
|
|
|
&mut backoff,
|
|
|
|
|
&mut last_attempt,
|
|
|
|
|
&mut inflight_single,
|
|
|
|
|
)
|
|
|
|
|
.await;
|
2026-02-18 06:01:52 +03:00
|
|
|
}
|
|
|
|
|
}
|
2026-02-15 14:02:00 +03:00
|
|
|
|
2026-02-18 06:01:52 +03:00
|
|
|
async fn check_family(
|
|
|
|
|
family: IpFamily,
|
|
|
|
|
pool: &Arc<MePool>,
|
|
|
|
|
rng: &Arc<SecureRandom>,
|
|
|
|
|
backoff: &mut HashMap<(i32, IpFamily), u64>,
|
|
|
|
|
last_attempt: &mut HashMap<(i32, IpFamily), Instant>,
|
2026-02-18 19:50:16 +03:00
|
|
|
inflight_single: &mut HashSet<(i32, IpFamily)>,
|
2026-02-18 06:01:52 +03:00
|
|
|
) {
|
|
|
|
|
let enabled = match family {
|
|
|
|
|
IpFamily::V4 => pool.decision.ipv4_me,
|
|
|
|
|
IpFamily::V6 => pool.decision.ipv6_me,
|
|
|
|
|
};
|
|
|
|
|
if !enabled {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let map = match family {
|
|
|
|
|
IpFamily::V4 => pool.proxy_map_v4.read().await.clone(),
|
|
|
|
|
IpFamily::V6 => pool.proxy_map_v6.read().await.clone(),
|
|
|
|
|
};
|
|
|
|
|
let writer_addrs: HashSet<SocketAddr> = pool
|
|
|
|
|
.writers
|
|
|
|
|
.read()
|
|
|
|
|
.await
|
|
|
|
|
.iter()
|
|
|
|
|
.map(|w| w.addr)
|
|
|
|
|
.collect();
|
2026-02-17 04:16:16 +03:00
|
|
|
|
2026-02-18 19:50:16 +03:00
|
|
|
let entries: Vec<(i32, Vec<SocketAddr>)> = map
|
|
|
|
|
.iter()
|
|
|
|
|
.map(|(dc, addrs)| {
|
|
|
|
|
let list = addrs
|
|
|
|
|
.iter()
|
|
|
|
|
.map(|(ip, port)| SocketAddr::new(*ip, *port))
|
|
|
|
|
.collect::<Vec<_>>();
|
|
|
|
|
(*dc, list)
|
|
|
|
|
})
|
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
|
|
for (dc, dc_addrs) in entries {
|
2026-02-18 06:01:52 +03:00
|
|
|
let has_coverage = dc_addrs.iter().any(|a| writer_addrs.contains(a));
|
|
|
|
|
if has_coverage {
|
2026-02-18 19:50:16 +03:00
|
|
|
inflight_single.remove(&(dc, family));
|
2026-02-18 06:01:52 +03:00
|
|
|
continue;
|
|
|
|
|
}
|
2026-02-19 14:25:39 +03:00
|
|
|
|
|
|
|
|
// Aggressive quick-retry burst: up to 10 attempts every 2.5s before falling back to exponential backoff.
|
2026-02-18 19:50:16 +03:00
|
|
|
let key = (dc, family);
|
2026-02-19 14:25:39 +03:00
|
|
|
for attempt in 0..QUICK_RETRY_ATTEMPTS {
|
|
|
|
|
let mut shuffled = dc_addrs.clone();
|
|
|
|
|
shuffled.shuffle(&mut rand::rng());
|
|
|
|
|
let mut success = false;
|
|
|
|
|
for addr in &shuffled {
|
|
|
|
|
match pool.connect_one(*addr, rng.as_ref()).await {
|
|
|
|
|
Ok(()) => {
|
|
|
|
|
info!(%addr, dc = %dc, ?family, attempt, "ME reconnected (quick burst)");
|
|
|
|
|
backoff.insert(key, HEALTH_INTERVAL_SECS);
|
|
|
|
|
last_attempt.insert(key, Instant::now());
|
|
|
|
|
inflight_single.remove(&key);
|
|
|
|
|
success = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
Err(e) => debug!(%addr, dc = %dc, error = %e, attempt, ?family, "ME reconnect failed (quick)"),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if success {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
tokio::time::sleep(Duration::from_millis(QUICK_RETRY_DELAY_MS)).await;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let delay = *backoff.get(&key).unwrap_or(&HEALTH_INTERVAL_SECS);
|
2026-02-18 06:01:52 +03:00
|
|
|
let now = Instant::now();
|
|
|
|
|
if let Some(last) = last_attempt.get(&key) {
|
|
|
|
|
if now.duration_since(*last).as_secs() < delay {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
}
|
2026-02-18 19:50:16 +03:00
|
|
|
if dc_addrs.len() == 1 {
|
|
|
|
|
// Single ME address: fast retries then slower background retries.
|
|
|
|
|
if inflight_single.contains(&key) {
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
inflight_single.insert(key);
|
|
|
|
|
let addr = dc_addrs[0];
|
|
|
|
|
let dc_id = dc;
|
|
|
|
|
let pool_clone = pool.clone();
|
|
|
|
|
let rng_clone = rng.clone();
|
|
|
|
|
let timeout = pool.me_one_timeout;
|
|
|
|
|
let quick_attempts = pool.me_one_retry.max(1);
|
|
|
|
|
tokio::spawn(async move {
|
|
|
|
|
let mut success = false;
|
|
|
|
|
for _ in 0..quick_attempts {
|
|
|
|
|
let res = tokio::time::timeout(timeout, pool_clone.connect_one(addr, rng_clone.as_ref())).await;
|
|
|
|
|
match res {
|
|
|
|
|
Ok(Ok(())) => {
|
|
|
|
|
info!(%addr, dc = %dc_id, ?family, "ME reconnected for DC coverage");
|
|
|
|
|
success = true;
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
Ok(Err(e)) => debug!(%addr, dc = %dc_id, error = %e, ?family, "ME reconnect failed"),
|
|
|
|
|
Err(_) => debug!(%addr, dc = %dc_id, ?family, "ME reconnect timed out"),
|
|
|
|
|
}
|
|
|
|
|
tokio::time::sleep(Duration::from_millis(1000)).await;
|
|
|
|
|
}
|
|
|
|
|
if success {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
let timeout_ms = timeout.as_millis();
|
|
|
|
|
warn!(
|
|
|
|
|
dc = %dc_id,
|
|
|
|
|
?family,
|
|
|
|
|
attempts = quick_attempts,
|
|
|
|
|
timeout_ms,
|
|
|
|
|
"DC={} has no ME coverage: {} tries * {} ms... retry in 5 seconds...",
|
|
|
|
|
dc_id,
|
|
|
|
|
quick_attempts,
|
|
|
|
|
timeout_ms
|
|
|
|
|
);
|
|
|
|
|
loop {
|
|
|
|
|
tokio::time::sleep(Duration::from_secs(5)).await;
|
|
|
|
|
let res = tokio::time::timeout(timeout, pool_clone.connect_one(addr, rng_clone.as_ref())).await;
|
|
|
|
|
match res {
|
|
|
|
|
Ok(Ok(())) => {
|
|
|
|
|
info!(%addr, dc = %dc_id, ?family, "ME reconnected for DC coverage");
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
Ok(Err(e)) => debug!(%addr, dc = %dc_id, error = %e, ?family, "ME reconnect failed"),
|
|
|
|
|
Err(_) => debug!(%addr, dc = %dc_id, ?family, "ME reconnect timed out"),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
// will drop inflight flag in outer loop when coverage detected
|
|
|
|
|
});
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
|
2026-02-19 14:25:39 +03:00
|
|
|
warn!(dc = %dc, delay, ?family, "DC has no ME coverage, reconnecting (backoff)...");
|
2026-02-18 06:01:52 +03:00
|
|
|
let mut shuffled = dc_addrs.clone();
|
|
|
|
|
shuffled.shuffle(&mut rand::rng());
|
|
|
|
|
let mut reconnected = false;
|
|
|
|
|
for addr in shuffled {
|
|
|
|
|
match pool.connect_one(addr, rng.as_ref()).await {
|
|
|
|
|
Ok(()) => {
|
|
|
|
|
info!(%addr, dc = %dc, ?family, "ME reconnected for DC coverage");
|
|
|
|
|
backoff.insert(key, 30);
|
|
|
|
|
last_attempt.insert(key, now);
|
|
|
|
|
reconnected = true;
|
|
|
|
|
break;
|
2026-02-17 04:16:16 +03:00
|
|
|
}
|
2026-02-18 06:01:52 +03:00
|
|
|
Err(e) => debug!(%addr, dc = %dc, error = %e, ?family, "ME reconnect failed"),
|
2026-02-14 01:36:14 +03:00
|
|
|
}
|
|
|
|
|
}
|
2026-02-18 06:01:52 +03:00
|
|
|
if !reconnected {
|
2026-02-19 14:25:39 +03:00
|
|
|
let next = (*backoff.get(&key).unwrap_or(&HEALTH_INTERVAL_SECS)).saturating_mul(2).min(60);
|
2026-02-18 06:01:52 +03:00
|
|
|
backoff.insert(key, next);
|
|
|
|
|
last_attempt.insert(key, now);
|
|
|
|
|
}
|
2026-02-14 01:36:14 +03:00
|
|
|
}
|
|
|
|
|
}
|