diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 56532c4..d92b69c 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -115,7 +115,10 @@ jobs: - name: Setup php-fpm for Linux if: matrix.os == 'ubuntu-24.04' run: | - sudo apt-get update + sudo apt-get update + sudo apt-get install -y software-properties-common + sudo add-apt-repository -y ppa:ondrej/php + sudo apt-get update sudo apt-get install -y php${{ matrix.flag.php_version }}-fpm sudo ln -sf /usr/sbin/php-fpm${{ matrix.flag.php_version }} /usr/sbin/php-fpm @@ -171,8 +174,18 @@ jobs: # Build mixture for cargo test. - name: Docker compose run: | - docker compose up -d --wait - docker compose ps + for i in 1 2 3; do + if docker compose up -d --wait && docker compose ps; then + break + fi + echo "docker compose up failed (attempt ${i}/3), retrying in 10s..." + docker compose down --remove-orphans 2>/dev/null || true + if [ "${i}" -lt 3 ]; then + sleep 10 + else + exit 1 + fi + done # Try cargo test. - name: Cargo test diff --git a/Cargo.lock b/Cargo.lock index 8d20824..5f6ac84 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2456,7 +2456,7 @@ dependencies = [ [[package]] name = "skywalking-php" -version = "1.1.0" +version = "1.2.0" dependencies = [ "anyhow", "axum 0.8.4", @@ -2487,7 +2487,7 @@ dependencies = [ [[package]] name = "skywalking-php-worker" -version = "1.1.0" +version = "1.2.0" dependencies = [ "anyhow", "bincode", diff --git a/Cargo.toml b/Cargo.toml index 87cf48d..b5a6c0a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ members = [ ] [workspace.package] -version = "1.1.0" +version = "1.2.0" authors = ["Apache Software Foundation", "jmjoy ", "Yanlong He "] edition = "2024" rust-version = "1.85" diff --git a/README.md b/README.md index a256adb..60c8415 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,8 @@ Sky Walking logo -**SkyWalking PHP** The PHP Agent for Apache SkyWalking, which provides the native tracing abilities for PHP project. +**SkyWalking PHP** The PHP Agent for Apache SkyWalking, which provides native tracing and PHP Health Metrics (PHM) +runtime reporting for PHP projects. **SkyWalking** an APM(application performance monitor) system, especially designed for microservices, cloud native and container-based (Docker, Kubernetes, Mesos) architectures. diff --git a/docs/en/configuration/ini-settings.md b/docs/en/configuration/ini-settings.md index 214677e..c07daf5 100644 --- a/docs/en/configuration/ini-settings.md +++ b/docs/en/configuration/ini-settings.md @@ -27,3 +27,5 @@ This is the configuration list supported in `php.ini`. | skywalking_agent.instance_name | Instance name. You can set `${HOSTNAME}`, refer to [Example #1](https://www.php.net/manual/en/install.fpm.configuration.php) | | | skywalking_agent.standalone_socket_path | Unix domain socket file path of standalone skywalking php worker. Only available when `reporter_type` is `standalone`. | | | skywalking_agent.psr_logging_level | The log level reported to SkyWalking, based on PSR-3, one of `Off`, `Debug`, `Info`, Notice`, Warning`, Error`, Critical`, Alert`, Emergency`. | Off | +| skywalking_agent.metrics_enable | Enable PHP Health Metrics (PHM) meter reporting via native MeterReportService. **Linux only** (requires `/proc`). Default **On** on Linux when the agent is active; default **Off** on macOS/Windows. Set to `Off` to disable on Linux. Reports six process meters: CPU utilization, memory used/peak, virtual memory, thread count, and open FD count. See [PHP agent README](../setup/service-agent/php-agent/README.md#php-health-metrics-phm). | On (Linux); Off (other) | +| skywalking_agent.metrics_report_period | PHM meter collection interval in seconds. Process meters are sampled by the forked reporter worker via `/proc` (parent PHP process PID). **Linux only.** | 30 | diff --git a/docs/en/setup/service-agent/php-agent/README.md b/docs/en/setup/service-agent/php-agent/README.md index 5464177..cc04f11 100644 --- a/docs/en/setup/service-agent/php-agent/README.md +++ b/docs/en/setup/service-agent/php-agent/README.md @@ -113,6 +113,48 @@ Refer to the Configuration section for more configuration items. > Enabling it by default will cause extra meaningless consumption when skywalking agent is not > needed (such as simply executing a php script). +### PHP Health Metrics (PHM) + +> **Platform:** PHM process meters are **Linux only**. The forked reporter worker reads the +> parent PHP process via `/proc` (`/proc/{pid}/status`, `stat`, and `fd`). They are not available +> on macOS or Windows. Trace and other agent features are unchanged. + +When `reporter_type` is `grpc` or `kafka`, the forked reporter worker boots +`skywalking::metrics::Metricer` in `start_worker`, alongside heartbeat reporting. A background +collector samples `/proc` for the parent PHP process (`getppid()`), updates Gauges, and `Metricer` +reports meter data to OAP through the same path as traces and logs. PHM does not run when +`reporter_type = standalone`. + +PHM reports PHP runtime meters through the native Meter protocol (MeterReportService), without +requiring HTTP traffic, similar to Python PVM and Ruby runtime meters. +**PHM is enabled by default on Linux** when the agent is active (`skywalking_agent.enable = On`). +To disable it or tune the interval, use `php.ini`: + +```ini +; Disable PHM if not needed (default is On on Linux). +; skywalking_agent.metrics_enable = Off + +; Report interval in seconds (default 30). +skywalking_agent.metrics_report_period = 30 +``` + +PHM reports six process meters (aligned with OAP `php-runtime.yaml` and Horizon UI widgets): + +| Agent meter name | OAP / UI expression | Source | +| --- | --- | --- | +| `instance_php_process_cpu_utilization` | `meter_instance_php_process_cpu_utilization` | `/proc/{pid}/stat` utime+stime delta | +| `instance_php_memory_used_mb` | `meter_instance_php_memory_used_mb` | `/proc/{pid}/status` VmRSS | +| `instance_php_memory_peak_mb` | `meter_instance_php_memory_peak_mb` | `/proc/{pid}/status` VmHWM | +| `instance_php_virtual_memory_mb` | `meter_instance_php_virtual_memory_mb` | `/proc/{pid}/status` VmSize | +| `instance_php_thread_count` | `meter_instance_php_thread_count` | `/proc/{pid}/status` Threads | +| `instance_php_open_fd_count` | `meter_instance_php_open_fd_count` | `/proc/{pid}/fd` count | + +On the OAP side, activate the `php-runtime` entry in +`agent-analyzer.default.meterAnalyzerActiveFiles`. Horizon UI shows the widgets on the **General +Service → Instance** dashboard when data is available. + +See [INI Settings](../../../configuration/ini-settings.md) for all PHM options. + ## Run Start `php-fpm` server: diff --git a/src/lib.rs b/src/lib.rs index 1c6736f..c813268 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -118,6 +118,14 @@ const SKYWALKING_AGENT_STANDALONE_SOCKET_PATH: &str = "skywalking_agent.standalo /// `Info`, Notice`, Warning`, Error`, Critical`, Alert`, Emergency`. const SKYWALKING_AGENT_PSR_LOGGING_LEVEL: &str = "skywalking_agent.psr_logging_level"; +/// Whether to report PHP Health Metrics (PHM) via native meter protocol. +/// Default is **On on Linux** when the agent extension is active (`/proc` +/// sampling only); **Off** on other platforms. +const SKYWALKING_AGENT_METRICS_ENABLE: &str = "skywalking_agent.metrics_enable"; + +/// PHM report period in seconds. Meters are sampled at most once per period. +const SKYWALKING_AGENT_METRICS_REPORT_PERIOD: &str = "skywalking_agent.metrics_report_period"; + #[php_get_module] pub fn get_module() -> Module { let mut module = Module::new( @@ -214,6 +222,15 @@ pub fn get_module() -> Module { "".to_string(), Policy::System, ); + #[cfg(target_os = "linux")] + module.add_ini(SKYWALKING_AGENT_METRICS_ENABLE, true, Policy::System); + #[cfg(not(target_os = "linux"))] + module.add_ini(SKYWALKING_AGENT_METRICS_ENABLE, false, Policy::System); + module.add_ini( + SKYWALKING_AGENT_METRICS_REPORT_PERIOD, + 30i64, + Policy::System, + ); // Hooks. module.on_module_init(module::init); diff --git a/src/module.rs b/src/module.rs index c9eea05..be80743 100644 --- a/src/module.rs +++ b/src/module.rs @@ -167,6 +167,12 @@ pub static PSR_LOGGING_LEVEL: Lazy = Lazy::new(|| { .into() }); +pub static METRICS_ENABLE: Lazy = + Lazy::new(|| ini_get::(SKYWALKING_AGENT_METRICS_ENABLE)); + +pub static METRICS_REPORT_PERIOD: Lazy = + Lazy::new(|| ini_get::(SKYWALKING_AGENT_METRICS_REPORT_PERIOD)); + pub fn init() { if !is_enable() { return; @@ -193,6 +199,8 @@ pub fn init() { Lazy::force(&KAFKA_PRODUCER_CONFIG); Lazy::force(&INJECT_CONTEXT); Lazy::force(&PSR_LOGGING_LEVEL); + Lazy::force(&METRICS_ENABLE); + Lazy::force(&METRICS_REPORT_PERIOD); if let Err(err) = try_init_logger() { eprintln!("skywalking_agent: initialize logger failed: {}", err); @@ -228,7 +236,6 @@ pub fn init() { return; } - // Initialize Agent worker. init_worker(); let reporter = Arc::new(Reporter::new(&*SOCKET_FILE_PATH)); @@ -239,7 +246,11 @@ pub fn init() { reporter.clone(), )); - logger::set_global_logger(Logger::new(&*SERVICE_NAME, &*SERVICE_INSTANCE, reporter)); + logger::set_global_logger(Logger::new( + &*SERVICE_NAME, + &*SERVICE_INSTANCE, + reporter.clone(), + )); // Hook functions. register_execute_functions(); diff --git a/src/worker.rs b/src/worker.rs index c01f7c5..99036e8 100644 --- a/src/worker.rs +++ b/src/worker.rs @@ -14,9 +14,10 @@ // limitations under the License. use crate::module::{ - AUTHENTICATION, ENABLE_TLS, HEARTBEAT_PERIOD, PROPERTIES_REPORT_PERIOD_FACTOR, REPORTER_TYPE, - SERVER_ADDR, SERVICE_INSTANCE, SERVICE_NAME, SOCKET_FILE_PATH, SSL_CERT_CHAIN_PATH, - SSL_KEY_PATH, SSL_TRUSTED_CA_PATH, WORKER_THREADS, is_standalone_reporter_type, + AUTHENTICATION, ENABLE_TLS, HEARTBEAT_PERIOD, METRICS_ENABLE, METRICS_REPORT_PERIOD, + PROPERTIES_REPORT_PERIOD_FACTOR, REPORTER_TYPE, SERVER_ADDR, SERVICE_INSTANCE, SERVICE_NAME, + SOCKET_FILE_PATH, SSL_CERT_CHAIN_PATH, SSL_KEY_PATH, SSL_TRUSTED_CA_PATH, WORKER_THREADS, + is_standalone_reporter_type, }; #[cfg(feature = "kafka-reporter")] use crate::module::{KAFKA_BOOTSTRAP_SERVERS, KAFKA_PRODUCER_CONFIG}; @@ -24,6 +25,7 @@ use crate::module::{KAFKA_BOOTSTRAP_SERVERS, KAFKA_PRODUCER_CONFIG}; use skywalking_php_worker::reporter::KafkaReporterConfiguration; use skywalking_php_worker::{ HeartBeatConfiguration, WorkerConfiguration, new_tokio_runtime, + phm::PhmConfiguration, reporter::{GrpcReporterConfiguration, ReporterConfiguration}, start_worker, }; @@ -78,6 +80,7 @@ pub fn init_worker() { heartbeat_period: *HEARTBEAT_PERIOD, properties_report_period_factor: *PROPERTIES_REPORT_PERIOD_FACTOR, }), + phm: phm_configuration(), reporter_config, }; @@ -106,3 +109,20 @@ fn worker_threads() -> usize { worker_threads as usize } } + +#[cfg(target_os = "linux")] +fn phm_configuration() -> Option { + if !*METRICS_ENABLE { + return None; + } + Some(PhmConfiguration { + service_name: SERVICE_NAME.clone(), + service_instance: SERVICE_INSTANCE.clone(), + report_period_secs: *METRICS_REPORT_PERIOD, + }) +} + +#[cfg(not(target_os = "linux"))] +fn phm_configuration() -> Option { + None +} diff --git a/tests/common/mod.rs b/tests/common/mod.rs index 8083027..db2d95e 100644 --- a/tests/common/mod.rs +++ b/tests/common/mod.rs @@ -23,17 +23,10 @@ use axum::{ routing::any, }; use futures_util::future::join_all; -use libc::{SIGTERM, kill, pid_t}; +use libc::{SIGKILL, SIGTERM, kill, pid_t}; use once_cell::sync::Lazy; use std::{ - env, - fs::File, - io::{self, Cursor}, - net::SocketAddr, - process::{ExitStatus, Stdio}, - sync::Arc, - thread, - time::Duration, + env, fs::File, io::Cursor, net::SocketAddr, process::Stdio, sync::Arc, thread, time::Duration, }; use tokio::{ net::TcpStream, @@ -112,16 +105,13 @@ pub async fn teardown(fixture: Fixture) { fixture.http_server_1_handle.abort(); fixture.http_server_2_handle.abort(); - let results = join_all([ - kill_command(fixture.php_fpm_1_child), - kill_command(fixture.php_fpm_2_child), - kill_command(fixture.php_swoole_1_child), - kill_command(fixture.php_swoole_2_child), + join_all([ + stop_child(fixture.php_fpm_1_child), + stop_child(fixture.php_fpm_2_child), + stop_child(fixture.php_swoole_1_child), + stop_child(fixture.php_swoole_2_child), ]) .await; - for result in results { - assert!(result.unwrap().success()); - } } fn setup_logging() { @@ -319,8 +309,22 @@ fn setup_php_fpm(index: usize, fpm_addr: &str) -> Child { "-d", "skywalking_agent.psr_logging_level=Warning", ]; + let mut args: Vec = args.iter().map(|s| (*s).to_string()).collect(); + if index == 1 { + args.extend([ + "-d".to_owned(), + "skywalking_agent.metrics_enable=On".to_owned(), + "-d".to_owned(), + "skywalking_agent.metrics_report_period=5".to_owned(), + ]); + } else { + args.extend([ + "-d".to_owned(), + "skywalking_agent.metrics_enable=Off".to_owned(), + ]); + } info!(cmd = args.join(" "), "start command"); - let child = Command::new(args[0]) + let child = Command::new(&args[0]) .args(&args[1..]) .stdin(Stdio::null()) .stdout(File::create("/tmp/fpm-skywalking-stdout.log").unwrap()) @@ -364,6 +368,8 @@ fn setup_php_swoole(index: usize) -> Child { "skywalking_agent.enable_zend_observer={}", *ENABLE_ZEND_OBSERVER ), + "-d", + "skywalking_agent.metrics_enable=Off", &format!("tests/php/swoole/main.{}.php", index), ]; info!(cmd = args.join(" "), "start command"); @@ -378,11 +384,21 @@ fn setup_php_swoole(index: usize) -> Child { child } -async fn kill_command(mut child: Child) -> io::Result { - if let Some(id) = child.id() { - unsafe { - kill(id as pid_t, SIGTERM); +async fn stop_child(mut child: Child) { + let Some(id) = child.id() else { + let _ = child.wait().await; + return; + }; + unsafe { + kill(id as pid_t, SIGTERM); + } + match tokio::time::timeout(Duration::from_secs(5), child.wait()).await { + Ok(Ok(_)) | Ok(Err(_)) => {} + Err(_) => { + unsafe { + kill(id as pid_t, SIGKILL); + } + let _ = child.wait().await; } } - child.wait().await } diff --git a/tests/data/expected_context.yaml b/tests/data/expected_context.yaml index d67e37b..40fbc20 100644 --- a/tests/data/expected_context.yaml +++ b/tests/data/expected_context.yaml @@ -2059,3 +2059,31 @@ logItems: - {key: bar, value: 'false'} - {key: baz, value: test} layer: '' +meterItems: + - serviceName: skywalking-agent-test-1 + meterSize: ge 6 + meters: + - meterId: + name: instance_php_process_cpu_utilization + tags: [] + singleValue: ge 0 + - meterId: + name: instance_php_memory_used_mb + tags: [] + singleValue: ge 0 + - meterId: + name: instance_php_memory_peak_mb + tags: [] + singleValue: ge 0 + - meterId: + name: instance_php_virtual_memory_mb + tags: [] + singleValue: ge 0 + - meterId: + name: instance_php_thread_count + tags: [] + singleValue: ge 1 + - meterId: + name: instance_php_open_fd_count + tags: [] + singleValue: ge 1 diff --git a/tests/e2e.rs b/tests/e2e.rs index 97ba724..a76ce6c 100644 --- a/tests/e2e.rs +++ b/tests/e2e.rs @@ -74,7 +74,8 @@ async fn run_e2e() { request_swoole_2_predis().await; request_swoole_2_mongodb().await; request_swoole_2_memcache().await; - sleep(Duration::from_secs(3)).await; + // Wait for PHM meter report (metrics_report_period=5s on FPM test-1). + sleep(Duration::from_secs(8)).await; request_collector_validate().await; } diff --git a/worker/src/lib.rs b/worker/src/lib.rs index 45df23f..6d12f80 100644 --- a/worker/src/lib.rs +++ b/worker/src/lib.rs @@ -14,10 +14,12 @@ // limitations under the License. pub mod channel; +pub mod phm; pub mod reporter; use crate::{ channel::TxReporter, + phm::{PhmConfiguration, boot_phm_metrics}, reporter::{ReporterConfiguration, run_reporter}, }; use skywalking::{ @@ -44,6 +46,7 @@ use tracing::{debug, error, info}; pub struct WorkerConfiguration { pub socket_file_path: PathBuf, pub heart_beat: Option, + pub phm: Option, pub reporter_config: ReporterConfiguration, } @@ -126,9 +129,13 @@ pub async fn start_worker(config: WorkerConfiguration) -> anyhow::Result<()> { }); if let Some(heart_beat_config) = config.heart_beat { - report_properties_and_keep_alive(heart_beat_config, TxReporter(tx_)); + report_properties_and_keep_alive(heart_beat_config, TxReporter(tx_.clone())); } + let _phm_booting = config + .phm + .map(|phm_config| boot_phm_metrics(phm_config, TxReporter(tx_.clone()))); + // Run reporter with blocking. run_reporter(config.reporter_config, (), Consumer(rx)).await?; diff --git a/worker/src/phm.rs b/worker/src/phm.rs new file mode 100644 index 0000000..dc9625c --- /dev/null +++ b/worker/src/phm.rs @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Periodic PHM meter collection in the forked reporter worker. Samples the +//! parent PHP process via `/proc` and reports through `skywalking::metrics` +//! `Metricer`, booted from `start_worker` alongside heartbeat reporting. + +use crate::channel::TxReporter; +use skywalking::metrics::{ + meter::Gauge, + metricer::{Booting, Metricer}, +}; +use std::{ + fs, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + }, + time::{Duration, SystemTime, UNIX_EPOCH}, +}; +use tracing::{debug, trace, warn}; + +const METRIC_PROCESS_CPU: &str = "instance_php_process_cpu_utilization"; +const DEFAULT_CLK_TCK: f64 = 100.0; +const METRIC_MEMORY_USED_MB: &str = "instance_php_memory_used_mb"; +const METRIC_MEMORY_PEAK_MB: &str = "instance_php_memory_peak_mb"; +const METRIC_THREAD_COUNT: &str = "instance_php_thread_count"; +const METRIC_VIRTUAL_MEMORY_MB: &str = "instance_php_virtual_memory_mb"; +const METRIC_OPEN_FD_COUNT: &str = "instance_php_open_fd_count"; + +#[derive(Clone)] +pub struct PhmConfiguration { + pub service_name: String, + pub service_instance: String, + pub report_period_secs: i64, +} + +#[derive(Clone)] +struct PhmCollectorConfiguration { + report_period_secs: i64, +} + +#[derive(Clone, Default)] +pub struct PhmSamples { + memory_used_mb: Arc, + memory_peak_mb: Arc, + virtual_memory_mb: Arc, + thread_count: Arc, + open_fd_count: Arc, + process_cpu: Arc, +} + +impl PhmSamples { + fn store(cell: &AtomicU64, value: f64) { + cell.store(value.to_bits(), Ordering::Relaxed); + } + + fn gauge(cell: Arc) -> impl Fn() -> f64 + Send + Sync + 'static { + move || f64::from_bits(cell.load(Ordering::Relaxed)) + } +} + +pub fn register_gauges(metricer: &mut Metricer, samples: PhmSamples) { + metricer.register(Gauge::new( + METRIC_MEMORY_USED_MB, + PhmSamples::gauge(samples.memory_used_mb.clone()), + )); + metricer.register(Gauge::new( + METRIC_MEMORY_PEAK_MB, + PhmSamples::gauge(samples.memory_peak_mb.clone()), + )); + metricer.register(Gauge::new( + METRIC_VIRTUAL_MEMORY_MB, + PhmSamples::gauge(samples.virtual_memory_mb.clone()), + )); + metricer.register(Gauge::new( + METRIC_THREAD_COUNT, + PhmSamples::gauge(samples.thread_count.clone()), + )); + metricer.register(Gauge::new( + METRIC_OPEN_FD_COUNT, + PhmSamples::gauge(samples.open_fd_count.clone()), + )); + metricer.register(Gauge::new( + METRIC_PROCESS_CPU, + PhmSamples::gauge(samples.process_cpu.clone()), + )); +} + +struct CpuStatSample { + utime: u64, + stime: u64, + wall_ms: u128, +} + +fn update_samples(samples: &PhmSamples, cpu_sample: &mut Option) -> Option { + let pid = unsafe { libc::getppid() as i32 }; + if !process_alive(pid) { + warn!(pid, "PHM target PHP process is gone, skip sample"); + return None; + } + + let status = read_proc_status(pid); + if let Some(mb) = status.vm_rss_mb { + PhmSamples::store(&samples.memory_used_mb, mb); + } + if let Some(mb) = status.vm_hwm_mb { + PhmSamples::store(&samples.memory_peak_mb, mb); + } + if let Some(mb) = status.vm_size_mb { + PhmSamples::store(&samples.virtual_memory_mb, mb); + } + if let Some(count) = status.threads { + PhmSamples::store(&samples.thread_count, count as f64); + } + if let Some(count) = read_open_fd_count(pid) { + PhmSamples::store(&samples.open_fd_count, count); + } + if let Some((utime, stime)) = read_proc_stat_cpu(pid) { + let now_ms = current_time_millis(); + let cpu = match cpu_sample { + None => { + *cpu_sample = Some(CpuStatSample { + utime, + stime, + wall_ms: now_ms, + }); + None + } + Some(sample) => { + let delta_jiffies = + utime.saturating_sub(sample.utime) + stime.saturating_sub(sample.stime); + let delta_wall_ms = now_ms.saturating_sub(sample.wall_ms); + sample.utime = utime; + sample.stime = stime; + sample.wall_ms = now_ms; + Some(cpu_percent(delta_jiffies, delta_wall_ms)) + } + }; + if let Some(cpu) = cpu { + trace!(pid, cpu, "update PHM process CPU sample"); + PhmSamples::store(&samples.process_cpu, cpu); + } + } else { + warn!(pid, "failed to read /proc stat for PHM CPU sampling"); + } + debug!(pid, "PHM proc samples updated"); + Some(pid) +} + +/// Populate gauges once before `Metricer::boot()` so the first report is not +/// all zeros. +pub fn warmup_samples(samples: &PhmSamples) { + let mut cpu_sample = None; + update_samples(samples, &mut cpu_sample); +} + +pub fn boot_phm_metrics(config: PhmConfiguration, reporter: TxReporter) -> Booting { + let samples = PhmSamples::default(); + let report_period = Duration::from_secs(config.report_period_secs.max(1) as u64); + let collector_config = PhmCollectorConfiguration { + report_period_secs: config.report_period_secs, + }; + warmup_samples(&samples); + run_phm_collector(collector_config, samples.clone()); + let mut metricer = Metricer::new(config.service_name, config.service_instance, reporter); + metricer.set_report_interval(report_period); + register_gauges(&mut metricer, samples); + metricer.boot() +} + +fn run_phm_collector(config: PhmCollectorConfiguration, samples: PhmSamples) { + tokio::spawn(async move { + let period = Duration::from_secs(config.report_period_secs.max(1) as u64); + let mut cpu_sample: Option = None; + loop { + if update_samples(&samples, &mut cpu_sample).is_none() { + break; + } + tokio::time::sleep(period).await; + } + }); +} + +fn process_alive(pid: i32) -> bool { + fs::metadata(format!("/proc/{pid}")).is_ok() +} + +#[derive(Default)] +struct ProcStatusFields { + vm_rss_mb: Option, + vm_hwm_mb: Option, + vm_size_mb: Option, + threads: Option, +} + +fn read_proc_status(pid: i32) -> ProcStatusFields { + let Ok(content) = fs::read_to_string(format!("/proc/{pid}/status")) else { + return ProcStatusFields::default(); + }; + let mut fields = ProcStatusFields::default(); + for line in content.lines() { + if fields.vm_rss_mb.is_none() { + fields.vm_rss_mb = parse_status_kib_line(line, "VmRSS"); + } + if fields.vm_hwm_mb.is_none() { + fields.vm_hwm_mb = parse_status_kib_line(line, "VmHWM"); + } + if fields.vm_size_mb.is_none() { + fields.vm_size_mb = parse_status_kib_line(line, "VmSize"); + } + if fields.threads.is_none() { + fields.threads = parse_status_count_line(line, "Threads"); + } + if fields.vm_rss_mb.is_some() + && fields.vm_hwm_mb.is_some() + && fields.vm_size_mb.is_some() + && fields.threads.is_some() + { + break; + } + } + fields +} + +fn parse_status_kib_line(line: &str, key: &str) -> Option { + let prefix = format!("{key}:"); + if !line.starts_with(&prefix) { + return None; + } + let kb: f64 = line.split_whitespace().nth(1)?.parse().ok()?; + Some(kb / 1024.0) +} + +fn parse_status_count_line(line: &str, key: &str) -> Option { + let prefix = format!("{key}:"); + if !line.starts_with(&prefix) { + return None; + } + line.split_whitespace().nth(1)?.parse().ok() +} + +fn read_open_fd_count(pid: i32) -> Option { + let count = fs::read_dir(format!("/proc/{pid}/fd")) + .ok()? + .filter_map(|entry| entry.ok()) + .count(); + Some(count as f64) +} + +fn read_proc_stat_cpu(pid: i32) -> Option<(u64, u64)> { + let content = fs::read_to_string(format!("/proc/{pid}/stat")).ok()?; + let rparen = content.rfind(')')?; + let fields: Vec<&str> = content[rparen + 2..].split_whitespace().collect(); + let utime: u64 = fields.get(11)?.parse().ok()?; + let stime: u64 = fields.get(12)?.parse().ok()?; + Some((utime, stime)) +} + +fn cpu_percent(delta_jiffies: u64, delta_wall_ms: u128) -> f64 { + if delta_wall_ms == 0 { + return 0.0; + } + let clk_tck = unsafe { libc::sysconf(libc::_SC_CLK_TCK) }; + let clk_tck = if clk_tck > 0 { + clk_tck as f64 + } else { + warn!( + clk_tck, + "sysconf(_SC_CLK_TCK) unavailable, using default {DEFAULT_CLK_TCK}" + ); + DEFAULT_CLK_TCK + }; + let cpu_sec = delta_jiffies as f64 / clk_tck; + let wall_sec = delta_wall_ms as f64 / 1000.0; + cpu_sec / wall_sec * 100.0 +} + +fn current_time_millis() -> u128 { + SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis()) + .unwrap_or_default() +}