Reorganising the daemon startup so it doesn't fail with OTEL configured (#2934)

James Hodgkinson 2024-07-26 00:28:35 -07:00 committed by GitHub
parent 2a7a009482
commit 5313c5ffdc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 167 additions and 149 deletions
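Before the full diff, here is a minimal, self-contained sketch of the startup ordering this commit moves to: everything before the runtime exists sticks to println!/eprintln!, the tokio runtime is built first, and the tracing/OTEL subscriber is only installed from inside the runtime, since the OTLP exporter's background export tasks need a running async runtime. This is an illustration only, not the kanidmd source: it assumes the tokio, tracing and tracing-subscriber crates, the name start_daemon mirrors the diff below, and a plain fmt subscriber stands in for the real sketching::otel pipeline.

```rust
// Illustrative sketch only - not the kanidmd source.
use std::process::ExitCode;

async fn start_daemon() -> ExitCode {
    // Only now, inside the runtime, do we install the global subscriber.
    // In kanidmd this is where the OTLP-capable pipeline would be built.
    let sub = tracing_subscriber::fmt().finish();
    if tracing::subscriber::set_global_default(sub).is_err() {
        eprintln!("Error starting logger - Bailing on startup!");
        return ExitCode::FAILURE;
    }
    // From this point on tracing (and an OTEL exporter, if configured) is usable.
    tracing::info!("daemon starting");
    ExitCode::SUCCESS
}

fn main() -> ExitCode {
    // Everything up here must use println!/eprintln!: there is no subscriber
    // yet, and no runtime for an OTLP exporter to run on.
    let rt = match tokio::runtime::Builder::new_multi_thread().enable_all().build() {
        Ok(rt) => rt,
        Err(err) => {
            eprintln!("CRITICAL! Unable to start tokio runtime! {:?}", err);
            return ExitCode::FAILURE;
        }
    };
    rt.block_on(start_daemon())
}
```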

View file

@@ -2,9 +2,9 @@ Fixes #
 Checklist
-- [ ] This pr contains no AI generated code
-- [ ] cargo fmt has been run
-- [ ] cargo clippy has been run
-- [ ] cargo test has been run and passes
+- [ ] This PR contains no AI generated code
+- [ ] `cargo fmt` has been run
+- [ ] `cargo clippy` has been run
+- [ ] `cargo test` has been run and passes
 - [ ] book chapter included (if relevant)
 - [ ] design document included (if relevant)

View file

@@ -84,6 +84,10 @@ impl ScimValue {
             ScimValue::MultiComplex(a) => a.len(),
         }
     }
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
 }
 #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]

View file

@@ -30,18 +30,15 @@ pub const MAX_ATTRIBUTES_PER_SPAN: u32 = 128;
 /// This does all the startup things for the logging pipeline
 pub fn start_logging_pipeline(
-    otlp_endpoint: Option<String>,
+    otlp_endpoint: &Option<String>,
     log_filter: crate::LogLevel,
-    service_name: String,
+    service_name: &'static str,
 ) -> Result<Box<dyn Subscriber + Send + Sync>, String> {
     let forest_filter: EnvFilter = EnvFilter::builder()
         .with_default_directive(log_filter.into())
         .from_env_lossy();
     // TODO: work out how to do metrics things
-    // let meter_provider = init_metrics()
-    //     .map_err(|err| eprintln!("failed to start metrics provider: {:?}", err))?;
     match otlp_endpoint {
         Some(endpoint) => {
             // adding these filters because when you close out the process the OTLP comms layer is NOISY
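As an aside on the filter construction above, a small standalone sketch (assuming only the tracing-subscriber crate, not kanidmd's code) of how the configured level becomes the default directive while RUST_LOG can still take over at runtime; from_env_lossy skips malformed directives instead of failing.

```rust
// Standalone sketch, not kanidmd code.
use tracing_subscriber::{filter::LevelFilter, EnvFilter};

fn main() {
    // Default to INFO, but let RUST_LOG override it at runtime.
    let filter: EnvFilter = EnvFilter::builder()
        .with_default_directive(LevelFilter::INFO.into())
        .from_env_lossy();
    println!("effective filter: {}", filter);
}
```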

View file

@@ -101,7 +101,7 @@ impl TryInto<ScimEntryGeneric> for ScimSyncPerson {
     type Error = serde_json::Error;
     fn try_into(self) -> Result<ScimEntryGeneric, Self::Error> {
-        serde_json::to_value(self).and_then(|value| serde_json::from_value(value))
+        serde_json::to_value(self).and_then(serde_json::from_value)
     }
 }
@@ -227,7 +227,7 @@ impl TryInto<ScimEntryGeneric> for ScimSyncGroup {
     type Error = serde_json::Error;
     fn try_into(self) -> Result<ScimEntryGeneric, Self::Error> {
-        serde_json::to_value(self).and_then(|value| serde_json::from_value(value))
+        serde_json::to_value(self).and_then(serde_json::from_value)
     }
 }
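These conversions go through a serde_json::Value round trip; the change simply passes serde_json::from_value as a function instead of wrapping it in a closure (what clippy's redundant_closure lint flags). A tiny self-contained sketch of the same pattern, with hypothetical Source/Target types standing in for the ScimSync types:

```rust
use serde::{Deserialize, Serialize};

// Hypothetical types, only to illustrate the value round-trip conversion.
#[derive(Serialize)]
struct Source {
    name: String,
}

#[derive(Deserialize, Debug)]
struct Target {
    name: String,
}

fn convert(src: Source) -> Result<Target, serde_json::Error> {
    // `from_value` already has the shape Fn(Value) -> Result<T, Error>,
    // so it can be handed to `and_then` directly.
    serde_json::to_value(src).and_then(serde_json::from_value)
}

fn main() -> Result<(), serde_json::Error> {
    let t = convert(Source { name: "demo".into() })?;
    println!("{:?}", t);
    Ok(())
}
```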

View file

@@ -30,7 +30,7 @@ pub struct OnlineBackup {
     ///
     /// - every day at 22:00 UTC (default): `"00 22 * * *"`
     /// - every 6th hours (four times a day) at 3 minutes past the hour, :
     ///   `"03 */6 * * *"`
     ///
     /// We also support non standard cron syntax, with the following format:
     ///

View file

@@ -23,7 +23,6 @@ use std::fs::{metadata, File};
 use fs4::FileExt;
 use kanidm_proto::messages::ConsoleOutputMode;
 use sketching::otel::TracingPipelineGuard;
-use sketching::LogLevel;
 use std::io::Read;
 #[cfg(target_family = "unix")]
 use std::os::unix::fs::MetadataExt;
@@ -128,7 +127,7 @@ impl KanidmdOpt {
 /// Get information on the windows username
 #[cfg(target_family = "windows")]
 fn get_user_details_windows() {
-    debug!(
+    eprintln!(
         "Running on windows, current username is: {:?}",
         whoami::username()
     );
@@ -319,117 +318,8 @@ async fn submit_admin_req(path: &str, req: AdminTaskRequest, output_mode: Consol
     }
 }
-fn main() -> ExitCode {
-    // On linux when debug assertions are disabled, prevent ptrace
-    // from attaching to us.
-    #[cfg(all(target_os = "linux", not(debug_assertions)))]
-    if let Err(code) = prctl::set_dumpable(false) {
-        error!(?code, "CRITICAL: Unable to set prctl flags");
-        return ExitCode::FAILURE;
-    }
-    // We need enough backtrace depth to find leak sources if they exist.
-    #[cfg(feature = "dhat-heap")]
-    let _profiler = dhat::Profiler::builder().trim_backtraces(Some(40)).build();
-    // Read CLI args, determine what the user has asked us to do.
-    let opt = KanidmdParser::parse();
-    // print the app version and bail
-    if let KanidmdOpt::Version(_) = &opt.commands {
-        println!("kanidmd {}", env!("KANIDM_PKG_VERSION"));
-        return ExitCode::SUCCESS;
-    };
-    //we set up a list of these so we can set the log config THEN log out the errors.
-    let mut config_error: Vec<String> = Vec::new();
-    let mut config = Configuration::new();
-    let Ok(default_config_path) = PathBuf::from_str(env!("KANIDM_DEFAULT_CONFIG_PATH")) else {
-        eprintln!("CRITICAL: Kanidmd was not built correctly and is missing a valid KANIDM_DEFAULT_CONFIG_PATH value");
-        return ExitCode::FAILURE;
-    };
-    let maybe_config_path = if let Some(p) = opt.config_path() {
-        Some(p)
-    } else {
-        // The user didn't ask for a file, lets check if the default path exists?
-        if default_config_path.exists() {
-            // It does, lets use it.
-            Some(default_config_path)
-        } else {
-            // No default config, and no config specified, lets assume the user
-            // has selected environment variables.
-            None
-        }
-    };
-    let sconfig = match ServerConfig::new(maybe_config_path) {
-        Ok(c) => Some(c),
-        Err(e) => {
-            config_error.push(format!("Config Parse failure {:?}", e));
-            return ExitCode::FAILURE;
-        }
-    };
-    // We only allow config file for log level now.
-    let log_filter = match sconfig.as_ref() {
-        Some(val) => val.log_level.unwrap_or_default(),
-        None => LogLevel::Info,
-    };
-    println!("Log filter: {:?}", log_filter);
-    // if we have a server config and it has an otel url, then we'll start the logging pipeline
-    let otel_grpc_url = sconfig
-        .as_ref()
-        .and_then(|config| config.otel_grpc_url.clone());
-    // TODO: only send to stderr when we're not in a TTY
-    let sub = match sketching::otel::start_logging_pipeline(
-        otel_grpc_url,
-        log_filter,
-        "kanidmd".to_string(),
-    ) {
-        Err(err) => {
-            eprintln!("Error starting logger - {:} - Bailing on startup!", err);
-            return ExitCode::FAILURE;
-        }
-        Ok(val) => val,
-    };
-    if let Err(err) = tracing::subscriber::set_global_default(sub).map_err(|err| {
-        eprintln!("Error starting logger - {:} - Bailing on startup!", err);
-        ExitCode::FAILURE
-    }) {
-        return err;
-    };
-    // guard which shuts down the logging/tracing providers when we close out
-    let _otelguard = TracingPipelineGuard {};
-    // Get information on the windows username
-    #[cfg(target_family = "windows")]
-    get_user_details_windows();
-    if !config_error.is_empty() {
-        for e in config_error {
-            error!("{}", e);
-        }
-        return ExitCode::FAILURE;
-    }
-    let sconfig = match sconfig {
-        Some(val) => val,
-        None => {
-            error!("Somehow you got an empty ServerConfig after error checking?");
-            return ExitCode::FAILURE;
-        }
-    };
-    // ===========================================================================
-    // Config ready, start to setup pre-run checks.
+/// Check what we're running as and various filesystem permissions.
+fn check_file_ownership(opt: &KanidmdParser) -> Result<(), ExitCode> {
     // Get info about who we are.
     #[cfg(target_family = "unix")]
     let (cuid, ceuid) = {
@@ -447,7 +337,7 @@ fn main() -> ExitCode {
         if cuid != ceuid || cgid != cegid {
            error!("{} != {} || {} != {}", cuid, ceuid, cgid, cegid);
            error!("Refusing to run - uid and euid OR gid and egid must be consistent.");
-            return ExitCode::FAILURE;
+            return Err(ExitCode::FAILURE);
         }
         (cuid, ceuid)
     };
@@ -469,27 +359,70 @@ fn main() -> ExitCode {
             } {
                 if !kanidm_lib_file_permissions::readonly(&cfg_meta) {
                     warn!("permissions on {} may not be secure. Should be readonly to running uid. This could be a security risk ...",
                         cfg_path.to_str().unwrap_or("invalid file path"));
                 }
                 if cfg_meta.mode() & 0o007 != 0 {
                     warn!("WARNING: {} has 'everyone' permission bits in the mode. This could be a security risk ...",
                         cfg_path.to_str().unwrap_or("invalid file path")
                     );
                 }
                 if cfg_meta.uid() == cuid || cfg_meta.uid() == ceuid {
                     warn!("WARNING: {} owned by the current uid, which may allow file permission changes. This could be a security risk ...",
                         cfg_path.to_str().unwrap_or("invalid file path")
                     );
                 }
             }
         }
     }
+    Ok(())
+}
+// We have to do this because we can't use tracing until we've started the logging pipeline, and we can't start the logging pipeline until the tokio runtime's doing its thing.
+async fn start_daemon(
+    opt: KanidmdParser,
+    mut config: Configuration,
+    sconfig: ServerConfig,
+) -> ExitCode {
+    // if we have a server config and it has an OTEL URL, then we'll start the logging pipeline now.
+    // TODO: only send to stderr when we're not in a TTY
+    let sub = match sketching::otel::start_logging_pipeline(
+        &sconfig.otel_grpc_url,
+        sconfig.log_level.unwrap_or_default(),
+        "kanidmd",
+    ) {
+        Err(err) => {
+            eprintln!("Error starting logger - {:} - Bailing on startup!", err);
+            return ExitCode::FAILURE;
+        }
+        Ok(val) => val,
+    };
+    if let Err(err) = tracing::subscriber::set_global_default(sub).map_err(|err| {
+        eprintln!("Error starting logger - {:} - Bailing on startup!", err);
+        ExitCode::FAILURE
+    }) {
+        return err;
+    };
+    // ************************************************
+    // HERE'S WHERE YOU CAN START USING THE LOGGER
+    // ************************************************
+    // guard which shuts down the logging/tracing providers when we close out
+    let _otelguard = TracingPipelineGuard {};
+    // ===========================================================================
+    // Start pre-run checks
     // Check the permissions of the files from the configuration.
-    if let Some(db_path) = sconfig.db_path.clone() {
-        #[allow(clippy::expect_used)]
+    if let Err(err) = check_file_ownership(&opt) {
+        return err;
+    };
+    if let Some(db_path) = sconfig.db_path.as_ref() {
         let db_pathbuf = PathBuf::from(db_path.as_str());
         // We can't check the db_path permissions because it may not exist yet!
         if let Some(db_parent_path) = db_pathbuf.parent() {
@@ -530,7 +463,7 @@ fn main() -> ExitCode {
                 warn!("WARNING: DB folder {} has 'everyone' permission bits in the mode. This could be a security risk ...", db_par_path_buf.to_str().unwrap_or("invalid file path"));
             }
         }
-        config.update_db_path(&db_path);
+        config.update_db_path(db_path);
     } else {
         error!("No db_path set in configuration, server startup will FAIL!");
         return ExitCode::FAILURE;
@@ -557,16 +490,6 @@ fn main() -> ExitCode {
     config.update_admin_bind_path(&sconfig.adminbindpath);
     config.update_replication_config(sconfig.repl_config.clone());
-    // We always set threads to 1 unless it's the main server.
-    if matches!(&opt.commands, KanidmdOpt::Server(_)) {
-        // If not updated, will default to maximum
-        if let Some(threads) = sconfig.thread_count {
-            config.update_threads_count(threads);
-        }
-    } else {
-        config.update_threads_count(1);
-    };
     match &opt.commands {
         // we aren't going to touch the DB so we can carry on
         KanidmdOpt::ShowReplicationCertificate { .. }
@@ -582,14 +505,14 @@ fn main() -> ExitCode {
                 None => std::env::temp_dir()
                     .join("kanidmd.klock")
                     .to_str()
-                    .expect("Unable to create klock path")
+                    .expect("Unable to create klock path, this is a critical error!")
                    .to_string(),
             };
             let flock = match File::create(&klock_path) {
                 Ok(flock) => flock,
                 Err(e) => {
-                    error!("ERROR: Refusing to start - unable to create kanidm exclusive lock at {} - {:?}", klock_path, e);
+                    error!("ERROR: Refusing to start - unable to create kanidmd exclusive lock at {} - {:?}", klock_path, e);
                     return ExitCode::FAILURE;
                 }
             };
@@ -597,14 +520,108 @@ fn main() -> ExitCode {
             match flock.try_lock_exclusive() {
                 Ok(()) => debug!("Acquired kanidm exclusive lock"),
                 Err(e) => {
-                    error!("ERROR: Refusing to start - unable to lock kanidm exclusive lock at {} - {:?}", klock_path, e);
-                    error!("Is another kanidm process running?");
+                    error!("ERROR: Refusing to start - unable to lock kanidmd exclusive lock at {} - {:?}", klock_path, e);
+                    error!("Is another kanidmd process running?");
                     return ExitCode::FAILURE;
                 }
             };
         }
     }
+    kanidm_main(sconfig, config, opt).await
+}
+fn main() -> ExitCode {
+    // On linux when debug assertions are disabled, prevent ptrace
+    // from attaching to us.
+    #[cfg(all(target_os = "linux", not(debug_assertions)))]
+    if let Err(code) = prctl::set_dumpable(false) {
+        println!(
+            ?code,
+            "CRITICAL: Unable to set prctl flags, which breaches our security model, quitting!"
+        );
+        return ExitCode::FAILURE;
+    }
+    // We need enough backtrace depth to find leak sources if they exist.
+    #[cfg(feature = "dhat-heap")]
+    let _profiler = dhat::Profiler::builder().trim_backtraces(Some(40)).build();
+    // Read CLI args, determine what the user has asked us to do.
+    let opt = KanidmdParser::parse();
+    // print the app version and bail
+    if let KanidmdOpt::Version(_) = &opt.commands {
+        println!("kanidmd {}", env!("KANIDM_PKG_VERSION"));
+        return ExitCode::SUCCESS;
+    };
+    //we set up a list of these so we can set the log config THEN log out the errors.
+    let mut config_error: Vec<String> = Vec::new();
+    let mut config = Configuration::new();
+    let Ok(default_config_path) = PathBuf::from_str(env!("KANIDM_DEFAULT_CONFIG_PATH")) else {
+        println!("CRITICAL: Kanidmd was not built correctly and is missing a valid KANIDM_DEFAULT_CONFIG_PATH value");
+        return ExitCode::FAILURE;
+    };
+    let maybe_config_path = if let Some(p) = opt.config_path() {
+        Some(p)
+    } else {
+        // The user didn't ask for a file, lets check if the default path exists?
+        if default_config_path.exists() {
+            // It does, lets use it.
+            Some(default_config_path)
+        } else {
+            // No default config, and no config specified, lets assume the user
+            // has selected environment variables.
+            None
+        }
+    };
+    let sconfig = match ServerConfig::new(maybe_config_path) {
+        Ok(c) => Some(c),
+        Err(e) => {
+            config_error.push(format!("Config Parse failure {:?}", e));
+            return ExitCode::FAILURE;
+        }
+    };
+    // Get information on the windows username
+    #[cfg(target_family = "windows")]
+    get_user_details_windows();
+    if !config_error.is_empty() {
+        println!("There were errors on startup, which prevent the server from starting:");
+        for e in config_error {
+            println!(" - {}", e);
+        }
+        return ExitCode::FAILURE;
+    }
+    let sconfig = match sconfig {
+        Some(val) => val,
+        None => {
+            println!("Somehow you got an empty ServerConfig after error checking? Cannot start!");
+            return ExitCode::FAILURE;
+        }
+    };
+    // ===========================================================================
+    // Config ready
+    // We always set threads to 1 unless it's the main server.
+    if matches!(&opt.commands, KanidmdOpt::Server(_)) {
+        // If not updated, will default to maximum
+        if let Some(threads) = sconfig.thread_count {
+            config.update_threads_count(threads);
+        }
+    } else {
+        config.update_threads_count(1);
+    };
+    // Start the runtime
     let maybe_rt = tokio::runtime::Builder::new_multi_thread()
         .worker_threads(config.threads)
         .enable_all()
@@ -624,7 +641,7 @@ fn main() -> ExitCode {
         }
     };
-    rt.block_on(kanidm_main(sconfig, config, opt))
+    rt.block_on(start_daemon(opt, config, sconfig))
 }
 /// Build and execute the main server. The ServerConfig are the configuration options
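One of the pre-run checks retained in the startup path above is the exclusive lock file ("klock") that stops two kanidmd processes from starting against the same state. Below is a minimal sketch of that pattern, assuming the fs4 crate the diff already imports; the lock path here is a stand-in, not the real kanidmd klock path.

```rust
// Sketch of the single-instance lock used during daemon startup above.
use fs4::FileExt;
use std::fs::File;
use std::process::ExitCode;

fn main() -> ExitCode {
    let klock_path = std::env::temp_dir().join("example.klock");
    // Create (or reuse) the lock file itself.
    let flock = match File::create(&klock_path) {
        Ok(flock) => flock,
        Err(e) => {
            eprintln!("Unable to create exclusive lock at {:?} - {:?}", klock_path, e);
            return ExitCode::FAILURE;
        }
    };
    // Try to take the exclusive lock without blocking; a second process
    // attempting the same lock fails here instead of racing us.
    match flock.try_lock_exclusive() {
        Ok(()) => println!("Acquired exclusive lock at {:?}", klock_path),
        Err(e) => {
            eprintln!("Unable to take the exclusive lock - {:?}", e);
            eprintln!("Is another instance already running?");
            return ExitCode::FAILURE;
        }
    };
    // The lock is held for as long as `flock` stays open.
    ExitCode::SUCCESS
}
```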