From cdbaefe23db8bc264cd06a77190384e6ce0aadd1 Mon Sep 17 00:00:00 2001 From: Firstyear Date: Wed, 7 Feb 2024 13:11:55 +1000 Subject: [PATCH] Fix for incorrect domain migration rollbacks (#2482) --- examples/insecure_server.toml | 4 +- proto/src/v1.rs | 1 + server/lib/src/constants/entries.rs | 2 +- server/lib/src/constants/mod.rs | 2 +- server/lib/src/plugins/dyngroup.rs | 6 ++ server/lib/src/server/migrations.rs | 87 +++++++++++++++++++++-------- server/lib/src/server/mod.rs | 8 ++- 7 files changed, 82 insertions(+), 28 deletions(-) diff --git a/examples/insecure_server.toml b/examples/insecure_server.toml index 150399154..a05e5c77a 100644 --- a/examples/insecure_server.toml +++ b/examples/insecure_server.toml @@ -12,8 +12,8 @@ tls_client_ca = "/tmp/kanidm/client_ca" # NOTE: this is overridden by environment variables at runtime # Defaults to "info" # -# log_level = "info" -log_level = "debug" +log_level = "info" +# log_level = "debug" # log_level = "trace" # otel_grpc_url = "http://localhost:4317" diff --git a/proto/src/v1.rs b/proto/src/v1.rs index e41332342..3e7676037 100644 --- a/proto/src/v1.rs +++ b/proto/src/v1.rs @@ -301,6 +301,7 @@ pub enum OperationError { // Migration MG0001InvalidReMigrationLevel, MG0002RaiseDomainLevelExceedsMaximum, + MG0003ServerPhaseInvalidForMigration, } impl PartialEq for OperationError { diff --git a/server/lib/src/constants/entries.rs b/server/lib/src/constants/entries.rs index 3d4c75953..1cbb2ebe9 100644 --- a/server/lib/src/constants/entries.rs +++ b/server/lib/src/constants/entries.rs @@ -750,7 +750,7 @@ lazy_static! { Attribute::Description, Value::new_utf8s("System (local) info and metadata object.") ), - (Attribute::Version, Value::Uint32(18)) + (Attribute::Version, Value::Uint32(19)) ); pub static ref E_DOMAIN_INFO_V1: EntryInitNew = entry_init!( diff --git a/server/lib/src/constants/mod.rs b/server/lib/src/constants/mod.rs index 85c3be200..2880d0f2c 100644 --- a/server/lib/src/constants/mod.rs +++ b/server/lib/src/constants/mod.rs @@ -19,7 +19,7 @@ pub use crate::constants::values::*; use std::time::Duration; // Increment this as we add new schema types and values!!! -pub const SYSTEM_INDEX_VERSION: i64 = 30; +pub const SYSTEM_INDEX_VERSION: i64 = 31; /* * domain functional levels diff --git a/server/lib/src/plugins/dyngroup.rs b/server/lib/src/plugins/dyngroup.rs index 33df00c45..458796b1e 100644 --- a/server/lib/src/plugins/dyngroup.rs +++ b/server/lib/src/plugins/dyngroup.rs @@ -5,6 +5,7 @@ use kanidm_proto::v1::Filter as ProtoFilter; use crate::filter::FilterInvalid; use crate::prelude::*; +use crate::server::ServerPhase; #[derive(Clone, Default)] pub struct DynGroupCache { @@ -35,6 +36,11 @@ impl DynGroup { } */ + if qs.get_phase() < ServerPhase::SchemaReady { + trace!("Server is not ready to load dyngroups"); + return Ok(()); + } + // Search all the new groups first. let filt = filter!(FC::Or( n_dyn_groups diff --git a/server/lib/src/server/migrations.rs b/server/lib/src/server/migrations.rs index 3978e050c..2ac2befb8 100644 --- a/server/lib/src/server/migrations.rs +++ b/server/lib/src/server/migrations.rs @@ -38,25 +38,6 @@ impl QueryServer { .initialise_schema_core() .and_then(|_| write_txn.reload())?; - write_txn - .initialise_schema_idm() - .and_then(|_| write_txn.reload())?; - - // reindex and set to version + 1, this way when we bump the version - // we are essetially pushing this version id back up to step write_1 - write_txn - .upgrade_reindex(SYSTEM_INDEX_VERSION + 1) - .and_then(|_| write_txn.reload())?; - - // Force the schema to reload - this is so that any changes to index slope - // analysis that was performed during the reindex are now reflected correctly - // in the in-memory schema cache. - // - // A side effect of these reloads is that other plugins or elements that reload - // on schema change are now setup. - - write_txn.set_phase(ServerPhase::SchemaReady); - write_txn.force_schema_reload(); write_txn.reload()?; // Now, based on the system version apply migrations. You may ask "should you not @@ -118,6 +99,10 @@ impl QueryServer { } if system_info_version < 17 { + write_txn.initialise_schema_idm()?; + + write_txn.reload()?; + write_txn.migrate_16_to_17()?; } @@ -130,9 +115,13 @@ impl QueryServer { // the domain migrations kick in again. write_txn.initialise_idm()?; } + + if system_info_version < 19 { + write_txn.migrate_18_to_19()?; + } } - // Reload if anything in migrations requires it. + // Reload if anything in the (older) system migrations requires it. write_txn.reload()?; // This is what tells us if the domain entry existed before or not. @@ -144,11 +133,23 @@ impl QueryServer { info!(?db_domain_version, "Before setting internal domain info"); - // Migrations complete. Init idm will now set the system config version and minimum domain + // No domain info was present, so neither was the rest of the IDM. We need to bootstrap + // the base-schema here. + if db_domain_version == 0 { + write_txn.initialise_schema_idm()?; + + write_txn.reload()?; + } + + // Indicate the schema is now ready, which allows dyngroups to work. + write_txn.set_phase(ServerPhase::SchemaReady); + + // Init idm will now set the system config version and minimum domain // level if none was present write_txn.initialise_domain_info()?; - // No domain info was present, so neither was the rest of the IDM. + // No domain info was present, so neither was the rest of the IDM. We need to bootstrap + // the base entries here. if db_domain_version == 0 { write_txn.initialise_idm()?; } @@ -186,6 +187,20 @@ impl QueryServer { // Reload if anything in migrations requires it - this triggers the domain migrations. write_txn.reload()?; + // reindex and set to version + 1, this way when we bump the version + // we are essetially pushing this version id back up to step write_1 + write_txn + .upgrade_reindex(SYSTEM_INDEX_VERSION + 1) + .and_then(|_| write_txn.reload())?; + + // Force the schema to reload - this is so that any changes to index slope + // analysis that was performed during the reindex are now reflected correctly + // in the in-memory schema cache. + // + // A side effect of these reloads is that other plugins or elements that reload + // on schema change are now setup. + write_txn.force_schema_reload(); + // We are ready to run write_txn.set_phase(ServerPhase::Running); @@ -676,6 +691,34 @@ impl<'a> QueryServerWriteTransaction<'a> { Ok(()) } + #[instrument(level = "info", skip_all)] + /// Automate fix for #2470 - force the domain version to be lowered, to allow + /// it to re-raise and force re-run migrations. This is because we accidentally + /// were "overwriting" the changes from domain migrations on startup due to + /// a logic error. At this point in the startup, the server phase is lower than + /// domain info ready, so the change won't immediately trigger remigrations. Rather + /// it will force them later in the startup. + pub fn migrate_18_to_19(&mut self) -> Result<(), OperationError> { + admin_warn!("starting 18 to 19 migration."); + + debug_assert!(*self.phase < ServerPhase::DomainInfoReady); + if *self.phase >= ServerPhase::DomainInfoReady { + error!("Unable to perform system migration as server phase is greater or equal to domain info ready"); + return Err(OperationError::MG0003ServerPhaseInvalidForMigration); + }; + + self.internal_modify_uuid( + UUID_DOMAIN_INFO, + &ModifyList::new_purge_and_set(Attribute::Version, Value::new_uint32(DOMAIN_LEVEL_2)), + ) + .map(|()| { + warn!( + "Domain level has been temporarily lowered to {}", + DOMAIN_LEVEL_2 + ); + }) + } + #[instrument(level = "info", skip_all)] /// This migration will /// * Trigger a "once off" mfa account policy rule on all persons. diff --git a/server/lib/src/server/mod.rs b/server/lib/src/server/mod.rs index d515b3a50..692f2972a 100644 --- a/server/lib/src/server/mod.rs +++ b/server/lib/src/server/mod.rs @@ -53,8 +53,8 @@ const RESOLVE_FILTER_CACHE_LOCAL: usize = 0; pub type ResolveFilterCacheReadTxn<'a> = ARCacheReadTxn<'a, (IdentityId, Filter), Filter, ()>; -#[derive(Debug, Clone, PartialOrd, PartialEq, Eq)] -enum ServerPhase { +#[derive(Debug, Clone, Copy, PartialOrd, PartialEq, Eq)] +pub(crate) enum ServerPhase { Bootstrap, SchemaReady, DomainInfoReady, @@ -1825,6 +1825,10 @@ impl<'a> QueryServerWriteTransaction<'a> { *self.phase = phase } + pub(crate) fn get_phase(&mut self) -> ServerPhase { + *self.phase + } + pub(crate) fn reload(&mut self) -> Result<(), OperationError> { // First, check if the domain version has changed. if self.changed_domain {