From 040e9fd3524f702dc4699c35ce460c844b194905 Mon Sep 17 00:00:00 2001 From: Firstyear Date: Fri, 2 Jul 2021 14:50:56 +1000 Subject: [PATCH] Add statistical analysis to indexes (#505) --- CONTRIBUTORS.md | 2 + kanidmd/src/lib/be/idl_arc_sqlite.rs | 264 +++++++++++++++++++- kanidmd/src/lib/be/idl_sqlite.rs | 113 ++++++++- kanidmd/src/lib/be/idxkey.rs | 18 ++ kanidmd/src/lib/be/mod.rs | 308 +++++++++++++++++------- kanidmd/src/lib/entry.rs | 59 +++-- kanidmd/src/lib/filter.rs | 347 ++++++++++++++++----------- kanidmd/src/lib/schema.rs | 2 +- kanidmd/src/lib/server.rs | 18 +- 9 files changed, 867 insertions(+), 264 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 8872aba9d..3d4f4ebfa 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -18,3 +18,5 @@ ## Acknowledgements * M. Gerstner +* Perri Boulton + diff --git a/kanidmd/src/lib/be/idl_arc_sqlite.rs b/kanidmd/src/lib/be/idl_arc_sqlite.rs index 50f9023ec..301f86672 100644 --- a/kanidmd/src/lib/be/idl_arc_sqlite.rs +++ b/kanidmd/src/lib/be/idl_arc_sqlite.rs @@ -2,16 +2,22 @@ use crate::audit::AuditScope; use crate::be::idl_sqlite::{ IdlSqlite, IdlSqliteReadTransaction, IdlSqliteTransaction, IdlSqliteWriteTransaction, }; -use crate::be::idxkey::{IdlCacheKey, IdlCacheKeyRef, IdlCacheKeyToRef}; +use crate::be::idxkey::{ + IdlCacheKey, IdlCacheKeyRef, IdlCacheKeyToRef, IdxKey, IdxKeyRef, IdxKeyToRef, IdxSlope, +}; use crate::be::{BackendConfig, IdList, IdRawEntry}; use crate::entry::{Entry, EntryCommitted, EntrySealed}; use crate::value::IndexType; use crate::value::Value; + use concread::arcache::{ARCache, ARCacheReadTxn, ARCacheWriteTxn}; use concread::cowcell::*; use idlset::{v2::IDLBitRange, AndNot}; use kanidm_proto::v1::{ConsistencyError, OperationError}; + +use hashbrown::HashMap; use std::collections::BTreeSet; +use std::convert::TryInto; use std::ops::DerefMut; use std::time::Duration; use uuid::Uuid; @@ -800,6 +806,262 @@ impl<'a> IdlArcSqliteWriteTransaction<'a> { }) } + pub fn 
is_idx_slopeyness_generated( + &self, + audit: &mut AuditScope, + ) -> Result { + self.db.is_idx_slopeyness_generated(audit) + } + + pub fn get_idx_slope( + &self, + audit: &mut AuditScope, + ikey: &IdxKey, + ) -> Result, OperationError> { + self.db.get_idx_slope(audit, ikey) + } + + /// Index Slope Analysis. For the purpose of external modules you can consider this as a + /// module that generates "weights" for each index that we have. Smaller values are faster + /// indexes - larger values are more costly ones. This is not intended to yield perfect + /// weights. The intent is to seperate over obviously more effective indexes rather than + /// to min-max the fine tuning of these. Consider name=foo vs class=*. name=foo will always + /// be better than class=*, but comparing name=foo to spn=foo is "much over muchness" since + /// both are really fast. + pub fn analyse_idx_slopes(&mut self, audit: &mut AuditScope) -> Result<(), OperationError> { + /* + * Inside of this analysis there are two major factors we need to understand + * + * * What is the variation of idl lengths within an index? + * * How man keys are stored in this index? + * + * Since we have the filter2idl threshold, we want to find "what is the smallest + * and most unique index asap so we can exit faster". This allows us to avoid + * loading larger most costly indexs that either have large idls, high variation + * or few keys and are likely to miss and have to go out to disk. + * + * A few methods were proposed, but thanks to advice from Perri Boulton (psychology + * researcher with a background in statistics), we were able to device a reasonable + * approach. + * + * These are commented in line to help understand the process. + */ + + /* + * Step 1 - we have an index like "idx_eq_member". It has data that looks somewhat + * like: + * + * key | idl + * -------+------------ + * uuid_a | [1, 2, 3, ...] + * -------+------------ + * uuid_b | [4, 5, 6, ...] 
+ * + * We need to collect this into a single vec of "how long is each idl". Since we have + * each idl in the vec, the length of the vec is also the number of keys in the set. + * This yields for us: + * + * idx_eq_member: [4.0, 5.0, ...] + * where each f64 value is the float representation of the length of idl. + * + * We then assemble these to a map so we have each idxkey and it's associated list + * of idl lens. + */ + + let mut data: HashMap> = HashMap::new(); + self.idl_cache.iter_dirty().for_each(|(k, maybe_idl)| { + if let Some(idl) = maybe_idl { + let idl_len: u32 = idl.len().try_into().unwrap_or(u32::MAX); + // Convert to something we can use. + let idl_len = f64::from(idl_len); + + let kref = IdxKeyRef::new(&k.a, &k.i); + if idl_len > 0.0 { + // It's worth looking at. Anything len 0 will be removed. + if let Some(lens) = data.get_mut(&kref as &dyn IdxKeyToRef) { + lens.push(idl_len) + } else { + data.insert(kref.to_key(), vec![idl_len]); + } + } + } + }); + + /* + * So now for each of our sets: + * + * idx_eq_member: [4.0, 5.0, ...] + * idx_eq_name : [1.0, 1.0, 1.0, ...] + * + * To get the variability, we calculate the normal distribution of the set of values + * and then using this variance we use the 1st deviation (~85%) value to assert that + * 85% or more of the values in this set will be "equal or less" than this length.* + * + * So given say: + * [1.0, 1.0, 1.0, 1.0] + * We know that the sd_1 will be 1.0. Given: + * [1.0, 1.0, 2.0, 3.0] + * We know that it will be ~2.57 (mean 1.75 + sd of 0.82). + * + * The other factor is number of keys. This is thankfully easy! We have that from + * vec.len(). + * + * We can now calculate the index slope. Why is it a slope you ask? Because we + * plot the data out on a graph, with "variability" on the y axis, and number of + * keys on the x. + * + * Lets plot our data we just added. 
+ * + * | + * 4 + + * | + * 3 + + * | + * 2 + * eq_member + * | + * 1 + * eq_name + * | + * +--+--+--+--+-- + * 1 2 3 4 + * + * Now, if we were to connect a line from (0,0) to each point we get a line with an angle. + * + * | + * 4 + + * | + * 3 + + * | + * 2 + * eq_member + * | + * 1 + * eq_name + * |/---------/ + * +--+--+--+--+-- + * 1 2 3 4 + + * | + * 4 + + * | + * 3 + + * | + * 2 + * eq_member + * | /--/ + * 1 + /--/ * eq_name + * |/--/ + * +--+--+--+--+-- + * 1 2 3 4 + * + * (Look it's ascii art, don't judge.). + * + * Point is that eq_member is "steeper" and eq_name is "shallower". This is what we call + * the "slopeyness" aka the jank of the line, or more precisely, the angle. + * + * Now we need a way to numerically compare these lines. Since the points could be + * anywere on our graph: + * + * | + * 4 + * + * | + * 3 + * + * | + * 2 + * + * | + * 1 + * + * | + * +--+--+--+--+-- + * 1 2 3 4 + * + * While we can see what's obvious or best here, a computer has to know it. So we now + * assume that these points construct a triangle, going through (0,0), (x, 0) and (x, y). + * + * + * Λ│ + * ╱ │ + * ╱ │ + * ╱ │ + * ╱ │ + * ╱ │ + * ╱ │ + * ╱ │ sd_1 + * ╱ │ + * ╱ │ + * ───────────┼ + * nkeys + * + * Since this is right angled we can use arctan to work out the degress of the line. This + * gives us a value from 1.0 to 90.0 (We clamp to a minimum of 1.0, because we use 0 as "None" + * in the NonZeroU8 type in filter.rs, which allows ZST optimisation) + * + * The problem is that we have to go from float to u8 - this means we lose decimal precision + * in the conversion. To lessen this, we multiply by 2 to give some extra weight to each angle + * to minimise this loss and then we convert. + * + * And there we have it! A slope factor of the index! A way to compare these sets quickly + * at query optimisation time to minimse index access. 
+ */ + let slopes: HashMap<_, _> = data + .into_iter() + .filter_map(|(k, lens)| { + let slope_factor = Self::calculate_sd_slope(lens); + if slope_factor == 0 || slope_factor == IdxSlope::MAX { + None + } else { + Some((k, slope_factor)) + } + }) + .collect(); + ltrace!(audit, "Generated slopes -> {:?}", slopes); + // Write the data down + self.db.store_idx_slope_analysis(audit, &slopes) + } + + fn calculate_sd_slope(data: Vec) -> IdxSlope { + let (n_keys, sd_1) = if data.len() >= 2 { + // We can only do SD on sets greater than 2 + let l: u32 = data.len().try_into().unwrap_or(u32::MAX); + let c = f64::from(l); + let mean = data.iter().take(u32::MAX as usize).sum::() / c; + let varience: f64 = data + .iter() + .take(u32::MAX as usize) + .map(|len| { + let delta = mean - len; + delta * delta + }) + .sum::() + / (c - 1.0); + + let sd = varience.sqrt(); + + // This is saying ~85% of values will be at least this len or less. + let sd_1 = mean + sd; + (c, sd_1) + } else if data.len() == 1 { + (1.0, data[0]) + } else { + // Cant resolve. + return IdxSlope::MAX; + }; + + // Now we know sd_1 and number of keys. We can use this as a triangle to work out + // the angle along the hypotenuse. We use this angle - or slope - to show which + // elements have the smallest sd_1 and most keys available. Then because this + // is bound between 0.0 -> 90.0, we "unfurl" this around a half circle by multipling + // by 2. This gives us a little more precision when we drop the decimal point. + let sf = (sd_1 / n_keys).atan().to_degrees() * 2.8; + + // Now these are fractions, and we can't use those in u8, so we clamp the min/max values + // that we expect to be yielded. + let sf = sf.clamp(1.0, 254.0); + if !sf.is_finite() { + IdxSlope::MAX + } else { + // SAFETY + // `sf` is clamped between 1.0 and 180.0 above, ensuring it is + // always in range. 
+ unsafe { sf.to_int_unchecked::() } + } + } + pub fn create_name2uuid(&self, audit: &mut AuditScope) -> Result<(), OperationError> { self.db.create_name2uuid(audit) } diff --git a/kanidmd/src/lib/be/idl_sqlite.rs b/kanidmd/src/lib/be/idl_sqlite.rs index 51ae6d968..9a8cc82a9 100644 --- a/kanidmd/src/lib/be/idl_sqlite.rs +++ b/kanidmd/src/lib/be/idl_sqlite.rs @@ -1,7 +1,8 @@ use crate::audit::AuditScope; -use crate::be::{BackendConfig, IdList, IdRawEntry}; +use crate::be::{BackendConfig, IdList, IdRawEntry, IdxKey, IdxSlope}; use crate::entry::{Entry, EntryCommitted, EntrySealed}; use crate::value::{IndexType, Value}; +use hashbrown::HashMap; use idlset::v2::IDLBitRange; use kanidm_proto::v1::{ConsistencyError, OperationError}; use r2d2::Pool; @@ -195,13 +196,7 @@ pub trait IdlSqliteTransaction { } } - fn exists_idx( - &self, - audit: &mut AuditScope, - attr: &str, - itype: &IndexType, - ) -> Result { - let tname = format!("idx_{}_{}", itype.as_idx_str(), attr); + fn exists_table(&self, audit: &mut AuditScope, tname: &str) -> Result { let mut stmt = self .get_conn() .prepare("SELECT COUNT(name) from sqlite_master where name = :tname") @@ -210,7 +205,7 @@ pub trait IdlSqliteTransaction { OperationError::SqliteError })?; let i: Option = stmt - .query_row(&[(":tname", &tname)], |row| row.get(0)) + .query_row(&[(":tname", tname)], |row| row.get(0)) .map_err(|e| { ladmin_error!(audit, "SQLite Error {:?}", e); OperationError::SqliteError @@ -223,6 +218,16 @@ pub trait IdlSqliteTransaction { } } + fn exists_idx( + &self, + audit: &mut AuditScope, + attr: &str, + itype: &IndexType, + ) -> Result { + let tname = format!("idx_{}_{}", itype.as_idx_str(), attr); + self.exists_table(audit, &tname) + } + fn get_idl( &self, audit: &mut AuditScope, @@ -1067,6 +1072,94 @@ impl IdlSqliteWriteTransaction { }) } + pub fn store_idx_slope_analysis( + &self, + audit: &mut AuditScope, + slopes: &HashMap, + ) -> Result<(), OperationError> { + self.conn + .execute( + "CREATE TABLE IF NOT 
EXISTS idxslope_analysis ( + id TEXT PRIMARY KEY, + slope INTEGER + )", + [], + ) + .map(|_| ()) + .map_err(|e| { + ladmin_error!(audit, "SQLite Error {:?}", e); + OperationError::SqliteError + })?; + + // Remove any data if it exists. + self.conn + .execute("DELETE FROM idxslope_analysis", []) + .map(|_| ()) + .map_err(|e| { + ladmin_error!(audit, "SQLite Error {:?}", e); + OperationError::SqliteError + })?; + + slopes.iter().try_for_each(|(k, v)| { + let key = format!("idx_{}_{}", k.itype.as_idx_str(), k.attr); + self.conn + .execute( + "INSERT OR REPLACE INTO idxslope_analysis (id, slope) VALUES(:id, :slope)", + named_params! { + ":id": &key, + ":slope": &v, + }, + ) + .map(|_| ()) + .map_err(|e| { + eprintln!("CRITICAL: rusqlite error {:?}", e); + OperationError::SqliteError + }) + }) + } + + pub fn is_idx_slopeyness_generated( + &self, + audit: &mut AuditScope, + ) -> Result { + self.exists_table(audit, "idxslope_analysis") + } + + pub fn get_idx_slope( + &self, + audit: &mut AuditScope, + ikey: &IdxKey, + ) -> Result, OperationError> { + let analysis_exists = self.exists_table(audit, "idxslope_analysis")?; + if !analysis_exists { + return Ok(None); + } + + // Then we have the table and it should have things in it, lets put + // it all together. 
+ let key = format!("idx_{}_{}", ikey.itype.as_idx_str(), ikey.attr); + + let mut stmt = self + .get_conn() + .prepare("SELECT slope FROM idxslope_analysis WHERE id = :id") + .map_err(|e| { + ladmin_error!(audit, "SQLite Error {:?}", e); + OperationError::SqliteError + })?; + + let slope: Option = stmt + .query_row(&[(":id", &key)], |row| row.get(0)) + // We don't mind if it doesn't exist + .optional() + .map_err(|e| { + ladmin_error!(audit, "SQLite Error {:?}", e); + OperationError::SqliteError + })?; + ltrace!(audit, "Got slope for index name {} -> {:?}", key, slope); + + Ok(slope) + } + pub unsafe fn purge_id2entry(&self, audit: &mut AuditScope) -> Result<(), OperationError> { ltrace!(audit, "purge id2entry ..."); self.conn @@ -1303,7 +1396,7 @@ impl IdlSqliteWriteTransaction { dbv_id2entry ); } - // * if v3 -> complete. + // * if v3 -> create name2uuid, uuid2spn, uuid2rdn. if dbv_id2entry == 3 { self.create_name2uuid(audit) .and_then(|_| self.create_uuid2spn(audit)) diff --git a/kanidmd/src/lib/be/idxkey.rs b/kanidmd/src/lib/be/idxkey.rs index 99f3041eb..80eec8606 100644 --- a/kanidmd/src/lib/be/idxkey.rs +++ b/kanidmd/src/lib/be/idxkey.rs @@ -4,6 +4,8 @@ use std::borrow::Borrow; use std::cmp::Ordering; use std::hash::{Hash, Hasher}; +pub type IdxSlope = u8; + // Huge props to https://github.com/sunshowers/borrow-complex-key-example/blob/master/src/lib.rs #[derive(Debug, Clone, PartialEq, Eq, Hash)] @@ -12,6 +14,15 @@ pub struct IdxKey { pub itype: IndexType, } +impl IdxKey { + pub fn new(attr: &str, itype: IndexType) -> Self { + IdxKey { + attr: attr.into(), + itype, + } + } +} + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct IdxKeyRef<'a> { pub attr: &'a str, @@ -22,6 +33,13 @@ impl<'a> IdxKeyRef<'a> { pub fn new(attr: &'a str, itype: &'a IndexType) -> Self { IdxKeyRef { attr, itype } } + + pub fn to_key(&self) -> IdxKey { + IdxKey { + attr: self.attr.into(), + itype: self.itype.clone(), + } + } } pub trait IdxKeyToRef { diff --git 
a/kanidmd/src/lib/be/mod.rs b/kanidmd/src/lib/be/mod.rs index 9622831fe..c9e1fac27 100644 --- a/kanidmd/src/lib/be/mod.rs +++ b/kanidmd/src/lib/be/mod.rs @@ -7,7 +7,8 @@ use std::fs; use crate::value::IndexType; -use hashbrown::HashSet as Set; +use hashbrown::HashMap as Map; +use hashbrown::HashSet; use std::cell::UnsafeCell; use std::sync::Arc; @@ -32,7 +33,7 @@ mod idl_arc_sqlite; mod idl_sqlite; pub(crate) mod idxkey; -pub(crate) use self::idxkey::{IdxKey, IdxKeyRef, IdxKeyToRef}; +pub(crate) use self::idxkey::{IdxKey, IdxKeyRef, IdxKeyToRef, IdxSlope}; use crate::be::idl_arc_sqlite::{ IdlArcSqlite, IdlArcSqliteReadTransaction, IdlArcSqliteTransaction, @@ -61,11 +62,11 @@ pub struct IdRawEntry { #[derive(Debug, Clone)] pub struct IdxMeta { - pub idxkeys: Set, + pub idxkeys: Map, } impl IdxMeta { - pub fn new(idxkeys: Set) -> Self { + pub fn new(idxkeys: Map) -> Self { IdxMeta { idxkeys } } } @@ -158,7 +159,7 @@ pub trait BackendTransaction { ) -> Result<(IdList, FilterPlan), OperationError> { Ok(match filt { FilterResolved::Eq(attr, value, idx) => { - if *idx { + if idx.is_some() { // Get the idx_key let idx_key = value.get_idx_eq_key(); // Get the idl for this @@ -178,7 +179,7 @@ pub trait BackendTransaction { } } FilterResolved::Sub(attr, subvalue, idx) => { - if *idx { + if idx.is_some() { // Get the idx_key let idx_key = subvalue.get_idx_sub_key(); // Get the idl for this @@ -198,7 +199,7 @@ pub trait BackendTransaction { } } FilterResolved::Pres(attr, idx) => { - if *idx { + if idx.is_some() { // Get the idl for this match self.get_idlayer().get_idl( au, @@ -218,7 +219,7 @@ pub trait BackendTransaction { // We have no process for indexing this right now. (IdList::AllIds, FilterPlan::LessThanUnindexed(attr.clone())) } - FilterResolved::Or(l) => { + FilterResolved::Or(l, _) => { // Importantly if this has no inner elements, this returns // an empty list. 
let mut plan = Vec::new(); @@ -270,7 +271,7 @@ pub trait BackendTransaction { (IdList::Indexed(result), setplan) } } - FilterResolved::And(l) => { + FilterResolved::And(l, _) => { // This algorithm is a little annoying. I couldn't get it to work with iter and // folds due to the logic needed ... @@ -382,7 +383,7 @@ pub trait BackendTransaction { for f in f_andnot.iter() { f_rem_count -= 1; let f_in = match f { - FilterResolved::AndNot(f_in) => f_in, + FilterResolved::AndNot(f_in, _) => f_in, _ => { lfilter_error!( au, @@ -465,7 +466,7 @@ pub trait BackendTransaction { // debug!("final cand set ==> {:?}", cand_idl); (cand_idl, setplan) } // end and - FilterResolved::Inclusion(l) => { + FilterResolved::Inclusion(l, _) => { // For inclusion to be valid, every term must have *at least* one element present. // This really relies on indexing, and so it's internal only - generally only // for fully indexed existance queries, such as from refint. @@ -507,7 +508,7 @@ pub trait BackendTransaction { // of an "AND" query, but if it's used anywhere else IE the root filter, then there is // no other set to exclude - therefore it's empty set. Additionally, even in an OR query // the AndNot will be skipped as an empty set for the same reason. - FilterResolved::AndNot(_f) => { + FilterResolved::AndNot(_f, _) => { // get the idl for f // now do andnot? lfilter_error!( @@ -1046,8 +1047,32 @@ impl<'a> BackendWriteTransaction<'a> { }) } - pub fn update_idxmeta(&mut self, mut idxkeys: Set) { + pub fn update_idxmeta( + &mut self, + audit: &mut AuditScope, + idxkeys: Vec, + ) -> Result<(), OperationError> { + if self.is_idx_slopeyness_generated(audit)? { + ltrace!(audit, "Indexing slopes available"); + } else { + ladmin_warning!( + audit, + "No indexing slopes available. You should consider reindexing to generate these" + ); + }; + + // Setup idxkeys here. By default we set these all to "max slope" aka + // all indexes are "equal" but also worse case unless analysed. 
If they + // have been analysed, we can set the slope factor into here. + let idxkeys: Result, _> = idxkeys + .into_iter() + .map(|k| self.get_idx_slope(audit, &k).map(|slope| (k, slope))) + .collect(); + + let mut idxkeys = idxkeys?; + std::mem::swap(&mut self.idxmeta_wr.deref_mut().idxkeys, &mut idxkeys); + Ok(()) } // Should take a mut index set, and then we write the whole thing back @@ -1234,12 +1259,12 @@ impl<'a> BackendWriteTransaction<'a> { let idx_table_list = self.get_idlayer().list_idxs(audit)?; // Turn the vec to a real set - let idx_table_set: Set<_> = idx_table_list.into_iter().collect(); + let idx_table_set: HashSet<_> = idx_table_list.into_iter().collect(); let missing: Vec<_> = self .idxmeta .idxkeys - .iter() + .keys() .filter_map(|ikey| { // what would the table name be? let tname = format!("idx_{}_{}", ikey.itype.as_idx_str(), ikey.attr.as_str()); @@ -1269,7 +1294,7 @@ impl<'a> BackendWriteTransaction<'a> { self.idxmeta .idxkeys - .iter() + .keys() .try_for_each(|ikey| idlayer.create_idx(audit, &ikey.attr, &ikey.itype)) } @@ -1327,7 +1352,12 @@ impl<'a> BackendWriteTransaction<'a> { limmediate_warning!(audit, "Optimising Indexes ... "); idlayer.optimise_dirty_idls(audit); limmediate_warning!(audit, "done ✅\n"); - + limmediate_warning!(audit, "Calculating Index Optimisation Slopes ... "); + idlayer.analyse_idx_slopes(audit).map_err(|e| { + ladmin_error!(audit, "index optimisation failed -> {:?}", e); + e + })?; + limmediate_warning!(audit, "done ✅\n"); Ok(()) } @@ -1347,6 +1377,24 @@ impl<'a> BackendWriteTransaction<'a> { self.get_idlayer().get_idl(audit, attr, itype, idx_key) } + fn is_idx_slopeyness_generated(&self, audit: &mut AuditScope) -> Result { + self.get_idlayer().is_idx_slopeyness_generated(audit) + } + + fn get_idx_slope( + &self, + audit: &mut AuditScope, + ikey: &IdxKey, + ) -> Result { + // Do we have the slopeyness? + let slope = self + .get_idlayer() + .get_idx_slope(audit, ikey)? 
+ .unwrap_or_else(|| get_idx_slope_default(ikey)); + ltrace!(audit, "index slope - {:?} -> {:?}", ikey, slope); + Ok(slope) + } + pub fn restore(&self, audit: &mut AuditScope, src_path: &str) -> Result<(), OperationError> { let idlayer = self.get_idlayer(); // load all entries into RAM, may need to change this later @@ -1369,35 +1417,6 @@ impl<'a> BackendWriteTransaction<'a> { OperationError::SerdeJsonError })?; - // Filter all elements that have a UUID in the system range. - /* - use crate::constants::UUID_ANONYMOUS; - use crate::be::dbentry::DbEntryVers; - use crate::be::dbvalue::DbValueV1; - let uuid_anonymous = UUID_ANONYMOUS.clone(); - let dbentries: Vec = dbentries.into_iter() - .filter(|e| { - let e_uuid = match &e.ent { - DbEntryVers::V1(dbe) => dbe.attrs.get("uuid") - .and_then(|dbvs| dbvs.first()) - .and_then(|dbv| { - match dbv { - DbValueV1::UU(u) => Some(u), - _ => panic!(), - } - }) - .unwrap() - }; - - e_uuid > &uuid_anonymous - }) - .collect(); - - dbentries.iter().for_each(|e| { - ltrace!(audit, "importing -> {:?}", e); - }); - */ - // Now, we setup all the entries with new ids. let mut id_max = 0; let identries: Result, _> = dbentries @@ -1411,16 +1430,6 @@ impl<'a> BackendWriteTransaction<'a> { idlayer.write_identries_raw(audit, identries?.into_iter())?; - // for debug - /* - self.idlayer.get_identry(audit, &IdList::AllIds) - .unwrap() - .iter() - .for_each(|dbe| { - ltrace!(audit, "dbe -> {:?}", dbe.id); - }); - */ - // Reindex now we are loaded. self.reindex(audit)?; @@ -1505,6 +1514,21 @@ impl<'a> BackendWriteTransaction<'a> { } } +// We have a number of hardcoded, "obvious" slopes that should +// exist. 
We return these when the analysis has not been run, as +// these are values that are generally "good enough" for most applications +fn get_idx_slope_default(ikey: &IdxKey) -> IdxSlope { + match (ikey.attr.as_str(), &ikey.itype) { + ("name", IndexType::Equality) + | ("spn", IndexType::Equality) + | ("uuid", IndexType::Equality) => 1, + ("class", IndexType::Equality) => 180, + (_, IndexType::Equality) => 45, + (_, IndexType::SubString) => 90, + (_, IndexType::Presence) => 90, + } +} + // In the future this will do the routing between the chosen backends etc. impl Backend { pub fn new( @@ -1513,7 +1537,7 @@ impl Backend { // path: &str, // mut pool_size: u32, // fstype: FsType, - idxkeys: Set, + idxkeys: Vec, vacuum: bool, ) -> Result { info!("DB tickets -> {:?}", cfg.pool_size); @@ -1525,6 +1549,19 @@ impl Backend { cfg.pool_size = 1; } + // Setup idxkeys here. By default we set these all to "max slope" aka + // all indexes are "equal" but also worse case unless analysed. + // + // During startup this will be "fixed" as the schema core will call reload_idxmeta + // which will trigger a reload of the analysis data (if present). + let idxkeys: Map<_, _> = idxkeys + .into_iter() + .map(|ikey| { + let slope = get_idx_slope_default(&ikey); + (ikey, slope) + }) + .collect(); + // this has a ::memory() type, but will path == "" work? lperf_trace_segment!(audit, "be::new", || { let idlayer = Arc::new(IdlArcSqlite::new(audit, &cfg, vacuum)?); @@ -1597,7 +1634,6 @@ impl Backend { #[cfg(test)] mod tests { - use hashbrown::HashSet as Set; use idlset::v2::IDLBitRange; use std::fs; use std::iter::FromIterator; @@ -1626,35 +1662,36 @@ mod tests { let mut audit = AuditScope::new("run_test", uuid::Uuid::new_v4(), None); // This is a demo idxmeta, purely for testing. 
- let mut idxmeta = Set::with_capacity(16); - idxmeta.insert(IdxKey { - attr: AttrString::from("name"), - itype: IndexType::Equality, - }); - idxmeta.insert(IdxKey { - attr: AttrString::from("name"), - itype: IndexType::Presence, - }); - idxmeta.insert(IdxKey { - attr: AttrString::from("name"), - itype: IndexType::SubString, - }); - idxmeta.insert(IdxKey { - attr: AttrString::from("uuid"), - itype: IndexType::Equality, - }); - idxmeta.insert(IdxKey { - attr: AttrString::from("uuid"), - itype: IndexType::Presence, - }); - idxmeta.insert(IdxKey { - attr: AttrString::from("ta"), - itype: IndexType::Equality, - }); - idxmeta.insert(IdxKey { - attr: AttrString::from("tb"), - itype: IndexType::Equality, - }); + let idxmeta = vec![ + IdxKey { + attr: AttrString::from("name"), + itype: IndexType::Equality, + }, + IdxKey { + attr: AttrString::from("name"), + itype: IndexType::Presence, + }, + IdxKey { + attr: AttrString::from("name"), + itype: IndexType::SubString, + }, + IdxKey { + attr: AttrString::from("uuid"), + itype: IndexType::Equality, + }, + IdxKey { + attr: AttrString::from("uuid"), + itype: IndexType::Presence, + }, + IdxKey { + attr: AttrString::from("ta"), + itype: IndexType::Equality, + }, + IdxKey { + attr: AttrString::from("tb"), + itype: IndexType::Equality, + }, + ]; let be = Backend::new(&mut audit, BackendConfig::new_test(), idxmeta, false) .expect("Failed to setup backend"); @@ -2765,6 +2802,103 @@ mod tests { }) } + #[test] + fn test_be_index_slope_generation() { + run_test!(|audit: &mut AuditScope, be: &mut BackendWriteTransaction| { + // Create some test entry with some indexed / unindexed values. 
+ let mut e1: Entry = Entry::new(); + e1.add_ava("name", Value::from("william")); + e1.add_ava("uuid", Value::from("db237e8a-0079-4b8c-8a56-593b22aa44d1")); + e1.add_ava("ta", Value::from("dupe")); + e1.add_ava("tb", Value::from("1")); + let e1 = unsafe { e1.into_sealed_new() }; + + let mut e2: Entry = Entry::new(); + e2.add_ava("name", Value::from("claire")); + e2.add_ava("uuid", Value::from("db237e8a-0079-4b8c-8a56-593b22aa44d2")); + e2.add_ava("ta", Value::from("dupe")); + e2.add_ava("tb", Value::from("1")); + let e2 = unsafe { e2.into_sealed_new() }; + + let mut e3: Entry = Entry::new(); + e3.add_ava("name", Value::from("benny")); + e3.add_ava("uuid", Value::from("db237e8a-0079-4b8c-8a56-593b22aa44d3")); + e3.add_ava("ta", Value::from("dupe")); + e3.add_ava("tb", Value::from("2")); + let e3 = unsafe { e3.into_sealed_new() }; + + let _rset = be + .create(audit, vec![e1.clone(), e2.clone(), e3.clone()]) + .unwrap(); + + // If the slopes haven't been generated yet, there are some hardcoded values + // that we can use instead. They aren't generated until a first re-index. 
+ assert!(!be.is_idx_slopeyness_generated(audit).unwrap()); + + let ta_eq_slope = be + .get_idx_slope(audit, &IdxKey::new("ta", IndexType::Equality)) + .unwrap(); + assert_eq!(ta_eq_slope, 45); + + let tb_eq_slope = be + .get_idx_slope(audit, &IdxKey::new("tb", IndexType::Equality)) + .unwrap(); + assert_eq!(tb_eq_slope, 45); + + let name_eq_slope = be + .get_idx_slope(audit, &IdxKey::new("name", IndexType::Equality)) + .unwrap(); + assert_eq!(name_eq_slope, 1); + let uuid_eq_slope = be + .get_idx_slope(audit, &IdxKey::new("uuid", IndexType::Equality)) + .unwrap(); + assert_eq!(uuid_eq_slope, 1); + + let name_pres_slope = be + .get_idx_slope(audit, &IdxKey::new("name", IndexType::Presence)) + .unwrap(); + assert_eq!(name_pres_slope, 90); + let uuid_pres_slope = be + .get_idx_slope(audit, &IdxKey::new("uuid", IndexType::Presence)) + .unwrap(); + assert_eq!(uuid_pres_slope, 90); + // Check the slopes are what we expect for hardcoded values. + + // Now check slope generation for the values. Today these are calculated + // at reindex time, so we now perform the re-index. 
+ assert!(be.reindex(audit).is_ok()); + assert!(be.is_idx_slopeyness_generated(audit).unwrap()); + + let ta_eq_slope = be + .get_idx_slope(audit, &IdxKey::new("ta", IndexType::Equality)) + .unwrap(); + assert_eq!(ta_eq_slope, 200); + + let tb_eq_slope = be + .get_idx_slope(audit, &IdxKey::new("tb", IndexType::Equality)) + .unwrap(); + assert_eq!(tb_eq_slope, 133); + + let name_eq_slope = be + .get_idx_slope(audit, &IdxKey::new("name", IndexType::Equality)) + .unwrap(); + assert_eq!(name_eq_slope, 51); + let uuid_eq_slope = be + .get_idx_slope(audit, &IdxKey::new("uuid", IndexType::Equality)) + .unwrap(); + assert_eq!(uuid_eq_slope, 51); + + let name_pres_slope = be + .get_idx_slope(audit, &IdxKey::new("name", IndexType::Presence)) + .unwrap(); + assert_eq!(name_pres_slope, 200); + let uuid_pres_slope = be + .get_idx_slope(audit, &IdxKey::new("uuid", IndexType::Presence)) + .unwrap(); + assert_eq!(uuid_pres_slope, 200); + }) + } + #[test] fn test_be_limits_allids() { run_test!(|audit: &mut AuditScope, be: &mut BackendWriteTransaction| { diff --git a/kanidmd/src/lib/entry.rs b/kanidmd/src/lib/entry.rs index 70ea0adce..2da0b5531 100644 --- a/kanidmd/src/lib/entry.rs +++ b/kanidmd/src/lib/entry.rs @@ -38,14 +38,14 @@ use kanidm_proto::v1::Filter as ProtoFilter; use kanidm_proto::v1::{OperationError, SchemaError}; use crate::be::dbentry::{DbEntry, DbEntryV1, DbEntryVers}; -use crate::be::IdxKey; +use crate::be::{IdxKey, IdxSlope}; use ldap3_server::simple::{LdapPartialAttribute, LdapSearchResultEntry}; use std::collections::BTreeMap as Map; pub use std::collections::BTreeSet as Set; use std::collections::BTreeSet; // use hashbrown::HashMap as Map; -use hashbrown::HashSet; +use hashbrown::HashMap; use smartstring::alias::String as AttrString; use time::OffsetDateTime; use uuid::Uuid; @@ -1097,7 +1097,7 @@ impl Entry { /// Given the previous and current state of this entry, determine the indexing differential /// that needs to be applied. i.e. 
what indexes must be created, modified and removed. pub(crate) fn idx_diff<'a>( - idxmeta: &'a HashSet, + idxmeta: &'a HashMap, pre: Option<&Self>, post: Option<&Self>, ) -> IdxDiff<'a> { @@ -1113,7 +1113,7 @@ impl Entry { (Some(pre_e), None) => { // If we are none (?), yield our pre-state as removals. idxmeta - .iter() + .keys() .flat_map(|ikey| { match pre_e.get_ava(ikey.attr.as_str()) { None => Vec::new(), @@ -1143,7 +1143,7 @@ impl Entry { (None, Some(post_e)) => { // If the pre-state is none, yield our additions. idxmeta - .iter() + .keys() .flat_map(|ikey| { match post_e.get_ava(ikey.attr.as_str()) { None => Vec::new(), @@ -1175,7 +1175,7 @@ impl Entry { (Some(pre_e), Some(post_e)) => { assert!(pre_e.state.id == post_e.state.id); idxmeta - .iter() + .keys() .flat_map(|ikey| { match ( pre_e.get_ava_set(ikey.attr.as_str()), @@ -1804,16 +1804,16 @@ impl Entry { self.attribute_lessthan(attr.as_str(), subvalue) } // Check with ftweedal about or filter zero len correctness. - FilterResolved::Or(l) => l.iter().any(|f| self.entry_match_no_index_inner(f)), + FilterResolved::Or(l, _) => l.iter().any(|f| self.entry_match_no_index_inner(f)), // Check with ftweedal about and filter zero len correctness. - FilterResolved::And(l) => l.iter().all(|f| self.entry_match_no_index_inner(f)), - FilterResolved::Inclusion(_) => { + FilterResolved::And(l, _) => l.iter().all(|f| self.entry_match_no_index_inner(f)), + FilterResolved::Inclusion(_, _) => { // An inclusion doesn't make sense on an entry in isolation! // Inclusions are part of exists queries, on search they mean // nothing! 
false } - FilterResolved::AndNot(f) => !self.entry_match_no_index_inner(f), + FilterResolved::AndNot(f, _) => !self.entry_match_no_index_inner(f), } } @@ -2091,11 +2091,11 @@ impl From<&SchemaClass> for Entry { #[cfg(test)] mod tests { - use crate::be::IdxKey; + use crate::be::{IdxKey, IdxSlope}; use crate::entry::{Entry, EntryInit, EntryInvalid, EntryNew}; use crate::modify::{Modify, ModifyList}; use crate::value::{IndexType, PartialValue, Value}; - use hashbrown::HashSet; + use hashbrown::HashMap; use smartstring::alias::String as AttrString; use std::collections::BTreeSet as Set; @@ -2270,19 +2270,28 @@ mod tests { e2.add_ava("userid", Value::from("claire")); let e2 = unsafe { e2.into_sealed_committed() }; - let mut idxmeta = HashSet::with_capacity(8); - idxmeta.insert(IdxKey { - attr: AttrString::from("userid"), - itype: IndexType::Equality, - }); - idxmeta.insert(IdxKey { - attr: AttrString::from("userid"), - itype: IndexType::Presence, - }); - idxmeta.insert(IdxKey { - attr: AttrString::from("extra"), - itype: IndexType::Equality, - }); + let mut idxmeta = HashMap::with_capacity(8); + idxmeta.insert( + IdxKey { + attr: AttrString::from("userid"), + itype: IndexType::Equality, + }, + IdxSlope::MAX, + ); + idxmeta.insert( + IdxKey { + attr: AttrString::from("userid"), + itype: IndexType::Presence, + }, + IdxSlope::MAX, + ); + idxmeta.insert( + IdxKey { + attr: AttrString::from("extra"), + itype: IndexType::Equality, + }, + IdxSlope::MAX, + ); // When we do None, None, we get nothing back. let r1 = Entry::idx_diff(&idxmeta, None, None); diff --git a/kanidmd/src/lib/filter.rs b/kanidmd/src/lib/filter.rs index 8e67f6d4f..3cf5d4b11 100644 --- a/kanidmd/src/lib/filter.rs +++ b/kanidmd/src/lib/filter.rs @@ -8,25 +8,28 @@ //! [`Filter`]: struct.Filter.html //! 
[`Entry`]: ../entry/struct.Entry.html -use crate::be::{IdxKey, IdxKeyRef, IdxKeyToRef, IdxMeta}; +use crate::be::{IdxKey, IdxKeyRef, IdxKeyToRef, IdxMeta, IdxSlope}; use crate::identity::IdentityId; use crate::ldap::ldap_attr_filter_map; use crate::prelude::*; use crate::schema::SchemaTransaction; use crate::value::{IndexType, PartialValue}; -use hashbrown::HashSet; +use concread::arcache::ARCacheReadTxn; use kanidm_proto::v1::Filter as ProtoFilter; use kanidm_proto::v1::{OperationError, SchemaError}; use ldap3_server::proto::{LdapFilter, LdapSubstringFilter}; -// use smartstring::alias::String; -use concread::arcache::ARCacheReadTxn; -use smartstring::alias::String as AttrString; +// use smartstring::alias::String as AttrString; use std::cmp::{Ordering, PartialOrd}; use std::collections::BTreeSet; use std::hash::Hash; use std::iter; +use std::num::NonZeroU8; use uuid::Uuid; +use hashbrown::HashMap; +#[cfg(test)] +use hashbrown::HashSet; + const FILTER_DEPTH_MAX: usize = 16; // Default filter is safe, ignores all hidden types! @@ -104,8 +107,8 @@ pub fn f_spn_name(id: &str) -> FC<'static> { FC::Or(f) } -// This is the short-form for tests and internal filters that can then -// be transformed into a filter for the server to use. +/// This is the short-form for tests and internal filters that can then +/// be transformed into a filter for the server to use. #[derive(Debug, Deserialize)] pub enum FC<'a> { Eq(&'a str, PartialValue), @@ -120,7 +123,7 @@ pub enum FC<'a> { // Not(Box), } -// This is the filters internal representation. +/// This is the filters internal representation #[derive(Debug, Clone, Hash, PartialEq, PartialOrd, Ord, Eq)] enum FilterComp { // This is attr - value @@ -137,22 +140,28 @@ enum FilterComp { // Not(Box), } -// This is the fully resolved internal representation. Note the lack of Not and selfUUID -// because these are resolved into And(Pres(class), AndNot(term)) and Eq(uuid, ...). 
-// Importantly, we make this accessible to Entry so that it can then match on filters -// properly. -#[derive(Debug, Clone, PartialEq, Eq)] +/// This is the fully resolved internal representation. Note the lack of Not and selfUUID +/// because these are resolved into And(Pres(class), AndNot(term)) and Eq(uuid, ...). +/// Importantly, we make this accessible to Entry so that it can then match on filters +/// internally. +/// +/// Each filter that has been resolved also has been enriched with metadata about its +/// index, and index slope. For the purpose of this module, consider slope as a "weight" +/// where small value - faster index, larger value - slower index. This metadata is extremely +/// important for the query optimiser to make decisions about how to re-arrange queries +/// correctly. +#[derive(Debug, Clone, Eq)] pub enum FilterResolved { - // This is attr - value - indexed - Eq(AttrString, PartialValue, bool), - Sub(AttrString, PartialValue, bool), - Pres(AttrString, bool), - LessThan(AttrString, PartialValue, bool), - Or(Vec), - And(Vec), + // This is attr - value - indexed slope factor + Eq(AttrString, PartialValue, Option), + Sub(AttrString, PartialValue, Option), + Pres(AttrString, Option), + LessThan(AttrString, PartialValue, Option), + Or(Vec, Option), + And(Vec, Option), // All terms must have 1 or more items, or the inclusion is false! 
- Inclusion(Vec), - AndNot(Box), + Inclusion(Vec, Option), + AndNot(Box, Option), } #[derive(Debug, Clone, PartialEq)] @@ -903,26 +912,6 @@ impl PartialEq for Filter { } */ -/* -impl PartialEq for FilterResolved { - fn eq(&self, rhs: &FilterResolved) -> bool { - match (self, rhs) { - (FilterResolved::Eq(a1, v1, i1), FilterResolved::Eq(a2, v2, i2)) => { - a1 == a2 && v1 == v2 && i1 == i2 - } - (FilterResolved::Sub(a1, v1, i1), FilterResolved::Sub(a2, v2, i2)) => { - a1 == a2 && v1 == v2 && i1 == i2 - } - (FilterResolved::Pres(a1, i1), FilterResolved::Pres(a2, i2)) => a1 == a2 && i1 == i2, - (FilterResolved::And(vs1), FilterResolved::And(vs2)) => vs1 == vs2, - (FilterResolved::Or(vs1), FilterResolved::Or(vs2)) => vs1 == vs2, - (FilterResolved::AndNot(f1), FilterResolved::AndNot(f2)) => f1 == f2, - (_, _) => false, - } - } -} -*/ - /* * Only needed in tests, in run time order only matters on the inner for * optimisation. @@ -934,8 +923,26 @@ impl PartialOrd for Filter { } } -// remember, this isn't ordering by alphanumeric, this is ordering of -// optimisation preference! 
+impl PartialEq for FilterResolved { + fn eq(&self, rhs: &FilterResolved) -> bool { + match (self, rhs) { + (FilterResolved::Eq(a1, v1, _), FilterResolved::Eq(a2, v2, _)) => a1 == a2 && v1 == v2, + (FilterResolved::Sub(a1, v1, _), FilterResolved::Sub(a2, v2, _)) => { + a1 == a2 && v1 == v2 + } + (FilterResolved::Pres(a1, _), FilterResolved::Pres(a2, _)) => a1 == a2, + (FilterResolved::LessThan(a1, v1, _), FilterResolved::LessThan(a2, v2, _)) => { + a1 == a2 && v1 == v2 + } + (FilterResolved::And(vs1, _), FilterResolved::And(vs2, _)) => vs1 == vs2, + (FilterResolved::Or(vs1, _), FilterResolved::Or(vs2, _)) => vs1 == vs2, + (FilterResolved::Inclusion(vs1, _), FilterResolved::Inclusion(vs2, _)) => vs1 == vs2, + (FilterResolved::AndNot(f1, _), FilterResolved::AndNot(f2, _)) => f1 == f2, + (_, _) => false, + } + } +} + impl PartialOrd for FilterResolved { fn partial_cmp(&self, rhs: &FilterResolved) -> Option { Some(self.cmp(rhs)) @@ -943,41 +950,50 @@ impl PartialOrd for FilterResolved { } impl Ord for FilterResolved { + /// Ordering of filters for optimisation and subsequent dead term elimination. + /// fn cmp(&self, rhs: &FilterResolved) -> Ordering { - match (self, rhs) { - (FilterResolved::Eq(a1, v1, true), FilterResolved::Eq(a2, v2, true)) => { - match a1.cmp(a2) { - Ordering::Equal => v1.cmp(v2), - o => o, + let left_slopey = self.get_slopeyness_factor(); + let right_slopey = rhs.get_slopeyness_factor(); + + let r = match (left_slopey, right_slopey) { + (Some(sfl), Some(sfr)) => sfl.cmp(&sfr), + (Some(_), None) => Ordering::Less, + (None, Some(_)) => Ordering::Greater, + (None, None) => Ordering::Equal, + }; + + // If they are equal, compare other elements for determining the order. This is what + // allows dead term elimination to occur. + // + // In almost all cases, we'll miss this step though as slopes will vary distinctly. + // + // We do NOT need to check for indexed vs unindexed here, we already did it in the slope check! 
+ if r == Ordering::Equal { + match (self, rhs) { + (FilterResolved::Eq(a1, v1, _), FilterResolved::Eq(a2, v2, _)) + | (FilterResolved::Sub(a1, v1, _), FilterResolved::Sub(a2, v2, _)) + | (FilterResolved::LessThan(a1, v1, _), FilterResolved::LessThan(a2, v2, _)) => { + match a1.cmp(a2) { + Ordering::Equal => v1.cmp(v2), + o => o, + } } + (FilterResolved::Pres(a1, _), FilterResolved::Pres(a2, _)) => a1.cmp(a2), + // Now sort these into the generally "best" order. + (FilterResolved::Eq(_, _, _), _) => Ordering::Less, + (_, FilterResolved::Eq(_, _, _)) => Ordering::Greater, + (FilterResolved::Pres(_, _), _) => Ordering::Less, + (_, FilterResolved::Pres(_, _)) => Ordering::Greater, + (FilterResolved::LessThan(_, _, _), _) => Ordering::Less, + (_, FilterResolved::LessThan(_, _, _)) => Ordering::Greater, + (FilterResolved::Sub(_, _, _), _) => Ordering::Less, + (_, FilterResolved::Sub(_, _, _)) => Ordering::Greater, + // They can't be re-arranged, they don't move! + (_, _) => Ordering::Equal, } - (FilterResolved::Sub(a1, v1, true), FilterResolved::Sub(a2, v2, true)) => { - match a1.cmp(a2) { - Ordering::Equal => v1.cmp(v2), - o => o, - } - } - (FilterResolved::Pres(a1, true), FilterResolved::Pres(a2, true)) => a1.cmp(a2), - // Always higher prefer indexed Eq over all else, as these will have - // the best indexes and return smallest candidates. - (FilterResolved::Eq(_, _, true), _) => Ordering::Less, - (_, FilterResolved::Eq(_, _, true)) => Ordering::Greater, - (FilterResolved::Pres(_, true), _) => Ordering::Less, - (_, FilterResolved::Pres(_, true)) => Ordering::Greater, - (FilterResolved::Sub(_, _, true), _) => Ordering::Greater, - (_, FilterResolved::Sub(_, _, true)) => Ordering::Less, - // Now prefer the unindexed types by performance order. 
- (FilterResolved::Pres(_, false), FilterResolved::Pres(_, false)) => Ordering::Equal, - (FilterResolved::Pres(_, false), _) => Ordering::Less, - (_, FilterResolved::Pres(_, false)) => Ordering::Greater, - (FilterResolved::Eq(_, _, false), FilterResolved::Eq(_, _, false)) => Ordering::Equal, - (FilterResolved::Eq(_, _, false), _) => Ordering::Less, - (_, FilterResolved::Eq(_, _, false)) => Ordering::Greater, - (FilterResolved::Sub(_, _, false), FilterResolved::Sub(_, _, false)) => Ordering::Equal, - (FilterResolved::Sub(_, _, false), _) => Ordering::Greater, - (_, FilterResolved::Sub(_, _, false)) => Ordering::Less, - // They can't be compared, they don't move! - (_, _) => Ordering::Equal, + } else { + r } } } @@ -988,38 +1004,44 @@ impl FilterResolved { match fc { FilterComp::Eq(a, v) => { let idx = idxmeta.contains(&(&a, &IndexType::Equality)); + let idx = NonZeroU8::new(idx as u8); FilterResolved::Eq(a, v, idx) } + FilterComp::SelfUuid => panic!("Not possible to resolve SelfUuid in from_invalid!"), FilterComp::Sub(a, v) => { - // let idx = idxmeta.contains(&(&a, &IndexType::SubString)); // TODO: For now, don't emit substring indexes. - let idx = false; - FilterResolved::Sub(a, v, idx) + // let idx = idxmeta.contains(&(&a, &IndexType::SubString)); + // let idx = NonZeroU8::new(idx as u8); + FilterResolved::Sub(a, v, None) } FilterComp::Pres(a) => { let idx = idxmeta.contains(&(&a, &IndexType::Presence)); + let idx = NonZeroU8::new(idx as u8); FilterResolved::Pres(a, idx) } FilterComp::LessThan(a, v) => { // let idx = idxmeta.contains(&(&a, &IndexType::ORDERING)); // TODO: For now, don't emit ordering indexes. 
- let idx = false; + let idx = None; FilterResolved::LessThan(a, v, idx) } FilterComp::Or(vs) => FilterResolved::Or( vs.into_iter() .map(|v| FilterResolved::from_invalid(v, idxmeta)) .collect(), + None, ), FilterComp::And(vs) => FilterResolved::And( vs.into_iter() .map(|v| FilterResolved::from_invalid(v, idxmeta)) .collect(), + None, ), FilterComp::Inclusion(vs) => FilterResolved::Inclusion( vs.into_iter() .map(|v| FilterResolved::from_invalid(v, idxmeta)) .collect(), + None, ), FilterComp::AndNot(f) => { // TODO: pattern match box here. (AndNot(box f)). @@ -1027,57 +1049,80 @@ impl FilterResolved { // not today remove the box, and we need f in our ownership. Since // AndNot currently is a rare request, cloning is not the worst thing // here ... - FilterResolved::AndNot(Box::new(FilterResolved::from_invalid( - (*f).clone(), - idxmeta, - ))) + FilterResolved::AndNot( + Box::new(FilterResolved::from_invalid((*f).clone(), idxmeta)), + None, + ) } - FilterComp::SelfUuid => panic!("Not possible to resolve SelfUuid in from_invalid!"), } } - fn resolve_idx(fc: FilterComp, ev: &Identity, idxmeta: &HashSet) -> Option { + fn resolve_idx( + fc: FilterComp, + ev: &Identity, + idxmeta: &HashMap, + ) -> Option { match fc { FilterComp::Eq(a, v) => { let idxkref = IdxKeyRef::new(&a, &IndexType::Equality); - let idx = idxmeta.contains(&idxkref as &dyn IdxKeyToRef); + let idx = idxmeta + .get(&idxkref as &dyn IdxKeyToRef) + .copied() + .and_then(NonZeroU8::new); Some(FilterResolved::Eq(a, v, idx)) } + FilterComp::SelfUuid => ev.get_uuid().map(|uuid| { + let uuid_s = AttrString::from("uuid"); + let idxkref = IdxKeyRef::new(&uuid_s, &IndexType::Equality); + let idx = idxmeta + .get(&idxkref as &dyn IdxKeyToRef) + .copied() + .and_then(NonZeroU8::new); + FilterResolved::Eq(uuid_s, PartialValue::new_uuid(uuid), idx) + }), FilterComp::Sub(a, v) => { let idxkref = IdxKeyRef::new(&a, &IndexType::SubString); - let idx = idxmeta.contains(&idxkref as &dyn IdxKeyToRef); + let idx = idxmeta + 
.get(&idxkref as &dyn IdxKeyToRef) + .copied() + .and_then(NonZeroU8::new); Some(FilterResolved::Sub(a, v, idx)) } FilterComp::Pres(a) => { let idxkref = IdxKeyRef::new(&a, &IndexType::Presence); - let idx = idxmeta.contains(&idxkref as &dyn IdxKeyToRef); + let idx = idxmeta + .get(&idxkref as &dyn IdxKeyToRef) + .copied() + .and_then(NonZeroU8::new); Some(FilterResolved::Pres(a, idx)) } FilterComp::LessThan(a, v) => { // let idx = idxmeta.contains(&(&a, &IndexType::SubString)); - let idx = false; - Some(FilterResolved::LessThan(a, v, idx)) + Some(FilterResolved::LessThan(a, v, None)) } + // We set the compound filters slope factor to "None" here, because when we do + // optimise we'll actually fill in the correct slope factors after we sort those + // inner terms in a more optimal way. FilterComp::Or(vs) => { let fi: Option> = vs .into_iter() .map(|f| FilterResolved::resolve_idx(f, ev, idxmeta)) .collect(); - fi.map(FilterResolved::Or) + fi.map(|fi| FilterResolved::Or(fi, None)) } FilterComp::And(vs) => { let fi: Option> = vs .into_iter() .map(|f| FilterResolved::resolve_idx(f, ev, idxmeta)) .collect(); - fi.map(FilterResolved::And) + fi.map(|fi| FilterResolved::And(fi, None)) } FilterComp::Inclusion(vs) => { let fi: Option> = vs .into_iter() .map(|f| FilterResolved::resolve_idx(f, ev, idxmeta)) .collect(); - fi.map(FilterResolved::Inclusion) + fi.map(|fi| FilterResolved::Inclusion(fi, None)) } FilterComp::AndNot(f) => { // TODO: pattern match box here. (AndNot(box f)). @@ -1086,43 +1131,54 @@ impl FilterResolved { // AndNot currently is a rare request, cloning is not the worst thing // here ... 
FilterResolved::resolve_idx((*f).clone(), ev, idxmeta) - .map(|fi| FilterResolved::AndNot(Box::new(fi))) + .map(|fi| FilterResolved::AndNot(Box::new(fi), None)) } - FilterComp::SelfUuid => ev.get_uuid().map(|uuid| { - let uuid_s = AttrString::from("uuid"); - let idxkref = IdxKeyRef::new(&uuid_s, &IndexType::Equality); - let idx = idxmeta.contains(&idxkref as &dyn IdxKeyToRef); - FilterResolved::Eq(uuid_s, PartialValue::new_uuid(uuid), idx) - }), } } fn resolve_no_idx(fc: FilterComp, ev: &Identity) -> Option { + // ⚠️ ⚠️ ⚠️ ⚠️ + // Remember, this function means we have NO INDEX METADATA so we can only + // assign slopes to values we can GUARANTEE will EXIST. match fc { - FilterComp::Eq(a, v) => Some(FilterResolved::Eq(a, v, false)), - FilterComp::Sub(a, v) => Some(FilterResolved::Sub(a, v, false)), - FilterComp::Pres(a) => Some(FilterResolved::Pres(a, false)), - FilterComp::LessThan(a, v) => Some(FilterResolved::LessThan(a, v, false)), + FilterComp::Eq(a, v) => { + // Since we have no index data, we manually configure a reasonable + // slope and indicate the presence of some expected basic + // indexes. 
+ let idx = matches!(a.as_str(), "name" | "uuid"); + let idx = NonZeroU8::new(idx as u8); + Some(FilterResolved::Eq(a, v, idx)) + } + FilterComp::SelfUuid => ev.get_uuid().map(|uuid| { + FilterResolved::Eq( + AttrString::from("uuid"), + PartialValue::new_uuid(uuid), + NonZeroU8::new(true as u8), + ) + }), + FilterComp::Sub(a, v) => Some(FilterResolved::Sub(a, v, None)), + FilterComp::Pres(a) => Some(FilterResolved::Pres(a, None)), + FilterComp::LessThan(a, v) => Some(FilterResolved::LessThan(a, v, None)), FilterComp::Or(vs) => { let fi: Option> = vs .into_iter() .map(|f| FilterResolved::resolve_no_idx(f, ev)) .collect(); - fi.map(FilterResolved::Or) + fi.map(|fi| FilterResolved::Or(fi, None)) } FilterComp::And(vs) => { let fi: Option> = vs .into_iter() .map(|f| FilterResolved::resolve_no_idx(f, ev)) .collect(); - fi.map(FilterResolved::And) + fi.map(|fi| FilterResolved::And(fi, None)) } FilterComp::Inclusion(vs) => { let fi: Option> = vs .into_iter() .map(|f| FilterResolved::resolve_no_idx(f, ev)) .collect(); - fi.map(FilterResolved::Inclusion) + fi.map(|fi| FilterResolved::Inclusion(fi, None)) } FilterComp::AndNot(f) => { // TODO: pattern match box here. (AndNot(box f)). @@ -1131,31 +1187,25 @@ impl FilterResolved { // AndNot currently is a rare request, cloning is not the worst thing // here ... FilterResolved::resolve_no_idx((*f).clone(), ev) - .map(|fi| FilterResolved::AndNot(Box::new(fi))) + .map(|fi| FilterResolved::AndNot(Box::new(fi), None)) } - - FilterComp::SelfUuid => ev.get_uuid().map(|uuid| { - FilterResolved::Eq( - AttrString::from("uuid"), - PartialValue::new_uuid(uuid), - false, - ) - }), } } // This is an optimise that only attempts to optimise the outer terms. 
fn fast_optimise(self) -> Self { match self { - FilterResolved::Inclusion(mut f_list) => { + FilterResolved::Inclusion(mut f_list, _) => { f_list.sort_unstable(); f_list.dedup(); - FilterResolved::Inclusion(f_list) + let sf = f_list.last().and_then(|f| f.get_slopeyness_factor()); + FilterResolved::Inclusion(f_list, sf) } - FilterResolved::And(mut f_list) => { + FilterResolved::And(mut f_list, _) => { f_list.sort_unstable(); f_list.dedup(); - FilterResolved::And(f_list) + let sf = f_list.first().and_then(|f| f.get_slopeyness_factor()); + FilterResolved::And(f_list, sf) } v => v, } @@ -1164,29 +1214,31 @@ impl FilterResolved { fn optimise(&self) -> Self { // Most optimisations only matter around or/and terms. match self { - FilterResolved::Inclusion(f_list) => { + FilterResolved::Inclusion(f_list, _) => { // first, optimise all our inner elements let (f_list_inc, mut f_list_new): (Vec<_>, Vec<_>) = f_list .iter() .map(|f_ref| f_ref.optimise()) - .partition(|f| matches!(f, FilterResolved::Inclusion(_))); + .partition(|f| matches!(f, FilterResolved::Inclusion(_, _))); f_list_inc.into_iter().for_each(|fc| { - if let FilterResolved::Inclusion(mut l) = fc { + if let FilterResolved::Inclusion(mut l, _) = fc { f_list_new.append(&mut l) } }); // finally, optimise this list by sorting. f_list_new.sort_unstable(); f_list_new.dedup(); - FilterResolved::Inclusion(f_list_new) + // Inclusions are similar to or, so what's our worst case? + let sf = f_list_new.last().and_then(|f| f.get_slopeyness_factor()); + FilterResolved::Inclusion(f_list_new, sf) } - FilterResolved::And(f_list) => { + FilterResolved::And(f_list, _) => { // first, optimise all our inner elements let (f_list_and, mut f_list_new): (Vec<_>, Vec<_>) = f_list .iter() .map(|f_ref| f_ref.optimise()) - .partition(|f| matches!(f, FilterResolved::And(_))); + .partition(|f| matches!(f, FilterResolved::And(_, _))); // now, iterate over this list - for each "and" term, fold // it's elements to this level. 
@@ -1201,12 +1253,12 @@ impl FilterResolved { // shortcutting f_list_and.into_iter().for_each(|fc| { - if let FilterResolved::And(mut l) = fc { + if let FilterResolved::And(mut l, _) = fc { f_list_new.append(&mut l) } }); - // If the f_list_or only has one element, pop it and return. + // If the f_list_and only has one element, pop it and return. if f_list_new.len() == 1 { #[allow(clippy::expect_used)] f_list_new.pop().expect("corrupt?") @@ -1214,21 +1266,25 @@ impl FilterResolved { // finally, optimise this list by sorting. f_list_new.sort_unstable(); f_list_new.dedup(); + // Whichever element is at the head must be the min SF + // so we use this in our And to represent us. + let sf = f_list_new.first().and_then(|f| f.get_slopeyness_factor()); + // // return! - FilterResolved::And(f_list_new) + FilterResolved::And(f_list_new, sf) } } - FilterResolved::Or(f_list) => { + FilterResolved::Or(f_list, _) => { let (f_list_or, mut f_list_new): (Vec<_>, Vec<_>) = f_list .iter() // Optimise all inner items. .map(|f_ref| f_ref.optimise()) // Split out inner-or terms to fold into this term. - .partition(|f| matches!(f, FilterResolved::Or(_))); + .partition(|f| matches!(f, FilterResolved::Or(_, _))); // Append the inner terms. f_list_or.into_iter().for_each(|fc| { - if let FilterResolved::Or(mut l) = fc { + if let FilterResolved::Or(mut l, _) = fc { f_list_new.append(&mut l) } }); @@ -1247,8 +1303,9 @@ impl FilterResolved { #[allow(clippy::unnecessary_sort_by)] f_list_new.sort_unstable_by(|a, b| b.cmp(a)); f_list_new.dedup(); - - FilterResolved::Or(f_list_new) + // The *last* element is the most costly here, and our worst case factor. 
+ let sf = f_list_new.last().and_then(|f| f.get_slopeyness_factor()); + FilterResolved::Or(f_list_new, sf) } } f => f.clone(), @@ -1256,7 +1313,21 @@ impl FilterResolved { } pub fn is_andnot(&self) -> bool { - matches!(self, FilterResolved::AndNot(_)) + matches!(self, FilterResolved::AndNot(_, _)) + } + + #[inline(always)] + fn get_slopeyness_factor(&self) -> Option { + match self { + FilterResolved::Eq(_, _, sf) + | FilterResolved::Sub(_, _, sf) + | FilterResolved::Pres(_, sf) + | FilterResolved::LessThan(_, _, sf) + | FilterResolved::Or(_, sf) + | FilterResolved::And(_, sf) + | FilterResolved::Inclusion(_, sf) + | FilterResolved::AndNot(_, sf) => *sf, + } } } @@ -1452,8 +1523,8 @@ mod tests { // antisymmetry: if a < b then !(a > b), as well as a > b implying !(a < b); and // These are unindexed so we have to check them this way. let f_t3b = unsafe { filter_resolved!(f_eq("userid", PartialValue::new_iutf8(""))) }; - assert_eq!(f_t1a.partial_cmp(&f_t3b), Some(Ordering::Less)); - assert_eq!(f_t3b.partial_cmp(&f_t1a), Some(Ordering::Greater)); + assert_eq!(f_t1a.partial_cmp(&f_t3b), Some(Ordering::Greater)); + assert_eq!(f_t3b.partial_cmp(&f_t1a), Some(Ordering::Less)); // transitivity: a < b and b < c implies a < c. The same must hold for both == and >. 
let f_t4b = unsafe { filter_resolved!(f_sub("userid", PartialValue::new_iutf8(""))) }; diff --git a/kanidmd/src/lib/schema.rs b/kanidmd/src/lib/schema.rs index f1667904c..709acd59e 100644 --- a/kanidmd/src/lib/schema.rs +++ b/kanidmd/src/lib/schema.rs @@ -512,7 +512,7 @@ impl<'a> SchemaWriteTransaction<'a> { r } - pub(crate) fn reload_idxmeta(&self) -> HashSet { + pub(crate) fn reload_idxmeta(&self) -> Vec { self.get_attributes() .values() .flat_map(|a| { diff --git a/kanidmd/src/lib/server.rs b/kanidmd/src/lib/server.rs index efbc37302..b3a8e6b5f 100644 --- a/kanidmd/src/lib/server.rs +++ b/kanidmd/src/lib/server.rs @@ -1014,6 +1014,12 @@ impl QueryServer { .upgrade_reindex(audit, SYSTEM_INDEX_VERSION + 1) .and_then(|_| reindex_write_2.commit(audit))?; + // Force the schema to reload - this is so that any changes to index slope + // analysis are now reflected correctly. + let slope_reload = task::block_on(self.write_async(ts)); + slope_reload.force_schema_reload(); + slope_reload.commit(audit)?; + // Now, based on the system version apply migrations. You may ask "should you not // be doing migrations before indexes?". And this is a very good question! The issue // is within a migration we must be able to search for content by pres index, and those @@ -2457,8 +2463,12 @@ impl<'a> QueryServerWriteTransaction<'a> { if valid_r.is_empty() { // Now use this to reload the backend idxmeta ltrace!(audit, "Reloading idxmeta ..."); - self.be_txn.update_idxmeta(self.schema.reload_idxmeta()); - Ok(()) + self.be_txn + .update_idxmeta(audit, self.schema.reload_idxmeta()) + .map_err(|e| { + ladmin_error!(audit, "reload schema update idxmeta {:?}", e); + e + }) } else { // Log the failures? ladmin_error!(audit, "Schema reload failed -> {:?}", valid_r); @@ -2625,6 +2635,10 @@ impl<'a> QueryServerWriteTransaction<'a> { self.be_txn.reindex(audit) } + fn force_schema_reload(&self) { + self.changed_schema.set(true); + } + pub(crate) fn upgrade_reindex( &self, audit: &mut AuditScope,