Add statistical analysis to indexes (#505)

This commit is contained in:
Firstyear 2021-07-02 14:50:56 +10:00 committed by GitHub
parent 4c6a28b7ad
commit 040e9fd352
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 867 additions and 264 deletions

View file

@ -18,3 +18,5 @@
## Acknowledgements
* M. Gerstner
* Perri Boulton

View file

@ -2,16 +2,22 @@ use crate::audit::AuditScope;
use crate::be::idl_sqlite::{
IdlSqlite, IdlSqliteReadTransaction, IdlSqliteTransaction, IdlSqliteWriteTransaction,
};
use crate::be::idxkey::{IdlCacheKey, IdlCacheKeyRef, IdlCacheKeyToRef};
use crate::be::idxkey::{
IdlCacheKey, IdlCacheKeyRef, IdlCacheKeyToRef, IdxKey, IdxKeyRef, IdxKeyToRef, IdxSlope,
};
use crate::be::{BackendConfig, IdList, IdRawEntry};
use crate::entry::{Entry, EntryCommitted, EntrySealed};
use crate::value::IndexType;
use crate::value::Value;
use concread::arcache::{ARCache, ARCacheReadTxn, ARCacheWriteTxn};
use concread::cowcell::*;
use idlset::{v2::IDLBitRange, AndNot};
use kanidm_proto::v1::{ConsistencyError, OperationError};
use hashbrown::HashMap;
use std::collections::BTreeSet;
use std::convert::TryInto;
use std::ops::DerefMut;
use std::time::Duration;
use uuid::Uuid;
@ -800,6 +806,262 @@ impl<'a> IdlArcSqliteWriteTransaction<'a> {
})
}
/// Returns `true` if index slope analysis data has been generated and
/// persisted (i.e. `analyse_idx_slopes` has run, typically during a
/// reindex). Delegates directly to the underlying sqlite layer.
pub fn is_idx_slopeyness_generated(
    &self,
    audit: &mut AuditScope,
) -> Result<bool, OperationError> {
    self.db.is_idx_slopeyness_generated(audit)
}
/// Fetch the stored slope (query cost weight) for a single index key.
/// Returns `Ok(None)` when no analysis data exists for this key.
/// Delegates directly to the underlying sqlite layer.
pub fn get_idx_slope(
    &self,
    audit: &mut AuditScope,
    ikey: &IdxKey,
) -> Result<Option<IdxSlope>, OperationError> {
    self.db.get_idx_slope(audit, ikey)
}
/// Index Slope Analysis. For the purpose of external modules you can consider this as a
/// module that generates "weights" for each index that we have. Smaller values are faster
/// indexes - larger values are more costly ones. This is not intended to yield perfect
/// weights. The intent is to separate over obviously more effective indexes rather than
/// to min-max the fine tuning of these. Consider name=foo vs class=*. name=foo will always
/// be better than class=*, but comparing name=foo to spn=foo is "much over muchness" since
/// both are really fast.
pub fn analyse_idx_slopes(&mut self, audit: &mut AuditScope) -> Result<(), OperationError> {
    /*
     * Inside of this analysis there are two major factors we need to understand
     *
     * * What is the variation of idl lengths within an index?
     * * How many keys are stored in this index?
     *
     * Since we have the filter2idl threshold, we want to find "what is the smallest
     * and most unique index asap so we can exit faster". This allows us to avoid
     * loading larger, more costly indexes that either have large idls, high variation
     * or few keys and are likely to miss and have to go out to disk.
     *
     * A few methods were proposed, but thanks to advice from Perri Boulton (psychology
     * researcher with a background in statistics), we were able to devise a reasonable
     * approach.
     *
     * These are commented in line to help understand the process.
     */

    /*
     * Step 1 - we have an index like "idx_eq_member". It has data that looks somewhat
     * like:
     *
     *  key    | idl
     *  -------+------------
     *  uuid_a | [1, 2, 3, ...]
     *  -------+------------
     *  uuid_b | [4, 5, 6, ...]
     *
     * We need to collect this into a single vec of "how long is each idl". Since we have
     * each idl in the vec, the length of the vec is also the number of keys in the set.
     * This yields for us:
     *
     *  idx_eq_member: [4.0, 5.0, ...]
     *  where each f64 value is the float representation of the length of idl.
     *
     * We then assemble these to a map so we have each idxkey and it's associated list
     * of idl lens.
     */
    // NOTE: only *dirty* idls in the cache are inspected, so this is intended
    // to run immediately after a reindex, while the write cache still holds
    // every idl that was rewritten.
    let mut data: HashMap<IdxKey, Vec<f64>> = HashMap::new();
    self.idl_cache.iter_dirty().for_each(|(k, maybe_idl)| {
        if let Some(idl) = maybe_idl {
            let idl_len: u32 = idl.len().try_into().unwrap_or(u32::MAX);
            // Convert to something we can use.
            let idl_len = f64::from(idl_len);
            let kref = IdxKeyRef::new(&k.a, &k.i);
            if idl_len > 0.0 {
                // It's worth looking at. Anything len 0 will be removed.
                if let Some(lens) = data.get_mut(&kref as &dyn IdxKeyToRef) {
                    lens.push(idl_len)
                } else {
                    data.insert(kref.to_key(), vec![idl_len]);
                }
            }
        }
    });

    /*
     * So now for each of our sets:
     *
     *  idx_eq_member: [4.0, 5.0, ...]
     *  idx_eq_name  : [1.0, 1.0, 1.0, ...]
     *
     * To get the variability, we calculate the normal distribution of the set of values
     * and then using this variance we use the 1st deviation (~85%) value to assert that
     * 85% or more of the values in this set will be "equal or less" than this length.
     *
     * So given say:
     *  [1.0, 1.0, 1.0, 1.0]
     * We know that the sd_1 will be 1.0. Given:
     *  [1.0, 1.0, 2.0, 3.0]
     * We know that it will be ~2.57 (mean 1.75 + sd of 0.82).
     *
     * The other factor is number of keys. This is thankfully easy! We have that from
     * vec.len().
     *
     * We can now calculate the index slope. Why is it a slope you ask? Because we
     * plot the data out on a graph, with "variability" on the y axis, and number of
     * keys on the x.
     *
     * Lets plot our data we just added.
     *
     *   |
     * 4 +
     *   |
     * 3 +
     *   |
     * 2 +           * eq_member
     *   |
     * 1 +     * eq_name
     *   |
     *   +--+--+--+--+--
     *      1  2  3  4
     *
     * Now, if we were to connect a line from (0,0) to each point we get a line with an angle.
     *
     *   |
     * 4 +
     *   |
     * 3 +
     *   |
     * 2 +           * eq_member
     *   |        /--/
     * 1 +  /--/ * eq_name
     *   |/--/
     *   +--+--+--+--+--
     *      1  2  3  4
     *
     * (Look it's ascii art, don't judge.).
     *
     * Point is that eq_member is "steeper" and eq_name is "shallower". This is what we call
     * the "slopeyness" aka the jank of the line, or more precisely, the angle.
     *
     * Now we need a way to numerically compare these lines. Since the points could be
     * anywhere on our graph:
     *
     *   |
     * 4 +           *
     *   |
     * 3 +        *
     *   |
     * 2 +     *
     *   |
     * 1 +  *
     *   |
     *   +--+--+--+--+--
     *      1  2  3  4
     *
     * While we can see what's obvious or best here, a computer has to know it. So we now
     * assume that these points construct a triangle, going through (0,0), (x, 0) and (x, y).
     *
     *           ^
     *          /|
     *         / |
     *        /  |
     *       /   |  sd_1
     *      /    |
     *     /     |
     *    +------+
     *      nkeys
     *
     * Since this is right angled we can use arctan to work out the degrees of the line. This
     * gives us a value from 1.0 to 90.0 (We clamp to a minimum of 1.0, because we use 0 as "None"
     * in the NonZeroU8 type in filter.rs, which allows ZST optimisation)
     *
     * The problem is that we have to go from float to u8 - this means we lose decimal precision
     * in the conversion. To lessen this, we multiply by 2.8 (see calculate_sd_slope) to give
     * some extra weight to each angle to minimise this loss and then we convert.
     *
     * And there we have it! A slope factor of the index! A way to compare these sets quickly
     * at query optimisation time to minimise index access.
     */
    let slopes: HashMap<_, _> = data
        .into_iter()
        .filter_map(|(k, lens)| {
            let slope_factor = Self::calculate_sd_slope(lens);
            // 0 and MAX are sentinel values meaning "no usable analysis";
            // skip these so lookups fall back to the hardcoded defaults.
            if slope_factor == 0 || slope_factor == IdxSlope::MAX {
                None
            } else {
                Some((k, slope_factor))
            }
        })
        .collect();
    ltrace!(audit, "Generated slopes -> {:?}", slopes);
    // Write the data down
    self.db.store_idx_slope_analysis(audit, &slopes)
}
/// Reduce the idl lengths of one index into a single "slope" weight.
///
/// The slope is the angle of the hypotenuse of the right triangle with
/// legs (number of keys, sd_1), where sd_1 = mean + one standard
/// deviation of the idl lengths. atan yields 0.0..90.0 degrees; this is
/// scaled by 2.8 to spread the value over most of the u8 domain before
/// truncation, then clamped to 1..=254 (0 is reserved as "None" for the
/// NonZeroU8 in filter.rs, 255 / `IdxSlope::MAX` means "unusable").
/// Small results indicate fast, selective indexes; large results
/// indicate costly ones. Empty input returns `IdxSlope::MAX`.
fn calculate_sd_slope(data: Vec<f64>) -> IdxSlope {
    let (n_keys, sd_1) = if data.len() >= 2 {
        // Standard deviation needs at least two samples.
        let l: u32 = data.len().try_into().unwrap_or(u32::MAX);
        let c = f64::from(l);
        let mean = data.iter().sum::<f64>() / c;
        // Sample variance (Bessel's correction: divide by n - 1).
        let variance: f64 = data
            .iter()
            .map(|len| {
                let delta = mean - len;
                delta * delta
            })
            .sum::<f64>()
            / (c - 1.0);
        let sd = variance.sqrt();
        // This is saying ~85% of values will be at least this len or less.
        let sd_1 = mean + sd;
        (c, sd_1)
    } else if data.len() == 1 {
        (1.0, data[0])
    } else {
        // Can't resolve with no data - assume the worst.
        return IdxSlope::MAX;
    };

    // Now we know sd_1 and number of keys. We can use this as a triangle to work out
    // the angle along the hypotenuse. We use this angle - or slope - to show which
    // elements have the smallest sd_1 and most keys available. Then because this
    // is bound between 0.0 -> 90.0, we scale by 2.8 to use more of the u8 range,
    // which gives a little more precision when we drop the decimal point.
    let sf = (sd_1 / n_keys).atan().to_degrees() * 2.8;
    // Now these are fractions, and we can't use those in u8, so we clamp the min/max
    // values that we expect to be yielded.
    let sf = sf.clamp(1.0, 254.0);
    if !sf.is_finite() {
        // NaN propagates through clamp - treat it as the worst case.
        IdxSlope::MAX
    } else {
        // `sf` is finite and clamped to 1.0..=254.0, so the cast truncates
        // to a value that always fits in u8. A safe `as` cast gives the
        // same result as the previous `to_int_unchecked` without unsafe.
        sf as IdxSlope
    }
}
pub fn create_name2uuid(&self, audit: &mut AuditScope) -> Result<(), OperationError> {
self.db.create_name2uuid(audit)
}

View file

@ -1,7 +1,8 @@
use crate::audit::AuditScope;
use crate::be::{BackendConfig, IdList, IdRawEntry};
use crate::be::{BackendConfig, IdList, IdRawEntry, IdxKey, IdxSlope};
use crate::entry::{Entry, EntryCommitted, EntrySealed};
use crate::value::{IndexType, Value};
use hashbrown::HashMap;
use idlset::v2::IDLBitRange;
use kanidm_proto::v1::{ConsistencyError, OperationError};
use r2d2::Pool;
@ -195,13 +196,7 @@ pub trait IdlSqliteTransaction {
}
}
fn exists_idx(
&self,
audit: &mut AuditScope,
attr: &str,
itype: &IndexType,
) -> Result<bool, OperationError> {
let tname = format!("idx_{}_{}", itype.as_idx_str(), attr);
fn exists_table(&self, audit: &mut AuditScope, tname: &str) -> Result<bool, OperationError> {
let mut stmt = self
.get_conn()
.prepare("SELECT COUNT(name) from sqlite_master where name = :tname")
@ -210,7 +205,7 @@ pub trait IdlSqliteTransaction {
OperationError::SqliteError
})?;
let i: Option<i64> = stmt
.query_row(&[(":tname", &tname)], |row| row.get(0))
.query_row(&[(":tname", tname)], |row| row.get(0))
.map_err(|e| {
ladmin_error!(audit, "SQLite Error {:?}", e);
OperationError::SqliteError
@ -223,6 +218,16 @@ pub trait IdlSqliteTransaction {
}
}
/// Check whether the index table for this attribute and index type
/// exists. Index tables are named `idx_<itype>_<attr>`.
fn exists_idx(
    &self,
    audit: &mut AuditScope,
    attr: &str,
    itype: &IndexType,
) -> Result<bool, OperationError> {
    let tname = format!("idx_{}_{}", itype.as_idx_str(), attr);
    self.exists_table(audit, &tname)
}
fn get_idl(
&self,
audit: &mut AuditScope,
@ -1067,6 +1072,94 @@ impl IdlSqliteWriteTransaction {
})
}
/// Persist the results of an index slope analysis.
///
/// Creates the `idxslope_analysis` table if it does not exist, removes
/// any previous analysis rows, then stores one `(id, slope)` row per
/// index key. The id format `idx_<itype>_<attr>` matches the index
/// table naming used by `exists_idx` / `get_idx_slope`.
pub fn store_idx_slope_analysis(
    &self,
    audit: &mut AuditScope,
    slopes: &HashMap<IdxKey, IdxSlope>,
) -> Result<(), OperationError> {
    self.conn
        .execute(
            "CREATE TABLE IF NOT EXISTS idxslope_analysis (
                id TEXT PRIMARY KEY,
                slope INTEGER
            )",
            [],
        )
        .map(|_| ())
        .map_err(|e| {
            ladmin_error!(audit, "SQLite Error {:?}", e);
            OperationError::SqliteError
        })?;
    // Remove any data if it exists.
    self.conn
        .execute("DELETE FROM idxslope_analysis", [])
        .map(|_| ())
        .map_err(|e| {
            ladmin_error!(audit, "SQLite Error {:?}", e);
            OperationError::SqliteError
        })?;
    slopes.iter().try_for_each(|(k, v)| {
        let key = format!("idx_{}_{}", k.itype.as_idx_str(), k.attr);
        self.conn
            .execute(
                "INSERT OR REPLACE INTO idxslope_analysis (id, slope) VALUES(:id, :slope)",
                named_params! {
                    ":id": &key,
                    ":slope": &v,
                },
            )
            .map(|_| ())
            .map_err(|e| {
                // Report through the audit log like every other failure
                // path in this function (was a bare eprintln! before).
                ladmin_error!(audit, "SQLite Error {:?}", e);
                OperationError::SqliteError
            })
    })
}
/// Returns `true` if the `idxslope_analysis` table exists, i.e. slope
/// analysis has been run and persisted at least once.
pub fn is_idx_slopeyness_generated(
    &self,
    audit: &mut AuditScope,
) -> Result<bool, OperationError> {
    self.exists_table(audit, "idxslope_analysis")
}
/// Look up the stored slope for one index key.
///
/// Returns `Ok(None)` both when the analysis table does not exist at
/// all, and when the table exists but holds no row for this key.
pub fn get_idx_slope(
    &self,
    audit: &mut AuditScope,
    ikey: &IdxKey,
) -> Result<Option<IdxSlope>, OperationError> {
    let analysis_exists = self.exists_table(audit, "idxslope_analysis")?;
    if !analysis_exists {
        return Ok(None);
    }

    // Then we have the table and it should have things in it, lets put
    // it all together. Key format matches store_idx_slope_analysis:
    // idx_<itype>_<attr>.
    let key = format!("idx_{}_{}", ikey.itype.as_idx_str(), ikey.attr);

    let mut stmt = self
        .get_conn()
        .prepare("SELECT slope FROM idxslope_analysis WHERE id = :id")
        .map_err(|e| {
            ladmin_error!(audit, "SQLite Error {:?}", e);
            OperationError::SqliteError
        })?;

    let slope: Option<IdxSlope> = stmt
        .query_row(&[(":id", &key)], |row| row.get(0))
        // We don't mind if it doesn't exist
        .optional()
        .map_err(|e| {
            ladmin_error!(audit, "SQLite Error {:?}", e);
            OperationError::SqliteError
        })?;
    ltrace!(audit, "Got slope for index name {} -> {:?}", key, slope);

    Ok(slope)
}
pub unsafe fn purge_id2entry(&self, audit: &mut AuditScope) -> Result<(), OperationError> {
ltrace!(audit, "purge id2entry ...");
self.conn
@ -1303,7 +1396,7 @@ impl IdlSqliteWriteTransaction {
dbv_id2entry
);
}
// * if v3 -> complete.
// * if v3 -> create name2uuid, uuid2spn, uuid2rdn.
if dbv_id2entry == 3 {
self.create_name2uuid(audit)
.and_then(|_| self.create_uuid2spn(audit))

View file

@ -4,6 +4,8 @@ use std::borrow::Borrow;
use std::cmp::Ordering;
use std::hash::{Hash, Hasher};
pub type IdxSlope = u8;
// Huge props to https://github.com/sunshowers/borrow-complex-key-example/blob/master/src/lib.rs
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
@ -12,6 +14,15 @@ pub struct IdxKey {
pub itype: IndexType,
}
impl IdxKey {
    /// Build an owned `IdxKey` from a borrowed attribute name and an
    /// index type.
    pub fn new(attr: &str, itype: IndexType) -> Self {
        let attr = attr.into();
        IdxKey { attr, itype }
    }
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct IdxKeyRef<'a> {
pub attr: &'a str,
@ -22,6 +33,13 @@ impl<'a> IdxKeyRef<'a> {
pub fn new(attr: &'a str, itype: &'a IndexType) -> Self {
IdxKeyRef { attr, itype }
}
/// Produce an owned `IdxKey` duplicating this borrowed key's data.
pub fn to_key(&self) -> IdxKey {
    let attr = self.attr.into();
    let itype = self.itype.clone();
    IdxKey { attr, itype }
}
}
pub trait IdxKeyToRef {

View file

@ -7,7 +7,8 @@
use std::fs;
use crate::value::IndexType;
use hashbrown::HashSet as Set;
use hashbrown::HashMap as Map;
use hashbrown::HashSet;
use std::cell::UnsafeCell;
use std::sync::Arc;
@ -32,7 +33,7 @@ mod idl_arc_sqlite;
mod idl_sqlite;
pub(crate) mod idxkey;
pub(crate) use self::idxkey::{IdxKey, IdxKeyRef, IdxKeyToRef};
pub(crate) use self::idxkey::{IdxKey, IdxKeyRef, IdxKeyToRef, IdxSlope};
use crate::be::idl_arc_sqlite::{
IdlArcSqlite, IdlArcSqliteReadTransaction, IdlArcSqliteTransaction,
@ -61,11 +62,11 @@ pub struct IdRawEntry {
#[derive(Debug, Clone)]
pub struct IdxMeta {
pub idxkeys: Set<IdxKey>,
pub idxkeys: Map<IdxKey, IdxSlope>,
}
impl IdxMeta {
pub fn new(idxkeys: Set<IdxKey>) -> Self {
pub fn new(idxkeys: Map<IdxKey, IdxSlope>) -> Self {
IdxMeta { idxkeys }
}
}
@ -158,7 +159,7 @@ pub trait BackendTransaction {
) -> Result<(IdList, FilterPlan), OperationError> {
Ok(match filt {
FilterResolved::Eq(attr, value, idx) => {
if *idx {
if idx.is_some() {
// Get the idx_key
let idx_key = value.get_idx_eq_key();
// Get the idl for this
@ -178,7 +179,7 @@ pub trait BackendTransaction {
}
}
FilterResolved::Sub(attr, subvalue, idx) => {
if *idx {
if idx.is_some() {
// Get the idx_key
let idx_key = subvalue.get_idx_sub_key();
// Get the idl for this
@ -198,7 +199,7 @@ pub trait BackendTransaction {
}
}
FilterResolved::Pres(attr, idx) => {
if *idx {
if idx.is_some() {
// Get the idl for this
match self.get_idlayer().get_idl(
au,
@ -218,7 +219,7 @@ pub trait BackendTransaction {
// We have no process for indexing this right now.
(IdList::AllIds, FilterPlan::LessThanUnindexed(attr.clone()))
}
FilterResolved::Or(l) => {
FilterResolved::Or(l, _) => {
// Importantly if this has no inner elements, this returns
// an empty list.
let mut plan = Vec::new();
@ -270,7 +271,7 @@ pub trait BackendTransaction {
(IdList::Indexed(result), setplan)
}
}
FilterResolved::And(l) => {
FilterResolved::And(l, _) => {
// This algorithm is a little annoying. I couldn't get it to work with iter and
// folds due to the logic needed ...
@ -382,7 +383,7 @@ pub trait BackendTransaction {
for f in f_andnot.iter() {
f_rem_count -= 1;
let f_in = match f {
FilterResolved::AndNot(f_in) => f_in,
FilterResolved::AndNot(f_in, _) => f_in,
_ => {
lfilter_error!(
au,
@ -465,7 +466,7 @@ pub trait BackendTransaction {
// debug!("final cand set ==> {:?}", cand_idl);
(cand_idl, setplan)
} // end and
FilterResolved::Inclusion(l) => {
FilterResolved::Inclusion(l, _) => {
// For inclusion to be valid, every term must have *at least* one element present.
// This really relies on indexing, and so it's internal only - generally only
// for fully indexed existance queries, such as from refint.
@ -507,7 +508,7 @@ pub trait BackendTransaction {
// of an "AND" query, but if it's used anywhere else IE the root filter, then there is
// no other set to exclude - therefore it's empty set. Additionally, even in an OR query
// the AndNot will be skipped as an empty set for the same reason.
FilterResolved::AndNot(_f) => {
FilterResolved::AndNot(_f, _) => {
// get the idl for f
// now do andnot?
lfilter_error!(
@ -1046,8 +1047,32 @@ impl<'a> BackendWriteTransaction<'a> {
})
}
pub fn update_idxmeta(&mut self, mut idxkeys: Set<IdxKey>) {
/// Reload the set of indexed attributes, attaching a slope (query cost
/// weight) to each key from stored analysis data where available, and
/// swap the result into the shared idxmeta.
pub fn update_idxmeta(
    &mut self,
    audit: &mut AuditScope,
    idxkeys: Vec<IdxKey>,
) -> Result<(), OperationError> {
    if self.is_idx_slopeyness_generated(audit)? {
        ltrace!(audit, "Indexing slopes available");
    } else {
        ladmin_warning!(
            audit,
            "No indexing slopes available. You should consider reindexing to generate these"
        );
    };

    // Setup idxkeys here. By default we set these all to "max slope" aka
    // all indexes are "equal" but also worse case unless analysed. If they
    // have been analysed, we can set the slope factor into here.
    // (get_idx_slope always yields a value - it substitutes the hardcoded
    // defaults when no analysis row exists.)
    let idxkeys: Result<Map<_, _>, _> = idxkeys
        .into_iter()
        .map(|k| self.get_idx_slope(audit, &k).map(|slope| (k, slope)))
        .collect();

    let mut idxkeys = idxkeys?;

    // Swap the freshly built map into the copy-on-write idxmeta cell.
    std::mem::swap(&mut self.idxmeta_wr.deref_mut().idxkeys, &mut idxkeys);
    Ok(())
}
// Should take a mut index set, and then we write the whole thing back
@ -1234,12 +1259,12 @@ impl<'a> BackendWriteTransaction<'a> {
let idx_table_list = self.get_idlayer().list_idxs(audit)?;
// Turn the vec to a real set
let idx_table_set: Set<_> = idx_table_list.into_iter().collect();
let idx_table_set: HashSet<_> = idx_table_list.into_iter().collect();
let missing: Vec<_> = self
.idxmeta
.idxkeys
.iter()
.keys()
.filter_map(|ikey| {
// what would the table name be?
let tname = format!("idx_{}_{}", ikey.itype.as_idx_str(), ikey.attr.as_str());
@ -1269,7 +1294,7 @@ impl<'a> BackendWriteTransaction<'a> {
self.idxmeta
.idxkeys
.iter()
.keys()
.try_for_each(|ikey| idlayer.create_idx(audit, &ikey.attr, &ikey.itype))
}
@ -1327,7 +1352,12 @@ impl<'a> BackendWriteTransaction<'a> {
limmediate_warning!(audit, "Optimising Indexes ... ");
idlayer.optimise_dirty_idls(audit);
limmediate_warning!(audit, "done ✅\n");
limmediate_warning!(audit, "Calculating Index Optimisation Slopes ... ");
idlayer.analyse_idx_slopes(audit).map_err(|e| {
ladmin_error!(audit, "index optimisation failed -> {:?}", e);
e
})?;
limmediate_warning!(audit, "done ✅\n");
Ok(())
}
@ -1347,6 +1377,24 @@ impl<'a> BackendWriteTransaction<'a> {
self.get_idlayer().get_idl(audit, attr, itype, idx_key)
}
/// Whether slope analysis data has been generated and persisted yet.
fn is_idx_slopeyness_generated(&self, audit: &mut AuditScope) -> Result<bool, OperationError> {
    self.get_idlayer().is_idx_slopeyness_generated(audit)
}
/// Resolve the slope for an index key, falling back to the hardcoded
/// defaults (`get_idx_slope_default`) when no analysis data is stored.
/// Unlike the idlayer method, this always yields a concrete slope.
fn get_idx_slope(
    &self,
    audit: &mut AuditScope,
    ikey: &IdxKey,
) -> Result<IdxSlope, OperationError> {
    // Do we have the slopeyness?
    let slope = self
        .get_idlayer()
        .get_idx_slope(audit, ikey)?
        .unwrap_or_else(|| get_idx_slope_default(ikey));
    ltrace!(audit, "index slope - {:?} -> {:?}", ikey, slope);
    Ok(slope)
}
pub fn restore(&self, audit: &mut AuditScope, src_path: &str) -> Result<(), OperationError> {
let idlayer = self.get_idlayer();
// load all entries into RAM, may need to change this later
@ -1369,35 +1417,6 @@ impl<'a> BackendWriteTransaction<'a> {
OperationError::SerdeJsonError
})?;
// Filter all elements that have a UUID in the system range.
/*
use crate::constants::UUID_ANONYMOUS;
use crate::be::dbentry::DbEntryVers;
use crate::be::dbvalue::DbValueV1;
let uuid_anonymous = UUID_ANONYMOUS.clone();
let dbentries: Vec<DbEntry> = dbentries.into_iter()
.filter(|e| {
let e_uuid = match &e.ent {
DbEntryVers::V1(dbe) => dbe.attrs.get("uuid")
.and_then(|dbvs| dbvs.first())
.and_then(|dbv| {
match dbv {
DbValueV1::UU(u) => Some(u),
_ => panic!(),
}
})
.unwrap()
};
e_uuid > &uuid_anonymous
})
.collect();
dbentries.iter().for_each(|e| {
ltrace!(audit, "importing -> {:?}", e);
});
*/
// Now, we setup all the entries with new ids.
let mut id_max = 0;
let identries: Result<Vec<IdRawEntry>, _> = dbentries
@ -1411,16 +1430,6 @@ impl<'a> BackendWriteTransaction<'a> {
idlayer.write_identries_raw(audit, identries?.into_iter())?;
// for debug
/*
self.idlayer.get_identry(audit, &IdList::AllIds)
.unwrap()
.iter()
.for_each(|dbe| {
ltrace!(audit, "dbe -> {:?}", dbe.id);
});
*/
// Reindex now we are loaded.
self.reindex(audit)?;
@ -1505,6 +1514,21 @@ impl<'a> BackendWriteTransaction<'a> {
}
}
// We have a number of hardcoded, "obvious" slopes that should
// exist. We return these when the analysis has not been run, as
// these are values that are generally "good enough" for most applications
fn get_idx_slope_default(ikey: &IdxKey) -> IdxSlope {
match (ikey.attr.as_str(), &ikey.itype) {
("name", IndexType::Equality)
| ("spn", IndexType::Equality)
| ("uuid", IndexType::Equality) => 1,
("class", IndexType::Equality) => 180,
(_, IndexType::Equality) => 45,
(_, IndexType::SubString) => 90,
(_, IndexType::Presence) => 90,
}
}
// In the future this will do the routing between the chosen backends etc.
impl Backend {
pub fn new(
@ -1513,7 +1537,7 @@ impl Backend {
// path: &str,
// mut pool_size: u32,
// fstype: FsType,
idxkeys: Set<IdxKey>,
idxkeys: Vec<IdxKey>,
vacuum: bool,
) -> Result<Self, OperationError> {
info!("DB tickets -> {:?}", cfg.pool_size);
@ -1525,6 +1549,19 @@ impl Backend {
cfg.pool_size = 1;
}
// Setup idxkeys here. By default we set these all to "max slope" aka
// all indexes are "equal" but also worse case unless analysed.
//
// During startup this will be "fixed" as the schema core will call reload_idxmeta
// which will trigger a reload of the analysis data (if present).
let idxkeys: Map<_, _> = idxkeys
.into_iter()
.map(|ikey| {
let slope = get_idx_slope_default(&ikey);
(ikey, slope)
})
.collect();
// this has a ::memory() type, but will path == "" work?
lperf_trace_segment!(audit, "be::new", || {
let idlayer = Arc::new(IdlArcSqlite::new(audit, &cfg, vacuum)?);
@ -1597,7 +1634,6 @@ impl Backend {
#[cfg(test)]
mod tests {
use hashbrown::HashSet as Set;
use idlset::v2::IDLBitRange;
use std::fs;
use std::iter::FromIterator;
@ -1626,35 +1662,36 @@ mod tests {
let mut audit = AuditScope::new("run_test", uuid::Uuid::new_v4(), None);
// This is a demo idxmeta, purely for testing.
let mut idxmeta = Set::with_capacity(16);
idxmeta.insert(IdxKey {
attr: AttrString::from("name"),
itype: IndexType::Equality,
});
idxmeta.insert(IdxKey {
attr: AttrString::from("name"),
itype: IndexType::Presence,
});
idxmeta.insert(IdxKey {
attr: AttrString::from("name"),
itype: IndexType::SubString,
});
idxmeta.insert(IdxKey {
attr: AttrString::from("uuid"),
itype: IndexType::Equality,
});
idxmeta.insert(IdxKey {
attr: AttrString::from("uuid"),
itype: IndexType::Presence,
});
idxmeta.insert(IdxKey {
attr: AttrString::from("ta"),
itype: IndexType::Equality,
});
idxmeta.insert(IdxKey {
attr: AttrString::from("tb"),
itype: IndexType::Equality,
});
let idxmeta = vec![
IdxKey {
attr: AttrString::from("name"),
itype: IndexType::Equality,
},
IdxKey {
attr: AttrString::from("name"),
itype: IndexType::Presence,
},
IdxKey {
attr: AttrString::from("name"),
itype: IndexType::SubString,
},
IdxKey {
attr: AttrString::from("uuid"),
itype: IndexType::Equality,
},
IdxKey {
attr: AttrString::from("uuid"),
itype: IndexType::Presence,
},
IdxKey {
attr: AttrString::from("ta"),
itype: IndexType::Equality,
},
IdxKey {
attr: AttrString::from("tb"),
itype: IndexType::Equality,
},
];
let be = Backend::new(&mut audit, BackendConfig::new_test(), idxmeta, false)
.expect("Failed to setup backend");
@ -2765,6 +2802,103 @@ mod tests {
})
}
#[test]
fn test_be_index_slope_generation() {
    run_test!(|audit: &mut AuditScope, be: &mut BackendWriteTransaction| {
        // Create some test entries with some indexed / unindexed values.
        // "ta" is fully duplicated across entries, "tb" has two distinct
        // values, while name/uuid are unique per entry.
        let mut e1: Entry<EntryInit, EntryNew> = Entry::new();
        e1.add_ava("name", Value::from("william"));
        e1.add_ava("uuid", Value::from("db237e8a-0079-4b8c-8a56-593b22aa44d1"));
        e1.add_ava("ta", Value::from("dupe"));
        e1.add_ava("tb", Value::from("1"));
        let e1 = unsafe { e1.into_sealed_new() };

        let mut e2: Entry<EntryInit, EntryNew> = Entry::new();
        e2.add_ava("name", Value::from("claire"));
        e2.add_ava("uuid", Value::from("db237e8a-0079-4b8c-8a56-593b22aa44d2"));
        e2.add_ava("ta", Value::from("dupe"));
        e2.add_ava("tb", Value::from("1"));
        let e2 = unsafe { e2.into_sealed_new() };

        let mut e3: Entry<EntryInit, EntryNew> = Entry::new();
        e3.add_ava("name", Value::from("benny"));
        e3.add_ava("uuid", Value::from("db237e8a-0079-4b8c-8a56-593b22aa44d3"));
        e3.add_ava("ta", Value::from("dupe"));
        e3.add_ava("tb", Value::from("2"));
        let e3 = unsafe { e3.into_sealed_new() };

        let _rset = be
            .create(audit, vec![e1.clone(), e2.clone(), e3.clone()])
            .unwrap();

        // If the slopes haven't been generated yet, there are some hardcoded values
        // that we can use instead. They aren't generated until a first re-index.
        // Check the slopes are what we expect for the hardcoded defaults
        // (see get_idx_slope_default).
        assert!(!be.is_idx_slopeyness_generated(audit).unwrap());

        let ta_eq_slope = be
            .get_idx_slope(audit, &IdxKey::new("ta", IndexType::Equality))
            .unwrap();
        assert_eq!(ta_eq_slope, 45);

        let tb_eq_slope = be
            .get_idx_slope(audit, &IdxKey::new("tb", IndexType::Equality))
            .unwrap();
        assert_eq!(tb_eq_slope, 45);

        let name_eq_slope = be
            .get_idx_slope(audit, &IdxKey::new("name", IndexType::Equality))
            .unwrap();
        assert_eq!(name_eq_slope, 1);
        let uuid_eq_slope = be
            .get_idx_slope(audit, &IdxKey::new("uuid", IndexType::Equality))
            .unwrap();
        assert_eq!(uuid_eq_slope, 1);

        let name_pres_slope = be
            .get_idx_slope(audit, &IdxKey::new("name", IndexType::Presence))
            .unwrap();
        assert_eq!(name_pres_slope, 90);
        let uuid_pres_slope = be
            .get_idx_slope(audit, &IdxKey::new("uuid", IndexType::Presence))
            .unwrap();
        assert_eq!(uuid_pres_slope, 90);

        // Now check slope generation for the values. Today these are calculated
        // at reindex time, so we now perform the re-index.
        assert!(be.reindex(audit).is_ok());
        assert!(be.is_idx_slopeyness_generated(audit).unwrap());

        // Expected values derive from calculate_sd_slope:
        // atan(sd_1 / n_keys).to_degrees() * 2.8, truncated to u8.
        let ta_eq_slope = be
            .get_idx_slope(audit, &IdxKey::new("ta", IndexType::Equality))
            .unwrap();
        assert_eq!(ta_eq_slope, 200);

        let tb_eq_slope = be
            .get_idx_slope(audit, &IdxKey::new("tb", IndexType::Equality))
            .unwrap();
        assert_eq!(tb_eq_slope, 133);

        let name_eq_slope = be
            .get_idx_slope(audit, &IdxKey::new("name", IndexType::Equality))
            .unwrap();
        assert_eq!(name_eq_slope, 51);
        let uuid_eq_slope = be
            .get_idx_slope(audit, &IdxKey::new("uuid", IndexType::Equality))
            .unwrap();
        assert_eq!(uuid_eq_slope, 51);

        let name_pres_slope = be
            .get_idx_slope(audit, &IdxKey::new("name", IndexType::Presence))
            .unwrap();
        assert_eq!(name_pres_slope, 200);
        let uuid_pres_slope = be
            .get_idx_slope(audit, &IdxKey::new("uuid", IndexType::Presence))
            .unwrap();
        assert_eq!(uuid_pres_slope, 200);
    })
}
#[test]
fn test_be_limits_allids() {
run_test!(|audit: &mut AuditScope, be: &mut BackendWriteTransaction| {

View file

@ -38,14 +38,14 @@ use kanidm_proto::v1::Filter as ProtoFilter;
use kanidm_proto::v1::{OperationError, SchemaError};
use crate::be::dbentry::{DbEntry, DbEntryV1, DbEntryVers};
use crate::be::IdxKey;
use crate::be::{IdxKey, IdxSlope};
use ldap3_server::simple::{LdapPartialAttribute, LdapSearchResultEntry};
use std::collections::BTreeMap as Map;
pub use std::collections::BTreeSet as Set;
use std::collections::BTreeSet;
// use hashbrown::HashMap as Map;
use hashbrown::HashSet;
use hashbrown::HashMap;
use smartstring::alias::String as AttrString;
use time::OffsetDateTime;
use uuid::Uuid;
@ -1097,7 +1097,7 @@ impl Entry<EntrySealed, EntryCommitted> {
/// Given the previous and current state of this entry, determine the indexing differential
/// that needs to be applied. i.e. what indexes must be created, modified and removed.
pub(crate) fn idx_diff<'a>(
idxmeta: &'a HashSet<IdxKey>,
idxmeta: &'a HashMap<IdxKey, IdxSlope>,
pre: Option<&Self>,
post: Option<&Self>,
) -> IdxDiff<'a> {
@ -1113,7 +1113,7 @@ impl Entry<EntrySealed, EntryCommitted> {
(Some(pre_e), None) => {
// If we are none (?), yield our pre-state as removals.
idxmeta
.iter()
.keys()
.flat_map(|ikey| {
match pre_e.get_ava(ikey.attr.as_str()) {
None => Vec::new(),
@ -1143,7 +1143,7 @@ impl Entry<EntrySealed, EntryCommitted> {
(None, Some(post_e)) => {
// If the pre-state is none, yield our additions.
idxmeta
.iter()
.keys()
.flat_map(|ikey| {
match post_e.get_ava(ikey.attr.as_str()) {
None => Vec::new(),
@ -1175,7 +1175,7 @@ impl Entry<EntrySealed, EntryCommitted> {
(Some(pre_e), Some(post_e)) => {
assert!(pre_e.state.id == post_e.state.id);
idxmeta
.iter()
.keys()
.flat_map(|ikey| {
match (
pre_e.get_ava_set(ikey.attr.as_str()),
@ -1804,16 +1804,16 @@ impl<VALID, STATE> Entry<VALID, STATE> {
self.attribute_lessthan(attr.as_str(), subvalue)
}
// Check with ftweedal about or filter zero len correctness.
FilterResolved::Or(l) => l.iter().any(|f| self.entry_match_no_index_inner(f)),
FilterResolved::Or(l, _) => l.iter().any(|f| self.entry_match_no_index_inner(f)),
// Check with ftweedal about and filter zero len correctness.
FilterResolved::And(l) => l.iter().all(|f| self.entry_match_no_index_inner(f)),
FilterResolved::Inclusion(_) => {
FilterResolved::And(l, _) => l.iter().all(|f| self.entry_match_no_index_inner(f)),
FilterResolved::Inclusion(_, _) => {
// An inclusion doesn't make sense on an entry in isolation!
// Inclusions are part of exists queries, on search they mean
// nothing!
false
}
FilterResolved::AndNot(f) => !self.entry_match_no_index_inner(f),
FilterResolved::AndNot(f, _) => !self.entry_match_no_index_inner(f),
}
}
@ -2091,11 +2091,11 @@ impl From<&SchemaClass> for Entry<EntryInit, EntryNew> {
#[cfg(test)]
mod tests {
use crate::be::IdxKey;
use crate::be::{IdxKey, IdxSlope};
use crate::entry::{Entry, EntryInit, EntryInvalid, EntryNew};
use crate::modify::{Modify, ModifyList};
use crate::value::{IndexType, PartialValue, Value};
use hashbrown::HashSet;
use hashbrown::HashMap;
use smartstring::alias::String as AttrString;
use std::collections::BTreeSet as Set;
@ -2270,19 +2270,28 @@ mod tests {
e2.add_ava("userid", Value::from("claire"));
let e2 = unsafe { e2.into_sealed_committed() };
let mut idxmeta = HashSet::with_capacity(8);
idxmeta.insert(IdxKey {
attr: AttrString::from("userid"),
itype: IndexType::Equality,
});
idxmeta.insert(IdxKey {
attr: AttrString::from("userid"),
itype: IndexType::Presence,
});
idxmeta.insert(IdxKey {
attr: AttrString::from("extra"),
itype: IndexType::Equality,
});
let mut idxmeta = HashMap::with_capacity(8);
idxmeta.insert(
IdxKey {
attr: AttrString::from("userid"),
itype: IndexType::Equality,
},
IdxSlope::MAX,
);
idxmeta.insert(
IdxKey {
attr: AttrString::from("userid"),
itype: IndexType::Presence,
},
IdxSlope::MAX,
);
idxmeta.insert(
IdxKey {
attr: AttrString::from("extra"),
itype: IndexType::Equality,
},
IdxSlope::MAX,
);
// When we do None, None, we get nothing back.
let r1 = Entry::idx_diff(&idxmeta, None, None);

View file

@ -8,25 +8,28 @@
//! [`Filter`]: struct.Filter.html
//! [`Entry`]: ../entry/struct.Entry.html
use crate::be::{IdxKey, IdxKeyRef, IdxKeyToRef, IdxMeta};
use crate::be::{IdxKey, IdxKeyRef, IdxKeyToRef, IdxMeta, IdxSlope};
use crate::identity::IdentityId;
use crate::ldap::ldap_attr_filter_map;
use crate::prelude::*;
use crate::schema::SchemaTransaction;
use crate::value::{IndexType, PartialValue};
use hashbrown::HashSet;
use concread::arcache::ARCacheReadTxn;
use kanidm_proto::v1::Filter as ProtoFilter;
use kanidm_proto::v1::{OperationError, SchemaError};
use ldap3_server::proto::{LdapFilter, LdapSubstringFilter};
// use smartstring::alias::String;
use concread::arcache::ARCacheReadTxn;
use smartstring::alias::String as AttrString;
// use smartstring::alias::String as AttrString;
use std::cmp::{Ordering, PartialOrd};
use std::collections::BTreeSet;
use std::hash::Hash;
use std::iter;
use std::num::NonZeroU8;
use uuid::Uuid;
use hashbrown::HashMap;
#[cfg(test)]
use hashbrown::HashSet;
const FILTER_DEPTH_MAX: usize = 16;
// Default filter is safe, ignores all hidden types!
@ -104,8 +107,8 @@ pub fn f_spn_name(id: &str) -> FC<'static> {
FC::Or(f)
}
// This is the short-form for tests and internal filters that can then
// be transformed into a filter for the server to use.
/// This is the short-form for tests and internal filters that can then
/// be transformed into a filter for the server to use.
#[derive(Debug, Deserialize)]
pub enum FC<'a> {
Eq(&'a str, PartialValue),
@ -120,7 +123,7 @@ pub enum FC<'a> {
// Not(Box<FC>),
}
// This is the filters internal representation.
/// This is the filters internal representation
#[derive(Debug, Clone, Hash, PartialEq, PartialOrd, Ord, Eq)]
enum FilterComp {
// This is attr - value
@ -137,22 +140,28 @@ enum FilterComp {
// Not(Box<FilterComp>),
}
// This is the fully resolved internal representation. Note the lack of Not and selfUUID
// because these are resolved into And(Pres(class), AndNot(term)) and Eq(uuid, ...).
// Importantly, we make this accessible to Entry so that it can then match on filters
// properly.
#[derive(Debug, Clone, PartialEq, Eq)]
/// This is the fully resolved internal representation. Note the lack of Not and selfUUID
/// because these are resolved into And(Pres(class), AndNot(term)) and Eq(uuid, ...).
/// Importantly, we make this accessible to Entry so that it can then match on filters
/// internally.
///
/// Each filter that has been resolved also has been enriched with metadata about its
/// index, and index slope. For the purpose of this module, consider slope as a "weight"
/// where small value - faster index, larger value - slower index. This metadata is extremely
/// important for the query optimiser to make decisions about how to re-arrange queries
/// correctly.
#[derive(Debug, Clone, Eq)]
pub enum FilterResolved {
// This is attr - value - indexed
Eq(AttrString, PartialValue, bool),
Sub(AttrString, PartialValue, bool),
Pres(AttrString, bool),
LessThan(AttrString, PartialValue, bool),
Or(Vec<FilterResolved>),
And(Vec<FilterResolved>),
// This is attr - value - indexed slope factor
Eq(AttrString, PartialValue, Option<NonZeroU8>),
Sub(AttrString, PartialValue, Option<NonZeroU8>),
Pres(AttrString, Option<NonZeroU8>),
LessThan(AttrString, PartialValue, Option<NonZeroU8>),
Or(Vec<FilterResolved>, Option<NonZeroU8>),
And(Vec<FilterResolved>, Option<NonZeroU8>),
// All terms must have 1 or more items, or the inclusion is false!
Inclusion(Vec<FilterResolved>),
AndNot(Box<FilterResolved>),
Inclusion(Vec<FilterResolved>, Option<NonZeroU8>),
AndNot(Box<FilterResolved>, Option<NonZeroU8>),
}
#[derive(Debug, Clone, PartialEq)]
@ -903,26 +912,6 @@ impl PartialEq for Filter<FilterValidResolved> {
}
*/
/*
impl PartialEq for FilterResolved {
fn eq(&self, rhs: &FilterResolved) -> bool {
match (self, rhs) {
(FilterResolved::Eq(a1, v1, i1), FilterResolved::Eq(a2, v2, i2)) => {
a1 == a2 && v1 == v2 && i1 == i2
}
(FilterResolved::Sub(a1, v1, i1), FilterResolved::Sub(a2, v2, i2)) => {
a1 == a2 && v1 == v2 && i1 == i2
}
(FilterResolved::Pres(a1, i1), FilterResolved::Pres(a2, i2)) => a1 == a2 && i1 == i2,
(FilterResolved::And(vs1), FilterResolved::And(vs2)) => vs1 == vs2,
(FilterResolved::Or(vs1), FilterResolved::Or(vs2)) => vs1 == vs2,
(FilterResolved::AndNot(f1), FilterResolved::AndNot(f2)) => f1 == f2,
(_, _) => false,
}
}
}
*/
/*
* Only needed in tests, in run time order only matters on the inner for
* optimisation.
@ -934,8 +923,26 @@ impl PartialOrd for Filter<FilterValidResolved> {
}
}
// remember, this isn't ordering by alphanumeric, this is ordering of
// optimisation preference!
impl PartialEq for FilterResolved {
fn eq(&self, rhs: &FilterResolved) -> bool {
match (self, rhs) {
(FilterResolved::Eq(a1, v1, _), FilterResolved::Eq(a2, v2, _)) => a1 == a2 && v1 == v2,
(FilterResolved::Sub(a1, v1, _), FilterResolved::Sub(a2, v2, _)) => {
a1 == a2 && v1 == v2
}
(FilterResolved::Pres(a1, _), FilterResolved::Pres(a2, _)) => a1 == a2,
(FilterResolved::LessThan(a1, v1, _), FilterResolved::LessThan(a2, v2, _)) => {
a1 == a2 && v1 == v2
}
(FilterResolved::And(vs1, _), FilterResolved::And(vs2, _)) => vs1 == vs2,
(FilterResolved::Or(vs1, _), FilterResolved::Or(vs2, _)) => vs1 == vs2,
(FilterResolved::Inclusion(vs1, _), FilterResolved::Inclusion(vs2, _)) => vs1 == vs2,
(FilterResolved::AndNot(f1, _), FilterResolved::AndNot(f2, _)) => f1 == f2,
(_, _) => false,
}
}
}
impl PartialOrd for FilterResolved {
fn partial_cmp(&self, rhs: &FilterResolved) -> Option<Ordering> {
Some(self.cmp(rhs))
@ -943,41 +950,50 @@ impl PartialOrd for FilterResolved {
}
impl Ord for FilterResolved {
/// Ordering of filters for optimisation and subsequent dead term elimination.
///
fn cmp(&self, rhs: &FilterResolved) -> Ordering {
match (self, rhs) {
(FilterResolved::Eq(a1, v1, true), FilterResolved::Eq(a2, v2, true)) => {
match a1.cmp(a2) {
Ordering::Equal => v1.cmp(v2),
o => o,
let left_slopey = self.get_slopeyness_factor();
let right_slopey = rhs.get_slopeyness_factor();
let r = match (left_slopey, right_slopey) {
(Some(sfl), Some(sfr)) => sfl.cmp(&sfr),
(Some(_), None) => Ordering::Less,
(None, Some(_)) => Ordering::Greater,
(None, None) => Ordering::Equal,
};
// If they are equal, compare other elements for determining the order. This is what
// allows dead term elimination to occur.
//
// In almost all cases, we'll miss this step though as slopes will vary distinctly.
//
// We do NOT need to check for indexed vs unindexed here, we already did it in the slope check!
if r == Ordering::Equal {
match (self, rhs) {
(FilterResolved::Eq(a1, v1, _), FilterResolved::Eq(a2, v2, _))
| (FilterResolved::Sub(a1, v1, _), FilterResolved::Sub(a2, v2, _))
| (FilterResolved::LessThan(a1, v1, _), FilterResolved::LessThan(a2, v2, _)) => {
match a1.cmp(a2) {
Ordering::Equal => v1.cmp(v2),
o => o,
}
}
(FilterResolved::Pres(a1, _), FilterResolved::Pres(a2, _)) => a1.cmp(a2),
// Now sort these into the generally "best" order.
(FilterResolved::Eq(_, _, _), _) => Ordering::Less,
(_, FilterResolved::Eq(_, _, _)) => Ordering::Greater,
(FilterResolved::Pres(_, _), _) => Ordering::Less,
(_, FilterResolved::Pres(_, _)) => Ordering::Greater,
(FilterResolved::LessThan(_, _, _), _) => Ordering::Less,
(_, FilterResolved::LessThan(_, _, _)) => Ordering::Greater,
(FilterResolved::Sub(_, _, _), _) => Ordering::Less,
(_, FilterResolved::Sub(_, _, _)) => Ordering::Greater,
// They can't be re-arranged, they don't move!
(_, _) => Ordering::Equal,
}
(FilterResolved::Sub(a1, v1, true), FilterResolved::Sub(a2, v2, true)) => {
match a1.cmp(a2) {
Ordering::Equal => v1.cmp(v2),
o => o,
}
}
(FilterResolved::Pres(a1, true), FilterResolved::Pres(a2, true)) => a1.cmp(a2),
// Always higher prefer indexed Eq over all else, as these will have
// the best indexes and return smallest candidates.
(FilterResolved::Eq(_, _, true), _) => Ordering::Less,
(_, FilterResolved::Eq(_, _, true)) => Ordering::Greater,
(FilterResolved::Pres(_, true), _) => Ordering::Less,
(_, FilterResolved::Pres(_, true)) => Ordering::Greater,
(FilterResolved::Sub(_, _, true), _) => Ordering::Greater,
(_, FilterResolved::Sub(_, _, true)) => Ordering::Less,
// Now prefer the unindexed types by performance order.
(FilterResolved::Pres(_, false), FilterResolved::Pres(_, false)) => Ordering::Equal,
(FilterResolved::Pres(_, false), _) => Ordering::Less,
(_, FilterResolved::Pres(_, false)) => Ordering::Greater,
(FilterResolved::Eq(_, _, false), FilterResolved::Eq(_, _, false)) => Ordering::Equal,
(FilterResolved::Eq(_, _, false), _) => Ordering::Less,
(_, FilterResolved::Eq(_, _, false)) => Ordering::Greater,
(FilterResolved::Sub(_, _, false), FilterResolved::Sub(_, _, false)) => Ordering::Equal,
(FilterResolved::Sub(_, _, false), _) => Ordering::Greater,
(_, FilterResolved::Sub(_, _, false)) => Ordering::Less,
// They can't be compared, they don't move!
(_, _) => Ordering::Equal,
} else {
r
}
}
}
@ -988,38 +1004,44 @@ impl FilterResolved {
match fc {
FilterComp::Eq(a, v) => {
let idx = idxmeta.contains(&(&a, &IndexType::Equality));
let idx = NonZeroU8::new(idx as u8);
FilterResolved::Eq(a, v, idx)
}
FilterComp::SelfUuid => panic!("Not possible to resolve SelfUuid in from_invalid!"),
FilterComp::Sub(a, v) => {
// let idx = idxmeta.contains(&(&a, &IndexType::SubString));
// TODO: For now, don't emit substring indexes.
let idx = false;
FilterResolved::Sub(a, v, idx)
// let idx = idxmeta.contains(&(&a, &IndexType::SubString));
// let idx = NonZeroU8::new(idx as u8);
FilterResolved::Sub(a, v, None)
}
FilterComp::Pres(a) => {
let idx = idxmeta.contains(&(&a, &IndexType::Presence));
let idx = NonZeroU8::new(idx as u8);
FilterResolved::Pres(a, idx)
}
FilterComp::LessThan(a, v) => {
// let idx = idxmeta.contains(&(&a, &IndexType::ORDERING));
// TODO: For now, don't emit ordering indexes.
let idx = false;
let idx = None;
FilterResolved::LessThan(a, v, idx)
}
FilterComp::Or(vs) => FilterResolved::Or(
vs.into_iter()
.map(|v| FilterResolved::from_invalid(v, idxmeta))
.collect(),
None,
),
FilterComp::And(vs) => FilterResolved::And(
vs.into_iter()
.map(|v| FilterResolved::from_invalid(v, idxmeta))
.collect(),
None,
),
FilterComp::Inclusion(vs) => FilterResolved::Inclusion(
vs.into_iter()
.map(|v| FilterResolved::from_invalid(v, idxmeta))
.collect(),
None,
),
FilterComp::AndNot(f) => {
// TODO: pattern match box here. (AndNot(box f)).
@ -1027,57 +1049,80 @@ impl FilterResolved {
// not today remove the box, and we need f in our ownership. Since
// AndNot currently is a rare request, cloning is not the worst thing
// here ...
FilterResolved::AndNot(Box::new(FilterResolved::from_invalid(
(*f).clone(),
idxmeta,
)))
FilterResolved::AndNot(
Box::new(FilterResolved::from_invalid((*f).clone(), idxmeta)),
None,
)
}
FilterComp::SelfUuid => panic!("Not possible to resolve SelfUuid in from_invalid!"),
}
}
fn resolve_idx(fc: FilterComp, ev: &Identity, idxmeta: &HashSet<IdxKey>) -> Option<Self> {
fn resolve_idx(
fc: FilterComp,
ev: &Identity,
idxmeta: &HashMap<IdxKey, IdxSlope>,
) -> Option<Self> {
match fc {
FilterComp::Eq(a, v) => {
let idxkref = IdxKeyRef::new(&a, &IndexType::Equality);
let idx = idxmeta.contains(&idxkref as &dyn IdxKeyToRef);
let idx = idxmeta
.get(&idxkref as &dyn IdxKeyToRef)
.copied()
.and_then(NonZeroU8::new);
Some(FilterResolved::Eq(a, v, idx))
}
FilterComp::SelfUuid => ev.get_uuid().map(|uuid| {
let uuid_s = AttrString::from("uuid");
let idxkref = IdxKeyRef::new(&uuid_s, &IndexType::Equality);
let idx = idxmeta
.get(&idxkref as &dyn IdxKeyToRef)
.copied()
.and_then(NonZeroU8::new);
FilterResolved::Eq(uuid_s, PartialValue::new_uuid(uuid), idx)
}),
FilterComp::Sub(a, v) => {
let idxkref = IdxKeyRef::new(&a, &IndexType::SubString);
let idx = idxmeta.contains(&idxkref as &dyn IdxKeyToRef);
let idx = idxmeta
.get(&idxkref as &dyn IdxKeyToRef)
.copied()
.and_then(NonZeroU8::new);
Some(FilterResolved::Sub(a, v, idx))
}
FilterComp::Pres(a) => {
let idxkref = IdxKeyRef::new(&a, &IndexType::Presence);
let idx = idxmeta.contains(&idxkref as &dyn IdxKeyToRef);
let idx = idxmeta
.get(&idxkref as &dyn IdxKeyToRef)
.copied()
.and_then(NonZeroU8::new);
Some(FilterResolved::Pres(a, idx))
}
FilterComp::LessThan(a, v) => {
// let idx = idxmeta.contains(&(&a, &IndexType::SubString));
let idx = false;
Some(FilterResolved::LessThan(a, v, idx))
Some(FilterResolved::LessThan(a, v, None))
}
// We set the compound filters slope factor to "None" here, because when we do
// optimise we'll actually fill in the correct slope factors after we sort those
// inner terms in a more optimial way.
FilterComp::Or(vs) => {
let fi: Option<Vec<_>> = vs
.into_iter()
.map(|f| FilterResolved::resolve_idx(f, ev, idxmeta))
.collect();
fi.map(FilterResolved::Or)
fi.map(|fi| FilterResolved::Or(fi, None))
}
FilterComp::And(vs) => {
let fi: Option<Vec<_>> = vs
.into_iter()
.map(|f| FilterResolved::resolve_idx(f, ev, idxmeta))
.collect();
fi.map(FilterResolved::And)
fi.map(|fi| FilterResolved::And(fi, None))
}
FilterComp::Inclusion(vs) => {
let fi: Option<Vec<_>> = vs
.into_iter()
.map(|f| FilterResolved::resolve_idx(f, ev, idxmeta))
.collect();
fi.map(FilterResolved::Inclusion)
fi.map(|fi| FilterResolved::Inclusion(fi, None))
}
FilterComp::AndNot(f) => {
// TODO: pattern match box here. (AndNot(box f)).
@ -1086,43 +1131,54 @@ impl FilterResolved {
// AndNot currently is a rare request, cloning is not the worst thing
// here ...
FilterResolved::resolve_idx((*f).clone(), ev, idxmeta)
.map(|fi| FilterResolved::AndNot(Box::new(fi)))
.map(|fi| FilterResolved::AndNot(Box::new(fi), None))
}
FilterComp::SelfUuid => ev.get_uuid().map(|uuid| {
let uuid_s = AttrString::from("uuid");
let idxkref = IdxKeyRef::new(&uuid_s, &IndexType::Equality);
let idx = idxmeta.contains(&idxkref as &dyn IdxKeyToRef);
FilterResolved::Eq(uuid_s, PartialValue::new_uuid(uuid), idx)
}),
}
}
fn resolve_no_idx(fc: FilterComp, ev: &Identity) -> Option<Self> {
// ⚠️ ⚠️ ⚠️ ⚠️
// Remember, this function means we have NO INDEX METADATA so we can only
// asssign slopes to values we can GUARANTEE will EXIST.
match fc {
FilterComp::Eq(a, v) => Some(FilterResolved::Eq(a, v, false)),
FilterComp::Sub(a, v) => Some(FilterResolved::Sub(a, v, false)),
FilterComp::Pres(a) => Some(FilterResolved::Pres(a, false)),
FilterComp::LessThan(a, v) => Some(FilterResolved::LessThan(a, v, false)),
FilterComp::Eq(a, v) => {
// Since we have no index data, we manually configure a reasonable
// slope and indicate the presence of some expected basic
// indexes.
let idx = matches!(a.as_str(), "name" | "uuid");
let idx = NonZeroU8::new(idx as u8);
Some(FilterResolved::Eq(a, v, idx))
}
FilterComp::SelfUuid => ev.get_uuid().map(|uuid| {
FilterResolved::Eq(
AttrString::from("uuid"),
PartialValue::new_uuid(uuid),
NonZeroU8::new(true as u8),
)
}),
FilterComp::Sub(a, v) => Some(FilterResolved::Sub(a, v, None)),
FilterComp::Pres(a) => Some(FilterResolved::Pres(a, None)),
FilterComp::LessThan(a, v) => Some(FilterResolved::LessThan(a, v, None)),
FilterComp::Or(vs) => {
let fi: Option<Vec<_>> = vs
.into_iter()
.map(|f| FilterResolved::resolve_no_idx(f, ev))
.collect();
fi.map(FilterResolved::Or)
fi.map(|fi| FilterResolved::Or(fi, None))
}
FilterComp::And(vs) => {
let fi: Option<Vec<_>> = vs
.into_iter()
.map(|f| FilterResolved::resolve_no_idx(f, ev))
.collect();
fi.map(FilterResolved::And)
fi.map(|fi| FilterResolved::And(fi, None))
}
FilterComp::Inclusion(vs) => {
let fi: Option<Vec<_>> = vs
.into_iter()
.map(|f| FilterResolved::resolve_no_idx(f, ev))
.collect();
fi.map(FilterResolved::Inclusion)
fi.map(|fi| FilterResolved::Inclusion(fi, None))
}
FilterComp::AndNot(f) => {
// TODO: pattern match box here. (AndNot(box f)).
@ -1131,31 +1187,25 @@ impl FilterResolved {
// AndNot currently is a rare request, cloning is not the worst thing
// here ...
FilterResolved::resolve_no_idx((*f).clone(), ev)
.map(|fi| FilterResolved::AndNot(Box::new(fi)))
.map(|fi| FilterResolved::AndNot(Box::new(fi), None))
}
FilterComp::SelfUuid => ev.get_uuid().map(|uuid| {
FilterResolved::Eq(
AttrString::from("uuid"),
PartialValue::new_uuid(uuid),
false,
)
}),
}
}
// This is an optimise that only attempts to optimise the outer terms.
fn fast_optimise(self) -> Self {
match self {
FilterResolved::Inclusion(mut f_list) => {
FilterResolved::Inclusion(mut f_list, _) => {
f_list.sort_unstable();
f_list.dedup();
FilterResolved::Inclusion(f_list)
let sf = f_list.last().and_then(|f| f.get_slopeyness_factor());
FilterResolved::Inclusion(f_list, sf)
}
FilterResolved::And(mut f_list) => {
FilterResolved::And(mut f_list, _) => {
f_list.sort_unstable();
f_list.dedup();
FilterResolved::And(f_list)
let sf = f_list.first().and_then(|f| f.get_slopeyness_factor());
FilterResolved::And(f_list, sf)
}
v => v,
}
@ -1164,29 +1214,31 @@ impl FilterResolved {
fn optimise(&self) -> Self {
// Most optimisations only matter around or/and terms.
match self {
FilterResolved::Inclusion(f_list) => {
FilterResolved::Inclusion(f_list, _) => {
// first, optimise all our inner elements
let (f_list_inc, mut f_list_new): (Vec<_>, Vec<_>) = f_list
.iter()
.map(|f_ref| f_ref.optimise())
.partition(|f| matches!(f, FilterResolved::Inclusion(_)));
.partition(|f| matches!(f, FilterResolved::Inclusion(_, _)));
f_list_inc.into_iter().for_each(|fc| {
if let FilterResolved::Inclusion(mut l) = fc {
if let FilterResolved::Inclusion(mut l, _) = fc {
f_list_new.append(&mut l)
}
});
// finally, optimise this list by sorting.
f_list_new.sort_unstable();
f_list_new.dedup();
FilterResolved::Inclusion(f_list_new)
// Inclusions are similar to or, so what's our worst case?
let sf = f_list_new.last().and_then(|f| f.get_slopeyness_factor());
FilterResolved::Inclusion(f_list_new, sf)
}
FilterResolved::And(f_list) => {
FilterResolved::And(f_list, _) => {
// first, optimise all our inner elements
let (f_list_and, mut f_list_new): (Vec<_>, Vec<_>) = f_list
.iter()
.map(|f_ref| f_ref.optimise())
.partition(|f| matches!(f, FilterResolved::And(_)));
.partition(|f| matches!(f, FilterResolved::And(_, _)));
// now, iterate over this list - for each "and" term, fold
// it's elements to this level.
@ -1201,12 +1253,12 @@ impl FilterResolved {
// shortcutting
f_list_and.into_iter().for_each(|fc| {
if let FilterResolved::And(mut l) = fc {
if let FilterResolved::And(mut l, _) = fc {
f_list_new.append(&mut l)
}
});
// If the f_list_or only has one element, pop it and return.
// If the f_list_and only has one element, pop it and return.
if f_list_new.len() == 1 {
#[allow(clippy::expect_used)]
f_list_new.pop().expect("corrupt?")
@ -1214,21 +1266,25 @@ impl FilterResolved {
// finally, optimise this list by sorting.
f_list_new.sort_unstable();
f_list_new.dedup();
// Which ever element as the head is first must be the min SF
// so we use this in our And to represent us.
let sf = f_list_new.first().and_then(|f| f.get_slopeyness_factor());
//
// return!
FilterResolved::And(f_list_new)
FilterResolved::And(f_list_new, sf)
}
}
FilterResolved::Or(f_list) => {
FilterResolved::Or(f_list, _) => {
let (f_list_or, mut f_list_new): (Vec<_>, Vec<_>) = f_list
.iter()
// Optimise all inner items.
.map(|f_ref| f_ref.optimise())
// Split out inner-or terms to fold into this term.
.partition(|f| matches!(f, FilterResolved::Or(_)));
.partition(|f| matches!(f, FilterResolved::Or(_, _)));
// Append the inner terms.
f_list_or.into_iter().for_each(|fc| {
if let FilterResolved::Or(mut l) = fc {
if let FilterResolved::Or(mut l, _) = fc {
f_list_new.append(&mut l)
}
});
@ -1247,8 +1303,9 @@ impl FilterResolved {
#[allow(clippy::unnecessary_sort_by)]
f_list_new.sort_unstable_by(|a, b| b.cmp(a));
f_list_new.dedup();
FilterResolved::Or(f_list_new)
// The *last* element is the most here, and our worst case factor.
let sf = f_list_new.last().and_then(|f| f.get_slopeyness_factor());
FilterResolved::Or(f_list_new, sf)
}
}
f => f.clone(),
@ -1256,7 +1313,21 @@ impl FilterResolved {
}
pub fn is_andnot(&self) -> bool {
matches!(self, FilterResolved::AndNot(_))
matches!(self, FilterResolved::AndNot(_, _))
}
#[inline(always)]
fn get_slopeyness_factor(&self) -> Option<NonZeroU8> {
match self {
FilterResolved::Eq(_, _, sf)
| FilterResolved::Sub(_, _, sf)
| FilterResolved::Pres(_, sf)
| FilterResolved::LessThan(_, _, sf)
| FilterResolved::Or(_, sf)
| FilterResolved::And(_, sf)
| FilterResolved::Inclusion(_, sf)
| FilterResolved::AndNot(_, sf) => *sf,
}
}
}
@ -1452,8 +1523,8 @@ mod tests {
// antisymmetry: if a < b then !(a > b), as well as a > b implying !(a < b); and
// These are unindexed so we have to check them this way.
let f_t3b = unsafe { filter_resolved!(f_eq("userid", PartialValue::new_iutf8(""))) };
assert_eq!(f_t1a.partial_cmp(&f_t3b), Some(Ordering::Less));
assert_eq!(f_t3b.partial_cmp(&f_t1a), Some(Ordering::Greater));
assert_eq!(f_t1a.partial_cmp(&f_t3b), Some(Ordering::Greater));
assert_eq!(f_t3b.partial_cmp(&f_t1a), Some(Ordering::Less));
// transitivity: a < b and b < c implies a < c. The same must hold for both == and >.
let f_t4b = unsafe { filter_resolved!(f_sub("userid", PartialValue::new_iutf8(""))) };

View file

@ -512,7 +512,7 @@ impl<'a> SchemaWriteTransaction<'a> {
r
}
pub(crate) fn reload_idxmeta(&self) -> HashSet<IdxKey> {
pub(crate) fn reload_idxmeta(&self) -> Vec<IdxKey> {
self.get_attributes()
.values()
.flat_map(|a| {

View file

@ -1014,6 +1014,12 @@ impl QueryServer {
.upgrade_reindex(audit, SYSTEM_INDEX_VERSION + 1)
.and_then(|_| reindex_write_2.commit(audit))?;
// Force the schema to reload - this is so that any changes to index slope
// analysis are now reflected correctly.
let slope_reload = task::block_on(self.write_async(ts));
slope_reload.force_schema_reload();
slope_reload.commit(audit)?;
// Now, based on the system version apply migrations. You may ask "should you not
// be doing migrations before indexes?". And this is a very good question! The issue
// is within a migration we must be able to search for content by pres index, and those
@ -2457,8 +2463,12 @@ impl<'a> QueryServerWriteTransaction<'a> {
if valid_r.is_empty() {
// Now use this to reload the backend idxmeta
ltrace!(audit, "Reloading idxmeta ...");
self.be_txn.update_idxmeta(self.schema.reload_idxmeta());
Ok(())
self.be_txn
.update_idxmeta(audit, self.schema.reload_idxmeta())
.map_err(|e| {
ladmin_error!(audit, "reload schema update idxmeta {:?}", e);
e
})
} else {
// Log the failures?
ladmin_error!(audit, "Schema reload failed -> {:?}", valid_r);
@ -2625,6 +2635,10 @@ impl<'a> QueryServerWriteTransaction<'a> {
self.be_txn.reindex(audit)
}
fn force_schema_reload(&self) {
self.changed_schema.set(true);
}
pub(crate) fn upgrade_reindex(
&self,
audit: &mut AuditScope,