Moving daemon tracing to OpenTelemetry ()

* sally forth into the great otel unknown
* make the build env identification slightly more durable
* docs updates
* wasm recompile
James Hodgkinson 2023-11-09 15:15:12 +10:00 committed by GitHub
parent 3bd2cc8a9f
commit 60e5935faa
53 changed files with 1714 additions and 822 deletions

Cargo.lock generated

@ -1130,6 +1130,8 @@ dependencies = [
"kanidm_proto",
"kanidm_utils_users",
"kanidmd_core",
"opentelemetry",
"opentelemetry_api",
"reqwest",
"sd-notify",
"serde",
@ -1138,6 +1140,7 @@ dependencies = [
"tokio",
"tokio-util",
"toml",
"tracing",
"whoami",
]
@ -1801,6 +1804,16 @@ dependencies = [
"version_check",
]
[[package]]
name = "gethostname"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0176e0459c2e4a1fe232f984bca6890e681076abb9934f6cea7c326f3fc47818"
dependencies = [
"libc",
"windows-targets 0.48.5",
]
[[package]]
name = "getrandom"
version = "0.2.11"
@ -2672,6 +2685,18 @@ dependencies = [
"want",
]
[[package]]
name = "hyper-timeout"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1"
dependencies = [
"hyper",
"pin-project-lite",
"tokio",
"tokio-io-timeout",
]
[[package]]
name = "hyper-tls"
version = "0.5.0"
@ -3340,7 +3365,6 @@ dependencies = [
"serde-wasm-bindgen 0.5.0",
"serde_json",
"time",
"url",
"uuid",
"wasm-bindgen",
"wasm-bindgen-futures",
@ -3363,7 +3387,6 @@ dependencies = [
"serde-wasm-bindgen 0.5.0",
"serde_json",
"time",
"url",
"uuid",
"wasm-bindgen",
"wasm-bindgen-futures",
@ -3410,7 +3433,6 @@ dependencies = [
"serde-wasm-bindgen 0.5.0",
"serde_json",
"time",
"url",
"uuid",
"wasm-bindgen",
"wasm-bindgen-futures",
@ -4109,6 +4131,109 @@ dependencies = [
"vcpkg",
]
[[package]]
name = "opentelemetry"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9591d937bc0e6d2feb6f71a559540ab300ea49955229c347a517a28d27784c54"
dependencies = [
"opentelemetry_api",
"opentelemetry_sdk",
]
[[package]]
name = "opentelemetry-http"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c7594ec0e11d8e33faf03530a4c49af7064ebba81c1480e01be67d90b356508b"
dependencies = [
"async-trait",
"bytes",
"http",
"opentelemetry_api",
]
[[package]]
name = "opentelemetry-otlp"
version = "0.13.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7e5e5a5c4135864099f3faafbe939eb4d7f9b80ebf68a8448da961b32a7c1275"
dependencies = [
"async-trait",
"futures-core",
"http",
"opentelemetry-http",
"opentelemetry-proto",
"opentelemetry-semantic-conventions",
"opentelemetry_api",
"opentelemetry_sdk",
"prost",
"serde",
"thiserror",
"tokio",
"tonic",
]
[[package]]
name = "opentelemetry-proto"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1e3f814aa9f8c905d0ee4bde026afd3b2577a97c10e1699912e3e44f0c4cbeb"
dependencies = [
"opentelemetry_api",
"opentelemetry_sdk",
"prost",
"tonic",
]
[[package]]
name = "opentelemetry-semantic-conventions"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73c9f9340ad135068800e7f1b24e9e09ed9e7143f5bf8518ded3d3ec69789269"
dependencies = [
"opentelemetry",
]
[[package]]
name = "opentelemetry_api"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a81f725323db1b1206ca3da8bb19874bbd3f57c3bcd59471bfb04525b265b9b"
dependencies = [
"futures-channel",
"futures-util",
"indexmap 1.9.3",
"js-sys",
"once_cell",
"pin-project-lite",
"thiserror",
"urlencoding",
]
[[package]]
name = "opentelemetry_sdk"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa8e705a0612d48139799fcbaba0d4a90f06277153e43dd2bdc16c6f0edd8026"
dependencies = [
"async-trait",
"crossbeam-channel",
"futures-channel",
"futures-executor",
"futures-util",
"once_cell",
"opentelemetry_api",
"ordered-float",
"percent-encoding",
"rand",
"regex",
"serde_json",
"thiserror",
"tokio",
"tokio-stream",
]
[[package]]
name = "orca"
version = "1.1.0-rc.15-dev"
@ -4138,6 +4263,15 @@ dependencies = [
"uuid",
]
[[package]]
name = "ordered-float"
version = "3.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1e1c390732d15f1d48471625cd92d154e66db2c56645e29a9cd26f4699f72dc"
dependencies = [
"num-traits",
]
[[package]]
name = "overload"
version = "0.1.1"
@ -4554,6 +4688,29 @@ dependencies = [
"wasm-bindgen-futures",
]
[[package]]
name = "prost"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd"
dependencies = [
"bytes",
"prost-derive",
]
[[package]]
name = "prost-derive"
version = "0.11.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4"
dependencies = [
"anyhow",
"itertools 0.10.5",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "psl-types"
version = "2.0.11"
@ -5289,10 +5446,18 @@ dependencies = [
name = "sketching"
version = "1.1.0-rc.15-dev"
dependencies = [
"gethostname",
"num_enum",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry_sdk",
"rand",
"serde",
"tracing",
"tracing-forest",
"tracing-opentelemetry",
"tracing-subscriber",
"uuid",
]
[[package]]
@ -5642,6 +5807,16 @@ dependencies = [
"windows-sys 0.48.0",
]
[[package]]
name = "tokio-io-timeout"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf"
dependencies = [
"pin-project-lite",
"tokio",
]
[[package]]
name = "tokio-macros"
version = "2.1.0"
@ -5727,6 +5902,34 @@ dependencies = [
"winnow",
]
[[package]]
name = "tonic"
version = "0.9.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a"
dependencies = [
"async-trait",
"axum",
"base64 0.21.5",
"bytes",
"futures-core",
"futures-util",
"h2",
"http",
"http-body",
"hyper",
"hyper-timeout",
"percent-encoding",
"pin-project",
"prost",
"tokio",
"tokio-stream",
"tower",
"tower-layer",
"tower-service",
"tracing",
]
[[package]]
name = "tower"
version = "0.4.13"
@ -5735,10 +5938,14 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
dependencies = [
"futures-core",
"futures-util",
"indexmap 1.9.3",
"pin-project",
"pin-project-lite",
"rand",
"slab",
"tokio",
"tokio-stream",
"tokio-util",
"tower-layer",
"tower-service",
"tracing",
@ -5841,6 +6048,22 @@ dependencies = [
"tracing-core",
]
[[package]]
name = "tracing-opentelemetry"
version = "0.21.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75327c6b667828ddc28f5e3f169036cb793c3f588d83bf0f262a7f062ffed3c8"
dependencies = [
"once_cell",
"opentelemetry",
"opentelemetry_sdk",
"smallvec",
"tracing",
"tracing-core",
"tracing-log",
"tracing-subscriber",
]
[[package]]
name = "tracing-serde"
version = "0.1.3"


@ -162,6 +162,25 @@ num_enum = "^0.5.11"
oauth2_ext = { version = "^4.1.0", package = "oauth2", default-features = false }
openssl-sys = "^0.9"
openssl = "^0.10.59"
opentelemetry = { version = "0.20.0" }
opentelemetry_api = { version = "0.20.0", features = ["logs", "metrics"] }
opentelemetry-otlp = { version = "0.13.0", default-features = false, features = [
"serde",
"logs",
"metrics",
"http-proto",
"grpc-tonic",
] }
opentelemetry_sdk = "0.20.0"
opentelemetry-stdout = { version = "0.1.0", features = [
"logs",
"metrics",
"trace",
] }
tonic = "0.10.2"
tracing-opentelemetry = "0.21.0"
paste = "^1.0.14"
pkg-config = "^0.3.27"
proc-macro2 = "1.0.69"


@ -140,6 +140,7 @@ codespell:
--skip='./book/src/images/*' \
--skip='./docs/*,./.git' \
--skip='*.svg' \
--skip='*.br' \
--skip='./rlm_python/mods-available/eap' \
--skip='./server/web_ui/static/external' \
--skip='./server/web_ui/pkg/external' \


@ -0,0 +1,159 @@
# Logging
Logging is how the server communicates to developers and administrators about the state of the
service, how operations are performing, and what they are doing. It's important that this
communication is clear.
## Use Cases
### Developer Bug Reports
A developer should be able to see the internal state of the server, and why any decision or logic
path was taken, so that errors or logic can be analysed post-incident. The information in the log
and the source code should be enough to resolve any issues, as we may not have LLDB access to any
consumer's site, or any effective reproducer.
### Security Audits
We must be able to see why any security decision was made, such as credential validation, access
control application, or group/claim issuing to a session. This should be connected to the IP and
other identifiers of the caller.
### Error Analysis
An administrator must be able to determine in detail why an operation is failing, so they can
advise on how a consumer or user could change their behaviour to improve the situation (beyond the
error messages we return).
### Performance
Administrators and developers should be able to analyse fine-grained information about the
performance of any operation and make informed decisions about tuning (such as caches or threads).
Developers should also be able to identify code paths that are under pressure and could be targets
for improvement.
### Containers/Systemd
Logs should be emitted on stdout/stderr as this is the easiest interface for existing log
aggregation systems to collect data from.
## Details
As developers, we should indicate which use case each message is relevant to as part of the
message. Log levels are used in other services, but they allow messages to be missed. Instead we
always log every "service", but filter the messages to different locations.
This leads to the following log categories:
- Analysis
- Display of all logic branches and why decision points or paths were taken
- A unique event ID that associates related log messages
- Performance
- Cache and DB metrics available
- Performance frames of timing of key points
- Structure of the performance frames to understand the execution paths taken.
- Display of query optimisation
- Display of query planning and application
- Failure (server failure)
- Hard Errors
- Warning (admin should take action)
- Possible misconfiguration
- OperationError (user mistake, op mistake etc)
- All error reports and finalised result summaries logged
- The unique event ID is provided in any operation success or failure.
- Security (aka audit)
- Filtering of security sensitive attributes (via debug/display features)
- Display of sufficient information to establish a security picture of connected actions via the
user's uuid/session id.
- Tracking of who-changed-what-when-why
- Replication
- Both replication consumers and providers log when they make runs.
- Errors in replication should surface as such.
It can be seen pretty quickly that multiple message types are useful across categories. For example,
the unique event id for all messages, how hard errors affect operation errors or how an operation
error can come from a security denial.
Logging must also remain a separate thread and async for performance.
This means that the best way to declare these logs is a unified log which can be filtered based on
the admin's or consumer's needs.
## API
For all types, it's important that we can associate all related events correctly. When the operation
initiates we assign an event-id that is part of the audit trail.
### Statistics
Stats should be accumulated in a statistics variable so that we can identify possible tuning
opportunities and other related events. Useful stats would be:
- Cache Hits
- Cache Misses
- Cache Inclusions
- Number of Searches
- Number of Entries Modified
This would be then logged as a structured line such as:
```json
{ "entry_cache_miss": 8, "idl_cache_miss": 8, "entry_cache_hit": 16', .... }
```
This would also then be fed back to the global stats thread for averaging.
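As a sketch of what such an accumulator could look like (assuming `serde`/`serde_json` and reusing the field names from the example line above; none of this is the server's actual implementation):
```rust
use serde::Serialize;

/// Hypothetical per-operation statistics accumulator.
#[derive(Debug, Default, Serialize)]
struct OpStats {
    entry_cache_hit: u64,
    entry_cache_miss: u64,
    idl_cache_miss: u64,
    searches: u64,
    entries_modified: u64,
}

impl OpStats {
    /// Emit the counters as a single structured log line; they could then be
    /// handed to the global stats thread for averaging.
    fn flush(&self) {
        if let Ok(line) = serde_json::to_string(self) {
            tracing::info!(stats = %line, "operation statistics");
        }
    }
}
```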
### System Performance
The key metric for performance is time-in-function so it would be good to be able to build a value
like:
```json
{
"name": "do_search",
"time_ns": 130,
"pct": 100,
"called": [
{
"name": "filter2idl",
"time_ns": 23',
"called": [],
},
{
...
}
]
}
```
This would allow a rich view of how much time went to any function at a high level, as then further
investigation can occur.
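With the tracing-based pipeline this nesting falls out of instrumented spans: each `#[instrument]`-annotated function becomes a span, and the subscriber (tracing-forest locally, OTLP when exporting) records its duration and parent/child structure, which gives exactly this kind of frame. A rough illustration, using the function names from the example above as stand-ins:
```rust
use tracing::instrument;

#[instrument(level = "info", skip_all)]
fn do_search(query: &str) -> Vec<String> {
    // filter2idl's span nests under do_search, so time-in-function can be
    // attributed to each frame by the subscriber.
    let _idl = filter2idl(query);
    Vec::new()
}

#[instrument(level = "debug", skip_all)]
fn filter2idl(_query: &str) -> Vec<u64> {
    Vec::new()
}
```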
### SQL Query Analysis
To analyse a query we need:
- The original query
- The optimised version, with index tagging/profiling choices.
- The IDLs that were loaded and how the query was applied
- The IDL of the final result set.
### Security Events
- What access controls were considered?
- Who authenticated and where from?
- Audit of who modified what when why.
### Internal Analysis
This is generally what is "debug" logging: decision points and verbose descriptions of where we
went and why.
### Admin Notification
This is warnings or errors that the admin should be aware of.
### User Events
This must associate what happened for a user.


@ -1,170 +0,0 @@
Logging Design (Refactor)
-------------------------
Logging is how the server communicates to developers and administrators about the state
of the service, and how operations are performing and what they are doing. It's important
this is clear in how it communicates. Today (2020-05-12) the log has been written with
development in mind, and has a structure that has as a result, become hard to parse and
understand. This has motivated a rewrite of logging to improve how the servers state
and errors are communicated to users.
Use Cases
---------
* Developer Bug Reports
A developer should be able to see the internal state of the server, and why any decision
or logic path was taken, so that errors or logic can be analysed post-incident. The
information in the log and the source code should be enough to resolve any issues, as we
may not have LLDB access to any consumers site, or any effective reproducer.
* Security Audits
We must be able to see why any security decision was made, such as credential validation,
access control application, or group/claim issuing to a session. This should be connected
to the IP and other identifiers of the caller.
* Error Analysis
For an administrator, they must be able to determine why an operation is failing in detail
so they can advise on how a consumer or user could change their behaviour to improve the
situation (beyond the error messages we return).
* Performance
Administrators and Developers should be able to analyse fine grained information about the
performance of any operation, and make informed decisions about tuning (such as caches or
or threads), and developers should be able to identify code paths that are under pressure
and could be targets for improvement.
* Containers/Systemd
Logs should be emitted on stdout/stderr as this is the easiest interface for existing
log aggregation systems to collect data from.
Details
-------
As developers we should indicate what messages are relevant to what use case as part of the
message. Log levels are used in other services, but that allows messages to be missed. Instead
we log every "service" always, but filter them to different locations.
This leads to the following log categories:
* Analysis
* Display of all logic branches and why decision points or paths taken
* A unique event ID that associates related log messages
* Performance
* Cache and DB metrics available
* Performance frames of timing of key points
* Structure of the performance frames to understand the execution paths taken.
* Display of query optimisation
* Display of query planning and application
* Failure (server failure)
* Hard Errors
* Warning (admin should take action)
* Possible misconfiguration
* OperationError (user mistake, op mistake etc)
* All error reports and finalised result summaries logged
* The unique event ID is provided in any operation success or failure.
* Security (aka audit)
* Filtering of security sensitive attributes (via debug/display features)
* Display of sufficient information to establish a security picture of connected actions via the user's uuid/session id.
* Tracking of who-changed-what-when-why
* Replication
* TODO
It can be seen pretty quickly that multiple message types are useful across categories. For
example, the unique event id for all messages, how hard errors affect operation errors
or how an operation error can come from a security denial.
Logging must also remain a separate thread and async for performance.
This means that the best way to declare these logs is a unified log which can be filtered based
on the admins or consumers needs.
API
---
For all types, it's important that we can associate all related events correctly. When the
operation initiates we assign an event-id that is part of the audit trail.
Statistics
==========
Stats should be accumulated in a statistics variable so that we can determine possible
tuning and other events related. Useful stats would be:
* Cache Hits
* Cache Misses
* Cache Inclusions
* Number of Searches
* Number of Entries Modified
This would be then logged as a structured line such as:
{ 'entry_cache_miss': 8, 'idl_cache_miss': 8, 'entry_cache_hit': 16', .... }
This would also then be fed back to the global stats thread for averaging.
Performance
===========
The key metric for performance is time-in-function so it would be good to be able to
build a value like:
{
'name': 'do_search',
'time': x,
'pct': 100,
called: [
{
'name': 'filter2idl',
'time': x',
called: [],
},
{
...
}
]
}
This would allow a rich view of how much time went to any function at a high level, as then
further investigation can occur.
Query Analysis
==============
To analyse a query we need:
* The original query
* The optimised version, with index tagging/profiling choices.
* The idl's that were loaded and how the query was applied
* The idl of the final result set.
Security Events
===============
* What access controls were considered?
* Who authenticated and where from?
* Audit of who modified what when why.
Analysis
========
This is generally what is "debug" logging, which is just decision points and verbose
descriptions of what we went where.
Admin Notification
==================
This is warnings or errors that the admin should be aware of.
User Events
===========
This must associate what happened for a user


@ -3,7 +3,7 @@
The monitoring design of Kanidm is still very much in its infancy -
[take part in the discussion at github.com/kanidm/kanidm/issues/216](https://github.com/kanidm/kanidm/issues/216).
## kanidmd
## kanidmd status endpoint
kanidmd currently responds to HTTP GET requests at the `/status` endpoint with a JSON object of
either "true" or "false". `true` indicates that the platform is responding to requests.
@ -15,3 +15,27 @@ either "true" or "false". `true` indicates that the platform is responding to re
| Additional Headers | x-kanidm-opid |
| Content Type | application/json |
| Cookies | kanidm-session |
## OpenTelemetry Tracing
Configure OTLP trace exports by setting `otel_grpc_url` in the server configuration, for example
`otel_grpc_url = "http://localhost:4317"`. This enables
[OpenTelemetry traces](https://opentelemetry.io) to be sent for observability use cases.
### Troubleshooting
#### Max Span Size Exceeded
On startup, we run some big processes that might hit a "max trace size" in certain configurations.
Grafana Tempo defaults to 5MB, which is sensible for most things, but ... 😁
Here's the Grafana Tempo
[config to allow larger spans](https://grafana.com/docs/tempo/latest/troubleshooting/response-too-large/):
```yaml
distributor:
receivers:
otlp:
protocols:
grpc:
max_recv_msg_size_mib: 20
```


@ -32,7 +32,7 @@ This means:
If you see something like this:
```
```shell
➜ curl -v https://idm.example.com:8443
* Trying 10.0.0.1:8443...
* connect to 10.0.0.1 port 8443 failed: Connection refused
@ -47,7 +47,7 @@ some reason.
If you get errors about certificates, try adding `-k` to skip certificate verification checking and
just test connectivity:
```
```shell
curl -vk https://idm.example.com:8443/status
```
@ -87,3 +87,19 @@ to `1.1`. This can go in the same block as the `proxy_pass` option.
```text
proxy_http_version 1.1
```
## OpenTelemetry errors
If you see something like this:
> `OpenTelemetry trace error occurred. Exporter otlp encountered the following error(s): the grpc server returns error (The system is not in a state required for the operation's execution): , detailed error message: TRACE_TOO_LARGE: max size of trace (5000000) exceeded while adding 86725 bytes to trace a657b63f6ca0415eb70b6734f20f82cf for tenant single-tenant`
Then you'll need to tweak the maximum trace size in your OTLP receiver. In Grafana Tempo you can
add the following keys to your `tempo.yaml`; in this example we're setting it to 20MiB:
```yaml
overrides:
defaults:
global:
max_bytes_per_trace: 20971520 # 20MiB
```


@ -15,6 +15,8 @@ tls_key = "/tmp/kanidm/key.pem"
log_level = "debug"
# log_level = "trace"
otel_grpc_url = "http://localhost:4317"
domain = "localhost"
origin = "https://localhost:8443"
trust_x_forward_for = true


@ -1,14 +1,28 @@
use std::path::PathBuf;
use std::path::{Path, PathBuf};
use std::{env, fs};
use base64::{engine::general_purpose, Engine as _};
// We do this here so it's only actually run and checked once.
/// Work out where the workspace dir is
fn workspace_dir() -> PathBuf {
let output = std::process::Command::new(env!("CARGO"))
.arg("locate-project")
.arg("--workspace")
.arg("--message-format=plain")
.output()
.unwrap()
.stdout;
let cargo_path = Path::new(std::str::from_utf8(&output).unwrap().trim());
cargo_path.parent().unwrap().to_path_buf()
}
// We do this here so it's only actually run and checked once at build time.
fn determine_git_rev() -> Option<String> {
let path = PathBuf::from("../../");
let repo = match gix::open(path) {
let repo = match gix::open(workspace_dir()) {
Ok(repo) => repo,
Err(_) => return None,
Err(_) => {
return None;
}
};
let mut head = repo.head().ok()?;
let commit = head.peel_to_commit_in_place().ok()?;


@ -17,12 +17,26 @@ test = false
doctest = false
[dependencies]
gethostname = "0.4.3"
num_enum = { workspace = true }
opentelemetry = { workspace = true, features = ["metrics", "rt-tokio"] }
opentelemetry-otlp = { workspace = true, default-features = false, features = [
"serde",
"logs",
"metrics",
"http-proto",
"grpc-tonic",
] }
opentelemetry_sdk = { workspace = true }
rand = { workspace = true }
serde = { workspace = true, features = ["derive"] }
tracing = { workspace = true, features = ["attributes"] }
tracing-subscriber = { workspace = true, features = ["env-filter"] }
tracing-forest = { workspace = true, features = [
"uuid",
"smallvec",
"tokio",
"env-filter",
] }
tracing-opentelemetry = { workspace = true }
tracing-subscriber = { workspace = true, features = ["env-filter"] }
uuid = { workspace = true, features = ["v4"] }


@ -1,7 +1,10 @@
#![deny(warnings)]
#![warn(unused_extern_crates)]
#![allow(non_snake_case)]
use std::str::FromStr;
use num_enum::{IntoPrimitive, TryFromPrimitive};
use serde::Deserialize;
use tracing_forest::printer::TestCapturePrinter;
use tracing_forest::tag::NoTag;
use tracing_forest::util::*;
@ -9,6 +12,7 @@ use tracing_forest::Tag;
use tracing_subscriber::prelude::*;
pub mod macros;
pub mod otel;
pub use {tracing, tracing_forest, tracing_subscriber};
@ -96,3 +100,47 @@ impl EventTag {
}
}
}
#[derive(Clone, Copy, Deserialize, Debug, Default)]
pub enum LogLevel {
#[default]
#[serde(rename = "info")]
Info,
#[serde(rename = "debug")]
Debug,
#[serde(rename = "trace")]
Trace,
}
impl FromStr for LogLevel {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"info" => Ok(LogLevel::Info),
"debug" => Ok(LogLevel::Debug),
"trace" => Ok(LogLevel::Trace),
_ => Err("Must be one of info, debug, trace"),
}
}
}
impl ToString for LogLevel {
fn to_string(&self) -> String {
match self {
LogLevel::Info => "info".to_string(),
LogLevel::Debug => "debug".to_string(),
LogLevel::Trace => "trace".to_string(),
}
}
}
impl From<LogLevel> for EnvFilter {
fn from(value: LogLevel) -> Self {
match value {
LogLevel::Info => EnvFilter::new("info"),
LogLevel::Debug => EnvFilter::new("debug"),
LogLevel::Trace => EnvFilter::new("trace"),
}
}
}
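For illustration, a hypothetical caller could parse a configured level string and turn it into an `EnvFilter` via these impls (this snippet is not part of the commit; it assumes the re-exports shown above):
```rust
use std::str::FromStr;

use sketching::tracing_subscriber::EnvFilter;
use sketching::LogLevel;

/// Fall back to the default level (info) when the configured string doesn't parse.
fn filter_from_config(level: &str) -> EnvFilter {
    let level = LogLevel::from_str(level).unwrap_or_default();
    EnvFilter::from(level)
}
```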


@ -119,3 +119,51 @@ macro_rules! filter_trace {
macro_rules! perf_trace {
($($arg:tt)*) => { tagged_event!(TRACE, EventTag::PerfTrace, $($arg)*) }
}
#[macro_export]
macro_rules! event_dynamic_lvl {
( $(target: $target:expr,)? $(parent: $parent:expr,)? $lvl:expr, $($tt:tt)* ) => {
match $lvl {
tracing::Level::ERROR => {
tracing::event!(
$(target: $target,)?
$(parent: $parent,)?
tracing::Level::ERROR,
$($tt)*
);
}
tracing::Level::WARN => {
tracing::event!(
$(target: $target,)?
$(parent: $parent,)?
tracing::Level::WARN,
$($tt)*
);
}
tracing::Level::INFO => {
tracing::event!(
$(target: $target,)?
$(parent: $parent,)?
tracing::Level::INFO,
$($tt)*
);
}
tracing::Level::DEBUG => {
tracing::event!(
$(target: $target,)?
$(parent: $parent,)?
tracing::Level::DEBUG,
$($tt)*
);
}
tracing::Level::TRACE => {
tracing::event!(
$(target: $target,)?
$(parent: $parent,)?
tracing::Level::TRACE,
$($tt)*
);
}
}
};
}
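This macro exists because `tracing::event!` requires the level to be a compile-time constant, so the `match` above is how a caller can pick the level at runtime. A small hypothetical call site, along the lines of the HTTP middleware later in this commit:
```rust
use sketching::{event_dynamic_lvl, tracing};

/// Log a response at a level chosen from the status code at runtime.
fn log_response(status: u16) {
    let level = if status >= 500 {
        tracing::Level::ERROR
    } else if status >= 400 {
        tracing::Level::WARN
    } else {
        tracing::Level::INFO
    };
    // Expands to a tracing::event! at the matching constant level.
    event_dynamic_lvl!(level, status_code = status, "response sent");
}
```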

libs/sketching/src/otel.rs Normal file

@ -0,0 +1,132 @@
use gethostname::gethostname;
use opentelemetry::KeyValue;
use opentelemetry_otlp::{Protocol, WithExportConfig};
use opentelemetry_sdk::trace::{self, Sampler};
use opentelemetry_sdk::Resource;
use std::time::Duration;
use tracing::Subscriber;
use tracing_subscriber::Registry;
use tracing_subscriber::{prelude::*, EnvFilter};
pub const MAX_EVENTS_PER_SPAN: u32 = 64 * 1024;
pub const MAX_ATTRIBUTES_PER_SPAN: u32 = 128;
/// If you set the KANIDM_OTEL_GRPC_ENDPOINT env var you'll start the OpenTelemetry pipeline.
pub fn get_otlp_endpoint() -> Option<String> {
std::env::var("KANIDM_OTEL_GRPC_ENDPOINT").ok()
}
// TODO: this is coming back later
// #[allow(dead_code)]
// pub fn init_metrics() -> metrics::Result<MeterProvider> {
// let export_config = opentelemetry_otlp::ExportConfig {
// endpoint: "http://localhost:4318/v1/metrics".to_string(),
// ..opentelemetry_otlp::ExportConfig::default()
// };
// opentelemetry_otlp::new_pipeline()
// .metrics(opentelemetry_sdk::runtime::Tokio)
// .with_exporter(
// opentelemetry_otlp::new_exporter()
// .http()
// .with_export_config(export_config),
// )
// .build()
// }
/// This does all the startup things for the logging pipeline
pub fn start_logging_pipeline(
otlp_endpoint: Option<String>,
log_filter: crate::LogLevel,
service_name: String,
) -> Result<Box<dyn Subscriber + Send + Sync>, String> {
let forest_filter: EnvFilter = log_filter.into();
// TODO: work out how to do metrics things
// let meter_provider = init_metrics()
// .map_err(|err| eprintln!("failed to start metrics provider: {:?}", err))?;
match otlp_endpoint {
Some(endpoint) => {
// adding these filters because when you close out the process the OTLP comms layer is NOISY
let forest_filter = forest_filter
.add_directive(
"tonic=info"
.parse()
.expect("Failed to set tonic logging to info"),
)
.add_directive("h2=info".parse().expect("Failed to set h2 logging to info"))
.add_directive(
"hyper=info"
.parse()
.expect("Failed to set hyper logging to info"),
);
let forest_layer = tracing_forest::ForestLayer::default().with_filter(forest_filter);
let t_filter: EnvFilter = log_filter.into();
let tracer = opentelemetry_otlp::new_pipeline().tracing().with_exporter(
opentelemetry_otlp::new_exporter()
.tonic()
.with_endpoint(endpoint)
.with_timeout(Duration::from_secs(5))
.with_protocol(Protocol::HttpBinary),
);
// This env var gets set at build time; if we can pull it, add it to the metadata.
let git_rev = match option_env!("KANIDM_KANIDM_PKG_COMMIT_REV") {
Some(rev) => format!("-{}", rev),
None => "".to_string(),
};
let version = format!("{}{}", env!("CARGO_PKG_VERSION"), git_rev);
let hostname = gethostname();
let hostname = hostname.to_string_lossy();
let hostname = hostname.to_lowercase();
let tracer = tracer
.with_trace_config(
trace::config()
// we want *everything!*
.with_sampler(Sampler::AlwaysOn)
.with_max_events_per_span(MAX_EVENTS_PER_SPAN)
.with_max_attributes_per_span(MAX_ATTRIBUTES_PER_SPAN)
.with_resource(Resource::new(vec![
KeyValue::new("service.name", service_name),
KeyValue::new("service.version", version),
KeyValue::new("host.name", hostname),
// TODO: it'd be really nice to be able to set the instance ID here, from the server UUID so we know *which* instance on this host is logging
])),
)
.install_batch(opentelemetry::runtime::Tokio)
.map_err(|err| {
let err = format!("Failed to start OTLP pipeline: {:?}", err);
eprintln!("{}", err);
err
})?;
// Create a tracing layer with the configured tracer;
let telemetry = tracing_opentelemetry::layer()
.with_tracer(tracer)
.with_threads(true)
.with_filter(t_filter);
Ok(Box::new(
Registry::default().with(forest_layer).with(telemetry),
))
}
None => {
let forest_layer = tracing_forest::ForestLayer::default().with_filter(forest_filter);
Ok(Box::new(Registry::default().with(forest_layer)))
}
}
}
/// This helps with cleanly shutting down the tracing/logging providers when done,
/// so we don't lose traces.
pub struct TracingPipelineGuard {}
impl Drop for TracingPipelineGuard {
fn drop(&mut self) {
opentelemetry::global::shutdown_tracer_provider();
opentelemetry::global::shutdown_logger_provider();
println!("Logging pipeline completed shutdown");
}
}
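To show how these pieces are intended to fit together, here is a rough sketch of a daemon wiring the pipeline up at startup. The actual call sites in kanidmd differ; `main`, the tokio runtime, and the service name are assumptions for the example (the OTLP batch exporter needs a running tokio runtime):
```rust
use sketching::otel::{get_otlp_endpoint, start_logging_pipeline, TracingPipelineGuard};
use sketching::LogLevel;

#[tokio::main]
async fn main() -> Result<(), String> {
    // Keep the guard alive for the life of the process so the exporters are
    // flushed and shut down cleanly when main returns.
    let _guard = TracingPipelineGuard {};

    // None here means "no OTLP endpoint configured", so only the local
    // tracing-forest output is installed.
    let endpoint = get_otlp_endpoint();
    let subscriber = start_logging_pipeline(endpoint, LogLevel::Info, "kanidmd".to_string())?;
    tracing::subscriber::set_global_default(subscriber)
        .map_err(|err| format!("Failed to set the global subscriber: {:?}", err))?;

    tracing::info!("logging pipeline started");
    Ok(())
}
```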

scripts/otel/README.md Normal file

@ -0,0 +1,31 @@
# OpenTelemetry for Kanidm
First, start the containers. You can use docker-compose if you know how, or `./startup.sh` is a
shortcut. You'll need docker (or similar) and docker-compose (or something that can handle
`docker-compose.yml`).
Once that's stopped scrolling for a bit, run the Kanidm server, setting the `otel_grpc_url` to
`http://localhost:4317`.
Then access the
[Grafana UI](http://localhost:3000/explore?panes=%7B%22G-2%22:%7B%22datasource%22:%22tempo%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22datasource%22:%7B%22type%22:%22tempo%22,%22uid%22:%22tempo%22%7D,%22queryType%22:%22traceqlSearch%22,%22limit%22:20,%22filters%22:%5B%7B%22id%22:%2219b1a582%22,%22operator%22:%22%3D%22,%22scope%22:%22span%22%7D,%7B%22id%22:%22service-name%22,%22tag%22:%22service.name%22,%22operator%22:%22%3D%22,%22scope%22:%22resource%22,%22value%22:%5B%22kanidmd%22%5D,%22valueType%22:%22string%22%7D%5D%7D%5D,%22range%22:%7B%22from%22:%22now-6h%22,%22to%22:%22now%22%7D%7D%7D&schemaVersion=1&orgId=1)
and start clicking on traces 😁
## Architecture of the docker containers
```mermaid
graph TD;
K[Kanidmd] --"OTLP tcp/4317"--> T
U[User] --tcp/3000--> G[Grafana]
G --tcp/3200-->T["Tempo (Traces)"]
G --tcp/9090-->P["Prometheus (Metrics)"]
T--cache-->TDV["Tempo Docker Volume"]
T--tcp/9000-->M["Minio (S3 Storage)"]
P--tcp/9000-->M
P--cache-->PDV["Prometheus Docker Volume"]
M-->DVM["Minio Docker Volume"]
```


@ -0,0 +1,92 @@
---
# It should be *very* clear that this is an insecure, dev-only configuration. Don't run this in production!
services:
grafana:
image: grafana/grafana:10.1.1
volumes:
- type: bind
source: ./grafana-datasources.yaml
target: /etc/grafana/provisioning/datasources/datasources.yaml
environment:
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
- GF_AUTH_DISABLE_LOGIN_FORM=true
- GF_FEATURE_TOGGLES_ENABLE=traceqlEditor
ports:
- "3000:3000"
tempo:
image: grafana/tempo:latest
command: [ "-config.file=/etc/tempo.yaml" ]
volumes:
- type: bind
source: ./tempo.yaml
target: /etc/tempo.yaml
- type: volume
source: tempo
target: /tmp/tempo
ports:
# - "14268:14268" # jaeger ingest
- "3200:3200" # tempo
- "9095:9095" # tempo grpc
- "4317:4317" # otlp grpc
# - "4318:4318" # otlp http
# - "9411:9411" # zipkin
# loki:
# image: docker.io/grafana/loki:2.9.2
# volumes:
# - type: bind
# source: ./loki-local-config.yaml
# target: /etc/loki/local-config.yaml
# command: |
# -config.file=/etc/loki/local-config.yaml \
# -target=all
# ports:
# - "3100:3100"
# - "3101:3101"
# - "3102:3102"
minio:
image: minio/minio
entrypoint:
- sh
- -euc
- |
mkdir -p /data/loki-data && \
mkdir -p /data/loki-ruler && \
mkdir -p /data/tempo && \
minio server /data
environment:
- MINIO_ROOT_USER=loki
- MINIO_ROOT_PASSWORD=supersecret
- MINIO_PROMETHEUS_AUTH_TYPE=public
- MINIO_UPDATE=off
ports:
- 9000
volumes:
- type: volume
source: minio
target: /data
healthcheck:
test: [ "CMD", "curl", "-f", "http://localhost:9000/minio/health/live" ]
interval: 15s
timeout: 20s
retries: 5
prometheus:
hostname: prometheus
container_name: prometheus
image: prom/prometheus:v2.47.2
restart: always
ports:
- "9090:9090"
volumes:
- type: bind
source: ./prometheus.yml
target: /etc/prometheus/prometheus.yml
- type: volume
source: prometheus
target: /prometheus
volumes:
minio:
tempo:
prometheus:


@ -0,0 +1,32 @@
---
# It should be *very* clear that this is an insecure, dev-only configuration. Don't run this in production!
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
uid: prometheus
access: proxy
orgId: 1
url: http://prometheus:9090
basicAuth: false
isDefault: false
version: 1
editable: false
jsonData:
httpMethod: GET
- name: Tempo
type: tempo
access: proxy
orgId: 1
url: http://tempo:3200
basicAuth: false
isDefault: true
version: 1
editable: false
apiVersion: 1
uid: tempo
jsonData:
httpMethod: GET
serviceMap:
datasourceUid: prometheus


@ -0,0 +1,32 @@
---
# It should be *very* clear that this is an insecure, dev-only configuration. Don't run this in production!
server:
http_listen_port: 3100
schema_config:
configs:
- from: 2021-08-01
store: tsdb
object_store: s3
schema: v12
index:
prefix: index_
period: 24h
common:
path_prefix: /loki
replication_factor: 1
storage:
s3:
endpoint: minio:9000
insecure: true
bucketnames: loki-data
access_key_id: loki
secret_access_key: supersecret
s3forcepathstyle: true
ring:
instance_addr: 0.0.0.0
kvstore:
store: memberlist
ruler:
storage:
s3:
bucketnames: loki-ruler

scripts/otel/multi_curl.sh Executable file

@ -0,0 +1,12 @@
#!/bin/bash
# This allows testing a bunch of endpoints in a really dumb way
COMMAND="curl -ks"
# 404
$COMMAND https://localhost:8443/asdfasfasfsadf > /dev/null 2>&1
# auth fail
$COMMAND --json '{"hello" : "world" }' https://localhost:8443/v1/auth > /dev/null 2>&1
# good
$COMMAND https://localhost:8443/status


@ -0,0 +1,19 @@
global:
scrape_interval: 30s # Scrape targets every 30 seconds (the Prometheus default is 15s).
# Attach these labels to any time series or alerts when communicating with
# external systems (federation, remote storage, Alertmanager).
# external_labels:
# monitor: "codelab-monitor"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: "prometheus"
# Override the global default and scrape targets from this job every 5 seconds.
# scrape_interval: 30s
static_configs:
- targets: ["localhost:9090"]

scripts/otel/startup.sh Executable file

@ -0,0 +1,8 @@
#!/bin/bash
echo "Tearing down"
docker-compose down -t0
echo "Building up"
docker-compose up -d
echo "LOG TIME!"
docker-compose logs -f

scripts/otel/tempo.yaml Normal file

@ -0,0 +1,69 @@
---
# It should be *very* clear that this is an insecure, dev-only configuration. Don't run this in production!
# config docs https://grafana.com/docs/tempo/latest/configuration/#compactor
server:
http_listen_port: 3200
grpc_server_max_recv_msg_size: 20971520 # 20MiB
grpc_server_max_send_msg_size: 20971520 # 20MiB
query_frontend:
search:
duration_slo: 5s
throughput_bytes_slo: 1.073741824e+09
trace_by_id:
duration_slo: 5s
distributor:
receivers:
otlp:
protocols:
# http:
grpc:
max_recv_msg_size_mib: 20
opencensus:
# ingester:
# max_block_duration: 5m # cut the headblock when this much time passes. this is being set for demo purposes and should probably be left alone normally
compactor:
compaction:
# Optional. Duration to keep blocks. Default is 14 days (336h).
block_retention: 24h
metrics_generator:
registry:
external_labels:
source: tempo
cluster: docker-compose
storage:
# path: /tmp/tempo/generator/wal
remote_write:
- url: http://prometheus:9090/api/v1/write
send_exemplars: true
storage:
trace:
backend: s3 # we're using minio anyway!
s3:
bucket: tempo
endpoint: minio:9000
region: minio
insecure: true
access_key: loki
secret_key: supersecret
# backend: local
# wal:
# path: /tmp/tempo/wal # where to store the wal locally
# local:
# path: /tmp/tempo/blocks
overrides:
defaults:
metrics_generator:
processors:
- service-graphs
- span-metrics # enables metrics generator
global:
max_bytes_per_trace: 20971520 # 20MiB


@ -1568,7 +1568,7 @@ impl QueryServerWriteV1 {
let res = idms_prox_write
.qs_write
.purge_tombstones()
.and_then(|_| idms_prox_write.commit());
.and_then(|_changed| idms_prox_write.commit());
match res {
Ok(()) => {
@ -1592,7 +1592,14 @@ impl QueryServerWriteV1 {
let res = idms_prox_write
.qs_write
.purge_recycled()
.and_then(|_| idms_prox_write.commit());
.and_then(|touched| {
// don't need to commit a txn with no changes
if touched > 0 {
idms_prox_write.commit()
} else {
Ok(())
}
});
match res {
Ok(()) => {


@ -20,7 +20,7 @@ use kanidm_lib_crypto::prelude::X509;
use kanidm_lib_crypto::serialise::x509b64;
use serde::Deserialize;
use sketching::tracing_subscriber::EnvFilter;
use sketching::LogLevel;
use url::Url;
#[derive(Deserialize, Debug, Clone)]
@ -171,6 +171,8 @@ pub struct ServerConfig {
#[serde(rename = "replication")]
/// Replication configuration, this is a development feature and not yet ready for production use.
pub repl_config: Option<ReplicationConfiguration>,
/// An optional OpenTelemetry collector (GRPC) url to send trace and log data to, eg http://localhost:4317
pub otel_grpc_url: Option<String>,
}
impl ServerConfig {
@ -233,50 +235,6 @@ impl FromStr for ServerRole {
}
}
#[derive(Clone, Deserialize, Debug, Default)]
pub enum LogLevel {
#[default]
#[serde(rename = "info")]
Info,
#[serde(rename = "debug")]
Debug,
#[serde(rename = "trace")]
Trace,
}
impl FromStr for LogLevel {
type Err = &'static str;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"info" => Ok(LogLevel::Info),
"debug" => Ok(LogLevel::Debug),
"trace" => Ok(LogLevel::Trace),
_ => Err("Must be one of info, debug, trace"),
}
}
}
impl ToString for LogLevel {
fn to_string(&self) -> String {
match self {
LogLevel::Info => "info".to_string(),
LogLevel::Debug => "debug".to_string(),
LogLevel::Trace => "trace".to_string(),
}
}
}
impl From<LogLevel> for EnvFilter {
fn from(value: LogLevel) -> Self {
match value {
LogLevel::Info => EnvFilter::new("info"),
LogLevel::Debug => EnvFilter::new("debug"),
LogLevel::Trace => EnvFilter::new("trace"),
}
}
}
#[derive(Debug, Clone)]
pub struct IntegrationTestConfig {
pub admin_user: String,
@ -434,7 +392,6 @@ impl Configuration {
}
pub fn update_log_level(&mut self, level: &Option<LogLevel>) {
let level = level.clone();
self.log_level = level.unwrap_or_default();
}


@ -38,7 +38,6 @@ use tokio_openssl::SslStream;
use futures_util::future::poll_fn;
use tokio::net::TcpListener;
use tracing::Level;
use std::io::ErrorKind;
use std::path::PathBuf;
@ -47,7 +46,7 @@ use std::sync::Arc;
use std::{net::SocketAddr, str::FromStr};
use tokio::sync::broadcast;
use tower_http::services::ServeDir;
use tower_http::trace::{DefaultOnRequest, TraceLayer};
use tower_http::trace::TraceLayer;
use uuid::Uuid;
use crate::CoreAction;
@ -288,7 +287,7 @@ pub async fn create_https_server(
let trace_layer = TraceLayer::new_for_http()
.make_span_with(trace::DefaultMakeSpanKanidmd::new())
// setting these to trace because all they do is print "started processing request", and we are already doing that enough!
.on_request(DefaultOnRequest::new().level(Level::TRACE));
.on_response(trace::DefaultOnResponseKanidmd::new());
let app = app
.merge(static_routes)


@ -1,6 +1,9 @@
//! Reimplementation of tower-http's DefaultMakeSpan that only runs at "INFO" level for our own needs.
use http::Request;
use kanidm_proto::constants::KOPID;
use sketching::event_dynamic_lvl;
use tower_http::LatencyUnit;
use tracing::{Level, Span};
/// The default way Spans will be created for Trace.
@ -22,7 +25,7 @@ impl Default for DefaultMakeSpanKanidmd {
}
impl<B> tower_http::trace::MakeSpan<B> for DefaultMakeSpanKanidmd {
#[instrument(name = "handle_request", skip_all)]
#[instrument(name = "handle_request", skip_all, fields(latency, status_code))]
fn make_span(&mut self, request: &Request<B>) -> Span {
tracing::span!(
Level::INFO,
@ -33,3 +36,64 @@ impl<B> tower_http::trace::MakeSpan<B> for DefaultMakeSpanKanidmd {
)
}
}
#[derive(Clone, Debug)]
pub(crate) struct DefaultOnResponseKanidmd {
#[allow(dead_code)]
level: Level,
#[allow(dead_code)]
latency_unit: LatencyUnit,
#[allow(dead_code)]
include_headers: bool,
}
impl DefaultOnResponseKanidmd {
#[allow(dead_code)]
pub fn new() -> Self {
Self::default()
}
}
impl Default for DefaultOnResponseKanidmd {
fn default() -> Self {
Self {
level: Level::INFO,
latency_unit: LatencyUnit::Millis,
include_headers: false,
}
}
}
impl<B> tower_http::trace::OnResponse<B> for DefaultOnResponseKanidmd {
fn on_response(
self,
response: &axum::response::Response<B>,
latency: std::time::Duration,
_span: &Span,
) {
let kopid = match response.headers().get(KOPID) {
Some(val) => val.to_str().unwrap_or("<invalid kopid>"),
None => "<unknown>",
};
let (level, msg) =
match response.status().is_success() || response.status().is_informational() {
true => (Level::INFO, "response sent"),
false => {
if response.status().is_redirection() {
(Level::INFO, "client redirection sent")
} else if response.status().is_client_error() {
(Level::WARN, "client error") // it worked, but there was an input error
} else {
(Level::ERROR, "error handling request") // oh no the server failed
}
}
};
event_dynamic_lvl!(
level,
?latency,
status_code = response.status().as_u16(),
kopid = kopid,
msg
);
}
}


@ -33,6 +33,12 @@ serde = { workspace = true, features = ["derive"] }
tokio = { workspace = true, features = ["rt-multi-thread", "macros", "signal"] }
tokio-util = { workspace = true, features = ["codec"] }
toml = { workspace = true }
opentelemetry = { workspace = true, features = ["logs"] }
opentelemetry_api = { workspace = true, features = ["logs"] }
tracing = { workspace = true, features = [
"max_level_trace",
"release_max_level_debug",
] }
[target.'cfg(target_os = "linux")'.dependencies]
sd-notify.workspace = true

File diff suppressed because it is too large

@ -518,7 +518,7 @@ lazy_static! {
};
}
/// Make a list of all the non-admin BuiltinGroup's that are created by default, doing it in a standard-ish way so we can use it for testing and stuff
/// Make a list of all the non-admin BuiltinGroups that are created by default, doing it in a standard-ish way so we can use it around the platform
pub fn idm_builtin_non_admin_groups() -> Vec<&'static BuiltinGroup> {
// Create any system default schema entries.
vec![


@ -62,7 +62,16 @@ impl Plugin for Domain {
}
}
fn generate_domain_cookie_key() -> Value {
let mut key = [0; 64];
let mut rng = StdRng::from_entropy();
rng.fill(&mut key);
Value::new_privatebinary(&key)
}
impl Domain {
/// Generates the cookie key for the domain.
fn modify_inner<T: Clone + std::fmt::Debug>(
qs: &mut QueryServerWriteTransaction,
cand: &mut [Entry<EntryInvalid, T>],
@ -129,11 +138,7 @@ impl Domain {
if !e.attribute_pres(Attribute::PrivateCookieKey) {
security_info!("regenerating domain cookie key");
let mut key = [0; 64];
let mut rng = StdRng::from_entropy();
rng.fill(&mut key);
let v = Value::new_privatebinary(&key);
e.add_ava(Attribute::PrivateCookieKey, v);
e.add_ava(Attribute::PrivateCookieKey, generate_domain_cookie_key());
}
trace!(?e);


@ -155,19 +155,20 @@ impl<'a> QueryServerWriteTransaction<'a> {
res
}
#[instrument(level = "debug", skip_all)]
/// - If the thing exists:
/// - Ensure the set of attributes match and are present
/// (but don't delete multivalue or extended attributes in this situation).
/// - If not:
/// - Create the entry
///
/// This will leave extra classes and attributes alone!
///
/// NOTE: `gen_modlist*` IS schema aware and will handle multivalue correctly!
pub fn internal_migrate_or_create(
&mut self,
e: Entry<EntryInit, EntryNew>,
) -> Result<(), OperationError> {
// if the thing exists, ensure the set of attributes on
// Entry A match and are present (but don't delete multivalue, or extended
// attributes in the situation.
// If not exist, create from Entry B
//
// This will extra classes an attributes alone!
//
// NOTE: gen modlist IS schema aware and will handle multivalue
// correctly!
trace!("internal_migrate_or_create operating on {:?}", e.get_uuid());
let Some(filt) = e.filter_from_attrs(&[Attribute::Uuid.into()]) else {
@ -298,7 +299,7 @@ impl<'a> QueryServerWriteTransaction<'a> {
/// a future version.
///
/// An extended feature of this is the ability to store multiple TOTP's per entry.
#[instrument(level = "debug", skip_all)]
#[instrument(level = "info", skip_all)]
pub fn migrate_9_to_10(&mut self) -> Result<(), OperationError> {
admin_warn!("starting 9 to 10 migration.");
let filter = filter!(f_or!([
@ -318,7 +319,7 @@ impl<'a> QueryServerWriteTransaction<'a> {
/// are, they are migrated to the passkey type, allowing us to deprecate and remove the older
/// credential behaviour.
///
#[instrument(level = "debug", skip_all)]
#[instrument(level = "info", skip_all)]
pub fn migrate_10_to_11(&mut self) -> Result<(), OperationError> {
admin_warn!("starting 9 to 10 migration.");
let filter = filter!(f_pres(Attribute::PrimaryCredential));
@ -363,9 +364,9 @@ impl<'a> QueryServerWriteTransaction<'a> {
/// Migrate 11 to 12
///
/// Rewrite api-tokens from session to a dedicated api token type.
/// Rewrite api-tokens from session to a dedicated API token type.
///
#[instrument(level = "debug", skip_all)]
#[instrument(level = "info", skip_all)]
pub fn migrate_11_to_12(&mut self) -> Result<(), OperationError> {
admin_warn!("starting 11 to 12 migration.");
// sync_token_session
@ -421,7 +422,8 @@ impl<'a> QueryServerWriteTransaction<'a> {
self.internal_apply_writable(mod_candidates)
}
#[instrument(level = "debug", skip_all)]
#[instrument(level = "info", skip_all)]
/// Deletes the Domain info privatecookiekey to force a regeneration as we changed the format
pub fn migrate_12_to_13(&mut self) -> Result<(), OperationError> {
admin_warn!("starting 12 to 13 migration.");
let filter = filter!(f_and!([
@ -434,7 +436,8 @@ impl<'a> QueryServerWriteTransaction<'a> {
// Complete
}
#[instrument(level = "debug", skip_all)]
#[instrument(level = "info", skip_all)]
/// - Deletes the incorrectly added "member" attribute on dynamic groups
pub fn migrate_13_to_14(&mut self) -> Result<(), OperationError> {
admin_warn!("starting 13 to 14 migration.");
let filter = filter!(f_eq(
@ -447,18 +450,20 @@ impl<'a> QueryServerWriteTransaction<'a> {
// Complete
}
#[instrument(level = "debug", skip_all)]
#[instrument(level = "info", skip_all)]
/// - Deletes the non-existing attribute for idverification private key which triggers it to regen
pub fn migrate_14_to_15(&mut self) -> Result<(), OperationError> {
admin_warn!("starting 14 to 15 migration.");
let filter = filter!(f_eq(Attribute::Class, EntryClass::Person.into()));
// Delete the non-existing attr for idv private key which triggers
// it to regen.
// Delete the non-existing attr for idv private key which triggers it to regen.
let modlist = ModifyList::new_purge(Attribute::IdVerificationEcKey);
self.internal_modify(&filter, &modlist)
// Complete
}
#[instrument(level = "debug", skip_all)]
#[instrument(level = "info", skip_all)]
/// - updates the system config to include the new session expiry values.
/// - adds the account policy object to idm_all_accounts
pub fn migrate_15_to_16(&mut self) -> Result<(), OperationError> {
admin_warn!("starting 15 to 16 migration.");
@ -509,7 +514,7 @@ impl<'a> QueryServerWriteTransaction<'a> {
// Complete
}
#[instrument(level = "debug", skip_all)]
#[instrument(level = "info", skip_all)]
pub fn initialise_schema_core(&mut self) -> Result<(), OperationError> {
admin_debug!("initialise_schema_core -> start ...");
// Load in all the "core" schema, that we already have in "memory".
@ -532,7 +537,7 @@ impl<'a> QueryServerWriteTransaction<'a> {
r
}
#[instrument(level = "debug", skip_all)]
#[instrument(level = "info", skip_all)]
pub fn initialise_schema_idm(&mut self) -> Result<(), OperationError> {
admin_debug!("initialise_schema_idm -> start ...");
@ -652,8 +657,8 @@ impl<'a> QueryServerWriteTransaction<'a> {
r
}
// This function is idempotent
#[instrument(level = "debug", skip_all)]
#[instrument(level = "info", skip_all)]
/// This function is idempotent, runs all the startup functionality and checks
pub fn initialise_idm(&mut self) -> Result<(), OperationError> {
// First, check the system_info object. This stores some server information
// and details. It's a pretty const thing. Also check anonymous, important to many
@ -684,9 +689,7 @@ impl<'a> QueryServerWriteTransaction<'a> {
debug_assert!(res.is_ok());
res?;
let idm_entries = idm_builtin_non_admin_groups();
let res: Result<(), _> = idm_entries
let res: Result<(), _> = idm_builtin_non_admin_groups()
.into_iter()
.try_for_each(|e| self.internal_migrate_or_create(e.clone().try_into()?));
if res.is_ok() {
@ -756,7 +759,8 @@ impl<'a> QueryServerWriteTransaction<'a> {
res?;
// Delete entries that no longer need to exist.
let delete_entries = [UUID_IDM_ACP_OAUTH2_READ_PRIV_V1];
// TODO: Shouldn't this be a migration?
let delete_entries: [Uuid; 1] = [UUID_IDM_ACP_OAUTH2_READ_PRIV_V1];
let res: Result<(), _> = delete_entries
.into_iter()


@ -6,7 +6,7 @@ use hashbrown::HashMap;
impl<'a> QueryServerWriteTransaction<'a> {
#[instrument(level = "debug", skip_all)]
pub fn purge_tombstones(&mut self) -> Result<(), OperationError> {
pub fn purge_tombstones(&mut self) -> Result<usize, OperationError> {
// purge everything that is a tombstone.
let trim_cid = self.trim_cid().clone();
@ -17,17 +17,18 @@ impl<'a> QueryServerWriteTransaction<'a> {
error!(err = ?e, "Tombstone purge operation failed (backend)");
e
})
.map(|_| {
.map(|res| {
admin_info!("Tombstone purge operation success");
res
})
}
#[instrument(level = "debug", skip_all)]
pub fn purge_recycled(&mut self) -> Result<(), OperationError> {
pub fn purge_recycled(&mut self) -> Result<usize, OperationError> {
// Send everything that is recycled to tombstone
// Search all recycled
let cid = self.cid.sub_secs(RECYCLEBIN_MAX_AGE).map_err(|e| {
admin_error!(err = ?e, "Unable to generate search cid");
admin_error!(err = ?e, "Unable to generate search cid for purge_recycled");
e
})?;
let rc = self.internal_search(filter_all!(f_and!([
@ -36,8 +37,8 @@ impl<'a> QueryServerWriteTransaction<'a> {
])))?;
if rc.is_empty() {
admin_info!("No recycled items present - purge operation success");
return Ok(());
admin_debug!("No recycled items present - purge operation success");
return Ok(0);
}
// Modify them to strip all avas except uuid
@ -56,6 +57,9 @@ impl<'a> QueryServerWriteTransaction<'a> {
.collect();
let tombstone_cand = tombstone_cand?;
// it's enough to say "yeah we tried to touch this many" because
// we're using this to decide if we're going to commit the txn
let touched = tombstone_cand.len();
// Backend Modify
self.be_txn
@ -66,6 +70,7 @@ impl<'a> QueryServerWriteTransaction<'a> {
})
.map(|_| {
admin_info!("Purge recycled operation success");
touched
})
}


@ -4,6 +4,8 @@ use uuid::Uuid;
use crate::prelude::*;
// TODO: this should *totally* be running the OTEL metrics collector
pub struct StatusRequestEvent {
pub eventid: Uuid,
}


@ -26,7 +26,6 @@ serde = { workspace = true, features = ["derive"] }
serde_json = { workspace = true }
serde-wasm-bindgen = { workspace = true }
time = { workspace = true }
url = { workspace = true }
uuid = { workspace = true }
wasm-bindgen = { workspace = true }
wasm-bindgen-futures = { workspace = true }


@ -27,7 +27,6 @@ serde_json = { workspace = true }
serde-wasm-bindgen = { workspace = true }
wasm-bindgen = { workspace = true }
wasm-bindgen-futures = { workspace = true }
url = { workspace = true }
uuid = { workspace = true }
yew = { workspace = true, features = ["csr"] }
yew-router = { workspace = true }


@ -225,7 +225,7 @@ function makeMutClosure(arg0, arg1, dtor, f) {
return real;
}
function __wbg_adapter_48(arg0, arg1) {
wasm._dyn_core__ops__function__FnMut_____Output___R_as_wasm_bindgen__closure__WasmClosure___describe__invoke__h09aa096681cc0b01(arg0, arg1);
wasm._dyn_core__ops__function__FnMut_____Output___R_as_wasm_bindgen__closure__WasmClosure___describe__invoke__ha55f8bc2a1dec3e6(arg0, arg1);
}
let stack_pointer = 128;
@ -1130,20 +1130,20 @@ function __wbg_get_imports() {
const ret = wasm.memory;
return addHeapObject(ret);
};
imports.wbg.__wbindgen_closure_wrapper1151 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 587, __wbg_adapter_48);
imports.wbg.__wbindgen_closure_wrapper1249 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 595, __wbg_adapter_48);
return addHeapObject(ret);
};
imports.wbg.__wbindgen_closure_wrapper3671 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1710, __wbg_adapter_51);
imports.wbg.__wbindgen_closure_wrapper3675 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1711, __wbg_adapter_51);
return addHeapObject(ret);
};
imports.wbg.__wbindgen_closure_wrapper3751 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1739, __wbg_adapter_54);
imports.wbg.__wbindgen_closure_wrapper3755 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1740, __wbg_adapter_54);
return addHeapObject(ret);
};
imports.wbg.__wbindgen_closure_wrapper3835 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1776, __wbg_adapter_57);
imports.wbg.__wbindgen_closure_wrapper3839 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1777, __wbg_adapter_57);
return addHeapObject(ret);
};


@ -28,7 +28,6 @@ serde_json = { workspace = true }
serde-wasm-bindgen = { workspace = true }
wasm-bindgen = { workspace = true }
wasm-bindgen-futures = { workspace = true }
url = { workspace = true }
uuid = { workspace = true }
yew = { workspace = true, features = ["csr"] }
yew-router = { workspace = true }


@ -225,7 +225,7 @@ function makeMutClosure(arg0, arg1, dtor, f) {
return real;
}
function __wbg_adapter_48(arg0, arg1) {
wasm._dyn_core__ops__function__FnMut_____Output___R_as_wasm_bindgen__closure__WasmClosure___describe__invoke__h09aa096681cc0b01(arg0, arg1);
wasm._dyn_core__ops__function__FnMut_____Output___R_as_wasm_bindgen__closure__WasmClosure___describe__invoke__ha55f8bc2a1dec3e6(arg0, arg1);
}
let stack_pointer = 128;
@ -1130,20 +1130,20 @@ function __wbg_get_imports() {
const ret = wasm.memory;
return addHeapObject(ret);
};
imports.wbg.__wbindgen_closure_wrapper1151 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 587, __wbg_adapter_48);
imports.wbg.__wbindgen_closure_wrapper1249 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 595, __wbg_adapter_48);
return addHeapObject(ret);
};
imports.wbg.__wbindgen_closure_wrapper3671 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1710, __wbg_adapter_51);
imports.wbg.__wbindgen_closure_wrapper3675 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1711, __wbg_adapter_51);
return addHeapObject(ret);
};
imports.wbg.__wbindgen_closure_wrapper3751 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1739, __wbg_adapter_54);
imports.wbg.__wbindgen_closure_wrapper3755 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1740, __wbg_adapter_54);
return addHeapObject(ret);
};
imports.wbg.__wbindgen_closure_wrapper3835 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1776, __wbg_adapter_57);
imports.wbg.__wbindgen_closure_wrapper3839 = function(arg0, arg1, arg2) {
const ret = makeMutClosure(arg0, arg1, 1777, __wbg_adapter_57);
return addHeapObject(ret);
};