Update

Improve token handling
It was possible that a token could be updated in a way that caused existing cached information to be lost if an event was delayed in it's write to the user token. To prevent this, the writes to user tokens now require the HsmLock to be held, and refresh the token just ahead of writing to ensure that these data can't be lost. The benefit to this approach is that readers remain unblocked by a writer.
2025-04-27 20:55:06 +02:00 · 2025-04-09 11:54:38 +10:00 · 2025-04-09 11:54:38 +10:00 · 2025-04-09 09:39:19 +10:00
5 changed files with 110 additions and 38 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -5658,9 +5658,9 @@ dependencies = [

 [[package]]
 name = "tokio"
-version = "1.44.1"
+version = "1.44.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f382da615b842244d4b8738c82ed1275e6c5dd90c459a30941cd07080b06c91a"
+checksum = "e6b88822cbe49de4185e3a4cbf8321dd487cf5fe0c5c65695fef6346371e9c48"
 dependencies = [
 "backtrace",
 "bytes",
--- a/Cargo.toml
+++ b/Cargo.toml
@ -268,7 +268,7 @@ tempfile = "3.15.0"
 testkit-macros = { path = "./server/testkit-macros" }
 time = { version = "^0.3.36", features = ["formatting", "local-offset"] }

-tokio = "^1.43.0"
+tokio = "^1.44.2"
 tokio-openssl = "^0.6.5"
 tokio-util = "^0.7.13"

--- a/unix_integration/resolver/src/idprovider/interface.rs
+++ b/unix_integration/resolver/src/idprovider/interface.rs
@ -194,7 +194,8 @@ impl Into<PamAuthResponse> for AuthRequest {
 }

 pub enum AuthResult {
-    Success { token: UserToken },
+    Success,
+    SuccessUpdate { new_token: UserToken },
    Denied,
    Next(AuthRequest),
 }
@ -251,6 +252,7 @@ pub trait IdProvider {
    async fn unix_user_online_auth_step(
        &self,
        _account_id: &str,
+        _current_token: Option<&UserToken>,
        _cred_handler: &mut AuthCredHandler,
        _pam_next_req: PamAuthRequest,
        _tpm: &mut tpm::BoxedDynTpm,
@ -290,7 +292,8 @@ pub trait IdProvider {
    // TPM key.
    async fn unix_user_offline_auth_step(
        &self,
-        _token: &UserToken,
+        _current_token: Option<&UserToken>,
+        _session_token: &UserToken,
        _cred_handler: &mut AuthCredHandler,
        _pam_next_req: PamAuthRequest,
        _tpm: &mut tpm::BoxedDynTpm,
--- a/unix_integration/resolver/src/idprovider/kanidm.rs
+++ b/unix_integration/resolver/src/idprovider/kanidm.rs
@ -55,8 +55,6 @@ impl KanidmProvider {
        tpm: &mut tpm::BoxedDynTpm,
        machine_key: &tpm::MachineKey,
    ) -> Result<Self, IdpError> {
-        // FUTURE: Randomised jitter on next check at startup.
-
        // Initially retrieve our HMAC key.
        let loadable_hmac_key: Option<tpm::LoadableHmacKey> = keystore
            .get_tagged_hsm_key(KANIDM_HMAC_KEY)
@ -248,13 +246,25 @@ impl KanidmProviderInternal {
            // Proceed
            CacheState::Online => true,
            CacheState::OfflineNextCheck(at_time) if now >= at_time => {
-                // Attempt online. If fails, return token.
                self.attempt_online(tpm, now).await
            }
            CacheState::OfflineNextCheck(_) | CacheState::Offline => false,
        }
    }

+    #[instrument(level = "debug", skip_all)]
+    async fn check_online_right_meow(
+        &mut self,
+        tpm: &mut tpm::BoxedDynTpm,
+        now: SystemTime,
+    ) -> bool {
+        match self.state {
+            CacheState::Online => true,
+            CacheState::OfflineNextCheck(_) => self.attempt_online(tpm, now).await,
+            CacheState::Offline => false,
+        }
+    }
+
    #[instrument(level = "debug", skip_all)]
    async fn attempt_online(&mut self, _tpm: &mut tpm::BoxedDynTpm, now: SystemTime) -> bool {
        let mut max_attempts = 3;
@ -295,7 +305,7 @@ impl IdProvider for KanidmProvider {

    async fn attempt_online(&self, tpm: &mut tpm::BoxedDynTpm, now: SystemTime) -> bool {
        let mut inner = self.inner.lock().await;
-        inner.check_online(tpm, now).await
+        inner.check_online_right_meow(tpm, now).await
    }

    async fn mark_next_check(&self, now: SystemTime) {
@ -431,6 +441,7 @@ impl IdProvider for KanidmProvider {
    async fn unix_user_online_auth_step(
        &self,
        account_id: &str,
+        current_token: Option<&UserToken>,
        cred_handler: &mut AuthCredHandler,
        pam_next_req: PamAuthRequest,
        tpm: &mut tpm::BoxedDynTpm,
@ -449,15 +460,23 @@ impl IdProvider for KanidmProvider {

                match auth_result {
                    Ok(Some(n_tok)) => {
-                        let mut token = UserToken::from(n_tok);
-                        token.kanidm_update_cached_password(
+                        let mut new_token = UserToken::from(n_tok);
+
+                        // Update any keys that may have been in the db in the current
+                        // token.
+                        if let Some(previous_token) = current_token {
+                            new_token.extra_keys = previous_token.extra_keys.clone();
+                        }
+
+                        // Set any new keys that are relevant from this authentication
+                        new_token.kanidm_update_cached_password(
                            &inner.crypto_policy,
                            cred.as_str(),
                            tpm,
                            &inner.hmac_key,
                        );

-                        Ok(AuthResult::Success { token })
+                        Ok(AuthResult::SuccessUpdate { new_token })
                    }
                    Ok(None) => {
                        // TODO: i'm not a huge fan of this rn, but currently the way we handle
@ -552,7 +571,8 @@ impl IdProvider for KanidmProvider {

    async fn unix_user_offline_auth_step(
        &self,
-        token: &UserToken,
+        current_token: Option<&UserToken>,
+        session_token: &UserToken,
        cred_handler: &mut AuthCredHandler,
        pam_next_req: PamAuthRequest,
        tpm: &mut tpm::BoxedDynTpm,
@ -561,11 +581,13 @@ impl IdProvider for KanidmProvider {
            (AuthCredHandler::Password, PamAuthRequest::Password { cred }) => {
                let inner = self.inner.lock().await;

-                if token.kanidm_check_cached_password(cred.as_str(), tpm, &inner.hmac_key) {
+                if session_token.kanidm_check_cached_password(cred.as_str(), tpm, &inner.hmac_key) {
+                    // Ensure we have either the latest token, or if none, at least the session token.
+                    let new_token = current_token.unwrap_or(session_token).clone();
+
                    // TODO: We can update the token here and then do lockouts.
-                    Ok(AuthResult::Success {
-                        token: token.clone(),
-                    })
+
+                    Ok(AuthResult::SuccessUpdate { new_token })
                } else {
                    Ok(AuthResult::Denied)
                }
--- a/unix_integration/resolver/src/resolver.rs
+++ b/unix_integration/resolver/src/resolver.rs
@ -47,7 +47,6 @@ pub enum AuthSession {
        client: Arc<dyn IdProvider + Sync + Send>,
        account_id: String,
        id: Id,
-        token: Option<Box<UserToken>>,
        cred_handler: AuthCredHandler,
        /// Some authentication operations may need to spawn background tasks. These tasks need
        /// to know when to stop as the caller has disconnected. This receiver allows that, so
@ -59,7 +58,7 @@ pub enum AuthSession {
        account_id: String,
        id: Id,
        client: Arc<dyn IdProvider + Sync + Send>,
-        token: Box<UserToken>,
+        session_token: Box<UserToken>,
        cred_handler: AuthCredHandler,
    },
    System {
@ -225,7 +224,7 @@ impl Resolver {
        //  Attempt to search these in the db.
        let mut dbtxn = self.db.write().await;
        let r = dbtxn.get_account(account_id).map_err(|err| {
-            debug!("get_cached_usertoken {:?}", err);
+            debug!(?err, "get_cached_usertoken");
        })?;

        drop(dbtxn);
@ -318,7 +317,12 @@ impl Resolver {
        }
    }

-    async fn set_cache_usertoken(&self, token: &mut UserToken) -> Result<(), ()> {
+    async fn set_cache_usertoken(
+        &self,
+        token: &mut UserToken,
+        // This is just for proof that only one write can occur at a time.
+        _tpm: &mut BoxedDynTpm,
+    ) -> Result<(), ()> {
        // Set an expiry
        let ex_time = SystemTime::now() + Duration::from_secs(self.timeout_seconds);
        let offset = ex_time
@ -451,6 +455,22 @@ impl Resolver {

        let mut hsm_lock = self.hsm.lock().await;

+        // We need to re-acquire the token now behind the hsmlock - this is so that
+        // we know that as we write the updated token, we know that no one else has
+        // written to this token, since we are now the only task that is allowed
+        // to be in a write phase.
+        let token = if token.is_some() {
+            self.get_cached_usertoken(account_id)
+                .await
+                .map(|(_expired, option_token)| option_token)
+                .map_err(|err| {
+                    debug!(?err, "get_usertoken error");
+                })?
+        } else {
+            // Was already none, leave it that way.
+            None
+        };
+
        let user_get_result = if let Some(tok) = token.as_ref() {
            // Re-use the provider that the token is from.
            match self.client_ids.get(&tok.provider) {
@ -486,12 +506,11 @@ impl Resolver {
            }
        };

-        drop(hsm_lock);
-
        match user_get_result {
            Ok(UserTokenState::Update(mut n_tok)) => {
                // We have the token!
-                self.set_cache_usertoken(&mut n_tok).await?;
+                self.set_cache_usertoken(&mut n_tok, hsm_lock.deref_mut())
+                    .await?;
                Ok(Some(n_tok))
            }
            Ok(UserTokenState::NotFound) => {
@ -958,7 +977,6 @@ impl Resolver {
                            client,
                            account_id: account_id.to_string(),
                            id,
-                            token: Some(Box::new(token)),
                            cred_handler,
                            shutdown_rx,
                        };
@ -979,7 +997,7 @@ impl Resolver {
                            account_id: account_id.to_string(),
                            id,
                            client,
-                            token: Box::new(token),
+                            session_token: Box::new(token),
                            cred_handler,
                        };
                        Ok((auth_session, next_req.into()))
@ -1022,7 +1040,6 @@ impl Resolver {
                            client: client.clone(),
                            account_id: account_id.to_string(),
                            id,
-                            token: None,
                            cred_handler,
                            shutdown_rx,
                        };
@ -1050,19 +1067,32 @@ impl Resolver {
        auth_session: &mut AuthSession,
        pam_next_req: PamAuthRequest,
    ) -> Result<PamAuthResponse, ()> {
+        let mut hsm_lock = self.hsm.lock().await;
+
        let maybe_err = match &mut *auth_session {
            &mut AuthSession::Online {
                ref client,
                ref account_id,
-                id: _,
-                token: _,
+                ref id,
                ref mut cred_handler,
                ref shutdown_rx,
            } => {
-                let mut hsm_lock = self.hsm.lock().await;
+                // This is not used in the authentication, but is so that any new
+                // extra keys or data on the token are updated correctly if the authentication
+                // requests an update. Since we hold the hsm_lock, no other task can
+                // update this token between now and completion of the fn.
+                let current_token = self
+                    .get_cached_usertoken(id)
+                    .await
+                    .map(|(_expired, option_token)| option_token)
+                    .map_err(|err| {
+                        debug!(?err, "get_usertoken error");
+                    })?;
+
                let result = client
                    .unix_user_online_auth_step(
                        account_id,
+                        current_token.as_ref(),
                        cred_handler,
                        pam_next_req,
                        hsm_lock.deref_mut(),
@ -1071,7 +1101,7 @@ impl Resolver {
                    .await;

                match result {
-                    Ok(AuthResult::Success { .. }) => {
+                    Ok(AuthResult::SuccessUpdate { .. } | AuthResult::Success) => {
                        info!(?account_id, "Authentication Success");
                    }
                    Ok(AuthResult::Denied) => {
@ -1087,17 +1117,29 @@ impl Resolver {
            }
            &mut AuthSession::Offline {
                ref account_id,
-                id: _,
+                ref id,
                ref client,
-                ref token,
+                ref session_token,
                ref mut cred_handler,
            } => {
+                // This is not used in the authentication, but is so that any new
+                // extra keys or data on the token are updated correctly if the authentication
+                // requests an update. Since we hold the hsm_lock, no other task can
+                // update this token between now and completion of the fn.
+                let current_token = self
+                    .get_cached_usertoken(id)
+                    .await
+                    .map(|(_expired, option_token)| option_token)
+                    .map_err(|err| {
+                        debug!(?err, "get_usertoken error");
+                    })?;
+
                // We are offline, continue. Remember, authsession should have
                // *everything you need* to proceed here!
-                let mut hsm_lock = self.hsm.lock().await;
                let result = client
                    .unix_user_offline_auth_step(
-                        token,
+                        current_token.as_ref(),
+                        session_token,
                        cred_handler,
                        pam_next_req,
                        hsm_lock.deref_mut(),
@ -1105,7 +1147,7 @@ impl Resolver {
                    .await;

                match result {
-                    Ok(AuthResult::Success { .. }) => {
+                    Ok(AuthResult::SuccessUpdate { .. } | AuthResult::Success) => {
                        info!(?account_id, "Authentication Success");
                    }
                    Ok(AuthResult::Denied) => {
@ -1156,8 +1198,13 @@ impl Resolver {

        match maybe_err {
            // What did the provider direct us to do next?
-            Ok(AuthResult::Success { mut token }) => {
-                self.set_cache_usertoken(&mut token).await?;
+            Ok(AuthResult::Success) => {
+                *auth_session = AuthSession::Success;
+                Ok(PamAuthResponse::Success)
+            }
+            Ok(AuthResult::SuccessUpdate { mut new_token }) => {
+                self.set_cache_usertoken(&mut new_token, hsm_lock.deref_mut())
+                    .await?;
                *auth_session = AuthSession::Success;

                Ok(PamAuthResponse::Success)