Replication tweaks - try the most recent successful one and error less (#2189)

* made an error less error-y and also found a way to try the last-most-working repl peer
This commit is contained in:
James Hodgkinson 2023-10-07 13:09:42 +10:00 committed by GitHub
parent 0adc3e0dd9
commit 48979b8e1a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 101 additions and 75 deletions

View file

@ -115,8 +115,15 @@ kanidm login --name demo_user
kanidm self whoami --name demo_user
```
{{#template templates/kani-warning.md imagepath=images title=Warning! text=Don't use the direct
credential reset to lock or invalidate an account. You should expire the account instead. }}
<!-- deno-fmt-ignore-start -->
{{#template templates/kani-warning.md
imagepath=images
title=Warning!
text=Don't use the direct credential reset to lock or invalidate an account. You should expire the account instead.
}}
<!-- deno-fmt-ignore-end -->
## Reauthentication / Privilege Access Mode

View file

@ -392,7 +392,7 @@ pub(crate) async fn handle_conn(
.serve_connection(tls_stream, svc)
.await
.map_err(|e| {
error!("Failed to complete connection: {:?}", e);
debug!("Failed to complete connection: {:?}", e);
std::io::Error::from(ErrorKind::ConnectionAborted)
})
}

View file

@ -8,12 +8,15 @@ use std::net::SocketAddr;
use std::pin::Pin;
use std::sync::Arc;
use std::time::Duration;
use tokio::net::{TcpListener, TcpStream};
use tokio::sync::broadcast;
use tokio::sync::mpsc;
use tokio::sync::oneshot;
use tokio::sync::Mutex;
use tokio::time::{interval, sleep, timeout};
use tokio::{
net::{TcpListener, TcpStream},
task::JoinHandle,
};
use tokio_openssl::SslStream;
use tokio_util::codec::{Framed, FramedRead, FramedWrite};
use tracing::{error, Instrument};
@ -77,7 +80,7 @@ pub(crate) async fn create_repl_server(
"Starting replication interface https://{} ...",
repl_config.bindaddress
);
let repl_handle = tokio::spawn(repl_acceptor(
let repl_handle: JoinHandle<()> = tokio::spawn(repl_acceptor(
listener,
idms,
repl_config.clone(),
@ -90,15 +93,22 @@ pub(crate) async fn create_repl_server(
}
#[instrument(level = "info", skip_all)]
/// This returns the remote address that worked, so you can try that first next time
async fn repl_consumer_connect_supplier(
domain: &str,
sock_addrs: &[SocketAddr],
tls_connector: &SslConnector,
consumer_conn_settings: &ConsumerConnSettings,
) -> Option<Framed<SslStream<TcpStream>, codec::ConsumerCodec>> {
) -> Option<(
SocketAddr,
Framed<SslStream<TcpStream>, codec::ConsumerCodec>,
)> {
// This is pretty gnarly, but we need to loop to try out each socket addr.
for sock_addr in sock_addrs {
debug!("Connecting to {} replica via {}", domain, sock_addr);
debug!(
"Attempting to connect to {} replica via {}",
domain, sock_addr
);
let tcpstream = match timeout(
consumer_conn_settings.replica_connect_timeout,
@ -108,29 +118,29 @@ async fn repl_consumer_connect_supplier(
{
Ok(Ok(tc)) => tc,
Ok(Err(err)) => {
error!(?err, "Failed to connect to {}", sock_addr);
debug!(?err, "Failed to connect to {}", sock_addr);
continue;
}
Err(_) => {
error!("Timeout connecting to {}", sock_addr);
debug!("Timeout connecting to {}", sock_addr);
continue;
}
};
trace!("connection established");
trace!("Connection established to peer on {:?}", sock_addr);
let mut tlsstream = match Ssl::new(tls_connector.context())
.and_then(|tls_obj| SslStream::new(tls_obj, tcpstream))
{
Ok(ta) => ta,
Err(e) => {
error!("replication client TLS setup error, continuing -> {:?}", e);
error!("Replication client TLS setup error, continuing -> {:?}", e);
continue;
}
};
if let Err(e) = SslStream::connect(Pin::new(&mut tlsstream)).await {
error!("replication client TLS accept error, continuing -> {:?}", e);
error!("Replication client TLS accept error, continuing -> {:?}", e);
continue;
};
@ -138,14 +148,19 @@ async fn repl_consumer_connect_supplier(
tlsstream,
codec::ConsumerCodec::new(consumer_conn_settings.max_frame_bytes),
);
return Some(supplier_conn);
// "hey this one worked, try it first next time!"
return Some((sock_addr.to_owned(), supplier_conn));
}
error!("Unable to connect to supplier.");
error!(
"Unable to connect to supplier, tried to connect to {:?}",
sock_addrs
);
None
}
/// This returns the socket address that worked, so you can try that first next time
#[instrument(level="info", skip(refresh_coord, tls_connector, idms), fields(uuid=Uuid::new_v4().to_string()))]
async fn repl_run_consumer_refresh(
refresh_coord: Arc<Mutex<(bool, mpsc::Sender<()>)>>,
domain: &str,
@ -153,7 +168,7 @@ async fn repl_run_consumer_refresh(
tls_connector: &SslConnector,
idms: &IdmServer,
consumer_conn_settings: &ConsumerConnSettings,
) {
) -> Result<Option<SocketAddr>, ()> {
// Take the refresh lock. Note that every replication consumer *should* end up here
// behind this lock, but only one can proceed. This is what we want!
@ -161,44 +176,37 @@ async fn repl_run_consumer_refresh(
// Simple case - task is already done.
if refresh_coord_guard.0 {
trace!("refresh already completed by another task, return.");
return;
trace!("Refresh already completed by another task, return.");
return Ok(None);
}
// okay, we need to proceed.
let Some(mut supplier_conn) =
let (addr, mut supplier_conn) =
repl_consumer_connect_supplier(domain, sock_addrs, tls_connector, consumer_conn_settings)
.await
else {
return;
};
.ok_or(())?;
// If we fail at any point, just RETURN because this leaves the next task to attempt, or
// the channel drops and that tells the caller this failed.
if let Err(err) = supplier_conn.send(ConsumerRequest::Refresh).await {
error!(?err, "consumer encode error, unable to continue.");
return;
}
supplier_conn
.send(ConsumerRequest::Refresh)
.await
.map_err(|err| error!(?err, "consumer encode error, unable to continue."))?;
let refresh = if let Some(codec_msg) = supplier_conn.next().await {
match codec_msg {
Ok(SupplierResponse::Refresh(changes)) => {
match codec_msg.map_err(|err| error!(?err, "Consumer decode error, unable to continue."))? {
SupplierResponse::Refresh(changes) => {
// Success - return to bypass the error message.
changes
}
Ok(SupplierResponse::Pong) | Ok(SupplierResponse::Incremental(_)) => {
SupplierResponse::Pong | SupplierResponse::Incremental(_) => {
error!("Supplier Response contains invalid State");
return;
}
Err(err) => {
error!(?err, "consumer decode error, unable to continue.");
return;
return Err(());
}
}
} else {
error!("Connection closed");
return;
return Err(());
};
// Now apply the refresh if possible
@ -206,14 +214,11 @@ async fn repl_run_consumer_refresh(
// Scope the transaction.
let ct = duration_from_epoch_now();
let mut write_txn = idms.proxy_write(ct).await;
if let Err(err) = write_txn
write_txn
.qs_write
.consumer_apply_refresh(&refresh)
.and_then(|cs| write_txn.commit().map(|()| cs))
{
error!(?err, "consumer was not able to apply refresh.");
return;
}
.map_err(|err| error!(?err, "Consumer was not able to apply refresh."))?;
}
// Now mark the refresh as complete AND indicate it to the channel.
@ -224,9 +229,11 @@ async fn repl_run_consumer_refresh(
// Here the coord guard will drop and every other task proceeds.
warn!("Replication refresh was successful.");
info!("Replication refresh was successful.");
Ok(Some(addr))
}
#[instrument(level="info", skip(tls_connector, idms), fields(eventid=Uuid::new_v4().to_string()))]
async fn repl_run_consumer(
domain: &str,
sock_addrs: &[SocketAddr],
@ -234,12 +241,12 @@ async fn repl_run_consumer(
automatic_refresh: bool,
idms: &IdmServer,
consumer_conn_settings: &ConsumerConnSettings,
) {
let Some(mut supplier_conn) =
) -> Option<SocketAddr> {
let Some((socket_addr, mut supplier_conn)) =
repl_consumer_connect_supplier(domain, sock_addrs, tls_connector, consumer_conn_settings)
.await
else {
return;
return None;
};
// Perform incremental.
@ -252,7 +259,7 @@ async fn repl_run_consumer(
?err,
"consumer ruv range could not be accessed, unable to continue."
);
return;
return None;
}
}
};
@ -262,7 +269,7 @@ async fn repl_run_consumer(
.await
{
error!(?err, "consumer encode error, unable to continue.");
return;
return None;
}
let changes = if let Some(codec_msg) = supplier_conn.next().await {
@ -273,16 +280,16 @@ async fn repl_run_consumer(
}
Ok(SupplierResponse::Pong) | Ok(SupplierResponse::Refresh(_)) => {
error!("Supplier Response contains invalid State");
return;
return None;
}
Err(err) => {
error!(?err, "consumer decode error, unable to continue.");
return;
return None;
}
}
} else {
error!("Connection closed");
return;
return None;
};
// Now apply the changes if possible
@ -297,7 +304,7 @@ async fn repl_run_consumer(
Ok(state) => state,
Err(err) => {
error!(?err, "consumer was not able to apply changes.");
return;
return None;
}
}
};
@ -306,21 +313,21 @@ async fn repl_run_consumer(
ConsumerState::Ok => {
info!("Incremental Replication Success");
// return to bypass the failure message.
return;
return Some(socket_addr);
}
ConsumerState::RefreshRequired => {
if automatic_refresh {
warn!("Consumer is out of date and must be refreshed. This will happen *now*.");
} else {
error!("Consumer is out of date and must be refreshed. You must manually resolve this situation.");
return;
return None;
};
}
}
if let Err(err) = supplier_conn.send(ConsumerRequest::Refresh).await {
error!(?err, "consumer encode error, unable to continue.");
return;
return None;
}
let refresh = if let Some(codec_msg) = supplier_conn.next().await {
@ -331,16 +338,16 @@ async fn repl_run_consumer(
}
Ok(SupplierResponse::Pong) | Ok(SupplierResponse::Incremental(_)) => {
error!("Supplier Response contains invalid State");
return;
return None;
}
Err(err) => {
error!(?err, "consumer decode error, unable to continue.");
return;
return None;
}
}
} else {
error!("Connection closed");
return;
return None;
};
// Now apply the refresh if possible
@ -352,10 +359,11 @@ async fn repl_run_consumer(
.and_then(|cs| write_txn.commit().map(|()| cs))
{
error!(?err, "consumer was not able to apply refresh.");
return;
return None;
}
warn!("Replication refresh was successful.");
Some(socket_addr)
}
#[derive(Debug, Clone)]
@ -450,43 +458,54 @@ async fn repl_task(
info!("Replica task for {} has started.", origin);
// we keep track of the "last known good" socketaddr so we can try that first next time.
let mut last_working_address: Option<SocketAddr> = None;
// Okay, all the parameters are setup. Now we wait on our interval.
loop {
// if the target address worked last time, then let's use it this time!
let mut sorted_socket_addrs = vec![];
if let Some(addr) = last_working_address {
sorted_socket_addrs.push(addr);
};
// this is O(2^n) but we *should* be talking about a small number of addresses for a given hostname
socket_addrs.iter().for_each(|addr| {
if !sorted_socket_addrs.contains(addr) {
sorted_socket_addrs.push(addr.to_owned());
}
});
tokio::select! {
Ok(task) = task_rx.recv() => {
match task {
ReplConsumerCtrl::Stop => break,
ReplConsumerCtrl::Refresh ( refresh_coord ) => {
let eventid = Uuid::new_v4();
let span = info_span!("replication_run_consumer_refresh", uuid = ?eventid);
// let _enter = span.enter();
repl_run_consumer_refresh(
last_working_address = match repl_run_consumer_refresh(
refresh_coord,
domain,
&socket_addrs,
&sorted_socket_addrs,
&tls_connector,
&idms,
&consumer_conn_settings
)
.instrument(span)
.await
.await {
Ok(val) => val,
Err(_) => None
};
}
}
}
_ = repl_interval.tick() => {
// Interval passed, attempt a replication run.
let eventid = Uuid::new_v4();
let span = info_span!("replication_run_consumer", uuid = ?eventid);
// let _enter = span.enter();
repl_run_consumer(
domain,
&socket_addrs,
&sorted_socket_addrs,
&tls_connector,
automatic_refresh,
&idms,
&consumer_conn_settings
)
.instrument(span)
.await;
}
}
@ -697,7 +716,7 @@ async fn repl_acceptor(
} => {
let task_rx = task_tx.subscribe();
let handle = tokio::spawn(repl_task(
let handle: JoinHandle<()> = tokio::spawn(repl_task(
origin.clone(),
server_key.clone(),
server_cert.clone(),
@ -879,7 +898,7 @@ async fn repl_acceptor(
let _ = task_tx.send(ReplConsumerCtrl::Stop);
for task_handle in task_handles.drain(..) {
// Let each task join.
let res: Result<(), _> = task_handle.await;
let res: Result<(), _> = task_handle.await.map(|_| ());
if res.is_err() {
warn!("Failed to join replication task, continuing ...");
}

View file

@ -661,7 +661,7 @@ impl FilterComp {
// * If all filters are okay, return Ok(Filter::Or())
// * Any filter is invalid, return the error.
// * An empty "or" is a valid filter in mathematical terms, but we throw an
// error to warn the user because it's super unlikey they want that
// error to warn the user because it's super unlikely they want that
if filters.is_empty() {
return Err(SchemaError::EmptyFilter);
};
@ -676,7 +676,7 @@ impl FilterComp {
// * If all filters are okay, return Ok(Filter::Or())
// * Any filter is invalid, return the error.
// * An empty "and" is a valid filter in mathematical terms, but we throw an
// error to warn the user because it's super unlikey they want that
// error to warn the user because it's super unlikely they want that
if filters.is_empty() {
return Err(SchemaError::EmptyFilter);
};