Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 157 additions & 40 deletions docs/reference/metrics.md

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions quickwit/quickwit-common/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -342,6 +342,7 @@ pub struct InFlightDataGauges {
pub doc_processor_mailbox: IntGauge,
pub indexer_mailbox: IntGauge,
pub index_writer: IntGauge,
pub get_object: IntGauge,
in_flight_gauge_vec: IntGaugeVec<1>,
}

Expand All @@ -365,6 +366,7 @@ impl Default for InFlightDataGauges {
doc_processor_mailbox: in_flight_gauge_vec.with_label_values(["doc_processor_mailbox"]),
indexer_mailbox: in_flight_gauge_vec.with_label_values(["indexer_mailbox"]),
index_writer: in_flight_gauge_vec.with_label_values(["index_writer"]),
get_object: in_flight_gauge_vec.with_label_values(["get_object"]),
in_flight_gauge_vec: in_flight_gauge_vec.clone(),
}
}
Expand Down
17 changes: 16 additions & 1 deletion quickwit/quickwit-ingest/src/ingest_v2/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ use mrecordlog::ResourceUsage;
use once_cell::sync::Lazy;
use quickwit_common::metrics::{
Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, exponential_buckets,
linear_buckets, new_counter_vec, new_gauge, new_gauge_vec, new_histogram, new_histogram_vec,
linear_buckets, new_counter, new_counter_vec, new_gauge, new_gauge_vec, new_histogram,
new_histogram_vec,
};

// Counter vec counting the different outcomes of ingest requests as
Expand Down Expand Up @@ -82,6 +83,8 @@ pub(super) struct IngestV2Metrics {
pub wal_disk_used_bytes: IntGauge,
pub wal_memory_used_bytes: IntGauge,
pub ingest_results: IngestResultMetrics,
pub replicated_num_bytes_total: IntCounter,
pub replicated_num_docs_total: IntCounter,
}

impl Default for IngestV2Metrics {
Expand Down Expand Up @@ -146,6 +149,18 @@ impl Default for IngestV2Metrics {
"ingest",
&[],
),
replicated_num_bytes_total: new_counter(
"replicated_num_bytes_total",
"Total size in bytes of the replicated docs.",
"ingest",
&[],
),
replicated_num_docs_total: new_counter(
"replicated_num_docs_total",
"Total number of docs replicated.",
"ingest",
&[],
),
}
}
}
Expand Down
6 changes: 3 additions & 3 deletions quickwit/quickwit-ingest/src/ingest_v2/replication.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ use super::metrics::report_wal_usage;
use super::models::IngesterShard;
use super::mrecordlog_utils::check_enough_capacity;
use super::state::IngesterState;
use crate::ingest_v2::metrics::INGEST_V2_METRICS;
use crate::ingest_v2::mrecordlog_utils::{AppendDocBatchError, append_non_empty_doc_batch};
use crate::metrics::INGEST_METRICS;
use crate::{estimate_size, with_lock_metrics};

pub(super) const SYN_REPLICATION_STREAM_CAPACITY: usize = 5;
Expand Down Expand Up @@ -667,10 +667,10 @@ impl ReplicationTask {
.expect("replica shard should be initialized")
.set_replication_position_inclusive(current_position_inclusive.clone(), now);

INGEST_METRICS
INGEST_V2_METRICS
.replicated_num_bytes_total
.inc_by(batch_num_bytes);
INGEST_METRICS
INGEST_V2_METRICS
.replicated_num_docs_total
.inc_by(batch_num_docs);

Expand Down
28 changes: 3 additions & 25 deletions quickwit/quickwit-ingest/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,14 @@
// limitations under the License.

use once_cell::sync::Lazy;
use quickwit_common::metrics::{IntCounter, IntGauge, new_counter, new_counter_vec, new_gauge};
use quickwit_common::metrics::{IntCounter, new_counter_vec};

pub struct IngestMetrics {
// With ingest V1 all ingested documents are considered valid
pub ingested_docs_bytes_valid: IntCounter,
pub ingested_docs_valid: IntCounter,
pub ingested_docs_bytes_invalid: IntCounter,
pub ingested_docs_invalid: IntCounter,
pub ingested_docs_valid: IntCounter,

pub replicated_num_bytes_total: IntCounter,
pub replicated_num_docs_total: IntCounter,
Comment on lines -24 to -25
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved this to be part of of ingest_v2 metrics

#[allow(dead_code)] // this really shouldn't be dead, it needs to be used somewhere
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this has been dead for a year

pub queue_count: IntGauge,
}

impl Default for IngestMetrics {
Expand Down Expand Up @@ -56,24 +52,6 @@ impl Default for IngestMetrics {
ingested_docs_bytes_invalid,
ingested_docs_valid,
ingested_docs_invalid,
replicated_num_bytes_total: new_counter(
"replicated_num_bytes_total",
"Total size in bytes of the replicated docs.",
"ingest",
&[],
),
replicated_num_docs_total: new_counter(
"replicated_num_docs_total",
"Total number of docs replicated.",
"ingest",
&[],
),
queue_count: new_gauge(
"queue_count",
"Number of queues currently active",
"ingest",
&[],
),
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions quickwit/quickwit-serve/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,14 +52,14 @@ impl Default for ServeMetrics {
),
ongoing_requests: new_gauge_vec(
"ongoing_requests",
"Number of ongoing requests.",
"Number of ongoing requests on specific endpoint groups",
"",
&[],
["endpoint_group"],
),
pending_requests: new_gauge_vec(
"pending_requests",
"Number of pending requests.",
"Number of pending requests on specific endpoint groups",
"",
&[],
["endpoint_group"],
Expand Down
131 changes: 41 additions & 90 deletions quickwit/quickwit-storage/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@

use once_cell::sync::Lazy;
use quickwit_common::metrics::{
GaugeGuard, Histogram, IntCounter, IntCounterVec, IntGauge, new_counter, new_counter_vec,
new_gauge, new_histogram_vec,
GaugeGuard, HistogramVec, IntCounter, IntCounterVec, IntGauge, MEMORY_METRICS, new_counter,
new_counter_vec, new_gauge, new_histogram_vec,
};

/// Counters associated to storage operations.
Expand All @@ -30,19 +30,11 @@ pub struct StorageMetrics {
pub searcher_split_cache: CacheMetrics,
pub get_slice_timeout_successes: [IntCounter; 3],
pub get_slice_timeout_all_timeouts: IntCounter,
pub object_storage_get_total: IntCounter,
pub object_storage_get_errors_total: IntCounterVec<1>,
pub object_storage_get_slice_in_flight_count: IntGauge,
pub object_storage_get_slice_in_flight_num_bytes: IntGauge,
Comment on lines -35 to -36
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • I created this metric to make sure that this wasn't the reason for OOMs on search and I confirmed it's not the case
  • I think it makes sense to keep tracking the memory usage, but move it to quickwit_memory_in_flight where it is more discoverable and actionable

pub object_storage_put_total: IntCounter,
pub object_storage_put_parts: IntCounter,
pub object_storage_download_num_bytes: IntCounter,
pub object_storage_upload_num_bytes: IntCounter,

pub object_storage_delete_requests_total: IntCounter,
pub object_storage_bulk_delete_requests_total: IntCounter,
pub object_storage_delete_request_duration: Histogram,
pub object_storage_bulk_delete_request_duration: Histogram,
pub object_storage_requests_total: IntCounterVec<2>,
pub object_storage_request_duration: HistogramVec<2>,
pub object_storage_download_num_bytes: IntCounterVec<1>,
pub object_storage_download_errors: IntCounterVec<1>,
pub object_storage_upload_num_bytes: IntCounterVec<1>,
}

impl Default for StorageMetrics {
Expand All @@ -63,31 +55,6 @@ impl Default for StorageMetrics {
let get_slice_timeout_all_timeouts =
get_slice_timeout_outcome_total_vec.with_label_values(["all_timeouts"]);

let object_storage_requests_total = new_counter_vec(
"object_storage_requests_total",
"Total number of object storage requests performed.",
"storage",
&[],
["action"],
);
let object_storage_delete_requests_total =
object_storage_requests_total.with_label_values(["delete_object"]);
let object_storage_bulk_delete_requests_total =
object_storage_requests_total.with_label_values(["delete_objects"]);

let object_storage_request_duration = new_histogram_vec(
"object_storage_request_duration_seconds",
"Duration of object storage requests in seconds.",
"storage",
&[],
["action"],
vec![0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0],
);
let object_storage_delete_request_duration =
object_storage_request_duration.with_label_values(["delete_object"]);
let object_storage_bulk_delete_request_duration =
object_storage_request_duration.with_label_values(["delete_objects"]);

StorageMetrics {
fast_field_cache: CacheMetrics::for_component("fastfields"),
fd_cache_metrics: CacheMetrics::for_component("fd"),
Expand All @@ -97,62 +64,50 @@ impl Default for StorageMetrics {
split_footer_cache: CacheMetrics::for_component("splitfooter"),
get_slice_timeout_successes,
get_slice_timeout_all_timeouts,
object_storage_get_total: new_counter(
"object_storage_gets_total",
"Number of objects fetched. Might be lower than get_slice_timeout_outcome if \
queries are debounced.",
object_storage_requests_total: new_counter_vec(
"object_storage_requests_total",
"Number of requests to the object store, by action and status. Requests are \
recorded when the response headers are returned, download failures will not \
appear as errors.",
"storage",
&[],
["action", "status"],
),
object_storage_get_errors_total: new_counter_vec::<1>(
"object_storage_get_errors_total",
"Number of GetObject errors.",
object_storage_request_duration: new_histogram_vec(
"object_storage_request_duration",
"Durations until the response headers are returned from the object store, by \
action and status. This does not measure the download time for the body content.",
"storage",
&[],
["code"],
["action", "status"],
vec![0.1, 0.5, 1.0, 5.0, 10.0, 30.0, 60.0],
),
object_storage_get_slice_in_flight_count: new_gauge(
"object_storage_get_slice_in_flight_count",
"Number of GetObject for which the memory was allocated but the download is still \
in progress.",
"storage",
&[],
),
object_storage_get_slice_in_flight_num_bytes: new_gauge(
"object_storage_get_slice_in_flight_num_bytes",
"Memory allocated for GetObject requests that are still in progress.",
"storage",
&[],
),
object_storage_put_total: new_counter(
"object_storage_puts_total",
"Number of objects uploaded. May differ from object_storage_requests_parts due to \
multipart upload.",
object_storage_download_num_bytes: new_counter_vec(
"object_storage_download_num_bytes",
"Amount of data downloaded from object storage.",
"storage",
&[],
["status"],
),
object_storage_put_parts: new_counter(
"object_storage_puts_parts",
"Number of object parts uploaded.",
"",
&[],
),
object_storage_download_num_bytes: new_counter(
"object_storage_download_num_bytes",
"Amount of data downloaded from an object storage.",
object_storage_download_errors: new_counter_vec(
"object_storage_download_errors",
// Download errors are recorded separately because the associated
// get_object requests were already recorded as successful in
// object_storage_requests_total
"Number of download requests that received successful response headers but failed \
during download.",
"storage",
&[],
["status"],
),
object_storage_upload_num_bytes: new_counter(
object_storage_upload_num_bytes: new_counter_vec(
"object_storage_upload_num_bytes",
"Amount of data uploaded to an object storage.",
"Amount of data uploaded to object storage. The value recorded for failed and \
aborted uploads is the full payload size.",
"storage",
&[],
["status"],
),
object_storage_delete_requests_total,
object_storage_bulk_delete_requests_total,
object_storage_delete_request_duration,
object_storage_bulk_delete_request_duration,
}
}
}
Expand Down Expand Up @@ -229,15 +184,11 @@ pub static STORAGE_METRICS: Lazy<StorageMetrics> = Lazy::new(StorageMetrics::def
pub static CACHE_METRICS_FOR_TESTS: Lazy<CacheMetrics> =
Lazy::new(|| CacheMetrics::for_component("fortest"));

pub fn object_storage_get_slice_in_flight_guards(
get_request_size: usize,
) -> (GaugeGuard<'static>, GaugeGuard<'static>) {
let mut bytes_guard = GaugeGuard::from_gauge(
&crate::STORAGE_METRICS.object_storage_get_slice_in_flight_num_bytes,
);
/// Helps tracking pre-allocated memory for downloads that are still in progress.
///
/// This is actually recorded as a memory metric and not a storage metric.
pub fn object_storage_get_slice_in_flight_guards(get_request_size: usize) -> GaugeGuard<'static> {
let mut bytes_guard = GaugeGuard::from_gauge(&MEMORY_METRICS.in_flight.get_object);
bytes_guard.add(get_request_size as i64);
let mut count_guard =
GaugeGuard::from_gauge(&crate::STORAGE_METRICS.object_storage_get_slice_in_flight_count);
count_guard.add(1);
(bytes_guard, count_guard)
bytes_guard
}
Loading