From e2af2fc86d5f43c2f7a5fa7310c232f67fde6989 Mon Sep 17 00:00:00 2001 From: Benjamin Kane Date: Tue, 11 Mar 2025 20:51:21 -0400 Subject: [PATCH] Optimize Query Performance counts with count scans (#5538) * optimize qp counts with count scans * optional fixes * update gql output --- .../estimatedCountsFragment.graphql.ts | 4 +- .../aggregationsQuery.graphql.ts | 5 +- app/packages/state/src/recoil/aggregations.ts | 2 + app/schema.graphql | 5 +- fiftyone/core/aggregations.py | 27 ++++++-- fiftyone/server/aggregations.py | 55 +++++++++------- tests/unittests/server_aggregations_tests.py | 65 +++++++++++-------- 7 files changed, 101 insertions(+), 62 deletions(-) diff --git a/app/packages/relay/src/fragments/__generated__/estimatedCountsFragment.graphql.ts b/app/packages/relay/src/fragments/__generated__/estimatedCountsFragment.graphql.ts index 03d65589e77..152dcf30c0b 100644 --- a/app/packages/relay/src/fragments/__generated__/estimatedCountsFragment.graphql.ts +++ b/app/packages/relay/src/fragments/__generated__/estimatedCountsFragment.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<<7f7e5448668dba5a46cbd016ef2c2ce5>> + * @generated SignedSource<<2c72c0c7ecd6d46a9c22e81126108379>> * @lightSyntaxTransform * @nogrep */ @@ -12,7 +12,7 @@ import { Fragment, ReaderFragment } from 'relay-runtime'; import { FragmentRefs } from "relay-runtime"; export type estimatedCountsFragment$data = { readonly estimatedFrameCount: number | null; - readonly estimatedSampleCount: number; + readonly estimatedSampleCount: number | null; readonly " $fragmentType": "estimatedCountsFragment"; }; export type estimatedCountsFragment$key = { diff --git a/app/packages/relay/src/queries/__generated__/aggregationsQuery.graphql.ts b/app/packages/relay/src/queries/__generated__/aggregationsQuery.graphql.ts index 4fc5907aa33..5db4875b330 100644 --- a/app/packages/relay/src/queries/__generated__/aggregationsQuery.graphql.ts +++ b/app/packages/relay/src/queries/__generated__/aggregationsQuery.graphql.ts @@ -1,5 +1,5 @@ /** - * @generated SignedSource<> + * @generated SignedSource<<881e44ed7a20e117c89dd22661bc1a0d>> * @lightSyntaxTransform * @nogrep */ @@ -18,6 +18,7 @@ export type AggregationForm = { index?: number | null; mixed: boolean; paths: ReadonlyArray; + queryPerformance?: boolean | null; sampleIds: ReadonlyArray; slice?: string | null; slices?: ReadonlyArray | null; @@ -78,7 +79,7 @@ export type aggregationsQuery$data = { readonly values: ReadonlyArray<{ readonly count: number; readonly value: string; - }>; + }> | null; } | { // This will never be '%other', but we need some // value in case none of the concrete values match. diff --git a/app/packages/state/src/recoil/aggregations.ts b/app/packages/state/src/recoil/aggregations.ts index 86befde765a..aa3829531ac 100644 --- a/app/packages/state/src/recoil/aggregations.ts +++ b/app/packages/state/src/recoil/aggregations.ts @@ -13,6 +13,7 @@ import { groupStatistics, } from "./groups"; import { sidebarSampleId } from "./modal"; +import { queryPerformance } from "./queryPerformance"; import { RelayEnvironmentKey } from "./relay"; import * as schemaAtoms from "./schema"; import * as selectors from "./selectors"; @@ -85,6 +86,7 @@ export const aggregationQuery = graphQLSelectorFamily< slices: mixed ? get(groupSlices) : get(currentSlices(modal)), slice: get(groupSlice), view: customView ? customView : !root ? get(viewAtoms.view) : [], + queryPerformance: get(queryPerformance), }; return { diff --git a/app/schema.graphql b/app/schema.graphql index 4a7eef66d9b..037a83c0fb6 100644 --- a/app/schema.graphql +++ b/app/schema.graphql @@ -32,6 +32,7 @@ input AggregationForm { slices: [String!] view: BSONArray! viewName: String = null + queryPerformance: Boolean = false } union AggregationResponses = @@ -238,7 +239,7 @@ type Dataset { appConfig: DatasetAppConfig info: JSON estimatedFrameCount: Int - estimatedSampleCount: Int! + estimatedSampleCount: Int frameIndexes: [Index!] sampleIndexes: [Index!] stages(slug: String = null, view: BSONArray = null): BSONArray @@ -795,7 +796,7 @@ type StringAggregation implements Aggregation { path: String! count: Int! exists: Int! - values: [StringAggregationValue!]! + values: [StringAggregationValue!] } type StringAggregationValue { diff --git a/fiftyone/core/aggregations.py b/fiftyone/core/aggregations.py index fe85fd9758d..4a09ecd0530 100644 --- a/fiftyone/core/aggregations.py +++ b/fiftyone/core/aggregations.py @@ -5,6 +5,7 @@ | `voxel51.com `_ | """ + from collections import defaultdict, OrderedDict from copy import deepcopy from datetime import date, datetime @@ -537,9 +538,15 @@ class Count(Aggregation): """ def __init__( - self, field_or_expr=None, expr=None, safe=False, _unwind=True + self, + field_or_expr=None, + expr=None, + safe=False, + _optimize=False, + _unwind=True, ): super().__init__(field_or_expr, expr=expr, safe=safe) + self._optimize = _optimize self._unwind = _unwind def _kwargs(self): @@ -575,10 +582,15 @@ def to_mongo(self, sample_collection, context=None): safe=self._safe, unwind=self._unwind, context=context, + optimize=self._optimize, ) - if not sample_collection._contains_videos() or path != "frames": - pipeline.append({"$match": {"$expr": {"$gt": ["$" + path, None]}}}) + # todo: can we omit filtering out none in all situations + if not self._optimize: + if not sample_collection._contains_videos() or path != "frames": + pipeline.append( + {"$match": {"$expr": {"$gt": ["$" + path, None]}}} + ) pipeline.append({"$count": "count"}) @@ -795,9 +807,9 @@ def to_mongo(self, sample_collection, context=None): pipeline += [ { "$match": { - "$expr": {"$and": exprs} - if len(exprs) > 1 - else exprs[0] + "$expr": ( + {"$and": exprs} if len(exprs) > 1 else exprs[0] + ) } } ] @@ -2992,6 +3004,7 @@ def _parse_field_and_expr( unwind=True, allow_missing=False, context=None, + optimize=False, ): # unwind can be {True, False, -1} auto_unwind = unwind != False @@ -3098,7 +3111,7 @@ def _parse_field_and_expr( {"$replaceRoot": {"newRoot": "$frames"}}, ] ) - elif not context: + elif not context and not optimize: pipeline.append({"$project": {path: True}}) elif unwind_list_fields: pipeline.append({"$project": {path: True}}) diff --git a/fiftyone/server/aggregations.py b/fiftyone/server/aggregations.py index 56dcd0c9645..b83e44a0aab 100644 --- a/fiftyone/server/aggregations.py +++ b/fiftyone/server/aggregations.py @@ -42,6 +42,7 @@ class AggregationForm: slices: t.Optional[t.List[str]] view: BSONArray view_name: t.Optional[str] = None + query_performance: t.Optional[bool] = False @gql.interface @@ -53,8 +54,8 @@ class Aggregation: @gql.type class BooleanAggregation(Aggregation): - false: int - true: int + false: int = 0 + true: int = 0 @gql.type @@ -70,11 +71,11 @@ class IntAggregation(Aggregation): @gql.type class FloatAggregation(Aggregation): - inf: int + inf: int = 0 max: t.Optional[float] min: t.Optional[float] - nan: int - ninf: int + nan: int = 0 + ninf: int = 0 @gql.type @@ -92,7 +93,7 @@ class StringAggregationValue: @gql.type class StringAggregation(Aggregation): - values: t.List[StringAggregationValue] + values: t.Optional[t.List[StringAggregationValue]] = None AggregateResult = gql.union( @@ -141,7 +142,10 @@ async def aggregate_resolver( ) aggregations, deserializers = zip( - *[_resolve_path_aggregation(path, view) for path in form.paths] + *[ + _resolve_path_aggregation(path, view, form.query_performance) + for path in form.paths + ] ) counts = [len(a) for a in aggregations] flattened = [item for sublist in aggregations for item in sublist] @@ -209,10 +213,12 @@ async def _load_view(form: AggregationForm, slices: t.List[str]): def _resolve_path_aggregation( - path: str, view: foc.SampleCollection + path: str, view: foc.SampleCollection, query_performance: bool ) -> AggregateResult: aggregations: t.List[foa.Aggregation] = [ - foa.Count(path if path and path != "" else None) + foa.Count( + path if path and path != "" else None, _optimize=query_performance + ) ] field = view.get_field(path) @@ -225,23 +231,26 @@ def _resolve_path_aggregation( else RootAggregation ) - if meets_type(field, fof.BooleanField): - aggregations.append(foa.CountValues(path)) - - elif meets_type(field, (fof.DateField, fof.DateTimeField, fof.IntField)): - aggregations.append(foa.Bounds(path)) + if not query_performance: + if meets_type(field, fof.BooleanField): + aggregations.append(foa.CountValues(path)) - elif meets_type(field, fof.FloatField): - aggregations.append( - foa.Bounds( - path, - safe=True, - _count_nonfinites=True, + elif meets_type( + field, (fof.DateField, fof.DateTimeField, fof.IntField) + ): + aggregations.append(foa.Bounds(path)) + + elif meets_type(field, fof.FloatField): + aggregations.append( + foa.Bounds( + path, + safe=True, + _count_nonfinites=True, + ) ) - ) - elif meets_type(field, (fof.ObjectIdField, fof.StringField)): - aggregations.append(foa.CountValues(path, _first=LIST_LIMIT)) + elif meets_type(field, (fof.ObjectIdField, fof.StringField)): + aggregations.append(foa.CountValues(path, _first=LIST_LIMIT)) data = {"path": path} diff --git a/tests/unittests/server_aggregations_tests.py b/tests/unittests/server_aggregations_tests.py index bc715c31745..807af3b52a6 100644 --- a/tests/unittests/server_aggregations_tests.py +++ b/tests/unittests/server_aggregations_tests.py @@ -94,34 +94,29 @@ async def test_group_mode_sidebar_counts(self, dataset: fo.Dataset): } """ - result = await execute( - schema, - query, - { - "form": { - "index": 0, - "dataset": dataset.name, - "extended_stages": {}, - "filters": { - "label.label": { - "values": [ - "default", - ], - "exclude": False, - "isMatching": False, - } - }, - "group_id": None, - "hidden_labels": [], - "paths": ["label.label"], - "mixed": True, - "sample_ids": [], - "slice": "default", - "slices": None, - "view": [], + form = { + "index": 0, + "dataset": dataset.name, + "extended_stages": {}, + "filters": { + "label.label": { + "values": [ + "default", + ], + "exclude": False, + "isMatching": False, } }, - ) + "group_id": None, + "hidden_labels": [], + "paths": ["label.label"], + "mixed": True, + "sample_ids": [], + "slice": "default", + "slices": None, + "view": [], + } + result = await execute(schema, query, {"form": form}) # ensure only "default" count is returned, "other" should be omitted self.assertEqual( @@ -138,6 +133,24 @@ async def test_group_mode_sidebar_counts(self, dataset: fo.Dataset): }, ) + form["query_performance"] = True + result = await execute(schema, query, {"form": form}) + + # when query performance is on, omit values + self.assertEqual( + result.data, + { + "aggregations": [ + { + "path": "label.label", + "count": 1, + "exists": 1, + "values": None, + }, + ], + }, + ) + class TestGroupModeHistogramCounts(unittest.IsolatedAsyncioTestCase): @drop_async_dataset