From f810d2d855298051524124bf79a42e472efe72ac Mon Sep 17 00:00:00 2001
From: Colin Dellow
Date: Sat, 28 Jan 2023 14:29:11 -0500
Subject: [PATCH 1/2] array facet: don't materialize unnecessary columns

The presence of `inner.*` causes SQLite to materialize a row with all
the columns, only for those columns to be discarded later. Instead, we
can select only the column we'll use. This lets SQLite's optimizer
realize that the other columns in the CTE definition aren't needed.

On a test table with 278K rows, 98K of which had an array, this speeds
up the facet calculation from 4 sec to 1 sec.
---
 datasette/facets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasette/facets.py b/datasette/facets.py
index 7fb0c68b45..129f6ee2f4 100644
--- a/datasette/facets.py
+++ b/datasette/facets.py
@@ -383,7 +383,7 @@ async def facet_results(self):
             deduped_array_items as (
                 select
                     distinct j.value,
-                    inner.*
+                    inner.{col}
                 from
                     json_each([inner].{col}) j
                     join inner
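For illustration, this is roughly the query the template expands to after
this commit. It is a hand-substituted sketch, not SQL copied from Datasette:
it assumes a hypothetical `otters` table whose column name escapes to plain
`tags`, a `{sql}` of `select * from otters`, and the default facet size of
30, which gives `limit 31`.

    with inner as (select * from otters),
    deduped_array_items as (
        select
            distinct j.value,
            inner.tags
        from
            json_each([inner].tags) j
            join inner
    )
    select
        value as value,
        count(*) as count
    from
        deduped_array_items
    group by
        value
    order by
        count(*) desc, value limit 31

Selecting `inner.tags` instead of `inner.*` keeps the DISTINCT narrow, so
SQLite never needs to materialize the unused columns of each row.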
From f529a3001d35a114d2e622dcc7913c4c25a95ed8 Mon Sep 17 00:00:00 2001
From: Colin Dellow
Date: Sat, 28 Jan 2023 23:44:57 -0500
Subject: [PATCH 2/2] array facets uniqueness fix

Use a window function to generate a unique ID for each row, to avoid
having to do a DISTINCT on arbitrarily wide rows. A DISTINCT over the
value and array columns (or over entire rows) collapses rows that are
identical, undercounting their tags; a synthetic row_number keeps such
rows distinct.

Add a test to capture the case where multiple rows are identical.
---
 datasette/facets.py  | 16 +++++-----------
 tests/test_facets.py | 24 ++++++++++++++++++------
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/datasette/facets.py b/datasette/facets.py
index 129f6ee2f4..6daf7780c3 100644
--- a/datasette/facets.py
+++ b/datasette/facets.py
@@ -380,23 +380,17 @@ async def facet_results(self):
         # https://github.com/simonw/datasette/issues/448
         facet_sql = """
             with inner as ({sql}),
-            deduped_array_items as (
-                select
-                    distinct j.value,
-                    inner.{col}
-                from
-                    json_each([inner].{col}) j
-                    join inner
-            )
+            with_ids as (select row_number() over () as row_number, {col} as array from inner),
+            array_items as (select row_number, each.value from json_each(with_ids.array) each, with_ids)
             select
                 value as value,
-                count(*) as count
+                count(distinct row_number) as count
             from
-                deduped_array_items
+                array_items
             group by
                 value
             order by
-                count(*) desc, value limit {limit}
+                count(distinct row_number) desc, value limit {limit}
         """.format(
             col=escape_sqlite(column), sql=self.sql, limit=facet_size + 1
         )
diff --git a/tests/test_facets.py b/tests/test_facets.py
index d264f534b5..ea7c8aa81e 100644
--- a/tests/test_facets.py
+++ b/tests/test_facets.py
@@ -408,21 +408,33 @@ async def test_array_facet_results(ds_client):
 async def test_array_facet_handle_duplicate_tags():
     ds = Datasette([], memory=True)
     db = ds.add_database(Database(ds, memory_name="test_array_facet"))
-    await db.execute_write("create table otters(name text, tags text)")
-    for name, tags in (
-        ("Charles", ["friendly", "cunning", "friendly"]),
-        ("Shaun", ["cunning", "empathetic", "friendly"]),
-        ("Tracy", ["empathetic", "eager"]),
+    await db.execute_write("create table otters(tags text)")
+    for tags in (
+        ["friendly", "cunning", "friendly"],
+        ["cunning", "empathetic", "friendly"],
+        ["empathetic", "eager"],
+        ["placid"],
+        ["placid"],
+        ["placid"],
     ):
         await db.execute_write(
-            "insert into otters (name, tags) values (?, ?)", [name, json.dumps(tags)]
+            "insert into otters (tags) values (?)", [json.dumps(tags)]
         )
 
     response = await ds.client.get("/test_array_facet/otters.json?_facet_array=tags")
+
+    print(response.json()["facet_results"]["tags"])
     assert response.json()["facet_results"]["tags"] == {
         "name": "tags",
         "type": "array",
         "results": [
+            {
+                "value": "placid",
+                "label": "placid",
+                "count": 3,
+                "toggle_url": "http://localhost/test_array_facet/otters.json?_facet_array=tags&tags__arraycontains=placid",
+                "selected": False,
+            },
             {
                 "value": "cunning",
                 "label": "cunning",
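For comparison, the rewritten query under the same hypothetical
substitutions as the earlier sketch (`{sql}` of `select * from otters`,
`{col}` of `tags`, `limit 31`) would read roughly:

    with inner as (select * from otters),
    with_ids as (select row_number() over () as row_number, tags as array from inner),
    array_items as (select row_number, each.value from json_each(with_ids.array) each, with_ids)
    select
        value as value,
        count(distinct row_number) as count
    from
        array_items
    group by
        value
    order by
        count(distinct row_number) desc, value limit 31

Each source row gets its own row_number before the arrays are expanded, so
the three identical ["placid"] rows keep three distinct row_numbers and
count as 3, while the duplicated "friendly" tag inside a single row's array
still collapses to one via count(distinct row_number). The deduplication
now operates on a single integer per row rather than on arbitrarily wide
rows.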