From f810d2d855298051524124bf79a42e472efe72ac Mon Sep 17 00:00:00 2001
From: Colin Dellow
Date: Sat, 28 Jan 2023 14:29:11 -0500
Subject: [PATCH 1/2] array facet: don't materialize unnecessary columns

The presence of `inner.*` causes SQLite to materialize a row with all
the columns, only for those columns to be discarded later. Instead, we
can select only the column we'll use. This lets SQLite's optimizer
realize that the other columns in the CTE definition aren't needed.

On a test table with 278K rows, 98K of which had an array, this speeds
up the facet calculation from 4 sec to 1 sec.
---
 datasette/facets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datasette/facets.py b/datasette/facets.py
index 7fb0c68b45..129f6ee2f4 100644
--- a/datasette/facets.py
+++ b/datasette/facets.py
@@ -383,7 +383,7 @@ async def facet_results(self):
             deduped_array_items as (
                 select
                     distinct j.value,
-                    inner.*
+                    inner.{col}
                 from
                     json_each([inner].{col}) j
                     join inner
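For illustration, this is roughly the query the template expands to after
this commit. It is a hand-substituted sketch, not SQL copied from Datasette:
it assumes a hypothetical `otters` table whose column name escapes to plain
`tags`, a `{sql}` of `select * from otters`, and the default facet size of
30, which gives `limit 31`.

    with inner as (select * from otters),
    deduped_array_items as (
        select
            distinct j.value,
            inner.tags
        from
            json_each([inner].tags) j
            join inner
    )
    select
        value as value,
        count(*) as count
    from
        deduped_array_items
    group by
        value
    order by
        count(*) desc, value limit 31

Selecting `inner.tags` instead of `inner.*` keeps the DISTINCT narrow, so
SQLite never needs to materialize the unused columns of each row.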
From f529a3001d35a114d2e622dcc7913c4c25a95ed8 Mon Sep 17 00:00:00 2001
From: Colin Dellow
Date: Sat, 28 Jan 2023 23:44:57 -0500
Subject: [PATCH 2/2] array facets uniqueness fix

Use a window function to generate a unique ID for each row, to avoid
having to do a DISTINCT on arbitrarily wide rows. A DISTINCT over the
value and array columns (or over entire rows) collapses rows that are
identical, undercounting their tags; a synthetic row_number keeps such
rows distinct.

Add a test to capture the case where multiple rows are identical.
---
 datasette/facets.py  | 16 +++++-----------
 tests/test_facets.py | 24 ++++++++++++++++++------
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/datasette/facets.py b/datasette/facets.py
index 129f6ee2f4..6daf7780c3 100644
--- a/datasette/facets.py
+++ b/datasette/facets.py
@@ -380,23 +380,17 @@ async def facet_results(self):
         # https://github.com/simonw/datasette/issues/448
         facet_sql = """
             with inner as ({sql}),
-            deduped_array_items as (
-                select
-                    distinct j.value,
-                    inner.{col}
-                from
-                    json_each([inner].{col}) j
-                    join inner
-            )
+            with_ids as (select row_number() over () as row_number, {col} as array from inner),
+            array_items as (select row_number, each.value from json_each(with_ids.array) each, with_ids)
             select
                 value as value,
-                count(*) as count
+                count(distinct row_number) as count
             from
-                deduped_array_items
+                array_items
             group by
                 value
             order by
-                count(*) desc, value limit {limit}
+                count(distinct row_number) desc, value limit {limit}
         """.format(
             col=escape_sqlite(column), sql=self.sql, limit=facet_size + 1
         )
diff --git a/tests/test_facets.py b/tests/test_facets.py
index d264f534b5..ea7c8aa81e 100644
--- a/tests/test_facets.py
+++ b/tests/test_facets.py
@@ -408,21 +408,33 @@ async def test_array_facet_results(ds_client):
 async def test_array_facet_handle_duplicate_tags():
     ds = Datasette([], memory=True)
     db = ds.add_database(Database(ds, memory_name="test_array_facet"))
-    await db.execute_write("create table otters(name text, tags text)")
-    for name, tags in (
-        ("Charles", ["friendly", "cunning", "friendly"]),
-        ("Shaun", ["cunning", "empathetic", "friendly"]),
-        ("Tracy", ["empathetic", "eager"]),
+    await db.execute_write("create table otters(tags text)")
+    for tags in (
+        ["friendly", "cunning", "friendly"],
+        ["cunning", "empathetic", "friendly"],
+        ["empathetic", "eager"],
+        ["placid"],
+        ["placid"],
+        ["placid"],
     ):
         await db.execute_write(
-            "insert into otters (name, tags) values (?, ?)", [name, json.dumps(tags)]
+            "insert into otters (tags) values (?)", [json.dumps(tags)]
         )
 
     response = await ds.client.get("/test_array_facet/otters.json?_facet_array=tags")
+
+    print(response.json()["facet_results"]["tags"])
     assert response.json()["facet_results"]["tags"] == {
         "name": "tags",
         "type": "array",
         "results": [
+            {
+                "value": "placid",
+                "label": "placid",
+                "count": 3,
+                "toggle_url": "http://localhost/test_array_facet/otters.json?_facet_array=tags&tags__arraycontains=placid",
+                "selected": False,
+            },
             {
                 "value": "cunning",
                 "label": "cunning",
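For comparison, the rewritten query under the same hypothetical
substitutions as the earlier sketch (`{sql}` of `select * from otters`,
`{col}` of `tags`, `limit 31`) would read roughly:

    with inner as (select * from otters),
    with_ids as (select row_number() over () as row_number, tags as array from inner),
    array_items as (select row_number, each.value from json_each(with_ids.array) each, with_ids)
    select
        value as value,
        count(distinct row_number) as count
    from
        array_items
    group by
        value
    order by
        count(distinct row_number) desc, value limit 31

Each source row gets its own row_number before the arrays are expanded, so
the three identical ["placid"] rows keep three distinct row_numbers and
count as 3, while the duplicated "friendly" tag inside a single row's array
still collapses to one via count(distinct row_number). The deduplication
now operates on a single integer per row rather than on arbitrarily wide
rows.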