diff --git a/src/Http/Controllers/Api/AnnotationCandidateController.php b/src/Http/Controllers/Api/AnnotationCandidateController.php
index 301b387..1b73fd1 100644
--- a/src/Http/Controllers/Api/AnnotationCandidateController.php
+++ b/src/Http/Controllers/Api/AnnotationCandidateController.php
@@ -88,34 +88,53 @@ public function indexSimilar($id, $id2)
         $job = MaiaJob::findOrFail($id);
         $this->authorize('access', $job);
 
-        $feature = AnnotationCandidateFeatureVector::where('job_id', $id)
-            ->findOrFail($id2);
-
-        // Manually optimized query for the cosine distance. The nearestNeighbors()
-        // method of pgvector seems to compute the distances twice and returns lots
-        // of data that we don't need.
-        $ids = $feature->whereNotNull('vector')
-            ->where('id', '!=', $feature->id)
-            ->where('job_id', $id)
-            ->orderByRaw('vector <=> ?', [$feature->vector])
-            ->pluck('id');
-
-        $count = $ids->count();
-        if ($count === 0) {
+        $query = AnnotationCandidateFeatureVector::where('job_id', $id);
+        // Resolve the reference vector through the job-scoped query so that an ID
+        // of another job results in a 404 instead of being used for the ordering.
+        $feature = $query->clone()->findOrFail($id2);
+        $hasFeatures = $query->clone()
+            ->whereNotNull('vector')
+            ->whereNot('id', $id2)
+            ->exists();
+
+        if (!$hasFeatures) {
             abort(Response::HTTP_NOT_FOUND);
         }
 
-        if ($count !== ($job->annotationCandidates()->count() - 1)) {
+        $yieldItems = function () use ($query, $feature, $job): \Generator {
+            // Manually optimized query for the cosine distance. The nearestNeighbors()
+            // method of pgvector seems to compute the distances twice and returns lots
+            // of data that we don't need.
+            $idsQuery = $query->clone()
+                ->whereNotNull('vector')
+                ->whereNot('id', $feature->id)
+                ->orderByRaw('vector <=> ?', [$feature->vector])
+                ->orderBy('id')
+                ->select('id');
+
+            foreach ($idsQuery->lazy() as $item) {
+                yield $item->id;
+            }
+
             // Add IDs of candidates without feature vectors at the end.
-            $ids = $ids->concat(
-                $job->annotationCandidates()
-                    ->whereNotIn('id', $ids)
-                    ->whereNot('id', $id2)
-                    ->pluck('id')
-            );
-        }
+            // Only rows with a non-null vector were yielded above, so exclude exactly
+            // those (plus the reference item) to avoid dropping or repeating an ID.
+            $remainingIdsQuery = $job->annotationCandidates()
+                ->whereNot('id', $feature->id)
+                ->whereNotIn('id', function ($subquery) use ($job) {
+                    $subquery->select('id')
+                        ->from('maia_annotation_candidate_feature_vectors')
+                        ->where('job_id', $job->id)
+                        ->whereNotNull('vector');
+                });
+
+            foreach ($remainingIdsQuery->lazyById() as $item) {
+                yield $item->id;
+            }
+        };
 
-        return $ids;
+        // Use a streamed response because there can be a lot of items.
+        return response()->streamJson($yieldItems());
     }
 
     /**
diff --git a/src/Http/Controllers/Api/TrainingProposalController.php b/src/Http/Controllers/Api/TrainingProposalController.php
index 86d5676..97d85e2 100644
--- a/src/Http/Controllers/Api/TrainingProposalController.php
+++ b/src/Http/Controllers/Api/TrainingProposalController.php
@@ -85,34 +85,53 @@ public function indexSimilar($id, $id2)
         $job = MaiaJob::findOrFail($id);
         $this->authorize('access', $job);
 
-        $feature = TrainingProposalFeatureVector::where('job_id', $id)
-            ->findOrFail($id2);
-
-        // Manually optimized query for the cosine distance. The nearestNeighbors()
-        // method of pgvector seems to compute the distances twice and returns lots
-        // of data that we don't need.
-        $ids = $feature->whereNotNull('vector')
-            ->where('id', '!=', $feature->id)
-            ->where('job_id', $id)
-            ->orderByRaw('vector <=> ?', [$feature->vector])
-            ->pluck('id');
-
-        $count = $ids->count();
-        if ($count === 0) {
+        $query = TrainingProposalFeatureVector::where('job_id', $id);
+        // Resolve the reference vector through the job-scoped query so that an ID
+        // of another job results in a 404 instead of being used for the ordering.
+        $feature = $query->clone()->findOrFail($id2);
+        $hasFeatures = $query->clone()
+            ->whereNotNull('vector')
+            ->whereNot('id', $id2)
+            ->exists();
+
+        if (!$hasFeatures) {
             abort(Response::HTTP_NOT_FOUND);
         }
 
-        if ($count !== ($job->trainingProposals()->count() - 1)) {
+        $yieldItems = function () use ($query, $feature, $job): \Generator {
+            // Manually optimized query for the cosine distance. The nearestNeighbors()
+            // method of pgvector seems to compute the distances twice and returns lots
+            // of data that we don't need.
+            $idsQuery = $query->clone()
+                ->whereNotNull('vector')
+                ->whereNot('id', $feature->id)
+                ->orderByRaw('vector <=> ?', [$feature->vector])
+                ->orderBy('id')
+                ->select('id');
+
+            foreach ($idsQuery->lazy() as $item) {
+                yield $item->id;
+            }
+
             // Add IDs of proposals without feature vectors at the end.
-            $ids = $ids->concat(
-                $job->trainingProposals()
-                    ->whereNotIn('id', $ids)
-                    ->whereNot('id', $id2)
-                    ->pluck('id')
-            );
-        }
+            // Only rows with a non-null vector were yielded above, so exclude exactly
+            // those (plus the reference item) to avoid dropping or repeating an ID.
+            $remainingIdsQuery = $job->trainingProposals()
+                ->whereNot('id', $feature->id)
+                ->whereNotIn('id', function ($subquery) use ($job) {
+                    $subquery->select('id')
+                        ->from('maia_training_proposal_feature_vectors')
+                        ->where('job_id', $job->id)
+                        ->whereNotNull('vector');
+                });
+
+            foreach ($remainingIdsQuery->lazyById() as $item) {
+                yield $item->id;
+            }
+        };
 
-        return $ids;
+        // Use a streamed response because there can be a lot of items.
+        return response()->streamJson($yieldItems());
     }
 
     /**
diff --git a/tests/Http/Controllers/Api/AnnotationCandidateControllerTest.php b/tests/Http/Controllers/Api/AnnotationCandidateControllerTest.php
index 66a648c..5f25cb1 100644
--- a/tests/Http/Controllers/Api/AnnotationCandidateControllerTest.php
+++ b/tests/Http/Controllers/Api/AnnotationCandidateControllerTest.php
@@ -225,9 +225,20 @@ public function testIndexSimilarity()
             'vector' => range(0, 383),
         ]);
 
-        $this->getJson("/api/v1/maia-jobs/{$id}/annotation-candidates/similar-to/{$ac1->id}")
-            ->assertStatus(200)
-            ->assertExactJson([$ac3->id, $ac2->id]);
+        $response = $this->getJson("/api/v1/maia-jobs/{$id}/annotation-candidates/similar-to/{$ac1->id}")
+            ->assertStatus(200);
+
+        ob_start();
+        $response->sendContent();
+        $content = ob_get_clean();
+        $response = new TestResponse(
+            new Response($content,
+                $response->baseResponse->getStatusCode(),
+                $response->baseResponse->headers->all()
+            )
+        );
+
+        $response->assertExactJson([$ac3->id, $ac2->id]);
     }
 
     public function testIndexSimilarityMissing()
@@ -250,9 +261,20 @@ public function testIndexSimilarityMissing()
         $ac3 = AnnotationCandidateTest::create(['job_id' => $id]);
 
         $this->beEditor();
-        $this->getJson("/api/v1/maia-jobs/{$id}/annotation-candidates/similar-to/{$ac1->id}")
-            ->assertStatus(200)
-            ->assertExactJson([$ac2->id, $ac3->id]);
+        $response = $this->getJson("/api/v1/maia-jobs/{$id}/annotation-candidates/similar-to/{$ac1->id}")
+            ->assertStatus(200);
+
+        ob_start();
+        $response->sendContent();
+        $content = ob_get_clean();
+        $response = new TestResponse(
+            new Response($content,
+                $response->baseResponse->getStatusCode(),
+                $response->baseResponse->headers->all()
+            )
+        );
+
+        $response->assertExactJson([$ac2->id, $ac3->id]);
     }
 
     public function testIndexSimilarityEmpty()
diff --git a/tests/Http/Controllers/Api/TrainingProposalControllerTest.php b/tests/Http/Controllers/Api/TrainingProposalControllerTest.php
index c73403f..0bd04ad 100644
--- a/tests/Http/Controllers/Api/TrainingProposalControllerTest.php
+++ b/tests/Http/Controllers/Api/TrainingProposalControllerTest.php
@@ -207,9 +207,20 @@ public function testIndexSimilarity()
             'vector' => range(0, 383),
         ]);
 
-        $this->getJson("/api/v1/maia-jobs/{$id}/training-proposals/similar-to/{$tp1->id}")
-            ->assertStatus(200)
-            ->assertExactJson([$tp3->id, $tp2->id]);
+        $response = $this->getJson("/api/v1/maia-jobs/{$id}/training-proposals/similar-to/{$tp1->id}")
+            ->assertStatus(200);
+
+        ob_start();
+        $response->sendContent();
+        $content = ob_get_clean();
+        $response = new TestResponse(
+            new Response($content,
+                $response->baseResponse->getStatusCode(),
+                $response->baseResponse->headers->all()
+            )
+        );
+
+        $response->assertExactJson([$tp3->id, $tp2->id]);
     }
 
     public function testIndexSimilarityMissing()
@@ -232,9 +243,20 @@ public function testIndexSimilarityMissing()
         $tp3 = TrainingProposalTest::create(['job_id' => $id]);
 
         $this->beEditor();
-        $this->getJson("/api/v1/maia-jobs/{$id}/training-proposals/similar-to/{$tp1->id}")
-            ->assertStatus(200)
-            ->assertExactJson([$tp2->id, $tp3->id]);
+        $response = $this->getJson("/api/v1/maia-jobs/{$id}/training-proposals/similar-to/{$tp1->id}")
+            ->assertStatus(200);
+
+        ob_start();
+        $response->sendContent();
+        $content = ob_get_clean();
+        $response = new TestResponse(
+            new Response($content,
+                $response->baseResponse->getStatusCode(),
+                $response->baseResponse->headers->all()
+            )
+        );
+
+        $response->assertExactJson([$tp2->id, $tp3->id]);
     }
 
     public function testIndexSimilarityEmpty()