Merge pull request #77 from jack-nzl/main

wilr · web-flow · commit afbd23c2c0f3 · 2025-02-13T14:54:09.000+13:00
feat: Added method to process DOM content so that pages with a lot of…
diff --git a/src/Service/AlgoliaPageCrawler.php b/src/Service/AlgoliaPageCrawler.php
@@ -113,7 +113,7 @@ public function getMainContent(): string
                 }
 
                 if (isset($nodes[0])) {
-                    $output = preg_replace('/\s+/', ' ', $nodes[0]->nodeValue);
+                    $output = $this->processMainContent($nodes[0]->nodeValue);
                 }
             }
         } catch (Throwable $e) {
@@ -132,4 +132,26 @@ public function getMainContent(): string
 
         return $output;
     }
+
+    /**
+     * Normalise and size-limit the text extracted from a page's DOM.
+     *
+     * Whitespace runs are collapsed to single spaces and the result is trimmed.
+     * The text is then cut to the configured `content_cutoff_bytes` minus a fixed
+     * reserve that leaves room for other fields. The cut is made on a character
+     * boundary, so a multi-byte UTF-8 sequence is never split.
+     *
+     * When `content_cutoff_bytes` is unset, non-numeric, or not larger than the
+     * reserve, no positive byte budget exists and the cleaned content is returned
+     * untruncated instead of being emptied.
+     *
+     * @param string|null $content DOM node content (a node's value may be null)
+     * @return string Cleaned and, when a budget is configured, truncated content
+     */
+    private function processMainContent($content): string
+    {
+        // A DOM node's value may be null; normalise up front so the string
+        // functions below never receive null.
+        $content = (string) $content;
+
+        // Collapse whitespace. preg_replace() returns null on a PCRE error, in
+        // which case the unmodified text is kept rather than discarded.
+        $content = trim(preg_replace('/\s+/', ' ', $content) ?? $content);
+
+        // Bytes held back from the configured limit to allow room for other fields.
+        $reservedBytes = 20000;
+        $cutoff = (int) $this->config()->get('content_cutoff_bytes') - $reservedBytes;
+
+        // A zero or negative length would make mb_strcut() drop most or all of
+        // the content, so treat "no usable budget" as "do not truncate".
+        if ($cutoff <= 0) {
+            return $content;
+        }
+
+        // strlen() counts bytes, matching the byte-based budget; mb_strcut() cuts
+        // by bytes too but backs off to the nearest character boundary.
+        if (strlen($content) > $cutoff) {
+            $content = mb_strcut($content, 0, $cutoff, 'UTF-8');
+        }
+
+        return $content;
+    }
 }