Skip to content

Commit afbd23c

Browse files
authored
Merge pull request #77 from jack-nzl/main
feat: Added method to process DOM content so that pages with a lot of…
2 parents ef189db + b12a51c commit afbd23c

File tree

1 file changed

+23
-1
lines changed

1 file changed

+23
-1
lines changed

src/Service/AlgoliaPageCrawler.php

+23-1
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ public function getMainContent(): string
113113
}
114114

115115
if (isset($nodes[0])) {
116-
$output = preg_replace('/\s+/', ' ', $nodes[0]->nodeValue);
116+
$output = $this->processMainContent($nodes[0]->nodeValue);
117117
}
118118
}
119119
} catch (Throwable $e) {
@@ -132,4 +132,26 @@ public function getMainContent(): string
132132

133133
return $output;
134134
}
135+
136+
/**
 * Normalise and size-limit the text content of a DOM node.
 *
 * Collapses every run of whitespace into a single space and trims the result.
 * The text is then truncated to the `content_cutoff_bytes` config value minus
 * 20000 bytes (headroom reserved for other fields), cutting on a character
 * boundary so a multibyte sequence is never split.
 *
 * When that budget is zero or negative (config value missing or smaller than
 * the reserved headroom) no truncation is applied: a non-positive length
 * passed to mb_strcut() would strip bytes from the END of the string or empty
 * it, rather than enforce a limit.
 *
 * @param string|null $content DOM node content
 * @return string Cleaned, possibly truncated content
 */
private function processMainContent($content): string
{
    // DOMNode::$nodeValue may be null; normalise to a string up front
    $raw = (string) $content;

    // Clean up the DOM content. preg_replace() returns null on PCRE failure,
    // in which case the uncollapsed text is kept rather than losing it.
    $content = trim(preg_replace('/\s+/', ' ', $raw) ?? $raw);

    // set cutoff to allow room for other fields
    $cutoff = (int) $this->config()->get('content_cutoff_bytes') - 20000;

    // No usable byte budget: leave the content untouched
    if ($cutoff <= 0) {
        return $content;
    }

    // If content is still too large, truncate it (byte length, UTF-8 safe)
    if (strlen($content) > $cutoff) {
        $content = mb_strcut($content, 0, $cutoff, 'UTF-8');
    }

    return $content;
}
135157
}

0 commit comments

Comments
 (0)