Skip to content

Commit a2a765f

Browse files
committed
New user setting to also apply the blacklist filter to the listing description
1 parent c17a815 commit a2a765f

11 files changed

Lines changed: 362 additions & 9 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,3 +7,4 @@ npm-debug.log
77
.idea
88
.vscode
99
tools/release/config.json
10+
.agents

lib/FredyPipelineExecutioner.js

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,11 +38,15 @@ import { formatListing } from './utils/formatListing.js';
3838
* 3) Normalize listings to the provider schema
3939
* 4) Filter out incomplete/blacklisted listings
4040
* 5) Identify new listings (vs. previously stored hashes)
41-
* 6) Persist new listings
42-
* 7) Filter out entries similar to already seen ones
43-
* 8) Filter out entries that do not match the job's specFilter
44-
* 9) Filter out entries that do not match the job's spatialFilter
45-
* 10) Dispatch notifications
41+
* 6) Optionally enrich new listings via provider.fetchDetails
42+
* 7) Optionally re-apply the provider blacklist using the (now enriched)
43+
* description — only when the user opted in via
44+
* `blacklist_filter_on_provider_details`
45+
* 8) Persist new listings
46+
* 9) Filter out entries similar to already seen ones
47+
* 10) Filter out entries that do not match the job's specFilter
48+
* 11) Filter out entries that do not match the job's spatialFilter
49+
* 12) Dispatch notifications
4650
*/
4751
class FredyPipelineExecutioner {
4852
/**
@@ -86,6 +90,7 @@ class FredyPipelineExecutioner {
8690
.then(this._filter.bind(this))
8791
.then(this._findNew.bind(this))
8892
.then(this._fetchDetails.bind(this))
93+
.then(this._filterAfterDetails.bind(this))
8994
.then(this._geocode.bind(this))
9095
.then(this._save.bind(this))
9196
.then(this._calculateDistance.bind(this))
@@ -266,6 +271,48 @@ class FredyPipelineExecutioner {
266271
);
267272
}
268273

274+
/**
275+
* Re-apply the provider's blacklist filter after `_fetchDetails` has had a
276+
* chance to enrich the listings (e.g., load the full description from the
277+
* detail page). The initial `_filter` step only sees the truncated snippet
278+
* exposed on the search results page, so a blacklisted term that lives
279+
* deeper in the listing's full description would otherwise slip through.
280+
*
281+
* Opt-in: gated by the user setting `blacklist_filter_on_provider_details`.
282+
* The full detail description tends to contain a lot of boilerplate (legal,
283+
* exposé contact info, generic marketing copy) which can accidentally match
284+
* a blacklist term and remove otherwise relevant listings. Users who want
285+
* the stricter behavior must enable the setting explicitly.
286+
*
287+
* Throws {@link NoNewListingsWarning} when all listings are filtered out
288+
* so the rest of the pipeline (save + notify) is short-circuited.
289+
*
290+
* @param {ParsedListing[]} listings Enriched listings to re-filter.
291+
* @returns {ParsedListing[]} Listings that still pass the provider's filter.
292+
* @throws {NoNewListingsWarning} When every listing is filtered out.
293+
*/
294+
_filterAfterDetails(listings) {
295+
if (typeof this._providerConfig.filter !== 'function') {
296+
return listings;
297+
}
298+
const userId = getJob(this._jobKey)?.userId;
299+
const enabled = getUserSettings(userId)?.blacklist_filter_on_provider_details === true;
300+
if (!enabled) {
301+
return listings;
302+
}
303+
const kept = listings.filter(this._providerConfig.filter);
304+
const removed = listings.length - kept.length;
305+
if (removed > 0) {
306+
logger.debug(
307+
`Re-filter after detail enrichment removed ${removed} listing(s) by blacklist (Provider: '${this._providerId}')`,
308+
);
309+
}
310+
if (kept.length === 0) {
311+
throw new NoNewListingsWarning();
312+
}
313+
return kept;
314+
}
315+
269316
/**
270317
* Determine which listings are new by comparing their IDs against stored hashes.
271318
*

lib/api/routes/userSettingsRoute.js

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,28 @@ export default async function userSettingsPlugin(fastify) {
103103
}
104104
});
105105

/**
 * POST /blacklist-filter-on-details
 *
 * Stores the current user's `blacklist_filter_on_provider_details` flag.
 *
 * Responses:
 *  - 403 when global demo mode is active and the requester is not an admin
 *  - 400 when the flag is missing or not a boolean
 *  - 500 with the error message when persisting the setting throws
 *  - `{ success: true }` otherwise
 */
fastify.post('/blacklist-filter-on-details', async (request, reply) => {
  const userId = request.session.currentUser;
  // `request.body` is undefined (or null) when the POST carries no usable
  // payload. Fall back to an empty object so the type check below answers
  // with a 400 instead of the destructuring throwing and producing a 500.
  const { blacklist_filter_on_provider_details } = request.body ?? {};

  const globalSettings = await getSettings();
  if (globalSettings.demoMode && !isAdmin(request)) {
    return reply.code(403).send({ error: 'In demo mode, it is not allowed to change settings.' });
  }

  // Only real booleans are accepted; strings such as "true" are rejected.
  if (typeof blacklist_filter_on_provider_details !== 'boolean') {
    return reply.code(400).send({ error: 'blacklist_filter_on_provider_details must be a boolean.' });
  }

  try {
    upsertSettings({ blacklist_filter_on_provider_details }, userId);
    return { success: true };
  } catch (error) {
    logger.error('Error updating blacklist-filter-on-details setting', error);
    return reply.code(500).send({ error: error.message });
  }
});
127+
106128
fastify.post('/listings-view-mode', async (request, reply) => {
107129
const userId = request.session.currentUser;
108130
const { listings_view_mode } = request.body;

lib/provider/immoscout.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -198,7 +198,9 @@ function normalize(o) {
198198
* @returns {boolean}
199199
*/
200200
function applyBlacklist(o) {
201-
return !isOneOf(o.title, appliedBlackList);
201+
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
202+
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
203+
return titleNotBlacklisted && descNotBlacklisted;
202204
}
203205
/** @type {ProviderConfig} */
204206
const config = {

lib/provider/neubauKompass.js

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ function normalize(o) {
4242
* @returns {boolean}
4343
*/
4444
function applyBlacklist(o) {
45-
return !isOneOf(o.title, appliedBlackList);
45+
const titleNotBlacklisted = !isOneOf(o.title, appliedBlackList);
46+
const descNotBlacklisted = !isOneOf(o.description, appliedBlackList);
47+
return titleNotBlacklisted && descNotBlacklisted;
4648
}
4749

4850
/** @type {ProviderConfig} */

test/mocks/mockStore.js

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,12 @@ export const getGeocoordinatesByAddress = (any) => {
1717
return null;
1818
};
1919

20+
let userSettings = null;
21+
export function setUserSettings(settings) {
22+
userSettings = settings;
23+
}
2024
export function getUserSettings(userId) {
21-
return null;
25+
return userSettings;
2226
}
2327

2428
export async function getSettings() {

test/pipeline_filtering.test.js

Lines changed: 222 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
* Licensed under Apache-2.0 with Commons Clause and Attribution/Naming Clause
44
*/
55

6-
import { expect } from 'vitest';
6+
import { afterEach, expect } from 'vitest';
77
import { mockFredy } from './utils.js';
88
import * as mockStore from './mocks/mockStore.js';
9+
import { get as getLastNotification } from './mocks/mockNotification.js';
910

1011
describe('Issue reproduction: listings filtered by similarity or area should be marked as manually deleted', () => {
1112
it('should call deleteListingsById when listings are filtered by similarity', async () => {
@@ -113,3 +114,223 @@ describe('Issue reproduction: listings filtered by similarity or area should be
113114
expect(mockStore.deletedIds).toContain('2');
114115
});
115116
});
117+
describe('Blacklist is re-applied after detail enrichment', () => {
  const blacklist = ['allkauf'];
  // Harmless teaser text, used as title and as search-result snippet.
  const teaser = 'Eleganz trifft Raumkomfort';
  // Full description as delivered by the simulated detail page; it contains
  // the blacklisted term.
  const enrichedDescription = 'Mit allkauf Haus wird dein Traum vom Eigenheim wahr.';
  const listingFields = ['id', 'title', 'address', 'price', 'link', 'description'];

  /** Builds a raw listing as the search results page would expose it. */
  const makeListing = (id, title, address, price, description) => ({
    id,
    title,
    address,
    price,
    link: `http://example.com/${id}`,
    description,
  });

  /** Provider filter: passes when neither title nor description contains a blacklisted term. */
  const passesBlacklist = (listing) => {
    const haystack = `${listing.title ?? ''} ${listing.description ?? ''}`.toLowerCase();
    return blacklist.every((term) => !haystack.includes(term));
  };

  /** Simulates loading the detail page, which reveals the blacklisted term. */
  const enrich = (listing) => Promise.resolve({ ...listing, description: enrichedDescription });

  const buildProviderConfig = (listings, fetchDetails) => ({
    url: 'http://example.com',
    // Hand out fresh copies on every call so no state is shared between calls.
    getListings: () => Promise.resolve(listings.map((listing) => ({ ...listing }))),
    normalize: (listing) => listing,
    filter: passesBlacklist,
    fetchDetails,
    crawlFields: Object.fromEntries(listingFields.map((field) => [field, field])),
    requiredFieldNames: [...listingFields],
  });

  const buildJob = (id) => ({
    id,
    notificationAdapter: null,
    specFilter: null,
    spatialFilter: null,
  });

  /**
   * Runs the pipeline once for the given provider/job with detail fetching
   * enabled for that provider and the blacklist re-filter switched on or off.
   */
  const runPipeline = async ({ providerId, jobId, refilterEnabled, listings, fetchDetails }) => {
    const Fredy = await mockFredy();

    mockStore.setUserSettings({
      provider_details: [providerId],
      blacklist_filter_on_provider_details: refilterEnabled,
    });

    const similarityCache = {
      checkAndAddEntry: () => false,
    };

    const fredy = new Fredy(
      buildProviderConfig(listings, fetchDetails),
      buildJob(jobId),
      providerId,
      similarityCache,
      undefined,
    );
    return fredy.execute();
  };

  afterEach(() => {
    mockStore.setUserSettings(null);
  });

  it('filters out a listing whose blacklisted term only appears in the enriched description', async () => {
    // The search results page returns clean snippets (no blacklisted term).
    // Only the detail page of the 'blacklisted' listing reveals the term.
    const result = await runPipeline({
      providerId: 'test-provider',
      jobId: 'blacklist-test-job',
      refilterEnabled: true,
      listings: [
        makeListing('kept', 'Nice house', 'Some street', '500000', 'Cozy home with garden'),
        makeListing('blacklisted', teaser, 'Other street', '600000', teaser),
      ],
      fetchDetails: (listing) => (listing.id === 'blacklisted' ? enrich(listing) : Promise.resolve(listing)),
    });

    expect(result).toBeInstanceOf(Array);
    const ids = result.map((l) => l.id);
    expect(ids).toContain('kept');
    expect(ids).not.toContain('blacklisted');

    const notification = getLastNotification();
    const notifiedIds = (notification?.payload ?? []).map((p) => p.id);
    expect(notifiedIds).not.toContain('blacklisted');
  });

  it('short-circuits the pipeline when all listings get blacklisted after enrichment', async () => {
    const result = await runPipeline({
      providerId: 'all-blacklisted-provider',
      jobId: 'all-blacklisted-job',
      refilterEnabled: true,
      listings: [makeListing('only', teaser, 'Some street', '700000', teaser)],
      fetchDetails: enrich,
    });

    // Should resolve to undefined (NoNewListingsWarning is caught in _handleError).
    expect(result).toBeUndefined();
  });

  it('does NOT re-filter when blacklist_filter_on_provider_details is disabled', async () => {
    // provider_details enabled (so fetchDetails runs) but blacklist re-filter NOT enabled.
    const result = await runPipeline({
      providerId: 'opt-out-provider',
      jobId: 'opt-out-job',
      refilterEnabled: false,
      listings: [makeListing('leaks-through', teaser, 'Other street', '600000', teaser)],
      fetchDetails: enrich,
    });

    // Listing leaks through because user has not opted in to the stricter check.
    expect(result).toBeInstanceOf(Array);
    expect(result.map((l) => l.id)).toContain('leaks-through');
  });
});

ui/src/locales/de.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,11 @@
334334
"settings.providerDetailsPlaceholder": "Anbieter für Detail-Abruf auswählen...",
335335
"settings.providerDetailsUpdated": "Anbieter-Detail-Einstellung aktualisiert.",
336336
"settings.providerDetailsUpdateError": "Einstellung konnte nicht aktualisiert werden.",
337+
"settings.blacklistFilterOnProviderDetails": "Blacklist-Filter auf Anbieter-Details anwenden",
338+
"settings.blacklistFilterOnProviderDetailsHelp": "Wenn aktiv, wird die Blacklist zusätzlich gegen die vollständige Beschreibung geprüft, die durch den obigen Anbieter-Details-Schritt geladen wurde. Damit lassen sich Spam-Anbieter (z. B. 'allkauf', 'massa') herausfiltern, die nur tief in der Detail-Seite auftauchen und nicht im kurzen Vorschau-Text der Suchergebnisse stehen. Standardmäßig aus, weil die vollständige Beschreibung oft generischen Boilerplate-Text (Kontaktdaten, rechtliche Hinweise) enthält, der ein Blacklist-Wort versehentlich auslösen und passende Inserate entfernen kann. Hat keine Wirkung auf Anbieter, für die Anbieter-Details nicht aktiviert sind.",
339+
"settings.blacklistFilterOnProviderDetailsEnable": "Blacklist auf die vollständige Detail-Beschreibung anwenden",
340+
"settings.blacklistFilterOnProviderDetailsUpdated": "Einstellung Blacklist-auf-Details aktualisiert.",
341+
"settings.blacklistFilterOnProviderDetailsUpdateError": "Einstellung konnte nicht aktualisiert werden.",
337342
"settings.listingDeletion": "Inserate löschen",
338343
"settings.listingDeletionHelp": "Wähle den Standard-Löschmodus. Soft Delete blendet Inserate aus ohne erneutes Scraping; Hard Delete entfernt sie aus der Datenbank.",
339344
"settings.listingDeletionSoftLabel": "Als gelöscht markieren (Soft Delete)",

0 commit comments

Comments
 (0)