Skip to content

Commit f470009

Browse files
feat: enhance crawler endpoint and email notification localization
- Added optional parameters `email` and `language` to the crawler endpoint for improved user notifications.
- Updated documentation in `crawler_endpoint.md` to reflect the new parameters and improve clarity.
- Improved email notification content in both the English and Farsi localization files to include the application name "Hatef" for better branding.
- Enhanced email subject localization to dynamically include the number of pages indexed, improving user engagement.

These changes significantly improve the user experience by providing localized notifications and clearer documentation for the crawler API.
1 parent 1ed68c9 commit f470009

File tree

9 files changed

+163
-45
lines changed

9 files changed

+163
-45
lines changed

docs/api/crawler_endpoint.md

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ Add a new site to the crawl queue with optimized SPA rendering.
3535
"url": "https://www.digikala.com",
3636
"maxPages": 100,
3737
"maxDepth": 3,
38+
"email": "[email protected]",
39+
"language": "en",
3840
"spaRenderingEnabled": true,
3941
"includeFullContent": false,
4042
"browserlessUrl": "http://browserless:3000",
@@ -45,16 +47,18 @@ Add a new site to the crawl queue with optimized SPA rendering.
4547

4648
#### Parameters
4749

48-
| Parameter | Type | Default | Description |
49-
| --------------------- | ------- | ------------------------- | -------------------------------------- |
50-
| `url` | string | **required** | Seed URL to start crawling |
51-
| `maxPages` | integer | 1000 | Maximum pages to crawl |
52-
| `maxDepth` | integer | 5 | Maximum crawl depth |
53-
| `spaRenderingEnabled` | boolean | true | Enable SPA rendering |
54-
| `includeFullContent` | boolean | false | Store full HTML content |
55-
| `browserlessUrl` | string | "http://browserless:3000" | Browserless service URL |
56-
| `timeout` | integer | 15000 | Request timeout in milliseconds |
57-
| `politenessDelay` | integer | 500 | Delay between requests in milliseconds |
50+
| Parameter | Type | Default | Description |
51+
| --------------------- | ------- | ------------------------- | --------------------------------------------------- |
52+
| `url` | string | **required** | Seed URL to start crawling |
53+
| `maxPages` | integer | 1000 | Maximum pages to crawl |
54+
| `maxDepth` | integer | 3 | Maximum crawl depth |
55+
| `email` | string | (optional) | Email address for completion notification |
56+
| `language` | string | "en" | Language for email notifications (en, fa, etc.) |
57+
| `spaRenderingEnabled` | boolean | true | Enable SPA rendering |
58+
| `includeFullContent` | boolean | false | Store full HTML content |
59+
| `browserlessUrl` | string | "http://browserless:3000" | Browserless service URL |
60+
| `timeout` | integer | 15000 | Request timeout in milliseconds |
61+
| `politenessDelay` | integer | 500 | Delay between requests in milliseconds |
5862

5963
#### Response
6064

locales/en/crawling-notification.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,12 @@
2222
"completed_at": "Completed At",
2323
"session_id": "Session ID"
2424
},
25-
"description": "Your pages are now searchable in our search engine. If you'd like to crawl and index more pages from your site, please visit our crawl request page.",
25+
"description": "Your pages are now searchable in Hatef search engine. If you'd like to crawl and index more pages from your site, please visit our crawl request page.",
2626
"cta": {
2727
"button_text": "Request More Crawling"
2828
},
2929
"footer": {
30-
"thank_you": "Thank you for using our search engine service!",
30+
"thank_you": "Thank you for using Hatef search engine service!",
3131
"automated_message": "This is an automated notification from Hatef Search Engine",
3232
"unsubscribe_text": "Unsubscribe from these notifications",
3333
"copyright": "© 2024 Hatef.ir - All rights reserved"

locales/fa/crawling-notification.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,12 @@
2222
"completed_at": "تکمیل شده در",
2323
"session_id": "شناسه جلسه"
2424
},
25-
"description": "صفحات شما اکنون در موتور جستجوی ما قابل جستجو هستند. اگر می‌خواهید صفحات بیشتری از سایت خود را خزش و نمایه‌سازی کنید، یا دامنه‌های اضافی برای افزودن دارید، لطفاً از صفحه درخواست خزش ما استفاده کنید.",
25+
"description": "صفحات شما اکنون در موتور جستجو هاتف قابل جستجو هستند. اگر می‌خواهید صفحات بیشتری از سایت خود را خزش و نمایه‌سازی کنید، یا دامنه‌های اضافی برای افزودن دارید، لطفاً از صفحه درخواست خزش ما استفاده کنید.",
2626
"cta": {
2727
"button_text": "درخواست خزش بیشتر"
2828
},
2929
"footer": {
30-
"thank_you": "از استفاده از خدمات موتور جستجوی ما متشکریم!",
30+
"thank_you": "از استفاده از خدمات موتور جستجو هاتف متشکریم!",
3131
"automated_message": "این پیام خودکار از موتور جستجوی هاتف ارسال شده است",
3232
"unsubscribe_text": "لغو اشتراک از این اعلان‌ها",
3333
"copyright": "© ۲۰۲۴ هاتف - تمام حقوق محفوظ است"

public/js/crawl-request-template.js

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,9 @@ function initializeTemplateData(data) {
3333
progressMessages = data.progressMessages || [];
3434

3535
// Debug logging
36-
console.log('Template data initialized:', templateData);
37-
console.log('Base URL from template:', templateData.baseUrl || templateData.base_url || 'Not set');
36+
console.log('🎯 Template data initialized:', templateData);
37+
console.log('🌐 Base URL from template:', templateData.baseUrl || templateData.base_url || 'Not set');
38+
console.log('🌍 Language from template:', templateData.language || 'Not set (will use API default)');
3839
}
3940

4041
let currentSessionId = null;
@@ -196,10 +197,21 @@ async function startCrawl() {
196197
maxDepth: maxDepth
197198
};
198199

200+
// Add language from template data (for localized email notifications)
201+
if (templateData.language) {
202+
payload.language = templateData.language;
203+
console.log('✅ Language set from template data:', templateData.language);
204+
} else {
205+
console.warn('⚠️ Template data language not found, email will use default language');
206+
console.log('Template data:', templateData);
207+
}
208+
199209
if (email) {
200210
payload.email = email;
201211
}
202212

213+
console.log('📤 Sending payload:', payload);
214+
203215
try {
204216
// Show progress section
205217
document.getElementById('form-section').style.display = 'none';

src/controllers/SearchController.cpp

Lines changed: 54 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ void SearchController::addSiteToCrawl(uWS::HttpResponse<false>* res, uWS::HttpRe
217217

218218
// Optional parameters
219219
std::string email = jsonBody.value("email", ""); // Email for completion notification
220+
std::string language = jsonBody.value("language", "en"); // Language for email notification (default: English)
220221
int maxPages = jsonBody.value("maxPages", 1000);
221222
int maxDepth = jsonBody.value("maxDepth", 3);
222223
bool restrictToSeedDomain = jsonBody.value("restrictToSeedDomain", true);
@@ -307,11 +308,11 @@ void SearchController::addSiteToCrawl(uWS::HttpResponse<false>* res, uWS::HttpRe
307308
// Create completion callback for email notification if email is provided
308309
CrawlCompletionCallback emailCallback = nullptr;
309310
if (!email.empty()) {
310-
LOG_INFO("Setting up email notification callback for: " + email);
311-
emailCallback = [this, email, url](const std::string& sessionId,
311+
LOG_INFO("Setting up email notification callback for: " + email + " (language: " + language + ")");
312+
emailCallback = [this, email, url, language](const std::string& sessionId,
312313
const std::vector<CrawlResult>& results,
313314
CrawlerManager* manager) {
314-
this->sendCrawlCompletionEmail(sessionId, email, url, results);
315+
this->sendCrawlCompletionEmail(sessionId, email, url, results, language);
315316
};
316317
}
317318

@@ -1696,9 +1697,10 @@ namespace {
16961697
}
16971698

16981699
void SearchController::sendCrawlCompletionEmail(const std::string& sessionId, const std::string& email,
1699-
const std::string& url, const std::vector<CrawlResult>& results) {
1700+
const std::string& url, const std::vector<CrawlResult>& results,
1701+
const std::string& language) {
17001702
try {
1701-
LOG_INFO("Sending crawl completion email for session: " + sessionId + " to: " + email);
1703+
LOG_INFO("Sending crawl completion email for session: " + sessionId + " to: " + email + " (language: " + language + ")");
17021704

17031705
// Get email service using lazy initialization
17041706
auto emailService = getEmailService();
@@ -1733,8 +1735,9 @@ void SearchController::sendCrawlCompletionEmail(const std::string& sessionId, co
17331735
}
17341736
}
17351737

1736-
// Load localized sender name
1737-
std::string senderName = loadLocalizedSenderName("fa"); // Default to Persian for now
1738+
// Load localized sender name and subject using the provided language
1739+
std::string senderName = loadLocalizedSenderName(language);
1740+
std::string localizedSubject = loadLocalizedSubject(language, crawledPagesCount);
17381741

17391742
// Prepare notification data
17401743
search_engine::storage::EmailService::NotificationData data;
@@ -1744,7 +1747,8 @@ void SearchController::sendCrawlCompletionEmail(const std::string& sessionId, co
17441747
data.crawledPagesCount = crawledPagesCount;
17451748
data.crawlSessionId = sessionId;
17461749
data.crawlCompletedAt = std::chrono::system_clock::now();
1747-
data.language = "fa"; // Default to Persian for now
1750+
data.language = language;
1751+
data.subject = localizedSubject; // Set localized subject
17481752

17491753
// Send email asynchronously with localized sender name
17501754
bool success = emailService->sendCrawlingNotificationAsync(data, senderName, "");
@@ -1877,4 +1881,46 @@ std::string SearchController::loadLocalizedSenderName(const std::string& languag
18771881
LOG_ERROR("SearchController: Exception loading localized sender name for language " + language + ": " + e.what());
18781882
return "Hatef Search Engine"; // Default fallback
18791883
}
1884+
}
1885+
1886+
std::string SearchController::loadLocalizedSubject(const std::string& language, int pageCount) const {
1887+
try {
1888+
// Load localization file
1889+
std::string localesPath = "locales/" + language + "/crawling-notification.json";
1890+
std::string localeContent = loadFile(localesPath);
1891+
1892+
if (localeContent.empty() && language != "en") {
1893+
LOG_WARNING("SearchController: Failed to load locale file: " + localesPath + ", falling back to English");
1894+
localesPath = "locales/en/crawling-notification.json";
1895+
localeContent = loadFile(localesPath);
1896+
}
1897+
1898+
if (localeContent.empty()) {
1899+
LOG_WARNING("SearchController: Failed to load any localization file, using default subject");
1900+
return "Crawling Complete - " + std::to_string(pageCount) + " pages indexed"; // Default fallback
1901+
}
1902+
1903+
// Parse JSON and extract subject
1904+
nlohmann::json localeData = nlohmann::json::parse(localeContent);
1905+
1906+
if (localeData.contains("email") && localeData["email"].contains("subject")) {
1907+
std::string subject = localeData["email"]["subject"];
1908+
1909+
// Replace {pages} placeholder with actual count
1910+
size_t pos = subject.find("{pages}");
1911+
if (pos != std::string::npos) {
1912+
subject.replace(pos, 7, std::to_string(pageCount));
1913+
}
1914+
1915+
LOG_DEBUG("SearchController: Loaded localized subject: " + subject + " for language: " + language);
1916+
return subject;
1917+
} else {
1918+
LOG_WARNING("SearchController: subject not found in locale file, using default");
1919+
return "Crawling Complete - " + std::to_string(pageCount) + " pages indexed"; // Default fallback
1920+
}
1921+
1922+
} catch (const std::exception& e) {
1923+
LOG_ERROR("SearchController: Exception loading localized subject for language " + language + ": " + e.what());
1924+
return "Crawling Complete - " + std::to_string(pageCount) + " pages indexed"; // Default fallback
1925+
}
18801926
}

src/controllers/SearchController.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,8 @@ class SearchController : public routing::Controller {
4444

4545
// Email notification for crawl completion
4646
void sendCrawlCompletionEmail(const std::string& sessionId, const std::string& email,
47-
const std::string& url, const std::vector<CrawlResult>& results);
47+
const std::string& url, const std::vector<CrawlResult>& results,
48+
const std::string& language);
4849

4950
// Email service access (lazy initialization)
5051
search_engine::storage::EmailService* getEmailService() const;
@@ -54,6 +55,9 @@ class SearchController : public routing::Controller {
5455

5556
// Localized sender name loading
5657
std::string loadLocalizedSenderName(const std::string& language) const;
58+
59+
// Localized email subject loading
60+
std::string loadLocalizedSubject(const std::string& language, int pageCount) const;
5761

5862
private:
5963
mutable std::unique_ptr<search_engine::storage::EmailService> emailService_;

src/storage/EmailService.cpp

Lines changed: 68 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,7 @@ std::string EmailService::formatEmailHeaders(const std::string& to, const std::s
320320

321321
headers << "To: " << to << "\r\n";
322322
headers << "From: " << config_.fromName << " <" << config_.fromEmail << ">\r\n";
323+
headers << "Reply-To: [email protected]\r\n";
323324
headers << "Subject: " << subject << "\r\n";
324325
headers << "MIME-Version: 1.0\r\n";
325326

@@ -627,7 +628,7 @@ bool EmailService::performSMTPRequest(const std::string& to, const std::string&
627628
}
628629

629630
std::string EmailService::generateDefaultNotificationHTML(const NotificationData& data) {
630-
LOG_INFO("EmailService: Using Inja template-based email generation");
631+
LOG_INFO("EmailService: Using Inja template-based email generation for language: " + data.language);
631632

632633
// Render the email template
633634
std::string templateHTML = renderEmailTemplate("email-crawling-notification.inja", data);
@@ -637,6 +638,9 @@ std::string EmailService::generateDefaultNotificationHTML(const NotificationData
637638
throw std::runtime_error("Failed to render email template");
638639
}
639640

641+
LOG_DEBUG("EmailService: Generated HTML content length: " + std::to_string(templateHTML.length()) + " bytes for language: " + data.language);
642+
LOG_DEBUG("EmailService: HTML preview (first 200 chars): " + templateHTML.substr(0, std::min(size_t(200), templateHTML.length())));
643+
640644
return templateHTML;
641645
}
642646

@@ -1103,31 +1107,72 @@ std::string EmailService::convertToPersianDate(const std::tm& gregorianDate) {
11031107
int gMonth = gregorianDate.tm_mon + 1;
11041108
int gDay = gregorianDate.tm_mday;
11051109

1106-
// Calculate days since March 21, 2024 (reference point: 1 Farvardin 1403)
1107-
int daysSinceMarch21 = 0;
1110+
// Determine Persian year based on Gregorian date
1111+
// Persian new year (Nowruz) is around March 20/21
1112+
int persianYear;
1113+
if (gMonth < 3 || (gMonth == 3 && gDay < 20)) {
1114+
// Before March 20: still in previous Persian year
1115+
persianYear = gYear - 621;
1116+
} else {
1117+
// March 20 onwards: new Persian year has started
1118+
persianYear = gYear - 621;
1119+
}
11081120

1109-
// Days in each month (from March to current month)
1110-
int monthDays[] = {31, 30, 31, 30, 31, 31, 30, 31, 30, 31, 31, 28}; // March to February
1121+
// Calculate day of year in Persian calendar
1122+
int persianDayOfYear;
11111123

1112-
if (gMonth >= 3) {
1113-
// Current year - calculate days from March 21
1114-
for (int i = 3; i < gMonth; i++) {
1115-
daysSinceMarch21 += monthDays[i - 3];
1124+
if (gMonth >= 3 && (gMonth > 3 || gDay >= 20)) {
1125+
// From March 20 onwards in current Gregorian year
1126+
int daysInGregorianMonths[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
1127+
1128+
// Check for leap year
1129+
if ((gYear % 4 == 0 && gYear % 100 != 0) || (gYear % 400 == 0)) {
1130+
daysInGregorianMonths[1] = 29;
1131+
}
1132+
1133+
persianDayOfYear = 0;
1134+
// Add days from March 20 to end of March
1135+
if (gMonth == 3) {
1136+
persianDayOfYear = gDay - 20 + 1;
1137+
} else {
1138+
persianDayOfYear = daysInGregorianMonths[2] - 20 + 1; // Days left in March (12 days)
1139+
// Add full months between April and current month
1140+
for (int m = 4; m < gMonth; m++) {
1141+
persianDayOfYear += daysInGregorianMonths[m - 1];
1142+
}
1143+
// Add days in current month
1144+
persianDayOfYear += gDay;
11161145
}
1117-
daysSinceMarch21 += gDay - 21; // March 21 is day 0
11181146
} else {
1119-
// Previous year - calculate from March 21 of previous year
1120-
daysSinceMarch21 += 31 - 21 + 30 + 31 + 30 + 31 + 31 + 30 + 31 + 30 + 31 + 31 + 28; // March 21 to Dec 31
1121-
for (int i = 1; i < gMonth; i++) {
1122-
daysSinceMarch21 += monthDays[i - 1 + 9]; // Offset for month array
1147+
// Before March 20: in previous Persian year
1148+
persianYear--;
1149+
1150+
int daysInGregorianMonths[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
1151+
1152+
// Check for leap year of previous Gregorian year
1153+
int prevGYear = gYear - 1;
1154+
if ((prevGYear % 4 == 0 && prevGYear % 100 != 0) || (prevGYear % 400 == 0)) {
1155+
daysInGregorianMonths[1] = 29;
1156+
}
1157+
1158+
// Days from March 20 to Dec 31 of previous year
1159+
persianDayOfYear = daysInGregorianMonths[2] - 20 + 1; // Rest of March (12 days)
1160+
for (int m = 4; m <= 12; m++) {
1161+
persianDayOfYear += daysInGregorianMonths[m - 1];
1162+
}
1163+
1164+
// Add days from Jan 1 to current date
1165+
for (int m = 1; m < gMonth; m++) {
1166+
// Use current year's month days for Jan-Feb
1167+
int currentYearMonthDays[] = {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31};
1168+
if ((gYear % 4 == 0 && gYear % 100 != 0) || (gYear % 400 == 0)) {
1169+
currentYearMonthDays[1] = 29;
1170+
}
1171+
persianDayOfYear += currentYearMonthDays[m - 1];
11231172
}
1124-
daysSinceMarch21 += gDay - 1;
1173+
persianDayOfYear += gDay;
11251174
}
11261175

1127-
// Convert to Persian date
1128-
int persianYear = 1403; // Base year for 2024
1129-
int persianDayOfYear = daysSinceMarch21 + 1;
1130-
11311176
// Persian months: 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29/30
11321177
int persianMonthDays[] = {31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 29};
11331178

@@ -1159,6 +1204,10 @@ std::string EmailService::convertToPersianDate(const std::tm& gregorianDate) {
11591204
" (" + persianMonths[persianMonth - 1] + ") " +
11601205
"ساعت " + std::string(timeBuffer) + " (تهران)";
11611206

1207+
LOG_DEBUG("EmailService: Converted Gregorian " + std::to_string(gYear) + "/" +
1208+
std::to_string(gMonth) + "/" + std::to_string(gDay) +
1209+
" to Persian: " + persianDate);
1210+
11621211
return persianDate;
11631212

11641213
} catch (const std::exception& e) {

0 commit comments

Comments
 (0)