diff --git a/server/Dockerfile.jobs b/server/Dockerfile.jobs new file mode 100644 index 000000000..b3a785ed3 --- /dev/null +++ b/server/Dockerfile.jobs @@ -0,0 +1,31 @@ +# Cloud Run Jobs Dockerfile +# Used for batch processing jobs that replace MapReduce/Pipeline + +FROM python:3.11-slim + +# Set environment variables +ENV PYTHONUNBUFFERED=1 +ENV PYTHONDONTWRITEBYTECODE=1 +ENV GOOGLE_CLOUD_PROJECT=dancedeets-hrd + +# Install system dependencies +RUN apt-get update && apt-get install -y --no-install-recommends \ + gcc \ + libffi-dev \ + && rm -rf /var/lib/apt/lists/* + +# Create app directory +WORKDIR /app + +# Copy requirements first for better caching +COPY requirements-jobs.txt . +RUN pip install --no-cache-dir -r requirements-jobs.txt + +# Copy application code +COPY dancedeets/ ./dancedeets/ + +# Set Python path +ENV PYTHONPATH=/app + +# Default command (overridden by Cloud Run Job configuration) +CMD ["python", "-m", "dancedeets.jobs.runner"] diff --git a/server/MIGRATION_PLAN.md b/server/MIGRATION_PLAN.md new file mode 100644 index 000000000..f1059be10 --- /dev/null +++ b/server/MIGRATION_PLAN.md @@ -0,0 +1,430 @@ +# MapReduce/Pipeline Migration Plan + +This document outlines the migration from legacy App Engine MapReduce/Pipeline to modern Google Cloud services. + +## Migration Progress + +| Phase | Status | Jobs Migrated | +|-------|--------|---------------| +| Phase 1: Infrastructure | ✅ COMPLETE | Framework, Dockerfile, requirements | +| Phase 2: Simple Mapper Jobs | ✅ COMPLETE | 6/6 jobs | +| Phase 3: GCS Output Jobs | ✅ COMPLETE | 5/5 jobs | +| Phase 4: MapReduce Pipeline Jobs | ✅ COMPLETE | 3/4 jobs (find_access_tokens pending) | +| Phase 5: Cloud Workflows | ✅ COMPLETE | 1 workflow + 3 jobs | +| Phase 6: Code Cleanup | ✅ COMPLETE | Old mapreduce code removed | + +## Cleanup Completed + +The following original files have been cleaned up to remove mapreduce/pipeline code: + +**Files modified (old mapreduce code removed, core functions retained):** +- `notifications/added_events.py` - Kept `promote_events_to_user()`, removed mapreduce handler +- `sitemaps/events.py` - Kept `generate_sitemap_entry()`, removed mapreduce handler +- `ml/gprediction.py` - Kept `predict()`, `get_predict_service()`, removed MR wrappers +- `users/user_event_tasks.py` - Kept `update_user_qualities()`, removed mapreduce handler +- `users/user_tasks.py` - Kept `fetch_and_save_fb_user()`, removed mapreduce handler +- `search/email_events.py` - Kept `email_for_user()`, removed mapreduce wrapper +- `pubsub/pubsub_tasks.py` - Kept social handlers, removed `PostJapanEventsHandler` MR code +- `rankings/rankings.py` - Kept utility functions, removed all mapreduce code +- `event_scraper/auto_add.py` - Kept classification logic, removed MR wrappers (added optional `metrics` param) +- `event_scraper/thing_db.py` - Kept Source model and helpers, removed MR pipeline code +- `event_scraper/thing_scraper2.py` - Replaced with deprecation stub +- `classes/class_pipeline.py` - Replaced with deprecation stub + +**Files deleted (fully migrated to Cloud Run Jobs):** +- `logic/mr_dump.py` → `jobs/dump_potential_events.py` +- `logic/unique_attendees.py` → `jobs/count_unique_attendees.py` +- `ml/mr_prediction.py` → `jobs/classify_events_ml.py` + +**Compat layer status:** +- `compat/` directory retained with `LEGACY_APIS_ENABLED = False` +- Provides stub implementations for imports that still reference mapreduce utilities +- `json_util.JsonProperty` still used by Source model +- Can be removed in future cleanup after all 
references are updated + +### New Files Created + +**Framework (`server/dancedeets/jobs/`):** +- `__init__.py` - Module exports +- `base.py` - Job, BatchJob, JobRunner classes +- `fb_utils.py` - Facebook API token handling +- `metrics.py` - JobMetrics, GroupedMetrics +- `gcs_output.py` - GCSOutputWriter +- `runner.py` - CLI entry point + +**Phase 2 Jobs:** +- `notify_users.py` - Push notifications by timezone +- `post_japan_events.py` - Post Japan events to social +- `compute_rankings.py` - City/country rankings +- `compute_user_stats.py` - User event statistics +- `refresh_users.py` - Refresh Facebook profiles +- `send_weekly_emails.py` - Weekly digest emails + +**Phase 3 Jobs:** +- `generate_sitemaps.py` - XML sitemap generation +- `dump_potential_events.py` - Export to CSV +- `generate_training_data.py` - ML training data +- `classify_events_ml.py` - ML classification +- `auto_add_events.py` - Auto-add dance events + +**Phase 4 Jobs:** +- `count_unique_attendees.py` - Unique RSVPs by city +- `update_source_stats.py` - Source quality metrics +- `scrape_and_classify.py` - Scrape and classify events + +**Phase 5 (Cloud Workflows):** +- `workflows/crawl_and_index_classes.yaml` - Orchestration workflow +- `start_spiders.py` - Start ScrapingHub spiders +- `reindex_classes.py` - Rebuild class search index +- `email_crawl_errors.py` - Send error reports + +**Docker/Config:** +- `Dockerfile.jobs` - Cloud Run Jobs container +- `requirements-jobs.txt` - Job dependencies + +--- + +## Migration Strategy + +| Legacy Pattern | Modern Replacement | +|----------------|-------------------| +| `start_map()` (mapper only) | **Cloud Run Jobs** | +| `MapreducePipeline` (map+reduce) | **Cloud Run Jobs** (simple) or **Cloud Dataflow** (complex) | +| `Pipeline` orchestration | **Cloud Workflows** | +| Task Queues | **Cloud Tasks** (already compatible) | + +--- + +## Phase 1: Infrastructure Setup + +### Task 1.1: Create Cloud Run Job Base Image +- **File to create**: `server/cloud_run/Dockerfile.jobs` +- **Purpose**: Base Docker image for all batch jobs +- **Contents**: Python runtime, common dependencies, Datastore client, GCS client +- **Priority**: HIGH (blocking for all other migrations) + +### Task 1.2: Create Job Runner Framework +- **File to create**: `server/dancedeets/jobs/base.py` +- **Purpose**: Base class for Cloud Run Jobs replacing mapreduce patterns +- **Features needed**: + - Datastore entity iteration with cursor-based pagination + - Parallel task execution (Cloud Run Jobs supports up to 10,000 parallel tasks) + - Counter/metrics collection + - GCS output writer + - Facebook API token injection (port from `fb_mapreduce.py`) + +### Task 1.3: Create Cloud Workflows Templates +- **File to create**: `server/workflows/` +- **Purpose**: YAML workflow definitions for orchestrated jobs +- **Priority**: MEDIUM (only needed for Pipeline migrations) + +--- + +## Phase 2: Simple Mapper Jobs → Cloud Run Jobs + +These jobs iterate over entities and perform side effects (no reduce step, no GCS output). + +### Task 2.1: `notifications/added_events.py` +- **Current**: `promote_events_to_user` via `start_map()` +- **Entity**: `User` (filtered by timezone_offset) +- **Action**: Sends push notifications for new events +- **Migration**: + 1. Create `server/dancedeets/jobs/notify_users.py` + 2. Query users by timezone offset + 3. For each user: search events, create Android push notification + 4. 
Schedule via Cloud Scheduler (hourly, matching current cron) +- **Complexity**: LOW +- **Facebook API**: No + +### Task 2.2: `pubsub/pubsub_tasks.py` +- **Current**: `map_post_jp_event` via `start_map()` +- **Entity**: `DBEvent` (filtered by TIME_FUTURE) +- **Action**: Posts Japan events to social media +- **Migration**: + 1. Create `server/dancedeets/jobs/post_japan_events.py` + 2. Query future events ending with 'Japan' + 3. Post to Twitter/social via pubsub module +- **Complexity**: LOW +- **Facebook API**: No + +### Task 2.3: `rankings/rankings.py` +- **Current**: `count_event_for_city`, `count_user_for_city` via `start_map()` +- **Entity**: `DBEvent` or `User` +- **Action**: Counts events/users by city, stores in counters +- **Migration**: + 1. Create `server/dancedeets/jobs/compute_rankings.py` + 2. Use in-memory counters (dict) instead of mapreduce counters + 3. Query entities, increment counters by city/country + 4. Call `_compute_summary()` at job end +- **Complexity**: LOW +- **Facebook API**: No + +### Task 2.4: `users/user_event_tasks.py` +- **Current**: `map_compute_user_stats` via `start_map()` +- **Entity**: `User` +- **Action**: Computes event statistics per user +- **Migration**: + 1. Create `server/dancedeets/jobs/compute_user_stats.py` + 2. Query all users + 3. For each: query PotentialEvent by source_ids, count by creating_method + 4. Update user properties +- **Complexity**: LOW +- **Facebook API**: No + +### Task 2.5: `users/user_tasks.py` +- **Current**: `map_load_fb_user` via `start_map()` +- **Entity**: `User` (optionally filtered by expired_oauth_token) +- **Action**: Refreshes user profile from Facebook +- **Migration**: + 1. Create `server/dancedeets/jobs/refresh_users.py` + 2. Query users (optionally skip expired tokens) + 3. For each: fetch LookupUser from FB, update Mailchimp, compute_derived_properties() + 4. Handle ExpiredOAuthToken exceptions +- **Complexity**: MEDIUM +- **Facebook API**: Yes (needs token handling) + +### Task 2.6: `search/email_events.py` +- **Current**: `map_email_user` via `start_map()` +- **Entity**: `User` +- **Action**: Sends weekly event digest emails +- **Migration**: + 1. Create `server/dancedeets/jobs/send_weekly_emails.py` + 2. Query all users + 3. For each: search events, render HTML via render_server, send via Mandrill + 4. Update user.weekly_email_send_date + 5. Handle NoEmailException, ExpiredOAuthToken +- **Complexity**: MEDIUM +- **Facebook API**: Yes (needs token handling) + +--- + +## Phase 3: Mapper Jobs with GCS Output → Cloud Run Jobs + +These jobs iterate and write results to Google Cloud Storage. + +### Task 3.1: `sitemaps/events.py` +- **Current**: `map_sitemap_event` via `start_map()` with output writer +- **Entity**: `DBEvent` (filtered by vertical, time_period) +- **Output**: XML sitemap to GCS +- **Migration**: + 1. Create `server/dancedeets/jobs/generate_sitemaps.py` + 2. Query events by filters + 3. Generate XML entries with lxml + 4. Write to GCS using google-cloud-storage client + 5. Handle file splitting if needed (sitemaps have size limits) +- **Complexity**: MEDIUM +- **Facebook API**: No + +### Task 3.2: `logic/mr_dump.py` +- **Current**: `map_dump_fb_json` via `start_map()` with output writer +- **Entity**: `PotentialEvent` (filtered by looked_at=None) +- **Output**: CSV to GCS +- **Migration**: + 1. Create `server/dancedeets/jobs/dump_potential_events.py` + 2. Query PotentialEvents not yet looked at + 3. Batch fetch from Facebook API + 4. 
Write CSV rows to GCS +- **Complexity**: MEDIUM +- **Facebook API**: Yes (batch_fetch) + +### Task 3.3: `ml/gprediction.py` +- **Current**: `map_training_data_for_pevents` via `start_map()` with output writer +- **Entity**: `PotentialEvent` +- **Output**: ML training data CSV to GCS +- **Migration**: + 1. Create `server/dancedeets/jobs/generate_training_data.py` + 2. Query PotentialEvents + 3. Fetch event details and attending from Facebook + 4. Extract training features + 5. Write to GCS +- **Complexity**: MEDIUM +- **Facebook API**: Yes + +### Task 3.4: `ml/mr_prediction.py` +- **Current**: `map_classify_events` via `start_map()` with output writer +- **Entity**: `PotentialEvent` (filtered by looked_at=None) +- **Output**: Classification results to GCS +- **Migration**: + 1. Create `server/dancedeets/jobs/classify_events_ml.py` + 2. Query unprocessed PotentialEvents + 3. Batch Facebook API requests + 4. Call Google Prediction API + 5. Write results to GCS +- **Complexity**: HIGH (ML service integration) +- **Facebook API**: Yes + +### Task 3.5: `event_scraper/auto_add.py` +- **Current**: `map_classify_events` via `start_map()` with output writer +- **Entity**: `PotentialEvent` (filtered by should_look_at, past_event) +- **Action**: Auto-adds dance events, writes results to GCS +- **Migration**: + 1. Create `server/dancedeets/jobs/auto_add_events.py` + 2. Query PotentialEvents matching criteria + 3. Run NLP classifier, attendee classifier + 4. Create DBEvent via add_entities.add_update_fb_event() + 5. Update PotentialEvent.looked_at, auto_looked_at + 6. Write summary to GCS +- **Complexity**: HIGH (multiple classifiers, entity creation) +- **Facebook API**: Yes + +--- + +## Phase 4: MapReduce Pipeline Jobs → Cloud Run Jobs or Dataflow + +These have both map and reduce steps. + +### Task 4.1: `logic/unique_attendees.py` +- **Current**: `MapreducePipeline` with mapper + reducer +- **Map**: Emits (city, attendee_id) from each event +- **Reduce**: Counts unique attendees per city +- **Migration Options**: + - **Option A (Cloud Run Jobs)**: Single job with in-memory aggregation + 1. Create `server/dancedeets/jobs/count_unique_attendees.py` + 2. Query all FB events + 3. Use `dict[city, set[attendee_id]]` for uniqueness + 4. Write final counts to GCS + - **Option B (Cloud Dataflow)**: Apache Beam pipeline (if scale demands) +- **Complexity**: MEDIUM +- **Facebook API**: Yes (batch_fetch for attending) + +### Task 4.2: `event_scraper/thing_db.py` +- **Current**: `MapreducePipeline` - counts events per source +- **Map**: `explode_per_source_count` - emits counts per source +- **Reduce**: `combine_source_count` - sums and updates Source entities +- **Migration**: + 1. Create `server/dancedeets/jobs/update_source_stats.py` + 2. Query all PotentialEvents + 3. Aggregate counts by source_id in memory + 4. Batch update Source entities +- **Complexity**: MEDIUM +- **Facebook API**: Yes + +### Task 4.3: `event_scraper/thing_scraper2.py` +- **Current**: `MapreducePipeline` - scrapes sources then processes events +- **Map**: `scrape_sources_for_events` - discovers events from sources +- **Reduce**: `process_events` - classifies discovered events +- **Migration**: + 1. Create `server/dancedeets/jobs/scrape_and_classify.py` + 2. Query all Sources (filtered by min_potential_events) + 3. Scrape each source for events + 4. 
Process through event_pipeline.process_discovered_events() +- **Complexity**: HIGH (multi-stage, external scraping) +- **Facebook API**: Yes + +### Task 4.4: `events/find_access_tokens.py` +- **Current**: Complex multi-stage `MapreducePipeline` +- **Stages**: Find events → Combine → Find tokens → Save +- **Migration**: + 1. This is best migrated to **Cloud Workflows** orchestrating multiple Cloud Run Jobs + 2. Create workflow: `server/workflows/find_access_tokens.yaml` + 3. Create jobs: + - `server/dancedeets/jobs/find_events_needing_tokens.py` + - `server/dancedeets/jobs/test_user_tokens.py` + - `server/dancedeets/jobs/save_valid_tokens.py` + 4. Workflow coordinates: job1 → job2 → job3 +- **Complexity**: HIGH (multi-stage orchestration) +- **Facebook API**: Yes + +--- + +## Phase 5: Custom Pipeline Jobs → Cloud Workflows + +### Task 5.1: `classes/class_pipeline.py` +- **Current**: `CrawlAndIndexClassesJob` Pipeline with 4 stages +- **Stages**: + 1. `start_spiders` - Triggers ScrapingHub spiders + 2. `WaitForJobs` - Polls for completion (30s retries) + 3. `ReindexClasses` - Rebuilds class search index + 4. `EmailErrors` - Sends error report via Mandrill +- **Migration**: + 1. Create workflow: `server/workflows/crawl_and_index_classes.yaml` + 2. Create Cloud Run Jobs: + - `server/dancedeets/jobs/start_spiders.py` + - `server/dancedeets/jobs/reindex_classes.py` + - `server/dancedeets/jobs/email_crawl_errors.py` + 3. Use Cloud Workflows built-in retry/polling for WaitForJobs + 4. Wire up: start_spiders → poll_completion → reindex → email_errors +- **Complexity**: MEDIUM (mostly orchestration) +- **Facebook API**: No + +--- + +## Phase 6: Utility Module Updates + +### Task 6.1: Port `util/fb_mapreduce.py` +- **Current**: Facebook token injection for mapreduce +- **New**: `server/dancedeets/jobs/fb_utils.py` +- **Features to port**: + - `get_fblookup()` - Get FBLookup with access token + - `get_multiple_tokens()` - Token rotation for long jobs + - Batch Facebook API request handling + +### Task 6.2: Port `util/mr.py` +- **Current**: Counter utilities for mapreduce +- **New**: `server/dancedeets/jobs/metrics.py` +- **Features**: + - In-memory counter implementation + - Optional Cloud Monitoring integration + +### Task 6.3: Deprecate Compatibility Layer +- **Files**: `server/dancedeets/compat/mapreduce/`, `server/dancedeets/compat/pipeline/` +- **Action**: Once all jobs migrated, remove compat layer entirely + +--- + +## Phase 7: Configuration & Deployment + +### Task 7.1: Update `queue.yaml` → Cloud Tasks +- Migrate queue definitions to Cloud Tasks API +- Update queue references in job code + +### Task 7.2: Create Cloud Run Job Definitions +- **File**: `server/cloudbuild.yaml` or Terraform configs +- Define all Cloud Run Jobs with resource limits + +### Task 7.3: Create Cloud Scheduler Triggers +- Replace App Engine cron with Cloud Scheduler +- Schedule all periodic jobs + +### Task 7.4: Create Cloud Workflows Definitions +- Deploy workflow YAML files +- Set up workflow triggers + +### Task 7.5: Update `batch.yaml` +- Either remove (if batch service no longer needed) or update for Cloud Run + +--- + +## File-by-File Migration Checklist + +| File | Current Pattern | Target | Priority | Complexity | FB API | +|------|-----------------|--------|----------|------------|--------| +| `notifications/added_events.py` | start_map | Cloud Run Job | HIGH | LOW | No | +| `pubsub/pubsub_tasks.py` | start_map | Cloud Run Job | MEDIUM | LOW | No | +| `rankings/rankings.py` | start_map | Cloud Run Job | MEDIUM 
| LOW | No | +| `users/user_event_tasks.py` | start_map | Cloud Run Job | MEDIUM | LOW | No | +| `users/user_tasks.py` | start_map | Cloud Run Job | MEDIUM | MEDIUM | Yes | +| `search/email_events.py` | start_map | Cloud Run Job | MEDIUM | MEDIUM | Yes | +| `sitemaps/events.py` | start_map+output | Cloud Run Job | HIGH | MEDIUM | No | +| `logic/mr_dump.py` | start_map+output | Cloud Run Job | LOW | MEDIUM | Yes | +| `ml/gprediction.py` | start_map+output | Cloud Run Job | LOW | MEDIUM | Yes | +| `ml/mr_prediction.py` | start_map+output | Cloud Run Job | LOW | HIGH | Yes | +| `event_scraper/auto_add.py` | start_map+output | Cloud Run Job | HIGH | HIGH | Yes | +| `logic/unique_attendees.py` | MapreducePipeline | Cloud Run Job | LOW | MEDIUM | Yes | +| `event_scraper/thing_db.py` | MapreducePipeline | Cloud Run Job | MEDIUM | MEDIUM | Yes | +| `event_scraper/thing_scraper2.py` | MapreducePipeline | Cloud Run Job | HIGH | HIGH | Yes | +| `events/find_access_tokens.py` | Multi-stage Pipeline | Cloud Workflows | LOW | HIGH | Yes | +| `classes/class_pipeline.py` | Custom Pipeline | Cloud Workflows | MEDIUM | MEDIUM | No | + +--- + +## Recommended Migration Order + +1. **Infrastructure** (Task 1.1-1.3) - Required first +2. **Simple side-effect jobs** (Task 2.1-2.4) - Quick wins, no FB API +3. **FB API jobs** (Task 2.5-2.6) - After FB token handling is ported +4. **GCS output jobs** (Task 3.1) - Sitemaps are user-facing +5. **Event processing jobs** (Task 3.5, 4.3) - Core functionality +6. **ML jobs** (Task 3.2-3.4) - Lower priority, complex +7. **Pipeline orchestration** (Task 5.1, 4.4) - After individual jobs work +8. **Cleanup** (Task 6.3, 7.x) - Final phase diff --git a/server/dancedeets/classes/class_pipeline.py b/server/dancedeets/classes/class_pipeline.py index f35137052..28a068b60 100644 --- a/server/dancedeets/classes/class_pipeline.py +++ b/server/dancedeets/classes/class_pipeline.py @@ -1,150 +1,38 @@ -# class_indexing_pipeline - -import datetime +""" +Class crawling and indexing pipeline. + +This functionality has been migrated to Cloud Workflows and Cloud Run Jobs. +See: +- Workflow: workflows/crawl_and_index_classes.yaml +- Jobs: dancedeets.jobs.start_spiders + dancedeets.jobs.reindex_classes + dancedeets.jobs.email_crawl_errors + +This module is kept for backwards compatibility but the Pipeline +handlers have been removed. Use Cloud Workflows instead. +""" import logging -from dancedeets.compat.pipeline import common -from dancedeets.compat.pipeline import pipeline -import scrapinghub - from dancedeets import app from dancedeets import base_servlet -from dancedeets.classes import class_index -from dancedeets import keys -from dancedeets.mail import mandrill_api -from dancedeets.util import fixed_pipelines - -# TODO(mindbody): -DISABLED_SPIDERS = ['EXPG', 'Boogiezone', 'IDA', 'mL', 'NeighborhoodStudio'] - - -def get_spiders(): - return [ - # NY - 'PMT', - 'Evolution', - 'Peridance', - 'BDC', - 'EXPG', - # LA - 'Millenium', - 'EDGE', - 'DebbieReynolds', - 'TheLab', - 'Boogiezone', - 'IDA', - 'mL', - 'NeighborhoodStudio', - ] - # This depends on Twisted, which depends on zope.interface and lxml. And that whole ball of wax fails when run in the appengine dev sandbox. - # We can't import any of classes/scrapers/ (since it all ultimately depends on scrapy), so there's no great way to get a list of classes. 
- # Instead, class_pipeline_test does depend on it safely within nosetests, and verifies the above list matches what we get from scrapy's API) - # from scrapy.utils.project import get_project_settings - # from scrapy.crawler import CrawlerRunner - # runner = CrawlerRunner(get_project_settings()) - # return runner.spider_loader.list() - - -def get_shub_project(): - conn = scrapinghub.Connection(keys.get('scrapinghub_key')) - project = scrapinghub.Project(conn, 27474) - return project - - -def start_spiders(spiders): - project = get_shub_project() - job_keys = [] - for spider in spiders: - job_id = project.schedule(spider) - job_keys.append(job_id) - logging.info("Scheduled jobs: %s", job_keys) - return job_keys - - -class CrawlAndIndexClassesJob(fixed_pipelines.Pipeline): - def run(self): - run_time = datetime.datetime.now() - # Find all spiders by looking at modules on disk - spiders = set(get_spiders()).difference(DISABLED_SPIDERS) - - # Trigger new spider jobs on scrapinghub - job_keys = start_spiders(spiders) - - # Wait for crawls to finish - jobs_completed = yield WaitForJobs(job_keys) - - # In parallel, trigger reindex and emailing-of-errors - yield ReindexClasses(job_keys, jobs_completed) - yield EmailErrors(run_time, job_keys, jobs_completed) - - -class WaitForJobs(fixed_pipelines.Pipeline): - def run(self, job_keys): - project = get_shub_project() - jobs = [project.job(x) for x in job_keys] - unfinished = [x for x in jobs if x.info['state'] != 'finished'] - logging.info("Waiting for %s unfinished spiders", len(unfinished)) - if unfinished: - # Try again in 30 seconds - with pipeline.InOrder(): - yield common.Delay(seconds=30) - yield WaitForJobs(job_keys) - else: - yield common.Return(True) - - -class ReindexClasses(fixed_pipelines.Pipeline): - def run(self, job_keys, jobs_completed): - class_index.StudioClassIndex.rebuild_from_query() - - -class EmailErrors(fixed_pipelines.Pipeline): - def run(self, run_time, job_keys, jobs_completed): - project = get_shub_project() - jobs = [project.job(x) for x in job_keys] - - error_lines = {} - - for spider_job in jobs: - if not spider_job.info['items_scraped']: - error_lines.setdefault(spider_job.info['spider'], []).append('Could not find any items.') - - for line in spider_job.log(): - if line['level'] >= 40: - error_lines.setdefault(spider_job.info['spider'], []).append(line['message']) - - if not error_lines: - return - - rendered = ["The following crawl errors occurred:"] - for crawler, errors in error_lines.items(): - rendered += ["%s:" % crawler] - rendered += errors - rendered += [] - - body = '\n'.join(rendered) - logging.warning("%s", body) - - subject = 'Crawl Errors for %s' % run_time.strftime('%b %d, %Y: %H:%M') - message = { - 'from_email': 'reports@dancedeets.com', - 'from_name': 'DanceDeets Reports', - 'subject': subject, - 'to': [{ - 'email': 'reports@dancedeets.com', - 'name': 'DanceDeets Reports', - 'type': 'to', - }], - 'text': body, - } - mandrill_api.send_message(message) @app.route('/tasks/crawl_and_index_classes') class CrawlAndIndexClassesHandler(base_servlet.BaseTaskRequestHandler): + """ + Legacy handler - crawling has been migrated to Cloud Workflows. 
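+    Hitting this endpoint no longer starts a crawl; it only logs a
+    deprecation warning. Assuming the workflow has been deployed from
+    workflows/crawl_and_index_classes.yaml, it can be run manually with
+    something like `gcloud workflows run crawl_and_index_classes`
+    (add --location=... if it lives outside the default region).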
+ + Use Cloud Workflow: crawl_and_index_classes + Or individual jobs: + - python -m dancedeets.jobs.runner --job=start_spiders + - python -m dancedeets.jobs.runner --job=reindex_classes + - python -m dancedeets.jobs.runner --job=email_crawl_errors + """ def get(self): - pipeline = CrawlAndIndexClassesJob() - pipeline.start(queue_name='slow-queue') - self.response.out.write('OK') + logging.warning( + 'This endpoint is deprecated. ' + 'Use Cloud Workflow: crawl_and_index_classes instead.' + ) + self.response.out.write('DEPRECATED: Use Cloud Workflow crawl_and_index_classes instead') post = get diff --git a/server/dancedeets/event_scraper/auto_add.py b/server/dancedeets/event_scraper/auto_add.py index 9667763c1..f57899eac 100644 --- a/server/dancedeets/event_scraper/auto_add.py +++ b/server/dancedeets/event_scraper/auto_add.py @@ -1,3 +1,14 @@ +""" +Auto-add event classification logic. + +The batch processing has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.auto_add_events + +This module retains: +- classify_events: Filter and classify potential events +- really_classify_events: Core classification and adding logic +- maybe_add_events: Add events by IDs (for non-batch contexts) +""" import datetime import logging import re @@ -9,50 +20,74 @@ from dancedeets.nlp import event_auto_classifier from dancedeets.nlp import event_classifier from dancedeets.nlp.styles import street -from dancedeets.util import fb_mapreduce -from dancedeets.util import mr from . import add_entities from . import potential_events def is_good_event_by_text(fb_event, classified_event): + """Check if event is a good dance event based on text classification.""" return event_auto_classifier.is_auto_add_event(classified_event).is_good_event() -def classify_events(fbl, pe_list, fb_list): +def classify_events(fbl, pe_list, fb_list, metrics=None): + """ + Filter and classify potential events. + + Args: + fbl: Facebook batch lookup + pe_list: List of PotentialEvent objects + fb_list: List of Facebook event data + metrics: Optional metrics counter (for Cloud Run Jobs) + + Returns: + List of result strings for added events + """ new_pe_list = [] new_fb_list = [] # Go through and find all potential events we actually want to attempt to classify for pe, fb_event in zip(pe_list, fb_list): # Get these past events out of the way, saved, then continue. - # Next time through this mapreduce, we shouldn't need to process them. if pe.set_past_event(fb_event): pe.put() if not fb_event or fb_event['empty']: - mr.increment('skip-due-to-empty') + if metrics: + metrics.increment('skip-due-to-empty') continue # Don't process events we've already looked at, or don't need to look at. - # This doesn't happen with the mapreduce that pre-filters them out, - # but it does happen when we scrape users potential events and throw them all in here. 
if pe.looked_at: logging.info('Already looked at event (added, or manually discarded), so no need to re-process.') - mr.increment('skip-due-to-looked-at') + if metrics: + metrics.increment('skip-due-to-looked-at') continue event_id = pe.fb_event_id if not re.match(r'^\d+$', event_id): logging.error('Found a very strange potential event id: %s', event_id) - mr.increment('skip-due-to-bad-id') + if metrics: + metrics.increment('skip-due-to-bad-id') continue new_pe_list.append(pe) new_fb_list.append(fb_event) - return really_classify_events(fbl, new_pe_list, new_fb_list) + return really_classify_events(fbl, new_pe_list, new_fb_list, metrics=metrics) + + +def really_classify_events(fbl, new_pe_list, new_fb_list, allow_posting=True, metrics=None): + """ + Core classification logic - classify and add dance events. + Args: + fbl: Facebook batch lookup + new_pe_list: List of PotentialEvent objects + new_fb_list: List of Facebook event data + allow_posting: Whether to post to social media + metrics: Optional metrics counter (for Cloud Run Jobs) -def really_classify_events(fbl, new_pe_list, new_fb_list, allow_posting=True): + Returns: + List of result strings for added events + """ if not new_pe_list: new_pe_list = [None] * len(new_fb_list) logging.info('Filtering out already-added events and others, have %s remaining events to run the classifier on', len(new_fb_list)) @@ -97,20 +132,19 @@ def really_classify_events(fbl, new_pe_list, new_fb_list, allow_posting=True): pe2.looked_at = True pe2.auto_looked_at = True pe2.put() - # TODO(lambert): handle un-add-able events differently results.append(result) - mr.increment('auto-added-dance-events') - if e.start_time < datetime.datetime.now(): - mr.increment('auto-added-dance-events-past') - # mr.increment('auto-added-dance-events-past-eventid-%s' % event_id) + if metrics: + metrics.increment('auto-added-dance-events') + if e.start_time < datetime.datetime.now(): + metrics.increment('auto-added-dance-events-past') + for vertical in e.verticals: + metrics.increment('auto-added-dance-event-past-vertical-%s' % vertical) + else: + metrics.increment('auto-added-dance-events-future') + for vertical in e.verticals: + metrics.increment('auto-added-dance-event-future-vertical-%s' % vertical) for vertical in e.verticals: - mr.increment('auto-added-dance-event-past-vertical-%s' % vertical) - else: - mr.increment('auto-added-dance-events-future') - for vertical in e.verticals: - mr.increment('auto-added-dance-event-future-vertical-%s' % vertical) - for vertical in e.verticals: - mr.increment('auto-added-dance-event-vertical-%s' % vertical) + metrics.increment('auto-added-dance-event-vertical-%s' % vertical) except fb_api.NoFetchedDataException as e: logging.error("Error adding event %s, no fetched data: %s", event_id, e) except add_entities.AddEventException as e: @@ -118,39 +152,19 @@ def really_classify_events(fbl, new_pe_list, new_fb_list, allow_posting=True): return results -def classify_events_with_yield(fbl, pe_list): - fb_list = fbl.get_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list], allow_fail=True) - results = classify_events(fbl, pe_list, fb_list) - yield ''.join(results).encode('utf-8') - - -map_classify_events = fb_mapreduce.mr_wrap(classify_events_with_yield) - +def maybe_add_events(fbl, event_ids): + """ + Attempt to add events by their IDs. 
-def mr_classify_potential_events(fbl, past_event, dancey_only): - filters = [] - if dancey_only: - filters.append(('should_look_at', '=', True)) - if past_event is not None: - filters.append(('past_event', '=', past_event)) - fb_mapreduce.start_map( - fbl, - 'Auto-Add Events', - 'dancedeets.event_scraper.auto_add.map_classify_events', - 'dancedeets.event_scraper.potential_events.PotentialEvent', - filters=filters, - # Make sure we don't process so many that we cause the tasks to time out - handle_batch_size=10, - queue='fast-queue', - output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter', - output_writer={ - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - ) + Used for non-batch contexts where we have specific event IDs to check. + Args: + fbl: Facebook batch lookup + event_ids: List of Facebook event IDs -def maybe_add_events(fbl, event_ids): + Returns: + List of result strings for added events + """ fb_events = fbl.get_multi(fb_api.LookupEvent, event_ids) empty_ids = [eid for x, eid in zip(fb_events, event_ids) if x['empty']] logging.info('Found empty ids: %s', empty_ids) diff --git a/server/dancedeets/event_scraper/thing_db.py b/server/dancedeets/event_scraper/thing_db.py index 58d8afc43..5e3b03ae4 100644 --- a/server/dancedeets/event_scraper/thing_db.py +++ b/server/dancedeets/event_scraper/thing_db.py @@ -1,17 +1,24 @@ +""" +Source entity management for Facebook pages/groups/profiles. + +The batch source statistics computation has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.update_source_stats + +This module retains: +- Source model: Datastore entity for FB sources +- create_source_from_id: Create/update source from FB ID +- create_sources_from_event: Extract sources from event admins/owners +- Helper functions for FB source type detection +""" import datetime -import json import logging from google.appengine.ext import db from dancedeets.compat.mapreduce import json_util -from dancedeets.compat.mapreduce import mapreduce_pipeline -from dancedeets.compat.mapreduce import operation -from dancedeets.events import eventdata from dancedeets import fb_api from dancedeets.loc import gmaps_api from dancedeets.logic import backgrounder -from dancedeets.util import fb_mapreduce GRAPH_TYPE_PROFILE = 'GRAPH_TYPE_PROFILE' GRAPH_TYPE_FANPAGE = 'GRAPH_TYPE_FANPAGE' @@ -25,9 +32,7 @@ GRAPH_TYPE_GROUP, ] -# Start small -# Only set of sources with walls, and only hand-curated sources (or events). not grabbing new peoples yet. - +# Field types for source scraping FIELD_FEED = 'FIELD_FEED' # /feed FIELD_EVENTS = 'FIELD_EVENTS' # /events FIELD_INVITES = 'FIELD_INVITES' # fql query on invites for signed-up users @@ -35,6 +40,7 @@ class Source(db.Model): + """Represents a Facebook source (page, group, profile) for event discovery.""" graph_id = property(lambda x: str(x.key().name())) graph_type = db.StringProperty(choices=GRAPH_TYPES) @@ -50,8 +56,7 @@ class Source(db.Model): verticals = db.ListProperty(str, indexed=True) - # probably to assume for a given event? rough weighting factor? - # do we want to delete these now? 
+ # Style weighting factors (legacy) freestyle = db.FloatProperty(indexed=False) choreo = db.FloatProperty(indexed=False) @@ -60,6 +65,7 @@ class Source(db.Model): creation_time = db.DateTimeProperty(indexed=False, auto_now_add=True) last_scrape_time = db.DateTimeProperty(indexed=False) + # Statistics (updated by Cloud Run Job: update_source_stats) num_all_events = db.IntegerProperty(indexed=False) num_potential_events = db.IntegerProperty(indexed=False) num_real_events = db.IntegerProperty(indexed=False) @@ -77,7 +83,6 @@ def fraction_potential_are_real(self, bias=1): def fraction_real_are_false_negative(self, bias=1): if self.num_real_events: - #TODO(lambert): figure out why num_false_negatives is None, in particular for source id=107687589275667 even after saving num_false_negatives = (self.num_false_negatives or 0) + bias num_real_events = (self.num_real_events or 0) + bias return 1.0 * num_false_negatives / num_real_events @@ -88,22 +93,19 @@ def compute_derived_properties(self, fb_source_common, fb_source_data): if fb_source_common['empty']: # only update these when we have feed data self.fb_info = {} else: - self.fb_info = fb_source_data['info'] # LookupThing* (and all fb_info dependencies). Only used for /search_pages functionality + self.fb_info = fb_source_data['info'] self.graph_type = _type_for_fb_source(fb_source_common) if 'name' not in fb_source_common['info']: logging.error('cannot find name for fb event data: %s, cannot update source data...', fb_source_common) return self.name = fb_source_common['info']['name'] self.emails = fb_source_data['info'].get('emails', []) - if not self.emails: - pass # TODO: trigger basic crawl of website to search for emails feed = fb_source_common['feed']['data'] if len(feed): dt = datetime.datetime.strptime(feed[-1]['created_time'], '%Y-%m-%dT%H:%M:%S+0000') td = datetime.datetime.now() - dt total_seconds = td.seconds + td.days * 24 * 3600 self.feed_history_in_seconds = total_seconds - #logging.info('feed time delta is %s', self.feed_history_in_seconds) else: self.feed_history_in_seconds = 0 location = fb_source_data['info'].get('location') @@ -118,10 +120,10 @@ def compute_derived_properties(self, fb_source_common, fb_source_data): geocode = gmaps_api.lookup_address(address) if geocode: self.latitude, self.longitude = geocode.latlng() - #TODO(lambert): at some point we need to calculate all potential events, and all real events, and update the numbers with values from them. and all fake events. we have a problem where a new source gets added, adds in the potential events and/or real events, but doesn't properly tally them all. can fix this one-off, but it's too-late now, and i imagine our data will grow inaccurate over time anyway. 
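+
+
+# Illustrative sketch only (hypothetical helper, not part of this module's API):
+# the Cloud Run Job dancedeets.jobs.update_source_stats replaces the old
+# explode_per_source_count/combine_source_count MapReduce with a single
+# in-memory aggregation over PotentialEvents, roughly as follows.
+def _sketch_update_source_stats(potential_events):
+    from collections import defaultdict
+
+    from dancedeets.events import eventdata
+
+    # counts[source_id] = [all, potential, real, false_negative]
+    counts = defaultdict(lambda: [0, 0, 0, 0])
+    for pe in potential_events:
+        db_event = eventdata.DBEvent.get_by_id(pe.fb_event_id)
+        is_potential = pe.match_score > 0
+        is_real = db_event is not None
+        is_false_negative = bool(db_event and not is_potential)
+        for source_id in pe.source_ids_only():
+            tally = counts[source_id]
+            tally[0] += 1
+            tally[1] += int(is_potential)
+            tally[2] += int(is_real)
+            tally[3] += int(is_false_negative)
+
+    # Batch-update the Source statistics fields declared above.
+    for source_id, (num_all, num_potential, num_real, num_fn) in counts.items():
+        source = Source.get_by_key_name(source_id)
+        if not source:
+            continue
+        source.num_all_events = num_all
+        source.num_potential_events = num_potential
+        source.num_real_events = num_real
+        source.num_false_negatives = num_fn
+        source.put()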
def link_for_fb_source(data): + """Generate Facebook URL for a source.""" if 'link' in data['info']: return data['info']['link'] elif 'version' in data['info']: @@ -133,6 +135,7 @@ def link_for_fb_source(data): def _type_for_fb_source(fb_source_common): + """Determine graph type from FB metadata.""" source_type = fb_source_common['metadata']['metadata']['type'] if source_type == 'page': return GRAPH_TYPE_FANPAGE @@ -148,6 +151,7 @@ def _type_for_fb_source(fb_source_common): def get_lookup_for_graph_type(graph_type): + """Get the appropriate FB API lookup type for a graph type.""" if graph_type == GRAPH_TYPE_FANPAGE: return fb_api.LookupThingPage elif graph_type == GRAPH_TYPE_GROUP: @@ -160,18 +164,19 @@ def get_lookup_for_graph_type(graph_type): def create_source_from_id(fbl, source_id, verticals=None): + """Create or update a Source from a Facebook ID.""" source = create_source_from_id_without_saving(fbl, source_id, verticals=verticals) if source: new_source = (not source.creation_time) source.put() if new_source: - # It seems some "new" sources are existing sources without a creation_time set, so let's force-set it here source.creation_time = datetime.datetime.now() backgrounder.load_sources([source_id], fb_uid=fbl.fb_uid) return source def create_source_from_id_without_saving(fbl, source_id, verticals=None): + """Create a Source object without saving to Datastore.""" logging.info('create_source_from_id: %s', source_id) if not source_id: return None @@ -184,8 +189,6 @@ def create_source_from_id_without_saving(fbl, source_id, verticals=None): original_allow_cache = fbl.allow_cache fbl.allow_cache = True try: - - # technically we could check if the object exists in the db, before we bother fetching the feed fb_source_common = fbl.get(fb_api.LookupThingCommon, source_id) if fb_source_common['empty']: logging.error('Error loading Common Fields for Source: %s', source_id) @@ -210,106 +213,9 @@ def create_source_from_id_without_saving(fbl, source_id, verticals=None): def create_sources_from_event(fbl, db_event): + """Create Source entities from an event's owner and admins.""" logging.info('create_sources_from_event: %s', db_event.id) create_source_from_id(fbl, db_event.owner_fb_uid, verticals=db_event.verticals) for admin in db_event.admins: if admin['id'] != db_event.owner_fb_uid: create_source_from_id(fbl, admin['id'], verticals=db_event.verticals) - - -map_create_sources_from_event = fb_mapreduce.mr_wrap(create_sources_from_event) - - -def explode_per_source_count(pe): - db_event = eventdata.DBEvent.get_by_id(pe.fb_event_id) - - is_potential_event = pe.match_score > 0 - real_event = db_event != None - false_negative = bool(db_event and not is_potential_event) - result = (is_potential_event, real_event, false_negative) - - for source_id in pe.source_ids_only(): - yield (source_id, json.dumps(result)) - - -def combine_source_count(source_id, counts_to_sum): - s = Source.get_by_key_name(source_id) - if not s: - return - - s.num_all_events = 0 - s.num_potential_events = 0 - s.num_real_events = 0 - s.num_false_negatives = 0 - - for result in counts_to_sum: - (potential_event, real_event, false_negative) = json.loads(result) - s.num_all_events += 1 - if potential_event: - s.num_potential_events += 1 - if real_event: - s.num_real_events += 1 - if false_negative: - s.num_false_negatives += 1 - yield operation.db.Put(s) - - -def mr_count_potential_events(fbl, queue): - mapper_params = { - 'entity_kind': 'dancedeets.event_scraper.potential_events.PotentialEvent', - } - 
mapper_params.update(fb_mapreduce.get_fblookup_params(fbl)) - pipeline = mapreduce_pipeline.MapreducePipeline( - 'clean source counts', - 'dancedeets.event_scraper.thing_db.explode_per_source_count', - 'dancedeets.event_scraper.thing_db.combine_source_count', - 'mapreduce.input_readers.DatastoreInputReader', - None, - mapper_params=mapper_params, - ) - pipeline.start(queue_name=queue) - - -""" -user: -- invited-events fql (event, if member) -- friends (user, if member) -- events (event) -- wall (event, user, page, group) -- likes (page) -- groups (group) - -fanpage: -- wall (event, user, page, group) -- likes (page) -- events (event) -- groups (group) - -event: -- wall (event, user, page, group) -- attending (user) -- creator (user) - -group: -- wall (event, user, page, group) -- members (user) - -Known Dancer Entities (profiles, fan pages, events, groups) -- scrape them for events -- track in each entity, how many events were found on wall, events -- track total-time-of-wall so we know refresh frequency - -status: -dance-related, scrape, add everything in here to "maybe" list -maybe-dance-related, scrape but only return high-quality events, don't scrape for anything-but-events -not-dance-related, don't scrape -old (event), no longer scrape, happens after event has passed - -status set periodically in all-out-mapreduce -- old events stay old -- sources stay dance-related if manually set -- sources become dance-related if they find dance events via it -- sources become not-dance-related if there are no dance events on it after a month or two? or if number of dancer-friends is <20? - -- also want to track how many pages/groups were found via this entity -""" diff --git a/server/dancedeets/event_scraper/thing_scraper2.py b/server/dancedeets/event_scraper/thing_scraper2.py index 3197971f1..91ec54e76 100644 --- a/server/dancedeets/event_scraper/thing_scraper2.py +++ b/server/dancedeets/event_scraper/thing_scraper2.py @@ -1,83 +1,28 @@ -import json -import logging +""" +Source scraping and event processing. + +This functionality has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.scrape_and_classify -from dancedeets.compat.mapreduce import mapreduce_pipeline -from dancedeets.util import fb_mapreduce +This module is kept for backwards compatibility but the mapreduce +handlers have been removed. Use the Cloud Run Job instead. +""" +import logging from dancedeets import app from dancedeets import base_servlet -from dancedeets.util import mr -from . import event_pipeline -from . import potential_events -from . import thing_scraper - - -def scrape_sources_for_events(sources): - fbl = fb_mapreduce.get_fblookup() - fbl.allow_cache = False - # Eliminate all caches (both fetching, and saving!) - # This should save on a bunch of unnecessary put() calls while scraping - # (Current estimates are 30qps * 60 sec/min * 50min * $0.18/10K Queries * 30 days = $48/month) - fbl.make_passthrough() - discovered_list = thing_scraper.discover_events_from_sources(fbl, sources) - for x in discovered_list: - state = (x.event_id, x.source_id, x.source_field, x.extra_source_id) - mr.increment('found-event-to-check') - # Don't "shard" events....just group them by id. 
- # And let the functionality of them sharing sources happen naturally - yield (x.event_id, json.dumps(state)) - - -def process_events(event_id, via_sources): - fbl = fb_mapreduce.get_fblookup() - fbl.allow_cache = True - discovered_list = [] - logging.info('Running process_events with %s event-sources', len(via_sources)) - for data in via_sources: - event_id, source_id, source_field, extra_source_id = json.loads(data) - discovered = potential_events.DiscoveredEvent(event_id, None, source_field, extra_source_id) - discovered.source = None # TODO: This will come back to bite us I'm sure :( - discovered.source_id = source_id - discovered_list.append(discovered) - # Some of these are newly-discovered events, some of these are already-cached and classified. - # TODO: Filter out the already-classified ones, so we don't waste time re-classifying on cached on data. - event_pipeline.process_discovered_events(fbl, discovered_list) @app.route('/tasks/scrape_sources_and_process_events') class LoadPotentialEventsFromWallPostsHandler(base_servlet.BaseTaskFacebookRequestHandler): - def get(self): - min_potential_events = int(self.request.get('min_potential_events', '0')) - queue = self.request.get('queue', 'slow-queue') - mapreduce_scrape_sources_and_process_events(self.fbl, min_potential_events=min_potential_events, queue=queue) - + """ + Legacy handler - scraping has been migrated to Cloud Run Jobs. -def mapreduce_scrape_sources_and_process_events(fbl, min_potential_events, queue): - mapper_params = { - 'entity_kind': 'dancedeets.event_scraper.thing_db.Source', - 'min_potential_events': min_potential_events, - 'handle_batch_size': 20, - } - reducer_params = { - 'output_writer': { - 'bucket_name': 'dancedeets-hrd.appspot.com', - 'content_type': 'text/plain', - } - } - fb_params = fb_mapreduce.get_fblookup_params(fbl, randomize_tokens=True) - mapper_params.update(fb_params) - reducer_params.update(fb_params) - - # output = yield ... - pipeline = mapreduce_pipeline.MapreducePipeline( - 'Scrape sources, then load and classify the events', - 'dancedeets.event_scraper.thing_scraper2.scrape_sources_for_events', - 'dancedeets.event_scraper.thing_scraper2.process_events', - 'mapreduce.input_readers.DatastoreInputReader', - 'mapreduce.output_writers.GoogleCloudStorageOutputWriter', - mapper_params=mapper_params, - reducer_params=reducer_params, - shards=16, - ) - - pipeline.start(queue_name=queue) + Use: python -m dancedeets.jobs.runner --job=scrape_and_classify + """ + def get(self): + logging.warning( + 'This endpoint is deprecated. ' + 'Use Cloud Run Job: dancedeets.jobs.scrape_and_classify instead.' 
+ ) + self.response.out.write('DEPRECATED: Use Cloud Run Job scrape_and_classify instead') diff --git a/server/dancedeets/jobs/__init__.py b/server/dancedeets/jobs/__init__.py new file mode 100644 index 000000000..73f47354f --- /dev/null +++ b/server/dancedeets/jobs/__init__.py @@ -0,0 +1,51 @@ +# Cloud Run Jobs framework for DanceDeets +# Replaces legacy App Engine MapReduce/Pipeline + +from .base import Job, JobRunner, BatchJob +from .fb_utils import get_fblookup, get_fblookup_params, get_multiple_tokens, FBJobContext +from .metrics import JobMetrics, GroupedMetrics +from .gcs_output import GCSOutputWriter + +__all__ = [ + # Base classes + 'Job', + 'JobRunner', + 'BatchJob', + # Facebook utilities + 'FBJobContext', + 'get_fblookup', + 'get_fblookup_params', + 'get_multiple_tokens', + # Metrics + 'JobMetrics', + 'GroupedMetrics', + # GCS output + 'GCSOutputWriter', +] + +# Available jobs (for reference): +# +# Phase 2 - Simple mapper jobs: +# - notify_users: Send push notifications by timezone +# - post_japan_events: Post Japan events to social media +# - compute_rankings: Compute city/country rankings +# - compute_user_stats: Compute user event statistics +# - refresh_users: Refresh user profiles from Facebook +# - send_weekly_emails: Send weekly event digest emails +# +# Phase 3 - GCS output jobs: +# - generate_sitemaps: Generate XML sitemaps +# - dump_potential_events: Export potential events to CSV +# - generate_training_data: Generate ML training data +# - classify_events_ml: ML event classification +# - auto_add_events: Auto-add dance events +# +# Phase 4 - MapReduce pipeline replacements: +# - count_unique_attendees: Count unique RSVPs by city +# - update_source_stats: Update source quality metrics +# - scrape_and_classify: Scrape sources and classify events +# +# Phase 5 - Pipeline orchestration (Cloud Workflows): +# - start_spiders: Start ScrapingHub spider jobs +# - reindex_classes: Rebuild class search index +# - email_crawl_errors: Send crawl error reports diff --git a/server/dancedeets/jobs/auto_add_events.py b/server/dancedeets/jobs/auto_add_events.py new file mode 100644 index 000000000..56d8b05dc --- /dev/null +++ b/server/dancedeets/jobs/auto_add_events.py @@ -0,0 +1,300 @@ +""" +Cloud Run Job: Automatically add dance events to the database. + +Migrated from: dancedeets/event_scraper/auto_add.py + +This job classifies potential events using NLP and attendee analysis, +and automatically adds those that qualify as dance events. + +Usage: + python -m dancedeets.jobs.runner --job=auto_add_events + python -m dancedeets.jobs.runner --job=auto_add_events --dancey_only=true +""" + +import datetime +import logging +import re +from typing import List, Optional + +from dancedeets import fb_api +from dancedeets.event_attendees import event_attendee_classifier +from dancedeets.events import eventdata +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.nlp import event_auto_classifier +from dancedeets.nlp import event_classifier +from dancedeets.nlp.styles import street +from dancedeets.event_scraper import add_entities +from dancedeets.event_scraper import potential_events + +logger = logging.getLogger(__name__) + + +class AutoAddEventsJob(BatchJob): + """ + Job that automatically classifies and adds dance events. + + For each potential event: + 1. 
Fetch event data from Facebook + 2. Run NLP classifier on event text + 3. If text doesn't match, check attendee profiles + 4. Add qualifying events to the database + """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + bucket_name: str = DEFAULT_BUCKET, + allow_posting: bool = True, + dry_run: bool = False, + ): + # Use small batch size to avoid timeouts (complex classification) + super().__init__(batch_size=10) + self.fb_context = fb_context + self.bucket_name = bucket_name + self.allow_posting = allow_posting + self.dry_run = dry_run + self.output_writer = None + logger.info("AutoAddEventsJob initialized") + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name='auto_add/results.txt', + content_type='text/plain', + ) + + def run_batch(self, pe_list: list) -> None: + """Process a batch of potential events.""" + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + + # Fetch event data from Facebook + fb_list = fbl.get_multi( + fb_api.LookupEvent, + [x.fb_event_id for x in pe_list], + allow_fail=True, + ) + + # Filter and classify events + results = self._classify_events(fbl, pe_list, fb_list) + + # Write results + if results and not self.dry_run: + for result in results: + self.output_writer.write(result) + + self.metrics.increment('batches_processed') + + def _classify_events(self, fbl, pe_list: list, fb_list: list) -> List[str]: + """ + Filter and classify events. + + Returns: + List of result strings for successfully added events + """ + new_pe_list = [] + new_fb_list = [] + + for pe, fb_event in zip(pe_list, fb_list): + # Handle past events + if pe.set_past_event(fb_event): + if not self.dry_run: + pe.put() + + if not fb_event or fb_event.get('empty'): + self.metrics.increment('skip-due-to-empty') + continue + + # Skip already processed events + if pe.looked_at: + logger.debug('Already looked at event, skipping') + self.metrics.increment('skip-due-to-looked-at') + continue + + event_id = pe.fb_event_id + if not re.match(r'^\d+$', event_id): + logger.error(f'Found a very strange potential event id: {event_id}') + self.metrics.increment('skip-due-to-bad-id') + continue + + new_pe_list.append(pe) + new_fb_list.append(fb_event) + + return self._really_classify_events(fbl, new_pe_list, new_fb_list) + + def _really_classify_events(self, fbl, pe_list: list, fb_list: list) -> List[str]: + """ + Actually classify events and add qualifying ones. 
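+        Mirrors really_classify_events() in dancedeets.event_scraper.auto_add,
+        but reports counters through JobMetrics instead of the old mapreduce
+        counter helpers.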
+ + Returns: + List of result strings + """ + if not fb_list: + return [] + + if not pe_list: + pe_list = [None] * len(fb_list) + + logger.info(f'Classifying {len(fb_list)} events') + + # Fetch attendee data + fb_event_ids = [x['info']['id'] for x in fb_list] + fb_attending_maybe_list = fbl.get_multi( + fb_api.LookupEventAttendingMaybe, + fb_event_ids, + allow_fail=True, + ) + + results = [] + for pe, fb_event, fb_event_attending_maybe in zip(pe_list, fb_list, fb_attending_maybe_list): + event_id = fb_event['info']['id'] + logger.debug(f'Classifying event {event_id}') + + # Run text classifier + classified_event = event_classifier.get_classified_event(fb_event) + auto_add_result = event_auto_classifier.is_auto_add_event(classified_event) + logger.debug(f'Text classification result: {auto_add_result}') + + good_event = False + method = None + verticals = [] + + if auto_add_result.is_good_event(): + good_event = True + method = eventdata.CM_AUTO + verticals = auto_add_result.verticals() + elif fb_event_attending_maybe: + # Try attendee-based classification + logger.debug(f'Trying attendee classification for {event_id}') + good_event = event_attendee_classifier.is_good_event_by_attendees( + fbl, + fb_event, + fb_event_attending_maybe=fb_event_attending_maybe, + classified_event=classified_event, + ) + logger.debug(f'Attendee classification result: {good_event}') + method = eventdata.CM_AUTO_ATTENDEE + verticals = [street.Style.get_name()] + + if good_event: + result = f"+{event_id}\t{fb_event['info'].get('name', '')}\n" + + if self.dry_run: + logger.info(f"[DRY RUN] Would add event {event_id}") + self.metrics.increment('events_would_add') + results.append(result) + continue + + try: + invite_ids = pe.get_invite_uids() if pe else [] + logger.info(f'Adding event {event_id}, invite_ids: {invite_ids}') + + e = add_entities.add_update_fb_event( + fb_event, + fbl, + visible_to_fb_uids=invite_ids, + creating_method=method, + allow_posting=self.allow_posting, + verticals=verticals, + ) + + # Mark as processed + pe2 = potential_events.PotentialEvent.get_by_key_name(event_id) + pe2.looked_at = True + pe2.auto_looked_at = True + pe2.put() + + results.append(result) + self.metrics.increment('auto-added-dance-events') + + # Track by time period + if e.start_time < datetime.datetime.now(): + self.metrics.increment('auto-added-dance-events-past') + else: + self.metrics.increment('auto-added-dance-events-future') + + # Track by vertical + for vertical in e.verticals: + self.metrics.increment(f'auto-added-vertical-{vertical}') + + except fb_api.NoFetchedDataException as e: + logger.error(f"Error adding event {event_id}, no fetched data: {e}") + self.metrics.increment('events_failed_no_data') + except add_entities.AddEventException as e: + logger.warning(f"Error adding event {event_id}: {e}") + self.metrics.increment('events_failed_add') + + return results + + def teardown(self) -> None: + """Finalize the output.""" + if not self.dry_run and self.output_writer: + uri = self.output_writer.flush() + logger.info(f"Results written to {uri}") + + +def main( + dancey_only: bool = False, + past_event: bool = None, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the auto_add_events job. 
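+    The container's default command runs dancedeets.jobs.runner, with
+    --job=auto_add_events supplied by the Cloud Run Job configuration, so a
+    manual execution looks roughly like
+    `gcloud run jobs execute auto-add-events` (the deployed job name here is
+    an assumption).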
+ + Args: + dancey_only: If True, only process events with should_look_at=True + past_event: Filter by past_event status (True/False/None for all) + dry_run: If True, don't actually add events + """ + logger.info(f"Starting auto_add_events job: dancey_only={dancey_only}, past_event={past_event}") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + job = AutoAddEventsJob( + fb_context=fb_context, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Build filters + filters = [] + if dancey_only: + filters.append(('should_look_at', '=', True)) + if past_event is not None: + filters.append(('past_event', '=', past_event)) + + runner.run_from_datastore_batched( + entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', + filters=filters, + batch_size=10, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/base.py b/server/dancedeets/jobs/base.py new file mode 100644 index 000000000..8834f9f4a --- /dev/null +++ b/server/dancedeets/jobs/base.py @@ -0,0 +1,379 @@ +""" +Base classes for Cloud Run Jobs. + +This module provides the foundation for running batch jobs that replace +the legacy App Engine MapReduce functionality. + +Cloud Run Jobs are containerized batch tasks that: +- Run to completion (not request-response like services) +- Support parallel execution via CLOUD_RUN_TASK_INDEX +- Can run up to 24 hours +- Support automatic retries + +Usage: + class MyJob(Job): + def run(self, entity): + # Process a single entity + pass + + if __name__ == '__main__': + runner = JobRunner(MyJob()) + runner.run_from_datastore('dancedeets.events.eventdata.DBEvent') +""" + +import abc +import logging +import os +import sys +from typing import Any, Callable, Dict, Generator, Iterable, List, Optional, Type + +from google.cloud import datastore +from google.cloud import storage + +from .metrics import JobMetrics + +# Configure logging for Cloud Run Jobs +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + stream=sys.stdout, +) +logger = logging.getLogger(__name__) + + +class Job(abc.ABC): + """Base class for all Cloud Run Jobs.""" + + def __init__(self): + self.metrics = JobMetrics() + self._gcs_client: Optional[storage.Client] = None + self._datastore_client: Optional[datastore.Client] = None + + @property + def gcs_client(self) -> storage.Client: + """Lazy-loaded GCS client.""" + if self._gcs_client is None: + self._gcs_client = storage.Client() + return self._gcs_client + + @property + def datastore_client(self) -> datastore.Client: + """Lazy-loaded Datastore client.""" + if self._datastore_client is None: + self._datastore_client = datastore.Client() + return self._datastore_client + + @abc.abstractmethod + def run(self, entity: Any) -> Optional[Any]: + """ + Process a single entity. + + Args: + entity: The entity to process (from Datastore query) + + Returns: + Optional output to be collected (for jobs with output) + """ + pass + + def setup(self) -> None: + """Called once before processing entities. Override for initialization.""" + pass + + def teardown(self) -> None: + """Called once after all entities processed. 
Override for cleanup.""" + pass + + def on_batch_complete(self, batch: List[Any]) -> None: + """Called after processing a batch of entities. Override for batch operations.""" + pass + + +class BatchJob(Job): + """Job that processes entities in batches instead of one at a time.""" + + def __init__(self, batch_size: int = 20): + super().__init__() + self.batch_size = batch_size + + def run(self, entity: Any) -> Optional[Any]: + """Not used for batch jobs - override run_batch instead.""" + raise NotImplementedError("BatchJob should override run_batch, not run") + + @abc.abstractmethod + def run_batch(self, entities: List[Any]) -> Optional[List[Any]]: + """ + Process a batch of entities. + + Args: + entities: List of entities to process + + Returns: + Optional list of outputs to be collected + """ + pass + + +class JobRunner: + """ + Runs a Job against a set of entities. + + Supports: + - Datastore entity iteration with cursor-based pagination + - Parallel task execution via CLOUD_RUN_TASK_INDEX + - Output collection to GCS + - Progress logging + """ + + def __init__(self, job: Job, project_id: Optional[str] = None): + self.job = job + self.project_id = project_id or os.environ.get('GOOGLE_CLOUD_PROJECT', 'dancedeets-hrd') + self._datastore_client: Optional[datastore.Client] = None + + # Cloud Run Job environment variables + self.task_index = int(os.environ.get('CLOUD_RUN_TASK_INDEX', '0')) + self.task_count = int(os.environ.get('CLOUD_RUN_TASK_COUNT', '1')) + self.attempt_index = int(os.environ.get('CLOUD_RUN_TASK_ATTEMPT', '0')) + + logger.info( + f"JobRunner initialized: task {self.task_index + 1}/{self.task_count}, " + f"attempt {self.attempt_index + 1}" + ) + + @property + def datastore_client(self) -> datastore.Client: + """Lazy-loaded Datastore client.""" + if self._datastore_client is None: + self._datastore_client = datastore.Client(project=self.project_id) + return self._datastore_client + + def run_from_datastore( + self, + entity_kind: str, + filters: Optional[List[tuple]] = None, + batch_size: int = 100, + limit: Optional[int] = None, + ) -> None: + """ + Run the job against entities from Datastore. + + Args: + entity_kind: Full entity kind path (e.g., 'dancedeets.events.eventdata.DBEvent') + filters: Optional list of (property, operator, value) tuples + batch_size: Number of entities to fetch per query + limit: Optional maximum number of entities to process + """ + filters = filters or [] + + # Extract just the kind name (last part of the dotted path) + kind_name = entity_kind.split('.')[-1] + + logger.info(f"Starting job for entity kind: {kind_name}") + logger.info(f"Filters: {filters}") + + self.job.setup() + + try: + processed_count = 0 + output_buffer: List[Any] = [] + + for entity in self._iterate_entities(kind_name, filters, batch_size, limit): + try: + if isinstance(self.job, BatchJob): + # Batch jobs handle their own batching in _iterate_entities_batched + raise NotImplementedError("Use run_from_datastore_batched for BatchJob") + + result = self.job.run(entity) + if result is not None: + if isinstance(result, (list, tuple)): + output_buffer.extend(result) + else: + output_buffer.append(result) + + processed_count += 1 + self.job.metrics.increment('entities_processed') + + if processed_count % 100 == 0: + logger.info(f"Processed {processed_count} entities") + + except Exception as e: + logger.error(f"Error processing entity {entity.key}: {e}") + self.job.metrics.increment('entities_failed') + # Continue processing other entities + + logger.info(f"Job complete. 
Processed {processed_count} entities.") + logger.info(f"Metrics: {self.job.metrics.get_all()}") + + finally: + self.job.teardown() + + def run_from_datastore_batched( + self, + entity_kind: str, + filters: Optional[List[tuple]] = None, + batch_size: int = 20, + limit: Optional[int] = None, + ) -> None: + """ + Run a BatchJob against entities from Datastore. + + Args: + entity_kind: Full entity kind path + filters: Optional list of (property, operator, value) tuples + batch_size: Number of entities per batch (overrides job.batch_size) + limit: Optional maximum number of entities to process + """ + if not isinstance(self.job, BatchJob): + raise TypeError("run_from_datastore_batched requires a BatchJob") + + filters = filters or [] + kind_name = entity_kind.split('.')[-1] + + logger.info(f"Starting batch job for entity kind: {kind_name}") + logger.info(f"Batch size: {batch_size}, Filters: {filters}") + + self.job.setup() + + try: + processed_count = 0 + batch: List[Any] = [] + + for entity in self._iterate_entities(kind_name, filters, batch_size, limit): + batch.append(entity) + + if len(batch) >= batch_size: + self._process_batch(batch) + processed_count += len(batch) + batch = [] + + if processed_count % 100 == 0: + logger.info(f"Processed {processed_count} entities") + + # Process remaining entities + if batch: + self._process_batch(batch) + processed_count += len(batch) + + logger.info(f"Batch job complete. Processed {processed_count} entities.") + logger.info(f"Metrics: {self.job.metrics.get_all()}") + + finally: + self.job.teardown() + + def _process_batch(self, batch: List[Any]) -> None: + """Process a batch of entities.""" + try: + self.job.run_batch(batch) + self.job.metrics.increment('batches_processed') + self.job.metrics.increment('entities_processed', len(batch)) + except Exception as e: + logger.error(f"Error processing batch: {e}") + self.job.metrics.increment('batches_failed') + self.job.metrics.increment('entities_failed', len(batch)) + + def _iterate_entities( + self, + kind: str, + filters: List[tuple], + batch_size: int, + limit: Optional[int], + ) -> Generator[Any, None, None]: + """ + Iterate over Datastore entities with cursor-based pagination. + + For parallel Cloud Run Jobs, entities are distributed across tasks + using modulo on the entity key. + """ + query = self.datastore_client.query(kind=kind) + + for prop, op, value in filters: + query.add_filter(prop, op, value) + + cursor = None + total_fetched = 0 + + while True: + # Fetch a page of results + query_iter = query.fetch(start_cursor=cursor, limit=batch_size) + page = list(query_iter) + + if not page: + break + + for entity in page: + # For parallel execution, only process entities assigned to this task + if self.task_count > 1: + # Use hash of key for distribution + entity_hash = hash(str(entity.key)) + if entity_hash % self.task_count != self.task_index: + continue + + yield entity + total_fetched += 1 + + if limit and total_fetched >= limit: + return + + # Get cursor for next page + cursor = query_iter.next_page_token + if cursor is None: + break + + def write_output_to_gcs( + self, + output_lines: Iterable[str], + bucket_name: str, + blob_name: str, + content_type: str = 'text/plain', + ) -> str: + """ + Write output lines to Google Cloud Storage. 
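+
+        Sketch of the sharding behaviour for parallel tasks (bucket and blob
+        names are placeholders):
+
+            # With CLOUD_RUN_TASK_COUNT=4 and CLOUD_RUN_TASK_INDEX=2 this writes
+            # gs://my-bucket/out/results-00002.txt
+            runner.write_output_to_gcs(['line one', 'line two'], 'my-bucket', 'out/results.txt')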
+ + Args: + output_lines: Iterable of strings to write + bucket_name: GCS bucket name + blob_name: Path within the bucket + content_type: MIME type of the output + + Returns: + GCS URI of the written file + """ + gcs_client = storage.Client() + bucket = gcs_client.bucket(bucket_name) + + # Include task index in filename for parallel jobs + if self.task_count > 1: + name, ext = os.path.splitext(blob_name) + blob_name = f"{name}-{self.task_index:05d}{ext}" + + blob = bucket.blob(blob_name) + + # Write as a single string + content = '\n'.join(output_lines) + blob.upload_from_string(content, content_type=content_type) + + uri = f"gs://{bucket_name}/{blob_name}" + logger.info(f"Wrote output to {uri}") + return uri + + +def run_job( + job_class: Type[Job], + entity_kind: str, + filters: Optional[List[tuple]] = None, + **kwargs, +) -> None: + """ + Convenience function to run a job. + + This is the main entry point for Cloud Run Job containers. + + Args: + job_class: The Job class to instantiate and run + entity_kind: Datastore entity kind to process + filters: Optional query filters + **kwargs: Additional arguments passed to run_from_datastore + """ + job = job_class() + runner = JobRunner(job) + runner.run_from_datastore(entity_kind, filters=filters, **kwargs) diff --git a/server/dancedeets/jobs/classify_events_ml.py b/server/dancedeets/jobs/classify_events_ml.py new file mode 100644 index 000000000..d3b27a44b --- /dev/null +++ b/server/dancedeets/jobs/classify_events_ml.py @@ -0,0 +1,188 @@ +""" +Cloud Run Job: Classify potential events using ML prediction. + +Migrated from: dancedeets/ml/mr_prediction.py + +This job uses a trained ML model to classify potential events and +score them as likely dance events. + +Usage: + python -m dancedeets.jobs.runner --job=classify_events_ml +""" + +import logging +from typing import Optional + +from dancedeets import fb_api +from dancedeets.event_scraper import potential_events +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.ml import gprediction + +logger = logging.getLogger(__name__) + + +class ClassifyEventsMLJob(BatchJob): + """ + Job that classifies potential events using ML prediction. + + Uses Google Prediction API to score events as likely dance events. 
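+
+    Events scoring above 0.5 in both bias models are reported one per output
+    line in the form id:match_score:dance_bias:non_dance_bias, for example
+    (values illustrative):
+
+        1234567890:45:0.82:0.61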
+ """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + ): + super().__init__(batch_size=20) + self.fb_context = fb_context + self.bucket_name = bucket_name + self.dry_run = dry_run + self.output_writer = None + self._predict_service = None + logger.info("ClassifyEventsMLJob initialized") + + @property + def predict_service(self): + """Lazy-loaded prediction service.""" + if self._predict_service is None: + self._predict_service = gprediction.get_predict_service() + return self._predict_service + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name='ml/classification_results.txt', + content_type='text/plain', + ) + + def run_batch(self, pe_list: list) -> None: + """Process a batch of potential events.""" + # Filter to events with match_score > 0 + pe_list = [x for x in pe_list if x.match_score > 0] + if not pe_list: + self.metrics.increment('batches_empty') + return + + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + + # Get events that don't already have scores + pe_ids = [x.fb_event_id for x in pe_list if not getattr(x, 'dance_bias_score', None)] + if pe_ids: + fbl.request_multi(fb_api.LookupEvent, pe_ids) + fbl.request_multi(fb_api.LookupEventAttending, pe_ids) + + try: + fbl.batch_fetch() + except Exception as e: + logger.error(f"Error fetching Facebook data: {e}") + self.metrics.increment('batches_failed_fb') + return + + results = [] + for pe in pe_list: + if not getattr(pe, 'dance_bias_score', None): + try: + fb_event = fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id) + fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, pe.fb_event_id) + except fb_api.NoFetchedDataException: + self.metrics.increment('events_skipped_no_data') + continue + + if fb_event.get('empty'): + self.metrics.increment('events_skipped_empty') + continue + + # Score the event + if self.dry_run: + logger.info(f"[DRY RUN] Would classify event {pe.fb_event_id}") + self.metrics.increment('events_would_classify') + else: + pe = potential_events.update_scores_for_potential_event( + pe, fb_event, fb_event_attending, self.predict_service + ) + self.metrics.increment('events_classified') + + logger.debug( + f"{pe.fb_event_id}: ms={pe.match_score}, " + f"d={pe.dance_bias_score}, nd={pe.non_dance_bias_score}" + ) + + # Report events with high scores in both models + if (getattr(pe, 'dance_bias_score', 0) or 0) > 0.5 and \ + (getattr(pe, 'non_dance_bias_score', 0) or 0) > 0.5: + result = f"{pe.fb_event_id}:{pe.match_score}:{pe.dance_bias_score}:{pe.non_dance_bias_score}\n" + results.append(result) + self.metrics.increment('events_high_score') + + # Write results + if results and not self.dry_run: + for result in results: + self.output_writer.write(result) + + self.metrics.increment('batches_processed') + + def teardown(self) -> None: + """Finalize the output.""" + if not self.dry_run and self.output_writer: + uri = self.output_writer.flush() + logger.info(f"Classification results written to {uri}") + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the classify_events_ml job. 
+ + Args: + dry_run: If True, don't actually classify or save + """ + logger.info("Starting classify_events_ml job") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + job = ClassifyEventsMLJob( + fb_context=fb_context, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Only process events that haven't been looked at + filters = [ + ('looked_at', '=', None), + ] + + runner.run_from_datastore_batched( + entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', + filters=filters, + batch_size=20, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/compute_rankings.py b/server/dancedeets/jobs/compute_rankings.py new file mode 100644 index 000000000..546555012 --- /dev/null +++ b/server/dancedeets/jobs/compute_rankings.py @@ -0,0 +1,219 @@ +""" +Cloud Run Job: Compute city/country rankings by events and users. + +Migrated from: dancedeets/rankings/rankings.py + +This job counts events and users by city/country for ranking calculations. +Results are stored in memcache for display on the website. + +Usage: + python -m dancedeets.jobs.runner --job=compute_rankings --ranking_type=events --vertical=STREET + python -m dancedeets.jobs.runner --job=compute_rankings --ranking_type=users +""" + +import datetime +import logging + +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.metrics import GroupedMetrics, JobMetrics, set_current_metrics +from dancedeets.rankings import rankings +from dancedeets.util import memcache + +logger = logging.getLogger(__name__) + +# Time period definitions (from rankings.py) +LAST_WEEK = "LAST_WEEK" +LAST_MONTH = "LAST_MONTH" +ALL_TIME = "ALL_TIME" + + +def get_time_periods(timestamp): + """Get applicable time periods for a given timestamp.""" + now = datetime.datetime.now() + if timestamp > now - datetime.timedelta(days=7): + yield LAST_WEEK + if timestamp > now - datetime.timedelta(days=31): + yield LAST_MONTH + yield ALL_TIME + + +class ComputeEventRankingsJob(Job): + """ + Job that counts events by city for rankings. + + Iterates over all events (optionally filtered by vertical) and + counts them by city and country for different time periods. 
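+
+    Shape of the aggregated counts this job builds up (cities and values
+    illustrative):
+
+        city_counts = {
+            'Tokyo': {'LAST_WEEK': 3, 'LAST_MONTH': 10, 'ALL_TIME': 42},
+            'Paris': {'ALL_TIME': 7},
+        }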
+ """ + + def __init__(self, vertical: str = None, dry_run: bool = False): + super().__init__() + self.vertical = vertical + self.dry_run = dry_run + self.city_counts = GroupedMetrics() + self.country_counts = GroupedMetrics() + logger.info(f"ComputeEventRankingsJob initialized for vertical={vertical}") + + def run(self, dbevent) -> None: + """Process a single event.""" + if not dbevent.start_time: # deleted event, don't count + self.metrics.increment('events_skipped_deleted') + return + + if not dbevent.latitude or not dbevent.longitude: + self.metrics.increment('events_skipped_no_location') + return + + city = dbevent.city_name + country = dbevent.country + + # Determine which time periods this event counts for + timestamp = dbevent.creation_time or dbevent.start_time + for time_period in get_time_periods(timestamp): + if city: + self.city_counts.increment(city, time_period) + if country: + self.country_counts.increment(country, time_period) + + self.metrics.increment('events_processed') + + def teardown(self) -> None: + """Save rankings to memcache after processing.""" + if self.dry_run: + logger.info("[DRY RUN] Would save rankings to memcache") + logger.info(f"City counts: {len(self.city_counts.get_all_groups())} cities") + logger.info(f"Country counts: {len(self.country_counts.get_all_groups())} countries") + return + + # Store city rankings + city_rankings = {} + for city, periods in self.city_counts.get_all_groups().items(): + city_rankings[city] = periods + + # Store country rankings + country_rankings = {} + for country, periods in self.country_counts.get_all_groups().items(): + country_rankings[country] = periods + + # Save to memcache (similar to _compute_summary) + vertical_key = f":{self.vertical}" if self.vertical else "" + memcache.set( + f"CityEventRankings{vertical_key}", + city_rankings, + rankings.TOTALS_EXPIRY, + ) + memcache.set( + f"CountryEventRankings{vertical_key}", + country_rankings, + rankings.TOTALS_EXPIRY, + ) + + logger.info(f"Saved rankings for {len(city_rankings)} cities, {len(country_rankings)} countries") + + # Update the totals summary + total_events = sum( + periods.get(ALL_TIME, 0) + for periods in city_rankings.values() + ) + logger.info(f"Total events (all time): {total_events}") + + +class ComputeUserRankingsJob(Job): + """ + Job that counts users by city for rankings. + + Iterates over all users and counts them by city for different time periods. 
+ """ + + def __init__(self, dry_run: bool = False): + super().__init__() + self.dry_run = dry_run + self.city_counts = GroupedMetrics() + logger.info("ComputeUserRankingsJob initialized") + + def run(self, user) -> None: + """Process a single user.""" + user_city = user.city_name + if not user_city: + self.metrics.increment('users_skipped_no_city') + return + + timestamp = user.creation_time + if not timestamp: + # Use ALL_TIME if no creation time + self.city_counts.increment(user_city, ALL_TIME) + else: + for time_period in get_time_periods(timestamp): + self.city_counts.increment(user_city, time_period) + + self.metrics.increment('users_processed') + + def teardown(self) -> None: + """Save rankings to memcache after processing.""" + if self.dry_run: + logger.info("[DRY RUN] Would save user rankings to memcache") + logger.info(f"City counts: {len(self.city_counts.get_all_groups())} cities") + return + + # Store city rankings + city_rankings = {} + for city, periods in self.city_counts.get_all_groups().items(): + city_rankings[city] = periods + + memcache.set( + "CityUserRankings", + city_rankings, + rankings.TOTALS_EXPIRY, + ) + + logger.info(f"Saved user rankings for {len(city_rankings)} cities") + + # Update the totals summary + total_users = sum( + periods.get(ALL_TIME, 0) + for periods in city_rankings.values() + ) + logger.info(f"Total users (all time): {total_users}") + + +def main( + ranking_type: str = 'events', + vertical: str = None, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the compute_rankings job. + + Args: + ranking_type: 'events' or 'users' + vertical: Optional vertical filter (e.g., 'STREET') for events + dry_run: If True, don't save to memcache + """ + logger.info(f"Starting compute_rankings job: type={ranking_type}, vertical={vertical}") + + if ranking_type == 'events': + job = ComputeEventRankingsJob(vertical=vertical, dry_run=dry_run) + entity_kind = 'dancedeets.events.eventdata.DBEvent' + filters = [] + if vertical: + filters.append(('verticals', '=', vertical)) + elif ranking_type == 'users': + job = ComputeUserRankingsJob(dry_run=dry_run) + entity_kind = 'dancedeets.users.users.User' + filters = [] + else: + raise ValueError(f"Unknown ranking_type: {ranking_type}") + + set_current_metrics(job.metrics) + runner = JobRunner(job) + + runner.run_from_datastore( + entity_kind=entity_kind, + filters=filters, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/compute_user_stats.py b/server/dancedeets/jobs/compute_user_stats.py new file mode 100644 index 000000000..b7ba4762e --- /dev/null +++ b/server/dancedeets/jobs/compute_user_stats.py @@ -0,0 +1,151 @@ +""" +Cloud Run Job: Compute user event statistics. + +Migrated from: dancedeets/users/user_event_tasks.py + +This job calculates statistics for each user about how many events +they've contributed (auto-added, hand-added, etc.). + +Usage: + python -m dancedeets.jobs.runner --job=compute_user_stats +""" + +import logging + +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.events import eventdata +from dancedeets.event_scraper import potential_events + +logger = logging.getLogger(__name__) + + +def update_user_qualities(user) -> dict: + """ + Calculate and update user event contribution statistics. 
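+
+    Example return value (numbers illustrative):
+
+        {'auto_added': 12, 'auto_added_own': 4, 'hand_added': 6, 'hand_added_own': 5}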
+ + Returns: + Dict with the calculated statistics + """ + # Query potential events where this user is a source + # STR_ID_MIGRATE: using long() for fb_uid + try: + fb_uid_long = int(user.fb_uid) + except (ValueError, TypeError): + fb_uid_long = user.fb_uid + + source_potential_events = potential_events.PotentialEvent.gql( + 'WHERE source_ids = :graph_id', + graph_id=fb_uid_long + ).fetch(1000) + + # Get the actual events that were added + added_events = eventdata.DBEvent.get_by_ids( + [x.fb_event_id for x in source_potential_events] + ) + + # Count auto-added events + num_auto_added = len([ + x for x in added_events + if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] + ]) + + # Count auto-added events owned by this user + num_auto_added_own = len([ + x for x in added_events + if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] + and x.owner_fb_uid == user.fb_uid + ]) + + # Count hand-added events (created by this user) + # STR_ID_MIGRATE + num_hand_added = len([ + x for x in added_events + if x and x.creating_method == eventdata.CM_USER + and str(x.creating_fb_uid) == user.fb_uid + ]) + + # Count hand-added events owned by this user + # STR_ID_MIGRATE + num_hand_added_own = len([ + x for x in added_events + if x and x.creating_method == eventdata.CM_USER + and str(x.creating_fb_uid) == user.fb_uid + and x.owner_fb_uid == user.fb_uid + ]) + + # Update user properties + user.num_auto_added_events = num_auto_added + user.num_auto_added_own_events = num_auto_added_own + user.num_hand_added_events = num_hand_added + user.num_hand_added_own_events = num_hand_added_own + + return { + 'auto_added': num_auto_added, + 'auto_added_own': num_auto_added_own, + 'hand_added': num_hand_added, + 'hand_added_own': num_hand_added_own, + } + + +class ComputeUserStatsJob(Job): + """ + Job that computes event statistics for each user. + + For each user, counts: + - Auto-added events (via ML classifier) + - Hand-added events (manually added by user) + - Events owned by the user + """ + + def __init__(self, dry_run: bool = False): + super().__init__() + self.dry_run = dry_run + logger.info("ComputeUserStatsJob initialized") + + def run(self, user) -> None: + """Process a single user.""" + try: + stats = update_user_qualities(user) + + if self.dry_run: + logger.info( + f"[DRY RUN] User {user.fb_uid}: " + f"auto={stats['auto_added']}, hand={stats['hand_added']}" + ) + self.metrics.increment('users_would_update') + else: + user.put() + self.metrics.increment('users_updated') + + # Track totals + self.metrics.increment('total_auto_added', stats['auto_added']) + self.metrics.increment('total_hand_added', stats['hand_added']) + + except Exception as e: + logger.error(f"Error processing user {user.fb_uid}: {e}") + self.metrics.increment('users_failed') + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the compute_user_stats job. 
+ + Args: + dry_run: If True, don't save changes to users + """ + logger.info("Starting compute_user_stats job") + + job = ComputeUserStatsJob(dry_run=dry_run) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore( + entity_kind='dancedeets.users.users.User', + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/count_unique_attendees.py b/server/dancedeets/jobs/count_unique_attendees.py new file mode 100644 index 000000000..ca7a50891 --- /dev/null +++ b/server/dancedeets/jobs/count_unique_attendees.py @@ -0,0 +1,195 @@ +""" +Cloud Run Job: Count unique attendees per city. + +Migrated from: dancedeets/logic/unique_attendees.py + +This job counts unique RSVPs across all events, grouped by city and country. + +Usage: + python -m dancedeets.jobs.runner --job=count_unique_attendees +""" + +import logging +from collections import defaultdict +from typing import Dict, Optional, Set + +from dancedeets import fb_api +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + +BATCH_SIZE = 20 + + +class CountUniqueAttendeesJob(BatchJob): + """ + Job that counts unique attendees per city/country. + + This is a "reduce" style job that aggregates attendees across all events. + Uses in-memory aggregation instead of MapReduce framework. + """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + ): + super().__init__(batch_size=BATCH_SIZE) + self.fb_context = fb_context + self.bucket_name = bucket_name + self.dry_run = dry_run + self.output_writer = None + + # In-memory aggregation (replaces MapReduce reduce step) + # Maps location -> set of attendee IDs + self.city_attendees: Dict[str, Set[str]] = defaultdict(set) + self.country_attendees: Dict[str, Set[str]] = defaultdict(set) + + # For counting total RSVPs (not unique) + self.city_rsvp_count: Dict[str, int] = defaultdict(int) + self.country_rsvp_count: Dict[str, int] = defaultdict(int) + + logger.info("CountUniqueAttendeesJob initialized") + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name='analytics/unique_attendees.txt', + content_type='text/plain', + ) + + def run_batch(self, db_events: list) -> None: + """Process a batch of events.""" + # Filter to Facebook events only + db_events = [x for x in db_events if x.is_fb_event] + if not db_events: + self.metrics.increment('batches_empty') + return + + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + + # Request attending data + fbl.request_multi(fb_api.LookupEventAttending, [x.fb_event_id for x in db_events]) + + try: + fbl.batch_fetch() + except Exception as e: + logger.error(f"Error fetching Facebook data: {e}") + self.metrics.increment('batches_failed_fb') + return + + for db_event in db_events: + try: + fb_event_attending = fbl.fetched_data( + fb_api.LookupEventAttending, db_event.id + ) + except fb_api.NoFetchedDataException: + logger.warning(f'No attending found for {db_event.id}') + self.metrics.increment('events_no_attending') + 
continue + + if fb_event_attending.get('empty'): + self.metrics.increment('events_empty_attending') + continue + + attendees = fb_event_attending.get('attending', {}).get('data', []) + city = db_event.city_name + country = db_event.country + + for attendee in attendees: + attendee_id = attendee['id'] + + if city: + self.city_attendees[city].add(attendee_id) + self.city_rsvp_count[city] += 1 + + if country: + self.country_attendees[country].add(attendee_id) + self.country_rsvp_count[country] += 1 + + self.metrics.increment('events_processed') + self.metrics.increment('attendees_processed', len(attendees)) + + self.metrics.increment('batches_processed') + + def teardown(self) -> None: + """Write final results.""" + if self.dry_run: + logger.info("[DRY RUN] Would write attendee counts") + logger.info(f" Cities: {len(self.city_attendees)}") + logger.info(f" Countries: {len(self.country_attendees)}") + return + + # Write city results + for city in sorted(self.city_attendees.keys()): + unique_count = len(self.city_attendees[city]) + total_count = self.city_rsvp_count[city] + self.output_writer.write(f"Unique Attendees in City: {city}: {unique_count}") + self.output_writer.write(f"Total RSVPs in City: {city}: {total_count}") + + # Write country results + for country in sorted(self.country_attendees.keys()): + unique_count = len(self.country_attendees[country]) + total_count = self.country_rsvp_count[country] + self.output_writer.write(f"Unique Attendees in Country: {country}: {unique_count}") + self.output_writer.write(f"Total RSVPs in Country: {country}: {total_count}") + + uri = self.output_writer.flush() + logger.info(f"Results written to {uri}") + + # Log summary + total_unique = sum(len(s) for s in self.city_attendees.values()) + logger.info(f"Total unique attendees across all cities: {total_unique}") + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the count_unique_attendees job. + + Args: + dry_run: If True, don't write to GCS + """ + logger.info("Starting count_unique_attendees job") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + job = CountUniqueAttendeesJob( + fb_context=fb_context, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore_batched( + entity_kind='dancedeets.events.eventdata.DBEvent', + batch_size=BATCH_SIZE, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/dump_potential_events.py b/server/dancedeets/jobs/dump_potential_events.py new file mode 100644 index 000000000..7bf2ab007 --- /dev/null +++ b/server/dancedeets/jobs/dump_potential_events.py @@ -0,0 +1,160 @@ +""" +Cloud Run Job: Dump potential events data to GCS. + +Migrated from: dancedeets/logic/mr_dump.py + +This job exports potential event data (from Facebook API) to CSV format +in Google Cloud Storage for analysis or ML training. 
+ +Usage: + python -m dancedeets.jobs.runner --job=dump_potential_events +""" + +import csv +import io +import json +import logging +from typing import Optional + +from dancedeets import fb_api +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + + +class DumpPotentialEventsJob(BatchJob): + """ + Job that dumps potential event data to GCS. + + Fetches event data from Facebook and writes as CSV. + """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + ): + super().__init__(batch_size=80) + self.fb_context = fb_context + self.bucket_name = bucket_name + self.dry_run = dry_run + self.output_writer = None + logger.info("DumpPotentialEventsJob initialized") + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name='exports/potential_events.csv', + content_type='text/csv', + ) + + def run_batch(self, pe_list: list) -> None: + """Process a batch of potential events.""" + # Filter to events with match_score > 0 + pe_list = [x for x in pe_list if x.match_score > 0] + if not pe_list: + self.metrics.increment('batches_empty') + return + + # Get Facebook lookup + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + + # Request event data from Facebook + fbl.request_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list]) + + try: + fbl.batch_fetch() + except Exception as e: + logger.error(f"Error fetching Facebook data: {e}") + self.metrics.increment('batches_failed_fb') + return + + # Build CSV output + csv_file = io.StringIO() + csv_writer = csv.writer(csv_file) + + for pe in pe_list: + try: + result = json.dumps(fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id)) + cache_key = fbl.key_to_cache_key( + fb_api.generate_key(fb_api.LookupEvent, pe.fb_event_id) + ) + csv_writer.writerow([cache_key, result]) + self.metrics.increment('events_exported') + except fb_api.NoFetchedDataException: + logger.error(f"Skipping row for event id {pe.fb_event_id}") + self.metrics.increment('events_skipped_no_data') + + # Write to GCS + if self.dry_run: + logger.info(f"[DRY RUN] Would write {len(pe_list)} events to GCS") + else: + self.output_writer.write(csv_file.getvalue()) + + self.metrics.increment('batches_processed') + + def teardown(self) -> None: + """Finalize the output.""" + if not self.dry_run and self.output_writer: + uri = self.output_writer.flush() + logger.info(f"Export written to {uri}") + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the dump_potential_events job. 
+ + Args: + dry_run: If True, don't write to GCS + """ + logger.info("Starting dump_potential_events job") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=False, # Don't pollute cache with this data + ) if tokens else None + + job = DumpPotentialEventsJob( + fb_context=fb_context, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Only process events that haven't been looked at + filters = [ + ('looked_at', '=', None), + ] + + runner.run_from_datastore_batched( + entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', + filters=filters, + batch_size=80, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/email_crawl_errors.py b/server/dancedeets/jobs/email_crawl_errors.py new file mode 100644 index 000000000..a70b70fab --- /dev/null +++ b/server/dancedeets/jobs/email_crawl_errors.py @@ -0,0 +1,175 @@ +""" +Cloud Run Job: Email crawl errors from spider jobs. + +Migrated from: dancedeets/classes/class_pipeline.py (EmailErrors) + +This job collects errors from completed spider jobs and sends +an email report. + +Usage: + python -m dancedeets.jobs.runner --job=email_crawl_errors --job_keys=key1,key2 +""" + +import datetime +import json +import logging +import os +from typing import Dict, List + +import scrapinghub + +from dancedeets import keys +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.mail import mandrill_api + +logger = logging.getLogger(__name__) + + +def get_shub_project(): + """Get ScrapingHub project connection.""" + api_key = keys.get('scrapinghub_key') + conn = scrapinghub.Connection(api_key) + project = scrapinghub.Project(conn, 27474) + return project + + +def collect_errors(job_keys: List[str]) -> Dict[str, List[str]]: + """ + Collect errors from completed spider jobs. + + Args: + job_keys: List of ScrapingHub job keys + + Returns: + Dict mapping spider name to list of error messages + """ + project = get_shub_project() + error_lines: Dict[str, List[str]] = {} + + for job_key in job_keys: + try: + job = project.job(job_key) + spider_name = job.info.get('spider', job_key) + + # Check for no items scraped + if not job.info.get('items_scraped'): + error_lines.setdefault(spider_name, []).append( + 'Could not find any items.' + ) + + # Collect error-level log entries + for line in job.log(): + if line.get('level', 0) >= 40: # ERROR level + error_lines.setdefault(spider_name, []).append( + line.get('message', str(line)) + ) + + except Exception as e: + logger.error(f"Error collecting logs for {job_key}: {e}") + error_lines.setdefault(job_key, []).append(f"Error collecting logs: {e}") + + return error_lines + + +def send_error_email( + error_lines: Dict[str, List[str]], + run_time: datetime.datetime, + dry_run: bool = False, +) -> None: + """ + Send email report of crawl errors. 
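+
+    Example error_lines structure (spider names and messages illustrative):
+
+        {
+            'classes_nyc': ['Could not find any items.'],
+            'classes_tokyo': ['Error collecting logs: timeout'],
+        }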
+ + Args: + error_lines: Dict mapping spider name to error messages + run_time: When the crawl started + dry_run: If True, don't actually send email + """ + if not error_lines: + logger.info("No errors to report") + return + + # Build email body + rendered = ["The following crawl errors occurred:"] + for crawler, errors in sorted(error_lines.items()): + rendered.append(f"\n{crawler}:") + rendered.extend(f" - {error}" for error in errors) + + body = '\n'.join(rendered) + logger.warning(body) + + if dry_run: + logger.info("[DRY RUN] Would send error email") + return + + subject = f"Crawl Errors for {run_time.strftime('%b %d, %Y: %H:%M')}" + message = { + 'from_email': 'reports@dancedeets.com', + 'from_name': 'DanceDeets Reports', + 'subject': subject, + 'to': [{ + 'email': 'reports@dancedeets.com', + 'name': 'DanceDeets Reports', + 'type': 'to', + }], + 'text': body, + } + + try: + mandrill_api.send_message(message) + logger.info("Error report email sent") + except Exception as e: + logger.error(f"Error sending email: {e}") + + +def main( + job_keys: str = None, + run_time: str = None, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the email_crawl_errors job. + + Args: + job_keys: Comma-separated list of ScrapingHub job keys + run_time: ISO format timestamp of when crawl started + dry_run: If True, don't send email + """ + logger.info("Starting email_crawl_errors job") + + metrics = JobMetrics() + set_current_metrics(metrics) + + # Parse job keys + if not job_keys: + logger.warning("No job_keys provided") + return + + if isinstance(job_keys, str): + keys_list = [k.strip() for k in job_keys.split(',') if k.strip()] + else: + keys_list = job_keys + + logger.info(f"Checking {len(keys_list)} jobs for errors") + + # Parse run time + if run_time: + try: + crawl_time = datetime.datetime.fromisoformat(run_time) + except ValueError: + crawl_time = datetime.datetime.now() + else: + crawl_time = datetime.datetime.now() + + # Collect and send errors + error_lines = collect_errors(keys_list) + metrics.increment('spiders_checked', len(keys_list)) + metrics.increment('spiders_with_errors', len(error_lines)) + + send_error_email(error_lines, crawl_time, dry_run=dry_run) + + metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/fb_utils.py b/server/dancedeets/jobs/fb_utils.py new file mode 100644 index 000000000..1aefa8efc --- /dev/null +++ b/server/dancedeets/jobs/fb_utils.py @@ -0,0 +1,185 @@ +""" +Facebook API utilities for Cloud Run Jobs. + +Ported from dancedeets.util.fb_mapreduce to work with the new +Cloud Run Jobs framework instead of MapReduce. +""" + +import datetime +import logging +import random +from typing import Any, Dict, List, Optional + +from dancedeets import fb_api +from dancedeets.users import access_tokens + +logger = logging.getLogger(__name__) + + +class FBJobContext: + """ + Context for Facebook API access within a job. + + Replaces the MapReduce context-based token storage with explicit + parameter passing. 
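+
+    Example (a sketch; the uid and token values are placeholders):
+
+        ctx = FBJobContext(fb_uid='system', access_tokens=['token-a', 'token-b'])
+        fbl = ctx.get_fblookup()
+        fbl.request_multi(fb_api.LookupEvent, ['1234567890'])
+        fbl.batch_fetch()
+        event = fbl.fetched_data(fb_api.LookupEvent, '1234567890')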
+ """ + + def __init__( + self, + fb_uid: str, + access_token: Optional[str] = None, + access_tokens: Optional[List[str]] = None, + allow_cache: bool = True, + oldest_allowed: Optional[datetime.datetime] = None, + ): + self.fb_uid = fb_uid + self._access_token = access_token + self._access_tokens = access_tokens or [] + self.allow_cache = allow_cache + self.oldest_allowed = oldest_allowed + + @property + def access_token(self) -> str: + """Get an access token, randomly selecting from pool if available.""" + if self._access_tokens: + return random.choice(self._access_tokens) + return self._access_token or '' + + def get_fblookup(self, user: Optional[Any] = None) -> fb_api.FBLookup: + """ + Create an FBLookup instance for API calls. + + Args: + user: Optional user object with fb_uid and fb_access_token + + Returns: + Configured FBLookup instance + """ + if user: + uid = user.fb_uid + token = user.fb_access_token or self.access_token + else: + uid = self.fb_uid + token = self.access_token + + fbl = fb_api.FBLookup(uid, token) + fbl.allow_cache = self.allow_cache + + if self.oldest_allowed is not None: + fbl.db.oldest_allowed = self.oldest_allowed + + return fbl + + +def get_fblookup_params( + fbl: fb_api.FBLookup, + randomize_tokens: bool = False, + token_count: int = 50, +) -> Dict[str, Any]: + """ + Extract parameters from an FBLookup for job configuration. + + This creates a serializable dict that can be passed to job constructors. + + Args: + fbl: Source FBLookup instance + randomize_tokens: If True, fetch multiple tokens for rotation + token_count: Number of tokens to fetch when randomizing + + Returns: + Dict of parameters for FBJobContext + """ + params = { + 'fb_uid': fbl.fb_uid, + 'allow_cache': fbl.allow_cache, + } + + if fbl.db.oldest_allowed != datetime.datetime.min: + params['oldest_allowed'] = fbl.db.oldest_allowed + + if randomize_tokens: + tokens = get_multiple_tokens(token_count=token_count) + logger.info(f'Found {len(tokens)} valid tokens') + if len(tokens) == 0: + raise Exception('No Valid Tokens') + params['access_tokens'] = tokens + else: + params['access_token'] = fbl.access_token + + return params + + +def get_multiple_tokens(token_count: int = 50) -> List[str]: + """ + Get multiple valid access tokens for token rotation. + + For long-running jobs, using multiple tokens helps avoid rate limiting. + + Args: + token_count: Maximum number of tokens to return + + Returns: + List of valid access token strings + """ + return access_tokens.get_multiple_tokens(token_count=token_count) + + +def get_fblookup( + fb_uid: str, + access_token: Optional[str] = None, + access_tokens: Optional[List[str]] = None, + allow_cache: bool = True, + oldest_allowed: Optional[datetime.datetime] = None, + user: Optional[Any] = None, +) -> fb_api.FBLookup: + """ + Create an FBLookup instance for API calls. + + This is a convenience function that mirrors the old MapReduce pattern. 
+ + Args: + fb_uid: Facebook user ID + access_token: Single access token + access_tokens: List of tokens for rotation + allow_cache: Whether to use caching + oldest_allowed: Oldest cache entry to accept + user: Optional user object to get token from + + Returns: + Configured FBLookup instance + """ + ctx = FBJobContext( + fb_uid=fb_uid, + access_token=access_token, + access_tokens=access_tokens, + allow_cache=allow_cache, + oldest_allowed=oldest_allowed, + ) + return ctx.get_fblookup(user=user) + + +def create_fb_context_from_fbl( + fbl: fb_api.FBLookup, + randomize_tokens: bool = False, +) -> FBJobContext: + """ + Create an FBJobContext from an existing FBLookup. + + Useful when starting a job from a web request handler that already + has an authenticated FBLookup. + + Args: + fbl: Source FBLookup instance + randomize_tokens: If True, fetch multiple tokens for rotation + + Returns: + FBJobContext configured from the FBLookup + """ + params = get_fblookup_params(fbl, randomize_tokens=randomize_tokens) + return FBJobContext( + fb_uid=params['fb_uid'], + access_token=params.get('access_token'), + access_tokens=params.get('access_tokens'), + allow_cache=params['allow_cache'], + oldest_allowed=params.get('oldest_allowed'), + ) diff --git a/server/dancedeets/jobs/gcs_output.py b/server/dancedeets/jobs/gcs_output.py new file mode 100644 index 000000000..34f4f4e93 --- /dev/null +++ b/server/dancedeets/jobs/gcs_output.py @@ -0,0 +1,167 @@ +""" +Google Cloud Storage output utilities for Cloud Run Jobs. + +Provides helpers for writing job output to GCS, replacing the +MapReduce GoogleCloudStorageOutputWriter. +""" + +import datetime +import logging +import os +from typing import Iterable, Optional + +from google.cloud import storage + +logger = logging.getLogger(__name__) + +DEFAULT_BUCKET = 'dancedeets-hrd.appspot.com' + + +class GCSOutputWriter: + """ + Writes job output to Google Cloud Storage. + + Usage: + with GCSOutputWriter(bucket, 'output/results.txt') as writer: + for line in results: + writer.write(line) + """ + + def __init__( + self, + bucket_name: str = DEFAULT_BUCKET, + blob_name: Optional[str] = None, + content_type: str = 'text/plain', + include_task_index: bool = True, + ): + self.bucket_name = bucket_name + self._blob_name = blob_name + self.content_type = content_type + self.include_task_index = include_task_index + self._buffer: list = [] + self._client: Optional[storage.Client] = None + + @property + def blob_name(self) -> str: + """Get the blob name, optionally including task index.""" + if self._blob_name is None: + timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + job_name = os.environ.get('CLOUD_RUN_JOB', 'job') + self._blob_name = f"jobs/{job_name}/{timestamp}/output.txt" + + if self.include_task_index: + task_count = int(os.environ.get('CLOUD_RUN_TASK_COUNT', '1')) + if task_count > 1: + task_index = int(os.environ.get('CLOUD_RUN_TASK_INDEX', '0')) + name, ext = os.path.splitext(self._blob_name) + return f"{name}-{task_index:05d}{ext}" + + return self._blob_name + + @property + def client(self) -> storage.Client: + """Lazy-loaded GCS client.""" + if self._client is None: + self._client = storage.Client() + return self._client + + def write(self, line: str) -> None: + """Write a line to the buffer.""" + self._buffer.append(line) + + def write_all(self, lines: Iterable[str]) -> None: + """Write multiple lines to the buffer.""" + self._buffer.extend(lines) + + def flush(self) -> str: + """ + Flush the buffer to GCS. 
+ + Returns: + GCS URI of the written file + """ + if not self._buffer: + logger.warning("No content to write to GCS") + return "" + + bucket = self.client.bucket(self.bucket_name) + blob = bucket.blob(self.blob_name) + + content = '\n'.join(str(line) for line in self._buffer) + blob.upload_from_string(content, content_type=self.content_type) + + uri = f"gs://{self.bucket_name}/{self.blob_name}" + logger.info(f"Wrote {len(self._buffer)} lines to {uri}") + + self._buffer = [] + return uri + + def __enter__(self) -> 'GCSOutputWriter': + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + if self._buffer: + self.flush() + + +def write_to_gcs( + content: str, + bucket_name: str = DEFAULT_BUCKET, + blob_name: Optional[str] = None, + content_type: str = 'text/plain', +) -> str: + """ + Convenience function to write content directly to GCS. + + Args: + content: String content to write + bucket_name: GCS bucket name + blob_name: Path within the bucket + content_type: MIME type + + Returns: + GCS URI of the written file + """ + writer = GCSOutputWriter(bucket_name, blob_name, content_type) + writer.write(content) + return writer.flush() + + +def read_from_gcs( + bucket_name: str, + blob_name: str, +) -> str: + """ + Read content from a GCS file. + + Args: + bucket_name: GCS bucket name + blob_name: Path within the bucket + + Returns: + File contents as string + """ + client = storage.Client() + bucket = client.bucket(bucket_name) + blob = bucket.blob(blob_name) + return blob.download_as_text() + + +def list_gcs_blobs( + bucket_name: str, + prefix: str, +) -> list: + """ + List blobs in a GCS bucket with a given prefix. + + Args: + bucket_name: GCS bucket name + prefix: Blob name prefix to filter by + + Returns: + List of blob names + """ + client = storage.Client() + bucket = client.bucket(bucket_name) + blobs = bucket.list_blobs(prefix=prefix) + return [blob.name for blob in blobs] diff --git a/server/dancedeets/jobs/generate_sitemaps.py b/server/dancedeets/jobs/generate_sitemaps.py new file mode 100644 index 000000000..1af259678 --- /dev/null +++ b/server/dancedeets/jobs/generate_sitemaps.py @@ -0,0 +1,219 @@ +""" +Cloud Run Job: Generate XML sitemaps for search engine indexing. + +Migrated from: dancedeets/sitemaps/events.py + +This job generates XML sitemap files for all events and uploads them +to Google Cloud Storage for search engine crawlers. + +Usage: + python -m dancedeets.jobs.runner --job=generate_sitemaps + python -m dancedeets.jobs.runner --job=generate_sitemaps --vertical=STREET --time_period=FUTURE +""" + +import datetime +import logging +from typing import Optional + +from lxml import etree + +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.util import urls + +logger = logging.getLogger(__name__) + + +def generate_sitemap_entry(event) -> Optional[str]: + """ + Generate a single sitemap XML entry for an event. 
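+
+    Example of a returned entry, serialized on a single line (URL and timestamp
+    illustrative):
+
+        <url><loc>https://www.dancedeets.com/events/123/</loc><lastmod>2017-06-01T10:00:00+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url>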
+ + Args: + event: DBEvent instance + + Returns: + XML string for the URL entry, or None if event should be skipped + """ + if not event.has_content(): + return None + + url_node = etree.Element('url') + + # Location + loc_node = etree.Element('loc') + loc_node.text = urls.dd_event_url(event) + url_node.append(loc_node) + + # Last modified (from Facebook updated_time) + if event.is_fb_event: + if 'updated_time' in event.fb_event.get('info', {}): + lastmod_node = etree.Element('lastmod') + updated = event.fb_event['info']['updated_time'] + updated = updated.replace('+0000', '+00:00') + lastmod_node.text = updated + url_node.append(lastmod_node) + else: + logger.debug(f'Event {event.id} does not have updated_time') + + # Calculate timing-based metadata + if event.end_time: + end_time = event.end_time + else: + end_time = event.start_time + datetime.timedelta(hours=2) + + start_time_delta = event.start_time - datetime.datetime.now() + end_time_delta = end_time - datetime.datetime.now() + event_delta = end_time - event.start_time + + # Change frequency and priority + changefreq_node = etree.Element('changefreq') + priority_node = etree.Element('priority') + priority_node.text = '0.5' + + # Event is active and not a multi-week event + if event_delta.days < 7 and start_time_delta.days <= 1 and end_time_delta.days >= 0: + changefreq_node.text = 'hourly' + # If it ended awhile ago + elif end_time_delta.days < -30: + changefreq_node.text = 'yearly' + priority_node.text = '0.1' + elif end_time_delta.days < -10: + changefreq_node.text = 'weekly' + # If it's coming up soon + elif start_time_delta.days < 30: + changefreq_node.text = 'daily' + else: + changefreq_node.text = 'weekly' + + url_node.append(changefreq_node) + url_node.append(priority_node) + + # Return as single line + return etree.tostring(url_node, encoding='unicode') + + +class GenerateSitemapsJob(BatchJob): + """ + Job that generates XML sitemaps for events. + + Processes events in batches and writes sitemap entries to GCS. 
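+
+    Output lands in GCS under a path derived from the filters, e.g. (timestamp
+    illustrative):
+
+        sitemaps/street/future/sitemap-20240101-120000.xml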
+ """ + + def __init__( + self, + vertical: Optional[str] = None, + time_period: Optional[str] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + ): + super().__init__(batch_size=20) + self.vertical = vertical + self.time_period = time_period + self.bucket_name = bucket_name + self.dry_run = dry_run + + # Build output path + parts = ['sitemaps'] + if vertical: + parts.append(vertical.lower()) + if time_period: + parts.append(time_period.lower()) + timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S') + parts.append(f'sitemap-{timestamp}.xml') + self.blob_name = '/'.join(parts) + + self.output_writer = None + logger.info(f"GenerateSitemapsJob initialized: vertical={vertical}, time_period={time_period}") + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name=self.blob_name, + content_type='text/xml', + ) + # Write XML header + self.output_writer.write('') + self.output_writer.write('') + + def run_batch(self, events: list) -> None: + """Process a batch of events.""" + for event in events: + try: + entry = generate_sitemap_entry(event) + if entry: + if self.dry_run: + logger.debug(f"Would write sitemap entry for event {event.id}") + self.metrics.increment('entries_would_write') + else: + self.output_writer.write(entry) + self.metrics.increment('entries_written') + else: + self.metrics.increment('events_skipped_no_content') + except Exception as e: + logger.error(f"Error generating sitemap for event {event.id}: {e}") + self.metrics.increment('events_failed') + + self.metrics.increment('events_processed', len(events)) + + def teardown(self) -> None: + """Finalize and upload the sitemap.""" + if self.dry_run: + logger.info("[DRY RUN] Would write sitemap to GCS") + return + + if self.output_writer: + # Write closing tag + self.output_writer.write('') + uri = self.output_writer.flush() + logger.info(f"Sitemap written to {uri}") + + +def main( + vertical: Optional[str] = None, + time_period: Optional[str] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the generate_sitemaps job. + + Args: + vertical: Optional vertical filter (e.g., 'STREET') + time_period: Optional time period filter (e.g., 'FUTURE', 'PAST') + bucket_name: GCS bucket for output + dry_run: If True, don't write to GCS + """ + logger.info(f"Starting generate_sitemaps job: vertical={vertical}, time_period={time_period}") + + job = GenerateSitemapsJob( + vertical=vertical, + time_period=time_period, + bucket_name=bucket_name, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Build filters + filters = [] + if vertical: + filters.append(('verticals', '=', vertical)) + if time_period: + filters.append(('search_time_period', '=', time_period)) + + runner.run_from_datastore_batched( + entity_kind='dancedeets.events.eventdata.DBEvent', + filters=filters, + batch_size=20, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/generate_training_data.py b/server/dancedeets/jobs/generate_training_data.py new file mode 100644 index 000000000..2284bcc03 --- /dev/null +++ b/server/dancedeets/jobs/generate_training_data.py @@ -0,0 +1,228 @@ +""" +Cloud Run Job: Generate ML training data from potential events. 
+ +Migrated from: dancedeets/ml/gprediction.py + +This job generates training data for the ML classifier by extracting +features from potential events and writing them to GCS. + +Usage: + python -m dancedeets.jobs.runner --job=generate_training_data +""" + +import csv +import io +import logging +import string +from typing import Optional + +from dancedeets import fb_api +from dancedeets.events import eventdata +from dancedeets.events import event_locations +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.gcs_output import GCSOutputWriter, DEFAULT_BUCKET +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + +# Character translation for stripping punctuation +convert_chars = string.punctuation + '\r\n\t' +trans = str.maketrans(convert_chars, ' ' * len(convert_chars)) + + +def strip_punctuation(s: str) -> str: + """Remove punctuation from a string.""" + return s.translate(trans) + + +def get_training_features(potential_event, fb_event, fb_event_attending) -> tuple: + """ + Extract training features from an event. + + Returns: + Tuple of feature values + """ + if 'owner' in fb_event['info']: + owner_name = 'id%s' % fb_event['info']['owner']['id'] + else: + owner_name = '' + + location = event_locations.get_address_for_fb_event(fb_event) + + def strip_text(s): + if isinstance(s, bytes): + s = s.decode('utf-8') + return strip_punctuation(s).lower() + + name = strip_text(fb_event['info'].get('name', '')) + description = strip_text(fb_event['info'].get('description', '')) + + attendee_list = ' '.join([ + 'id%s' % x['id'] + for x in fb_event_attending.get('attending', {}).get('data', []) + ]) + + source_list = ' '.join( + 'id%s' % x.id + for x in potential_event.source_ids_only() + ) + + # Currently only returning attendee_list (as per original code) + return (attendee_list,) + # Full features would be: + # return (potential_event.language, owner_name, location, name, description, attendee_list, source_list) + + +class GenerateTrainingDataJob(BatchJob): + """ + Job that generates ML training data from potential events. + + Extracts features from events and writes CSV training data to GCS. 
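+
+    With the current feature set (attendee_list only, per get_training_features
+    above), each output row is label,attendee_list, for example (IDs
+    illustrative):
+
+        dance,id1000 id1001 id1002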
+ """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + bucket_name: str = DEFAULT_BUCKET, + dry_run: bool = False, + ): + super().__init__(batch_size=20) + self.fb_context = fb_context + self.bucket_name = bucket_name + self.dry_run = dry_run + self.output_writer = None + logger.info("GenerateTrainingDataJob initialized") + + def setup(self) -> None: + """Initialize the output writer.""" + if not self.dry_run: + self.output_writer = GCSOutputWriter( + bucket_name=self.bucket_name, + blob_name='ml/training_data.csv', + content_type='text/csv', + ) + + def run_batch(self, pevents: list) -> None: + """Process a batch of potential events.""" + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + fbl.allow_memcache_write = False # Don't pollute memcache + + # Only process events that have been looked at + fb_event_ids = [x.fb_event_id for x in pevents if x.looked_at] + if not fb_event_ids: + self.metrics.increment('batches_empty') + return + + # Fetch from Facebook + fbl.request_multi(fb_api.LookupEvent, fb_event_ids) + fbl.request_multi(fb_api.LookupEventAttending, fb_event_ids) + + try: + fbl.batch_fetch() + except Exception as e: + logger.error(f"Error fetching Facebook data: {e}") + self.metrics.increment('batches_failed_fb') + return + + # Get existing events to determine labels + good_event_ids = [ + x.fb_event_id + for x in eventdata.DBEvent.get_by_ids(fb_event_ids, keys_only=True) + if x + ] + + # Build CSV + csv_file = io.StringIO() + csv_writer = csv.writer(csv_file) + + for potential_event in pevents: + if not potential_event.looked_at: + continue + + try: + # Label: 'dance' if event exists in DB, 'nodance' otherwise + label = 'dance' if potential_event.fb_event_id in good_event_ids else 'nodance' + + fb_event = fbl.fetched_data(fb_api.LookupEvent, potential_event.fb_event_id) + if fb_event.get('empty'): + self.metrics.increment('events_skipped_empty') + continue + + fb_event_attending = fbl.fetched_data( + fb_api.LookupEventAttending, + potential_event.fb_event_id + ) + + training_features = get_training_features( + potential_event, fb_event, fb_event_attending + ) + csv_writer.writerow([label] + list(training_features)) + self.metrics.increment('rows_written') + + except fb_api.NoFetchedDataException: + logger.debug(f"No data fetched for event id {potential_event.fb_event_id}") + self.metrics.increment('events_skipped_no_data') + + # Write to GCS + output = csv_file.getvalue() + if output: + if self.dry_run: + logger.info(f"[DRY RUN] Would write training data to GCS") + else: + self.output_writer.write(output) + + self.metrics.increment('batches_processed') + + def teardown(self) -> None: + """Finalize the output.""" + if not self.dry_run and self.output_writer: + uri = self.output_writer.flush() + logger.info(f"Training data written to {uri}") + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the generate_training_data job. 
+ + Args: + dry_run: If True, don't write to GCS + """ + logger.info("Starting generate_training_data job") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + job = GenerateTrainingDataJob( + fb_context=fb_context, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore_batched( + entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', + batch_size=20, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/metrics.py b/server/dancedeets/jobs/metrics.py new file mode 100644 index 000000000..e3b7be82a --- /dev/null +++ b/server/dancedeets/jobs/metrics.py @@ -0,0 +1,151 @@ +""" +Metrics tracking for Cloud Run Jobs. + +Replaces MapReduce counters with in-memory tracking and optional +Cloud Monitoring integration. +""" + +import logging +import os +from collections import defaultdict +from typing import Dict, Optional + +logger = logging.getLogger(__name__) + + +class JobMetrics: + """ + In-memory counter implementation for job metrics. + + Replaces MapReduce op.counters.Increment with a simple dict-based + counter that can optionally export to Cloud Monitoring. + """ + + def __init__(self, job_name: Optional[str] = None): + self.job_name = job_name or os.environ.get('CLOUD_RUN_JOB', 'unknown') + self._counters: Dict[str, int] = defaultdict(int) + + def increment(self, key: str, delta: int = 1) -> None: + """ + Increment a counter. + + Args: + key: Counter name + delta: Amount to increment (default 1) + """ + self._counters[key] += delta + + def get(self, key: str) -> int: + """ + Get the current value of a counter. + + Args: + key: Counter name + + Returns: + Current counter value (0 if not set) + """ + return self._counters.get(key, 0) + + def get_all(self) -> Dict[str, int]: + """ + Get all counter values. + + Returns: + Dict of counter names to values + """ + return dict(self._counters) + + def log_summary(self) -> None: + """Log a summary of all counters.""" + logger.info(f"Job metrics for {self.job_name}:") + for key, value in sorted(self._counters.items()): + logger.info(f" {key}: {value}") + + def export_to_cloud_monitoring(self) -> None: + """ + Export metrics to Cloud Monitoring. + + This is optional and requires the google-cloud-monitoring package. + """ + try: + from google.cloud import monitoring_v3 + + client = monitoring_v3.MetricServiceClient() + project_name = f"projects/{os.environ.get('GOOGLE_CLOUD_PROJECT', 'dancedeets-hrd')}" + + for key, value in self._counters.items(): + # Create a custom metric descriptor if needed + # This is a simplified version - full implementation would + # create proper metric descriptors + logger.info(f"Would export to Cloud Monitoring: {key}={value}") + + except ImportError: + logger.warning("google-cloud-monitoring not installed, skipping export") + except Exception as e: + logger.error(f"Error exporting to Cloud Monitoring: {e}") + + +class GroupedMetrics: + """ + Metrics that can be grouped by a key (e.g., city, time_period). + + Useful for ranking-style aggregations. 
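+
+    Illustrative usage:
+
+        metrics = GroupedMetrics()
+        metrics.increment('City: Tokyo', 'events_added')
+        metrics.increment('City: Tokyo', 'events_added')
+        metrics.get_totals('events_added')  # {'City: Tokyo': 2}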
+ """ + + def __init__(self): + self._groups: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int)) + + def increment(self, group_key: str, counter_key: str, delta: int = 1) -> None: + """ + Increment a counter within a group. + + Args: + group_key: The group identifier (e.g., city name) + counter_key: The counter name within the group + delta: Amount to increment + """ + self._groups[group_key][counter_key] += delta + + def get_group(self, group_key: str) -> Dict[str, int]: + """Get all counters for a group.""" + return dict(self._groups.get(group_key, {})) + + def get_all_groups(self) -> Dict[str, Dict[str, int]]: + """Get all groups and their counters.""" + return {k: dict(v) for k, v in self._groups.items()} + + def get_totals(self, counter_key: str) -> Dict[str, int]: + """Get totals for a specific counter across all groups.""" + return { + group_key: counters.get(counter_key, 0) + for group_key, counters in self._groups.items() + if counters.get(counter_key, 0) > 0 + } + + +# Global metrics instance for compatibility with old mr.increment() pattern +_current_metrics: Optional[JobMetrics] = None + + +def set_current_metrics(metrics: JobMetrics) -> None: + """Set the current job metrics instance (for compatibility).""" + global _current_metrics + _current_metrics = metrics + + +def get_current_metrics() -> Optional[JobMetrics]: + """Get the current job metrics instance.""" + return _current_metrics + + +def increment(key: str, delta: int = 1) -> None: + """ + Increment a counter (compatibility wrapper). + + This provides the same interface as the old mr.increment() function. + """ + if _current_metrics: + _current_metrics.increment(key, delta) + else: + logger.warning(f"No current metrics context, cannot increment {key}") diff --git a/server/dancedeets/jobs/notify_users.py b/server/dancedeets/jobs/notify_users.py new file mode 100644 index 000000000..33cbed16d --- /dev/null +++ b/server/dancedeets/jobs/notify_users.py @@ -0,0 +1,170 @@ +""" +Cloud Run Job: Send push notifications about new events to users. + +Migrated from: dancedeets/notifications/added_events.py + +This job runs hourly and sends notifications to users in a specific +timezone about recently added events near them. + +Usage: + python -m dancedeets.jobs.runner --job=notify_users --offset=8 + python -m dancedeets.jobs.runner --job=notify_users # auto-calculates offset +""" + +import datetime +import logging +import time + +from google.cloud import datastore + +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.loc import gmaps_api +from dancedeets.loc import math as loc_math +from dancedeets.notifications import android +from dancedeets.search import search +from dancedeets.search import search_base + +logger = logging.getLogger(__name__) + + +def get_time_offset() -> float: + """ + Calculate the timezone offset to target for 4pm local time notifications. + + Returns: + Float timezone offset (e.g., 8.0 for UTC+8) + """ + desired_hour = 16 # send new-event notifications at 4pm + current_hour = datetime.datetime.now().hour # should be UTC hour + offset = desired_hour - current_hour + if offset <= -12: + offset += 24 + if offset > 12: + offset -= 24 + return float(offset) + + +class NotifyUsersJob(Job): + """ + Job that sends push notifications about new events to users. + + For each user in the target timezone: + 1. Check if they can receive Android notifications + 2. 
Search for events near their location added in the last 24 hours + 3. Send push notifications for each new event + """ + + def __init__(self, offset: float = None, dry_run: bool = False): + super().__init__() + self.offset = offset if offset is not None else get_time_offset() + self.dry_run = dry_run + logger.info(f"NotifyUsersJob initialized for timezone offset {self.offset}") + + def run(self, user) -> None: + """Process a single user.""" + # Check if user can receive notifications + if not android.can_notify(user): + self.metrics.increment('users_skipped_no_android') + return + + if not user: + logger.error("No user provided") + return + + if user.expired_oauth_token: + logger.info(f"User has expired token, skipping: {user.fb_uid}") + self.metrics.increment('users_skipped_expired_token') + return + + user_location = user.location + if not user_location: + self.metrics.increment('users_skipped_no_location') + return + + logger.info(f"Processing user {user.fb_uid}") + + distance_in_km = user.distance_in_km() + min_attendees = user.min_attendees + + # Search for relevant events + geocode = gmaps_api.lookup_address(user_location) + if not geocode: + self.metrics.increment('users_skipped_geocode_failed') + return + + bounds = loc_math.expand_bounds(geocode.latlng_bounds(), distance_in_km) + query = search_base.SearchQuery( + time_period=search_base.TIME_UPCOMING, + bounds=bounds, + min_attendees=min_attendees, + ) + + one_day_ago = time.mktime( + (datetime.datetime.now() - datetime.timedelta(hours=24)).timetuple() + ) + + search_query = search.Search(query) + search_query.extra_fields = ['creation_time'] + search_results = search_query._get_candidate_doc_events() + + # Filter to recently added events + recent_events = [ + x.doc_id + for x in search_results + if x.field('creation_time').value > one_day_ago + ] + + logger.info( + f"Found {len(search_results)} search results, " + f"{len(recent_events)} new events for user {user.fb_uid}" + ) + + self.metrics.increment('events_found', len(recent_events)) + + for event_id in recent_events: + if self.dry_run: + logger.info(f"[DRY RUN] Would notify user {user.fb_uid} about event {event_id}") + self.metrics.increment('notifications_would_send') + else: + if android.add_notify(user, event_id): + logger.info(f"Sent notification to {user.fb_uid} for event {event_id}") + self.metrics.increment('notifications_sent') + + self.metrics.increment('users_processed') + + +def main(offset: float = None, dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the notify_users job. 
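+
+    Users are selected with a Datastore range filter of
+    offset <= timezone_offset < offset + 1.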
+ + Args: + offset: Timezone offset to target (auto-calculated if not provided) + dry_run: If True, don't actually send notifications + """ + if offset is None: + offset = get_time_offset() + + logger.info(f"Starting notify_users job for timezone offset {offset}") + + job = NotifyUsersJob(offset=offset, dry_run=dry_run) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Query users in the target timezone range + filters = [ + ('timezone_offset', '>=', offset), + ('timezone_offset', '<', offset + 1), + ] + + runner.run_from_datastore( + entity_kind='dancedeets.users.users.User', + filters=filters, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/post_japan_events.py b/server/dancedeets/jobs/post_japan_events.py new file mode 100644 index 000000000..670d4f2e0 --- /dev/null +++ b/server/dancedeets/jobs/post_japan_events.py @@ -0,0 +1,93 @@ +""" +Cloud Run Job: Post future Japan events to social media. + +Migrated from: dancedeets/pubsub/pubsub_tasks.py + +This job finds all future dance events in Japan and publishes them +to configured social media accounts. + +Usage: + python -m dancedeets.jobs.runner --job=post_japan_events + python -m dancedeets.jobs.runner --job=post_japan_events --token_nickname=twitter_jp +""" + +import logging + +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.pubsub import pubsub +from dancedeets.util import dates + +logger = logging.getLogger(__name__) + + +class PostJapanEventsJob(BatchJob): + """ + Job that posts Japan events to social media. + + Processes events in batches, filtering to only those in Japan, + then publishes each to social media via the pubsub module. + """ + + def __init__(self, token_nickname: str = None, dry_run: bool = False): + super().__init__(batch_size=20) + self.token_nickname = token_nickname + self.dry_run = dry_run + logger.info(f"PostJapanEventsJob initialized with token_nickname={token_nickname}") + + def run_batch(self, db_events: list) -> None: + """Process a batch of events.""" + # Filter to Japan events + japan_events = [ + event for event in db_events + if event.actual_city_name and event.actual_city_name.endswith('Japan') + ] + + logger.info(f"Batch: {len(db_events)} events, {len(japan_events)} in Japan") + self.metrics.increment('events_total', len(db_events)) + self.metrics.increment('events_in_japan', len(japan_events)) + + for db_event in japan_events: + try: + if self.dry_run: + logger.info(f"[DRY RUN] Would publish event {db_event.id}") + self.metrics.increment('events_would_publish') + else: + pubsub.eventually_publish_event(db_event.id, self.token_nickname) + self.metrics.increment('events_published') + except Exception as e: + logger.error(f"Error publishing event {db_event.id}: {e}") + self.metrics.increment('events_failed') + + +def main(token_nickname: str = None, dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the post_japan_events job. 
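+
+    The Datastore query only pre-filters to future events
+    (search_time_period == TIME_FUTURE); the Japan check (actual_city_name
+    ending in 'Japan') is applied per batch in run_batch().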
+ + Args: + token_nickname: Optional social media token nickname to use + dry_run: If True, don't actually publish events + """ + logger.info(f"Starting post_japan_events job") + + job = PostJapanEventsJob(token_nickname=token_nickname, dry_run=dry_run) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Query future events + filters = [ + ('search_time_period', '=', dates.TIME_FUTURE), + ] + + runner.run_from_datastore_batched( + entity_kind='dancedeets.events.eventdata.DBEvent', + filters=filters, + batch_size=20, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/refresh_users.py b/server/dancedeets/jobs/refresh_users.py new file mode 100644 index 000000000..5a02b533d --- /dev/null +++ b/server/dancedeets/jobs/refresh_users.py @@ -0,0 +1,174 @@ +""" +Cloud Run Job: Refresh user profiles from Facebook. + +Migrated from: dancedeets/users/user_tasks.py + +This job refreshes user profile information from the Facebook API +and updates the local user records. + +Usage: + python -m dancedeets.jobs.runner --job=refresh_users + python -m dancedeets.jobs.runner --job=refresh_users --all_users=true +""" + +import logging +from typing import Optional + +from dancedeets import fb_api +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_fblookup_params, get_multiple_tokens +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.mail import mailchimp_api + +logger = logging.getLogger(__name__) + + +class RefreshUsersJob(Job): + """ + Job that refreshes user profiles from Facebook. + + For each user: + 1. Check if they have a valid access token + 2. Fetch updated profile info from Facebook + 3. Update local user record + 4. 
Optionally update Mailchimp subscription + """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + mailchimp_list_id: Optional[str] = None, + dry_run: bool = False, + ): + super().__init__() + self.fb_context = fb_context + self.mailchimp_list_id = mailchimp_list_id or mailchimp_api.get_list_id() + self.dry_run = dry_run + logger.info(f"RefreshUsersJob initialized, mailchimp_list_id={mailchimp_list_id}") + + def run(self, user) -> None: + """Process a single user.""" + if user.expired_oauth_token: + logger.info( + f"Skipping user {user.fb_uid} ({user.full_name}) " + "due to expired access_token" + ) + self.metrics.increment('users_skipped_expired') + if not self.dry_run: + user.put() # Save any pending changes + return + + # Get access token (prefer user's own token, fall back to context) + access_token = user.fb_access_token + if not access_token and self.fb_context: + access_token = self.fb_context.access_token + + if not access_token: + logger.info( + f"Skipping user {user.fb_uid} ({user.full_name}) " + "due to not having an access_token" + ) + self.metrics.increment('users_skipped_no_token') + if not self.dry_run: + user.put() + return + + # Fetch and update user from Facebook + try: + self._fetch_and_save_fb_user(user, access_token) + self.metrics.increment('users_refreshed') + except Exception as e: + logger.error(f"Error refreshing user {user.fb_uid}: {e}") + self.metrics.increment('users_failed') + + def _fetch_and_save_fb_user(self, user, access_token: str) -> None: + """Fetch user data from Facebook and save.""" + fbl = fb_api.FBLookup(user.fb_uid, access_token) + + if self.fb_context: + fbl.allow_cache = self.fb_context.allow_cache + if self.fb_context.oldest_allowed: + fbl.db.oldest_allowed = self.fb_context.oldest_allowed + + try: + fb_user = fbl.get(fb_api.LookupUser, user.fb_uid) + except fb_api.ExpiredOAuthToken as e: + logger.info(f"Auth token now expired for {user.fb_uid}: {e}") + user.expired_oauth_token_reason = str(e.args[0]) if e.args else "Unknown" + user.expired_oauth_token = True + if not self.dry_run: + user.put() + self.metrics.increment('users_token_expired') + return + + if self.dry_run: + logger.info(f"[DRY RUN] Would update user {user.fb_uid}") + return + + user.compute_derived_properties(fb_user) + user.put() + + # Update Mailchimp if configured + # Note: mailchimp update is typically handled by user.put() via signals + + +def main( + all_users: bool = False, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the refresh_users job. 
+ + Args: + all_users: If True, include users with expired tokens + dry_run: If True, don't save changes + """ + logger.info(f"Starting refresh_users job, all_users={all_users}") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + # Create FB context with token pool + fb_context = FBJobContext( + fb_uid='system', # System-level access + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + # Get Mailchimp list ID + try: + mailchimp_list_id = mailchimp_api.get_list_id() + except Exception as e: + logger.warning(f"Could not get Mailchimp list ID: {e}") + mailchimp_list_id = None + + job = RefreshUsersJob( + fb_context=fb_context, + mailchimp_list_id=mailchimp_list_id, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + + # Build filters + filters = [] + if not all_users: + filters.append(('expired_oauth_token', '=', False)) + + runner.run_from_datastore( + entity_kind='dancedeets.users.users.User', + filters=filters, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/reindex_classes.py b/server/dancedeets/jobs/reindex_classes.py new file mode 100644 index 000000000..d43c29e37 --- /dev/null +++ b/server/dancedeets/jobs/reindex_classes.py @@ -0,0 +1,50 @@ +""" +Cloud Run Job: Reindex dance classes in search. + +Migrated from: dancedeets/classes/class_pipeline.py (ReindexClasses) + +This job rebuilds the dance class search index from scraped data. + +Usage: + python -m dancedeets.jobs.runner --job=reindex_classes +""" + +import logging + +from dancedeets.classes import class_index +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the reindex_classes job. + + Args: + dry_run: If True, don't actually reindex + """ + logger.info("Starting reindex_classes job") + + metrics = JobMetrics() + set_current_metrics(metrics) + + if dry_run: + logger.info("[DRY RUN] Would rebuild class index") + metrics.increment('reindex_skipped') + else: + logger.info("Rebuilding class index...") + try: + class_index.StudioClassIndex.rebuild_from_query() + logger.info("Class index rebuilt successfully") + metrics.increment('reindex_completed') + except Exception as e: + logger.error(f"Error rebuilding class index: {e}") + metrics.increment('reindex_failed') + raise + + metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/runner.py b/server/dancedeets/jobs/runner.py new file mode 100644 index 000000000..8a0c3419c --- /dev/null +++ b/server/dancedeets/jobs/runner.py @@ -0,0 +1,141 @@ +""" +Cloud Run Job runner entry point. + +This module provides a CLI interface for running jobs from Cloud Run. +Jobs are specified by name and executed with the provided parameters. 
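+
+Besides --job and --dry-run, any extra --key=value flags are parsed as keyword
+arguments (true/false, integer, and float values are coerced to Python types)
+and passed through to the selected job's main() function.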
+ +Usage: + python -m dancedeets.jobs.runner --job=notify_users --offset=8 + python -m dancedeets.jobs.runner --job=generate_sitemaps --vertical=STREET +""" + +import argparse +import importlib +import logging +import os +import sys + +logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + stream=sys.stdout, +) +logger = logging.getLogger(__name__) + +# Registry of available jobs +JOB_REGISTRY = { + # Phase 2: Simple mapper jobs + 'notify_users': 'dancedeets.jobs.notify_users', + 'post_japan_events': 'dancedeets.jobs.post_japan_events', + 'compute_rankings': 'dancedeets.jobs.compute_rankings', + 'compute_user_stats': 'dancedeets.jobs.compute_user_stats', + 'refresh_users': 'dancedeets.jobs.refresh_users', + 'send_weekly_emails': 'dancedeets.jobs.send_weekly_emails', + + # Phase 3: GCS output jobs + 'generate_sitemaps': 'dancedeets.jobs.generate_sitemaps', + 'dump_potential_events': 'dancedeets.jobs.dump_potential_events', + 'generate_training_data': 'dancedeets.jobs.generate_training_data', + 'classify_events_ml': 'dancedeets.jobs.classify_events_ml', + 'auto_add_events': 'dancedeets.jobs.auto_add_events', + + # Phase 4: MapReduce pipeline jobs + 'count_unique_attendees': 'dancedeets.jobs.count_unique_attendees', + 'update_source_stats': 'dancedeets.jobs.update_source_stats', + 'scrape_and_classify': 'dancedeets.jobs.scrape_and_classify', + 'find_access_tokens': 'dancedeets.jobs.find_access_tokens', + + # Phase 5: Pipeline orchestration jobs (individual steps) + 'start_spiders': 'dancedeets.jobs.start_spiders', + 'reindex_classes': 'dancedeets.jobs.reindex_classes', + 'email_crawl_errors': 'dancedeets.jobs.email_crawl_errors', +} + + +def run_job(job_name: str, **kwargs) -> None: + """ + Run a job by name with the given parameters. + + Args: + job_name: Name of the job from JOB_REGISTRY + **kwargs: Job-specific parameters + """ + if job_name not in JOB_REGISTRY: + available = ', '.join(sorted(JOB_REGISTRY.keys())) + raise ValueError(f"Unknown job: {job_name}. 
Available jobs: {available}") + + module_path = JOB_REGISTRY[job_name] + logger.info(f"Loading job module: {module_path}") + + try: + module = importlib.import_module(module_path) + except ImportError as e: + logger.error(f"Failed to import job module {module_path}: {e}") + raise + + if not hasattr(module, 'main'): + raise ValueError(f"Job module {module_path} must have a main() function") + + logger.info(f"Running job: {job_name}") + logger.info(f"Parameters: {kwargs}") + + # Cloud Run Job environment info + task_index = os.environ.get('CLOUD_RUN_TASK_INDEX', '0') + task_count = os.environ.get('CLOUD_RUN_TASK_COUNT', '1') + logger.info(f"Task {int(task_index) + 1} of {task_count}") + + try: + module.main(**kwargs) + logger.info(f"Job {job_name} completed successfully") + except Exception as e: + logger.error(f"Job {job_name} failed: {e}") + raise + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser(description='Run a DanceDeets batch job') + parser.add_argument( + '--job', + required=True, + help='Name of the job to run', + ) + parser.add_argument( + '--dry-run', + action='store_true', + help='Run in dry-run mode (no side effects)', + ) + + # Allow arbitrary additional arguments + args, unknown = parser.parse_known_args() + + # Parse unknown args as key=value pairs + extra_args = {} + for arg in unknown: + if '=' in arg: + key, value = arg.split('=', 1) + key = key.lstrip('-') + # Try to convert to appropriate types + if value.lower() in ('true', 'false'): + value = value.lower() == 'true' + elif value.isdigit(): + value = int(value) + elif value.replace('.', '').isdigit(): + value = float(value) + extra_args[key] = value + + return args, extra_args + + +def main(): + """Main entry point for the job runner.""" + args, extra_args = parse_args() + + if args.dry_run: + extra_args['dry_run'] = True + + run_job(args.job, **extra_args) + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/scrape_and_classify.py b/server/dancedeets/jobs/scrape_and_classify.py new file mode 100644 index 000000000..c873e6731 --- /dev/null +++ b/server/dancedeets/jobs/scrape_and_classify.py @@ -0,0 +1,192 @@ +""" +Cloud Run Job: Scrape sources for events and classify them. + +Migrated from: dancedeets/event_scraper/thing_scraper2.py + +This job scrapes configured sources (fan pages, profiles, etc.) for +event listings, then classifies discovered events. + +Usage: + python -m dancedeets.jobs.runner --job=scrape_and_classify + python -m dancedeets.jobs.runner --job=scrape_and_classify --min_potential_events=5 +""" + +import json +import logging +from collections import defaultdict +from typing import Dict, List, Optional, Set + +from dancedeets.jobs.base import BatchJob, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.event_scraper import event_pipeline +from dancedeets.event_scraper import potential_events +from dancedeets.event_scraper import thing_scraper + +logger = logging.getLogger(__name__) + + +class ScrapeAndClassifyJob(BatchJob): + """ + Job that scrapes sources and classifies discovered events. + + This combines the map and reduce steps from the original MapReduce: + 1. Map: Scrape each source for events + 2. Reduce: Group by event_id and process + + Uses in-memory aggregation since event volumes are manageable. 
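+
+    The aggregation maps each discovered event id to the
+    (source_id, source_field, extra_source_id) tuples that referenced it,
+    e.g. (all values hypothetical):
+
+        {'1234567890': [('111', 'feed', None), ('222', 'feed', None)]}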
+ """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + min_potential_events: int = 0, + dry_run: bool = False, + ): + super().__init__(batch_size=20) + self.fb_context = fb_context + self.min_potential_events = min_potential_events + self.dry_run = dry_run + + # Aggregate discovered events by event_id + # event_id -> list of (source_id, source_field, extra_source_id) + self.discovered_events: Dict[str, List[tuple]] = defaultdict(list) + + logger.info(f"ScrapeAndClassifyJob initialized with min_potential_events={min_potential_events}") + + def run_batch(self, sources: list) -> None: + """Process a batch of sources (scraping phase).""" + # Filter sources by min_potential_events + if self.min_potential_events > 0: + sources = [ + s for s in sources + if (s.num_potential_events or 0) >= self.min_potential_events + ] + + if not sources: + self.metrics.increment('batches_empty') + return + + if not self.fb_context: + logger.warning("No FB context, skipping batch") + self.metrics.increment('batches_skipped_no_fb') + return + + fbl = self.fb_context.get_fblookup() + fbl.allow_cache = False # Don't cache during scraping + # Make passthrough to avoid unnecessary memcache puts + fbl.make_passthrough() + + # Discover events from sources + try: + discovered_list = thing_scraper.discover_events_from_sources(fbl, sources) + except Exception as e: + logger.error(f"Error scraping sources: {e}") + self.metrics.increment('batches_failed_scrape') + return + + # Aggregate by event_id + for discovered in discovered_list: + state = ( + discovered.source_id, + discovered.source_field, + discovered.extra_source_id, + ) + self.discovered_events[discovered.event_id].append(state) + self.metrics.increment('events_discovered') + + self.metrics.increment('sources_scraped', len(sources)) + self.metrics.increment('batches_processed') + + def teardown(self) -> None: + """Process all discovered events (reduce phase).""" + logger.info(f"Processing {len(self.discovered_events)} unique events") + + if self.dry_run: + logger.info("[DRY RUN] Would process discovered events") + for event_id, sources in list(self.discovered_events.items())[:10]: + logger.info(f" Event {event_id}: {len(sources)} sources") + return + + if not self.fb_context: + logger.warning("No FB context, skipping event processing") + return + + fbl = self.fb_context.get_fblookup() + fbl.allow_cache = True # Use cache for classification + + # Process events in batches + events_processed = 0 + events_failed = 0 + + for event_id, source_list in self.discovered_events.items(): + try: + # Build discovered event objects + discovered_list = [] + for source_id, source_field, extra_source_id in source_list: + discovered = potential_events.DiscoveredEvent( + event_id, None, source_field, extra_source_id + ) + discovered.source = None + discovered.source_id = source_id + discovered_list.append(discovered) + + # Process through event pipeline + event_pipeline.process_discovered_events(fbl, discovered_list) + events_processed += 1 + + except Exception as e: + logger.error(f"Error processing event {event_id}: {e}") + events_failed += 1 + + self.metrics.increment('events_processed', events_processed) + self.metrics.increment('events_failed', events_failed) + logger.info(f"Processed {events_processed} events, {events_failed} failed") + + +def main( + min_potential_events: int = 0, + dry_run: bool = False, + **kwargs, +) -> None: + """ + Main entry point for the scrape_and_classify job. 
+ + Args: + min_potential_events: Only scrape sources with at least this many potential events + dry_run: If True, don't actually process events + """ + logger.info(f"Starting scrape_and_classify job with min_potential_events={min_potential_events}") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=False, + ) if tokens else None + + job = ScrapeAndClassifyJob( + fb_context=fb_context, + min_potential_events=min_potential_events, + dry_run=dry_run, + ) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore_batched( + entity_kind='dancedeets.event_scraper.thing_db.Source', + batch_size=20, + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/send_weekly_emails.py b/server/dancedeets/jobs/send_weekly_emails.py new file mode 100644 index 000000000..7c7f9f300 --- /dev/null +++ b/server/dancedeets/jobs/send_weekly_emails.py @@ -0,0 +1,292 @@ +""" +Cloud Run Job: Send weekly event digest emails to users. + +Migrated from: dancedeets/search/email_events.py + +This job sends a weekly email to users with dance events near them. + +Usage: + python -m dancedeets.jobs.runner --job=send_weekly_emails + python -m dancedeets.jobs.runner --job=send_weekly_emails --dry_run=true +""" + +import datetime +import logging +import random +import re +import urllib.parse +from typing import Optional + +from dancedeets import fb_api +from dancedeets import render_server +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.fb_utils import FBJobContext, get_multiple_tokens +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics +from dancedeets.loc import names +from dancedeets.logic import api_format +from dancedeets.logic import mobile +from dancedeets.mail import mandrill_api +from dancedeets.search import search_base +from dancedeets.search import search +from dancedeets.users import users + +logger = logging.getLogger(__name__) + + +class NoEmailException(Exception): + """Raised when email cannot be sent for a user.""" + pass + + +def email_for_user(user, fbl, should_send: bool = True): + """ + Generate and optionally send a weekly email for a user. 
+ + Args: + user: User object + fbl: FBLookup instance + should_send: Whether to actually send the email + + Returns: + The email message dict + + Raises: + NoEmailException: If email cannot be sent for various reasons + """ + if not user.send_email: + raise NoEmailException('User has send_email==False') + + email_address = user.email + if not email_address: + raise NoEmailException('User does not have an email') + + # Check if we sent an email recently + if user.weekly_email_send_date: + if user.weekly_email_send_date > datetime.datetime.now() - datetime.timedelta(days=3): + message = f"Skipping user {user.fb_uid} ({user.full_name}) because last weekly email was sent on {user.weekly_email_send_date}" + logger.warning(message) + raise NoEmailException(message) + + fb_user = fbl.fetched_data(fb_api.LookupUser, fbl.fb_uid) + if 'profile' not in fb_user: + raise NoEmailException(f'Could not find LookupUser: {fb_user}') + + user_location = user.location + if not user_location: + raise NoEmailException('User does not have location') + + # Build search query for this week's events + d = datetime.date.today() + start_time = d - datetime.timedelta(days=d.weekday()) # round down to last monday + end_time = start_time + datetime.timedelta(days=8) + data = { + 'location': user_location, + 'distance': user.distance_in_km(), + 'distance_units': 'km', + 'start': start_time, + 'end': end_time, + } + form = search_base.SearchForm(data=data) + + geocode = None + distance = None + if form.location.data: + try: + geocode, distance = search_base.get_geocode_with_distance(form) + except Exception as e: + raise NoEmailException(f'Could not normalize user location: {data}: {e}') + + try: + search_query = form.build_query(start_end_query=True) + except Exception: + logger.error(f'Error looking up user location for user {user.fb_uid}, form: {form}') + raise + + search_results = search.Search(search_query).get_search_results() + if not search_results: + raise NoEmailException('No search results for user') + + # Build the email content + need_full_event = False + json_search_response = api_format.build_search_results_api( + form, search_query, search_results, (2, 0), need_full_event, geocode, distance, skip_people=True + ) + locale = user.locale or 'en_US' + email_unsubscribe_url = f'https://www.dancedeets.com/user/unsubscribe?email={urllib.parse.quote(email_address)}' + props = { + 'user': { + 'userName': user.first_name or user.full_name or '', + 'city': user.city_name, + 'countryName': names.get_country_name(user.location_country), + }, + 'response': json_search_response, + 'currentLocale': locale.replace('_', '-'), + 'mobileIosUrl': mobile.IOS_URL, + 'mobileAndroidUrl': mobile.ANDROID_URL, + 'emailPreferencesUrl': email_unsubscribe_url, + } + + # Render the email template + email_template = 'weeklyMail.js' + response = render_server.render_jsx(email_template, props, static_html=True) + if response.error: + message = f'Error rendering weeklyMail.js: {response.error}' + logger.error(message) + raise NoEmailException(message) + + mjml_response = render_server.render_mjml(response.markup) + rendered_html = mjml_response['html'] + if mjml_response.get('errors'): + message = f'Errors rendering weeklyMail.mjml: {mjml_response["errors"]}' + logger.error(message) + raise NoEmailException(message) + + # Build the message + messages = [ + 'Your Week in Dance: %s', + 'DanceDeets Weekly: %s', + 'Dance Events for %s', + ] + message_template = random.choice(messages) + tag = re.sub(r'[^a-z]', '-', 
message_template.lower())[:50] + tags = ['weekly', tag] + + subject = message_template % d.strftime('%b %d, %Y') + message = { + 'from_email': 'events@dancedeets.com', + 'from_name': 'DanceDeets Events', + 'subject': subject, + 'to': [{ + 'email': email_address, + 'name': user.full_name or user.first_name or '', + 'type': 'to', + }], + 'html': rendered_html, + 'metadata': { + 'user_id': user.fb_uid, + 'email_type': 'weekly', + }, + 'tags': tags, + } + + if should_send: + logger.info(f'Sending weekly mail for user {user.fb_uid} ({user.full_name})') + # Update the last-sent-time here, so any retryable errors don't cause emails to be multi-sent + user = users.User.get_by_id(user.fb_uid) + user.weekly_email_send_date = datetime.datetime.now() + user.put() + # And send the message now + mandrill_api.send_message(message) + + return message + + +class SendWeeklyEmailsJob(Job): + """ + Job that sends weekly event digest emails to users. + + For each user: + 1. Fetch user profile from Facebook + 2. Search for events near their location + 3. Render and send email via Mandrill + """ + + def __init__( + self, + fb_context: Optional[FBJobContext] = None, + dry_run: bool = False, + ): + super().__init__() + self.fb_context = fb_context + self.dry_run = dry_run + logger.info("SendWeeklyEmailsJob initialized") + + def run(self, user) -> None: + """Process a single user.""" + # Get access token + access_token = user.fb_access_token + if not access_token and self.fb_context: + access_token = self.fb_context.access_token + + if not access_token: + logger.info(f"Skipping user {user.fb_uid} - no access token") + self.metrics.increment('users_skipped_no_token') + return + + # Create FBLookup for this user + fbl = fb_api.FBLookup(user.fb_uid, access_token) + if self.fb_context: + fbl.allow_cache = self.fb_context.allow_cache + + # Fetch user data from Facebook + fbl.request(fb_api.LookupUser, user.fb_uid) + fbl.request(fb_api.LookupUserEvents, user.fb_uid) + + try: + fbl.batch_fetch() + except fb_api.ExpiredOAuthToken as e: + logger.info(f"Auth token now expired for {user.fb_uid}: {e}") + user.expired_oauth_token_reason = str(e.args[0]) if e.args else "Unknown" + user.expired_oauth_token = True + if not self.dry_run: + user.put() + self.metrics.increment('users_token_expired') + return + + # Generate and send email + try: + should_send = not self.dry_run + email_for_user(user, fbl, should_send=should_send) + + if self.dry_run: + logger.info(f"[DRY RUN] Would send email to {user.fb_uid}") + self.metrics.increment('emails_would_send') + else: + self.metrics.increment('emails_sent') + + except NoEmailException as e: + logger.info(f"Not sending email for user {user.fb_uid}: {e}") + self.metrics.increment('users_skipped_no_email') + + except Exception as e: + logger.exception(f"Error sending email for user {user.fb_uid}") + self.metrics.increment('emails_failed') + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the send_weekly_emails job. 
+ + Args: + dry_run: If True, don't actually send emails + """ + logger.info("Starting send_weekly_emails job") + + # Get tokens for Facebook API access + try: + tokens = get_multiple_tokens(token_count=50) + logger.info(f"Got {len(tokens)} access tokens for rotation") + except Exception as e: + logger.warning(f"Could not get multiple tokens: {e}") + tokens = [] + + # Create FB context with token pool + fb_context = FBJobContext( + fb_uid='system', + access_tokens=tokens, + allow_cache=True, + ) if tokens else None + + job = SendWeeklyEmailsJob(fb_context=fb_context, dry_run=dry_run) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore( + entity_kind='dancedeets.users.users.User', + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/start_spiders.py b/server/dancedeets/jobs/start_spiders.py new file mode 100644 index 000000000..b74ffd1dc --- /dev/null +++ b/server/dancedeets/jobs/start_spiders.py @@ -0,0 +1,120 @@ +""" +Cloud Run Job: Start ScrapingHub spider jobs. + +Migrated from: dancedeets/classes/class_pipeline.py (start_spiders) + +This job triggers spider crawls on ScrapingHub for dance studio schedules. + +Usage: + python -m dancedeets.jobs.runner --job=start_spiders +""" + +import json +import logging +import os +from typing import List + +import scrapinghub + +from dancedeets import keys +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + +# Spiders that are currently disabled +DISABLED_SPIDERS = ['EXPG', 'Boogiezone', 'IDA', 'mL', 'NeighborhoodStudio'] + + +def get_spiders() -> List[str]: + """Get list of active spiders.""" + all_spiders = [ + # NY + 'PMT', + 'Evolution', + 'Peridance', + 'BDC', + 'EXPG', + # LA + 'Millenium', + 'EDGE', + 'DebbieReynolds', + 'TheLab', + 'Boogiezone', + 'IDA', + 'mL', + 'NeighborhoodStudio', + ] + return [s for s in all_spiders if s not in DISABLED_SPIDERS] + + +def get_shub_project(): + """Get ScrapingHub project connection.""" + api_key = keys.get('scrapinghub_key') + conn = scrapinghub.Connection(api_key) + project = scrapinghub.Project(conn, 27474) + return project + + +def start_spiders(spiders: List[str], dry_run: bool = False) -> List[str]: + """ + Start spider jobs on ScrapingHub. + + Args: + spiders: List of spider names to run + dry_run: If True, don't actually start spiders + + Returns: + List of job keys for started spiders + """ + if dry_run: + logger.info(f"[DRY RUN] Would start {len(spiders)} spiders: {spiders}") + return [f"dry-run-{s}" for s in spiders] + + project = get_shub_project() + job_keys = [] + + for spider in spiders: + try: + job_id = project.schedule(spider) + job_keys.append(job_id) + logger.info(f"Scheduled spider {spider}: {job_id}") + except Exception as e: + logger.error(f"Error scheduling spider {spider}: {e}") + + logger.info(f"Scheduled {len(job_keys)} jobs: {job_keys}") + return job_keys + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the start_spiders job. 
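+
+    Prints a JSON object to stdout for the calling workflow, e.g.
+    (hypothetical job keys):
+
+        {"jobKeys": ["27474/1/123", "27474/1/124"], "spidersStarted": 2}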
+ + Args: + dry_run: If True, don't actually start spiders + + Outputs: + Writes job keys to stdout for workflow consumption + """ + logger.info("Starting start_spiders job") + + metrics = JobMetrics() + set_current_metrics(metrics) + + spiders = get_spiders() + logger.info(f"Active spiders: {spiders}") + + job_keys = start_spiders(spiders, dry_run=dry_run) + + metrics.increment('spiders_started', len(job_keys)) + metrics.log_summary() + + # Output job keys for workflow to consume + output = { + 'jobKeys': job_keys, + 'spidersStarted': len(job_keys), + } + print(json.dumps(output)) + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/jobs/update_source_stats.py b/server/dancedeets/jobs/update_source_stats.py new file mode 100644 index 000000000..a77e09692 --- /dev/null +++ b/server/dancedeets/jobs/update_source_stats.py @@ -0,0 +1,130 @@ +""" +Cloud Run Job: Update source statistics. + +Migrated from: dancedeets/event_scraper/thing_db.py (mr_count_potential_events) + +This job counts potential events, real events, and false negatives +per source (fan pages, profiles, etc.) for source quality tracking. + +Usage: + python -m dancedeets.jobs.runner --job=update_source_stats +""" + +import json +import logging +from collections import defaultdict +from typing import Dict, List, Optional, Tuple + +from dancedeets.events import eventdata +from dancedeets.event_scraper import thing_db +from dancedeets.jobs.base import Job, JobRunner +from dancedeets.jobs.metrics import JobMetrics, set_current_metrics + +logger = logging.getLogger(__name__) + + +class UpdateSourceStatsJob(Job): + """ + Job that updates source statistics. + + For each potential event, counts: + - Whether it's a potential event (match_score > 0) + - Whether it became a real event (exists in DBEvent) + - Whether it's a false negative (real but not potential) + + Aggregates by source and updates Source entities. 
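+
+    Counts are accumulated in memory per source and written back to the
+    Source entities in teardown(), e.g. (hypothetical values):
+
+        {'100001': {'all': 40, 'potential': 12, 'real': 5, 'false_negative': 1}}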
+ """ + + def __init__(self, dry_run: bool = False): + super().__init__() + self.dry_run = dry_run + + # Aggregate counts per source: source_id -> (potential, real, false_negative, total) + self.source_counts: Dict[str, Dict[str, int]] = defaultdict( + lambda: {'all': 0, 'potential': 0, 'real': 0, 'false_negative': 0} + ) + + logger.info("UpdateSourceStatsJob initialized") + + def run(self, pe) -> None: + """Process a single potential event.""" + # Check if this became a real event + db_event = eventdata.DBEvent.get_by_id(pe.fb_event_id) + + is_potential_event = pe.match_score > 0 + real_event = db_event is not None + false_negative = bool(db_event and not is_potential_event) + + # Aggregate for each source + for source_id in pe.source_ids_only(): + source_id_str = str(source_id) + self.source_counts[source_id_str]['all'] += 1 + if is_potential_event: + self.source_counts[source_id_str]['potential'] += 1 + if real_event: + self.source_counts[source_id_str]['real'] += 1 + if false_negative: + self.source_counts[source_id_str]['false_negative'] += 1 + + self.metrics.increment('events_processed') + + def teardown(self) -> None: + """Update all Source entities with aggregated counts.""" + logger.info(f"Updating {len(self.source_counts)} sources") + + sources_updated = 0 + sources_not_found = 0 + + for source_id, counts in self.source_counts.items(): + source = thing_db.Source.get_by_key_name(source_id) + if not source: + logger.debug(f"Source not found: {source_id}") + sources_not_found += 1 + continue + + if self.dry_run: + logger.debug( + f"[DRY RUN] Would update source {source_id}: " + f"all={counts['all']}, potential={counts['potential']}, " + f"real={counts['real']}, false_negative={counts['false_negative']}" + ) + else: + source.num_all_events = counts['all'] + source.num_potential_events = counts['potential'] + source.num_real_events = counts['real'] + source.num_false_negatives = counts['false_negative'] + source.put() + + sources_updated += 1 + + self.metrics.increment('sources_updated', sources_updated) + self.metrics.increment('sources_not_found', sources_not_found) + + if self.dry_run: + logger.info(f"[DRY RUN] Would update {sources_updated} sources") + else: + logger.info(f"Updated {sources_updated} sources") + + +def main(dry_run: bool = False, **kwargs) -> None: + """ + Main entry point for the update_source_stats job. 
+ + Args: + dry_run: If True, don't update sources + """ + logger.info("Starting update_source_stats job") + + job = UpdateSourceStatsJob(dry_run=dry_run) + set_current_metrics(job.metrics) + + runner = JobRunner(job) + runner.run_from_datastore( + entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', + ) + + job.metrics.log_summary() + + +if __name__ == '__main__': + main() diff --git a/server/dancedeets/logic/mr_dump.py b/server/dancedeets/logic/mr_dump.py deleted file mode 100644 index 5c265a431..000000000 --- a/server/dancedeets/logic/mr_dump.py +++ /dev/null @@ -1,48 +0,0 @@ -import csv -import io -import json -import logging - -from dancedeets import fb_api -from dancedeets.util import fb_mapreduce - - -def dump_fb_json(fbl, pe_list): - pe_list = [x for x in pe_list if x.match_score > 0] - if not pe_list: - return - - fbl.request_multi(fb_api.LookupEvent, [x.fb_event_id for x in pe_list]) - fbl.batch_fetch() - - csv_file = io.StringIO() - csv_writer = csv.writer(csv_file) - - for pe in pe_list: - try: - result = json.dumps(fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id)) - cache_key = fbl.key_to_cache_key(fb_api.generate_key(fb_api.LookupEvent, pe.fb_event_id)) - csv_writer.writerow([cache_key, result]) - except fb_api.NoFetchedDataException: - logging.error("skipping row for event id %s", pe.fb_event_id) - yield csv_file.getvalue() - - -map_dump_fb_json = fb_mapreduce.mr_wrap(dump_fb_json) - - -def mr_dump_events(fbl): - fb_mapreduce.start_map( - fbl, - 'Dump Potential FB Event Data', - 'dancedeets.logic.mr_dump.map_dump_fb_json', - 'dancedeets.event_scraper.potential_events.PotentialEvent', - handle_batch_size=80, - queue=None, - filters=[('looked_at', '=', None)], - output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter', - output_writer={ - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - ) diff --git a/server/dancedeets/logic/unique_attendees.py b/server/dancedeets/logic/unique_attendees.py deleted file mode 100644 index ca9e96bc4..000000000 --- a/server/dancedeets/logic/unique_attendees.py +++ /dev/null @@ -1,66 +0,0 @@ -import logging - -from dancedeets.compat.mapreduce import mapreduce_pipeline - -from dancedeets import app -from dancedeets import base_servlet -from dancedeets import fb_api -from dancedeets.util import fb_mapreduce - -BATCH_SIZE = 20 - - -def map_each_attendee(db_events): - db_events = [x for x in db_events if x.is_fb_event] - - fbl = fb_mapreduce.get_fblookup() - fbl.request_multi(fb_api.LookupEventAttending, [x.fb_event_id for x in db_events]) - fbl.batch_fetch() - - for db_event in db_events: - try: - fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, db_event.id) - except fb_api.NoFetchedDataException: - logging.warning('No attending found for %s', db_event.id) - continue - if fb_event_attending['empty']: - continue - for attendee in fb_event_attending['attending']['data']: - yield ('City: %s' % db_event.city_name, attendee['id']) - yield ('Country: %s' % db_event.country, attendee['id']) - - -def reduce_just_unique_attendees(location, all_attendees): - yield 'Unique Attendees in %s: %s\n' % (location, len(set(all_attendees))) - yield 'Total RSVPs in %s: %s\n' % (location, len(all_attendees)) - - -def mr_count_attendees_per_city(fbl): - mapper_params = { - 'entity_kind': 'dancedeets.events.eventdata.DBEvent', - 'handle_batch_size': BATCH_SIZE, - } - mapper_params.update(fb_mapreduce.get_fblookup_params(fbl, randomize_tokens=True)) - mrp = 
mapreduce_pipeline.MapreducePipeline( - 'unique_attendees', - 'dancedeets.logic.unique_attendees.map_each_attendee', - 'dancedeets.logic.unique_attendees.reduce_just_unique_attendees', - 'mapreduce.input_readers.DatastoreInputReader', - 'mapreduce.output_writers.GoogleCloudStorageOutputWriter', - mapper_params=mapper_params, - reducer_params={ - 'output_writer': { - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - }, - shards=8, - ) - mrp.start() - return mrp - - -@app.route('/tools/unique_attendees') -class ExportSourcesHandler(base_servlet.BaseTaskFacebookRequestHandler): - def get(self): - mr_count_attendees_per_city(self.fbl) diff --git a/server/dancedeets/ml/gprediction.py b/server/dancedeets/ml/gprediction.py index 1b89475bd..743568422 100644 --- a/server/dancedeets/ml/gprediction.py +++ b/server/dancedeets/ml/gprediction.py @@ -1,105 +1,86 @@ -import csv -import io +""" +Google Prediction API integration for event classification. + +The batch processing jobs have been migrated to Cloud Run Jobs. +See: +- dancedeets.jobs.generate_training_data +- dancedeets.jobs.classify_events_ml + +This module retains: +- get_predict_service(): Google Prediction API client +- get_training_features(): Feature extraction for ML +- predict(): Single event prediction +""" import logging import string -from dancedeets.events import eventdata from dancedeets.events import event_locations -from dancedeets import fb_api -from dancedeets.util import fb_mapreduce convert_chars = string.punctuation + '\r\n\t' trans = str.maketrans(convert_chars, ' ' * len(convert_chars)) def strip_punctuation(s): + """Remove punctuation from a string.""" return s.translate(trans) -def training_data_for_pevents(fbl, pevents): - fbl.allow_memcache_write = False # don't pollute memcache - fb_event_ids = [x.fb_event_id for x in pevents if x.looked_at] - fbl.request_multi(fb_api.LookupEvent, fb_event_ids) - fbl.request_multi(fb_api.LookupEventAttending, fb_event_ids) - fbl.batch_fetch() - - good_event_ids = [x.fb_event_id for x in eventdata.DBEvent.get_by_ids(fb_event_ids, keys_only=True) if x] - - csv_file = io.StringIO() - csv_writer = csv.writer(csv_file) - - for potential_event in pevents: - if not potential_event.looked_at: - continue - try: - good_event = potential_event.fb_event_id in good_event_ids and 'dance' or 'nodance' - - fb_event = fbl.fetched_data(fb_api.LookupEvent, potential_event.fb_event_id) - if fb_event['empty']: - continue - fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, potential_event.fb_event_id) - - training_features = get_training_features(potential_event, fb_event, fb_event_attending) - csv_writer.writerow([good_event] + list(training_features)) - except fb_api.NoFetchedDataException: - logging.info("No data fetched for event id %s", potential_event.fb_event_id) - yield csv_file.getvalue() - - -map_training_data_for_pevents = fb_mapreduce.mr_wrap(training_data_for_pevents) +def get_training_features(potential_event, fb_event, fb_event_attending): + """ + Extract training features from an event. 
+ Args: + potential_event: PotentialEvent instance + fb_event: Facebook event data + fb_event_attending: Facebook event attending data -def get_training_features(potential_event, fb_event, fb_event_attending): + Returns: + Tuple of feature values + """ if 'owner' in fb_event['info']: owner_name = 'id%s' % fb_event['info']['owner']['id'] else: owner_name = '' - location = event_locations.get_address_for_fb_event(fb_event).encode('utf-8') + location = event_locations.get_address_for_fb_event(fb_event) + if isinstance(location, str): + location = location.encode('utf-8') def strip_text(s): - return strip_punctuation(s.encode('utf8')).lower() + if isinstance(s, str): + s = s.encode('utf8') + return strip_punctuation(s.decode('utf8') if isinstance(s, bytes) else s).lower() name = strip_text(fb_event['info'].get('name', '')) description = strip_text(fb_event['info'].get('description', '')) - attendee_list = ' '.join(['id%s' % x['id'] for x in fb_event_attending['attending']['data']]) + attending_data = fb_event_attending.get('attending', {}).get('data', []) + attendee_list = ' '.join(['id%s' % x['id'] for x in attending_data]) source_list = ' '.join('id%s' % x.id for x in potential_event.source_ids_only()) - #TODO(lambert): maybe include number-of-keywords and keyword-density? - - #TODO(lambert): someday write this as a proper mapreduce that reduces across languages and builds a classifier model per language? - # for now we can just grep and build sub-models per-language on my client machine. + # Currently only returning attendee_list (other features commented out in original) return (attendee_list,) - return (potential_event.language, owner_name, location, name, description, attendee_list, source_list) - - -def mr_generate_training_data(fbl): - fb_mapreduce.start_map( - fbl=fbl, - name='Write Training Data', - handler_spec='dancedeets.ml.gprediction.map_training_data_for_pevents', - output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter', - handle_batch_size=20, - entity_kind='dancedeets.event_scraper.potential_events.PotentialEvent', - output_writer={ - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - queue=None, - ) + # Full features would be: + # return (potential_event.language, owner_name, location, name, description, attendee_list, source_list) MAGIC_USER_ID = '100529355548393795594' def get_predict_service(): - #TODO(lambert): we need to cache this somehow, if we use this, since it appears to not even use memcache for credentials. + """ + Get the Google Prediction API service client. + + Note: This uses OAuth credentials stored in Datastore. + """ + # TODO(lambert): we need to cache this somehow import httplib2 from apiclient.discovery import build from oauth2client import appengine - credentials = appengine.StorageByKeyName(appengine.CredentialsModel, MAGIC_USER_ID, 'credentials').get() + credentials = appengine.StorageByKeyName( + appengine.CredentialsModel, MAGIC_USER_ID, 'credentials' + ).get() http = credentials.authorize(httplib2.Http()) service = build("prediction", "v1.5", http=http) @@ -112,15 +93,46 @@ def get_predict_service(): def predict(potential_event, fb_event, fb_event_attending, service=None): - body = {'input': {'csvInstance': get_training_features(potential_event, fb_event, fb_event_attending)}} + """ + Predict whether an event is a dance event. 
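+
+    Queries two trained models (a dance-biased and a not-dance-biased model)
+    and returns the 'dance'-label score from each.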
+ + Args: + potential_event: PotentialEvent instance + fb_event: Facebook event data + fb_event_attending: Facebook event attending data + service: Optional prediction service (will be created if not provided) + + Returns: + Tuple of (dance_bias_score, not_dance_bias_score) + """ + body = { + 'input': { + 'csvInstance': get_training_features( + potential_event, fb_event, fb_event_attending + ) + } + } logging.info("Dance Data: %r", body) + service = service or get_predict_service() train = service.trainedmodels() + dance_bias_prediction = train.predict(body=body, id=DANCE_BIAS_MODEL_NAME).execute() - dance_bias_score = [x['score'] for x in dance_bias_prediction['outputMulti'] if x['label'] == 'dance'][0] - not_dance_bias_prediction = train.predict(body=body, id=NOT_DANCE_BIAS_MODEL_NAME).execute() - not_dance_bias_score = [x['score'] for x in not_dance_bias_prediction['outputMulti'] if x['label'] == 'dance'][0] + dance_bias_score = [ + x['score'] for x in dance_bias_prediction['outputMulti'] + if x['label'] == 'dance' + ][0] + + not_dance_bias_prediction = train.predict( + body=body, id=NOT_DANCE_BIAS_MODEL_NAME + ).execute() + not_dance_bias_score = [ + x['score'] for x in not_dance_bias_prediction['outputMulti'] + if x['label'] == 'dance' + ][0] + logging.info("Dance Result: %s", dance_bias_prediction) logging.info("NoDance Result: %s", not_dance_bias_prediction) logging.info("Dance Score: %s, NoDance Score: %s", dance_bias_score, not_dance_bias_score) + return dance_bias_score, not_dance_bias_score diff --git a/server/dancedeets/ml/mr_prediction.py b/server/dancedeets/ml/mr_prediction.py deleted file mode 100644 index 6cb85cec6..000000000 --- a/server/dancedeets/ml/mr_prediction.py +++ /dev/null @@ -1,56 +0,0 @@ -import logging - -from dancedeets import fb_api - -from dancedeets.event_scraper import potential_events -from . 
import gprediction -from dancedeets.util import fb_mapreduce - - -def classify_events(fbl, pe_list): - pe_list = [x for x in pe_list if x.match_score > 0] - if not pe_list: - return - predict_service = None - pe_ids = [x.fb_event_id for x in pe_list if not getattr(x, 'dance_bias_score')] - fbl.request_multi(fb_api.LookupEvent, pe_ids) - fbl.request_multi(fb_api.LookupEventAttending, pe_ids) - fbl.batch_fetch() - - results = [] - for pe in pe_list: - if not getattr(pe, 'dance_bias_score'): - try: - fb_event = fbl.fetched_data(fb_api.LookupEvent, pe.fb_event_id) - fb_event_attending = fbl.fetched_data(fb_api.LookupEventAttending, pe.fb_event_id) - except fb_api.NoFetchedDataException: - continue - if fb_event['empty']: - continue - predict_service = predict_service or gprediction.get_predict_service() - pe = potential_events.update_scores_for_potential_event(pe, fb_event, fb_event_attending, predict_service) - logging.info("%s has ms=%s, d=%s, nd=%s", pe.fb_event_id, pe.match_score, pe.dance_bias_score, pe.non_dance_bias_score) - if pe.dance_bias_score > 0.5 and pe.non_dance_bias_score > 0.5: - result = '%s:%s:%s:%s\n' % (pe.fb_event_id, pe.match_score, pe.dance_bias_score, pe.non_dance_bias_score) - results.append(result) - yield ''.join(results).encode('utf-8') - - -map_classify_events = fb_mapreduce.mr_wrap(classify_events) - - -def mr_classify_potential_events(fbl): - fb_mapreduce.start_map( - fbl, - 'Auto-Classify Events', - 'dancedeets.ml.mr_prediction.map_classify_events', - 'dancedeets.event_scraper.potential_events.PotentialEvent', - filters=[('looked_at', '=', None)], - handle_batch_size=20, - queue='slow-queue', - output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter', - output_writer={ - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - ) diff --git a/server/dancedeets/notifications/added_events.py b/server/dancedeets/notifications/added_events.py index 03c1351e1..92fe23fe8 100644 --- a/server/dancedeets/notifications/added_events.py +++ b/server/dancedeets/notifications/added_events.py @@ -1,9 +1,17 @@ +""" +User event notifications. + +The main batch processing has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.notify_users + +This module retains: +- promote_events_to_user(): Core notification logic (used by jobs and dev handler) +- /tasks/promote_new_events_to_user: Dev handler for testing single user +""" import datetime import logging import time -from dancedeets.compat.mapreduce import control - from dancedeets import app from dancedeets import base_servlet from dancedeets.loc import gmaps_api @@ -12,13 +20,10 @@ from dancedeets.search import search_base from dancedeets.users import users from . 
import android -""" -Runs a mapreduce hourly, which finds all users with that timezone offset, -and sends notifications about recently-aevents to those users -""" def get_time_offset(): + """Calculate timezone offset to target for 4pm local notifications.""" desired_hour = 16 # send new-event notifications at 4pm current_hour = datetime.datetime.now().hour # should be UTC hour offset = desired_hour - current_hour @@ -29,32 +34,7 @@ def get_time_offset(): return float(offset) -@app.route('/tasks/promote_new_events') -class RemindUserMapReduceHandler(base_servlet.BaseTaskRequestHandler): - def get(self): - if self.request.get('offset'): - offset = float(self.request.get('offset')) - else: - offset = get_time_offset() - string_offset = '%+03d00' % offset - logging.info("Got time offset %s for our run", string_offset) - # offset needs to be of type float, or this doesn't work - control.start_map( - name='Send New Events to Users in TZ%s' % string_offset, - reader_spec='mapreduce.input_readers.DatastoreInputReader', - handler_spec='dancedeets.notifications.added_events.promote_events_to_user', - mapper_parameters={ - 'entity_kind': 'dancedeets.users.users.User', - 'filters': [ - ('timezone_offset', '>=', offset), - ('timezone_offset', '<', offset + 1), - ], - }, - shard_count=1, - ) - - -# for development only, usually this will be called via mapreduce +# For development/testing only @app.route('/tasks/promote_new_events_to_user') class RemindUserHandler(base_servlet.BaseTaskRequestHandler): def get(self): @@ -64,12 +44,18 @@ def get(self): def promote_events_to_user(user): + """ + Send push notifications about new events to a user. + + This is the core notification logic used by both: + - Cloud Run Job: dancedeets.jobs.notify_users + - Dev handler: /tasks/promote_new_events_to_user + """ # TODO: Adjust when we have iphone notifications if not android.can_notify(user): return logging.info("Promoting new events to user %s", user.fb_uid) - # Only send notifications for Mike for now user = users.User.get_by_id(user.fb_uid) if not user: logging.error("No user found: %s", user.fb_uid) @@ -89,17 +75,29 @@ def promote_events_to_user(user): if not geocode: return None bounds = math.expand_bounds(geocode.latlng_bounds(), distance_in_km) - query = search_base.SearchQuery(time_period=search_base.TIME_UPCOMING, bounds=bounds, min_attendees=min_attendees) + query = search_base.SearchQuery( + time_period=search_base.TIME_UPCOMING, + bounds=bounds, + min_attendees=min_attendees + ) - one_day_ago = time.mktime((datetime.datetime.now() - datetime.timedelta(hours=24)).timetuple()) + one_day_ago = time.mktime( + (datetime.datetime.now() - datetime.timedelta(hours=24)).timetuple() + ) search_query = search.Search(query) search_query.extra_fields = ['creation_time'] search_results = search_query._get_candidate_doc_events() # TODO: can we move this filter into the search query itself?? 
- recent_events = [x.doc_id for x in search_results if x.field('creation_time').value > one_day_ago] - - logging.info("Found %s search_results, %s new events", len(search_results), len(recent_events)) + recent_events = [ + x.doc_id for x in search_results + if x.field('creation_time').value > one_day_ago + ] + + logging.info( + "Found %s search_results, %s new events", + len(search_results), len(recent_events) + ) for event_id in recent_events: if android.add_notify(user, event_id): logging.info("Sent notification!") diff --git a/server/dancedeets/pubsub/pubsub_tasks.py b/server/dancedeets/pubsub/pubsub_tasks.py index f922c4011..412bf1c4f 100644 --- a/server/dancedeets/pubsub/pubsub_tasks.py +++ b/server/dancedeets/pubsub/pubsub_tasks.py @@ -1,5 +1,14 @@ -from dancedeets.compat.mapreduce import control +""" +Social publishing task handlers. +The batch posting of Japan events has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.post_japan_events + +This module retains: +- SocialPublisherHandler: Pulls and publishes events from pubsub queue +- WeeklyEventsPostHandler: Posts weekly events for top US cities +- EventNotificationsHandler: Prepares event reminder notifications +""" import datetime from dancedeets import app @@ -8,7 +17,6 @@ from dancedeets.rankings import cities_db from dancedeets.search import search_base from dancedeets.search import search -from dancedeets.util import dates from . import pubsub @@ -18,36 +26,6 @@ def get(self): pubsub.pull_and_publish_event() -def yield_post_jp_event(db_events): - from dancedeets.compat.mapreduce import context - ctx = context.get() - params = ctx.mapreduce_spec.mapper.params - token_nickname = params.get('token_nickname') - db_events = [x for x in db_events if x.actual_city_name and x.actual_city_name.endswith('Japan')] - for db_event in db_events: - pubsub.eventually_publish_event(db_event.id, token_nickname) - - -@app.route('/tasks/post_japan_events') -class PostJapanEventsHandler(base_servlet.BaseTaskFacebookRequestHandler): - def get(self): - token_nickname = self.request.get('token_nickname', None) - mapper_params = { - 'entity_kind': 'dancedeets.events.eventdata.DBEvent', - 'handle_batch_size': 20, - 'filters': [('search_time_period', '=', dates.TIME_FUTURE)], - 'token_nickname': token_nickname, - } - control.start_map( - name='Post Future Japan Events', - reader_spec='mapreduce.input_readers.DatastoreInputReader', - handler_spec='dancedeets.pubsub.pubsub_tasks.map_post_jp_event', - shard_count=8, # since we want to stick it in the slow-queue, and don't care how fast it executes - queue_name='fast-queue', - mapper_parameters=mapper_params, - ) - - def blacklisted(city): if city.country_name == 'US' and city.state_name == 'NY' and city.city_name in [ 'Brooklyn', 'Borough of Queens', 'Manhattan', 'The Bronx' diff --git a/server/dancedeets/rankings/rankings.py b/server/dancedeets/rankings/rankings.py index 03361797c..a4465cd13 100644 --- a/server/dancedeets/rankings/rankings.py +++ b/server/dancedeets/rankings/rankings.py @@ -1,20 +1,20 @@ -import datetime - -from dancedeets.util import memcache +""" +City/country rankings utilities. -# Note: MapReduce is no longer available in App Engine Flexible. -# These imports are kept for reference but the functions won't work. -# Use Cloud Dataflow for batch processing. +The batch ranking computation has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.compute_rankings -from dancedeets.loc import gmaps_api -from . 
import cities_db +This module retains: +- TIME_PERIODS and constants for display +- retrieve_summary(): Get cached ranking totals +- compute_city_template_rankings(): Format rankings for templates +""" +import datetime -EVENT_FOR_CITY_RANKING = "CITY_EVENT_RANKING" -USER_FOR_CITY_RANKING = "CITY_USER_RANKING" +from dancedeets.util import memcache -# location is a city in cities/state/country -# time_period is one of ALL_TIME, LAST_MONTH, LAST_WEEK +# Time period constants LAST_WEEK = "LAST_WEEK" LAST_MONTH = "LAST_MONTH" ALL_TIME = "ALL_TIME" @@ -33,6 +33,7 @@ def get_time_periods(timestamp): + """Get applicable time periods for a given timestamp.""" if timestamp > datetime.datetime.now() - datetime.timedelta(days=7): yield LAST_WEEK if timestamp > datetime.datetime.now() - datetime.timedelta(days=31): @@ -40,168 +41,29 @@ def get_time_periods(timestamp): yield ALL_TIME -def make_key_name(key_name, **kwargs): - return "%s/%s" % ( - key_name, - "/".join("%s=%s" % (k, v) for (k, v) in sorted(kwargs.items())), - ) - - -def count_event_for_city(dbevent): - if not dbevent.start_time: # deleted event, don't count - return - if not dbevent.latitude or not dbevent.longitude: # no-location event, don't count - return - city = dbevent.city_name - for time_period in get_time_periods(dbevent.creation_time or dbevent.start_time): - yield op.counters.Increment( - make_key_name("City", city=city, time_period=time_period) - ) - yield op.counters.Increment( - make_key_name("Country", country=dbevent.country, time_period=time_period) - ) - - -def count_user_for_city(user): - user_city = user.city_name - for time_period in get_time_periods(user.creation_time): - yield op.counters.Increment( - make_key_name("City", city=user_city, time_period=time_period) - ) - - -def begin_event_ranking_calculations(vertical): - filters = [("verticals", "=", vertical)] - - control.start_map( - name="Compute City Rankings by %s Events" % vertical, - reader_spec="mapreduce.input_readers.DatastoreInputReader", - handler_spec="dancedeets.rankings.rankings.count_event_for_city", - mapper_parameters={ - "entity_kind": "dancedeets.events.eventdata.DBEvent", - "filters": filters, - }, - queue_name="fast-queue", - shard_count=16, - _app=_get_app_id(EVENT_FOR_CITY_RANKING, vertical), - ) - _compute_summary(expiry=5 * 60) # 5 minutes - - -def begin_user_ranking_calculations(): - control.start_map( - name="Compute City Rankings by Users", - reader_spec="mapreduce.input_readers.DatastoreInputReader", - handler_spec="dancedeets.rankings.rankings.count_user_for_city", - mapper_parameters={"entity_kind": "dancedeets.users.users.User"}, - queue_name="fast-queue", - shard_count=16, - _app=USER_FOR_CITY_RANKING, - ) - _compute_summary(expiry=5 * 60) # 5 minutes - - TOTALS_KEY = "StatTotals" TOTALS_EXPIRY = 6 * 3600 def retrieve_summary(): + """ + Retrieve cached ranking summary. + + Returns cached totals or empty dict if not available. 
+ Rankings are computed by the Cloud Run Job: dancedeets.jobs.compute_rankings + """ totals = memcache.get(TOTALS_KEY) if not totals: - totals = _compute_summary() - return totals - - -def _get_app_id(app_name, vertical): - return "%s:%s" % (app_name, vertical) - - -def _compute_summary(expiry=TOTALS_EXPIRY): - # TODO: make this handle non-street events better - vertical = "STREET" - - # IN PROGRESS - event_rankings = get_city_by_event_rankings(vertical) - if event_rankings: - total_events = _compute_sum(event_rankings, ALL_TIME) - else: - total_events = 0 - user_rankings = get_city_by_user_rankings() - if user_rankings: - total_users = _compute_sum(user_rankings, ALL_TIME) - else: - total_users = 0 - - # save - totals = dict(total_events=total_events, total_users=total_users) - memcache.set(TOTALS_KEY, totals, expiry) - + # Rankings not yet computed - return empty totals + totals = dict(total_events=0, total_users=0) return totals -def _parse_key_name(full_key_name): - if "/" not in full_key_name: - return None, {} - key_name, kwargs_string = full_key_name.split("/", 1) - try: - kwargs = dict(kv.split("=") for kv in kwargs_string.split("/")) - except ValueError: - return None, {} - return key_name, kwargs - - -def _get_counter_map_for_ranking(ranking): - # MapReduce is not available in App Engine Flexible Environment. - # This function would have queried mapreduce.model.MapreduceState, - # but that API is not available. Return None to indicate no rankings data. - # TODO: Implement using Cloud Dataflow or BigQuery for batch processing. - return None - - -def _group_cities_time_period(final_counter_map): - cities = {} - for k, counter in final_counter_map.items(): - prefix, kwargs = _parse_key_name(k) - if prefix != "City": - continue - cities.setdefault(kwargs["city"], {})[kwargs["time_period"]] = counter - return cities - - -def _group_users_time_period(final_counter_map, city): - users = {} - for k, counter in final_counter_map.items(): - prefix, kwargs = _parse_key_name(k) - if prefix != "User": - continue - if city and kwargs["city"] != city: - continue - users.setdefault(kwargs["user"], {})[kwargs["time_period"]] = counter - return users - - -def get_city_by_event_rankings(vertical): - final_counter_map = _get_counter_map_for_ranking( - _get_app_id(EVENT_FOR_CITY_RANKING, vertical) - ) - if not final_counter_map: - return {} - cities = _group_cities_time_period(final_counter_map) - return cities - - -def get_city_by_user_rankings(): - final_counter_map = _get_counter_map_for_ranking(USER_FOR_CITY_RANKING) - if not final_counter_map: - return {} - cities = _group_cities_time_period(final_counter_map) - return cities - - def _compute_sum(all_rankings, time_period): + """Compute total count across all cities for a time period.""" total_count = 0 for city, times in all_rankings.items(): - count = times.get(time_period, {}) + count = times.get(time_period, 0) total_count += count return total_count @@ -209,11 +71,23 @@ def _compute_sum(all_rankings, time_period): def compute_city_template_rankings( all_rankings, time_period, vertical=None, use_url=True ): + """ + Format city rankings for template display. 
+ + Args: + all_rankings: Dict of city -> time_period -> count + time_period: Which time period to display + vertical: Event vertical for admin URLs + use_url: Whether to include URLs in output + + Returns: + List of dicts with city, count, and url + """ city_ranking = [] for city, times in all_rankings.items(): if city == "Unknown": continue - count = times.get(time_period, {}) + count = times.get(time_period, 0) if count: if use_url == "ADMIN": url = "/tools/recent_events?vertical=%s&city=%s" % (vertical, city) diff --git a/server/dancedeets/search/email_events.py b/server/dancedeets/search/email_events.py index 7381ec0d1..a7dec60d2 100644 --- a/server/dancedeets/search/email_events.py +++ b/server/dancedeets/search/email_events.py @@ -1,3 +1,14 @@ +""" +Weekly email functionality. + +The batch sending has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.send_weekly_emails + +This module retains: +- email_for_user: Core function to generate and send email for a user +- yield_email_user: Wrapper that handles FB token and error handling +- DisplayEmailHandler: Admin tool to preview weekly emails +""" import datetime import logging import random @@ -13,7 +24,6 @@ from dancedeets.logic import mobile from dancedeets.mail import mandrill_api from dancedeets.users import users -from dancedeets.util import fb_mapreduce from . import search_base from . import search @@ -178,17 +188,3 @@ def yield_email_user(fbl, user): except Exception as e: logging.exception("Error sending email for user %s", user.fb_uid) return None - - -map_email_user = fb_mapreduce.mr_user_wrap(yield_email_user) -email_user = fb_mapreduce.nomr_wrap(yield_email_user) - - -def mr_email_user(fbl): - fb_mapreduce.start_map( - fbl=fbl, - name='Email Users', - #TODO: MOVE - handler_spec='dancedeets.search.email_events.map_email_user', - entity_kind='dancedeets.users.users.User', - ) diff --git a/server/dancedeets/sitemaps/events.py b/server/dancedeets/sitemaps/events.py index c0eb0e314..2bfe29212 100644 --- a/server/dancedeets/sitemaps/events.py +++ b/server/dancedeets/sitemaps/events.py @@ -1,108 +1,76 @@ +""" +Sitemap generation utilities. + +The main batch processing has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.generate_sitemaps + +This module retains the sitemap entry generation helper for use by the job. 
+""" import datetime from lxml import etree import logging -# local -from dancedeets import app -from dancedeets import base_servlet -from dancedeets.util import fb_mapreduce from dancedeets.util import urls -def yield_sitemap_event(fbl, all_events): - # Don't really need fbl, but makes everything easier - - for event in all_events: - if not event.has_content(): - continue - - url_node = etree.Element('url') - loc_node = etree.Element('loc') - loc_node.text = urls.dd_event_url(event) - if event.is_fb_event: - if 'updated_time' in event.fb_event['info']: - lastmod_node = etree.Element('lastmod') - updated = event.fb_event['info']['updated_time'] - updated = updated.replace('+0000', '+00:00') - lastmod_node.text = updated - url_node.append(lastmod_node) - else: - logging.info('Event %s does not have updated_time: %s' % (event.id, event.fb_event)) - changefreq_node = etree.Element('changefreq') - priority_node = etree.Element('priority') - - if event.end_time: - end_time = event.end_time - else: - end_time = event.start_time + datetime.timedelta(hours=2) - - start_time_delta = event.start_time - datetime.datetime.now() - end_time_delta = end_time - datetime.datetime.now() - event_delta = end_time - event.start_time - - priority_node.text = '0.5' - - # Event is active and not a multi-week event: - if event_delta.days < 7 and start_time_delta.days <= 1 and end_time_delta.days >= 0: - changefreq_node.text = 'hourly' - - # If it ended awhile ago - elif end_time_delta.days < -30: - changefreq_node.text = 'yearly' - priority_node.text = '0.1' - elif end_time_delta.days < -10: - changefreq_node.text = 'weekly' +def generate_sitemap_entry(event): + """ + Generate a sitemap XML entry for a single event. - # If it's coming up soon - elif start_time_delta.days < 30: - changefreq_node.text = 'daily' + Args: + event: DBEvent instance - else: - changefreq_node.text = 'weekly' - - url_node.append(loc_node) - url_node.append(changefreq_node) - url_node.append(priority_node) - # prints out as one line - yield '%s\n' % etree.tostring(url_node) - - -map_sitemap_event = fb_mapreduce.mr_wrap(yield_sitemap_event) -sitemap_event = fb_mapreduce.nomr_wrap(yield_sitemap_event) + Returns: + XML string for the URL entry, or None if event should be skipped + """ + if not event.has_content(): + return None + url_node = etree.Element('url') + loc_node = etree.Element('loc') + loc_node.text = urls.dd_event_url(event) -@app.route('/tasks/generate_sitemaps') -class ReloadEventsHandler(base_servlet.BaseTaskFacebookRequestHandler): - def get(self): - queue = self.request.get('queue', 'fast-queue') - time_period = self.request.get('time_period', None) - vertical = self.request.get('vertical', None) - - filters = [] - if vertical: - filters.append(('verticals', '=', vertical)) - vertical_string = '%s ' % vertical - else: - vertical_string = '' - - if time_period: - filters.append(('search_time_period', '=', time_period)) - name = 'Generate %s %sSitemaps' % (time_period, vertical_string) + if event.is_fb_event: + if 'updated_time' in event.fb_event.get('info', {}): + lastmod_node = etree.Element('lastmod') + updated = event.fb_event['info']['updated_time'] + updated = updated.replace('+0000', '+00:00') + lastmod_node.text = updated + url_node.append(lastmod_node) else: - name = 'Generate %sSitemaps' % vertical_string - fb_mapreduce.start_map( - fbl=self.fbl, - name=name, - handler_spec='dancedeets.sitemaps.events.map_sitemap_event', - entity_kind='dancedeets.events.eventdata.DBEvent', - handle_batch_size=20, - filters=filters, - 
queue=queue, - output_writer_spec='mapreduce.output_writers.GoogleCloudStorageOutputWriter', - output_writer={ - 'mime_type': 'text/plain', - 'bucket_name': 'dancedeets-hrd.appspot.com', - }, - ) - - post = get + logging.debug('Event %s does not have updated_time', event.id) + + changefreq_node = etree.Element('changefreq') + priority_node = etree.Element('priority') + + if event.end_time: + end_time = event.end_time + else: + end_time = event.start_time + datetime.timedelta(hours=2) + + start_time_delta = event.start_time - datetime.datetime.now() + end_time_delta = end_time - datetime.datetime.now() + event_delta = end_time - event.start_time + + priority_node.text = '0.5' + + # Event is active and not a multi-week event: + if event_delta.days < 7 and start_time_delta.days <= 1 and end_time_delta.days >= 0: + changefreq_node.text = 'hourly' + # If it ended awhile ago + elif end_time_delta.days < -30: + changefreq_node.text = 'yearly' + priority_node.text = '0.1' + elif end_time_delta.days < -10: + changefreq_node.text = 'weekly' + # If it's coming up soon + elif start_time_delta.days < 30: + changefreq_node.text = 'daily' + else: + changefreq_node.text = 'weekly' + + url_node.append(loc_node) + url_node.append(changefreq_node) + url_node.append(priority_node) + + return etree.tostring(url_node, encoding='unicode') diff --git a/server/dancedeets/users/user_event_tasks.py b/server/dancedeets/users/user_event_tasks.py index dc0945550..b3aa51d4d 100644 --- a/server/dancedeets/users/user_event_tasks.py +++ b/server/dancedeets/users/user_event_tasks.py @@ -1,49 +1,58 @@ -from dancedeets.compat.mapreduce import control +""" +User event statistics utilities. -from dancedeets import app -from dancedeets import base_servlet +The batch processing has been migrated to Cloud Run Jobs. +See: dancedeets.jobs.compute_user_stats + +This module retains the core update_user_qualities function for reuse. +""" from dancedeets.events import eventdata from dancedeets.event_scraper import potential_events def update_user_qualities(user): - #STR_ID_MIGRATE - source_potential_events = potential_events.PotentialEvent.gql('WHERE source_ids = :graph_id', graph_id=long(user.fb_uid)).fetch(1000) - added_events = eventdata.DBEvent.get_by_ids([x.fb_event_id for x in source_potential_events]) + """ + Calculate and update user event contribution statistics. 
+ + Counts: + - Auto-added events (via ML classifier) + - Auto-added own events (user is the event owner) + - Hand-added events (manually added by user) + - Hand-added own events (user is both creator and owner) + """ + # STR_ID_MIGRATE + try: + fb_uid_long = int(user.fb_uid) + except (ValueError, TypeError): + fb_uid_long = user.fb_uid + + source_potential_events = potential_events.PotentialEvent.gql( + 'WHERE source_ids = :graph_id', graph_id=fb_uid_long + ).fetch(1000) + + added_events = eventdata.DBEvent.get_by_ids( + [x.fb_event_id for x in source_potential_events] + ) user.num_auto_added_events = len([ - x for x in added_events if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] + x for x in added_events + if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] ]) user.num_auto_added_own_events = len([ x for x in added_events - if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] and x.owner_fb_uid == user.fb_uid + if x and x.creating_method in [eventdata.CM_AUTO, eventdata.CM_AUTO_ATTENDEE] + and x.owner_fb_uid == user.fb_uid ]) - #STR_ID_MIGRATE + # STR_ID_MIGRATE user.num_hand_added_events = len([ - x for x in added_events if x and x.creating_method == eventdata.CM_USER and str(x.creating_fb_uid) == user.fb_uid + x for x in added_events + if x and x.creating_method == eventdata.CM_USER + and str(x.creating_fb_uid) == user.fb_uid ]) - #STR_ID_MIGRATE + # STR_ID_MIGRATE user.num_hand_added_own_events = len([ x for x in added_events - if x and x.creating_method == eventdata.CM_USER and str(x.creating_fb_uid) == user.fb_uid and x.owner_fb_uid == user.fb_uid + if x and x.creating_method == eventdata.CM_USER + and str(x.creating_fb_uid) == user.fb_uid + and x.owner_fb_uid == user.fb_uid ]) - - -def map_compute_user_stats(user): - update_user_qualities(user) - user.put() - - -@app.route('/tasks/recompute_user_stats') -class RecomputeUserStatsHandler(base_servlet.BaseTaskRequestHandler): - def get(self): - control.start_map( - name='Compute User-Event Stats', - reader_spec='mapreduce.input_readers.DatastoreInputReader', - handler_spec='dancedeets.users.user_event_tasks.map_compute_user_stats', - mapper_parameters={'entity_kind': 'dancedeets.users.users.User'}, - queue_name='fast-queue', - shard_count=5, - ) - - post = get diff --git a/server/dancedeets/users/user_tasks.py b/server/dancedeets/users/user_tasks.py index c778a9aab..cf99206ac 100644 --- a/server/dancedeets/users/user_tasks.py +++ b/server/dancedeets/users/user_tasks.py @@ -1,14 +1,25 @@ +""" +User management tasks. + +The batch user refresh has been migrated to Cloud Run Jobs. 
+See: dancedeets.jobs.refresh_users + +This module retains: +- LookupAppFriendUsers: FB API lookup type for friend tracking +- TrackNewUserFriendsHandler: Handler for tracking new user friends +- LoadUserHandler: Handler for loading specific users +- fetch_and_save_fb_user: Core function for FB user refresh +""" import logging from dancedeets import app from dancedeets import base_servlet from dancedeets import fb_api -from dancedeets.mail import mailchimp_api -from dancedeets.util import fb_mapreduce from dancedeets.users import users class LookupAppFriendUsers(fb_api.LookupType): + """FB API lookup type for getting app friends.""" @classmethod def get_lookups(cls, object_id): return [('info', cls.url('%s/friends' % object_id))] @@ -16,6 +27,7 @@ def get_lookups(cls, object_id): @app.route('/tasks/track_newuser_friends') class TrackNewUserFriendsHandler(base_servlet.BaseTaskFacebookRequestHandler): + """Track friends for newly registered users.""" def get(self): key = fb_api.generate_key(LookupAppFriendUsers, self.fb_uid) fb_result = self.fbl.fb.fetch_keys([key]) @@ -30,53 +42,23 @@ def get(self): @app.route('/tasks/load_users') class LoadUserHandler(base_servlet.UserOperationHandler): + """Load specific users from Facebook.""" user_operation = lambda self, fbl, load_users: [load_fb_user(fbl, x) for x in load_users] -@app.route('/tasks/reload_all_users') -class ReloadAllUsersHandler(base_servlet.BaseTaskFacebookRequestHandler): - def get(self): - all_users = self.request.get('all_users', '0') == '1' - if all_users: - filters = [] - else: - filters = [('expired_oauth_token', '=', False)] - # this calls a map function wrapped by mr_user_wrap, so it works correctly on a per-user basis - mailchimp_list_id = mailchimp_api.get_list_id() - fb_mapreduce.start_map( - fbl=self.fbl, - name='Load %sUsers' % ('All ' if all_users else ''), - handler_spec='dancedeets.users.user_tasks.map_load_fb_user', - entity_kind='dancedeets.users.users.User', - filters=filters, - extra_mapper_params={ - 'mailchimp_list_id': mailchimp_list_id, - }, - queue='fast-queue' - ) - - post = get - - -def yield_load_fb_user(fbl, user): - if user.expired_oauth_token: - logging.info('Skipping user %s (%s) due to expired access_token', user.fb_uid, user.full_name) - user.put() - elif not fbl.access_token: - logging.info('Skipping user %s (%s) due to not having an access_token', user.fb_uid, user.full_name) - user.put() - else: - fetch_and_save_fb_user(fbl, user) - # The above function calls user.put(), so no need for: - # users.update_mailchimp(user) - - def fetch_and_save_fb_user(fbl, user): + """ + Fetch user data from Facebook and save to Datastore. 
+ + This is the core function used by both: + - Cloud Run Job: dancedeets.jobs.refresh_users + - LoadUserHandler for individual user loading + """ try: fb_user = fbl.get(fb_api.LookupUser, user.fb_uid) except fb_api.ExpiredOAuthToken as e: logging.info('Auth token now expired, mark as such: %s', e) - user.expired_oauth_token_reason = e.args[0] + user.expired_oauth_token_reason = e.args[0] if e.args else "Unknown" user.expired_oauth_token = True user.put() return @@ -85,5 +67,13 @@ def fetch_and_save_fb_user(fbl, user): user.put() -map_load_fb_user = fb_mapreduce.mr_user_wrap(yield_load_fb_user) -load_fb_user = fb_mapreduce.nomr_wrap(yield_load_fb_user) +def load_fb_user(fbl, user): + """Load and save a single user (wrapper for non-mapreduce context).""" + if user.expired_oauth_token: + logging.info('Skipping user %s (%s) due to expired access_token', user.fb_uid, user.full_name) + user.put() + elif not fbl.access_token: + logging.info('Skipping user %s (%s) due to not having an access_token', user.fb_uid, user.full_name) + user.put() + else: + fetch_and_save_fb_user(fbl, user) diff --git a/server/requirements-jobs.txt b/server/requirements-jobs.txt new file mode 100644 index 000000000..f4c12ec58 --- /dev/null +++ b/server/requirements-jobs.txt @@ -0,0 +1,22 @@ +# Requirements for Cloud Run Jobs +# Minimal dependencies for batch processing + +# Google Cloud clients +google-cloud-datastore>=2.15.0 +google-cloud-storage>=2.10.0 +google-cloud-tasks>=2.14.0 + +# Optional: Cloud Monitoring for metrics export +# google-cloud-monitoring>=2.16.0 + +# HTTP requests +requests>=2.31.0 + +# XML processing (for sitemaps) +lxml>=4.9.0 + +# Date/time utilities +python-dateutil>=2.8.2 + +# Environment variables +python-dotenv>=1.0.0 diff --git a/server/workflows/crawl_and_index_classes.yaml b/server/workflows/crawl_and_index_classes.yaml new file mode 100644 index 000000000..e0b1f665e --- /dev/null +++ b/server/workflows/crawl_and_index_classes.yaml @@ -0,0 +1,104 @@ +# Cloud Workflow: Crawl and Index Dance Classes +# +# Migrated from: dancedeets/classes/class_pipeline.py +# +# This workflow orchestrates the dance class scraping pipeline: +# 1. Start spider jobs on ScrapingHub +# 2. Wait for spiders to complete +# 3. Reindex classes in search +# 4. 
Email any crawl errors
+#
+# Usage:
+#   gcloud workflows run crawl-and-index-classes
+
+main:
+  params: [args]
+  steps:
+    - init:
+        assign:
+          - project_id: ${sys.get_env("GOOGLE_CLOUD_PROJECT")}
+          - region: "us-central1"
+          - run_time: ${time.format(sys.now())}
+
+    - start_spiders:
+        call: http.post
+        args:
+          url: ${"https://" + region + "-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/" + project_id + "/jobs/start-spiders:run"}
+          auth:
+            type: OIDC
+        result: spider_result
+
+    - get_job_keys:
+        assign:
+          - job_keys: ${spider_result.body.jobKeys}
+
+    - wait_for_spiders:
+        call: wait_for_completion
+        args:
+          job_keys: ${job_keys}
+          max_attempts: 60
+          delay_seconds: 30
+        result: jobs_completed
+
+    - parallel_finalize:
+        parallel:
+          branches:
+            - reindex:
+                steps:
+                  - run_reindex:
+                      call: http.post
+                      args:
+                        url: ${"https://" + region + "-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/" + project_id + "/jobs/reindex-classes:run"}
+                        auth:
+                          type: OIDC
+            - email_errors:
+                steps:
+                  - run_email_errors:
+                      call: http.post
+                      args:
+                        url: ${"https://" + region + "-run.googleapis.com/apis/run.googleapis.com/v1/namespaces/" + project_id + "/jobs/email-crawl-errors:run"}
+                        body:
+                          run_time: ${run_time}
+                          job_keys: ${job_keys}
+                        auth:
+                          type: OIDC
+
+    - return_result:
+        return:
+          status: "completed"
+          run_time: ${run_time}
+          jobs_completed: ${jobs_completed}
+
+# Subworkflow: Wait for ScrapingHub jobs to complete
+wait_for_completion:
+  params: [job_keys, max_attempts, delay_seconds]
+  steps:
+    - init_wait:
+        assign:
+          - attempt: 0
+
+    - check_jobs:
+        call: http.get
+        args:
+          url: "https://app.scrapinghub.com/api/jobs/list.json"
+          query:
+            job: ${job_keys}
+          auth:
+            type: OIDC
+        result: jobs_status
+
+    # Workflows expressions do not support comprehensions, so tally unfinished
+    # jobs with an explicit loop before evaluating completion.
+    - count_unfinished:
+        assign:
+          - unfinished: 0
+    - tally_job_states:
+        for:
+          value: job
+          in: ${jobs_status.body.jobs}
+          steps:
+            - check_state:
+                switch:
+                  - condition: ${job.state != "finished"}
+                    steps:
+                      - mark_unfinished:
+                          assign:
+                            - unfinished: ${unfinished + 1}
+
+    - evaluate_status:
+        switch:
+          - condition: ${unfinished == 0}
+            next: return_success
+          - condition: ${attempt >= max_attempts}
+            raise: "Timeout waiting for spider jobs to complete"
+
+    - increment_attempt:
+        assign:
+          - attempt: ${attempt + 1}
+
+    - wait:
+        call: sys.sleep
+        args:
+          seconds: ${delay_seconds}
+        next: check_jobs
+
+    - return_success:
+        return: true
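Note: the `dancedeets.jobs` framework that these jobs depend on (`base.py`, `runner.py`) is not part of this diff. As a rough, illustrative sketch of the pattern it presumably follows — not the actual `jobs/base.py` API — a Cloud Run Jobs task can shard a Datastore scan across parallel tasks using the standard `CLOUD_RUN_TASK_INDEX` / `CLOUD_RUN_TASK_COUNT` environment variables, which is what replaces the `shards=`/`shard_count=` settings in the removed mapreduce calls. The entity kind and `process_entity()` hook below are placeholders.

```python
# Illustrative sketch only: a Cloud Run Jobs task that scans a Datastore kind
# in parallel shards. The real dancedeets.jobs.base API is not shown in this
# diff; the kind name and process_entity() below are placeholders.
import os

from google.cloud import datastore


def process_entity(entity):
    """Placeholder for per-entity work (e.g. sending a notification)."""
    print(entity.key)


def run_task(kind='User'):
    # Cloud Run Jobs sets these for each parallel task in a job execution.
    task_index = int(os.environ.get('CLOUD_RUN_TASK_INDEX', '0'))
    task_count = int(os.environ.get('CLOUD_RUN_TASK_COUNT', '1'))

    client = datastore.Client()
    query = client.query(kind=kind)

    processed = 0
    for i, entity in enumerate(query.fetch()):
        # Simple modulo sharding; a production job might split by key range
        # instead so each task only reads its own slice of the kind.
        if i % task_count == task_index:
            process_entity(entity)
            processed += 1
    print('Task %s/%s processed %s entities' % (task_index, task_count, processed))


if __name__ == '__main__':
    run_task()
```

Each parallel task then handles only its share of entities, giving the same fan-out the removed `DatastoreInputReader` mappers provided.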