diff --git a/backend/api/content_planning/services/content_strategy/autofill/normalizers/analytics_normalizer.py b/backend/api/content_planning/services/content_strategy/autofill/normalizers/analytics_normalizer.py
index 9989b85f..d4ec34ea 100644
--- a/backend/api/content_planning/services/content_strategy/autofill/normalizers/analytics_normalizer.py
+++ b/backend/api/content_planning/services/content_strategy/autofill/normalizers/analytics_normalizer.py
@@ -21,6 +21,7 @@ async def normalize_gsc_analytics(gsc_data: Dict[str, Any]) -> Dict[str, Any]:
     # Extract metrics from GSC data
     metrics = gsc_data.get('metrics', {})
     data = gsc_data.get('data', {})
+    query_page_opportunities = data.get('query_page_opportunities', []) or metrics.get('query_page_opportunities', [])
 
     normalized = {
         'traffic_metrics': {
@@ -31,6 +32,7 @@ async def normalize_gsc_analytics(gsc_data: Dict[str, Any]) -> Dict[str, Any]:
         },
         'top_queries': data.get('top_queries', []) or metrics.get('top_queries', []),
         'top_pages': data.get('top_pages', []) or metrics.get('top_pages', []),
+        'query_page_opportunities': query_page_opportunities,
         'traffic_sources': {
             'organic_search': {
                 'clicks': metrics.get('total_clicks', 0) or data.get('clicks', 0),
diff --git a/backend/services/analytics/handlers/gsc_handler.py b/backend/services/analytics/handlers/gsc_handler.py
index 8c7c1f0f..5be22b10 100644
--- a/backend/services/analytics/handlers/gsc_handler.py
+++ b/backend/services/analytics/handlers/gsc_handler.py
@@ -286,6 +286,40 @@ def _process_gsc_metrics(self, search_analytics: Dict[str, Any]) -> Dict[str, Any]:
         except Exception as e:
             logger.warning(f"Failed processing top_pages: {e}")
 
+        # Prepare query-page opportunities for refresh-vs-new decisions
+        query_page_opportunities = []
+        try:
+            qp_rows = search_analytics.get('query_page_data', {}).get('rows', [])
+            if qp_rows:
+                sorted_qp_rows = sorted(
+                    qp_rows,
+                    key=lambda x: (x.get('impressions', 0) or 0, x.get('clicks', 0) or 0),
+                    reverse=True,
+                )[:100]
+                for row in sorted_qp_rows:
+                    keys = row.get('keys', [])
+                    if not keys or len(keys) < 2:
+                        continue
+                    query_key = keys[0]['keys'][0] if isinstance(keys[0], dict) else str(keys[0])
+                    page_key = keys[1]['keys'][0] if isinstance(keys[1], dict) else str(keys[1])
+                    clicks_val = row.get('clicks', 0) or 0
+                    impr_val = row.get('impressions', 0) or 0
+                    raw_ctr = row.get('ctr', None)
+                    if raw_ctr is not None:
+                        ctr_percent = round(float(raw_ctr) * 100, 2)
+                    else:
+                        ctr_percent = round(((clicks_val / impr_val) * 100), 2) if impr_val > 0 else 0.0
+                    query_page_opportunities.append({
+                        'query': query_key,
+                        'page': page_key,
+                        'clicks': clicks_val,
+                        'impressions': impr_val,
+                        'ctr': ctr_percent,
+                        'position': round(row.get('position', 0) or 0, 2),
+                    })
+        except Exception as e:
+            logger.warning(f"Failed processing query_page_opportunities: {e}")
+
         # Detect Cannibalization (query mapping to multiple pages)
         cannibalization = []
         try:
@@ -382,6 +416,7 @@ def _process_gsc_metrics(self, search_analytics: Dict[str, Any]) -> Dict[str, Any]:
             'total_queries': len(top_queries_source) if top_queries_source else 0,
             'top_queries': top_queries,
             'top_pages': top_pages,
+            'query_page_opportunities': query_page_opportunities,
             'cannibalization': cannibalization
         }
 
@@ -397,6 +432,7 @@ def _process_gsc_metrics(self, search_analytics: Dict[str, Any]) -> Dict[str, Any]:
             'total_queries': 0,
             'top_queries': [],
             'top_pages': [],
+            'query_page_opportunities': [],
             'error': str(e)
         }
 
diff --git a/backend/services/gsc_service.py b/backend/services/gsc_service.py
index 95dfabde..46f43b2c 100644
--- a/backend/services/gsc_service.py
+++ b/backend/services/gsc_service.py
@@ -16,6 +16,9 @@
 from dotenv import load_dotenv
 
+QUERY_PAGE_OPPORTUNITIES_ROW_LIMIT = 2500
+QUERY_PAGE_OPPORTUNITIES_MAX_WINDOW_DAYS = 90
+
 
 class GSCService:
     """Service for Google Search Console integration."""
 
@@ -514,15 +517,18 @@
         page_rows = []
         page_row_count = 0
 
-        # Step 5: Get query+page combined data for mapping queries to pages
+        # Step 5: Get query+page combined data for mapping queries to pages.
+        # Keep this request bounded because query-page combinations can grow quickly
+        # for larger date windows/sites.
         qp_rows = []
         qp_row_count = 0
         try:
+            qp_start_date, qp_end_date = self._get_query_page_opportunity_window(start_date, end_date)
             qp_request = {
-                'startDate': start_date,
-                'endDate': end_date,
+                'startDate': qp_start_date,
+                'endDate': qp_end_date,
                 'dimensions': ['query', 'page'],
-                'rowLimit': 1000
+                'rowLimit': QUERY_PAGE_OPPORTUNITIES_ROW_LIMIT
             }
             logger.info(f"GSC Query+Page request for user {user_id}: {qp_request}")
             qp_response = service.searchanalytics().query(
@@ -553,7 +559,12 @@
             },
             'query_page_data': {
                 'rows': qp_rows,
-                'rowCount': qp_row_count
+                'rowCount': qp_row_count,
+                'requested_window': {
+                    'startDate': qp_start_date,
+                    'endDate': qp_end_date,
+                    'rowLimit': QUERY_PAGE_OPPORTUNITIES_ROW_LIMIT,
+                },
             },
             'verification_data': {
                 'rows': verification_rows,
@@ -596,6 +607,20 @@
         except Exception as e:
             logger.error(f"Error getting search analytics for user {user_id}: {e}")
             raise
+
+    def _get_query_page_opportunity_window(self, start_date: str, end_date: str) -> tuple[str, str]:
+        """Build a bounded query-page window to prevent oversized opportunity payloads."""
+        try:
+            parsed_end = datetime.strptime(end_date, '%Y-%m-%d')
+            parsed_start = datetime.strptime(start_date, '%Y-%m-%d')
+        except Exception:
+            parsed_end = datetime.now()
+            parsed_start = parsed_end - timedelta(days=30)
+
+        max_window_start = parsed_end - timedelta(days=QUERY_PAGE_OPPORTUNITIES_MAX_WINDOW_DAYS - 1)
+        bounded_start = max(parsed_start, max_window_start)
+
+        return bounded_start.strftime('%Y-%m-%d'), parsed_end.strftime('%Y-%m-%d')
 
     def get_sitemaps(self, user_id: str, site_url: str) -> List[Dict[str, Any]]:
         """Get sitemaps from GSC."""
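---

The two pieces of arithmetic this patch introduces are compact enough to verify in isolation. Below is a minimal, runnable sketch: bounded_window and ctr_percent are local stand-in names mirroring _get_query_page_opportunity_window and the CTR handling in _process_gsc_metrics, not the modules' actual API.

# sketch.py -- standalone; stand-in names for the patched logic above.
from datetime import datetime, timedelta

QUERY_PAGE_OPPORTUNITIES_MAX_WINDOW_DAYS = 90  # same constant as the patch

def bounded_window(start_date: str, end_date: str) -> tuple[str, str]:
    """Clamp [start_date, end_date] to at most 90 days, keeping the most recent data."""
    try:
        parsed_end = datetime.strptime(end_date, '%Y-%m-%d')
        parsed_start = datetime.strptime(start_date, '%Y-%m-%d')
    except Exception:
        # Unparseable input: fall back to a trailing 30-day window, as the patch does.
        parsed_end = datetime.now()
        parsed_start = parsed_end - timedelta(days=30)
    # "- 1" because both endpoints are included in the GSC date range.
    max_window_start = parsed_end - timedelta(days=QUERY_PAGE_OPPORTUNITIES_MAX_WINDOW_DAYS - 1)
    bounded_start = max(parsed_start, max_window_start)
    return bounded_start.strftime('%Y-%m-%d'), parsed_end.strftime('%Y-%m-%d')

def ctr_percent(row: dict) -> float:
    """CTR as a percentage: prefer GSC's own value, else derive from clicks/impressions."""
    clicks = row.get('clicks', 0) or 0
    impressions = row.get('impressions', 0) or 0
    raw_ctr = row.get('ctr')
    if raw_ctr is not None:
        return round(float(raw_ctr) * 100, 2)  # GSC reports ctr as a fraction (0.0423 -> 4.23%)
    return round(clicks / impressions * 100, 2) if impressions > 0 else 0.0

# A 180-day request is clamped to the trailing 90 days (both endpoints inclusive):
print(bounded_window('2024-01-01', '2024-06-28'))  # ('2024-03-31', '2024-06-28')
print(ctr_percent({'ctr': 0.0423, 'clicks': 20, 'impressions': 473}))  # 4.23
print(ctr_percent({'clicks': 12, 'impressions': 480}))                 # 2.5 (fallback path)

The "- 1" in max_window_start exists because GSC date ranges include both endpoints, so a 90-day cap spans end_date minus 89 days through end_date. Likewise, GSC reports ctr as a fraction, so the handler multiplies by 100 and only derives clicks/impressions when ctr is absent.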