diff --git a/.gitignore b/.gitignore index 2ec71d4eaf..8f62c31378 100644 --- a/.gitignore +++ b/.gitignore @@ -2,11 +2,12 @@ .venv .vscode/ +.idea/ # dependencies /node_modules vendor/ src/public/ - +/media/ celery-worker.state # testing diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 47b822bbff..0000000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "python.linting.mypyEnabled": true, - "python.linting.enabled": true, - "python.formatting.provider": "true" -} diff --git a/Dockerfile b/Dockerfile index ab04550773..8f3a30d688 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,49 +1,50 @@ -FROM python:3.8-slim-buster as base +FROM python:3.8-bullseye AS base LABEL maintainer="Deep Dev dev@thedeep.io" - ENV PYTHONUNBUFFERED 1 WORKDIR /code +# Copy dependency files COPY pyproject.toml poetry.lock /code/ +# Install required system dependencies RUN apt-get update -y \ && apt-get install -y --no-install-recommends \ - # Basic Packages iproute2 git vim \ - # Build required packages gcc libc-dev libproj-dev \ - # NOTE: procps: For pkill command - procps \ - # Deep Required Packages wait-for-it binutils gdal-bin \ - # Upgrade pip and install python packages for code + libcairo2 \ + libpango1.0-dev \ + libpangocairo-1.0-0 \ + fonts-dejavu-core \ + fonts-liberation \ && pip install --upgrade --no-cache-dir pip poetry \ - && poetry --version \ - # Configure to use system instead of virtualenvs && poetry config virtualenvs.create false \ && poetry install --no-root \ - # Clean-up - && pip uninstall -y poetry virtualenv-clone virtualenv \ && apt-get remove -y gcc libc-dev libproj-dev \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* +# Verify installation +RUN pip install weasyprint==53.0 # -------------------------- WEB --------------------------------------- FROM base AS web +# Copy all project files COPY . /code/ # -------------------------- WORKER --------------------------------------- FROM base AS worker +# Additional worker-specific tools RUN apt-get update -y \ && apt-get install -y --no-install-recommends \ libreoffice \ && apt-get autoremove -y \ && rm -rf /var/lib/apt/lists/* +# Copy all project files COPY . 
/code/ diff --git a/apps/deepl_integration/handlers.py b/apps/deepl_integration/handlers.py index 43771d6a5b..0360f861c6 100644 --- a/apps/deepl_integration/handlers.py +++ b/apps/deepl_integration/handlers.py @@ -581,6 +581,7 @@ def send_trigger_request_to_extractor( 'request_type': NlpRequestType.USER if high_priority else NlpRequestType.SYSTEM, } response_content = None + print('the extraction endpoint is ', DeeplServiceEndpoint.DOCS_EXTRACTOR_ENDPOINT) try: response = requests.post( DeeplServiceEndpoint.DOCS_EXTRACTOR_ENDPOINT, @@ -610,6 +611,7 @@ def trigger_lead_extract(cls, lead, task_instance=None): return True # Get the lead to be extracted url_to_extract = None + print('is lead url', lead.url, "is lead attachment", lead.attachment) if lead.attachment: url_to_extract = generate_file_url_for_legacy_deepl_server(lead.attachment) elif lead.url: diff --git a/apps/organization/views.py b/apps/organization/views.py index 6b0f34919a..676bdcc94a 100644 --- a/apps/organization/views.py +++ b/apps/organization/views.py @@ -44,3 +44,5 @@ def get_queryset(self): if self.kwargs.get('pk'): return Organization.objects.prefetch_related('parent') return Organization.objects.filter(parent=None) + + diff --git a/apps/static/image/graphQL-logo.svg b/apps/static/image/graphQL-logo.svg index 8e353ddbaa..58b2cc2877 100644 --- a/apps/static/image/graphQL-logo.svg +++ b/apps/static/image/graphQL-logo.svg @@ -1,71 +1,71 @@ - - - - + + + + diff --git a/apps/templates/connector/pdf.html b/apps/templates/connector/pdf.html new file mode 100644 index 0000000000..31f0711af0 --- /dev/null +++ b/apps/templates/connector/pdf.html @@ -0,0 +1,79 @@ + + + + + Data PDF + + + +

+ Data Report
+ {% load filter %}
+ {% for row in rows %}
+ Entry {{ forloop.counter }}
+ {% endfor %}
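For orientation, a minimal sketch (not part of the patch itself) of how this template is consumed downstream, mirroring the render_to_string + WeasyPrint flow added in apps/unified_connector/sources/kobo.py later in this diff; the column/row values in the usage comment are placeholders only:

# Minimal sketch, assuming Django settings/templates are configured and weasyprint==53.0 is installed.
from io import BytesIO

from django.template.loader import render_to_string
from weasyprint import HTML


def render_connector_pdf(columns, rows):
    # Context keys mirror the ones built in Kobo.fetch() in kobo.py
    html_string = render_to_string('connector/pdf.html', {'columns': columns, 'rows': rows})
    return BytesIO(HTML(string=html_string).write_pdf())


# Illustrative usage with placeholder data:
# pdf_stream = render_connector_pdf(['question_1'], [['free-text answer']])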
+ + diff --git a/apps/unified_connector/migrations/0011_alter_connectorsource_source.py b/apps/unified_connector/migrations/0011_alter_connectorsource_source.py new file mode 100644 index 0000000000..309dc7f6be --- /dev/null +++ b/apps/unified_connector/migrations/0011_alter_connectorsource_source.py @@ -0,0 +1,18 @@ +# Generated by Django 3.2.25 on 2024-10-16 12:53 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('unified_connector', '0010_auto_20240625_0806'), + ] + + operations = [ + migrations.AlterField( + model_name='connectorsource', + name='source', + field=models.CharField(choices=[('atom-feed', 'Atom Feed'), ('relief-web', 'Relifweb'), ('rss-feed', 'RSS Feed'), ('unhcr-portal', 'UNHCR Portal'), ('humanitarian-resp', 'Humanitarian Response'), ('pdna', 'Post Disaster Needs Assessments'), ('emm', 'European Media Monitor'), ('kobo', 'KoboToolbox')], max_length=20), + ), + ] diff --git a/apps/unified_connector/models.py b/apps/unified_connector/models.py index 0f0485d6c1..953065b23c 100644 --- a/apps/unified_connector/models.py +++ b/apps/unified_connector/models.py @@ -15,6 +15,7 @@ humanitarian_response, pdna, emm, + kobo, ) @@ -120,6 +121,7 @@ class Source(models.TextChoices): HUMANITARIAN_RESP = 'humanitarian-resp', 'Humanitarian Response' PDNA = 'pdna', 'Post Disaster Needs Assessments' EMM = 'emm', 'European Media Monitor' + KOBO = 'kobo', 'KoboToolbox' class Status(models.IntegerChoices): PENDING = 0, 'Pending' @@ -135,6 +137,7 @@ class Status(models.IntegerChoices): Source.HUMANITARIAN_RESP: humanitarian_response.HumanitarianResponse, Source.PDNA: pdna.PDNA, Source.EMM: emm.EMM, + Source.KOBO: kobo.Kobo, } title = models.CharField(max_length=255) @@ -228,3 +231,4 @@ def update_aleady_added_using_lead(cls, lead, added=True): connector_lead=lead.connector_lead, source__unified_connector__project=lead.project, ).update(already_added=added) + diff --git a/apps/unified_connector/mutation.py b/apps/unified_connector/mutation.py index 54c8bc070e..965e5b0061 100644 --- a/apps/unified_connector/mutation.py +++ b/apps/unified_connector/mutation.py @@ -1,4 +1,5 @@ import graphene +import requests from utils.graphene.mutation import ( generate_input_type_for_serializer, @@ -36,6 +37,51 @@ serializer_class=ConnectorSourceLeadGqSerializer, ) +class KoboValPsGrapheneMutation(PsGrapheneMutation): + @classmethod + def perform_mutate(cls, root, info, **kwargs): + from graphql import GraphQLError + data = kwargs['data'] + print('data is ', data) + if not cls.validate_kobo(data): + raise GraphQLError("Invalid Kobo data: 'project_id' and 'token' combination did not retrieve any valid data") + instance, errors = cls._save_item(data, info, **kwargs) + return cls(result=instance, errors=errors, ok=not errors) + + @classmethod + def validate_kobo(cls, data): + #TODO validate all sources + sources = data.get('sources', []) + source = sources[0] if sources else {} + if source and source.get('title') != 'KoboToolbox': + return True + + params = source.get("params", {}) + project_id = params.get('project_id') + token = params.get('token') + + if not project_id or not token: + return False + + # Validate Kobo API fetch + return cls.valid_kobo_fetch(project_id, token) + + @classmethod + def valid_kobo_fetch(cls, project_id, token): + URL = 'https://kf.kobotoolbox.org/api/v2/assets/' + api_url = f"{URL}{project_id}/data/?format=json" + headers = {"Authorization": f"Token {token}"} + + try: + response = requests.get(api_url, headers=headers, 
stream=True) + if response.status_code == 200: + return True + else: + # logger.error("Failed to fetch data from API, Status code: %d", response.status_code) + return False + except requests.RequestException as e: + # logger.critical("A critical error occurred while fetching data: %s", e) + return False class UnifiedConnectorMixin(): @classmethod @@ -43,7 +89,7 @@ def filter_queryset(cls, qs, info): return qs.filter(project=info.context.active_project) -class CreateUnifiedConnector(UnifiedConnectorMixin, PsGrapheneMutation): +class CreateUnifiedConnector(UnifiedConnectorMixin, KoboValPsGrapheneMutation): class Arguments: data = UnifiedConnectorWithSourceInputType(required=True) model = UnifiedConnector @@ -52,7 +98,8 @@ class Arguments: permissions = [PP.Permission.CREATE_UNIFIED_CONNECTOR] -class UpdateUnifiedConnector(UnifiedConnectorMixin, PsGrapheneMutation): + +class UpdateUnifiedConnector(UnifiedConnectorMixin, KoboValPsGrapheneMutation): class Arguments: id = graphene.ID(required=True) data = UnifiedConnectorInputType(required=True) @@ -62,7 +109,9 @@ class Arguments: permissions = [PP.Permission.UPDATE_UNIFIED_CONNECTOR] -class UpdateUnifiedConnectorWithSource(UnifiedConnectorMixin, PsGrapheneMutation): + + +class UpdateUnifiedConnectorWithSource(UnifiedConnectorMixin, KoboValPsGrapheneMutation): class Arguments: id = graphene.ID(required=True) data = UnifiedConnectorWithSourceInputType(required=True) diff --git a/apps/unified_connector/schema.py b/apps/unified_connector/schema.py index 3d281b7719..01132986c7 100644 --- a/apps/unified_connector/schema.py +++ b/apps/unified_connector/schema.py @@ -32,6 +32,7 @@ def get_unified_connector_qs(info): + qs = UnifiedConnector.objects.filter(project=info.context.active_project) if PP.check_permission(info, PP.Permission.VIEW_UNIFIED_CONNECTOR): return qs @@ -71,12 +72,21 @@ class Meta: 'authors', ) + @staticmethod + def resolve_url(root, info, **_): + print('the root title is', root.url.split('amazonaws.com/')[-1]) + pdf_url = root.url.split('amazonaws.com/')[-1] + csv_url = pdf_url.replace('pdf', 'csv') + return {"pdf": get_presigned_url(pdf_url), "csv":get_presigned_url(csv_url)} if root.source.title=="KoboToolbox" else root.url + @staticmethod def resolve_source(root, info, **_): + return root.source_id and info.context.dl.unified_connector.connector_lead_source.load(root.source_id) @staticmethod def resolve_authors(root, info, **_): + return info.context.dl.unified_connector.connector_lead_authors.load(root.pk) @@ -92,6 +102,7 @@ class Meta: 'already_added', ) + @staticmethod def get_custom_queryset(queryset, info, **_): return get_connector_source_lead_qs(info) @@ -107,6 +118,7 @@ class Meta: filterset_class = ConnectorSourceLeadGQFilterSet + class ConnectorSourceStatsType(graphene.ObjectType): date = graphene.Date(required=True) count = graphene.Int(required=True) @@ -143,14 +155,17 @@ class Meta: @staticmethod def get_custom_queryset(queryset, info, **_): + return get_connector_source_qs(info) @staticmethod def resolve_stats(root, info, **_): + return (root.stats or {}).get('published_dates') or [] @staticmethod def resolve_leads_count(root, info, **_): + return info.context.dl.unified_connector.connector_source_leads_count.load(root.pk) @@ -240,7 +255,8 @@ def resolve_connector_sources(root, info, **kwargs) -> QuerySet: @staticmethod def resolve_connector_source_leads(root, info, **kwargs) -> QuerySet: - return get_connector_source_lead_qs(info) + qs = get_connector_source_lead_qs(info) + return qs class 
RssFieldType(graphene.ObjectType): @@ -264,3 +280,20 @@ def resolve_rss_fields(root, info, url): @staticmethod def resolve_atom_feed_fields(root, info, url): return AtomFeed().query_fields({"feed-url": url}) + +def get_presigned_url(object_key, expiration=3600): + import boto3 + from botocore.exceptions import NoCredentialsError, PartialCredentialsError + from deep import settings + s3_client = boto3.client('s3', region_name=settings.AWS_S3_REGION_NAME, + aws_access_key_id=settings.AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, ) + try: + return s3_client.generate_presigned_url( + 'get_object', + Params={'Bucket': settings.AWS_STORAGE_BUCKET_NAME, 'Key': object_key}, + ExpiresIn=expiration + ) + except (NoCredentialsError, PartialCredentialsError) as e: + print(f"Error generating presigned URL: {e}") + return None diff --git a/apps/unified_connector/serializers.py b/apps/unified_connector/serializers.py index 20b99a6c8c..e79e7aa83f 100644 --- a/apps/unified_connector/serializers.py +++ b/apps/unified_connector/serializers.py @@ -35,6 +35,8 @@ class Meta: ) + + class UnifiedConnectorGqSerializer(ProjectPropertySerializerMixin, TempClientIdMixin, UserResourceSerializer): class Meta: model = UnifiedConnector @@ -56,6 +58,8 @@ def create(self, data): return instance + + class UnifiedConnectorWithSourceGqSerializer(UnifiedConnectorGqSerializer): sources = ConnectorSourceGqSerializer(required=False, many=True) @@ -71,11 +75,10 @@ class Meta: def _get_prefetch_related_instances_qs(self, qs): if self.instance: return qs.filter(unified_connector=self.instance) - return qs.none() # On create throw error if existing id is provided + return qs.none() def validate_sources(self, sources): source_found = set() - # Only allow unique source per unified connectors for source in sources: source_type = source['source'] if source_type in source_found: @@ -84,6 +87,7 @@ def validate_sources(self, sources): return sources + class ConnectorSourceLeadGqSerializer(serializers.ModelSerializer): class Meta: model = ConnectorSourceLead diff --git a/apps/unified_connector/sources/emm.py b/apps/unified_connector/sources/emm.py index 7c3ba952ed..369c904344 100644 --- a/apps/unified_connector/sources/emm.py +++ b/apps/unified_connector/sources/emm.py @@ -22,7 +22,6 @@ class EMM(RssFeed): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - # Sets up conf self.has_emm_entities = False self.has_emm_triggers = False self.initialize() @@ -92,6 +91,7 @@ def query_fields(self, params): return real_fields def get_content(self, url, params): + resp = requests.get(url) return resp.content diff --git a/apps/unified_connector/sources/kobo.py b/apps/unified_connector/sources/kobo.py new file mode 100644 index 0000000000..963241380a --- /dev/null +++ b/apps/unified_connector/sources/kobo.py @@ -0,0 +1,367 @@ +import logging + +import uuid, csv, io +import time + +from botocore.exceptions import BotoCoreError, ClientError +from django.conf import settings +from rest_framework.exceptions import ValidationError +import requests +import datetime +from connector.utils import ConnectorWrapper +from lead.models import Lead +from unified_connector.sources.base import Source + +from deep import settings +from io import BytesIO +from django.template.loader import render_to_string +from weasyprint import HTML + +import hashlib +from django.core.files.base import ContentFile +import os +from boto3.session import Session +from botocore.exceptions import NoCredentialsError + + +logger = 
logging.getLogger(__name__) + + +@ConnectorWrapper +class Kobo(Source): + + URL = 'https://kf.kobotoolbox.org/api/v2/assets/' + title = 'KoboToolbox Reports' + key = 'kobo-toolbox' + + options = [ + { + 'key': 'project_id', + 'field_type': 'text', + 'title': 'Project ID', + }, + { + 'key': 'token', + 'field_type': 'text', + 'title': 'Kobo API Token', + } + ] + + def get_content(self, project_id, token): + api_url = f"{self.URL}{project_id}/data/?format=json" + headers = {"Authorization": f"Token {token}"} + + try: + with requests.get(api_url, headers=headers, stream=True) as response: + if response.status_code == 200: + return response.json().get('results', []) + else: + logger.error("Failed to fetch data from API, Status code: %d", response.status_code) + except requests.RequestException as e: + logger.critical("A critical error occurred while fetching data: %s", e) + return [] + + def fetch(self, params): + logger.info(f'fetching for kobo commenced with params {params}') + result = [] + project_id = params.get('project_id') + if not project_id: + return [], 0 + + token = params.get('token') + if not token: + return [], 0 + + + try: + records = self.get_content(project_id, token) + if records: + + qualitative_columns, rows = accumulate_columns_and_rows(records) + context = { + 'columns': qualitative_columns, + 'rows': rows, + } + + html_string = render_to_string('connector/pdf.html', context) + + html = HTML(string=html_string) + pdf_file = html.write_pdf() + + pdf_stream = BytesIO(pdf_file) + + file_path = save_file_remote(project_id, context, pdf_file=pdf_stream) + print(f'the media url is {settings.MEDIA_URL} and the media files location is {settings.MEDIAFILES_LOCATION}') + file_url = os.path.join(settings.MEDIA_URL, file_path) + + date = datetime.now() + result = [{ + 'title': project_id, + 'url': file_url, + 'source': 'KoboToolbox', + 'author': 'KoboToolbox', + 'published_on': date.date(), + 'source_type': Lead.SourceType.WEBSITE} + ] + + logger.info(f'the resulted data of kobo is: {result}') + return result, len(result) + except Exception as e: + logger.error("An error occurred: %s", e) + return [], 0 + + + +def calculate_md5(file_content): + """Calculate the MD5 checksum of a file-like object.""" + hash_md5 = hashlib.md5() + for chunk in iter(lambda: file_content.read(4096), b""): + hash_md5.update(chunk) + file_content.seek(0) # Reset file pointer + return hash_md5.hexdigest() + +def verify_checksum_s3(bucket_name, object_key, local_checksum, s3_client): + """Verify the checksum of a file in S3.""" + try: + response = s3_client.head_object(Bucket=bucket_name, Key=object_key) + s3_etag = response['ETag'].strip('"') # Remove quotes from ETag + return s3_etag == local_checksum + except NoCredentialsError: + raise Exception("AWS credentials not found.") + except Exception as e: + raise Exception(f"Error verifying checksum: {e}") + + +def upload_to_s3_with_retry(bucket_name, object_key, file_content, local_checksum, max_retries=10, + retry_delay=1): + """ + Upload a file to S3 with retry mechanism as a normal function. + + Args: + bucket_name (str): S3 bucket name. + object_key (str): S3 object key. + encoded_pdf_content (str): Base64-encoded file content. + local_checksum (str): MD5 checksum of the file. + max_retries (int): Maximum number of retries. + retry_delay (int): Delay (in seconds) between retries. + + Raises: + Exception: If all retries fail. 
+ """ + session = Session( + aws_access_key_id=settings.AWS_ACCESS_KEY_ID, + aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY, + region_name=settings.AWS_S3_REGION_NAME, + ) + s3_client = session.client('s3') + + for attempt in range(max_retries): + try: + s3_client.put_object(Bucket=bucket_name, Key=object_key, Body=file_content) + logger.info(f"File {object_key} uploaded successfully to bucket {bucket_name}. and is going to be verified") + + # Verify checksum + if not verify_checksum_s3(bucket_name, object_key, local_checksum, s3_client): + message = 'Checksum validation error' + logger.warning(f'{message} retrying... Attempt {attempt + 1} of {max_retries}') + raise ValidationError(message) # Raise to trigger retry + logger.info('checksum validation successful') + return True + except (BotoCoreError, ClientError, ValidationError) as exc: + logger.error(f"Attempt {attempt + 1} failed: {exc}") + if attempt < max_retries - 1: + time.sleep(retry_delay) + else: + logger.error("All retry attempts failed.") + raise + + +def save_file_remote(project_id, context, pdf_file): + timestamp = datetime.now().strftime('%Y%m%dT%H%M%S') + directory_path = os.path.join(str(project_id), str(timestamp)) + os.makedirs(directory_path, exist_ok=True) + file_id = uuid.uuid4() + bucket_name = settings.AWS_STORAGE_BUCKET_NAME + + pdf_content = pdf_file.getvalue() + def compose_file_path(file_type): + file_path = os.path.join(file_type, directory_path, f"{file_id}.{file_type}") + remote_file_path = os.path.join(settings.MEDIAFILES_LOCATION, file_path) + return remote_file_path, file_path + + pdf_remote_path, pdf_path = compose_file_path('pdf') + csv_remote_path, csv_path = compose_file_path('csv') + + def generate_csv_data(context): + csv_buffer = io.StringIO() + writer = csv.writer(csv_buffer) + writer.writerow(context['columns']) + for row in context['rows']: + writer.writerow(row) + csv_content = csv_buffer.getvalue().encode('utf-8') + csv_file = ContentFile(csv_content) + return csv_content, csv_file + + csv_content, csv_file = generate_csv_data(context) + + def remote_save_routine(file, file_content, remote_file_path): + file_local_checksum = calculate_md5(file) + upload_to_s3_with_retry(bucket_name, remote_file_path, file_content, file_local_checksum) + + remote_save_routine(pdf_file, pdf_content, pdf_remote_path) + remote_save_routine(csv_file, csv_content, csv_remote_path) + return pdf_path + + + +# def pdf_save_path_and_url(project_id, context, pdf_file): +# project_id = project_id +# timestamp = datetime.now().strftime('%Y%m%dT%H%M%S') +# import uuid, csv +# directory_path = os.path.join( +# str(project_id), +# str(timestamp), +# ) +# pdf_directory_path = os.path.join("pdf", directory_path) +# os.makedirs(directory_path, exist_ok=True) +# file_id = uuid.uuid4() +# pdf_file_path = os.path.join(pdf_directory_path, f"{file_id}.pdf") +# def save_pdf(): +# +# file_path = os.path.join(settings.MEDIAFILES_LOCATION, pdf_directory_path, f"{file_id}.pdf") +# default_storage.save(file_path, ContentFile(pdf_file.getvalue())) +# save_pdf() +# +# csv_directory_path = os.path.join(settings.MEDIAFILES_LOCATION, "csv", directory_path) +# csv_file_path = os.path.join(csv_directory_path, f"{file_id}.csv") +# +# def save_csv(): +# import io +# csv_buffer = io.StringIO() +# +# writer = csv.writer(csv_buffer) +# writer.writerow(context['columns']) +# for row in context['rows']: +# writer.writerow(row) +# +# csv_content = ContentFile(csv_buffer.getvalue().encode('utf-8')) +# default_storage.save(csv_file_path, csv_content) 
+# +# save_csv() +# +# return pdf_file_path +# + +def accumulate_columns_and_rows(records): + """Accumulate all columns from the records and filter qualitative columns.""" + all_columns_set = set() + rows = [] + + # Accumulate all unique columns across all records + for record in records: + all_columns_set.update(record.keys()) + + all_columns = sorted(all_columns_set) + + # Filter qualitative columns based on values across all records + qualitative_columns = [] + for col in all_columns: + if all(is_qualitative(col, record.get(col, "N/A")) for record in records): + qualitative_columns.append(col) + + # Build rows with qualitative data + for record in records: + row = [record.get(column, "N/A") for column in qualitative_columns] + rows.append(row) + + return qualitative_columns, rows + + +import re +from datetime import datetime + +BOOLEAN_TRUE_VALUES = {'true', 'yes', '1', 'on'} +BOOLEAN_FALSE_VALUES = {'false', 'no', '0', 'off'} + + +def is_uuid(value): + """Check if a string is a valid UUID.""" + uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$', re.IGNORECASE) + return bool(uuid_pattern.match(value)) + + +def is_id_field(key, value): + """Check if a field is likely to be an ID field.""" + if isinstance(key, str): + # Check if the key contains 'id' or 'uuid' + if 'id' in key.lower() or 'uuid' in key.lower(): + return True + + # Check if the value is a UUID + if isinstance(value, str) and is_uuid(value): + return True + + # Check if it's a numeric ID + if isinstance(value, (int, str)): + try: + int(value) + return len(str(value)) > 5 # Assume IDs are typically longer than 5 digits + except ValueError: + pass + + return False + + +def is_date(value): + """Check if a string is a valid date.""" + try: + datetime.fromisoformat(value.replace('Z', '+00:00')) + return True + except (ValueError, AttributeError): + return False + + +def is_boolean(value): + """Check if the value represents a boolean.""" + if isinstance(value, bool): + return True # Already a boolean + + if isinstance(value, str): + normalized_value = value.strip().lower() + if normalized_value in BOOLEAN_TRUE_VALUES or normalized_value in BOOLEAN_FALSE_VALUES: + return True + + return False + + +def is_qualitative(key, value): + """ + Helper function to determine if a value is qualitative based on its key, type, and content. 
+ """ + + # Check if it's an ID field + if isinstance(value, (dict, list)): + return True + + if is_id_field(key, value): + return False + + # Check if it's a boolean + if is_boolean(value): + return False + + if isinstance(value, str): + # Check if it's a number or date disguised as a string + try: + float(value) + return False # It's a number + except ValueError: + if is_date(value): + return False # It's a date + return True # It's a regular string, consider it qualitative + + if isinstance(value, (int, float)): + return False # Numbers are quantitative + + # Consider everything else as qualitative + return True \ No newline at end of file diff --git a/apps/unified_connector/sources/pdna.py b/apps/unified_connector/sources/pdna.py index 8a7a7255ee..89bb21456f 100644 --- a/apps/unified_connector/sources/pdna.py +++ b/apps/unified_connector/sources/pdna.py @@ -89,6 +89,7 @@ def get_content(self, url, params): return resp.text def fetch(self, params): + print('ffffffffffffffffff', params) country = params.get('country') if not country: return [], 0 @@ -119,6 +120,7 @@ def fetch(self, params): 'source_type': Lead.SourceType.WEBSITE, } results.append(data) + except Exception as e: logger.warning( "Exception parsing {} with params {}: {}".format( diff --git a/apps/unified_connector/sources/relief_web.py b/apps/unified_connector/sources/relief_web.py index 83a0292c14..5f1df8b5fd 100644 --- a/apps/unified_connector/sources/relief_web.py +++ b/apps/unified_connector/sources/relief_web.py @@ -317,7 +317,7 @@ def get_content(self, url, params): def parse_filter_params(self, params): filters = [] - + print('aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa') if params.get('country'): filters.append({'field': 'country.iso3', 'value': params['country']}) if params.get('primary-country'): diff --git a/apps/unified_connector/sources/unhcr_portal.py b/apps/unified_connector/sources/unhcr_portal.py index 211f3b1c8b..59e4aae1a2 100644 --- a/apps/unified_connector/sources/unhcr_portal.py +++ b/apps/unified_connector/sources/unhcr_portal.py @@ -1,5 +1,7 @@ import json import copy +import logging + import requests import datetime @@ -9,7 +11,7 @@ from connector.utils import ConnectorWrapper from .base import Source - +logger = logging.Logger(__name__) COUNTRIES_OPTIONS = [ {"label": "All", "key": ""}, @@ -314,6 +316,7 @@ def fetch(self, params): 'source_type': '', } results.append(data) + logger.info(f'the resulted data of unhcr is: {results}') footer = soup.find('div', {'class': 'pgSearch_results_footer'}) if not footer: break diff --git a/apps/unified_connector/templatetags/__init__.py b/apps/unified_connector/templatetags/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/apps/unified_connector/templatetags/filter.py b/apps/unified_connector/templatetags/filter.py new file mode 100644 index 0000000000..c6f1a6f057 --- /dev/null +++ b/apps/unified_connector/templatetags/filter.py @@ -0,0 +1,27 @@ +from django import template + +register = template.Library() + +@register.filter +def get_value_from_dict(dictionary, key): + """Returns the value from the dictionary corresponding to the given key.""" + return dictionary.get(key, '') + +from django import template + +register = template.Library() + +@register.filter +def get_item(list, key): + """Return the value for a given key in a dictionary.""" + return list.get(key, '') # + + +from django import template + +register = template.Library() + +@register.filter +def zipl(list1, list2): + """Return zipped lists as a list of tuples.""" + return zip(list1, 
list2) diff --git a/changes.md b/changes.md new file mode 100644 index 0000000000..e3a3ec58ea --- /dev/null +++ b/changes.md @@ -0,0 +1,105 @@ +# Changes Report + +This report lists the files that have changed between the local HEAD and the remote branch `origin/develop`. + +## Files Changed +| File Path | Change Type | +|-----------------|---------------| +| .gitignore | Modified | +| .vscode/settings.json | Added | +| Dockerfile | Modified | +| PDFS/1/aBjuSQpEPKeu45Mn7hQVgX/20241116T201608/3195cf72-f1dc-4197-b2ca-7d8ef51e8c25.pdf | Deleted | +| apps/analysis/models.py | Modified | +| apps/analysis/mutation.py | Modified | +| apps/analysis/schema.py | Modified | +| apps/analysis/serializers.py | Modified | +| apps/analysis/tasks.py | Modified | +| apps/analysis/tests/test_mutations.py | Modified | +| apps/analysis_framework/serializers.py | Modified | +| apps/assisted_tagging/admin.py | Modified | +| apps/assisted_tagging/dataloaders.py | Modified | +| apps/assisted_tagging/migrations/0013_llmassistedtaggingpredication.py | Added | +| apps/assisted_tagging/models.py | Modified | +| apps/assisted_tagging/schema.py | Modified | +| apps/assisted_tagging/serializers.py | Modified | +| apps/assisted_tagging/tasks.py | Modified | +| apps/assisted_tagging/tests/test_query.py | Modified | +| apps/deepl_integration/handlers.py | Modified | +| apps/deepl_integration/serializers.py | Modified | +| apps/deepl_integration/views.py | Modified | +| apps/entry/dataloaders.py | Modified | +| apps/export/entries/json_exporter.py | Modified | +| apps/export/tasks/tasks_entries.py | Modified | +| apps/geo/enums.py | Modified | +| apps/geo/filter_set.py | Modified | +| apps/geo/migrations/0044_region_status.py | Added | +| apps/geo/models.py | Modified | +| apps/geo/mutations.py | Modified | +| apps/geo/schema.py | Modified | +| apps/geo/serializers.py | Modified | +| apps/geo/tasks.py | Modified | +| apps/lead/filter_set.py | Modified | +| apps/organization/views.py | Modified | +| apps/project/admin.py | Modified | +| apps/project/mutation.py | Modified | +| apps/project/serializers.py | Modified | +| apps/static/image/graphQL-logo.svg | Modified | +| apps/templates/connector/pdf.html | Deleted | +| apps/unified_connector/migrations/0011_alter_connectorsource_source.py | Deleted | +| apps/unified_connector/models.py | Modified | +| apps/unified_connector/mutation.py | Modified | +| apps/unified_connector/schema.py | Modified | +| apps/unified_connector/serializers.py | Modified | +| apps/unified_connector/sources/emm.py | Modified | +| apps/unified_connector/sources/kobo.py | Deleted | +| apps/unified_connector/sources/pdna.py | Modified | +| apps/unified_connector/sources/relief_web.py | Modified | +| apps/unified_connector/sources/unhcr_portal.py | Modified | +| apps/unified_connector/templatetags/__init__.py | Deleted | +| apps/unified_connector/templatetags/filter.py | Deleted | +| csv/aBjuSQpEPKeu45Mn7hQVgX/20241116T213712/d9f2f7b7-d516-4a27-8e8d-a72124f4716b.csv | Deleted | +| csv/aBjuSQpEPKeu45Mn7hQVgX/20241116T213739/cabd68e3-1892-4f7e-a29f-d219a1d718b1.csv | Deleted | +| csv/aBjuSQpEPKeu45Mn7hQVgX/20241116T214255/746fdba5-1a64-476d-ad1a-be87ccf34193.csv | Deleted | +| deep/deepl.py | Modified | +| deep/exception_handler.py | Modified | +| deep/settings.py | Modified | +| deep/tests/test_case.py | Modified | +| deep/urls.py | Modified | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T215942/4d345645-43f5-4a56-a05f-5bb0ddde923f.csv | Deleted | +| 
documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T220607/9bee8649-a5df-4b61-bed9-1e3234084669.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T220607/b816cc3b-6ca8-4132-af15-0f582203f35d.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T220610/ab206fdf-7952-470f-a043-474720133e1d.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T220610/ce8fd664-fd63-48b3-97b4-3e081305632f.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T220613/46f7a509-5c8f-4506-9339-da2deea919d3.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T220613/699246ab-410d-4794-a208-6b278566307d.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T220615/b300c23b-2f87-4406-a6ff-a727bfa68366.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T220618/2a49ddae-5a77-4436-b2a5-e18906d267b9.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T220620/7e9b8ed9-339c-4bd3-a241-ffdd509b4522.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T233046/5420ef95-087c-4950-a2a3-507c2c8bc99a.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T233046/dc705d72-3f13-4915-bcb4-0eee3332cd71.csv | Deleted | +| documents/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T233300/25b6d3c8-1d85-421c-892f-8e9d8b6a67e2.csv | Deleted | +| documents/csv/afxi6P5vmSMxRn8APv9nTd/20241125T220601/195b5b2d-b964-4517-9823-6c1a531454c8.csv | Deleted | +| documents/csv/afxi6P5vmSMxRn8APv9nTd/20241125T220601/b11a6263-2c8a-40b2-8a11-e050e0ebbb73.csv | Deleted | +| documents/csv/afxi6P5vmSMxRn8APv9nTd/20241125T220602/1e81ba8b-8d03-48eb-a39e-22425b79c20d.csv | Deleted | +| documents/csv/afxi6P5vmSMxRn8APv9nTd/20241125T220602/a5f9574b-ff0a-4b42-ada2-be6ff31dfaf6.csv | Deleted | +| documents/csv/afxi6P5vmSMxRn8APv9nTd/20241125T220603/ac8c7049-709b-457a-9794-9fe6280274fa.csv | Deleted | +| documents/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T215942/4d345645-43f5-4a56-a05f-5bb0ddde923f.pdf | Deleted | +| documents/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T220607/9bee8649-a5df-4b61-bed9-1e3234084669.pdf | Deleted | +| documents/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T220607/b816cc3b-6ca8-4132-af15-0f582203f35d.pdf | Deleted | +| documents/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T220610/ab206fdf-7952-470f-a043-474720133e1d.pdf | Deleted | +| documents/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T220610/ce8fd664-fd63-48b3-97b4-3e081305632f.pdf | Deleted | +| documents/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T220613/46f7a509-5c8f-4506-9339-da2deea919d3.pdf | Deleted | +| documents/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T220613/699246ab-410d-4794-a208-6b278566307d.pdf | Deleted | +| documents/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T220615/b300c23b-2f87-4406-a6ff-a727bfa68366.pdf | Deleted | +| documents/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T220618/2a49ddae-5a77-4436-b2a5-e18906d267b9.pdf | Deleted | +| documents/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T220620/7e9b8ed9-339c-4bd3-a241-ffdd509b4522.pdf | Deleted | +| documents/pdf/afxi6P5vmSMxRn8APv9nTd/20241125T220601/195b5b2d-b964-4517-9823-6c1a531454c8.pdf | Deleted | +| documents/pdf/afxi6P5vmSMxRn8APv9nTd/20241125T220601/b11a6263-2c8a-40b2-8a11-e050e0ebbb73.pdf | Deleted | +| documents/pdf/afxi6P5vmSMxRn8APv9nTd/20241125T220602/1e81ba8b-8d03-48eb-a39e-22425b79c20d.pdf | Deleted | +| documents/pdf/afxi6P5vmSMxRn8APv9nTd/20241125T220602/a5f9574b-ff0a-4b42-ada2-be6ff31dfaf6.pdf | Deleted | +| documents/pdf/afxi6P5vmSMxRn8APv9nTd/20241125T220603/ac8c7049-709b-457a-9794-9fe6280274fa.pdf | Deleted | +| 
documentser/csv/aBjuSQpEPKeu45Mn7hQVgX/20241125T215855/6a674513-b7b0-4cba-b88c-30c18d95adcc.csv | Deleted | +| documentser/pdf/aBjuSQpEPKeu45Mn7hQVgX/20241125T215855/6a674513-b7b0-4cba-b88c-30c18d95adcc.pdf | Deleted | +| poetry.lock | Modified | +| pyproject.toml | Modified | +| schema.graphql | Modified | diff --git a/deep/settings.py b/deep/settings.py index dc24009be4..7f085a9c12 100644 --- a/deep/settings.py +++ b/deep/settings.py @@ -413,6 +413,8 @@ if env('DJANGO_USE_S3'): # AWS S3 Bucket Credentials + AWS_STORAGE_BUCKET_NAME = env('AWS_STORAGE_BUCKET_NAME') + AWS_S3_REGION_NAME = env('AWS_S3_REGION_NAME') AWS_STORAGE_BUCKET_NAME_STATIC = env('AWS_STORAGE_BUCKET_NAME_STATIC') AWS_STORAGE_BUCKET_NAME_MEDIA = env('AWS_STORAGE_BUCKET_NAME_MEDIA') # If environment variable are not provided, then EC2 Role will be used. @@ -434,7 +436,7 @@ AWS_S3_FILE_OVERWRITE = False AWS_DEFAULT_ACL = 'private' AWS_QUERYSTRING_AUTH = True - AWS_S3_CUSTOM_DOMAIN = None + AWS_S3_CUSTOM_DOMAIN = None AWS_QUERYSTRING_EXPIRE = GALLERY_FILE_EXPIRE AWS_S3_SIGNATURE_VERSION = 's3v4' AWS_IS_GZIPPED = True @@ -452,6 +454,7 @@ MEDIAFILES_LOCATION = 'media' MEDIA_URL = "https://%s/%s/" % (AWS_S3_CUSTOM_DOMAIN, MEDIAFILES_LOCATION) DEFAULT_FILE_STORAGE = 'deep.s3_storages.MediaStorage' + else: STATIC_URL = '/static/' STATIC_ROOT = '/static' diff --git a/deep/urls.py b/deep/urls.py index 3f14a5db8b..6bf6d48b8d 100644 --- a/deep/urls.py +++ b/deep/urls.py @@ -17,7 +17,6 @@ from . import converters -# import autofixture from user.views import ( UserViewSet, @@ -644,10 +643,12 @@ def get_api_path(path): ), name="favicon"), ] + static.static( - settings.MEDIA_URL, view=xframe_options_exempt(serve), + settings.MEDIA_URL, + view=xframe_options_exempt(serve), document_root=settings.MEDIA_ROOT ) + if settings.DEBUG: import debug_toolbar if 'debug_toolbar' in settings.INSTALLED_APPS: @@ -663,8 +664,10 @@ def get_api_path(path): re_path(r'^ec-email/$', EntryCommentEmail.as_view()), re_path(r'^erc-email/$', EntryReviewCommentEmail.as_view()), re_path(r'^render-debug/$', RenderChart.as_view()), + ] + handler404 = Api_404View.as_view() # TODO Uncomment after fixing custom autofixtures
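Since the settings hunk above makes AWS_STORAGE_BUCKET_NAME and AWS_S3_REGION_NAME required whenever DJANGO_USE_S3 is set, here is a minimal sketch (not part of the patch) of how those settings feed the presigned-URL path used for Kobo connector outputs, mirroring get_presigned_url in apps/unified_connector/schema.py; the object key in the usage comment is a placeholder, not a real upload:

# Minimal sketch, assuming the AWS_* settings introduced above are configured.
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError
from django.conf import settings


def presigned_media_url(object_key, expiration=3600):
    # Presign a GET for an object written by save_file_remote() in kobo.py
    s3_client = boto3.client(
        's3',
        region_name=settings.AWS_S3_REGION_NAME,
        aws_access_key_id=settings.AWS_ACCESS_KEY_ID,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
    )
    try:
        return s3_client.generate_presigned_url(
            'get_object',
            Params={'Bucket': settings.AWS_STORAGE_BUCKET_NAME, 'Key': object_key},
            ExpiresIn=expiration,
        )
    except (NoCredentialsError, PartialCredentialsError):
        return None


# Illustrative usage with a placeholder key following the layout produced by save_file_remote():
# url = presigned_media_url('media/pdf/<project_id>/<timestamp>/<file_id>.pdf')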