Skip to content

Commit 17acd2e

Browse files
authored
Merge pull request #261 from the-deep/hotfix-generate-preview-thumbnail
Hotfix generate preview thumbnail
2 parents 1913d1a + a4d4f68 commit 17acd2e

File tree

11 files changed

+149
-27
lines changed

11 files changed

+149
-27
lines changed
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
from django.core.management.base import BaseCommand
2+
3+
from lead.tasks import generate_previews
4+
5+
6+
class Command(BaseCommand):
    """
    Management command that queues the ``generate_previews`` task.

    With ``--lead_id`` the given ids are handed to the task; without it
    the task is queued with no arguments and picks the leads itself.
    """
    help = 'Extract preview/images from leads'

    def add_arguments(self, parser):
        """Register the optional ``--lead_id`` option (one or more ints)."""
        lead_id_option = dict(nargs='+', type=int, help='List of lead ids')
        parser.add_argument('--lead_id', **lead_id_option)

    def handle(self, *args, **options):
        """Queue preview generation, forwarding lead ids when supplied."""
        lead_ids = options['lead_id']
        # An absent/empty option means "call the task with its defaults".
        task_args = (lead_ids,) if lead_ids else ()
        generate_previews.delay(*task_args)

apps/connector/sources/base.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33

44
class Source(ABC):
5+
DEFAULT_PER_PAGE = 25
6+
57
def __init__(self):
68
if not hasattr(self, 'title') \
79
or not hasattr(self, 'key') \
@@ -12,9 +14,16 @@ def __init__(self):
1214
def fetch(params, page=None, limit=None):
1315
pass
1416

15-
def query_leads(self, params):
17+
def query_leads(self, params, limit=None, offset=None):
    """
    Return one serialized page of the leads fetched for ``params``.

    ``offset`` falls back to 0 when it is None or negative; ``limit``
    falls back to ``Source.DEFAULT_PER_PAGE`` when it is falsy or
    negative. Everything is fetched first and sliced afterwards.
    """
    from connector.serializers import SourceDataSerializer

    start = 0 if offset is None or offset < 0 else offset
    page_size = Source.DEFAULT_PER_PAGE if not limit or limit < 0 else limit

    # fetch() returns a sequence whose first element holds the lead data.
    leads = self.fetch(params)[0]
    page = leads[start:start + page_size]

    serializer = SourceDataSerializer(page, many=True)
    return serializer.data

apps/connector/sources/rss_feed.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -66,11 +66,7 @@ def fetch(self, params, offset=None, limit=None):
6666
url_field = params.get('url-field')
6767
website_field = params.get('website-field')
6868

69-
for item in (
70-
items[offset:offset + limit] if (
71-
offset is not None and limit is not None
72-
) else items
73-
):
69+
for item in items:
7470
def get_field(field):
7571
if not field:
7672
return ''
@@ -103,7 +99,7 @@ def query_options(self, params):
10399
options[field]['options'] = fields
104100
return options
105101

106-
def query_fields(self, params):
102+
def query_fields(self, params, limit=None, offset=None):
107103
if not params or not params.get('feed-url'):
108104
return []
109105

@@ -145,4 +141,8 @@ def replace_ns(tag):
145141
if fields.count(field) == 1:
146142
real_fields.append(field)
147143

148-
return real_fields
144+
if offset is None or offset < 0:
145+
offset = 1
146+
if not limit or limit < 0:
147+
limit = Source.DEFAULT_PER_PAGE
148+
return real_fields[offset:offset + limit]

apps/connector/views.py

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@
1010
from rest_framework.decorators import action
1111
from deep.permissions import ModifyPermission
1212
from project.models import Project
13+
from utils.common import parse_number
14+
1315
from .serializers import (
1416
SourceSerializer,
1517
SourceDataSerializer,
@@ -24,6 +26,7 @@
2426
ConnectorProject,
2527
)
2628
from .sources.store import source_store
29+
from .sources.base import Source
2730

2831

2932
class SourceViewSet(viewsets.ViewSet):
@@ -45,7 +48,13 @@ class SourceQueryView(views.APIView):
4548
def query(self, source_type, query, params):
4649
source = source_store[source_type]()
4750
method = getattr(source, 'query_{}'.format(query))
48-
results = method(params)
51+
52+
query_params = self.request.query_params
53+
54+
limit = parse_number(query_params.get('limit'))
55+
offset = parse_number(query_params.get('offset'))
56+
57+
results = method(params, limit, offset)
4958

5059
if isinstance(results, list):
5160
return response.Response({
@@ -112,8 +121,8 @@ def get_leads(self, request, pk=None, version=None):
112121
project_id = request.data.pop('project', None)
113122
project = project_id and Project.objects.get(id=project_id)
114123

115-
offset = request.data.pop('offset', None)
116-
limit = request.data.pop('limit', None)
124+
offset = request.data.pop('offset', None) or 0
125+
limit = request.data.pop('limit', None) or Source.DEFAULT_PER_PAGE
117126

118127
params = {
119128
**(connector.params or {}),
@@ -122,6 +131,12 @@ def get_leads(self, request, pk=None, version=None):
122131

123132
source = source_store[connector.source]()
124133
data, count = source.fetch(params, offset, limit)
134+
135+
# Paginate manually
136+
# FIXME: Make this better: probably cache, and also optimize
137+
# Because, right now, every data is pulled and then only paginated
138+
data = data[offset:offset + limit]
139+
125140
serializer = SourceDataSerializer(
126141
data,
127142
many=True,
@@ -131,7 +146,7 @@ def get_leads(self, request, pk=None, version=None):
131146

132147
return response.Response({
133148
'count': count,
134-
'count_per_page': getattr(source, 'count_per_page', None),
149+
'count_per_page': limit,
135150
'results': results
136151
})
137152

apps/lead/tasks.py

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# from channels import Group
33
from django.core.files import File
44
from django.db import transaction
5+
from django.db.models import Q
56
# from django.utils import timezone
67
from django.conf import settings
78
from lead.models import (
@@ -16,7 +17,7 @@
1617
from utils.extractor.thumbnailers import DocThumbnailer
1718
# from utils.websocket.subscription import SubscriptionConsumer
1819

19-
# import json
20+
import time
2021
import reversion
2122
import os
2223
import re
@@ -192,7 +193,7 @@ def send_lead_text_to_deepl(self, lead_id):
192193
preview.classified_doc_id = classified_doc_id
193194
preview.save()
194195
return True
195-
except Exception as e:
196+
except Exception:
196197
# Retry with exponential decay
197198
logger.warning("Error while sending request to deepl. {}".format(
198199
traceback.format_exc()))
@@ -216,7 +217,7 @@ def extract_from_lead(lead_id):
216217
# and try to prevent useless parallel extraction of same lead that
217218
# that might happen.
218219
key = 'lead_extraction_{}'.format(lead_id)
219-
lock = redis.get_lock(key, 60 * 60 * 4) # Lock lifetime 4 hours
220+
lock = redis.get_lock(key, 60 * 60 * 0.5) # Lock lifetime half hours
220221
have_lock = lock.acquire(blocking=False)
221222
if not have_lock:
222223
return False
@@ -252,3 +253,16 @@ def extract_from_lead(lead_id):
252253

253254
lock.release()
254255
return return_value
256+
257+
258+
@shared_task
def generate_previews(lead_ids=None):
    """
    Queue extraction for the given leads.

    When ``lead_ids`` is empty or omitted, every lead that has no
    preview, or whose preview has an empty text extract, is used.
    """
    if not lead_ids:
        missing_preview = (
            Q(leadpreview__isnull=True) |
            Q(leadpreview__text_extract='')
        )
        lead_ids = Lead.objects.filter(
            missing_preview,
        ).values_list('id', flat=True)

    for lead_id in lead_ids:
        extract_from_lead.s(lead_id).delay()
        # NOTE(review): the pause presumably throttles how fast
        # extraction tasks are queued -- confirm the intent.
        time.sleep(0.5)

deep/documents_types.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
88
'application/wps-office.docx',
99
]
10+
MSWORD_MIME_TYPES = [
11+
'application/msword', 'application/wps-office.doc',
12+
]
1013
POWERPOINT_MIME_TYPES = [
1114
'application/vnd.openxmlformats-officedocument.presentationml.presentation', # noqa
1215
'application/vnd.ms-powerpoint',
@@ -22,14 +25,15 @@
2225
# Overall Supported Mime Types
2326
DEEP_SUPPORTED_MIME_TYPES = [
2427
'application/rtf', 'text/plain', 'font/otf', 'text/csv',
25-
'application/json', 'application/xml', 'application/msword',
28+
'application/json', 'application/xml',
2629
] + (
27-
DOCX_MIME_TYPES + PDF_MIME_TYPES + POWERPOINT_MIME_TYPES +
28-
SHEET_MIME_TYPES + ODS_MIME_TYPES + IMAGE_MIME_TYPES
30+
DOCX_MIME_TYPES + MSWORD_MIME_TYPES + PDF_MIME_TYPES +
31+
POWERPOINT_MIME_TYPES + SHEET_MIME_TYPES + ODS_MIME_TYPES +
32+
IMAGE_MIME_TYPES
2933
)
3034

3135
DEEP_SUPPORTED_EXTENSIONS = [
3236
'docx', 'xlsx', 'pdf', 'pptx',
3337
'json', 'png', 'jpg', 'jpeg', 'csv', 'txt',
34-
'geojson', 'zip', 'ods',
38+
'geojson', 'zip', 'ods', 'doc',
3539
]

utils/common.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,9 @@ def parse_time(time_str):
9595

9696

9797
def parse_number(num_str):
98-
if not num_str:
98+
try:
99+
num = float(num_str)
100+
except (ValueError, TypeError):
99101
return None
100102
num = float(num_str)
101103
if num == round(num):

utils/extractor/document.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,19 +5,22 @@
55
PDF = 'pdf'
66
DOCX = 'docx'
77
PPTX = 'pptx'
8+
MSWORD = 'doc'
89

910
EXTRACTORS = {
1011
HTML: extractors.HtmlExtractor,
1112
PDF: extractors.PdfExtractor,
1213
DOCX: extractors.DocxExtractor,
1314
PPTX: extractors.PptxExtractor,
15+
MSWORD: extractors.MswordExtractor,
1416
}
1517

1618
THUMBNAILERS = {
1719
HTML: thumbnailers.WebThumbnailer,
1820
PDF: thumbnailers.DocThumbnailer,
1921
DOCX: thumbnailers.DocThumbnailer,
2022
PPTX: thumbnailers.DocThumbnailer,
23+
MSWORD: thumbnailers.DocThumbnailer,
2124
}
2225

2326

utils/extractor/extractors.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
from .formats.pdf import process as pdf_extract
44
from .formats.docx import (
55
process as docx_extract,
6-
pptx_process as pptx_extract
6+
pptx_process as pptx_extract,
7+
msword_process as msword_extract
78
)
89

910

@@ -69,3 +70,11 @@ class PptxExtractor(BaseExtractor):
6970
"""
7071
ERROR_MSG = "Not a pptx document"
7172
EXTRACT_METHOD = pptx_extract
73+
74+
75+
class MswordExtractor(BaseExtractor):
    """
    Extractor for msword (.doc) documents.

    Only configures the extraction callable and the error message;
    presumably both are consumed by BaseExtractor -- verify there.
    """
    EXTRACT_METHOD = msword_extract
    ERROR_MSG = "Not a msword (.doc) document"

utils/extractor/file_document.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import os
22
from .document import (
33
Document,
4-
HTML, PDF, DOCX, PPTX,
4+
HTML, PDF, DOCX, PPTX, MSWORD,
55
)
66

77

@@ -14,6 +14,7 @@ class FileDocument(Document):
1414
HTML_TYPES = ['.html', '.htm', '.txt']
1515
PDF_TYPES = ['.pdf', ]
1616
DOCX_TYPES = ['.docx', ]
17+
MSWORD_TYPES = ['.doc', ]
1718
PPTX_TYPES = ['.pptx', ]
1819

1920
def __init__(self, file, name):
@@ -28,6 +29,8 @@ def __init__(self, file, name):
2829
type = HTML
2930
elif extension in self.DOCX_TYPES:
3031
type = DOCX
32+
elif extension in self.MSWORD_TYPES:
33+
type = MSWORD
3134
elif extension in self.PPTX_TYPES:
3235
type = PPTX
3336

0 commit comments

Comments
 (0)