Skip to content

Commit 73fad3d

Browse files
committed
Add search API
1 parent a065e53 commit 73fad3d

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

60 files changed

+1200
-245
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ __pycache__
1010
.vscode
1111
*.code-workspace
1212
/volumes
13+
staticfiles

Makefile

+13
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,18 @@ github-sync-owasp-organization:
1212
github-sync-related-repositories:
1313
@CMD="poetry run python manage.py github_sync_related_repositories" $(MAKE) run-backend-command
1414

15+
index:
16+
@CMD="poetry run python manage.py algolia_reindex" $(MAKE) run-backend-command
17+
1518
migrate:
1619
@CMD="poetry run python manage.py migrate" $(MAKE) run-backend-command
1720

1821
migrations:
1922
@CMD="poetry run python manage.py makemigrations" $(MAKE) run-backend-command
2023

24+
migrations-merge:
25+
@CMD="poetry run python manage.py makemigrations --merge" $(MAKE) run-backend-command
26+
2127
owasp-scrape-site-data:
2228
@CMD="poetry run python manage.py owasp_scrape_site_data" $(MAKE) run-backend-command
2329

@@ -27,6 +33,9 @@ owasp-update-projects:
2733
pre-commit:
2834
@pre-commit run -a
2935

36+
purge-data:
37+
@CMD="poetry run python manage.py purge_data" $(MAKE) run-backend-command
38+
3039
run:
3140
@docker compose up
3241

@@ -44,3 +53,7 @@ sync:
4453

4554
# Run the backend test suite. The recipe ends with pytest so that its exit
# status becomes the target's status and failing tests fail `make test`.
# No `cd ..` is needed: each recipe line runs in its own shell.
test:
	@cd backend && poetry run pytest
56+
57+
update:
58+
@$(MAKE) sync
59+
@$(MAKE) index
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Common management module."""
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
"""Common management commands."""
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""A command to purge OWASP Nest data."""
2+
3+
from django.core.management.base import BaseCommand
4+
from django.db import connection
5+
6+
from apps.github.models import Issue, Label, Organization, Release, Repository, User
7+
from apps.owasp.models import Chapter, Committee, Event, Project
8+
9+
BATCH_SIZE = 10
10+
11+
12+
class Command(BaseCommand):
    """Management command that empties the GitHub and OWASP entity tables."""

    help = "Purge OWASP Nest data."

    def handle(self, *_args, **options):
        """Truncate the table of every purgeable model.

        Each table is truncated with ``CASCADE``, one statement per model,
        and a confirmation line is written after each one. ``options`` is
        accepted because the base class passes it; it is not used here.
        """
        # Models from both apps: apps.github and apps.owasp.
        models = (
            Chapter,
            Committee,
            Event,
            Issue,
            Label,
            Organization,
            Project,
            Release,
            Repository,
            User,
        )

        with connection.cursor() as cursor:
            for model in models:
                meta = model._meta  # noqa: SLF001
                cursor.execute(f"TRUNCATE TABLE {meta.db_table} CASCADE")
                # The model list mixes GitHub and OWASP entities, so the
                # message names only the model instead of claiming "GitHub".
                self.stdout.write(f"Purged {meta.verbose_name_plural}")

backend/apps/common/models.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,10 @@ def bulk_save(model, objects):
1414
"""Bulk save objects."""
1515
model.objects.bulk_create(o for o in objects if not o.id)
1616
model.objects.bulk_update(
17-
[o for o in objects if o.id],
17+
(o for o in objects if o.id),
1818
fields=[field.name for field in model._meta.fields if not field.primary_key], # noqa: SLF001
1919
)
20+
objects.clear()
2021

2122

2223
class TimestampedModel(models.Model):

backend/apps/common/templates/base.html

Whitespace-only changes.

backend/apps/common/utils.py

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
"""Common app utils."""
2+
3+
4+
def join_values(fields, delimiter=" "):
    """Join non-empty field values using the delimiter.

    Args:
        fields: Iterable of strings; falsy entries (``None``, ``""``) are skipped.
        delimiter: String placed between the remaining values.

    Returns:
        The joined string, or ``""`` when no value remains.
    """
    # The result must be returned; without it the function always yields None.
    return delimiter.join(field for field in fields if field)

backend/apps/github/admin.py

+2
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,9 @@ class RepositoryAdmin(admin.ModelAdmin):
5656
"is_owasp_site_repository",
5757
"has_funding_yml",
5858
"is_funding_policy_compliant",
59+
"is_template",
5960
"is_fork",
61+
"organization",
6062
)
6163
ordering = ("-created_at",)
6264
search_fields = ("name",)

backend/apps/github/constants.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import re
44

5+
GITHUB_GHOST_USER_LOGIN = "ghost"
56
GITHUB_ITEMS_PER_PAGE = 100
6-
GITHUB_ORGANIZATION_RE = re.compile("^https://github.com/([^/]+)/?$")
7-
GITHUB_REPOSITORY_RE = re.compile("^https://github.com/([^/]+)/([^/]+)/?$")
7+
GITHUB_REPOSITORY_RE = re.compile("^https://github.com/([^/]+)/([^/]+)(/.*)?$")
8+
GITHUB_USER_RE = re.compile("^https://github.com/([^/]+)/?$")

backend/apps/github/index.py

+93
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
"""GitHub app index."""
2+
3+
from datetime import timedelta as td
4+
5+
from algoliasearch_django import AlgoliaIndex
6+
from algoliasearch_django.decorators import register
7+
from django.db.models import Q
8+
from django.utils import timezone
9+
10+
from apps.github.models.issue import Issue
11+
12+
13+
@register(Issue)
class IssueIndex(AlgoliaIndex):
    """Algolia index definition for GitHub issues."""

    index_name = "issues"

    # Model attributes exported to the index.
    fields = (
        "idx_author_login",
        "idx_author_name",
        "idx_body",
        "idx_comments_count",
        "idx_created_at",
        "idx_labels",
        "idx_project_description",
        "idx_project_level",
        "idx_project_name",
        "idx_project_tags",
        "idx_repository_contributors_count",
        "idx_repository_description",
        "idx_repository_forks_count",
        "idx_repository_languages",
        "idx_repository_name",
        "idx_repository_stars_count",
        "idx_repository_topics",
        "idx_title",
        "idx_updated_at",
        "idx_url",
    )

    settings = {
        "minProximity": 3,
        "attributeForDistinct": "idx_project_name",
        "distinct": True,
        "indexLanguages": ["en"],
        "customRanking": [
            "desc(idx_created_at)",
            "desc(idx_updated_at)",
            "desc(idx_comments_count)",
            "desc(idx_repository_contributors_count)",
            "desc(idx_repository_stars_count)",
            "desc(idx_repository_forks_count)",
        ],
        "ranking": [
            "typo",
            "geo",
            "words",
            "filters",
            "proximity",
            "attribute",
            "exact",
            "custom",
        ],
        "searchableAttributes": [
            "unordered(idx_labels, idx_repository_languages)",
            "unordered(idx_title, idx_project_name, idx_repository_name)",
            "unordered(idx_project_description, idx_repository_description)",
            "unordered(idx_project_tags, idx_repository_topics)",
            "unordered(idx_author_login, idx_author_name)",
            "unordered(idx_body)",
        ],
    }

    # Name of the model attribute consulted to decide whether to index a record.
    should_index = "is_indexable"

    def get_queryset(self):
        """Return the issues eligible for indexing.

        An issue is selected when it has no assignees, or when it has
        assignees but was last updated at least 60 days ago.
        """
        stale_before = timezone.now() - td(days=60)
        return (
            Issue.objects.select_related(
                "repository",
            )
            .prefetch_related(
                "assignees",
                "labels",
                "repository__project_set",
            )
            .filter(
                Q(assignees__isnull=True)
                | Q(assignees__isnull=False, updated_at__lte=stale_before)
            )
            # Filtering across the assignees relation yields one row per
            # assignee; collapse them so each issue is returned once.
            .distinct()
        )

backend/apps/github/management/commands/github_sync_owasp_organization.py

+16-31
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,10 @@
66
from django.core.management.base import BaseCommand
77

88
from apps.github.constants import GITHUB_ITEMS_PER_PAGE
9-
from apps.github.models import Issue, Organization, Release, Repository, sync_repository
9+
from apps.github.models import Release, Repository, sync_repository
1010
from apps.owasp.constants import OWASP_ORGANIZATION_NAME
1111
from apps.owasp.models import Chapter, Committee, Event, Project
1212

13-
BATCH_SIZE = 10
14-
1513

1614
class Command(BaseCommand):
1715
help = "Updates OWASP entities based on their GitHub data."
@@ -20,17 +18,6 @@ def add_arguments(self, parser):
2018
parser.add_argument("--offset", default=0, required=False, type=int)
2119

2220
def handle(self, *_args, **options):
23-
def save_data():
24-
"""Save data to DB."""
25-
Organization.bulk_save(organizations)
26-
Issue.bulk_save(issues)
27-
Release.bulk_save(releases)
28-
29-
Chapter.bulk_save(chapters)
30-
Committee.bulk_save(committees)
31-
Event.bulk_save(events)
32-
Project.bulk_save(projects)
33-
3421
gh = github.Github(os.getenv("GITHUB_TOKEN"), per_page=GITHUB_ITEMS_PER_PAGE)
3522
gh_owasp_organization = gh.get_organization(OWASP_ORGANIZATION_NAME)
3623
remote_owasp_repositories_count = gh_owasp_organization.public_repos
@@ -41,27 +28,23 @@ def save_data():
4128
chapters = []
4229
committees = []
4330
events = []
44-
issues = []
45-
organizations = []
4631
projects = []
4732
releases = []
4833

4934
offset = options["offset"]
50-
for idx, gh_repository in enumerate(
51-
gh_owasp_organization.get_repos(
52-
type="public",
53-
sort="created",
54-
direction="asc",
55-
)[offset:]
56-
):
57-
print(f"{idx + offset + 1:<4} {gh_repository.name}")
35+
gh_repositories = gh_owasp_organization.get_repos(
36+
type="public",
37+
sort="created",
38+
direction="desc",
39+
)
40+
total_count = gh_repositories.totalCount - offset
41+
for idx, gh_repository in enumerate(gh_repositories[offset:]):
42+
prefix = f"{idx + offset + 1} of {total_count}"
43+
print(f"{prefix:<12} {gh_repository.name}")
5844

5945
owasp_organization, repository, new_releases = sync_repository(
6046
gh_repository, organization=owasp_organization, user=owasp_user
6147
)
62-
if not owasp_organization.id:
63-
owasp_organization.save()
64-
6548
releases.extend(new_releases)
6649

6750
entity_key = gh_repository.name.lower()
@@ -81,11 +64,13 @@ def save_data():
8164
elif entity_key.startswith("www-committee-"):
8265
committees.append(Committee.update_data(gh_repository, repository, save=False))
8366

84-
if idx % BATCH_SIZE == 0:
85-
save_data()
67+
# Bulk save data.
68+
Release.bulk_save(releases)
8669

87-
# Save remaining data.
88-
save_data()
70+
Chapter.bulk_save(chapters)
71+
Committee.bulk_save(committees)
72+
Event.bulk_save(events)
73+
Project.bulk_save(projects)
8974

9075
# Check repository counts.
9176
local_owasp_repositories_count = Repository.objects.filter(

backend/apps/github/management/commands/github_sync_related_repositories.py

+10-17
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414

1515
logger = logging.getLogger(__name__)
1616

17-
BATCH_SIZE = 10
18-
1917

2018
class Command(BaseCommand):
2119
help = "Updates OWASP entities based on their owasp.org data."
@@ -24,12 +22,6 @@ def add_arguments(self, parser):
2422
parser.add_argument("--offset", default=0, required=False, type=int)
2523

2624
def handle(self, *args, **options):
27-
def save_data():
28-
"""Save data to DB."""
29-
Issue.bulk_save(issues)
30-
Release.bulk_save(releases)
31-
Project.bulk_save(projects)
32-
3325
active_projects = Project.objects.filter(is_active=True).order_by("created_at")
3426
gh = github.Github(os.getenv("GITHUB_TOKEN"), per_page=GITHUB_ITEMS_PER_PAGE)
3527

@@ -39,9 +31,10 @@ def save_data():
3931

4032
offset = options["offset"]
4133
for idx, project in enumerate(active_projects[offset:]):
42-
print(f"{idx + offset + 1:<4}", project.owasp_url)
34+
prefix = f"{idx + offset + 1} of {active_projects.count() - offset}"
35+
print(f"{prefix:<12} {project.owasp_url}")
4336

44-
repository_urls = project.repositories_raw.copy()
37+
repository_urls = project.related_urls.copy()
4538
for repository_url in repository_urls:
4639
repository_path = get_repository_path(repository_url)
4740
if not repository_path:
@@ -52,8 +45,9 @@ def save_data():
5245
gh_repository = gh.get_repo(repository_path)
5346
except UnknownObjectException as e:
5447
if e.data["status"] == "404" and "Not Found" in e.data["message"]:
55-
project.repositories_raw.remove(repository_url)
56-
project.save(update_fields=("repositories_raw",))
48+
project.invalid_urls.add(repository_url)
49+
project.related_urls.remove(repository_url)
50+
project.save(update_fields=("invalid_urls", "related_urls"))
5751
continue
5852

5953
organization, repository, new_releases = sync_repository(gh_repository)
@@ -65,8 +59,7 @@ def save_data():
6559

6660
projects.append(project)
6761

68-
if idx % BATCH_SIZE == 0:
69-
save_data()
70-
71-
# Save remaining data.
72-
save_data()
62+
# Bulk save data.
63+
Issue.bulk_save(issues)
64+
Release.bulk_save(releases)
65+
Project.bulk_save(projects)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Generated by Django 5.1.1 on 2024-09-05 23:55
2+
3+
from django.db import migrations, models
4+
5+
6+
class Migration(migrations.Migration):
    """Alter the ``issue`` model's Meta options and its ``url`` field."""

    dependencies = [("github", "0039_alter_issue_title")]

    operations = [
        # Set the default ordering and the plural display name.
        migrations.AlterModelOptions(
            name="issue",
            options={
                "ordering": ("-updated_at", "-state"),
                "verbose_name_plural": "Issues",
            },
        ),
        # Redefine ``url`` as a URL field of up to 500 characters, empty by default.
        migrations.AlterField(
            model_name="issue",
            name="url",
            field=models.URLField(
                default="",
                max_length=500,
                verbose_name="URL",
            ),
        ),
    ]

0 commit comments

Comments
 (0)