Skip to content

Commit 2b9f961

Browse files
author
Giulio Carvalho
committed
Init repo
0 parents  commit 2b9f961

33 files changed

+822
-0
lines changed

.db.env.sample

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
POSTGRES_PASSWORD=
2+
POSTGRES_USER=

.env.sample

+4
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
DB_DSN=
2+
TWOCAPTCHA_API_KEY=
3+
INSTACART_LOGIN=
4+
INSTACART_PASSWORD=

.gitignore

+139
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
# Byte-compiled / optimized / DLL files
2+
__pycache__/
3+
*.py[cod]
4+
*$py.class
5+
6+
# C extensions
7+
*.so
8+
9+
# Distribution / packaging
10+
.Python
11+
build/
12+
develop-eggs/
13+
dist/
14+
downloads/
15+
eggs/
16+
.eggs/
17+
lib/
18+
lib64/
19+
parts/
20+
sdist/
21+
var/
22+
wheels/
23+
share/python-wheels/
24+
*.egg-info/
25+
.installed.cfg
26+
*.egg
27+
MANIFEST
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.nox/
43+
.coverage
44+
.coverage.*
45+
.cache
46+
nosetests.xml
47+
coverage.xml
48+
*.cover
49+
*.py,cover
50+
.hypothesis/
51+
.pytest_cache/
52+
cover/
53+
54+
# Translations
55+
*.mo
56+
*.pot
57+
58+
# Django stuff:
59+
*.log
60+
local_settings.py
61+
db.sqlite3
62+
db.sqlite3-journal
63+
64+
# Flask stuff:
65+
instance/
66+
.webassets-cache
67+
68+
# Scrapy stuff:
69+
.scrapy
70+
71+
# Sphinx documentation
72+
docs/_build/
73+
74+
# PyBuilder
75+
.pybuilder/
76+
target/
77+
78+
# Jupyter Notebook
79+
.ipynb_checkpoints
80+
81+
# IPython
82+
profile_default/
83+
ipython_config.py
84+
85+
# pyenv
86+
# For a library or package, you might want to ignore these files since the code is
87+
# intended to run in multiple environments; otherwise, check them in:
88+
# .python-version
89+
90+
# pipenv
91+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
93+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
94+
# install all needed dependencies.
95+
#Pipfile.lock
96+
97+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
98+
__pypackages__/
99+
100+
# Celery stuff
101+
celerybeat-schedule
102+
celerybeat.pid
103+
104+
# SageMath parsed files
105+
*.sage.py
106+
107+
# Environments
108+
.env
109+
.venv
110+
env/
111+
venv/
112+
ENV/
113+
env.bak/
114+
venv.bak/
115+
.db.env
116+
117+
# Spyder project settings
118+
.spyderproject
119+
.spyproject
120+
121+
# Rope project settings
122+
.ropeproject
123+
124+
# mkdocs documentation
125+
/site
126+
127+
# mypy
128+
.mypy_cache/
129+
.dmypy.json
130+
dmypy.json
131+
132+
# Pyre type checker
133+
.pyre/
134+
135+
# pytype static type analyzer
136+
.pytype/
137+
138+
# Cython debug symbols
139+
cython_debug/

.isort.cfg

+10
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# isort configuration for the data_collection package.
[settings]
combine_star = true
combine_as_imports = true
order_by_type = true
# Wrapping options below (mode 3, trailing comma, parentheses, 88 columns)
# mirror the formatting that black produces.
multi_line_output = 3
include_trailing_comma = true
use_parentheses = true
line_length = 88
# List values must NOT be quoted: INI values keep the quote characters, which
# would turn the first/last entries into names such as `"aiohttp` that never
# match a real module.
known_third_party = aiohttp, aiopg, parsel, twocaptcha, jmespath, sqlalchemy, decouple
known_first_party = data_collection

.pre-commit-config.yaml

+20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# Hooks run by `pre-commit` on every commit: whitespace fixers, import
# sorting, formatting and linting.
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v2.4.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
  - repo: https://github.com/PyCQA/isort
    rev: 5.6.4
    hooks:
      - id: isort
  - repo: https://github.com/psf/black
    rev: 20.8b1
    hooks:
      - id: black
  # flake8 is hosted on GitHub; the former GitLab location is no longer served.
  - repo: https://github.com/PyCQA/flake8
    rev: 3.7.9
    hooks:
      - id: flake8
        # Each command-line option must be its own list item. Previously the
        # whole string was passed as a single --ignore value, so --show-source
        # was swallowed into the ignore list and never took effect.
        args: ['--ignore=E,W', '--show-source']
        files: '.*\.py$'

Dockerfile

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Image that runs the data_collection workflow.
FROM python:3.8-slim

# Refresh the package index and install in ONE layer so a cached, stale index
# is never paired with a later install; drop the index afterwards to keep the
# image small. wait-for-it is the tool invoked by the `wait-database` make target.
RUN apt-get update -y \
    && apt-get install -y wait-for-it \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /mnt

# Copy only the requirements first so the dependency layer stays cached until
# the requirements file itself changes.
COPY requirements/requirements.txt .

# --no-cache-dir: the pip download cache is useless inside an image layer.
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

RUN chmod 755 data_collection/run.sh

ENTRYPOINT ["data_collection/run.sh"]

Makefile

+40
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
# Database role used by the psql helper targets. Must be supplied through the
# environment or the command line, e.g. `POSTGRES_USER=me make sql`.
# (The previous `POSTGRES_USER ?= $(POSTGRES_USER)` was a no-op when the
# variable was set and a fatal self-reference when it was not.)
POSTGRES_USER ?=
# Statement executed by `make sql-query`.
QUERY ?= SELECT * FROM stores;

# Aborts with a readable message when POSTGRES_USER is empty; expands to
# nothing otherwise.
check-postgres-user = $(if $(strip $(POSTGRES_USER)),,$(error POSTGRES_USER is not set - run e.g. `POSTGRES_USER=<user> make <target>`))

# $(call query-command,"<SQL>") runs one statement with psql inside the
# instacart-postgres container.
query-command = $(check-postgres-user)(docker exec -it instacart-postgres psql -U $(POSTGRES_USER) -c $(1))

# Every target below is a command, not a file, so all of them are phony.

.PHONY: build
build:
	docker-compose build

.PHONY: start-database
start-database:
	docker-compose up -d database

# Blocks (up to 30 s) until the database port accepts connections.
.PHONY: wait-database
wait-database:
	docker-compose run --rm --no-deps --entrypoint "wait-for-it --timeout=30 database:5432" instacart

.PHONY: database
database: start-database wait-database

.PHONY: run
run: build database
	docker-compose run --rm --no-deps instacart

# Opens an interactive psql session.
.PHONY: sql
sql:
	$(check-postgres-user)
	docker exec -it instacart-postgres psql -U $(POSTGRES_USER)

.PHONY: sql-query
sql-query:
	$(call query-command,"$(QUERY)")

.PHONY: sql-select-all
sql-select-all:
	$(call query-command,"SELECT * FROM stores")
	$(call query-command,"SELECT * FROM shelves")
	$(call query-command,"SELECT * FROM shelf_items")

# Removes containers, volumes and images created by docker-compose.
.PHONY: destroy
destroy:
	docker-compose down -v --rmi all

.PHONY: dev-setup
dev-setup:
	pip install -r requirements/dev.txt
	pre-commit install

README.md

+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Instacart Scraper
2+
3+
Simple spider and workflow system for scraping Instacart's (US) default store
4+
for a given user.
5+
6+
## Table of Contents
7+
* [How to run](#how-to-run)
8+
* [How to check collected data](#how-to-check-collected-data)
9+
* [How to destroy the created environment](#how-to-destroy-the-created-environment)
10+
* [For developers](#for-developers)
11+
12+
## How to run
13+
14+
First, you have to set up your environment variables. Samples
15+
([`.env.sample`](.env.sample) and [`.db.env.sample`](.db.env.sample)) were
16+
provided.
17+
18+
Then, execute (Docker and Docker Compose are required):
19+
20+
```shell
21+
$ make run
22+
```
23+
24+
Done :smile:
25+
26+
*Note: Recaptcha solving may fail. Retries are already in place, but in rare
27+
cases they are insufficient. In these cases, try running it again.*
28+
29+
## How to check collected data
30+
31+
If you want to run a query in the database, execute:
32+
33+
```shell
34+
$ POSTGRES_USER=<YOUR-POSTGRES-USER> QUERY=<YOUR-QUERY> make sql-query
35+
```
36+
37+
However, to make things easier, a shortcut to make a `SELECT *` on all tables
38+
is available through:
39+
40+
```shell
41+
$ POSTGRES_USER=<YOUR-POSTGRES-USER> make sql-select-all
42+
```
43+
44+
## How to destroy the created environment
45+
46+
This project uses Docker. To destroy created images, volumes, etc., execute:
47+
48+
```shell
49+
$ make destroy
50+
```
51+
52+
## For developers
53+
54+
To set up your developer environment, create a virtualenv and execute:
55+
56+
```shell
57+
$ make dev-setup
58+
```
59+
60+
This project uses `pre-commit` for managing code formatting and `pip-tools` to
61+
manage dependencies.

data_collection/__init__.py

Whitespace-only changes.

data_collection/database/__init__.py

Whitespace-only changes.

data_collection/database/models.py

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
from sqlalchemy import (
2+
Column,
3+
ForeignKey,
4+
Integer,
5+
MetaData,
6+
String,
7+
Table,
8+
UniqueConstraint,
9+
)
10+
11+
# Registry that every table declared in this module is attached to.
metadata = MetaData()

# Stores: the name is mandatory and unique; the logo URL is optional.
stores_table = Table(
    "stores",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("store_name", String, nullable=False, unique=True),
    Column("store_logo_url", String),
)

# Shelves: each references a store, and a shelf name may appear only once
# per store.
shelves_table = Table(
    "shelves",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("shelf_name", String, nullable=False),
    Column("store_id", Integer, ForeignKey("stores.id")),
    UniqueConstraint("shelf_name", "store_id"),
)

# Shelf items: each references a shelf, and an item name may appear only once
# per shelf.
shelf_items_table = Table(
    "shelf_items",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("item_name", String, nullable=False),
    Column("shelf_id", Integer, ForeignKey("shelves.id")),
    UniqueConstraint("item_name", "shelf_id"),
)

data_collection/exceptions.py

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class ExecutionFailedException(Exception):
    """Project-specific exception type; adds nothing beyond ``Exception``."""

data_collection/run.sh

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
# Container entrypoint: runs the scraping workflow from inside the package
# directory.
#
# The work happens in a subshell so the caller's working directory is left
# untouched and the script's exit status is the workflow's own status (or 1
# when the directory cannot be entered). Previously the trailing `cd` was the
# last command, so a failed workflow still exited with status 0.
(
    cd /mnt/data_collection || exit 1
    python workflow/execute.py
)

data_collection/scraping/__init__.py

Whitespace-only changes.

data_collection/scraping/page.py

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import json
2+
3+
from parsel import Selector
4+
5+
6+
class Page:
    """Holds the payload of a fetched page in the ``content`` attribute."""

    def __init__(self, content):
        self.content = content

    @classmethod
    def from_response(cls, response):
        """Wrap ``response`` as-is, without any parsing."""
        page = cls(content=response)
        return page


class HtmlPage(Page):
    """Page whose content is a ``parsel.Selector``."""

    @classmethod
    def from_html(cls, html_response):
        """Build the page from ``html_response`` by wrapping it in a Selector."""
        selector = Selector(html_response)
        return cls(content=selector)


class JsonPage(Page):
    """Page whose content is the decoded JSON document."""

    @classmethod
    def from_json(cls, json_response):
        """Build the page by decoding ``json_response`` with ``json.loads``."""
        decoded = json.loads(json_response)
        return cls(content=decoded)

0 commit comments

Comments
 (0)