|
1 |
| -# tutorial-batch-data-engineering |
| 1 | +# tutorial-batch-data-engineering |
| 2 | + |
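| 3 | +![Python 3.7.0](https://img.shields.io/badge/python-3.7.0-green.svg) |
| 4 | +[![Coverage Status](https://coveralls.io/repos/github/arthuralvim/tutorial-batch-data-engineering/badge.svg?branch=master)](https://coveralls.io/github/arthuralvim/tutorial-batch-data-engineering?branch=master) |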
| 5 | + |
| 6 | + |
| 7 | +## DESENVOLVIMENTO |
| 8 | + |
| 9 | +```bash |
| 10 | +$ pre-commit install |
| 11 | +``` |
| 12 | + |
| 13 | +```bash |
| 14 | +$ pipenv install --dev |
| 15 | +``` |
| 16 | + |
| 17 | + |
| 18 | +## BASE DE DADOS |
| 19 | + |
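| 20 | +Criar um bucket no S3 (`tutorial-batch-data-engineering`) e enviar os dados do link abaixo para o prefixo `cota-parlamentar/`. |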
| 21 | + |
| 22 | +https://brasil.io/dataset/gastos-deputados/cota_parlamentar |
| 23 | + |
| 24 | +## AWS ECR |
| 25 | + |
| 26 | +## AWS BATCH |
| 27 | + |
| 28 | +## AWS ATHENA |
| 29 | + |
| 30 | +https://docs.aws.amazon.com/athena/latest/ug/select.html |
| 31 | +https://docs.aws.amazon.com/athena/latest/ug/data-types.html |
| 32 | + |
| 33 | +```sql |
-- External Athena table over the comma-delimited files stored under
-- s3://tutorial-batch-data-engineering/cota-parlamentar/.
--
-- IF NOT EXISTS makes the statement safe to re-run, but it also means an
-- already-created table keeps its old column types: drop the table first
-- if it was created with an earlier version of this DDL.
--
-- The four vlr* amount columns are decimal(15,2) instead of float so that
-- aggregates such as SUM(vlrliquido) are exact rather than accumulating
-- 32-bit floating-point rounding error.
-- NOTE(review): assumes amounts are written with '.' as the decimal
-- separator and at most two decimal places -- confirm against the files.
-- NOTE(review): LazySimpleSerDe splits on every ',' and no
-- 'skip.header.line.count' is set; if the files have a header row or
-- quoted fields containing commas, rows will be misparsed -- verify.
CREATE EXTERNAL TABLE IF NOT EXISTS dados_brasil.deputados_cota_parlamentar (
    `codlegislatura` int,
    `datemissao` timestamp,
    `idedocumento` int,
    `idecadastro` int,
    `indtipodocumento` int,
    `nucarteiraparlamentar` int,
    `nudeputadoid` int,
    `nulegislatura` int,
    `numano` int,
    `numespecificacaosubcota` int,
    `numlote` int,
    `nummes` int,
    `numparcela` int,
    `numressarcimento` int,
    `numsubcota` int,
    `sgpartido` string,
    `sguf` string,
    `txnomeparlamentar` string,
    `txtcnpjcpf` string,
    `txtdescricao` string,
    `txtdescricaoespecificacao` string,
    `txtfornecedor` string,
    `txtnumero` string,
    `txtpassageiro` string,
    `txttrecho` string,
    `vlrdocumento` decimal(15,2),
    `vlrglosa` decimal(15,2),
    `vlrliquido` decimal(15,2),
    `vlrrestituicao` decimal(15,2)
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
    'serialization.format' = ',',
    'field.delim' = ','
)
LOCATION 's3://tutorial-batch-data-engineering/cota-parlamentar/'
TBLPROPERTIES ('has_encrypted_data' = 'false');
| 71 | +``` |
| 72 | + |
| 73 | +https://aws.amazon.com/blogs/big-data/top-10-performance-tuning-tips-for-amazon-athena/ |
| 74 | + |
| 75 | +```sql |
-- Preview: return 10 rows with every column to inspect the table contents.
SELECT *
FROM "dados_brasil"."deputados_cota_parlamentar"
LIMIT 10;
| 77 | +``` |
| 78 | + |
| 79 | +```sql |
-- Total number of rows in the table.
SELECT COUNT(*)
FROM "dados_brasil"."deputados_cota_parlamentar";
| 81 | +``` |
| 82 | + |
| 83 | +```sql |
-- The 10 txnomeparlamentar values with the most rows, largest count first.
SELECT
    txnomeparlamentar,
    COUNT(*) AS count
FROM "dados_brasil"."deputados_cota_parlamentar"
GROUP BY txnomeparlamentar
ORDER BY count DESC
LIMIT 10;
| 85 | +``` |
| 86 | + |
| 87 | +```sql |
-- Sum of vlrliquido per (txnomeparlamentar, sgpartido, sguf) group,
-- returning the 10 groups with the largest total.
-- The ranking is on the aggregate itself, so LIMIT 10 yields a top-10 by
-- total; the name is only a tiebreaker to keep the output order stable.
SELECT
    txnomeparlamentar,
    sgpartido,
    sguf,
    SUM(vlrliquido) AS vlrtotal
FROM "dados_brasil"."deputados_cota_parlamentar"
GROUP BY
    txnomeparlamentar,
    sgpartido,
    sguf
ORDER BY
    vlrtotal DESC,
    txnomeparlamentar
LIMIT 10;
| 89 | +``` |
| 90 | + |
| 91 | + |
| 92 | +Contagem 1 (Run time: 2.36 seconds, Data scanned: 713.04 MB): |
| 93 | +3.697.967 linhas |
| 94 | + |
| 95 | +Contagem 2 (Run time: 2.21 seconds, Data scanned: 99.33 MB): |
| 96 | +3.697.983 linhas |
| 97 | + |
| 98 | + |
| 99 | +Diferença entre as duas contagens: 16 linhas (3.697.983 − 3.697.967). 8 arquivos. |
0 commit comments