diff --git a/docker-compose.yaml b/docker-compose.yaml index 8d0312e03..d108ffbd9 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -4,9 +4,10 @@ services: environment: - CLICKHOUSE_ALWAYS_RUN_INITDB_SCRIPTS=true volumes: - - ./docker/clickhouse/load_ontime.sh:/docker-entrypoint-initdb.d/load_ontime.sh - /var/lib/clickhouse/ - /var/log/clickhouse-server/ + - ./docker/clickhouse/init_schema_ontime.sql:/docker-entrypoint-initdb.d/init_schema.sql + - ./docker/clickhouse/users.xml:/etc/clickhouse-server/users.d/users.xml clickhouse: image: ${CLICKHOUSE_IMAGE:-clickhouse/clickhouse-server}:${CLICKHOUSE_VERSION:-latest} diff --git a/docker/clickhouse/init_schema_ontime.sql b/docker/clickhouse/init_schema_ontime.sql new file mode 100644 index 000000000..6df6757c4 --- /dev/null +++ b/docker/clickhouse/init_schema_ontime.sql @@ -0,0 +1,120 @@ +DROP TABLE IF EXISTS datasets.`ontime`; +DROP DATABASE IF EXISTS datasets; + +CREATE DATABASE datasets; + +CREATE TABLE datasets.`ontime` +( + `Year` UInt16, + `Quarter` UInt8, + `Month` UInt8, + `DayofMonth` UInt8, + `DayOfWeek` UInt8, + `FlightDate` Date, + `Reporting_Airline` LowCardinality(String), + `DOT_ID_Reporting_Airline` Int32, + `IATA_CODE_Reporting_Airline` LowCardinality(String), + `Tail_Number` LowCardinality(String), + `Flight_Number_Reporting_Airline` LowCardinality(String), + `OriginAirportID` Int32, + `OriginAirportSeqID` Int32, + `OriginCityMarketID` Int32, + `Origin` FixedString(5), + `OriginCityName` LowCardinality(String), + `OriginState` FixedString(2), + `OriginStateFips` FixedString(2), + `OriginStateName` LowCardinality(String), + `OriginWac` Int32, + `DestAirportID` Int32, + `DestAirportSeqID` Int32, + `DestCityMarketID` Int32, + `Dest` FixedString(5), + `DestCityName` LowCardinality(String), + `DestState` FixedString(2), + `DestStateFips` FixedString(2), + `DestStateName` LowCardinality(String), + `DestWac` Int32, + `CRSDepTime` Int32, + `DepTime` Int32, + `DepDelay` Int32, + `DepDelayMinutes` Int32, + `DepDel15` Int32, + `DepartureDelayGroups` LowCardinality(String), + `DepTimeBlk` LowCardinality(String), + `TaxiOut` Int32, + `WheelsOff` LowCardinality(String), + `WheelsOn` LowCardinality(String), + `TaxiIn` Int32, + `CRSArrTime` Int32, + `ArrTime` Int32, + `ArrDelay` Int32, + `ArrDelayMinutes` Int32, + `ArrDel15` Int32, + `ArrivalDelayGroups` LowCardinality(String), + `ArrTimeBlk` LowCardinality(String), + `Cancelled` Int8, + `CancellationCode` FixedString(1), + `Diverted` Int8, + `CRSElapsedTime` Int32, + `ActualElapsedTime` Int32, + `AirTime` Int32, + `Flights` Int32, + `Distance` Int32, + `DistanceGroup` Int8, + `CarrierDelay` Int32, + `WeatherDelay` Int32, + `NASDelay` Int32, + `SecurityDelay` Int32, + `LateAircraftDelay` Int32, + `FirstDepTime` Int16, + `TotalAddGTime` Int16, + `LongestAddGTime` Int16, + `DivAirportLandings` Int8, + `DivReachedDest` Int8, + `DivActualElapsedTime` Int16, + `DivArrDelay` Int16, + `DivDistance` Int16, + `Div1Airport` LowCardinality(String), + `Div1AirportID` Int32, + `Div1AirportSeqID` Int32, + `Div1WheelsOn` Int16, + `Div1TotalGTime` Int16, + `Div1LongestGTime` Int16, + `Div1WheelsOff` Int16, + `Div1TailNum` LowCardinality(String), + `Div2Airport` LowCardinality(String), + `Div2AirportID` Int32, + `Div2AirportSeqID` Int32, + `Div2WheelsOn` Int16, + `Div2TotalGTime` Int16, + `Div2LongestGTime` Int16, + `Div2WheelsOff` Int16, + `Div2TailNum` LowCardinality(String), + `Div3Airport` LowCardinality(String), + `Div3AirportID` Int32, + `Div3AirportSeqID` Int32, + `Div3WheelsOn` Int16, + `Div3TotalGTime` Int16, + `Div3LongestGTime` Int16, + `Div3WheelsOff` Int16, + `Div3TailNum` LowCardinality(String), + `Div4Airport` LowCardinality(String), + `Div4AirportID` Int32, + `Div4AirportSeqID` Int32, + `Div4WheelsOn` Int16, + `Div4TotalGTime` Int16, + `Div4LongestGTime` Int16, + `Div4WheelsOff` Int16, + `Div4TailNum` LowCardinality(String), + `Div5Airport` LowCardinality(String), + `Div5AirportID` Int32, + `Div5AirportSeqID` Int32, + `Div5WheelsOn` Int16, + `Div5TotalGTime` Int16, + `Div5LongestGTime` Int16, + `Div5WheelsOff` Int16, + `Div5TailNum` LowCardinality(String) +) ENGINE = MergeTree + ORDER BY (Year, Quarter, Month, DayofMonth, FlightDate, IATA_CODE_Reporting_Airline); + +INSERT INTO datasets.ontime SELECT * FROM s3('https://clickhouse-public-datasets.s3.amazonaws.com/ontime/csv_by_year/*2018*.csv.gz', CSVWithNames) SETTINGS max_insert_threads = 40; \ No newline at end of file diff --git a/docker/clickhouse/load_ontime.sh b/docker/clickhouse/load_ontime.sh deleted file mode 100644 index c28936f26..000000000 --- a/docker/clickhouse/load_ontime.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash -set -xeou pipefail -export DEBIAN_FRONTEND=noninteractive -apt-get update -y -apt-get install -y aria2 - -if [ ! -d "/var/lib/clickhouse/data/datasets/ontime/201810_364_364_1" ]; then - aria2c -x 10 -s 10 -j 10 -c --dir=/var/lib/clickhouse/ --out=ontime.tar --file-allocation=none https://clickhouse-datasets.s3.yandex.net/ontime/partitions/ontime.tar - tar xvf /var/lib/clickhouse/ontime.tar -C /var/lib/clickhouse - chown -R clickhouse:clickhouse /var/lib/clickhouse -fi