Skip to content

Commit 7c3d194

Browse files
Merge pull request #579 from shehabgamin/sail
Update to Sail 0.3.3
2 parents d4f4469 + ab91f60 commit 7c3d194

File tree

7 files changed

+194
-11
lines changed

7 files changed

+194
-11
lines changed

sail-partitioned/benchmark.sh

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
#!/bin/bash
2+
3+
# Install
4+
5+
export DEBIAN_FRONTEND=noninteractive
6+
7+
echo "Set Timezone"
8+
export TZ=Etc/UTC
9+
# Point the system at $TZ. The write to /etc/timezone goes through `sudo tee`
# because a plain `>` redirect is opened by the calling (unprivileged) shell, not by sudo.
sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ | sudo tee /etc/timezone >/dev/null
10+
11+
echo "Install Rust"
12+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh
13+
bash rust-init.sh -y
14+
export HOME=${HOME:=~}
15+
source ~/.cargo/env
16+
17+
echo "Install Dependencies"
18+
sudo apt-get update -y
19+
sudo apt-get install -y software-properties-common
20+
sudo add-apt-repository ppa:deadsnakes/ppa -y
21+
sudo apt-get update -y
22+
sudo apt-get install -y \
23+
gcc protobuf-compiler \
24+
libprotobuf-dev \
25+
pkg-config \
26+
libssl-dev \
27+
python3.11 \
28+
python3.11-dev \
29+
python3.11-venv \
30+
python3.11-distutils
31+
32+
echo "Set Python alternatives"
33+
sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
34+
sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
35+
curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
36+
37+
echo "Install Python packages"
38+
python3 -m venv myenv
39+
source myenv/bin/activate
40+
pip install --upgrade setuptools wheel
41+
env RUSTFLAGS="-C target-cpu=native" pip install --no-cache-dir "pysail==0.3.3" -v --no-binary pysail
42+
pip install "pyspark-client==4.0.0" \
43+
"protobuf==5.28.3" \
44+
"grpcio==1.71.2" \
45+
"grpcio-status==1.71.2" \
46+
pandas \
47+
psutil
48+
49+
# Load the data
50+
51+
echo "Download benchmark target data, partitioned"
52+
mkdir -p partitioned
53+
seq 0 99 | xargs -P100 -I{} bash -c 'wget --directory-prefix partitioned --continue --progress=dot:giga https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'
54+
55+
# Run the queries
56+
57+
./run.sh 2>&1 | tee log.txt
58+
59+
cat log.txt | grep -P '^Time:\s+([\d\.]+)|Failure!' | sed -r -e 's/Time: //; s/^Failure!$/null/' |
60+
awk '{ if (i % 3 == 0) { printf "[" }; printf $1; if (i % 3 != 2) { printf "," } else { print "]," }; ++i; }'
61+
62+
# The partitioned dataset is downloaded as hits_0..hits_99 into ./partitioned,
# so report the combined size of those files (the last line of `du -c` is the total).
echo "Data size: $(du -bcs partitioned/hits_*.parquet | tail -n1)"
63+
echo "Load time: 0"

sail-partitioned/queries.sql

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
SELECT COUNT(*) FROM hits;
2+
SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0;
3+
SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits;
4+
SELECT AVG(UserID) FROM hits;
5+
SELECT COUNT(DISTINCT UserID) FROM hits;
6+
SELECT COUNT(DISTINCT SearchPhrase) FROM hits;
7+
SELECT MIN(EventDate), MAX(EventDate) FROM hits;
8+
SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC;
9+
SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10;
10+
SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10;
11+
SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10;
12+
SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10;
13+
SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
14+
SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10;
15+
SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10;
16+
SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;
17+
SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
18+
SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;
19+
SELECT UserID, extract(minute FROM CAST(EventTime AS TIMESTAMP)) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
20+
SELECT UserID FROM hits WHERE UserID = 435090932899640449;
21+
SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';
22+
SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
23+
SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
24+
SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10;
25+
SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10;
26+
SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10;
27+
SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10;
28+
SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
29+
SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25;
30+
SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), 
SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits;
31+
SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10;
32+
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
33+
SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10;
34+
SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10;
35+
SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10;
36+
SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10;
37+
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10;
38+
SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10;
39+
SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
40+
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
41+
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
42+
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
43+
SELECT DATE_TRUNC('minute', CAST(EventTime AS TIMESTAMP)) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', CAST(EventTime AS TIMESTAMP)) ORDER BY DATE_TRUNC('minute', CAST(EventTime AS TIMESTAMP)) LIMIT 10 OFFSET 1000;

sail-partitioned/query.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#!/usr/bin/env python3
2+
3+
from pysail.spark import SparkConnectServer
4+
from pyspark.sql import SparkSession
5+
import pyspark.sql.functions as F
6+
7+
import timeit
8+
import psutil
9+
import sys
10+
import re
11+
12+
query = sys.stdin.read()
13+
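# Rewrite the REGEXP_REPLACE back-reference to '$1', the only style Spark recognizes (query 28). run.sh reads each query with plain `read`, which strips backslashes, so the '\1' in queries.sql arrives here as '1'.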
14+
# Accept both '\1' and '1' as the replacement literal: the backslash is normally
# stripped by `read` in run.sh before the query reaches this script, but the
# rewrite must keep working if the query is ever passed through unmodified.
query = re.sub(r"""(REGEXP_REPLACE\(.*?,\s*('[^']*')\s*,\s*)('\\?1')""", r"\1'$1'", query)
15+
print(query)
16+
17+
import os
18+
os.environ["SAIL_PARQUET__BINARY_AS_STRING"] = "true"
19+
os.environ["SAIL_PARQUET__REORDER_FILTERS"] = "true"
20+
os.environ["SAIL_RUNTIME__ENABLE_SECONDARY"] = "true"
21+
os.environ["SAIL_PARQUET__ALLOW_SINGLE_FILE_PARALLELISM"] = "true"
22+
23+
server = SparkConnectServer()
24+
server.start()
25+
_, port = server.listening_address
26+
27+
spark = SparkSession.builder.remote(f"sc://localhost:{port}").getOrCreate()
28+
29+
df = spark.read.parquet("partitioned")
30+
df.createOrReplaceTempView("hits")
31+
32+
for try_num in range(3):
33+
try:
34+
start = timeit.default_timer()
35+
result = spark.sql(query)
36+
res = result.toPandas()
37+
end = timeit.default_timer()
38+
if try_num == 0:
39+
print(res)
40+
print("Time: ", round(end - start, 3))
41+
except Exception as e:
42+
print(e)
43+
print("Failure!")
44+
45+
spark.stop()

sail-partitioned/run.sh

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#!/bin/bash
2+
3+
cat queries.sql | while read query; do
4+
sync
5+
echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null
6+
7+
./query.py <<< "${query}"
8+
done

sail/benchmark.sh

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,26 +3,52 @@
33
# Install
44

55
export DEBIAN_FRONTEND=noninteractive
6+
7+
echo "Set Timezone"
68
export TZ=Etc/UTC
79
# Point the system at $TZ. The write to /etc/timezone goes through `sudo tee`
# because a plain `>` redirect is opened by the calling (unprivileged) shell, not by sudo.
sudo ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ | sudo tee /etc/timezone >/dev/null
810

11+
echo "Install Rust"
12+
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > rust-init.sh
13+
bash rust-init.sh -y
14+
export HOME=${HOME:=~}
15+
source ~/.cargo/env
16+
17+
echo "Install Dependencies"
918
sudo apt-get update -y
1019
sudo apt-get install -y software-properties-common
1120
sudo add-apt-repository ppa:deadsnakes/ppa -y
1221
sudo apt-get update -y
13-
sudo apt-get install -y python3.11 python3.11-dev python3.11-venv python3.11-distutils
14-
22+
sudo apt-get install -y \
23+
gcc protobuf-compiler \
24+
libprotobuf-dev \
25+
pkg-config \
26+
libssl-dev \
27+
python3.11 \
28+
python3.11-dev \
29+
python3.11-venv \
30+
python3.11-distutils
31+
32+
echo "Set Python alternatives"
1533
sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1 && \
1634
sudo update-alternatives --install /usr/bin/python python /usr/bin/python3.11 1 && \
1735
curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11
1836

37+
echo "Install Python packages"
1938
python3 -m venv myenv
2039
source myenv/bin/activate
2140
pip install --upgrade setuptools wheel
22-
pip install pysail[spark]==0.2.6 pandas psutil
41+
env RUSTFLAGS="-C target-cpu=native" pip install --no-cache-dir "pysail==0.3.3" -v --no-binary pysail
42+
pip install "pyspark-client==4.0.0" \
43+
"protobuf==5.28.3" \
44+
"grpcio==1.71.2" \
45+
"grpcio-status==1.71.2" \
46+
pandas \
47+
psutil
2348

2449
# Load the data
2550

51+
echo "Download benchmark target data, single file"
2652
wget --continue --progress=dot:giga 'https://datasets.clickhouse.com/hits_compatible/hits.parquet'
2753

2854
# Run the queries

sail/queries.sql

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase
1616
SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10;
1717
SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
1818
SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10;
19-
SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
19+
SELECT UserID, extract(minute FROM CAST(EventTime AS TIMESTAMP)) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10;
2020
SELECT UserID FROM hits WHERE UserID = 435090932899640449;
2121
SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%';
2222
SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10;
@@ -40,4 +40,4 @@ SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >
4040
SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000;
4141
SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100;
4242
SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000;
43-
SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000;
43+
SELECT DATE_TRUNC('minute', CAST(EventTime AS TIMESTAMP)) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', CAST(EventTime AS TIMESTAMP)) ORDER BY DATE_TRUNC('minute', CAST(EventTime AS TIMESTAMP)) LIMIT 10 OFFSET 1000;

sail/query.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616

1717
import os
1818
os.environ["SAIL_PARQUET__BINARY_AS_STRING"] = "true"
19-
os.environ["SAIL_PARQUET__PUSHDOWN_FILTERS"] = "true"
2019
os.environ["SAIL_PARQUET__REORDER_FILTERS"] = "true"
2120
os.environ["SAIL_RUNTIME__ENABLE_SECONDARY"] = "true"
2221
os.environ["SAIL_PARQUET__ALLOW_SINGLE_FILE_PARALLELISM"] = "true"
@@ -28,20 +27,19 @@
2827
spark = SparkSession.builder.remote(f"sc://localhost:{port}").getOrCreate()
2928

3029
df = spark.read.parquet("hits.parquet")
31-
# Do casting before creating the view so no need to change to unreadable integer dates in SQL
32-
df = df.withColumn("EventTime", F.timestamp_seconds("EventTime"))
33-
df = df.withColumn("EventDate", F.date_add(F.lit("1970-01-01"), F.col("EventDate")))
3430
df.createOrReplaceTempView("hits")
3531

3632
for try_num in range(3):
3733
try:
3834
start = timeit.default_timer()
3935
result = spark.sql(query)
4036
res = result.toPandas()
37+
end = timeit.default_timer()
4138
if try_num == 0:
4239
print(res)
43-
end = timeit.default_timer()
4440
print("Time: ", round(end - start, 3))
4541
except Exception as e:
46-
print(e);
42+
print(e)
4743
print("Failure!")
44+
45+
spark.stop()

0 commit comments

Comments
 (0)