diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q53.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q53.py new file mode 100644 index 00000000000..2ae01baa0b3 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q53.py @@ -0,0 +1,170 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 53.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 53.""" + return """ + SELECT * + FROM (SELECT i_manufact_id, + Sum(ss_sales_price) sum_sales, + Avg(Sum(ss_sales_price)) + OVER ( + partition BY i_manufact_id) avg_quarterly_sales + FROM item, + store_sales, + date_dim, + store + WHERE ss_item_sk = i_item_sk + AND ss_sold_date_sk = d_date_sk + AND ss_store_sk = s_store_sk + AND d_month_seq IN ( 1199, 1199 + 1, 1199 + 2, 1199 + 3, + 1199 + 4, 1199 + 5, 1199 + 6, 1199 + 7, + 1199 + 8, 1199 + 9, 1199 + 10, 1199 + 11 ) + AND ( ( i_category IN ( 'Books', 'Children', 'Electronics' ) + AND i_class IN ( 'personal', 'portable', 'reference', + 'self-help' ) + AND i_brand IN ( 'scholaramalgamalg #14', + 'scholaramalgamalg #7' + , + 'exportiunivamalg #9', + 'scholaramalgamalg #9' ) + ) + OR ( i_category IN ( 'Women', 'Music', 'Men' ) + AND i_class IN ( 'accessories', 'classical', + 'fragrances', + 'pants' ) + AND i_brand IN ( 'amalgimporto #1', + 'edu packscholar #1', + 'exportiimporto #1', + 'importoamalg #1' ) ) ) + GROUP BY i_manufact_id, + d_qoy) tmp1 + WHERE CASE + WHEN avg_quarterly_sales > 0 THEN Abs (sum_sales - avg_quarterly_sales) + / + avg_quarterly_sales + ELSE NULL + END > 0.1 + ORDER BY avg_quarterly_sales, + sum_sales, + i_manufact_id + LIMIT 
100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 53.""" + # Load tables + item = get_data(run_config.dataset_path, "item", run_config.suffix) + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + store = get_data(run_config.dataset_path, "store", run_config.suffix) + month_seq_list = list(range(1199, 1199 + 12)) + grouped_data = ( + store_sales.join(item, left_on="ss_item_sk", right_on="i_item_sk") + .join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(store, left_on="ss_store_sk", right_on="s_store_sk") + .filter(pl.col("d_month_seq").is_in(month_seq_list)) + .filter( + # Books/Children/Electronics categories + ( + (pl.col("i_category").is_in(["Books", "Children", "Electronics"])) + & ( + pl.col("i_class").is_in( + ["personal", "portable", "reference", "self-help"] + ) + ) + & ( + pl.col("i_brand").is_in( + [ + "scholaramalgamalg #14", + "scholaramalgamalg #7", + "exportiunivamalg #9", + "scholaramalgamalg #9", + ] + ) + ) + ) + | + # Women/Music/Men categories + ( + (pl.col("i_category").is_in(["Women", "Music", "Men"])) + & ( + pl.col("i_class").is_in( + ["accessories", "classical", "fragrances", "pants"] + ) + ) + & ( + pl.col("i_brand").is_in( + [ + "amalgimporto #1", + "edu packscholar #1", + "exportiimporto #1", + "importoamalg #1", + ] + ) + ) + ) + ) + .group_by(["i_manufact_id", "d_qoy"]) + .agg([pl.col("ss_sales_price").sum().alias("sum_sales_raw")]) + .with_columns( + [ + pl.when(pl.col("sum_sales_raw").is_not_null()) + .then(pl.col("sum_sales_raw")) + .otherwise(None) + .alias("sum(ss_sales_price)") + ] + ) + ) + non_null_data = grouped_data.filter(pl.col("i_manufact_id").is_not_null()) + null_data = grouped_data.filter(pl.col("i_manufact_id").is_null()) + manufacturer_averages = non_null_data.group_by("i_manufact_id").agg( + 
[pl.col("sum(ss_sales_price)").mean().alias("avg_quarterly_sales")] + ) + non_null_result = non_null_data.join( + manufacturer_averages, on="i_manufact_id", how="left" + ) + null_result = null_data.with_columns( + [pl.col("sum(ss_sales_price)").mean().alias("avg_quarterly_sales")] + ) + inner_query = pl.concat([non_null_result, null_result]).select( + [ + "i_manufact_id", + pl.col("sum(ss_sales_price)").alias("sum_sales"), + "avg_quarterly_sales", + ] + ) + return ( + inner_query.filter( + # Percentage deviation > 10% + pl.when(pl.col("avg_quarterly_sales") > 0) + .then( + (pl.col("sum_sales") - pl.col("avg_quarterly_sales")).abs() + / pl.col("avg_quarterly_sales") + ) + .otherwise(None) + > 0.1 + ) + .select(["i_manufact_id", "sum_sales", "avg_quarterly_sales"]) + .sort( + ["avg_quarterly_sales", "sum_sales", "i_manufact_id"], + nulls_last=True, + descending=[False, False, False], + ) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q54.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q54.py new file mode 100644 index 00000000000..ddd286e1e5b --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q54.py @@ -0,0 +1,159 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 54.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 54.""" + return """ + WITH my_customers + AS (SELECT DISTINCT c_customer_sk, + c_current_addr_sk + FROM (SELECT cs_sold_date_sk sold_date_sk, + cs_bill_customer_sk customer_sk, + cs_item_sk item_sk + FROM catalog_sales + UNION ALL + SELECT ws_sold_date_sk sold_date_sk, + ws_bill_customer_sk customer_sk, + ws_item_sk item_sk + FROM web_sales) cs_or_ws_sales, + item, + date_dim, + customer + WHERE sold_date_sk = d_date_sk + AND item_sk = i_item_sk + AND i_category = 'Sports' + AND i_class = 'fitness' + AND c_customer_sk = cs_or_ws_sales.customer_sk + AND d_moy = 5 + AND d_year = 2000), + my_revenue + AS (SELECT c_customer_sk, + Sum(ss_ext_sales_price) AS revenue + FROM my_customers, + store_sales, + customer_address, + store, + date_dim + WHERE c_current_addr_sk = ca_address_sk + AND ca_county = s_county + AND ca_state = s_state + AND ss_sold_date_sk = d_date_sk + AND c_customer_sk = ss_customer_sk + AND d_month_seq BETWEEN (SELECT DISTINCT d_month_seq + 1 + FROM date_dim + WHERE d_year = 2000 + AND d_moy = 5) AND + (SELECT DISTINCT + d_month_seq + 3 + FROM date_dim + WHERE d_year = 2000 + AND d_moy = 5) + GROUP BY c_customer_sk), + segments + AS (SELECT Cast(( revenue / 50 ) AS INT) AS segment + FROM my_revenue) + SELECT segment, + Count(*) AS num_customers, + segment * 50 AS segment_base + FROM segments + GROUP BY segment + ORDER BY segment, + num_customers + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 54.""" + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + web_sales = get_data(run_config.dataset_path, 
"web_sales", run_config.suffix) + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + customer = get_data(run_config.dataset_path, "customer", run_config.suffix) + customer_address = get_data( + run_config.dataset_path, "customer_address", run_config.suffix + ) + store = get_data(run_config.dataset_path, "store", run_config.suffix) + + cs_sales = catalog_sales.select( + [ + pl.col("cs_sold_date_sk").alias("sold_date_sk"), + pl.col("cs_bill_customer_sk").alias("customer_sk"), + pl.col("cs_item_sk").alias("item_sk"), + ] + ) + ws_sales = web_sales.select( + [ + pl.col("ws_sold_date_sk").alias("sold_date_sk"), + pl.col("ws_bill_customer_sk").alias("customer_sk"), + pl.col("ws_item_sk").alias("item_sk"), + ] + ) + cs_or_ws_sales = pl.concat([cs_sales, ws_sales]) + my_customers = ( + cs_or_ws_sales.join(date_dim, left_on="sold_date_sk", right_on="d_date_sk") + .join(item, left_on="item_sk", right_on="i_item_sk") + .join(customer, left_on="customer_sk", right_on="c_customer_sk") + .filter( + (pl.col("i_category") == "Sports") + & (pl.col("i_class") == "fitness") + & (pl.col("d_moy") == 5) + & (pl.col("d_year") == 2000) + ) + .select([pl.col("customer_sk").alias("c_customer_sk"), "c_current_addr_sk"]) + .unique() + ) + + my_revenue = ( + my_customers.join( + customer_address, left_on="c_current_addr_sk", right_on="ca_address_sk" + ) + .join( + store, left_on=["ca_county", "ca_state"], right_on=["s_county", "s_state"] + ) + .join(store_sales, left_on="c_customer_sk", right_on="ss_customer_sk") + .join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .filter(pl.col("d_month_seq").is_between(1206, 1208)) + .group_by("c_customer_sk") + .agg([pl.col("ss_ext_sales_price").sum().alias("revenue")]) + ) + + segments = my_revenue.with_columns( + (pl.col("revenue") / 
50.0).round(0).cast(pl.Int32).alias("segment") + ).select("segment") + + return ( + segments.group_by("segment") + .agg([pl.len().alias("num_customers")]) + .with_columns((pl.col("segment") * 50).alias("segment_base")) + .select( + [ + "segment", + pl.col("num_customers").cast(pl.Int64), + "segment_base", + ] + ) + .sort( + ["segment", "num_customers"], + nulls_last=True, + descending=[False, False], + ) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q55.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q55.py new file mode 100644 index 00000000000..d2cfd0bacd7 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q55.py @@ -0,0 +1,64 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 55.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 55.""" + return """ + SELECT i_brand_id brand_id, + i_brand brand, + Sum(ss_ext_sales_price) ext_price + FROM date_dim, + store_sales, + item + WHERE d_date_sk = ss_sold_date_sk + AND ss_item_sk = i_item_sk + AND i_manager_id = 33 + AND d_moy = 12 + AND d_year = 1998 + GROUP BY i_brand, + i_brand_id + ORDER BY ext_price DESC, + i_brand_id + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 55.""" + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + return ( + store_sales.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(item, 
left_on="ss_item_sk", right_on="i_item_sk") + .filter( + (pl.col("i_manager_id") == 33) + & (pl.col("d_moy") == 12) + & (pl.col("d_year") == 1998) + ) + .group_by(["i_brand", "i_brand_id"]) + .agg(pl.col("ss_ext_sales_price").sum().alias("ext_price")) + .select( + [ + pl.col("i_brand_id").alias("brand_id"), + pl.col("i_brand").alias("brand"), + pl.col("ext_price"), + ] + ) + .sort(["ext_price", "brand_id"], descending=[True, False], nulls_last=True) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q56.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q56.py new file mode 100644 index 00000000000..7e2976eab04 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q56.py @@ -0,0 +1,182 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 56.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 56.""" + return """ + WITH ss + AS (SELECT i_item_id, + Sum(ss_ext_sales_price) total_sales + FROM store_sales, + date_dim, + customer_address, + item + WHERE i_item_id IN (SELECT i_item_id + FROM item + WHERE i_color IN ( 'firebrick', 'rosy', 'white' ) + ) + AND ss_item_sk = i_item_sk + AND ss_sold_date_sk = d_date_sk + AND d_year = 1998 + AND d_moy = 3 + AND ss_addr_sk = ca_address_sk + AND ca_gmt_offset = -6 + GROUP BY i_item_id), + cs + AS (SELECT i_item_id, + Sum(cs_ext_sales_price) total_sales + FROM catalog_sales, + date_dim, + customer_address, + item + WHERE i_item_id IN (SELECT i_item_id + FROM item + WHERE i_color IN ( 'firebrick', 'rosy', 'white' ) + ) + AND cs_item_sk = i_item_sk + AND cs_sold_date_sk = d_date_sk + AND 
d_year = 1998 + AND d_moy = 3 + AND cs_bill_addr_sk = ca_address_sk + AND ca_gmt_offset = -6 + GROUP BY i_item_id), + ws + AS (SELECT i_item_id, + Sum(ws_ext_sales_price) total_sales + FROM web_sales, + date_dim, + customer_address, + item + WHERE i_item_id IN (SELECT i_item_id + FROM item + WHERE i_color IN ( 'firebrick', 'rosy', 'white' ) + ) + AND ws_item_sk = i_item_sk + AND ws_sold_date_sk = d_date_sk + AND d_year = 1998 + AND d_moy = 3 + AND ws_bill_addr_sk = ca_address_sk + AND ca_gmt_offset = -6 + GROUP BY i_item_id) + SELECT i_item_id, + Sum(total_sales) total_sales + FROM (SELECT * + FROM ss + UNION ALL + SELECT * + FROM cs + UNION ALL + SELECT * + FROM ws) tmp1 + GROUP BY i_item_id + ORDER BY total_sales + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 56.""" + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + customer_address = get_data( + run_config.dataset_path, "customer_address", run_config.suffix + ) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + + color_item_ids_lf = ( + item.filter(pl.col("i_color").is_in(["firebrick", "rosy", "white"])) + .select(["i_item_id"]) + .unique() + ) + + channels = [ + { + "lf": store_sales, + "sold_date_col": "ss_sold_date_sk", + "item_sk_col": "ss_item_sk", + "addr_sk_col": "ss_addr_sk", + "ext_col": "ss_ext_sales_price", + }, + { + "lf": catalog_sales, + "sold_date_col": "cs_sold_date_sk", + "item_sk_col": "cs_item_sk", + "addr_sk_col": "cs_bill_addr_sk", + "ext_col": "cs_ext_sales_price", + }, + { + "lf": web_sales, + "sold_date_col": "ws_sold_date_sk", + "item_sk_col": "ws_item_sk", + "addr_sk_col": "ws_bill_addr_sk", + "ext_col": "ws_ext_sales_price", + }, + ] + + 
per_channel = [ + ( + ch["lf"] + .join(item, left_on=ch["item_sk_col"], right_on="i_item_sk") + .join(color_item_ids_lf, on="i_item_id") + .join(date_dim, left_on=ch["sold_date_col"], right_on="d_date_sk") + .join(customer_address, left_on=ch["addr_sk_col"], right_on="ca_address_sk") + .filter( + (pl.col("d_year") == 1998) + & (pl.col("d_moy") == 3) + & (pl.col("ca_gmt_offset") == -6) + ) + .group_by("i_item_id") + .agg( + [ + pl.col(ch["ext_col"]).count().alias("count_sales"), + pl.col(ch["ext_col"]).sum().alias("sum_sales"), + ] + ) + .with_columns( + pl.when(pl.col("count_sales") == 0) + .then(None) + .otherwise(pl.col("sum_sales")) + .alias("total_sales") + ) + .select(["i_item_id", "total_sales"]) + ) + for ch in channels + ] + + return ( + pl.concat(per_channel) + .group_by("i_item_id") + .agg( + [ + pl.col("total_sales").count().alias("count_total"), + pl.col("total_sales").sum().alias("sum_total"), + ] + ) + .with_columns( + pl.when(pl.col("count_total") == 0) + .then(None) + .otherwise(pl.col("sum_total")) + .alias("total_sales") + ) + .select(["i_item_id", "total_sales"]) + .sort(["total_sales"], nulls_last=True, descending=[False]) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q57.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q57.py new file mode 100644 index 00000000000..eb852f2fb67 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q57.py @@ -0,0 +1,194 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 57.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 57.""" + return """ + WITH v1 + AS (SELECT i_category, + i_brand, + cc_name, + d_year, + d_moy, + Sum(cs_sales_price) sum_sales + , + Avg(Sum(cs_sales_price)) + OVER ( + partition BY i_category, i_brand, cc_name, d_year) + avg_monthly_sales + , + Rank() + OVER ( + partition BY i_category, i_brand, cc_name + ORDER BY d_year, d_moy) rn + FROM item, + catalog_sales, + date_dim, + call_center + WHERE cs_item_sk = i_item_sk + AND cs_sold_date_sk = d_date_sk + AND cc_call_center_sk = cs_call_center_sk + AND ( d_year = 2000 + OR ( d_year = 2000 - 1 + AND d_moy = 12 ) + OR ( d_year = 2000 + 1 + AND d_moy = 1 ) ) + GROUP BY i_category, + i_brand, + cc_name, + d_year, + d_moy), + v2 + AS (SELECT v1.i_brand, + v1.d_year, + v1.avg_monthly_sales, + v1.sum_sales, + v1_lag.sum_sales psum, + v1_lead.sum_sales nsum + FROM v1, + v1 v1_lag, + v1 v1_lead + WHERE v1.i_category = v1_lag.i_category + AND v1.i_category = v1_lead.i_category + AND v1.i_brand = v1_lag.i_brand + AND v1.i_brand = v1_lead.i_brand + AND v1. cc_name = v1_lag. cc_name + AND v1. cc_name = v1_lead. 
cc_name + AND v1.rn = v1_lag.rn + 1 + AND v1.rn = v1_lead.rn - 1) + SELECT * + FROM v2 + WHERE d_year = 2000 + AND avg_monthly_sales > 0 + AND CASE + WHEN avg_monthly_sales > 0 THEN Abs(sum_sales - avg_monthly_sales) + / + avg_monthly_sales + ELSE NULL + END > 0.1 + ORDER BY sum_sales - avg_monthly_sales, + 3 + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 57.""" + item = get_data(run_config.dataset_path, "item", run_config.suffix) + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + call_center = get_data(run_config.dataset_path, "call_center", run_config.suffix) + + v1 = ( + catalog_sales.join(item, left_on="cs_item_sk", right_on="i_item_sk") + .join(date_dim, left_on="cs_sold_date_sk", right_on="d_date_sk") + .join(call_center, left_on="cs_call_center_sk", right_on="cc_call_center_sk") + .filter( + (pl.col("d_year") == 2000) + | ((pl.col("d_year") == 1999) & (pl.col("d_moy") == 12)) + | ((pl.col("d_year") == 2001) & (pl.col("d_moy") == 1)) + ) + .group_by(["i_category", "i_brand", "cc_name", "d_year", "d_moy"]) + .agg( + [ + pl.col("cs_sales_price").count().alias("count_sales"), + pl.col("cs_sales_price").sum().alias("sum_sales_raw"), + ] + ) + .with_columns( + pl.when(pl.col("count_sales") == 0) + .then(None) + .otherwise(pl.col("sum_sales_raw")) + .alias("sum_sales") + ) + .drop("count_sales", "sum_sales_raw") + .with_columns( + [ + pl.col("sum_sales") + .mean() + .over(["i_category", "i_brand", "cc_name", "d_year"]) + .alias("avg_monthly_sales"), + pl.col("d_year") + .rank(method="ordinal") + .over( + ["i_category", "i_brand", "cc_name"], order_by=["d_year", "d_moy"] + ) + .alias("rn"), + ] + ) + ) + + v2 = ( + v1.join( + v1.select( + [ + pl.col("i_category").alias("i_category_lag"), + pl.col("i_brand").alias("i_brand_lag"), + pl.col("cc_name").alias("cc_name_lag"), + pl.col("rn").alias("rn_lag"), 
+ pl.col("sum_sales").alias("psum"), + ] + ), + left_on=["i_category", "i_brand", "cc_name"], + right_on=["i_category_lag", "i_brand_lag", "cc_name_lag"], + how="inner", + ) + .filter(pl.col("rn") == pl.col("rn_lag") + 1) + .join( + v1.select( + [ + pl.col("i_category").alias("i_category_lead"), + pl.col("i_brand").alias("i_brand_lead"), + pl.col("cc_name").alias("cc_name_lead"), + pl.col("rn").alias("rn_lead"), + pl.col("sum_sales").alias("nsum"), + ] + ), + left_on=["i_category", "i_brand", "cc_name"], + right_on=["i_category_lead", "i_brand_lead", "cc_name_lead"], + how="inner", + ) + .filter(pl.col("rn") == pl.col("rn_lead") - 1) + .select(["i_brand", "d_year", "avg_monthly_sales", "sum_sales", "psum", "nsum"]) + ) + + return ( + v2.filter( + (pl.col("d_year") == 2000) + & (pl.col("avg_monthly_sales") > 0) + & ( + pl.when(pl.col("avg_monthly_sales") > 0) + .then( + (pl.col("sum_sales") - pl.col("avg_monthly_sales")).abs() + / pl.col("avg_monthly_sales") + ) + .otherwise(None) + > 0.1 + ) + ) + .sort( + by=[ + pl.col("sum_sales") - pl.col("avg_monthly_sales"), + pl.col("avg_monthly_sales"), + ], + descending=[False, False], + nulls_last=True, + ) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q58.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q58.py new file mode 100644 index 00000000000..5577be2f60d --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q58.py @@ -0,0 +1,231 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 58.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 58.""" + return """ + WITH ss_items + AS (SELECT i_item_id item_id, + Sum(ss_ext_sales_price) ss_item_rev + FROM store_sales, + item, + date_dim + WHERE ss_item_sk = i_item_sk + AND d_date IN (SELECT d_date + FROM date_dim + WHERE d_week_seq = (SELECT d_week_seq + FROM date_dim + WHERE d_date = '2002-02-25' + )) + AND ss_sold_date_sk = d_date_sk + GROUP BY i_item_id), + cs_items + AS (SELECT i_item_id item_id, + Sum(cs_ext_sales_price) cs_item_rev + FROM catalog_sales, + item, + date_dim + WHERE cs_item_sk = i_item_sk + AND d_date IN (SELECT d_date + FROM date_dim + WHERE d_week_seq = (SELECT d_week_seq + FROM date_dim + WHERE d_date = '2002-02-25' + )) + AND cs_sold_date_sk = d_date_sk + GROUP BY i_item_id), + ws_items + AS (SELECT i_item_id item_id, + Sum(ws_ext_sales_price) ws_item_rev + FROM web_sales, + item, + date_dim + WHERE ws_item_sk = i_item_sk + AND d_date IN (SELECT d_date + FROM date_dim + WHERE d_week_seq = (SELECT d_week_seq + FROM date_dim + WHERE d_date = '2002-02-25' + )) + AND ws_sold_date_sk = d_date_sk + GROUP BY i_item_id) + SELECT ss_items.item_id, + ss_item_rev, + ss_item_rev / ( ss_item_rev + cs_item_rev + ws_item_rev ) / 3 * + 100 ss_dev, + cs_item_rev, + cs_item_rev / ( ss_item_rev + cs_item_rev + ws_item_rev ) / 3 * + 100 cs_dev, + ws_item_rev, + ws_item_rev / ( ss_item_rev + cs_item_rev + ws_item_rev ) / 3 * + 100 ws_dev, + ( ss_item_rev + cs_item_rev + ws_item_rev ) / 3 + average + FROM ss_items, + cs_items, + ws_items + WHERE ss_items.item_id = cs_items.item_id + AND ss_items.item_id = ws_items.item_id + AND ss_item_rev BETWEEN 0.9 * cs_item_rev AND 1.1 * cs_item_rev + 
AND ss_item_rev BETWEEN 0.9 * ws_item_rev AND 1.1 * ws_item_rev + AND cs_item_rev BETWEEN 0.9 * ss_item_rev AND 1.1 * ss_item_rev + AND cs_item_rev BETWEEN 0.9 * ws_item_rev AND 1.1 * ws_item_rev + AND ws_item_rev BETWEEN 0.9 * ss_item_rev AND 1.1 * ss_item_rev + AND ws_item_rev BETWEEN 0.9 * cs_item_rev AND 1.1 * cs_item_rev + ORDER BY ss_items.item_id, + ss_item_rev + LIMIT 100; + """ + + +def _build_sales_agg( + sales: pl.LazyFrame, + item_lf: pl.LazyFrame, + week_dates: pl.LazyFrame, + sales_item_sk: str, + sales_date_sk: str, + price_col: str, + alias: str, +) -> pl.LazyFrame: + return ( + sales.join(item_lf, left_on=sales_item_sk, right_on="i_item_sk") + .join(week_dates, left_on=sales_date_sk, right_on="d_date_sk") + .group_by("i_item_id") + .agg(pl.col(price_col).sum().alias(alias)) + .select(pl.col("i_item_id").alias("item_id"), pl.col(alias)) + ) + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 58.""" + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + + target_week = date_dim.filter(pl.col("d_date") == pl.date(2002, 2, 25)).select( + "d_week_seq" + ) + week_dates = date_dim.join(target_week, on="d_week_seq", how="inner").select( + ["d_date_sk"] + ) + + ss_items = _build_sales_agg( + store_sales, + item, + week_dates, + "ss_item_sk", + "ss_sold_date_sk", + "ss_ext_sales_price", + "ss_item_rev", + ) + cs_items = _build_sales_agg( + catalog_sales, + item, + week_dates, + "cs_item_sk", + "cs_sold_date_sk", + "cs_ext_sales_price", + "cs_item_rev", + ) + ws_items = _build_sales_agg( + web_sales, + item, + week_dates, + "ws_item_sk", + "ws_sold_date_sk", + "ws_ext_sales_price", + 
"ws_item_rev", + ) + + return ( + ss_items.join(cs_items, on="item_id", how="inner") + .join(ws_items, on="item_id", how="inner") + .filter( + ( + pl.col("ss_item_rev").is_between( + 0.9 * pl.col("cs_item_rev"), 1.1 * pl.col("cs_item_rev") + ) + ) + & ( + pl.col("ss_item_rev").is_between( + 0.9 * pl.col("ws_item_rev"), 1.1 * pl.col("ws_item_rev") + ) + ) + & ( + pl.col("cs_item_rev").is_between( + 0.9 * pl.col("ss_item_rev"), 1.1 * pl.col("ss_item_rev") + ) + ) + & ( + pl.col("cs_item_rev").is_between( + 0.9 * pl.col("ws_item_rev"), 1.1 * pl.col("ws_item_rev") + ) + ) + & ( + pl.col("ws_item_rev").is_between( + 0.9 * pl.col("ss_item_rev"), 1.1 * pl.col("ss_item_rev") + ) + ) + & ( + pl.col("ws_item_rev").is_between( + 0.9 * pl.col("cs_item_rev"), 1.1 * pl.col("cs_item_rev") + ) + ) + ) + .with_columns( + [ + ( + pl.col("ss_item_rev") + + pl.col("cs_item_rev") + + pl.col("ws_item_rev") + ).alias("total_rev"), + ( + ( + pl.col("ss_item_rev") + + pl.col("cs_item_rev") + + pl.col("ws_item_rev") + ) + / 3 + ).alias("average"), + ] + ) + .with_columns( + [ + (pl.col("ss_item_rev") / pl.col("total_rev") / 3 * 100).alias("ss_dev"), + (pl.col("cs_item_rev") / pl.col("total_rev") / 3 * 100).alias("cs_dev"), + (pl.col("ws_item_rev") / pl.col("total_rev") / 3 * 100).alias("ws_dev"), + ] + ) + .select( + [ + "item_id", + "ss_item_rev", + "ss_dev", + "cs_item_rev", + "cs_dev", + "ws_item_rev", + "ws_dev", + "average", + ] + ) + .sort(["item_id", "ss_item_rev"], nulls_last=True) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q59.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q59.py new file mode 100644 index 00000000000..04825d5d385 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q59.py @@ -0,0 +1,230 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
+# SPDX-License-Identifier: Apache-2.0 + +"""Query 59.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 59.""" + return """ + WITH wss + AS (SELECT d_week_seq, + ss_store_sk, + Sum(CASE + WHEN ( d_day_name = 'Sunday' ) THEN ss_sales_price + ELSE NULL + END) sun_sales, + Sum(CASE + WHEN ( d_day_name = 'Monday' ) THEN ss_sales_price + ELSE NULL + END) mon_sales, + Sum(CASE + WHEN ( d_day_name = 'Tuesday' ) THEN ss_sales_price + ELSE NULL + END) tue_sales, + Sum(CASE + WHEN ( d_day_name = 'Wednesday' ) THEN ss_sales_price + ELSE NULL + END) wed_sales, + Sum(CASE + WHEN ( d_day_name = 'Thursday' ) THEN ss_sales_price + ELSE NULL + END) thu_sales, + Sum(CASE + WHEN ( d_day_name = 'Friday' ) THEN ss_sales_price + ELSE NULL + END) fri_sales, + Sum(CASE + WHEN ( d_day_name = 'Saturday' ) THEN ss_sales_price + ELSE NULL + END) sat_sales + FROM store_sales, + date_dim + WHERE d_date_sk = ss_sold_date_sk + GROUP BY d_week_seq, + ss_store_sk) + SELECT s_store_name1, + s_store_id1, + d_week_seq1, + sun_sales1 / sun_sales2, + mon_sales1 / mon_sales2, + tue_sales1 / tue_sales2, + wed_sales1 / wed_sales2, + thu_sales1 / thu_sales2, + fri_sales1 / fri_sales2, + sat_sales1 / sat_sales2 + FROM (SELECT s_store_name s_store_name1, + wss.d_week_seq d_week_seq1, + s_store_id s_store_id1, + sun_sales sun_sales1, + mon_sales mon_sales1, + tue_sales tue_sales1, + wed_sales wed_sales1, + thu_sales thu_sales1, + fri_sales fri_sales1, + sat_sales sat_sales1 + FROM wss, + store, + date_dim d + WHERE d.d_week_seq = wss.d_week_seq + AND ss_store_sk = s_store_sk + AND d_month_seq BETWEEN 1196 AND 1196 + 11) y, + (SELECT s_store_name s_store_name2, + wss.d_week_seq d_week_seq2, + s_store_id s_store_id2, + sun_sales sun_sales2, + 
mon_sales mon_sales2, + tue_sales tue_sales2, + wed_sales wed_sales2, + thu_sales thu_sales2, + fri_sales fri_sales2, + sat_sales sat_sales2 + FROM wss, + store, + date_dim d + WHERE d.d_week_seq = wss.d_week_seq + AND ss_store_sk = s_store_sk + AND d_month_seq BETWEEN 1196 + 12 AND 1196 + 23) x + WHERE s_store_id1 = s_store_id2 + AND d_week_seq1 = d_week_seq2 - 52 + ORDER BY s_store_name1, + s_store_id1, + d_week_seq1 + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 59.""" + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + store = get_data(run_config.dataset_path, "store", run_config.suffix) + + base = store_sales.join( + date_dim.select(["d_date_sk", "d_week_seq", "d_day_name"]), + left_on="ss_sold_date_sk", + right_on="d_date_sk", + ).select(["d_week_seq", "ss_store_sk", "d_day_name", "ss_sales_price"]) + wss = base.group_by(["d_week_seq", "ss_store_sk"]).agg( + [ + pl.when(pl.col("d_day_name") == "Sunday") + .then(pl.col("ss_sales_price")) + .otherwise(None) + .sum() + .alias("sun_sales"), + pl.when(pl.col("d_day_name") == "Monday") + .then(pl.col("ss_sales_price")) + .otherwise(None) + .sum() + .alias("mon_sales"), + pl.when(pl.col("d_day_name") == "Tuesday") + .then(pl.col("ss_sales_price")) + .otherwise(None) + .sum() + .alias("tue_sales"), + pl.when(pl.col("d_day_name") == "Wednesday") + .then(pl.col("ss_sales_price")) + .otherwise(None) + .sum() + .alias("wed_sales"), + pl.when(pl.col("d_day_name") == "Thursday") + .then(pl.col("ss_sales_price")) + .otherwise(None) + .sum() + .alias("thu_sales"), + pl.when(pl.col("d_day_name") == "Friday") + .then(pl.col("ss_sales_price")) + .otherwise(None) + .sum() + .alias("fri_sales"), + pl.when(pl.col("d_day_name") == "Saturday") + .then(pl.col("ss_sales_price")) + .otherwise(None) + .sum() + .alias("sat_sales"), + ] + ) + wss_enriched = wss.join( + 
store.select(["s_store_sk", "s_store_id", "s_store_name"]), + left_on="ss_store_sk", + right_on="s_store_sk", + ).join(date_dim.select(["d_week_seq", "d_month_seq"]), on="d_week_seq") + y = wss_enriched.filter(pl.col("d_month_seq").is_between(1196, 1196 + 11)).select( + [ + pl.col("s_store_name").alias("s_store_name1"), + pl.col("s_store_id").alias("s_store_id1"), + pl.col("d_week_seq").alias("d_week_seq1"), + pl.col("sun_sales").alias("sun_sales1"), + pl.col("mon_sales").alias("mon_sales1"), + pl.col("tue_sales").alias("tue_sales1"), + pl.col("wed_sales").alias("wed_sales1"), + pl.col("thu_sales").alias("thu_sales1"), + pl.col("fri_sales").alias("fri_sales1"), + pl.col("sat_sales").alias("sat_sales1"), + ] + ) + x = wss_enriched.filter( + pl.col("d_month_seq").is_between(1196 + 12, 1196 + 23) + ).select( + [ + pl.col("s_store_id").alias("s_store_id2"), + (pl.col("d_week_seq") - 52).alias("d_week_seq_join"), + pl.col("sun_sales").alias("sun_sales2"), + pl.col("mon_sales").alias("mon_sales2"), + pl.col("tue_sales").alias("tue_sales2"), + pl.col("wed_sales").alias("wed_sales2"), + pl.col("thu_sales").alias("thu_sales2"), + pl.col("fri_sales").alias("fri_sales2"), + pl.col("sat_sales").alias("sat_sales2"), + ] + ) + joined = y.join( + x, + left_on=["s_store_id1", "d_week_seq1"], + right_on=["s_store_id2", "d_week_seq_join"], + how="inner", + ) + projected = joined.select( + [ + "s_store_name1", + "s_store_id1", + "d_week_seq1", + (pl.col("sun_sales1") / pl.col("sun_sales2")).alias( + "(sun_sales1 / sun_sales2)" + ), + (pl.col("mon_sales1") / pl.col("mon_sales2")).alias( + "(mon_sales1 / mon_sales2)" + ), + (pl.col("tue_sales1") / pl.col("tue_sales2")).alias( + "(tue_sales1 / tue_sales2)" + ), + (pl.col("wed_sales1") / pl.col("wed_sales2")).alias( + "(wed_sales1 / wed_sales2)" + ), + (pl.col("thu_sales1") / pl.col("thu_sales2")).alias( + "(thu_sales1 / thu_sales2)" + ), + (pl.col("fri_sales1") / pl.col("fri_sales2")).alias( + "(fri_sales1 / fri_sales2)" + ), + 
(pl.col("sat_sales1") / pl.col("sat_sales2")).alias( + "(sat_sales1 / sat_sales2)" + ), + ] + ) + return projected.sort( + ["s_store_name1", "s_store_id1", "d_week_seq1"], + descending=[False, False, False], + nulls_last=True, + ).limit(100) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q60.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q60.py new file mode 100644 index 00000000000..9aeb99eca99 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q60.py @@ -0,0 +1,156 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 60.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 60.""" + return """ + WITH ss + AS (SELECT i_item_id, + Sum(ss_ext_sales_price) total_sales + FROM store_sales, + date_dim, + customer_address, + item + WHERE i_item_id IN (SELECT i_item_id + FROM item + WHERE i_category IN ( 'Jewelry' )) + AND ss_item_sk = i_item_sk + AND ss_sold_date_sk = d_date_sk + AND d_year = 1999 + AND d_moy = 8 + AND ss_addr_sk = ca_address_sk + AND ca_gmt_offset = -6 + GROUP BY i_item_id), + cs + AS (SELECT i_item_id, + Sum(cs_ext_sales_price) total_sales + FROM catalog_sales, + date_dim, + customer_address, + item + WHERE i_item_id IN (SELECT i_item_id + FROM item + WHERE i_category IN ( 'Jewelry' )) + AND cs_item_sk = i_item_sk + AND cs_sold_date_sk = d_date_sk + AND d_year = 1999 + AND d_moy = 8 + AND cs_bill_addr_sk = ca_address_sk + AND ca_gmt_offset = -6 + GROUP BY i_item_id), + ws + AS (SELECT i_item_id, + Sum(ws_ext_sales_price) total_sales + FROM web_sales, + date_dim, + customer_address, + item + WHERE i_item_id IN (SELECT 
i_item_id + FROM item + WHERE i_category IN ( 'Jewelry' )) + AND ws_item_sk = i_item_sk + AND ws_sold_date_sk = d_date_sk + AND d_year = 1999 + AND d_moy = 8 + AND ws_bill_addr_sk = ca_address_sk + AND ca_gmt_offset = -6 + GROUP BY i_item_id) + SELECT i_item_id, + Sum(total_sales) total_sales + FROM (SELECT * + FROM ss + UNION ALL + SELECT * + FROM cs + UNION ALL + SELECT * + FROM ws) tmp1 + GROUP BY i_item_id + ORDER BY i_item_id, + total_sales + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 60.""" + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + catalog_sales = get_data( + run_config.dataset_path, "catalog_sales", run_config.suffix + ) + web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + customer_address = get_data( + run_config.dataset_path, "customer_address", run_config.suffix + ) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + + jewelry_item_ids_lf = ( + item.filter(pl.col("i_category") == "Jewelry").select(["i_item_id"]).unique() + ) + + channels = [ + ( + "ss", + store_sales, + "ss_item_sk", + "ss_sold_date_sk", + "ss_addr_sk", + "ss_ext_sales_price", + ), + ( + "cs", + catalog_sales, + "cs_item_sk", + "cs_sold_date_sk", + "cs_bill_addr_sk", + "cs_ext_sales_price", + ), + ( + "ws", + web_sales, + "ws_item_sk", + "ws_sold_date_sk", + "ws_bill_addr_sk", + "ws_ext_sales_price", + ), + ] + + parts: list[pl.LazyFrame] = [] + for _, sales_lf, item_fk, date_fk, addr_fk, price_col in channels: + parts.append( + sales_lf.join(item, left_on=item_fk, right_on="i_item_sk") + .join(jewelry_item_ids_lf, on="i_item_id") + .join(date_dim, left_on=date_fk, right_on="d_date_sk") + .join(customer_address, left_on=addr_fk, right_on="ca_address_sk") + .filter( + (pl.col("d_year") == 1999) + & (pl.col("d_moy") == 8) + & (pl.col("ca_gmt_offset") == -6) + ) + 
.group_by("i_item_id") + .agg([pl.col(price_col).sum().alias("total_sales")]) + .select(["i_item_id", "total_sales"]) + ) + + return ( + pl.concat(parts) + .group_by("i_item_id") + .agg([pl.col("total_sales").sum().alias("total_sales")]) + .sort(["i_item_id", "total_sales"], descending=[False, False], nulls_last=True) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q61.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q61.py new file mode 100644 index 00000000000..0f9a2074353 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q61.py @@ -0,0 +1,154 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. +# SPDX-License-Identifier: Apache-2.0 + +"""Query 61.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +import polars as pl + +from cudf_polars.experimental.benchmarks.utils import get_data + +if TYPE_CHECKING: + from cudf_polars.experimental.benchmarks.utils import RunConfig + + +def duckdb_impl(run_config: RunConfig) -> str: + """Query 61.""" + return """ + SELECT promotions, + total, + Cast(promotions AS DECIMAL(15, 4)) / + Cast(total AS DECIMAL(15, 4)) * 100 + FROM (SELECT Sum(ss_ext_sales_price) promotions + FROM store_sales, + store, + promotion, + date_dim, + customer, + customer_address, + item + WHERE ss_sold_date_sk = d_date_sk + AND ss_store_sk = s_store_sk + AND ss_promo_sk = p_promo_sk + AND ss_customer_sk = c_customer_sk + AND ca_address_sk = c_current_addr_sk + AND ss_item_sk = i_item_sk + AND ca_gmt_offset = -7 + AND i_category = 'Books' + AND ( p_channel_dmail = 'Y' + OR p_channel_email = 'Y' + OR p_channel_tv = 'Y' ) + AND s_gmt_offset = -7 + AND d_year = 2001 + AND d_moy = 12) promotional_sales, + (SELECT Sum(ss_ext_sales_price) total + FROM store_sales, + store, + date_dim, + customer, + customer_address, + item + WHERE ss_sold_date_sk = d_date_sk + AND ss_store_sk = s_store_sk 
+ AND ss_customer_sk = c_customer_sk + AND ca_address_sk = c_current_addr_sk + AND ss_item_sk = i_item_sk + AND ca_gmt_offset = -7 + AND i_category = 'Books' + AND s_gmt_offset = -7 + AND d_year = 2001 + AND d_moy = 12) all_sales + ORDER BY promotions, + total + LIMIT 100; + """ + + +def polars_impl(run_config: RunConfig) -> pl.LazyFrame: + """Query 61.""" + # Load tables + store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix) + store = get_data(run_config.dataset_path, "store", run_config.suffix) + promotion = get_data(run_config.dataset_path, "promotion", run_config.suffix) + date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix) + customer = get_data(run_config.dataset_path, "customer", run_config.suffix) + customer_address = get_data( + run_config.dataset_path, "customer_address", run_config.suffix + ) + item = get_data(run_config.dataset_path, "item", run_config.suffix) + + # Promotional sales (with promotion filters) + promotional_sales = ( + store_sales.join(store, left_on="ss_store_sk", right_on="s_store_sk") + .join(promotion, left_on="ss_promo_sk", right_on="p_promo_sk") + .join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(customer, left_on="ss_customer_sk", right_on="c_customer_sk") + .join(customer_address, left_on="c_current_addr_sk", right_on="ca_address_sk") + .join(item, left_on="ss_item_sk", right_on="i_item_sk") + .filter( + (pl.col("ca_gmt_offset") == -7) + & (pl.col("i_category") == "Books") + & ( + (pl.col("p_channel_dmail") == "Y") + | (pl.col("p_channel_email") == "Y") + | (pl.col("p_channel_tv") == "Y") + ) + & (pl.col("s_gmt_offset") == -7) + & (pl.col("d_year") == 2001) + & (pl.col("d_moy") == 12) + ) + .select( + [ + pl.when(pl.count() > 0) + .then(pl.col("ss_ext_sales_price").sum()) + .otherwise(None) + .alias("promotions") + ] + ) + ) + + # All sales (no promotion filters) + all_sales = ( + store_sales.join(store, left_on="ss_store_sk", right_on="s_store_sk") + 
.join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk") + .join(customer, left_on="ss_customer_sk", right_on="c_customer_sk") + .join(customer_address, left_on="c_current_addr_sk", right_on="ca_address_sk") + .join(item, left_on="ss_item_sk", right_on="i_item_sk") + .filter( + (pl.col("ca_gmt_offset") == -7) + & (pl.col("i_category") == "Books") + & (pl.col("s_gmt_offset") == -7) + & (pl.col("d_year") == 2001) + & (pl.col("d_moy") == 12) + ) + .select( + [ + pl.when(pl.count() > 0) + .then(pl.col("ss_ext_sales_price").sum()) + .otherwise(None) + .alias("total") + ] + ) + ) + return ( + promotional_sales.join(all_sales, how="cross") + .with_columns( + [ + pl.when(pl.col("total").is_null() | (pl.col("total") == 0)) + .then(None) + .otherwise( + pl.col("promotions").cast(pl.Float64) + / pl.col("total").cast(pl.Float64) + * 100.0 + ) + .alias( + "((CAST(promotions AS DECIMAL(15,4)) / CAST(total AS DECIMAL(15,4))) * 100)" + ) + ] + ) + .sort(["promotions", "total"], nulls_last=True) + .limit(100) + ) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q62.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q62.py new file mode 100644 index 00000000000..9513cbec203 --- /dev/null +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q62.py @@ -0,0 +1,148 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
# SPDX-License-Identifier: Apache-2.0

"""Query 62."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
    from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """Query 62."""
    return """
    SELECT Substr(w_warehouse_name, 1, 20),
           sm_type,
           web_name,
           Sum(CASE
                 WHEN ( ws_ship_date_sk - ws_sold_date_sk <= 30 ) THEN 1
                 ELSE 0
               END) AS '30 days',
           Sum(CASE
                 WHEN ( ws_ship_date_sk - ws_sold_date_sk > 30 )
                      AND ( ws_ship_date_sk - ws_sold_date_sk <= 60 ) THEN 1
                 ELSE 0
               END) AS '31-60 days',
           Sum(CASE
                 WHEN ( ws_ship_date_sk - ws_sold_date_sk > 60 )
                      AND ( ws_ship_date_sk - ws_sold_date_sk <= 90 ) THEN 1
                 ELSE 0
               END) AS '61-90 days',
           Sum(CASE
                 WHEN ( ws_ship_date_sk - ws_sold_date_sk > 90 )
                      AND ( ws_ship_date_sk - ws_sold_date_sk <= 120 ) THEN
                 1
                 ELSE 0
               END) AS '91-120 days',
           Sum(CASE
                 WHEN ( ws_ship_date_sk - ws_sold_date_sk > 120 ) THEN 1
                 ELSE 0
               END) AS '>120 days'
    FROM web_sales,
         warehouse,
         ship_mode,
         web_site,
         date_dim
    WHERE d_month_seq BETWEEN 1222 AND 1222 + 11
      AND ws_ship_date_sk = d_date_sk
      AND ws_warehouse_sk = w_warehouse_sk
      AND ws_ship_mode_sk = sm_ship_mode_sk
      AND ws_web_site_sk = web_site_sk
    GROUP BY Substr(w_warehouse_name, 1, 20),
             sm_type,
             web_name
    ORDER BY Substr(w_warehouse_name, 1, 20),
             sm_type,
             web_name
    LIMIT 100;
    """


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 62."""
    web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix)
    warehouse = get_data(run_config.dataset_path, "warehouse", run_config.suffix)
    ship_mode = get_data(run_config.dataset_path, "ship_mode", run_config.suffix)
    web_site = get_data(run_config.dataset_path, "web_site", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)

    # Days between order and shipment, computed per row inside the groups.
    delay = pl.col("ws_ship_date_sk") - pl.col("ws_sold_date_sk")

    # Report buckets as (output column, exclusive lower bound, inclusive
    # upper bound); None marks an open end.
    bucket_bounds = [
        ("30 days", None, 30),
        ("31-60 days", 30, 60),
        ("61-90 days", 60, 90),
        ("91-120 days", 90, 120),
        (">120 days", 120, None),
    ]

    def bucket_count(name: str, lower: int | None, upper: int | None) -> pl.Expr:
        # SUM(CASE WHEN lower < delay <= upper THEN 1 ELSE 0 END) AS name
        if lower is None:
            cond = delay <= upper
        elif upper is None:
            cond = delay > lower
        else:
            cond = (delay > lower) & (delay <= upper)
        return pl.when(cond).then(1).otherwise(0).sum().alias(name)

    shipments = (
        web_sales.join(date_dim, left_on="ws_ship_date_sk", right_on="d_date_sk")
        .join(warehouse, left_on="ws_warehouse_sk", right_on="w_warehouse_sk")
        .join(ship_mode, left_on="ws_ship_mode_sk", right_on="sm_ship_mode_sk")
        .join(web_site, left_on="ws_web_site_sk", right_on="web_site_sk")
        .filter(pl.col("d_month_seq").is_between(1222, 1222 + 11))
        .with_columns(
            # Group key mirroring SQL's Substr(w_warehouse_name, 1, 20).
            pl.col("w_warehouse_name").str.slice(0, 20).alias("warehouse_substr")
        )
    )

    return (
        shipments.group_by(["warehouse_substr", "sm_type", "web_name"])
        .agg([bucket_count(name, lo, hi) for name, lo, hi in bucket_bounds])
        .select(
            [
                pl.col("warehouse_substr").alias("substr(w_warehouse_name, 1, 20)"),
                "sm_type",
                "web_name",
                "30 days",
                "31-60 days",
                "61-90 days",
                "91-120 days",
                ">120 days",
            ]
        )
        .sort(
            ["substr(w_warehouse_name, 1, 20)", "sm_type", "web_name"],
            nulls_last=True,
            descending=[False, False, False],
        )
        .limit(100)
    )