Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# SPDX-License-Identifier: Apache-2.0


"""Query 24."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """Query 24.

    Return the raw DuckDB SQL text for TPC-DS Query 24.

    The ``ssales`` CTE totals ``ss_net_paid`` per customer/store/item
    attribute combination, restricted to store market 8, matching
    store/customer zip codes, and customers whose birth country differs
    from the upper-cased address country.  The outer query then reports
    customers whose total spend on 'peach'-colored items exceeds 5% of
    the average ``netpaid`` across all ``ssales`` rows.

    Parameters
    ----------
    run_config
        Accepted for signature parity with ``polars_impl``; the SQL text
        does not depend on it.

    Returns
    -------
    str
        The SQL query string.
    """
    return """
WITH ssales AS
(SELECT c_last_name,
c_first_name,
s_store_name,
ca_state,
s_state,
i_color,
i_current_price,
i_manager_id,
i_units,
i_size,
sum(ss_net_paid) netpaid
FROM store_sales,
store_returns,
store,
item,
customer,
customer_address
WHERE ss_ticket_number = sr_ticket_number
AND ss_item_sk = sr_item_sk
AND ss_customer_sk = c_customer_sk
AND ss_item_sk = i_item_sk
AND ss_store_sk = s_store_sk
AND c_current_addr_sk = ca_address_sk
AND c_birth_country <> upper(ca_country)
AND s_zip = ca_zip
AND s_market_id=8
GROUP BY c_last_name,
c_first_name,
s_store_name,
ca_state,
s_state,
i_color,
i_current_price,
i_manager_id,
i_units,
i_size)
SELECT c_last_name,
c_first_name,
s_store_name,
sum(netpaid) paid
FROM ssales
WHERE i_color = 'peach'
GROUP BY c_last_name,
c_first_name,
s_store_name
HAVING sum(netpaid) >
(SELECT 0.05*avg(netpaid)
FROM ssales)
ORDER BY c_last_name,
c_first_name,
s_store_name ;
"""


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 24.

    Polars translation of the DuckDB SQL above: build the ``ssales``
    aggregate, then keep customers whose 'peach'-item spend exceeds 5%
    of the mean ``netpaid`` over all ``ssales`` rows.
    """
    path = run_config.dataset_path
    sfx = run_config.suffix
    store_sales = get_data(path, "store_sales", sfx)
    store_returns = get_data(path, "store_returns", sfx)
    store = get_data(path, "store", sfx)
    item = get_data(path, "item", sfx)
    customer = get_data(path, "customer", sfx)
    customer_address = get_data(path, "customer_address", sfx)

    # Non-join predicates from the SQL WHERE clause.
    row_filter = (
        (pl.col("c_birth_country") != pl.col("ca_country").str.to_uppercase())
        & (pl.col("s_zip") == pl.col("ca_zip"))
        & (pl.col("s_market_id") == 8)
    )
    # GROUP BY keys of the ssales CTE.
    group_keys = [
        "c_last_name",
        "c_first_name",
        "s_store_name",
        "ca_state",
        "s_state",
        "i_color",
        "i_current_price",
        "i_manager_id",
        "i_units",
        "i_size",
    ]

    joined = store_sales.join(
        store_returns,
        left_on=["ss_ticket_number", "ss_item_sk"],
        right_on=["sr_ticket_number", "sr_item_sk"],
    )
    joined = joined.join(store, left_on="ss_store_sk", right_on="s_store_sk")
    joined = joined.join(item, left_on="ss_item_sk", right_on="i_item_sk")
    joined = joined.join(customer, left_on="ss_customer_sk", right_on="c_customer_sk")
    joined = joined.join(
        customer_address, left_on="c_current_addr_sk", right_on="ca_address_sk"
    )

    # The ssales CTE: per-group total of ss_net_paid.
    ssales = (
        joined.filter(row_filter)
        .group_by(group_keys)
        .agg(pl.col("ss_net_paid").sum().alias("netpaid"))
    )

    # Scalar threshold: 5% of the mean netpaid over every ssales row.
    threshold_table = ssales.select(
        (pl.col("netpaid").mean() * 0.05).alias("threshold")
    )

    peach_spend = (
        ssales.filter(pl.col("i_color") == "peach")
        .group_by(["c_last_name", "c_first_name", "s_store_name"])
        .agg(pl.col("netpaid").sum().alias("paid"))
    )
    return (
        peach_spend.join(threshold_table, how="cross")
        .filter(pl.col("paid") > pl.col("threshold"))
        .select(["c_last_name", "c_first_name", "s_store_name", "paid"])
        .sort(["c_last_name", "c_first_name", "s_store_name"], nulls_last=True)
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 25."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """Query 25.

    Return the raw DuckDB SQL text for TPC-DS Query 25.

    Follows a store sale made in April 2001 through a store return
    (April-October 2001, same customer/item/ticket) and a subsequent
    catalog purchase (April-October 2001, same customer/item), then
    reports per item/store the maximum store-sales profit, store-returns
    loss, and catalog-sales profit, limited to the first 100 rows.

    Parameters
    ----------
    run_config
        Accepted for signature parity with ``polars_impl``; the SQL text
        does not depend on it.

    Returns
    -------
    str
        The SQL query string.
    """
    return """
SELECT i_item_id,
i_item_desc,
s_store_id,
s_store_name,
Max(ss_net_profit) AS store_sales_profit,
Max(sr_net_loss) AS store_returns_loss,
Max(cs_net_profit) AS catalog_sales_profit
FROM store_sales,
store_returns,
catalog_sales,
date_dim d1,
date_dim d2,
date_dim d3,
store,
item
WHERE d1.d_moy = 4
AND d1.d_year = 2001
AND d1.d_date_sk = ss_sold_date_sk
AND i_item_sk = ss_item_sk
AND s_store_sk = ss_store_sk
AND ss_customer_sk = sr_customer_sk
AND ss_item_sk = sr_item_sk
AND ss_ticket_number = sr_ticket_number
AND sr_returned_date_sk = d2.d_date_sk
AND d2.d_moy BETWEEN 4 AND 10
AND d2.d_year = 2001
AND sr_customer_sk = cs_bill_customer_sk
AND sr_item_sk = cs_item_sk
AND cs_sold_date_sk = d3.d_date_sk
AND d3.d_moy BETWEEN 4 AND 10
AND d3.d_year = 2001
GROUP BY i_item_id,
i_item_desc,
s_store_id,
s_store_name
ORDER BY i_item_id,
i_item_desc,
s_store_id,
s_store_name
LIMIT 100;
"""


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 25.

    Polars translation of the DuckDB SQL above: chain a store sale, its
    return, and a later catalog purchase by the same customer for the
    same item, filter the three date roles to April / April-October 2001,
    and report per item/store the max profit/loss figures (top 100).

    Fix over the original: drop the redundant ``date_dim.clone()`` —
    ``LazyFrame.select`` already produces an independent query plan, so
    the clone was pure overhead.
    """
    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
    store_returns = get_data(
        run_config.dataset_path, "store_returns", run_config.suffix
    )
    catalog_sales = get_data(
        run_config.dataset_path, "catalog_sales", run_config.suffix
    )
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
    store = get_data(run_config.dataset_path, "store", run_config.suffix)
    item = get_data(run_config.dataset_path, "item", run_config.suffix)

    def _dates(prefix: str) -> pl.LazyFrame:
        # Prefixed view of date_dim so the three date roles (sale date,
        # return date, catalog-sale date) can coexist after joining.
        return date_dim.select(
            [
                pl.col("d_date_sk").alias(f"{prefix}_date_sk"),
                pl.col("d_moy").alias(f"{prefix}_moy"),
                pl.col("d_year").alias(f"{prefix}_year"),
            ]
        )

    d1, d2, d3 = (_dates(p) for p in ("d1", "d2", "d3"))

    return (
        store_sales.join(d1, left_on="ss_sold_date_sk", right_on="d1_date_sk")
        .join(item, left_on="ss_item_sk", right_on="i_item_sk")
        .join(store, left_on="ss_store_sk", right_on="s_store_sk")
        # Same customer, item, and ticket links the sale to its return.
        .join(
            store_returns,
            left_on=["ss_customer_sk", "ss_item_sk", "ss_ticket_number"],
            right_on=["sr_customer_sk", "sr_item_sk", "sr_ticket_number"],
        )
        .join(d2, left_on="sr_returned_date_sk", right_on="d2_date_sk")
        # Same customer and item links the return to a catalog purchase.
        .join(
            catalog_sales,
            left_on=["ss_customer_sk", "ss_item_sk"],
            right_on=["cs_bill_customer_sk", "cs_item_sk"],
        )
        .join(d3, left_on="cs_sold_date_sk", right_on="d3_date_sk")
        .filter(
            (pl.col("d1_moy") == 4)
            & (pl.col("d1_year") == 2001)
            & (pl.col("d2_moy").is_between(4, 10))
            & (pl.col("d2_year") == 2001)
            & (pl.col("d3_moy").is_between(4, 10))
            & (pl.col("d3_year") == 2001)
        )
        .group_by(["i_item_id", "i_item_desc", "s_store_id", "s_store_name"])
        .agg(
            [
                pl.col("ss_net_profit").max().alias("store_sales_profit"),
                pl.col("sr_net_loss").max().alias("store_returns_loss"),
                pl.col("cs_net_profit").max().alias("catalog_sales_profit"),
            ]
        )
        .sort(["i_item_id", "i_item_desc", "s_store_id", "s_store_name"])
        .limit(100)
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 26."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """Query 26.

    Return the raw DuckDB SQL text for TPC-DS Query 26.

    Averages catalog-sales quantity, list price, coupon amount, and sales
    price per item for year-2000 sales billed to female/widowed/secondary-
    education demographics, via promotions with email channel 'N' or event
    channel 'N', limited to the first 100 items.

    Parameters
    ----------
    run_config
        Accepted for signature parity with ``polars_impl``; the SQL text
        does not depend on it.

    Returns
    -------
    str
        The SQL query string.
    """
    return """
SELECT i_item_id,
Avg(cs_quantity) agg1,
Avg(cs_list_price) agg2,
Avg(cs_coupon_amt) agg3,
Avg(cs_sales_price) agg4
FROM catalog_sales,
customer_demographics,
date_dim,
item,
promotion
WHERE cs_sold_date_sk = d_date_sk
AND cs_item_sk = i_item_sk
AND cs_bill_cdemo_sk = cd_demo_sk
AND cs_promo_sk = p_promo_sk
AND cd_gender = 'F'
AND cd_marital_status = 'W'
AND cd_education_status = 'Secondary'
AND ( p_channel_email = 'N'
OR p_channel_event = 'N' )
AND d_year = 2000
GROUP BY i_item_id
ORDER BY i_item_id
LIMIT 100;
"""


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """Query 26.

    Polars translation of the DuckDB SQL above: per-item averages of
    catalog-sales quantity, list price, coupon amount, and sales price
    for the filtered demographic/promotion/year slice (top 100 items).
    """
    path = run_config.dataset_path
    sfx = run_config.suffix
    catalog_sales = get_data(path, "catalog_sales", sfx)
    customer_demographics = get_data(path, "customer_demographics", sfx)
    date_dim = get_data(path, "date_dim", sfx)
    item = get_data(path, "item", sfx)
    promotion = get_data(path, "promotion", sfx)

    # Non-join predicates from the SQL WHERE clause.
    demographic_filter = (
        (pl.col("cd_gender") == "F")
        & (pl.col("cd_marital_status") == "W")
        & (pl.col("cd_education_status") == "Secondary")
    )
    promo_filter = (pl.col("p_channel_email") == "N") | (
        pl.col("p_channel_event") == "N"
    )

    joined = (
        catalog_sales.join(date_dim, left_on="cs_sold_date_sk", right_on="d_date_sk")
        .join(item, left_on="cs_item_sk", right_on="i_item_sk")
        .join(
            customer_demographics, left_on="cs_bill_cdemo_sk", right_on="cd_demo_sk"
        )
        .join(promotion, left_on="cs_promo_sk", right_on="p_promo_sk")
    )

    # agg1..agg4 are the SQL's positional aliases for the four averages.
    measures = ["cs_quantity", "cs_list_price", "cs_coupon_amt", "cs_sales_price"]
    aggregates = [
        pl.col(name).mean().alias(f"agg{pos}")
        for pos, name in enumerate(measures, start=1)
    ]

    return (
        joined.filter(
            demographic_filter & promo_filter & (pl.col("d_year") == 2000)
        )
        .group_by("i_item_id")
        .agg(aggregates)
        .sort("i_item_id")
        .limit(100)
    )
Loading
Loading