Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 53."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """
    Return the DuckDB SQL text for Query 53.

    ``run_config`` is part of the shared signature but is not read here;
    the statement is a fixed literal.
    """
    # The SQL text is kept flush-left so the returned string is unchanged.
    sql = """
SELECT *
FROM (SELECT i_manufact_id,
Sum(ss_sales_price) sum_sales,
Avg(Sum(ss_sales_price))
OVER (
partition BY i_manufact_id) avg_quarterly_sales
FROM item,
store_sales,
date_dim,
store
WHERE ss_item_sk = i_item_sk
AND ss_sold_date_sk = d_date_sk
AND ss_store_sk = s_store_sk
AND d_month_seq IN ( 1199, 1199 + 1, 1199 + 2, 1199 + 3,
1199 + 4, 1199 + 5, 1199 + 6, 1199 + 7,
1199 + 8, 1199 + 9, 1199 + 10, 1199 + 11 )
AND ( ( i_category IN ( 'Books', 'Children', 'Electronics' )
AND i_class IN ( 'personal', 'portable', 'reference',
'self-help' )
AND i_brand IN ( 'scholaramalgamalg #14',
'scholaramalgamalg #7'
,
'exportiunivamalg #9',
'scholaramalgamalg #9' )
)
OR ( i_category IN ( 'Women', 'Music', 'Men' )
AND i_class IN ( 'accessories', 'classical',
'fragrances',
'pants' )
AND i_brand IN ( 'amalgimporto #1',
'edu packscholar #1',
'exportiimporto #1',
'importoamalg #1' ) ) )
GROUP BY i_manufact_id,
d_qoy) tmp1
WHERE CASE
WHEN avg_quarterly_sales > 0 THEN Abs (sum_sales - avg_quarterly_sales)
/
avg_quarterly_sales
ELSE NULL
END > 0.1
ORDER BY avg_quarterly_sales,
sum_sales,
i_manufact_id
LIMIT 100;
"""
    return sql


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """
    Build the Polars lazy plan for Query 53.

    Store sales are joined to item, date_dim and store, restricted to the
    twelve ``d_month_seq`` values starting at 1199 and to two item
    category/class/brand combinations, then summed per
    ``(i_manufact_id, d_qoy)``. Each sum is compared with the mean of the
    sums for the same manufacturer; rows deviating by more than 10% are
    kept, ordered, and limited to 100.

    Parameters
    ----------
    run_config
        Supplies ``dataset_path`` and ``suffix``, which are passed to
        ``get_data`` to load each table.

    Returns
    -------
    pl.LazyFrame
        Columns ``i_manufact_id``, ``sum_sales`` and ``avg_quarterly_sales``.
    """
    # Load tables
    item = get_data(run_config.dataset_path, "item", run_config.suffix)
    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
    store = get_data(run_config.dataset_path, "store", run_config.suffix)

    month_seq_list = list(range(1199, 1199 + 12))

    # Books/Children/Electronics categories
    first_item_group = (
        (pl.col("i_category").is_in(["Books", "Children", "Electronics"]))
        & (pl.col("i_class").is_in(["personal", "portable", "reference", "self-help"]))
        & (
            pl.col("i_brand").is_in(
                [
                    "scholaramalgamalg #14",
                    "scholaramalgamalg #7",
                    "exportiunivamalg #9",
                    "scholaramalgamalg #9",
                ]
            )
        )
    )
    # Women/Music/Men categories
    second_item_group = (
        (pl.col("i_category").is_in(["Women", "Music", "Men"]))
        & (
            pl.col("i_class").is_in(
                ["accessories", "classical", "fragrances", "pants"]
            )
        )
        & (
            pl.col("i_brand").is_in(
                [
                    "amalgimporto #1",
                    "edu packscholar #1",
                    "exportiimporto #1",
                    "importoamalg #1",
                ]
            )
        )
    )

    grouped_data = (
        store_sales.join(item, left_on="ss_item_sk", right_on="i_item_sk")
        .join(date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk")
        .join(store, left_on="ss_store_sk", right_on="s_store_sk")
        .filter(pl.col("d_month_seq").is_in(month_seq_list))
        .filter(first_item_group | second_item_group)
        .group_by(["i_manufact_id", "d_qoy"])
        .agg(
            [
                pl.col("ss_sales_price").sum().alias("sum_sales_raw"),
                # Number of non-null prices in the group; needed because a
                # Polars sum over only nulls is 0, not null.
                pl.col("ss_sales_price").count().alias("sales_price_count"),
            ]
        )
        .with_columns(
            [
                # SQL SUM yields NULL when every input is NULL. Reproduce
                # that so such groups neither pull the manufacturer average
                # down nor pass the deviation filter with a spurious 0.
                pl.when(pl.col("sales_price_count") > 0)
                .then(pl.col("sum_sales_raw"))
                .otherwise(None)
                .alias("sum(ss_sales_price)")
            ]
        )
    )

    # The average is attached with a join rather than a window expression.
    # Rows with a null manufacturer are handled separately because they are
    # averaged as one group and would not be matched by the key join.
    non_null_data = grouped_data.filter(pl.col("i_manufact_id").is_not_null())
    null_data = grouped_data.filter(pl.col("i_manufact_id").is_null())
    manufacturer_averages = non_null_data.group_by("i_manufact_id").agg(
        [pl.col("sum(ss_sales_price)").mean().alias("avg_quarterly_sales")]
    )
    non_null_result = non_null_data.join(
        manufacturer_averages, on="i_manufact_id", how="left"
    )
    null_result = null_data.with_columns(
        [pl.col("sum(ss_sales_price)").mean().alias("avg_quarterly_sales")]
    )
    inner_query = pl.concat([non_null_result, null_result]).select(
        [
            "i_manufact_id",
            pl.col("sum(ss_sales_price)").alias("sum_sales"),
            "avg_quarterly_sales",
        ]
    )

    # Percentage deviation > 10%; null (and therefore dropped) when the
    # average is not positive.
    relative_deviation = (
        pl.when(pl.col("avg_quarterly_sales") > 0)
        .then(
            (pl.col("sum_sales") - pl.col("avg_quarterly_sales")).abs()
            / pl.col("avg_quarterly_sales")
        )
        .otherwise(None)
    )
    return (
        inner_query.filter(relative_deviation > 0.1)
        .sort(
            ["avg_quarterly_sales", "sum_sales", "i_manufact_id"],
            nulls_last=True,
            descending=[False, False, False],
        )
        .limit(100)
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 54."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """
    Return the DuckDB SQL text for Query 54.

    ``run_config`` is part of the shared signature but is not read here;
    the statement is a fixed literal.
    """
    # The SQL text is kept flush-left so the returned string is unchanged.
    sql = """
WITH my_customers
AS (SELECT DISTINCT c_customer_sk,
c_current_addr_sk
FROM (SELECT cs_sold_date_sk sold_date_sk,
cs_bill_customer_sk customer_sk,
cs_item_sk item_sk
FROM catalog_sales
UNION ALL
SELECT ws_sold_date_sk sold_date_sk,
ws_bill_customer_sk customer_sk,
ws_item_sk item_sk
FROM web_sales) cs_or_ws_sales,
item,
date_dim,
customer
WHERE sold_date_sk = d_date_sk
AND item_sk = i_item_sk
AND i_category = 'Sports'
AND i_class = 'fitness'
AND c_customer_sk = cs_or_ws_sales.customer_sk
AND d_moy = 5
AND d_year = 2000),
my_revenue
AS (SELECT c_customer_sk,
Sum(ss_ext_sales_price) AS revenue
FROM my_customers,
store_sales,
customer_address,
store,
date_dim
WHERE c_current_addr_sk = ca_address_sk
AND ca_county = s_county
AND ca_state = s_state
AND ss_sold_date_sk = d_date_sk
AND c_customer_sk = ss_customer_sk
AND d_month_seq BETWEEN (SELECT DISTINCT d_month_seq + 1
FROM date_dim
WHERE d_year = 2000
AND d_moy = 5) AND
(SELECT DISTINCT
d_month_seq + 3
FROM date_dim
WHERE d_year = 2000
AND d_moy = 5)
GROUP BY c_customer_sk),
segments
AS (SELECT Cast(( revenue / 50 ) AS INT) AS segment
FROM my_revenue)
SELECT segment,
Count(*) AS num_customers,
segment * 50 AS segment_base
FROM segments
GROUP BY segment
ORDER BY segment,
num_customers
LIMIT 100;
"""
    return sql


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """
    Build the Polars lazy plan for Query 54.

    Customers who bought a Sports/fitness item through the catalog or web
    channel in May 2000 are identified. Their store revenue is then summed
    over the ``d_month_seq`` window running from one to three months after
    May 2000, counting only sales joined to a store whose county and state
    match the customer's current address. Revenue is bucketed into segments
    of 50 and the number of customers per segment is returned.

    Parameters
    ----------
    run_config
        Supplies ``dataset_path`` and ``suffix``, which are passed to
        ``get_data`` to load each table.

    Returns
    -------
    pl.LazyFrame
        Columns ``segment``, ``num_customers`` and ``segment_base``,
        sorted by ``segment`` then ``num_customers`` and limited to 100 rows.
    """
    catalog_sales = get_data(
        run_config.dataset_path, "catalog_sales", run_config.suffix
    )
    web_sales = get_data(run_config.dataset_path, "web_sales", run_config.suffix)
    store_sales = get_data(run_config.dataset_path, "store_sales", run_config.suffix)
    item = get_data(run_config.dataset_path, "item", run_config.suffix)
    date_dim = get_data(run_config.dataset_path, "date_dim", run_config.suffix)
    customer = get_data(run_config.dataset_path, "customer", run_config.suffix)
    customer_address = get_data(
        run_config.dataset_path, "customer_address", run_config.suffix
    )
    store = get_data(run_config.dataset_path, "store", run_config.suffix)

    # UNION ALL of the catalog and web channels under common column names.
    cs_sales = catalog_sales.select(
        [
            pl.col("cs_sold_date_sk").alias("sold_date_sk"),
            pl.col("cs_bill_customer_sk").alias("customer_sk"),
            pl.col("cs_item_sk").alias("item_sk"),
        ]
    )
    ws_sales = web_sales.select(
        [
            pl.col("ws_sold_date_sk").alias("sold_date_sk"),
            pl.col("ws_bill_customer_sk").alias("customer_sk"),
            pl.col("ws_item_sk").alias("item_sk"),
        ]
    )
    cs_or_ws_sales = pl.concat([cs_sales, ws_sales])

    in_may_2000 = (pl.col("d_moy") == 5) & (pl.col("d_year") == 2000)

    my_customers = (
        cs_or_ws_sales.join(date_dim, left_on="sold_date_sk", right_on="d_date_sk")
        .join(item, left_on="item_sk", right_on="i_item_sk")
        .join(customer, left_on="customer_sk", right_on="c_customer_sk")
        .filter(
            (pl.col("i_category") == "Sports")
            & (pl.col("i_class") == "fitness")
            & in_may_2000
        )
        # The join key survives under its left-hand name, so rename it back.
        .select([pl.col("customer_sk").alias("c_customer_sk"), "c_current_addr_sk"])
        .unique()
    )

    # Derive the d_month_seq window from date_dim, exactly as the SQL scalar
    # subqueries do, instead of hard-coding month sequence numbers.
    # All May 2000 rows are expected to share one d_month_seq, so this is a
    # single-row frame after ``unique`` (the SQL relies on the same property).
    month_seq_bounds = (
        date_dim.filter(in_may_2000)
        .select(
            [
                (pl.col("d_month_seq") + 1).alias("month_seq_lo"),
                (pl.col("d_month_seq") + 3).alias("month_seq_hi"),
            ]
        )
        .unique()
    )
    # Resolve the window on the small dimension table; joining on the
    # surviving d_date_sk values then acts as the BETWEEN filter.
    revenue_dates = (
        date_dim.join(month_seq_bounds, how="cross")
        .filter(
            (pl.col("d_month_seq") >= pl.col("month_seq_lo"))
            & (pl.col("d_month_seq") <= pl.col("month_seq_hi"))
        )
        .select("d_date_sk")
    )

    my_revenue = (
        my_customers.join(
            customer_address, left_on="c_current_addr_sk", right_on="ca_address_sk"
        )
        .join(
            store, left_on=["ca_county", "ca_state"], right_on=["s_county", "s_state"]
        )
        .join(store_sales, left_on="c_customer_sk", right_on="ss_customer_sk")
        .join(revenue_dates, left_on="ss_sold_date_sk", right_on="d_date_sk")
        .group_by("c_customer_sk")
        .agg([pl.col("ss_ext_sales_price").sum().alias("revenue")])
    )

    # Polars-side counterpart of the SQL ``Cast((revenue / 50) AS INT)``:
    # round first, then cast to a 32-bit integer.
    segments = my_revenue.with_columns(
        (pl.col("revenue") / 50.0).round(0).cast(pl.Int32).alias("segment")
    ).select("segment")

    return (
        segments.group_by("segment")
        .agg([pl.len().alias("num_customers")])
        .with_columns((pl.col("segment") * 50).alias("segment_base"))
        .select(
            [
                "segment",
                pl.col("num_customers").cast(pl.Int64),
                "segment_base",
            ]
        )
        .sort(
            ["segment", "num_customers"],
            nulls_last=True,
            descending=[False, False],
        )
        .limit(100)
    )
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
# SPDX-License-Identifier: Apache-2.0

"""Query 55."""

from __future__ import annotations

from typing import TYPE_CHECKING

import polars as pl

from cudf_polars.experimental.benchmarks.utils import get_data

if TYPE_CHECKING:
from cudf_polars.experimental.benchmarks.utils import RunConfig


def duckdb_impl(run_config: RunConfig) -> str:
    """
    Return the DuckDB SQL text for Query 55.

    ``run_config`` is part of the shared signature but is not read here;
    the statement is a fixed literal.
    """
    # The SQL text is kept flush-left so the returned string is unchanged.
    sql = """
SELECT i_brand_id brand_id,
i_brand brand,
Sum(ss_ext_sales_price) ext_price
FROM date_dim,
store_sales,
item
WHERE d_date_sk = ss_sold_date_sk
AND ss_item_sk = i_item_sk
AND i_manager_id = 33
AND d_moy = 12
AND d_year = 1998
GROUP BY i_brand,
i_brand_id
ORDER BY ext_price DESC,
i_brand_id
LIMIT 100;
"""
    return sql


def polars_impl(run_config: RunConfig) -> pl.LazyFrame:
    """
    Build the Polars lazy plan for Query 55.

    Sums ``ss_ext_sales_price`` per brand for items with manager id 33 sold
    in December 1998, and returns the 100 highest-revenue brands with ties
    broken by brand id.
    """
    dataset_path = run_config.dataset_path
    suffix = run_config.suffix
    date_dim = get_data(dataset_path, "date_dim", suffix)
    store_sales = get_data(dataset_path, "store_sales", suffix)
    item = get_data(dataset_path, "item", suffix)

    # Attach the date and item attributes to every store sale.
    sales_with_dims = store_sales.join(
        date_dim, left_on="ss_sold_date_sk", right_on="d_date_sk"
    ).join(item, left_on="ss_item_sk", right_on="i_item_sk")

    manager_33_dec_1998 = (
        (pl.col("i_manager_id") == 33)
        & (pl.col("d_moy") == 12)
        & (pl.col("d_year") == 1998)
    )

    revenue_per_brand = (
        sales_with_dims.filter(manager_33_dec_1998)
        .group_by(["i_brand", "i_brand_id"])
        .agg(pl.col("ss_ext_sales_price").sum().alias("ext_price"))
    )

    # Expose the output column names used by the SQL version.
    brand_report = revenue_per_brand.select(
        [
            pl.col("i_brand_id").alias("brand_id"),
            pl.col("i_brand").alias("brand"),
            pl.col("ext_price"),
        ]
    )

    ranked = brand_report.sort(
        ["ext_price", "brand_id"], descending=[True, False], nulls_last=True
    )
    return ranked.limit(100)
Loading
Loading