-
Notifications
You must be signed in to change notification settings - Fork 234
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add some data validation examples (#112)
Add some data validation examples (#112). Includes: pandera, Nike's spark-expectations, a basic WAP (write-audit-publish) flow, and Target's data-validator. Consisting of: * Add pandera req * Start adding a pandera example * Formatting * Format * Formatting * Add an example for the target data validator. * Flesh out the target example and add it to CI. * Start working on adding spark expectations from the Nike folks. * Update rule * Play around with spark expectations * Update the sample rule * Fix mismatched scala versions * Install IcebergSparkSessionExtensions * Start adding a pure SQL WAP example. * Switch to using Session over legacy context. * Format python ex * Style cleanup * Hmmm clone dv as well * Comment out not yet working FF in SQL * Fix examples * Skip CI on target data validator for now (nested build issue and it should go away once PR is merged anyways). * SparkSession imports are good.
- Loading branch information
Showing
17 changed files
with
350 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
creator,projectname,stars | ||
holdenk,spark-upgrade,17 | ||
krisnova,rust-nova,71 | ||
kbendick,MongoMart,6 | ||
mateiz,spark,36600 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,52 @@ | ||
from pyspark.sql.session import SparkSession | ||
|
||
# tag::pandera_imports[]
import pandera.pyspark as pa | ||
import pyspark.sql.types as T | ||
|
||
# end::pandera_imports[] | ||
|
||
|
||
# tag::simple_data_schema[]
class ProjectDataSchema(pa.DataFrameModel):
    """Pandera schema for the project CSV (creator, projectname, stars)."""

    # Note str_length is currently broken :/
    # creator must be a non-empty string once str_length works upstream.
    creator: T.StringType() = pa.Field(str_length={"min_value": 1})
    projectname: T.StringType() = pa.Field()
    # Star counts can never be negative.
    stars: T.IntegerType() = pa.Field(ge=0)


# end::simple_data_schema[]
|
||
|
||
# tag::gender_data[]
class GenderData(pa.DataFrameModel):
    """Pandera schema for the UK gender pay gap bonus data."""

    # Bonus percentages may be missing (nullable).
    # NOTE(review): le=5 looks like a deliberately tight bound so some rows
    # fail validation in the example -- confirm before reusing as a real rule.
    MaleBonusPercent: T.DoubleType() = pa.Field(nullable=True, le=5)
    FemaleBonusPercent: T.DoubleType() = pa.Field(nullable=True)
    CompanyNumber: T.IntegerType() = pa.Field()


# end::gender_data[]
|
||
if __name__ == "__main__":
    spark = SparkSession.builder.master("local[4]").getOrCreate()

    # The 2021 UK gender pay gap data must be downloaded first from
    # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021"
    # and made available as ./data/2021
    gender_frame = spark.read.csv("data/fetched/2021", header=True, inferSchema=True)

    # tag::validate_gender_data[]
    gender_checked = GenderData(gender_frame)
    # Report any validation failures; a production job might exit non-zero here.
    gender_errors = gender_checked.pandera.errors
    if gender_errors != {}:
        print(gender_errors)
        # sys.exit(1)
    # end::validate_gender_data[]

    # tag::validate_project_data[]
    project_frame = spark.read.csv("./data/project.csv", header=True, inferSchema=True)
    project_checked = ProjectDataSchema(project_frame)
    # Report any validation failures; a production job might exit non-zero here.
    project_errors = project_checked.pandera.errors
    if project_errors != {}:
        print(project_errors)
        # sys.exit(1)
    # end::validate_project_data[]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
from pyspark import SparkFiles | ||
from pyspark.sql import * | ||
from spark_expectations.core.expectations import SparkExpectations | ||
|
||
spark = SparkSession.builder.master("local[4]").getOrCreate()
sc = spark.sparkContext

# tag::global_setup[]
from spark_expectations.config.user_config import *

# Global spark-expectations settings: e-mail notification plumbing plus the
# error-drop threshold that triggers an alert. Streaming publish is disabled
# because this example runs locally without Kafka.
se_global_spark_Conf = {
    "se_notifications_enable_email": False,
    "se_notifications_email_smtp_host": "mailhost.example.com",
    "se_notifications_email_smtp_port": 25,
    "se_notifications_email_from": "[email protected]",
    "se_notifications_email_subject": "spark expectations - data quality - notifications",
    "se_notifications_on_fail": True,
    "se_notifications_on_error_drop_exceeds_threshold_breach": True,
    "se_notifications_on_error_drop_threshold": 15,
    "se_enable_streaming": False,  # Required or tries to publish to kafka.
}
# end::global_setup[]
|
||
|
||
# tag::setup_and_load[]
# Reset the rules table so the example can be re-run from a clean state.
spark.sql("DROP TABLE IF EXISTS local.magic_validation")
# Rules table in the shape spark-expectations expects: one row per rule,
# keyed by product_id/table_name, with flags controlling where the rule runs.
spark.sql(
    """
create table local.magic_validation (
product_id STRING,
table_name STRING,
rule_type STRING,
rule STRING,
column_name STRING,
expectation STRING,
action_if_failed STRING,
tag STRING,
description STRING,
enable_for_source_dq_validation BOOLEAN,
enable_for_target_dq_validation BOOLEAN,
is_active BOOLEAN,
enable_error_drop_alert BOOLEAN,
error_drop_threshold INT
)"""
)
# Stats table spark-expectations writes its per-run data-quality metrics into.
spark.sql(
    """
create table if not exists local.pay_stats (
product_id STRING,
table_name STRING,
input_count LONG,
error_count LONG,
output_count LONG,
output_percentage FLOAT,
success_percentage FLOAT,
error_percentage FLOAT,
source_agg_dq_results array<map<string, string>>,
final_agg_dq_results array<map<string, string>>,
source_query_dq_results array<map<string, string>>,
final_query_dq_results array<map<string, string>>,
row_dq_res_summary array<map<string, string>>,
row_dq_error_threshold array<map<string, string>>,
dq_status map<string, string>,
dq_run_time map<string, float>,
dq_rules map<string, map<string,int>>,
meta_dq_run_id STRING,
meta_dq_run_date DATE,
meta_dq_run_datetime TIMESTAMP
);"""
)
# Ship the local JSON rules file to the executors, then load it as JSON lines.
rule_file = "./spark_expectations_sample_rules.json"
sc.addFile(rule_file)
df = spark.read.json(SparkFiles.get(rule_file))
print(df)
# "byname" matches columns by name rather than position on append.
df.write.option("byname", "true").mode("append").saveAsTable("local.magic_validation")
spark.read.table("local.magic_validation").show()
se: SparkExpectations = SparkExpectations(
    product_id="pay", debugger=True  # Used to filter which rules we apply
)
# end::setup_and_load[]
|
||
|
||
# tag::run_validation[]
# Only row-level data quality checking (row_dq); aggregate/query DQ are off.
@se.with_expectations(
    se.reader.get_rules_from_table(
        product_rules_table="local.magic_validation",
        target_table_name="local.bonuses",
        dq_stats_table_name="local.pay_stats",
    ),
    write_to_table=False,
    row_dq=True,
    # This does not work currently (Iceberg)
    spark_conf={"format": "iceberg"},
    options={"format": "iceberg"},
    options_error_table={"format": "iceberg"},
)
def load_data():
    """Load the 2021 UK gender pay gap data and project the bonus columns.

    spark-expectations validates the returned DataFrame against the row_dq
    rules registered for product_id "pay" before handing it back.
    """
    raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True)
    # Column casing must match the source data and the rule expectation
    # ("FemaleBonusPercent"); the previous "FemaleBonuspercent" only resolved
    # because Spark's default column lookup is case-insensitive, and would
    # break with spark.sql.caseSensitive=true.
    uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonusPercent")
    return uk_df


data = load_data()
# end::run_validation[]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"product_id": "pay", "table_name": "local.bonuses", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": true, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.