Commit 453f550

Add data quality checks (#11)
* add data quality checks
* improve assertions

Parent: 5772d20

File tree: 14 files changed (+122 / -50 lines)

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ default_language_version:
   python: python3
 repos:
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.0
+    rev: v0.12.3
     hooks:
       - id: ruff
         args:

Pipfile

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ pytest = "==8.3.2"
 jinja2 = "==3.1.4"
 pyspark = "==3.5.5"
 pytest-cov = "==5.0.0"
+databricks-labs-dqx = "==0.7.0"
 packages = "*"
 
 [dev-packages]

README.md

Lines changed: 16 additions & 6 deletions
@@ -1,5 +1,5 @@
 
-# Template project with medallion architecture, Python packaging, unit tests, integration tests, CI/CD automation, and Databricks Asset Bundles.
+# Template project with medallion architecture, Python packaging, unit tests, integration tests, CI/CD automation, Databricks Asset Bundles, and DQX data quality framework.
 
 This project template provides a structured approach to enhance productivity when delivering ETL pipelines on Databricks. Feel free to customize it based on your project's specific nuances and the audience you are targeting.
 
@@ -11,7 +11,8 @@ This project template demonstrates how to:
 - structure PySpark code inside classes/packages.
 - structure unit tests for the data transformations and set up VS Code to run them on your local machine.
 - structure integration tests to be executed on different environments / catalogs.
-- package code and deploy it to different environments (dev, staging, prod) using a CI/CD pipeline with [Github Actions](https://docs.github.com/en/actions).
+- utilize [Databricks DQX](https://databrickslabs.github.io/dqx/) to define and enforce data quality rules, such as null checks, uniqueness, thresholds, and schema validation.
+- package and deploy code to different environments (dev, staging, prod) using a CI/CD pipeline with [Github Actions](https://docs.github.com/en/actions).
 - isolate "dev" environments / catalogs to avoid concurrency issues between developers testing jobs.
 - configure the workflow to run in different environments with different parameters with [jinja package](https://pypi.org/project/jinja2/).
 - configure the workflow to run tasks selectively.
@@ -25,7 +26,7 @@ This project template demonstrates how to:
 - utilize [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/index.html) and [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to package/deploy/run a Python wheel package on Databricks.
 - utilize [Databricks SDK for Python](https://docs.databricks.com/en/dev-tools/sdk-python.html) to manage workspaces and accounts. The sample script enables metastore system tables with [relevant data about billing, usage, lineage, prices, and access](https://www.youtube.com/watch?v=LcRWHzk8Wm4).
 - utilize [Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog) and get data lineage for your tables and columns and a simplified permission model for your data.
-- utilize [Databricks Workflows](https://docs.databricks.com/en/workflows/index.html) to execute a DAG and [task parameters](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) to share context information between tasks (see [Task Parameters section](#task-parameters)). Yes, you don't need Airflow to manage your DAGs here!!!
+- utilize [Databricks Lakeflow Jobs](https://docs.databricks.com/en/workflows/index.html) to execute a DAG and [task parameters](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) to share context information between tasks (see [Task Parameters section](#task-parameters)). Yes, you don't need Airflow to manage your DAGs here!!!
 - utilize [Databricks job clusters](https://docs.databricks.com/en/workflows/jobs/use-compute.html#use-databricks-compute-with-your-jobs) to reduce costs.
 - define clusters on AWS and Azure.
 
@@ -39,7 +40,7 @@ Sessions on Databricks Asset Bundles, CI/CD, and Software Development Life Cycle
 - [Deploying Databricks Asset Bundles (DABs) at Scale](https://www.youtube.com/watch?v=mMwprgB-sIU)
 - [A Prescription for Success: Leveraging DABs for Faster Deployment and Better Patient Outcomes](https://www.youtube.com/watch?v=01JHTM2UP-U)
 
-### DAG
+### DAGs
 
 <br>
 
@@ -51,18 +52,27 @@ Sessions on Databricks Asset Bundles, CI/CD, and Software Development Life Cycle
 
 <br>
 
-<img src="docs/task output.png">
+<img src="docs/task_output.png">
 
 
 
 ### Data Lineage (Catalog Explorer)
 
 <br>
 
-<img src="docs/data lineage.png">
+<img src="docs/data_lineage.png">
 
 <br>
 
+### Data Quality (generated by Databricks DQX)
+
+<br>
+
+<img src="docs/data_quality.png">
+
+<br>
+
+
 
 ### CI/CD pipeline
 
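The new DQX bullet and the "Data Quality" section above describe declarative checks (null checks, uniqueness, thresholds, schema validation). Below is a minimal sketch of what such checks can look like with databricks-labs-dqx; it is not code from this commit, the table and column names are invented, and check-function and argument names (for example `column` vs. the older `col_name`) vary between DQX releases, so confirm against the DQX docs for the pinned 0.7.0 version.

```python
from databricks.labs.dqx.engine import DQEngine
from databricks.sdk import WorkspaceClient
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.table("bronze.customers")  # hypothetical input table

# Metadata-style checks: each entry picks a check function, its arguments,
# and a criticality ("error" rows get quarantined, "warn" rows are only flagged).
checks = [
    {
        "criticality": "error",
        "check": {"function": "is_not_null", "arguments": {"column": "customer_id"}},
    },
    {
        "criticality": "warn",
        "check": {
            "function": "is_in_range",
            "arguments": {"column": "amount", "min_limit": 0, "max_limit": 10000},
        },
    },
]

dq_engine = DQEngine(WorkspaceClient())

# Rows that pass every check vs. rows routed to a quarantine DataFrame
# annotated with which checks failed.
valid_df, quarantined_df = dq_engine.apply_checks_by_metadata_and_split(df, checks)
```

The split variant fits a medallion flow: write `valid_df` to the silver table and route `quarantined_df` to a reject table for inspection.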

docs/data_quality.png

Binary file added (85.3 KB).

docs/task output.png

Binary file removed (34.6 KB).

docs/task_output.png

Binary file added (49.9 KB).

setup.py

Lines changed: 1 addition & 0 deletions
@@ -36,5 +36,6 @@
         "setuptools",
         "funcy",
         "databricks-sdk",
+        "databricks-labs-dqx",
     ],
 )

src/template/baseTask.py

Lines changed: 0 additions & 1 deletion
@@ -2,4 +2,3 @@ class BaseTask:
     def __init__(self, config):
         self.config = config
         self.spark = config.get_spark()
-        self.dbutils = config.get_dbutils()
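With dbutils gone, a task built on this template receives just the shared config object and the SparkSession; after this commit the base class amounts to:

```python
class BaseTask:
    def __init__(self, config):
        self.config = config             # exposes params and the DQX engine (config.dq_engine)
        self.spark = config.get_spark()  # shared SparkSession created in Config
```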

src/template/config.py

Lines changed: 8 additions & 39 deletions
@@ -1,4 +1,6 @@
 import pyspark.sql.functions as F
+from databricks.labs.dqx.engine import DQEngine
+from databricks.sdk import WorkspaceClient
 from pyspark.sql import SparkSession
 
 
@@ -16,29 +18,7 @@ def __init__(self, args):
 
         self.spark = SparkSession.builder.appName(args.task).getOrCreate()
 
-        try:
-            from pyspark.dbutils import DBUtils
-
-            self.dbutils = DBUtils(self.spark)
-
-            # TODO cannot access context on serverless
-            # context_tags = self.dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags()
-            # print(context_tags)
-
-            # username = context_tags.get("user")
-
-            # if username.isDefined():
-            # actual_value = username.get()
-            # python_string = str(actual_value)
-            # self.params.update({"workspace_user": python_string})
-            # print("workspace user: " + python_string)
-            # else:
-            # print("workspace user empty")
-
-        except ModuleNotFoundError:
-            self.dbutils = self._mock_dbutils(self.spark)
-
-        if self.params["env"] != "local":
+        if args.env != "local":
             # if running in Databricks, set default catalog and schema
 
             if args.env == "dev":
@@ -57,29 +37,18 @@ def __init__(self, args):
 
         self.spark.sql(f"CREATE SCHEMA IF NOT EXISTS {args.schema}")
 
-    def _mock_dbutils(self, spark):
-        class DBUtils:
-            def __init__(self, spark):
-                self.fs = self.FileSystem()
-
-            class FileSystem:
-                def mount(self, source, mount_point):
-                    print(f"Mounting {source} to {mount_point}")
+            ws = WorkspaceClient()
 
-                def unmount(self, mount_point):
-                    print(f"Unmounting {mount_point}")
+        else:
+            from unittest.mock import MagicMock
 
-                def mounts(self):
-                    return []
+            ws = MagicMock(spec=WorkspaceClient, **{"current_user.me.return_value": None})
 
-        return DBUtils(spark)
+        self.dq_engine = DQEngine(ws)
 
     def get_spark(self):
         return self.spark
 
-    def get_dbutils(self):
-        return self.dbutils
-
     def get_value(self, key):
         return self.params[key]
 
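The net effect of this change: on Databricks the DQEngine is backed by a real WorkspaceClient, while locally a MagicMock with spec=WorkspaceClient stands in so Config (and unit tests) can be constructed without a workspace. Below is a hypothetical sketch of a task reaching that engine through the config it already holds; the class name, run method, import path, tables, and columns are illustrative, not taken from this repository, and the DQX argument names may differ by release.

```python
from template.baseTask import BaseTask  # assumed import path for src/template/baseTask.py


class CleanCustomersTask(BaseTask):
    def run(self):
        df = self.spark.table("bronze.customers")  # hypothetical source table

        checks = [
            {
                "criticality": "error",
                "check": {"function": "is_not_null", "arguments": {"column": "customer_id"}},
            },
        ]

        # Unlike the split variant, apply_checks_by_metadata keeps every row and
        # appends DQX result columns recording any failed checks, so downstream
        # steps can filter or report on them.
        checked_df = self.config.dq_engine.apply_checks_by_metadata(df, checks)
        checked_df.write.mode("overwrite").saveAsTable("silver.customers")
```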
