From 13723ea68158a88a04ededc8ae4cc3555731c21a Mon Sep 17 00:00:00 2001 From: shourya_singh_cs Date: Mon, 14 Nov 2022 07:51:10 +0000 Subject: [PATCH 1/3] added the test case for params row-filter --- configs/entities/test-data.yml | 4 +- .../reference_columns/reference-columns.yml | 2 +- tests/BUILD | 8 + .../configs/row_filters/row-filters.yml | 7 + ...test_dq_row_filter_params_sql_expected.sql | 288 ++++++++++++++++++ tests/unit/test_dq_row_filter_params.py | 100 ++++++ 6 files changed, 406 insertions(+), 3 deletions(-) create mode 100644 tests/resources/test_dq_row_filter_params_sql_expected.sql create mode 100644 tests/unit/test_dq_row_filter_params.py diff --git a/configs/entities/test-data.yml b/configs/entities/test-data.yml index 4f535b6f..6802486d 100644 --- a/configs/entities/test-data.yml +++ b/configs/entities/test-data.yml @@ -16,8 +16,8 @@ entities: TEST_TABLE: source_database: BIGQUERY table_name: contact_details - dataset_name: - project_name: + dataset_name: clouddq_dev_us_central1 + project_name: kthxbayes-sandbox columns: ROW_ID: name: row_id diff --git a/configs/reference_columns/reference-columns.yml b/configs/reference_columns/reference-columns.yml index 7d0a9a60..34fc871c 100644 --- a/configs/reference_columns/reference-columns.yml +++ b/configs/reference_columns/reference-columns.yml @@ -22,4 +22,4 @@ reference_columns: INCLUDE_ALL_REFERENCE_COLUMNS: include_reference_columns: - - * + - "*" diff --git a/tests/BUILD b/tests/BUILD index 57426437..100c5344 100644 --- a/tests/BUILD +++ b/tests/BUILD @@ -246,3 +246,11 @@ py_test( legacy_create_init = 0, deps = DEPS, ) + +py_test( + name = "test_dq_row_filter_params", + srcs = SRCS, + data = DATA, + legacy_create_init = 0, + deps = DEPS, +) \ No newline at end of file diff --git a/tests/resources/configs/row_filters/row-filters.yml b/tests/resources/configs/row_filters/row-filters.yml index d64b3c25..41c2cbe7 100644 --- a/tests/resources/configs/row_filters/row-filters.yml +++ 
b/tests/resources/configs/row_filters/row-filters.yml @@ -20,3 +20,10 @@ row_filters: DATA_TYPE_EMAIL: filter_sql_expr: |- contact_type = 'email' + + COLUMN_IN_VALUES_SET: + params: + - column + - values + filter_sql_expr: |- + $column IN ($values) diff --git a/tests/resources/test_dq_row_filter_params_sql_expected.sql b/tests/resources/test_dq_row_filter_params_sql_expected.sql new file mode 100644 index 00000000..4dc68a2c --- /dev/null +++ b/tests/resources/test_dq_row_filter_params_sql_expected.sql @@ -0,0 +1,288 @@ +-- Copyright 2022 Google LLC +-- +-- Licensed under the Apache License, Version 2.0 (the "License"); +-- you may not use this file except in compliance with the License. +-- You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, software +-- distributed under the License is distributed on an "AS IS" BASIS, +-- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +-- See the License for the specific language governing permissions and +-- limitations under the License. + +WITH +zero_record AS ( +SELECT +'' AS rule_binding_id, +), +data AS ( +SELECT +*, +'' AS rule_binding_id, +FROM +`..contact_details` d +WHERE +contact_type IN ('email') +), +last_mod AS ( +SELECT +project_id || '.' || dataset_id || '.' 
|| table_id AS table_id, +TIMESTAMP_MILLIS(last_modified_time) AS last_modified +FROM `..__TABLES__` +), +validation_results AS ( +SELECT +CURRENT_TIMESTAMP() AS execution_ts, +'' AS rule_binding_id, +'NO_DUPLICATES_IN_COLUMN_GROUPS' AS rule_id, +'..contact_details' AS table_id, +CAST(NULL AS STRING) AS column_id, +NULL AS column_value, +CAST(NULL AS STRING) AS dimension, +CAST(NULL AS BOOLEAN) AS simple_rule_row_is_valid, +TRUE AS skip_null_count, +custom_sql_statement_validation_errors.complex_rule_validation_errors_count AS complex_rule_validation_errors_count, +CASE +WHEN custom_sql_statement_validation_errors.complex_rule_validation_errors_count IS NULL THEN CAST(NULL AS BOOLEAN) +WHEN custom_sql_statement_validation_errors.complex_rule_validation_errors_count = 0 THEN TRUE +ELSE FALSE +END AS complex_rule_validation_success_flag, + r""" + WITH + zero_record AS ( + SELECT + '' AS rule_binding_id, + ), + data AS ( + SELECT + *, + '' AS rule_binding_id, + FROM + `..contact_details` d + WHERE + contact_type IN ('email') + ), + last_mod AS ( + SELECT + project_id || '.' || dataset_id || '.' 
|| table_id AS table_id, + TIMESTAMP_MILLIS(last_modified_time) AS last_modified + FROM `..__TABLES__` + ), + validation_results AS (SELECT + '' AS rule_binding_id, + 'NO_DUPLICATES_IN_COLUMN_GROUPS' AS rule_id, + '..contact_details' AS table_id, + CAST(NULL AS STRING) AS column_id, + NULL AS column_value, +custom_sql_statement_validation_errors, + CAST(NULL AS STRING) AS dimension, + CAST(NULL AS BOOLEAN) AS simple_rule_row_is_valid, + TRUE AS skip_null_count, + custom_sql_statement_validation_errors.complex_rule_validation_errors_count AS complex_rule_validation_errors_count, + CASE + WHEN custom_sql_statement_validation_errors.complex_rule_validation_errors_count IS NULL THEN CAST(NULL AS BOOLEAN) + WHEN custom_sql_statement_validation_errors.complex_rule_validation_errors_count = 0 THEN TRUE + ELSE FALSE + END AS complex_rule_validation_success_flag, +FROM +zero_record +LEFT JOIN +( +SELECT +*, +'' AS _rule_binding_id, +COUNT(*) OVER() AS complex_rule_validation_errors_count, +FROM ( +select a.* +from data a +inner join ( +select +contact_type,value +from data +group by contact_type,value +having count(*) > 1 +) duplicates +using (contact_type,value) +) custom_sql +) custom_sql_statement_validation_errors +ON +zero_record.rule_binding_id = custom_sql_statement_validation_errors._rule_binding_id + ), + all_validation_results AS ( + SELECT + '{{ invocation_id }}' as _dq_validation_invocation_id, + r.rule_binding_id AS _dq_validation_rule_binding_id, + r.rule_id AS _dq_validation_rule_id, + r.column_id AS _dq_validation_column_id, + r.column_value AS _dq_validation_column_value, + CAST(r.dimension AS STRING) AS _dq_validation_dimension, + r.simple_rule_row_is_valid AS _dq_validation_simple_rule_row_is_valid, + r.complex_rule_validation_errors_count AS _dq_validation_complex_rule_validation_errors_count, + r.complex_rule_validation_success_flag AS _dq_validation_complex_rule_validation_success_flag, +r.custom_sql_statement_validation_errors, + FROM + 
validation_results r + ) + SELECT + * + FROM + all_validation_results +WHERE +_dq_validation_simple_rule_row_is_valid is False +OR +_dq_validation_complex_rule_validation_success_flag is False +ORDER BY _dq_validation_rule_id""" + AS failed_records_query, + FROM + zero_record + LEFT JOIN + ( + SELECT + *, + '' AS _rule_binding_id, + COUNT(*) OVER() AS complex_rule_validation_errors_count, + FROM ( + select a.* + from data a + inner join ( + select + contact_type,value + from data + group by contact_type,value + having count(*) > 1 + ) duplicates + using (contact_type,value) + ) custom_sql + ) custom_sql_statement_validation_errors + ON + zero_record.rule_binding_id = custom_sql_statement_validation_errors._rule_binding_id +UNION ALL +SELECT +CURRENT_TIMESTAMP() AS execution_ts, +'' AS rule_binding_id, +'NOT_NULL_SIMPLE' AS rule_id, +'..contact_details' AS table_id, +'value' AS column_id, +data.value AS column_value, +CAST(NULL AS STRING) AS dimension, +CASE +WHEN value IS NOT NULL THEN TRUE +ELSE +FALSE +END AS simple_rule_row_is_valid, +TRUE AS skip_null_count, +CAST(NULL AS INT64) AS complex_rule_validation_errors_count, +CAST(NULL AS BOOLEAN) AS complex_rule_validation_success_flag, + r""" + WITH + zero_record AS ( + SELECT + '' AS rule_binding_id, + ), + data AS ( + SELECT + *, + '' AS rule_binding_id, + FROM + `..contact_details` d + WHERE + contact_type IN ('email') + ), + last_mod AS ( + SELECT + project_id || '.' || dataset_id || '.' 
|| table_id AS table_id, + TIMESTAMP_MILLIS(last_modified_time) AS last_modified + FROM `..__TABLES__` + ), + validation_results AS (SELECT + '' AS rule_binding_id, + 'NOT_NULL_SIMPLE' AS rule_id, + '..contact_details' AS table_id, + 'value' AS column_id, + data.value AS column_value, + data.row_id AS row_id, + data.contact_type AS contact_type, + data.value AS value, + CAST(NULL AS STRING) AS dimension, + CASE + WHEN value IS NOT NULL THEN TRUE + ELSE + FALSE + END AS simple_rule_row_is_valid, + TRUE AS skip_null_count, + CAST(NULL AS INT64) AS complex_rule_validation_errors_count, + CAST(NULL AS BOOLEAN) AS complex_rule_validation_success_flag, + FROM + zero_record + LEFT JOIN + data + ON + zero_record.rule_binding_id = data.rule_binding_id + ), + all_validation_results AS ( + SELECT + '{{ invocation_id }}' as _dq_validation_invocation_id, + r.rule_binding_id AS _dq_validation_rule_binding_id, + r.rule_id AS _dq_validation_rule_id, + r.column_id AS _dq_validation_column_id, + r.column_value AS _dq_validation_column_value, + CAST(r.dimension AS STRING) AS _dq_validation_dimension, + r.simple_rule_row_is_valid AS _dq_validation_simple_rule_row_is_valid, + r.complex_rule_validation_errors_count AS _dq_validation_complex_rule_validation_errors_count, + r.complex_rule_validation_success_flag AS _dq_validation_complex_rule_validation_success_flag, + r.row_id AS row_id, + r.contact_type AS contact_type, + r.value AS value, + FROM + validation_results r + ) + SELECT + * + FROM + all_validation_results +WHERE +_dq_validation_simple_rule_row_is_valid is False +OR +_dq_validation_complex_rule_validation_success_flag is False +ORDER BY _dq_validation_rule_id""" + AS failed_records_query, +FROM +zero_record +LEFT JOIN +data +ON +zero_record.rule_binding_id = data.rule_binding_id +), +all_validation_results AS ( +SELECT +r.execution_ts AS execution_ts, +r.rule_binding_id AS rule_binding_id, +r.rule_id AS rule_id, +r.table_id AS table_id, +r.column_id AS column_id, 
+r.column_value AS column_value, +CAST(r.dimension AS STRING) AS dimension, +r.skip_null_count AS skip_null_count, +r.simple_rule_row_is_valid AS simple_rule_row_is_valid, +r.complex_rule_validation_errors_count AS complex_rule_validation_errors_count, +r.complex_rule_validation_success_flag AS complex_rule_validation_success_flag, +(SELECT COUNT(*) FROM data) AS rows_validated, +last_mod.last_modified, +'{"brand": "one"}' AS metadata_json_string, +'' AS configs_hashsum, +'' AS dataplex_lake, +'' AS dataplex_zone, +'' AS dataplex_asset_id, +CONCAT(r.rule_binding_id, '_', r.rule_id, '_', r.execution_ts, '_', True) AS dq_run_id, +TRUE AS progress_watermark, + failed_records_query AS failed_records_query, +FROM +validation_results r +JOIN last_mod USING(table_id) +) +SELECT +* +FROM +all_validation_results \ No newline at end of file diff --git a/tests/unit/test_dq_row_filter_params.py b/tests/unit/test_dq_row_filter_params.py new file mode 100644 index 00000000..bd1d486b --- /dev/null +++ b/tests/unit/test_dq_row_filter_params.py @@ -0,0 +1,100 @@ +# Copyright 2022 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import logging +import re + +import pytest + +from clouddq import lib +from clouddq import utils + +RE_NEWLINES = r"(\n( )*)+" +RE_CONFIGS_HASHSUM = r"'[\w\d]+' AS configs_hashsum," +CONFIGS_HASHSUM_REP = "'' AS configs_hashsum," +RE_ASSET_ID = r"'[\w-]+' AS dataplex_asset_id," +ASSET_ID_REP = "'' AS dataplex_asset_id," + +logger = logging.getLogger(__name__) + +class TestRowFilterParams: + + @pytest.fixture(scope="session") + def test_rule_bindings_collection_team_10(self, source_configs_path): + """ """ + return lib.load_rule_bindings_config( + source_configs_path / "rule_bindings" / "team-10-rule-bindings.yml" + ) + + def test_dq_row_filter_params( + self, + test_rule_bindings_collection_team_10, + test_configs_cache, + test_resources, + gcp_project_id, + gcp_dataplex_bigquery_dataset_id, + gcp_bq_dataset, + gcp_dataplex_zone_id, + gcp_dataplex_lake_name, + test_bigquery_client, + ): + """ + + Args: + test_rule_bindings_collection_team_10: + test_entities_collection: + test_rules_collection: + test_row_filters_collection: + + Returns: + + """ + for rule_binding_id, rule_binding_configs in test_rule_bindings_collection_team_10.items(): + with open(test_resources / "test_dq_row_filter_params_sql_expected.sql") as f: + expected = f.read() + configs = lib.create_rule_binding_view_model( + rule_binding_id=rule_binding_id, + rule_binding_configs=rule_binding_configs, + dq_summary_table_name="..dq_summary", + configs_cache=test_configs_cache, + environment="DEV", + debug=True, + high_watermark_filter_exists=False, + bigquery_client=test_bigquery_client, + ) + output = configs.get("generated_sql_string_dict")\ + .get(f"{rule_binding_id}_generated_sql_string") + output = output.replace(gcp_project_id, "") \ + .replace(gcp_dataplex_bigquery_dataset_id, "") \ + .replace(gcp_bq_dataset, "") + if gcp_dataplex_zone_id in output: + output = output.replace(gcp_dataplex_zone_id, "") + else: + output = output.replace("CAST(NULL AS STRING) AS dataplex_zone", + "'' AS 
dataplex_zone") + if gcp_dataplex_lake_name in output: + output = output.replace(gcp_dataplex_lake_name, "") + else: + output = output.replace("CAST(NULL AS STRING) AS dataplex_lake", + "'' AS dataplex_lake") + + output = output.replace(rule_binding_id, "") + output = re.sub(RE_NEWLINES, '\n', output).strip() + output = re.sub(RE_CONFIGS_HASHSUM, CONFIGS_HASHSUM_REP, output) + output = re.sub(RE_ASSET_ID, ASSET_ID_REP, output) + expected = utils.strip_margin(re.sub(RE_NEWLINES, '\n', expected)).strip() + assert output == expected + +if __name__ == "__main__": + raise SystemExit(pytest.main([__file__, '-vv', '-rP', '-n', 'auto'])) \ No newline at end of file From 4f7af312d9c014c68e5d41431325a306e15bfeaf Mon Sep 17 00:00:00 2001 From: shourya_singh_cs Date: Mon, 14 Nov 2022 07:53:22 +0000 Subject: [PATCH 2/3] added the test case for params row-filter --- configs/entities/test-data.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/entities/test-data.yml b/configs/entities/test-data.yml index 6802486d..4f535b6f 100644 --- a/configs/entities/test-data.yml +++ b/configs/entities/test-data.yml @@ -16,8 +16,8 @@ entities: TEST_TABLE: source_database: BIGQUERY table_name: contact_details - dataset_name: clouddq_dev_us_central1 - project_name: kthxbayes-sandbox + dataset_name: + project_name: columns: ROW_ID: name: row_id From eb81fd9da2df4e6edb5d6a2539577bd968d5ac94 Mon Sep 17 00:00:00 2001 From: shourya_singh_cs Date: Tue, 15 Nov 2022 07:07:03 +0000 Subject: [PATCH 3/3] test --- clouddq/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clouddq/main.py b/clouddq/main.py index f2e0fc1e..0047525a 100644 --- a/clouddq/main.py +++ b/clouddq/main.py @@ -217,7 +217,7 @@ def main( # noqa: C901 enable_experimental_bigquery_entity_uris: bool = True, enable_experimental_dataplex_gcs_validation: bool = True, ) -> None: - """Run RULE_BINDING_IDS from a RULE_BINDING_CONFIG_PATH. 
+ """Run RULE_BINDING_IDS from a RULE_BINDING_CONFIG_PATH. RULE_BINDING_IDS: comma-separated Rule Binding ID(s) containing the