diff --git a/.github/workflows/onpush.yml b/.github/workflows/onpush.yml
index d164e4c..9c30a0c 100644
--- a/.github/workflows/onpush.yml
+++ b/.github/workflows/onpush.yml
@@ -12,7 +12,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
-      max-parallel: 4
+      max-parallel: 1

     env:
       DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }}
@@ -33,10 +33,14 @@ jobs:
          pipenv run curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh
          pipenv run databricks --version

-      - name: Package and Deployment
+      - name: Deploy on staging
        run: |
-          make deploy-dev
+          make deploy-staging

-      - name: Run
+      - name: Run on staging
        run: |
-          make deploy-ci
+          make run-staging
+
+      - name: Deploy on prod
+        run: |
+          make deploy-prod
diff --git a/Makefile b/Makefile
index 5e2f60c..33a1134 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,9 @@
 install:
+    python3 -m pip install --upgrade pip
+    pip install pipenv
     pipenv install packages
     pipenv run pytest tests/
-    pipenv shell
+    pipenv run pip list

 pre-commit:
     pre-commit autoupdate
@@ -11,9 +13,13 @@ deploy-dev:
     python ./scripts/generate_template_workflow.py dev
     databricks bundle deploy --target dev

-run-dev:
-    databricks bundle run default_python_job --target dev
+deploy-staging:
+    pipenv run python ./scripts/generate_template_workflow.py staging
+    pipenv run databricks bundle deploy --target staging

-deploy-ci:
-    pipenv run python ./scripts/generate_template_workflow.py ci
-    pipenv run databricks bundle deploy --target ci
+run-staging:
+    pipenv run databricks bundle run default_python_job --target staging
+
+deploy-prod:
+    pipenv run python ./scripts/generate_template_workflow.py prod
+    pipenv run databricks bundle deploy --target prod
diff --git a/Pipfile b/Pipfile
index f3a4ed0..f373f0d 100644
--- a/Pipfile
+++ b/Pipfile
@@ -5,12 +5,10 @@ name = "pypi"

 [packages]
 funcy = "==2.0"
-packages = "*"
 numpy = "==1.23.5"
 pandas = "==1.5.3"
 pyarrow = "8.0.0"
 pydantic = "==2.7.4"
-unidecode = "==1.3.8"
 wheel = "==0.44.0"
 coverage = "==7.6.1"
 setuptools = "==72.1.0"
@@ -19,6 +17,7 @@ pytest = "==8.3.2"
 jinja2 = "==3.1.4"
 pyspark = "==3.5.1"
 pytest-cov = "==5.0.0"
+packages = "*"

 [dev-packages]
diff --git a/README.md b/README.md
index 40b7168..2ff862c 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,14 @@
-# Project Template for Spark/Databricks with Python packaging and CI/CD automation
+# Databricks template project with Asset Bundles, Python packaging and CI/CD automation

 This project template provides a structured approach to enhance your productivity when delivering ETL pipelines on Databricks. Feel free to customize it based on your project's specific nuances and the audience you are targeting.

 This project template demonstrates how to:

 - structure your PySpark code inside classes/packages.
-- package your code and move it on different environments on a CI/CD pipeline.
+- package your code and move it across different environments (dev, staging, prod) in a CI/CD pipeline.
 - configure your workflow to run in different environments with different configurations with [jinja package](https://pypi.org/project/jinja2/)
+- configure your workflow to selectively run tasks, preventing collisions and interference between developers working in parallel.
 - use a [medallion architecure](https://www.databricks.com/glossary/medallion-architecture) pattern by improving the data quality as it goes trought more refinement.
 - use a Make file to automate repetitive tasks on local env.
 - lint and format the code with [ruff](https://docs.astral.sh/ruff/) and [pre-commit](https://pre-commit.com/).
@@ -16,11 +17,12 @@ This project template demonstrates how to:
 - utilize [pytest package](https://pypi.org/project/pytest/) to run unit tests on transformations.
 - utilize [argparse package](https://pypi.org/project/argparse/) to build a flexible command line interface to start your jobs.
 - utilize [funcy package](https://pypi.org/project/funcy/) to log the execution time of each transformation.
-- utilize [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/index.html) and (the new!!!) [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to package/deploy/run a Python wheel package on Databricks.
+- utilize [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/index.html) and [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to package/deploy/run a Python wheel package on Databricks.
 - utilize [Databricks SDK for Python](https://docs.databricks.com/en/dev-tools/sdk-python.html) to manage workspaces and accounts. This script enables your metastore system tables that have [relevant data about billing, usage, lineage, prices, and access](https://www.youtube.com/watch?v=LcRWHzk8Wm4).
 - utilize [Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog) instead of Hive as your data catalog and earn for free data lineage for your tables and columns and a simplified permission model for your data.
 - utilize [Databricks Workflows](https://docs.databricks.com/en/workflows/index.html) to execute a DAG and [task parameters](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) to share context information between tasks (see [Task Parameters section](#task-parameters)). Yes, you don't need Airflow to manage your DAGs here!!!
 - utilize [Databricks job clusters](https://docs.databricks.com/en/workflows/jobs/use-compute.html#use-databricks-compute-with-your-jobs) to reduce costs.
+- define clusters on AWS and Azure.
 - execute a CI/CD pipeline with [Github Actions](https://docs.github.com/en/actions) after a repo push.

 For a debate about the use of notebooks x Python packages, please refer to:
@@ -74,12 +76,17 @@
 # Instructions

-### 1) install and configure Databricks CLI
+### 1) (optional) create a Databricks Workspace with Terraform
+
+Follow instructions [here](https://github.com/databricks/terraform-databricks-examples)
+
+
+### 2) install and configure Databricks CLI on your local machine

 Follow instructions [here](https://docs.databricks.com/en/dev-tools/cli/install.html)

-### 2) build python env and execute unit tests
+### 3) build python env and execute unit tests on your local machine

     make install

@@ -87,24 +94,17 @@ You can also execute unit tests from your preferred IDE. Here's a screenshot fro
-### 3) deploy and execute on dev and prod workspaces.
+### 4) deploy and execute on dev workspace.

 Update "job_clusters" properties on wf_template.yml file. There are different properties for AWS and Azure.

     make deploy-dev

-### 4) configure CI/CD automation
+### 5) configure CI/CD automation

 Configure [Github Actions repository secrets](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions) DATABRICKS_HOST and DATABRICKS_TOKEN.

-### 5) enable system tables on Catalog Explorer
-
-    python sdk_system_tables.py
-
-
-... and now you can code the transformations for each task and run unit and integration tests.
-

 # Task parameters

diff --git a/conf/wf_template.yml b/conf/wf_template.yml
index 27bee7c..f90be8d 100644
--- a/conf/wf_template.yml
+++ b/conf/wf_template.yml
@@ -17,7 +17,7 @@ resources:
       tasks:
         - task_key: extract_source1
-          job_cluster_key: cluster-dev
+          job_cluster_key: cluster-dev-aws
           max_retries: 0
           python_wheel_task:
             package_name: template
@@ -29,7 +29,7 @@
             - whl: ../dist/*.whl

         - task_key: extract_source2
-          job_cluster_key: cluster-dev
+          job_cluster_key: cluster-dev-aws
           max_retries: 0
           python_wheel_task:
             package_name: template
@@ -44,7 +44,7 @@
           depends_on:
             - task_key: extract_source1
             - task_key: extract_source2
-          job_cluster_key: cluster-dev
+          job_cluster_key: cluster-dev-aws
           max_retries: 0
           python_wheel_task:
             package_name: template
@@ -58,7 +58,7 @@
         - task_key: generate_orders_agg
           depends_on:
             - task_key: generate_orders
-          job_cluster_key: cluster-dev
+          job_cluster_key: cluster-dev-aws
           max_retries: 0
           python_wheel_task:
             package_name: template
@@ -70,12 +70,26 @@
             - whl: ../dist/*.whl

       job_clusters:
-        - job_cluster_key: cluster-dev
+        # - job_cluster_key: cluster-dev-azure
+        #   new_cluster:
+        #     spark_version: 15.3.x-scala2.12
+        #     node_type_id: Standard_D8as_v5
+        #     num_workers: 1
+        #     azure_attributes:
+        #       first_on_demand: 1
+        #       availability: SPOT_AZURE
+        #   data_security_mode: SINGLE_USER
+
+        - job_cluster_key: cluster-dev-aws
           new_cluster:
-            spark_version: 15.3.x-scala2.12
-            node_type_id: Standard_D8as_v5
-            num_workers: 2
-            azure_attributes:
+            spark_version: 14.2.x-scala2.12
+            node_type_id: c5d.xlarge
+            num_workers: 1
+            aws_attributes:
               first_on_demand: 1
-              availability: SPOT_AZURE
-          data_security_mode: SINGLE_USER
+              availability: SPOT_WITH_FALLBACK
+              zone_id: auto
+              spot_bid_price_percent: 100
+              ebs_volume_count: 0
+          policy_id: 001934F3ABD02D4A
+          data_security_mode: SINGLE_USER
diff --git a/conf/workflow.yml b/conf/workflow.yml
deleted file mode 100644
index abeffca..0000000
--- a/conf/workflow.yml
+++ /dev/null
@@ -1,75 +0,0 @@
-# The main job for default_python
-resources:
-  jobs:
-
-    default_python_job:
-      name: data_reporting_${bundle.target}
-      timeout_seconds: 3600
-
-
-
-      tasks:
-
-        - task_key: extract_source1
-          job_cluster_key: cluster-dev
-          max_retries: 0
-          python_wheel_task:
-            package_name: template
-            entry_point: main
-            parameters: ["--task={{task.name}}",
-                         "--env=${bundle.target}",
-                         "${var.debug}"]
-          libraries:
-            - whl: ../dist/*.whl
-
-        - task_key: extract_source2
-          job_cluster_key: cluster-dev
-          max_retries: 0
-          python_wheel_task:
-            package_name: template
-            entry_point: main
-            parameters: ["--task={{task.name}}",
-                         "--env=${bundle.target}",
-                         "${var.debug}"]
-          libraries:
-            - whl: ../dist/*.whl
-
-        - task_key: generate_orders
-          depends_on:
-            - task_key: extract_source1
-            - task_key: extract_source2
-          job_cluster_key: cluster-dev
-          max_retries: 0
-          python_wheel_task:
-            package_name: template
-            entry_point: main
-            parameters: ["--task={{task.name}}",
-                         "--env=${bundle.target}",
-                         "${var.debug}"]
-          libraries:
-            - whl: ../dist/*.whl
-
-        - task_key: generate_orders_agg
-          depends_on:
-            - task_key: generate_orders
-          job_cluster_key: cluster-dev
-          max_retries: 0
-          python_wheel_task:
-            package_name: template
-            entry_point: main
-            parameters: ["--task={{task.name}}",
-                         "--env=${bundle.target}",
-                         "${var.debug}"]
-          libraries:
-            - whl: ../dist/*.whl
-
-      job_clusters:
-        - job_cluster_key: cluster-dev
-          new_cluster:
-            spark_version: 15.3.x-scala2.12
-            node_type_id: Standard_D8as_v5
-            num_workers: 2
-            azure_attributes:
-              first_on_demand: 1
-              availability: SPOT_AZURE
-          data_security_mode: SINGLE_USER
diff --git a/databricks.yml b/databricks.yml
index 9cf2cce..0a90078 100644
--- a/databricks.yml
+++ b/databricks.yml
@@ -27,13 +27,15 @@
     default: true
     workspace:
       profile: dev
+    run_as:
+      user_name: user.two@domain.com

   # Optionally, there could be a 'staging' target here.
   # (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.)
-  #
-  # staging:
-  #   workspace:
-  #     host: https://myworkspace.databricks.com
+
+  staging:
+    workspace:
+      profile: dev

   # The 'prod' target, used for production deployment.
   prod:
@@ -49,4 +51,4 @@
       # This runs as username@company.com in production. Alternatively,
       # a service principal could be used here using service_principal_name
       # (see Databricks documentation).
-      user_name: username@company.com
+      user_name: user.two@domain.com
diff --git a/docs/ci_cd.drawio b/docs/ci_cd.drawio
new file mode 100755
index 0000000..e029a49
--- /dev/null
+++ b/docs/ci_cd.drawio
@@ -0,0 +1,79 @@
+[draw.io XML omitted]
\ No newline at end of file
diff --git a/docs/ci_cd.png b/docs/ci_cd.png
new file mode 100644
index 0000000..57b9956
Binary files /dev/null and b/docs/ci_cd.png differ
diff --git a/src/template/config.ini b/src/template/config.ini
index c7883cd..2eb90df 100644
--- a/src/template/config.ini
+++ b/src/template/config.ini
@@ -1,4 +1,4 @@
-[andre.f.salvati@gmail.com]
+[user.two@domain.com]
 extract_source1 = true
 extract_source2 = true
 generate_orders = true
diff --git a/src/template/config.py b/src/template/config.py
index 75f1c5b..e20cce4 100644
--- a/src/template/config.py
+++ b/src/template/config.py
@@ -75,7 +75,7 @@ def skip_task(self):
         ):
             print("Skipped with config file for 'dev' and 'ci' envs.")
             return True
-        elif self.params["env"] in ("stag", "prod") and self.in_table_for_skip(self.params["task"]):
+        elif self.params["env"] in ("staging", "prod") and self.in_table_for_skip(self.params["task"]):
             print("Skipped with config table for 'prod' env.")
             return True
diff --git a/src/template/main.py b/src/template/main.py
index 7e3a86c..552ab56 100644
--- a/src/template/main.py
+++ b/src/template/main.py
@@ -10,7 +10,7 @@ def arg_parser():
     parser = argparse.ArgumentParser()

-    parser.add_argument("--env", required=True, choices=["dev", "ci", "prod"])
+    parser.add_argument("--env", required=True, choices=["dev", "staging", "prod"])
     parser.add_argument("--default_schema")
     parser.add_argument(
         "--task",