diff --git a/.github/workflows/onpush.yml b/.github/workflows/onpush.yml
index d164e4c..9c30a0c 100644
--- a/.github/workflows/onpush.yml
+++ b/.github/workflows/onpush.yml
@@ -12,7 +12,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
- max-parallel: 4
+ max-parallel: 1
env:
DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }}
@@ -33,10 +33,14 @@ jobs:
pipenv run curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh
pipenv run databricks --version
- - name: Package and Deployment
+ - name: Deploy on staging
run: |
- make deploy-dev
+ make deploy-staging
- - name: Run
+ - name: Run on staging
run: |
- make deploy-ci
+ make run-staging
+
+ - name: Deploy on prod
+ run: |
+ make deploy-prod
diff --git a/Makefile b/Makefile
index 5e2f60c..33a1134 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,9 @@
install:
+ python3 -m pip install --upgrade pip
+ pip install pipenv
pipenv install packages
pipenv run pytest tests/
- pipenv shell
+ pipenv run pip list
pre-commit:
pre-commit autoupdate
@@ -11,9 +13,13 @@ deploy-dev:
python ./scripts/generate_template_workflow.py dev
databricks bundle deploy --target dev
-run-dev:
- databricks bundle run default_python_job --target dev
+deploy-staging:
+ pipenv run python ./scripts/generate_template_workflow.py staging
+ pipenv run databricks bundle deploy --target staging
-deploy-ci:
- pipenv run python ./scripts/generate_template_workflow.py ci
- pipenv run databricks bundle deploy --target ci
+run-staging:
+ pipenv run databricks bundle run default_python_job --target staging
+
+deploy-prod:
+ pipenv run python ./scripts/generate_template_workflow.py prod
+ pipenv run databricks bundle deploy --target prod
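Note: the new staging and prod targets both shell out to scripts/generate_template_workflow.py, which is not shown in this diff. The sketch below is only an illustration of what such a script might do, assuming it renders conf/wf_template.yml with Jinja2 into the workflow file that the bundle deploys; the output path and the render variables are assumptions.

```python
# Hypothetical sketch; the real scripts/generate_template_workflow.py is not part of this
# diff, so the output path and template variables here are assumptions.
import sys
from pathlib import Path

from jinja2 import Template


def main() -> None:
    # The Makefile passes the target environment as the first argument (dev/staging/prod).
    env = sys.argv[1] if len(sys.argv) > 1 else "dev"
    template_text = Path("conf/wf_template.yml").read_text()
    rendered = Template(template_text).render(env=env)
    # The rendered workflow is what `databricks bundle deploy --target <env>` picks up.
    Path("conf/workflow.yml").write_text(rendered)


if __name__ == "__main__":
    main()
```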
diff --git a/Pipfile b/Pipfile
index f3a4ed0..f373f0d 100644
--- a/Pipfile
+++ b/Pipfile
@@ -5,12 +5,10 @@ name = "pypi"
[packages]
funcy = "==2.0"
-packages = "*"
numpy = "==1.23.5"
pandas = "==1.5.3"
pyarrow = "8.0.0"
pydantic = "==2.7.4"
-unidecode = "==1.3.8"
wheel = "==0.44.0"
coverage = "==7.6.1"
setuptools = "==72.1.0"
@@ -19,6 +17,7 @@ pytest = "==8.3.2"
jinja2 = "==3.1.4"
pyspark = "==3.5.1"
pytest-cov = "==5.0.0"
+packages = "*"
[dev-packages]
diff --git a/README.md b/README.md
index 40b7168..2ff862c 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,14 @@
-# Project Template for Spark/Databricks with Python packaging and CI/CD automation
+# Databricks template project with Asset Bundles, Python packaging and CI/CD automation
This project template provides a structured approach to enhance your productivity when delivering ETL pipelines on Databricks. Feel free to customize it based on your project's specific nuances and the audience you are targeting.
This project template demonstrates how to:
- structure your PySpark code inside classes/packages.
-- package your code and move it on different environments on a CI/CD pipeline.
+- package your code and promote it across different environments (dev, staging, prod) in a CI/CD pipeline.
- configure your workflow to run in different environments with different configurations using the [jinja package](https://pypi.org/project/jinja2/).
+- configure your workflow to selectively run tasks, preventing collisions and interference between developers working in parallel.
- use a [medallion architecture](https://www.databricks.com/glossary/medallion-architecture) pattern, improving data quality as it goes through more refinement.
- use a Makefile to automate repetitive tasks on the local env.
- lint and format the code with [ruff](https://docs.astral.sh/ruff/) and [pre-commit](https://pre-commit.com/).
@@ -16,11 +17,12 @@ This project template demonstrates how to:
- utilize [pytest package](https://pypi.org/project/pytest/) to run unit tests on transformations.
- utilize [argparse package](https://pypi.org/project/argparse/) to build a flexible command line interface to start your jobs.
- utilize [funcy package](https://pypi.org/project/funcy/) to log the execution time of each transformation.
-- utilize [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/index.html) and (the new!!!) [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to package/deploy/run a Python wheel package on Databricks.
+- utilize [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/index.html) and [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to package/deploy/run a Python wheel package on Databricks.
- utilize [Databricks SDK for Python](https://docs.databricks.com/en/dev-tools/sdk-python.html) to manage workspaces and accounts. The sdk_system_tables.py script enables your metastore's system tables, which hold [relevant data about billing, usage, lineage, prices, and access](https://www.youtube.com/watch?v=LcRWHzk8Wm4).
- utilize [Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog) instead of Hive as your data catalog and get data lineage for your tables and columns for free, plus a simplified permission model for your data.
- utilize [Databricks Workflows](https://docs.databricks.com/en/workflows/index.html) to execute a DAG and [task parameters](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) to share context information between tasks (see [Task Parameters section](#task-parameters)). Yes, you don't need Airflow to manage your DAGs here!!!
- utilize [Databricks job clusters](https://docs.databricks.com/en/workflows/jobs/use-compute.html#use-databricks-compute-with-your-jobs) to reduce costs.
+- define job clusters for AWS and Azure in the workflow template.
- execute a CI/CD pipeline with [Github Actions](https://docs.github.com/en/actions) after a repo push.
For a debate about the use of notebooks x Python packages, please refer to:
@@ -74,12 +76,17 @@ For a debate about the use of notebooks x Python packages, please refer to:
# Instructions
-### 1) install and configure Databricks CLI
+### 1) (optional) create a Databricks Workspace with Terraform
+
+Follow instructions [here](https://github.com/databricks/terraform-databricks-examples)
+
+
+### 2) install and configure Databricks CLI on your local machine
Follow instructions [here](https://docs.databricks.com/en/dev-tools/cli/install.html)
-### 2) build python env and execute unit tests
+### 3) build python env and execute unit tests on your local machine
make install
@@ -87,24 +94,17 @@ You can also execute unit tests from your preferred IDE. Here's a screenshot fro
-### 3) deploy and execute on dev and prod workspaces.
+### 4) deploy and execute on dev workspace.
Update "job_clusters" properties on wf_template.yml file. There are different properties for AWS and Azure.
make deploy-dev
-### 4) configure CI/CD automation
+### 5) configure CI/CD automation
Configure [Github Actions repository secrets](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions) DATABRICKS_HOST and DATABRICKS_TOKEN.
-### 5) enable system tables on Catalog Explorer
-
- python sdk_system_tables.py
-
-
-... and now you can code the transformations for each task and run unit and integration tests.
-
# Task parameters
diff --git a/conf/wf_template.yml b/conf/wf_template.yml
index 27bee7c..f90be8d 100644
--- a/conf/wf_template.yml
+++ b/conf/wf_template.yml
@@ -17,7 +17,7 @@ resources:
tasks:
- task_key: extract_source1
- job_cluster_key: cluster-dev
+ job_cluster_key: cluster-dev-aws
max_retries: 0
python_wheel_task:
package_name: template
@@ -29,7 +29,7 @@ resources:
- whl: ../dist/*.whl
- task_key: extract_source2
- job_cluster_key: cluster-dev
+ job_cluster_key: cluster-dev-aws
max_retries: 0
python_wheel_task:
package_name: template
@@ -44,7 +44,7 @@ resources:
depends_on:
- task_key: extract_source1
- task_key: extract_source2
- job_cluster_key: cluster-dev
+ job_cluster_key: cluster-dev-aws
max_retries: 0
python_wheel_task:
package_name: template
@@ -58,7 +58,7 @@ resources:
- task_key: generate_orders_agg
depends_on:
- task_key: generate_orders
- job_cluster_key: cluster-dev
+ job_cluster_key: cluster-dev-aws
max_retries: 0
python_wheel_task:
package_name: template
@@ -70,12 +70,26 @@ resources:
- whl: ../dist/*.whl
job_clusters:
- - job_cluster_key: cluster-dev
+ # - job_cluster_key: cluster-dev-azure
+ # new_cluster:
+ # spark_version: 15.3.x-scala2.12
+ # node_type_id: Standard_D8as_v5
+ # num_workers: 1
+ # azure_attributes:
+ # first_on_demand: 1
+ # availability: SPOT_AZURE
+ # data_security_mode: SINGLE_USER
+
+ - job_cluster_key: cluster-dev-aws
new_cluster:
- spark_version: 15.3.x-scala2.12
- node_type_id: Standard_D8as_v5
- num_workers: 2
- azure_attributes:
+ spark_version: 14.2.x-scala2.12
+ node_type_id: c5d.xlarge
+ num_workers: 1
+ aws_attributes:
first_on_demand: 1
- availability: SPOT_AZURE
- data_security_mode: SINGLE_USER
+ availability: SPOT_WITH_FALLBACK
+ zone_id: auto
+ spot_bid_price_percent: 100
+ ebs_volume_count: 0
+ policy_id: 001934F3ABD02D4A
+ data_security_mode: SINGLE_USER
diff --git a/conf/workflow.yml b/conf/workflow.yml
deleted file mode 100644
index abeffca..0000000
--- a/conf/workflow.yml
+++ /dev/null
@@ -1,75 +0,0 @@
-# The main job for default_python
-resources:
- jobs:
-
- default_python_job:
- name: data_reporting_${bundle.target}
- timeout_seconds: 3600
-
-
-
- tasks:
-
- - task_key: extract_source1
- job_cluster_key: cluster-dev
- max_retries: 0
- python_wheel_task:
- package_name: template
- entry_point: main
- parameters: ["--task={{task.name}}",
- "--env=${bundle.target}",
- "${var.debug}"]
- libraries:
- - whl: ../dist/*.whl
-
- - task_key: extract_source2
- job_cluster_key: cluster-dev
- max_retries: 0
- python_wheel_task:
- package_name: template
- entry_point: main
- parameters: ["--task={{task.name}}",
- "--env=${bundle.target}",
- "${var.debug}"]
- libraries:
- - whl: ../dist/*.whl
-
- - task_key: generate_orders
- depends_on:
- - task_key: extract_source1
- - task_key: extract_source2
- job_cluster_key: cluster-dev
- max_retries: 0
- python_wheel_task:
- package_name: template
- entry_point: main
- parameters: ["--task={{task.name}}",
- "--env=${bundle.target}",
- "${var.debug}"]
- libraries:
- - whl: ../dist/*.whl
-
- - task_key: generate_orders_agg
- depends_on:
- - task_key: generate_orders
- job_cluster_key: cluster-dev
- max_retries: 0
- python_wheel_task:
- package_name: template
- entry_point: main
- parameters: ["--task={{task.name}}",
- "--env=${bundle.target}",
- "${var.debug}"]
- libraries:
- - whl: ../dist/*.whl
-
- job_clusters:
- - job_cluster_key: cluster-dev
- new_cluster:
- spark_version: 15.3.x-scala2.12
- node_type_id: Standard_D8as_v5
- num_workers: 2
- azure_attributes:
- first_on_demand: 1
- availability: SPOT_AZURE
- data_security_mode: SINGLE_USER
diff --git a/databricks.yml b/databricks.yml
index 9cf2cce..0a90078 100644
--- a/databricks.yml
+++ b/databricks.yml
@@ -27,13 +27,15 @@ targets:
default: true
workspace:
profile: dev
+ run_as:
+ user_name: user.two@domain.com
# Optionally, there could be a 'staging' target here.
# (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.)
- #
- # staging:
- # workspace:
- # host: https://myworkspace.databricks.com
+
+ staging:
+ workspace:
+ profile: dev
# The 'prod' target, used for production deployment.
prod:
@@ -49,4 +51,4 @@ targets:
# This runs as username@company.com in production. Alternatively,
# a service principal could be used here using service_principal_name
# (see Databricks documentation).
- user_name: username@company.com
+ user_name: user.two@domain.com
diff --git a/docs/ci_cd.drawio b/docs/ci_cd.drawio
new file mode 100755
index 0000000..e029a49
--- /dev/null
+++ b/docs/ci_cd.drawio
@@ -0,0 +1,79 @@
+[draw.io XML for the CI/CD pipeline diagram: 79 lines of diagram markup not reproduced in this excerpt]
diff --git a/docs/ci_cd.png b/docs/ci_cd.png
new file mode 100644
index 0000000..57b9956
Binary files /dev/null and b/docs/ci_cd.png differ
diff --git a/src/template/config.ini b/src/template/config.ini
index c7883cd..2eb90df 100644
--- a/src/template/config.ini
+++ b/src/template/config.ini
@@ -1,4 +1,4 @@
-[andre.f.salvati@gmail.com]
+[user.two@domain.com]
extract_source1 = true
extract_source2 = true
generate_orders = true
diff --git a/src/template/config.py b/src/template/config.py
index 75f1c5b..e20cce4 100644
--- a/src/template/config.py
+++ b/src/template/config.py
@@ -75,7 +75,7 @@ def skip_task(self):
):
print("Skipped with config file for 'dev' and 'ci' envs.")
return True
- elif self.params["env"] in ("stag", "prod") and self.in_table_for_skip(self.params["task"]):
+ elif self.params["env"] in ("staging", "prod") and self.in_table_for_skip(self.params["task"]):
print("Skipped with config table for 'prod' env.")
return True
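The literal change above matters because each workflow task passes `--env=${bundle.target}` (see conf/wf_template.yml), so only real bundle target names ever reach skip_task(). A tiny sanity check of that reasoning, with the target names taken from databricks.yml in this diff:

```python
# --env is populated from ${bundle.target}, so only bundle target names reach skip_task().
BUNDLE_TARGETS = {"dev", "staging", "prod"}   # targets defined in databricks.yml after this change

assert "stag" not in BUNDLE_TARGETS           # old literal: the table-based skip never fired for staging
assert {"staging", "prod"} <= BUNDLE_TARGETS  # new literals match real targets
```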
diff --git a/src/template/main.py b/src/template/main.py
index 7e3a86c..552ab56 100644
--- a/src/template/main.py
+++ b/src/template/main.py
@@ -10,7 +10,7 @@
def arg_parser():
parser = argparse.ArgumentParser()
- parser.add_argument("--env", required=True, choices=["dev", "ci", "prod"])
+ parser.add_argument("--env", required=True, choices=["dev", "staging", "prod"])
parser.add_argument("--default_schema")
parser.add_argument(
"--task",