14 changes: 9 additions & 5 deletions .github/workflows/onpush.yml
@@ -12,7 +12,7 @@ jobs:

runs-on: ubuntu-latest
strategy:
max-parallel: 4
max-parallel: 1

env:
DATABRICKS_HOST: ${{ secrets.DATABRICKS_HOST }}
@@ -33,10 +33,14 @@ jobs:
pipenv run curl -fsSL https://raw.githubusercontent.com/databricks/setup-cli/main/install.sh | sh
pipenv run databricks --version

- name: Package and Deployment
- name: Deploy on staging
run: |
make deploy-dev
make deploy-staging

- name: Run
- name: Run on staging
run: |
make deploy-ci
make run-staging

- name: Deploy on prod
run: |
make deploy-prod
18 changes: 12 additions & 6 deletions Makefile
@@ -1,7 +1,9 @@
install:
python3 -m pip install --upgrade pip
pip install pipenv
pipenv install packages
pipenv run pytest tests/
pipenv shell
pipenv run pip list

pre-commit:
pre-commit autoupdate
@@ -11,9 +13,13 @@ deploy-dev:
python ./scripts/generate_template_workflow.py dev
databricks bundle deploy --target dev

run-dev:
databricks bundle run default_python_job --target dev
deploy-staging:
pipenv run python ./scripts/generate_template_workflow.py staging
pipenv run databricks bundle deploy --target staging

deploy-ci:
pipenv run python ./scripts/generate_template_workflow.py ci
pipenv run databricks bundle deploy --target ci
run-staging:
pipenv run databricks bundle run default_python_job --target staging

deploy-prod:
pipenv run python ./scripts/generate_template_workflow.py prod
pipenv run databricks bundle deploy --target prod
3 changes: 1 addition & 2 deletions Pipfile
@@ -5,12 +5,10 @@ name = "pypi"

[packages]
funcy = "==2.0"
packages = "*"
numpy = "==1.23.5"
pandas = "==1.5.3"
pyarrow = "8.0.0"
pydantic = "==2.7.4"
unidecode = "==1.3.8"
wheel = "==0.44.0"
coverage = "==7.6.1"
setuptools = "==72.1.0"
@@ -19,6 +17,7 @@ pytest = "==8.3.2"
jinja2 = "==3.1.4"
pyspark = "==3.5.1"
pytest-cov = "==5.0.0"
packages = "*"

[dev-packages]

28 changes: 14 additions & 14 deletions README.md
@@ -1,13 +1,14 @@

# Project Template for Spark/Databricks with Python packaging and CI/CD automation
# Databricks template project with Asset Bundles, Python packaging and CI/CD automation

This project template provides a structured approach to enhance your productivity when delivering ETL pipelines on Databricks. Feel free to customize it based on your project's specific nuances and the audience you are targeting.

This project template demonstrates how to:

- structure your PySpark code inside classes/packages.
- package your code and move it on different environments on a CI/CD pipeline.
- package your code and promote it across different environments (dev, staging, prod) in a CI/CD pipeline.
- configure your workflow to run in each environment with an environment-specific configuration using the [jinja2 package](https://pypi.org/project/jinja2/) (a minimal sketch follows this list).
- configure your workflow to selectively run tasks, preventing collisions and interference between developers working in parallel.
- use a [medallion architecture](https://www.databricks.com/glossary/medallion-architecture) pattern, improving data quality as it goes through each layer of refinement.
- use a Makefile to automate repetitive tasks in the local environment.
- lint and format the code with [ruff](https://docs.astral.sh/ruff/) and [pre-commit](https://pre-commit.com/).
@@ -16,11 +17,12 @@ This project template demonstrates how to:
- utilize [pytest package](https://pypi.org/project/pytest/) to run unit tests on transformations.
- utilize [argparse package](https://pypi.org/project/argparse/) to build a flexible command line interface to start your jobs.
- utilize [funcy package](https://pypi.org/project/funcy/) to log the execution time of each transformation.
- utilize [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/index.html) and (the new!!!) [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to package/deploy/run a Python wheel package on Databricks.
- utilize [Databricks CLI](https://docs.databricks.com/en/dev-tools/cli/index.html) and [Databricks Asset Bundles](https://docs.databricks.com/en/dev-tools/bundles/index.html) to package/deploy/run a Python wheel package on Databricks.
- utilize the [Databricks SDK for Python](https://docs.databricks.com/en/dev-tools/sdk-python.html) to manage workspaces and accounts. The included script enables the metastore system tables, which hold [relevant data about billing, usage, lineage, prices, and access](https://www.youtube.com/watch?v=LcRWHzk8Wm4).
- utilize [Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog) instead of Hive as your data catalog and get, for free, data lineage for your tables and columns plus a simplified permission model for your data.
- utilize [Databricks Workflows](https://docs.databricks.com/en/workflows/index.html) to execute a DAG and [task parameters](https://docs.databricks.com/en/workflows/jobs/parameter-value-references.html) to share context information between tasks (see [Task Parameters section](#task-parameters)). Yes, you don't need Airflow to manage your DAGs here!!!
- utilize [Databricks job clusters](https://docs.databricks.com/en/workflows/jobs/use-compute.html#use-databricks-compute-with-your-jobs) to reduce costs.
- define clusters on AWS and Azure.
- execute a CI/CD pipeline with [GitHub Actions](https://docs.github.com/en/actions) after a repo push.
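
To illustrate the Jinja-based approach, here is a minimal, hypothetical sketch of what a script like `scripts/generate_template_workflow.py` could do; the template variables, catalog names, and output path below are assumptions, not the template's actual interface. The Make targets call this script with the environment name (dev, staging, ci, prod) as its first argument.

```python
# Hypothetical sketch (the repo's scripts/generate_template_workflow.py may differ):
# render conf/wf_template.yml with environment-specific values before `databricks bundle deploy`.
import sys
from pathlib import Path

from jinja2 import Environment, FileSystemLoader

# Assumed per-environment settings; the real template may expose different variables.
ENV_CONFIG = {
    "dev": {"catalog": "dev_catalog", "num_workers": 1},
    "staging": {"catalog": "staging_catalog", "num_workers": 1},
    "prod": {"catalog": "prod_catalog", "num_workers": 2},
}


def main() -> None:
    target = sys.argv[1] if len(sys.argv) > 1 else "dev"
    env = Environment(loader=FileSystemLoader("conf"))
    template = env.get_template("wf_template.yml")             # assumed template location
    rendered = template.render(target=target, **ENV_CONFIG[target])
    Path("conf/workflow_generated.yml").write_text(rendered)   # assumed output file
    print(f"Rendered workflow for target '{target}'")


if __name__ == "__main__":
    main()
```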

For a debate about the use of notebooks x Python packages, please refer to:
@@ -74,37 +76,35 @@ For a debate about the use of notebooks x Python packages, please refer to:

# Instructions

### 1) install and configure Databricks CLI
### 1) (optional) create a Databricks Workspace with Terraform

Follow instructions [here](https://github.com/databricks/terraform-databricks-examples)


### 2) install and configure Databricks CLI on your local machine

Follow instructions [here](https://docs.databricks.com/en/dev-tools/cli/install.html)


### 2) build python env and execute unit tests
### 3) build the Python env and execute unit tests on your local machine

make install

You can also execute unit tests from your preferred IDE. Here's a screenshot from [VS Code](https://code.visualstudio.com/) with [Microsoft's Python extension](https://marketplace.visualstudio.com/items?itemName=ms-python.python) installed.

<img src="docs/vscode.png" width="30%" height="30%">

### 3) deploy and execute on dev and prod workspaces.
### 4) deploy and execute on the dev workspace

Update "job_clusters" properties on wf_template.yml file. There are different properties for AWS and Azure.

make deploy-dev


### 4) configure CI/CD automation
### 5) configure CI/CD automation

Configure the [GitHub Actions repository secrets](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions) DATABRICKS_HOST and DATABRICKS_TOKEN.

### 6) enable system tables on Catalog Explorer

python sdk_system_tables.py
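
A rough sketch of what `sdk_system_tables.py` might look like, assuming it uses the workspace-level `system_schemas` API of the Databricks SDK for Python; the exact schema list and error handling in the real script may differ.

```python
# Hypothetical sketch of sdk_system_tables.py (the repo's script may differ):
# enable Unity Catalog system table schemas for the current metastore.
from databricks.sdk import WorkspaceClient
from databricks.sdk.errors import DatabricksError

w = WorkspaceClient()  # picks up host/token from the configured profile or env vars
metastore_id = w.metastores.current().metastore_id

# Assumed set of schemas to enable; adjust to the ones you actually need.
for schema in ("billing", "access", "lineage"):
    try:
        w.system_schemas.enable(metastore_id=metastore_id, schema_name=schema)
        print(f"enabled system schema: {schema}")
    except DatabricksError as err:
        print(f"could not enable {schema}: {err}")
```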


... and now you can code the transformations for each task and run unit and integration tests.
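
As an illustration of how the pieces fit together, a task entry point could look roughly like the sketch below; the parameter names and table paths are assumptions rather than the template's actual CLI.

```python
# Hypothetical task entry point (the template's real CLI may differ):
# parse task parameters passed by the Databricks job and time each transformation.
import argparse

from funcy import log_durations
from pyspark.sql import DataFrame, SparkSession


@log_durations(print)  # logs how long the transformation takes
def generate_orders(spark: SparkSession, catalog: str) -> DataFrame:
    # Placeholder transformation; a real task would refine bronze data into silver.
    return spark.table(f"{catalog}.bronze.orders").dropDuplicates(["order_id"])


def main() -> None:
    parser = argparse.ArgumentParser(description="Run a single workflow task")
    parser.add_argument("--task", required=True, help="task name, e.g. generate_orders")
    parser.add_argument("--catalog", default="dev_catalog", help="target Unity Catalog name")
    args = parser.parse_args()

    spark = SparkSession.builder.getOrCreate()
    if args.task == "generate_orders":
        df = generate_orders(spark, args.catalog)
        df.write.mode("overwrite").saveAsTable(f"{args.catalog}.silver.orders")


if __name__ == "__main__":
    main()
```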


# Task parameters

36 changes: 25 additions & 11 deletions conf/wf_template.yml
@@ -17,7 +17,7 @@ resources:
tasks:

- task_key: extract_source1
job_cluster_key: cluster-dev
job_cluster_key: cluster-dev-aws
max_retries: 0
python_wheel_task:
package_name: template
@@ -29,7 +29,7 @@
- whl: ../dist/*.whl

- task_key: extract_source2
job_cluster_key: cluster-dev
job_cluster_key: cluster-dev-aws
max_retries: 0
python_wheel_task:
package_name: template
@@ -44,7 +44,7 @@
depends_on:
- task_key: extract_source1
- task_key: extract_source2
job_cluster_key: cluster-dev
job_cluster_key: cluster-dev-aws
max_retries: 0
python_wheel_task:
package_name: template
@@ -58,7 +58,7 @@
- task_key: generate_orders_agg
depends_on:
- task_key: generate_orders
job_cluster_key: cluster-dev
job_cluster_key: cluster-dev-aws
max_retries: 0
python_wheel_task:
package_name: template
@@ -70,12 +70,26 @@ resources:
- whl: ../dist/*.whl

job_clusters:
- job_cluster_key: cluster-dev
# - job_cluster_key: cluster-dev-azure
# new_cluster:
# spark_version: 15.3.x-scala2.12
# node_type_id: Standard_D8as_v5
# num_workers: 1
# azure_attributes:
# first_on_demand: 1
# availability: SPOT_AZURE
# data_security_mode: SINGLE_USER

- job_cluster_key: cluster-dev-aws
new_cluster:
spark_version: 15.3.x-scala2.12
node_type_id: Standard_D8as_v5
num_workers: 2
azure_attributes:
spark_version: 14.2.x-scala2.12
node_type_id: c5d.xlarge
num_workers: 1
aws_attributes:
first_on_demand: 1
availability: SPOT_AZURE
data_security_mode: SINGLE_USER
availability: SPOT_WITH_FALLBACK
zone_id: auto
spot_bid_price_percent: 100
ebs_volume_count: 0
policy_id: 001934F3ABD02D4A
data_security_mode: SINGLE_USER
75 changes: 0 additions & 75 deletions conf/workflow.yml

This file was deleted.

12 changes: 7 additions & 5 deletions databricks.yml
@@ -27,13 +27,15 @@ targets:
default: true
workspace:
profile: dev
run_as:
user_name: [email protected]

# Optionally, there could be a 'staging' target here.
# (See Databricks docs on CI/CD at https://docs.databricks.com/dev-tools/bundles/index.html.)
#
# staging:
# workspace:
# host: https://myworkspace.databricks.com

staging:
workspace:
profile: dev

# The 'prod' target, used for production deployment.
prod:
@@ -49,4 +51,4 @@ targets:
# This runs as [email protected] in production. Alternatively,
# a service principal could be used here using service_principal_name
# (see Databricks documentation).
user_name: username@company.com
user_name: user.two@domain.com