diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..274b436 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,30 @@ +--- +name: Bug report +about: Create a report to help us improve +labels: 'bug' +type: 'Bug' +--- + +## Describe the bug +A clear and concise description of what the bug is. + +## To Reproduce +Steps to reproduce the behavior OR commands run: +1. Go to '...' +2. Click on '....' +3. Enter value '...' +4. See error + +## Expected behavior +A clear and concise description of what you expected to happen. + +## Screenshots +If applicable, add screenshots to help explain your problem. + +## Desktop (please complete the following information): + - OS: [e.g. iOS] + - Browser [e.g. chrome, safari] + - Version [e.g. 22] + +## Additional context +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/epic_task.md b/.github/ISSUE_TEMPLATE/epic_task.md new file mode 100644 index 0000000..c43af1e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/epic_task.md @@ -0,0 +1,14 @@ +--- +name: Epic +about: A larger task consisting of more deliverables +labels: 'epic' +type: 'Epic' +--- + +## Background +A clear and concise intro into the situation. + +## Goal +The goal that epic wants to achieve. + +[Add actionable subtasks or even epics] diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..71b1af1 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,21 @@ +--- +name: Feature request +about: Suggest an idea for this project +labels: 'enhancement' +type: 'Feature' +--- + +## Background +A clear and concise description of where the limitation lies. + +## Feature +A description of the requested feature. + +## Example [Optional] +A simple example if applicable. + +## Proposed Solution [Optional] +Solution Ideas: +1. +2. +3. 
diff --git a/.github/ISSUE_TEMPLATE/operative_task.md b/.github/ISSUE_TEMPLATE/operative_task.md new file mode 100644 index 0000000..fc73981 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/operative_task.md @@ -0,0 +1,10 @@ +--- +name: Operative task +about: Issue template for operational tasks. +labels: 'infrastructure,no RN' +type: 'Task' +--- + +## The task + +Short description of the task. diff --git a/.github/ISSUE_TEMPLATE/spike_task.md b/.github/ISSUE_TEMPLATE/spike_task.md new file mode 100644 index 0000000..05c6dba --- /dev/null +++ b/.github/ISSUE_TEMPLATE/spike_task.md @@ -0,0 +1,34 @@ +--- +name: Spike +about: Issue template for spikes, research and investigation tasks +labels: 'spike' +type: 'Task' +--- + +## Background +A clear and concise description of the problem or a topic we need to understand. + +Feel free to add information about why it's needed and what assumptions you have at the moment. + +## Questions To Answer + +1. +2. +3. + +## Desired Outcome + +The list of desired outcomes of this spike ticket. + +### Tasks +- [ ] Questions have been answered or we have a clearer idea of how to get to our goal +- [ ] Discussion with the team +- [ ] Documentation +- [ ] Create recommendations and new implementation tickets +- [ ] item here.. + +## Additional Info/Resources [Optional] + +1. +2. +3. diff --git a/.github/workflows/cd_scala.yml b/.github/workflows/cd_scala.yml index f308469..9cc122e 100644 --- a/.github/workflows/cd_scala.yml +++ b/.github/workflows/cd_scala.yml @@ -1,3 +1,18 @@ +# Copyright 2024 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + name: CD Scala on: @@ -50,21 +65,3 @@ jobs: with: name: ${{ env.artifact_name }} path: bigfiles/staging - - - name: Aquasec Manifest Generation - run: | - export BILLY_SERVER=https://billy.eu-1.codesec.aquasec.com - curl -sLo install.sh download.codesec.aquasec.com/billy/install.sh - curl -sLo install.sh.checksum https://github.com/argonsecurity/releases/releases/latest/download/install.sh.checksum - if ! cat install.sh.checksum | sha256sum --check; then - echo "install.sh checksum failed" - exit 1 - fi - BINDIR="." sh install.sh - rm install.sh install.sh.checksum - ./billy generate \ - --access-token "${{ secrets.GITHUB_TOKEN }}" \ - --aqua-key "${{ secrets.AQUA_KEY }}" \ - --aqua-secret "${{ secrets.AQUA_SECRET }}" \ - --cspm-url https://eu-1.api.cloudsploit.com \ - --artifact-path "${{ github.workspace }}" \ No newline at end of file diff --git a/.github/workflows/check_pr_release_notes.yml b/.github/workflows/check_pr_release_notes.yml index 26a8dae..0b25ba6 100644 --- a/.github/workflows/check_pr_release_notes.yml +++ b/.github/workflows/check_pr_release_notes.yml @@ -1,3 +1,18 @@ +# Copyright 2024 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + name: Check PR Release Notes in Description on: @@ -16,7 +31,7 @@ jobs: - name: Check presence of release notes in PR description id: check-release-notes - uses: AbsaOSS/release-notes-presence-check@v0.2.1 + uses: AbsaOSS/release-notes-presence-check@v0.3.0 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: diff --git a/.github/workflows/ci_python.yml b/.github/workflows/ci_python.yml index d92d3c7..f80c74b 100644 --- a/.github/workflows/ci_python.yml +++ b/.github/workflows/ci_python.yml @@ -1,3 +1,18 @@ +# Copyright 2024 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + name: CI Python on: diff --git a/.github/workflows/ci_scala.yml b/.github/workflows/ci_scala.yml index acae1f9..5d89cef 100644 --- a/.github/workflows/ci_scala.yml +++ b/.github/workflows/ci_scala.yml @@ -1,3 +1,18 @@ +# Copyright 2024 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + name: CI Scala on: @@ -60,23 +75,6 @@ jobs: script: | core.setFailed('Changed files coverage is less than ${{ env.changed }}%!') - - name: Aquasec Manifest Generation - run: | - export BILLY_SERVER=https://billy.eu-1.codesec.aquasec.com - curl -sLo install.sh download.codesec.aquasec.com/billy/install.sh - curl -sLo install.sh.checksum https://github.com/argonsecurity/releases/releases/latest/download/install.sh.checksum - if ! cat install.sh.checksum | sha256sum --check; then - echo "install.sh checksum failed" - exit 1 - fi - BINDIR="." sh install.sh - rm install.sh install.sh.checksum - ./billy generate \ - --access-token "${{ secrets.GITHUB_TOKEN }}" \ - --aqua-key "${{ secrets.AQUA_KEY }}" \ - --aqua-secret "${{ secrets.AQUA_SECRET }}" \ - --cspm-url https://eu-1.api.cloudsploit.com \ - --artifact-path "${{ github.workspace }}" format-check: name: Format Check diff --git a/.github/workflows/release_draft.yml b/.github/workflows/release_draft.yml index e4c4d79..5c092a1 100644 --- a/.github/workflows/release_draft.yml +++ b/.github/workflows/release_draft.yml @@ -1,3 +1,18 @@ +# Copyright 2024 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + name: Release - create draft release on: workflow_dispatch: diff --git a/.github/workflows/release_publish.yml b/.github/workflows/release_publish.yml new file mode 100644 index 0000000..1a62709 --- /dev/null +++ b/.github/workflows/release_publish.yml @@ -0,0 +1,70 @@ +# Copyright 2024 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: Release - publish artifacts +on: + release: + types: [released] + +jobs: + publish-jar-file: + name: Publish jar file to GitHub Release + runs-on: ubuntu-latest + defaults: + run: + working-directory: bigfiles + strategy: + matrix: + include: + - scala: 2.12.17 + scalaShort: "2.12" + - scala: 2.11.12 + scalaShort: "2.11" + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + persist-credentials: false + fetch-depth: 0 + - uses: coursier/cache-action@v6 + + - name: Setup Scala + uses: olafurpg/setup-scala@v14 + with: + java-version: "adopt@1.8" + + - name: Build jar for Scala ${{ matrix.scala }} + run: sbt ++${{ matrix.scala }} assembly + + - name: Create staging directory + run: mkdir -p staging + + - name: Copy jar to staging + run: | + jar_file=$(ls target/scala-${{ matrix.scalaShort }}/dataset-comparison-*.jar) + base_name=$(basename $jar_file) + cp target/scala-${{ matrix.scalaShort }}/dataset-comparison-*.jar staging/scala-${{ matrix.scalaShort }}$base_name + echo "artifact_name=scala-${{ matrix.scalaShort }}-$base_name" >> $GITHUB_ENV + + - name: Info on publish + run: | + echo "Release: ${{ github.event.release.tag_name }}" + echo "Published file: ${{ env.artifact_name }}" + echo "In repo: ${{ github.repository }}" + + - name: Upload JAR file to GitHub Release + run: gh release upload ${{ github.event.release.tag_name }} bigfiles/staging --repo ${{ github.repository }} + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..f3da1e1 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,36 @@ +# How to Contribute? + +## **Identifying and Reporting Bugs** +* **Ensure the bug has not already been reported** by searching our **[GitHub Issues](https://github.com/AbsaOSS/generate-release-notes/issues)**. +* If you cannot find an open issue describing the problem, use the **Bug report** template to open a new one. 
Tag it with the **bug** label. + +## **Proposing New Features** + +* **Check if the feature has already been requested** by searching through our **[GitHub Issues](https://github.com/AbsaOSS/generate-release-notes/issues)**. +* If the feature request doesn't exist, feel free to create a new one. Tag it with the **enhancement** label. + +## **Contributing to Development** + +* Check _Issues_ logs for the desired feature or bug. Ensure that no one else is already working on it. + * If the feature/bug is not yet filed, please create a detailed issue first: + * **"Detail Your Idea or Issue"** +* Fork the repository. +* Begin coding. Feel free to ask questions and collaborate with us. + * Commit messages should reference the GitHub Issue and provide a concise description: + * **"#34 Implement Feature X"** + * Remember to include tests for your code. +* Once done, push to your fork and submit a Pull Request to our `master` branch: + * Pull Request titles should begin with the GitHub Issue number: + * **"45 Implementing New Analytics Feature"** + * Ensure the Pull Request description clearly outlines your solution. + * Link your PR to the relevant _Issue_. + +### Community and Communication + +If you have any questions or need help, don't hesitate to reach out through our GitHub discussion section. We're here to help! + +#### Thanks! + +Your contributions are invaluable to us. Thank you for being part of the AbsaOSS community and helping us grow and improve! + +The AbsaOSS Team \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..f49a4e1 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. 
+ + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/README.md b/README.md index 6bcff36..8d25688 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,9 @@ Tool for exact comparison two Parquet files. - [What is CPS-Dataset-Comparison?](#what-is-CPS-Dataset-Comparison) + - [Abstract example](#abstract-example) - [Removing noise](#removing-noise) - - [Removing same recors](#removing-same-recors) + - [Removing same records](#removing-same-records) - [Detailed Analyses](#detailed-analyses) - [Project structure](#project-structure) - [bigfiles](#bigfiles) @@ -20,13 +21,27 @@ In this particular solution, we will consider Parquet files as input. 
The tool w ![alt text](images/mainFlow.png) +### Abstract example + +Let's say we have two Parquet files with the following content: +![img.png](images/tables.png) +First, we will remove the first column because it is always different/autogenerated ... +![img_1.png](images/remove_id.png) + +We can see that the first file has 1st and 3rd rows exactly the same as the 2nd and 3rd rows in the second file. So we will remove them. +![img_2.png](images/find_match.png) + +Then we can find the differences between the remaining rows. +![img_3.png](images/find_diff.png) + ### Removing noise Noise removal will not be implemented in the first version. It was decided that this could be implemented afterward if there was a problem with noise columns. But we know some noise columns: Timestamps and Run id. The approach for finding nondeterministic columns (noise columns) will be: Finding which columns are not the same in two Crunch runs (every run is constructed from 2 Crunch runs and one Spark run). + > At first we should compare the schema of both parquet files -### Removing same recors +### Removing same records We have decided not to bother with duplicates so we will remove common rows as described on the following flow chart: ![alt text](images/removeRecords.png) @@ -41,19 +56,19 @@ We have decided to use row by row comparison for detailed analyses. 
We can use m ![alt text](images/analyses.png) -> All charts could be seen on [this Miro board](https://miro.com/app/board/uXjVLaOagec=/?share_link_id=579669188211) - ## Project structure Project is divided into two modules: ### bigfiles + - bigfile is file that does not fit to RAM - module for comparing big files - written in Scala - more about bigfiles module could be found in [bigfiles README](bigfiles/README.md) ### smallfiles + - smallfile is file that fits to RAM - module for comparing small files - written in Python diff --git a/bigfiles/README.md b/bigfiles/README.md index 6a6da41..fe24e5e 100644 --- a/bigfiles/README.md +++ b/bigfiles/README.md @@ -29,7 +29,7 @@ spark-submit target/scala-2.12/dataset-comparison-assembly-1.0.jar -o --inputA --inputB -d Row +spark-submit --class za.co.absa.DatasetComparison --conf "spark.driver.extraJavaOptions=-Dconfig.file=/path/to/application.conf" target/scala-2.12/dataset-comparison-assembly-0.1.0.jar -o --inputA --inputB -d Row ``` `-d Row` is optional parameter for detailed analyses that specifies which analyses to use. Now it can be only `Row`. It will compute detailed analyses if number of different columns is less than 200, you can change this threshold in `src/main/resources/application.conf`. @@ -94,16 +94,16 @@ tutorial [here](https://dev.to/awwsmm/installing-and-running-hadoop-and-spark-on ```xml dfs.name.dir - /Users/AB032MJ/hadoop_temp/name_node + /.../hadoop_temp/name_node dfs.data.dir - /Users/AB032MJ/hadoop_temp/data_node + /.../hadoop_temp/data_node hadoop.tmp.dir - /Users/AB032MJ/hadoop_temp/internal_temp + /.../hadoop_temp/internal_temp @@ -143,7 +143,7 @@ tutorial [here](https://dev.to/awwsmm/installing-and-running-hadoop-and-spark-on ``` Add this into **hadoop-env.sh** - ```export JAVA_HOME="/Users/AB032MJ/.sdkman/candidates/java/8.0.422-amzn"``` + ```export JAVA_HOME="/.../.sdkman/candidates/java/8.0.422-amzn"``` 4. 
create directories by configuration for example: ``` diff --git a/bigfiles/build.sbt b/bigfiles/build.sbt index 22b4156..cb9ce33 100644 --- a/bigfiles/build.sbt +++ b/bigfiles/build.sbt @@ -13,13 +13,13 @@ lazy val supportedScalaVersions = List(scala211, scala212) ThisBuild / version := "0.1.0" ThisBuild / scalaVersion := scala212 -ThisBuild / organization := "africa.absa.cps" +ThisBuild / organization := "za.co.absa" lazy val root = (project in file(".")) .settings( name := "dataset-comparison", crossScalaVersions := supportedScalaVersions, - assembly / mainClass := Some("africa.absa.cps.DatasetComparison"), + assembly / mainClass := Some("za.co.absa.DatasetComparison"), libraryDependencies ++= bigfilesDependencies ++ Seq( "org.apache.spark" %% "spark-core" % sparkVersionForScala(scalaVersion.value) % Provided, "org.apache.spark" %% "spark-sql" % sparkVersionForScala(scalaVersion.value) % Provided, @@ -48,7 +48,7 @@ Test / jacocoReportSettings := JacocoReportSettings( formats = Seq(JacocoReportFormats.HTML, JacocoReportFormats.XML) ) -Test / jacocoExcludes := Seq("africa.absa.cps.DatasetComparison*") +Test / jacocoExcludes := Seq("za.co.absa.DatasetComparison*") ThisBuild / assemblyMergeStrategy := { case PathList("META-INF", xs @ _*) => MergeStrategy.discard diff --git a/bigfiles/images/settings.png b/bigfiles/images/settings.png index 24dc604..2734f2b 100644 Binary files a/bigfiles/images/settings.png and b/bigfiles/images/settings.png differ diff --git a/bigfiles/project/Dependencies.scala b/bigfiles/project/Dependencies.scala index 43807f3..96806a9 100644 --- a/bigfiles/project/Dependencies.scala +++ b/bigfiles/project/Dependencies.scala @@ -1,3 +1,15 @@ +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + import sbt._ import sbt.Keys._ diff --git a/bigfiles/project/plugins.sbt b/bigfiles/project/plugins.sbt index bcb46f4..28851e0 100644 --- a/bigfiles/project/plugins.sbt +++ b/bigfiles/project/plugins.sbt @@ -1,3 +1,15 @@ +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ + addSbtPlugin("org.jetbrains.scala" % "sbt-ide-settings" % "1.1.2") // Plugins to build the server module as a jar file diff --git a/bigfiles/src/main/scala/africa/absa/cps/analysis/ColumnsDiff.scala b/bigfiles/src/main/scala/africa/absa/cps/analysis/ColumnsDiff.scala deleted file mode 100644 index d8e0e67..0000000 --- a/bigfiles/src/main/scala/africa/absa/cps/analysis/ColumnsDiff.scala +++ /dev/null @@ -1,3 +0,0 @@ -package africa.absa.cps.analysis - -case class ColumnsDiff(columnName: String = "N/A", values: Seq[String] = Seq.empty) diff --git a/bigfiles/src/main/scala/africa/absa/cps/analysis/RowsDiff.scala b/bigfiles/src/main/scala/africa/absa/cps/analysis/RowsDiff.scala deleted file mode 100644 index f3cb6fd..0000000 --- a/bigfiles/src/main/scala/africa/absa/cps/analysis/RowsDiff.scala +++ /dev/null @@ -1,3 +0,0 @@ -package africa.absa.cps.analysis - -case class RowsDiff(inputLeftHash: String = "N/A", inputRightHash: String = "N/A", diffs: Seq[ColumnsDiff] = Seq.empty) diff --git a/bigfiles/src/main/scala/africa/absa/cps/parser/Arguments.scala b/bigfiles/src/main/scala/africa/absa/cps/parser/Arguments.scala deleted file mode 100644 index 75a582a..0000000 --- a/bigfiles/src/main/scala/africa/absa/cps/parser/Arguments.scala +++ /dev/null @@ -1,10 +0,0 @@ -package africa.absa.cps.parser - -case class Arguments( - out: String = "", - inputA: String = "", - inputB: String = "", - outFormat: OutputFormatType.Value = OutputFormatType.Parquet, - diff: DiffComputeType.Value = DiffComputeType.None, - exclude: Seq[String] = Seq() -) diff --git a/bigfiles/src/main/scala/africa/absa/cps/parser/DiffComputeType.scala b/bigfiles/src/main/scala/africa/absa/cps/parser/DiffComputeType.scala deleted file mode 100644 index a0cd9ea..0000000 --- a/bigfiles/src/main/scala/africa/absa/cps/parser/DiffComputeType.scala +++ /dev/null @@ -1,8 +0,0 @@ -package africa.absa.cps.parser - -object DiffComputeType extends Enumeration { - val None, Row = Value -} -object 
DiffComputeTypeHelper { - implicit val diffComputeRead: scopt.Read[DiffComputeType.Value] = scopt.Read.reads(DiffComputeType.withName) -} diff --git a/bigfiles/src/main/scala/africa/absa/cps/parser/OutputFormatType.scala b/bigfiles/src/main/scala/africa/absa/cps/parser/OutputFormatType.scala deleted file mode 100644 index 5569b95..0000000 --- a/bigfiles/src/main/scala/africa/absa/cps/parser/OutputFormatType.scala +++ /dev/null @@ -1,9 +0,0 @@ -package africa.absa.cps.parser - -object OutputFormatType extends Enumeration { - type OutputFormatType = Value - val Parquet: OutputFormatType = Value("parquet") - val CSV: OutputFormatType = Value("csv") - - override def toString: String = super.toString -} diff --git a/bigfiles/src/main/scala/africa/absa/cps/Comparator.scala b/bigfiles/src/main/scala/za/co/absa/Comparator.scala similarity index 84% rename from bigfiles/src/main/scala/africa/absa/cps/Comparator.scala rename to bigfiles/src/main/scala/za/co/absa/Comparator.scala index 2b21b9e..7cb0a20 100644 --- a/bigfiles/src/main/scala/africa/absa/cps/Comparator.scala +++ b/bigfiles/src/main/scala/za/co/absa/Comparator.scala @@ -1,6 +1,18 @@ -package africa.absa.cps +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ + +package za.co.absa -import africa.absa.cps.hash.HashUtils.HASH_COLUMN_NAME +import za.co.absa.hash.HashUtils.HASH_COLUMN_NAME import hash.HashUtils import org.apache.spark.sql.{DataFrame, SparkSession} import org.json4s.JsonAST diff --git a/bigfiles/src/main/scala/africa/absa/cps/DatasetComparison.scala b/bigfiles/src/main/scala/za/co/absa/DatasetComparison.scala similarity index 78% rename from bigfiles/src/main/scala/africa/absa/cps/DatasetComparison.scala rename to bigfiles/src/main/scala/za/co/absa/DatasetComparison.scala index 8dab9ad..e171277 100644 --- a/bigfiles/src/main/scala/africa/absa/cps/DatasetComparison.scala +++ b/bigfiles/src/main/scala/za/co/absa/DatasetComparison.scala @@ -1,9 +1,21 @@ -package africa.absa.cps +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ -import africa.absa.cps.DatasetComparison.logger -import africa.absa.cps.analysis.RowByRowAnalysis -import africa.absa.cps.parser.{ArgsParser, DiffComputeType} -import africa.absa.cps.io.IOHandler +package za.co.absa + +import za.co.absa.DatasetComparison.logger +import za.co.absa.analysis.RowByRowAnalysis +import za.co.absa.parser.{ArgsParser, DiffComputeType} +import za.co.absa.io.IOHandler import org.apache.spark.sql.{DataFrame, SparkSession} import com.typesafe.config.ConfigFactory import org.slf4j.{Logger, LoggerFactory} diff --git a/bigfiles/src/main/scala/africa/absa/cps/DatasetComparisonHelper.scala b/bigfiles/src/main/scala/za/co/absa/DatasetComparisonHelper.scala similarity index 64% rename from bigfiles/src/main/scala/africa/absa/cps/DatasetComparisonHelper.scala rename to bigfiles/src/main/scala/za/co/absa/DatasetComparisonHelper.scala index 0b149ac..519c012 100644 --- a/bigfiles/src/main/scala/africa/absa/cps/DatasetComparisonHelper.scala +++ b/bigfiles/src/main/scala/za/co/absa/DatasetComparisonHelper.scala @@ -1,4 +1,16 @@ -package africa.absa.cps +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ + +package za.co.absa import org.apache.spark.sql.{DataFrame, SparkSession} diff --git a/bigfiles/src/main/scala/africa/absa/cps/analysis/AnalyseStat.scala b/bigfiles/src/main/scala/za/co/absa/analysis/AnalyseStat.scala similarity index 54% rename from bigfiles/src/main/scala/africa/absa/cps/analysis/AnalyseStat.scala rename to bigfiles/src/main/scala/za/co/absa/analysis/AnalyseStat.scala index 75588a7..c6b53b7 100644 --- a/bigfiles/src/main/scala/africa/absa/cps/analysis/AnalyseStat.scala +++ b/bigfiles/src/main/scala/za/co/absa/analysis/AnalyseStat.scala @@ -1,4 +1,16 @@ -package africa.absa.cps.analysis +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + +package za.co.absa.analysis import org.apache.spark.sql.Row diff --git a/bigfiles/src/main/scala/za/co/absa/analysis/ColumnsDiff.scala b/bigfiles/src/main/scala/za/co/absa/analysis/ColumnsDiff.scala new file mode 100644 index 0000000..19b8cb6 --- /dev/null +++ b/bigfiles/src/main/scala/za/co/absa/analysis/ColumnsDiff.scala @@ -0,0 +1,15 @@ +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + +package za.co.absa.analysis + +case class ColumnsDiff(columnName: String = "N/A", values: Seq[String] = Seq.empty) diff --git a/bigfiles/src/main/scala/africa/absa/cps/analysis/RowByRowAnalysis.scala b/bigfiles/src/main/scala/za/co/absa/analysis/RowByRowAnalysis.scala similarity index 88% rename from bigfiles/src/main/scala/africa/absa/cps/analysis/RowByRowAnalysis.scala rename to bigfiles/src/main/scala/za/co/absa/analysis/RowByRowAnalysis.scala index aca7214..4527a0f 100644 --- a/bigfiles/src/main/scala/africa/absa/cps/analysis/RowByRowAnalysis.scala +++ b/bigfiles/src/main/scala/za/co/absa/analysis/RowByRowAnalysis.scala @@ -1,6 +1,18 @@ -package africa.absa.cps.analysis - -import africa.absa.cps.hash.HashUtils.HASH_COLUMN_NAME +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ + +package za.co.absa.analysis + +import za.co.absa.hash.HashUtils.HASH_COLUMN_NAME import org.apache.spark.sql.functions.col import org.apache.spark.sql.{DataFrame, Row} import org.slf4j.{Logger, LoggerFactory} diff --git a/bigfiles/src/main/scala/za/co/absa/analysis/RowsDiff.scala b/bigfiles/src/main/scala/za/co/absa/analysis/RowsDiff.scala new file mode 100644 index 0000000..c8adac5 --- /dev/null +++ b/bigfiles/src/main/scala/za/co/absa/analysis/RowsDiff.scala @@ -0,0 +1,15 @@ +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + +package za.co.absa.analysis + +case class RowsDiff(inputLeftHash: String = "N/A", inputRightHash: String = "N/A", diffs: Seq[ColumnsDiff] = Seq.empty) diff --git a/bigfiles/src/main/scala/africa/absa/cps/hash/HashUtils.scala b/bigfiles/src/main/scala/za/co/absa/hash/HashUtils.scala similarity index 57% rename from bigfiles/src/main/scala/africa/absa/cps/hash/HashUtils.scala rename to bigfiles/src/main/scala/za/co/absa/hash/HashUtils.scala index cc28900..5f65a2a 100644 --- a/bigfiles/src/main/scala/africa/absa/cps/hash/HashUtils.scala +++ b/bigfiles/src/main/scala/za/co/absa/hash/HashUtils.scala @@ -1,4 +1,16 @@ -package africa.absa.cps.hash +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + +package za.co.absa.hash import org.apache.spark.sql.DataFrame import org.apache.spark.sql.Row diff --git a/bigfiles/src/main/scala/africa/absa/cps/io/IOHandler.scala b/bigfiles/src/main/scala/za/co/absa/io/IOHandler.scala similarity index 78% rename from bigfiles/src/main/scala/africa/absa/cps/io/IOHandler.scala rename to bigfiles/src/main/scala/za/co/absa/io/IOHandler.scala index 0bc50a5..b866bc7 100644 --- a/bigfiles/src/main/scala/africa/absa/cps/io/IOHandler.scala +++ b/bigfiles/src/main/scala/za/co/absa/io/IOHandler.scala @@ -1,8 +1,20 @@ -package africa.absa.cps.io +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ -import africa.absa.cps.analysis.{ColumnsDiff, RowsDiff} -import africa.absa.cps.parser.OutputFormatType -import africa.absa.cps.parser.OutputFormatType._ +package za.co.absa.io + +import za.co.absa.analysis.{ColumnsDiff, RowsDiff} +import za.co.absa.parser.OutputFormatType +import za.co.absa.parser.OutputFormatType._ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.io.IOUtils import org.apache.spark.sql.{DataFrame, SparkSession} diff --git a/bigfiles/src/main/scala/africa/absa/cps/parser/ArgsParser.scala b/bigfiles/src/main/scala/za/co/absa/parser/ArgsParser.scala similarity index 82% rename from bigfiles/src/main/scala/africa/absa/cps/parser/ArgsParser.scala rename to bigfiles/src/main/scala/za/co/absa/parser/ArgsParser.scala index 5bba5ff..b27fc1b 100644 --- a/bigfiles/src/main/scala/africa/absa/cps/parser/ArgsParser.scala +++ b/bigfiles/src/main/scala/za/co/absa/parser/ArgsParser.scala @@ -1,4 +1,16 @@ -package africa.absa.cps.parser +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ + +package za.co.absa.parser import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path diff --git a/bigfiles/src/main/scala/za/co/absa/parser/Arguments.scala b/bigfiles/src/main/scala/za/co/absa/parser/Arguments.scala new file mode 100644 index 0000000..9dc7987 --- /dev/null +++ b/bigfiles/src/main/scala/za/co/absa/parser/Arguments.scala @@ -0,0 +1,22 @@ +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + +package za.co.absa.parser + +case class Arguments( + out: String = "", + inputA: String = "", + inputB: String = "", + outFormat: OutputFormatType.Value = OutputFormatType.Parquet, + diff: DiffComputeType.Value = DiffComputeType.None, + exclude: Seq[String] = Seq() +) diff --git a/bigfiles/src/main/scala/za/co/absa/parser/DiffComputeType.scala b/bigfiles/src/main/scala/za/co/absa/parser/DiffComputeType.scala new file mode 100644 index 0000000..df62b5c --- /dev/null +++ b/bigfiles/src/main/scala/za/co/absa/parser/DiffComputeType.scala @@ -0,0 +1,20 @@ +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. + */ + +package za.co.absa.parser + +object DiffComputeType extends Enumeration { + val None, Row = Value +} +object DiffComputeTypeHelper { + implicit val diffComputeRead: scopt.Read[DiffComputeType.Value] = scopt.Read.reads(DiffComputeType.withName) +} diff --git a/bigfiles/src/main/scala/za/co/absa/parser/OutputFormatType.scala b/bigfiles/src/main/scala/za/co/absa/parser/OutputFormatType.scala new file mode 100644 index 0000000..ec856e9 --- /dev/null +++ b/bigfiles/src/main/scala/za/co/absa/parser/OutputFormatType.scala @@ -0,0 +1,21 @@ +/** Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on + * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the + * specific language governing permissions and limitations under the License. 
+ */ + +package za.co.absa.parser + +object OutputFormatType extends Enumeration { + type OutputFormatType = Value + val Parquet: OutputFormatType = Value("parquet") + val CSV: OutputFormatType = Value("csv") + + override def toString: String = super.toString +} diff --git a/bigfiles/src/test/scala/ArgsParserTest.scala b/bigfiles/src/test/scala/ArgsParserTest.scala index 8166a33..9091422 100644 --- a/bigfiles/src/test/scala/ArgsParserTest.scala +++ b/bigfiles/src/test/scala/ArgsParserTest.scala @@ -1,4 +1,20 @@ -import africa.absa.cps.parser.{ArgsParser, Arguments, DiffComputeType, OutputFormatType} +/** + * Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +import za.co.absa.parser.{ArgsParser, Arguments, DiffComputeType, OutputFormatType} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.sql.SparkSession diff --git a/bigfiles/src/test/scala/ComparatorTest.scala b/bigfiles/src/test/scala/ComparatorTest.scala index b0fdd6b..74b605b 100644 --- a/bigfiles/src/test/scala/ComparatorTest.scala +++ b/bigfiles/src/test/scala/ComparatorTest.scala @@ -1,4 +1,20 @@ -import africa.absa.cps.Comparator +/** + * Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +import za.co.absa.Comparator import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} import org.scalatest.funsuite.AnyFunSuite diff --git a/bigfiles/src/test/scala/DatasetComparisonHelperTest.scala b/bigfiles/src/test/scala/DatasetComparisonHelperTest.scala index 2c65a4a..8e03a29 100644 --- a/bigfiles/src/test/scala/DatasetComparisonHelperTest.scala +++ b/bigfiles/src/test/scala/DatasetComparisonHelperTest.scala @@ -1,4 +1,20 @@ -import africa.absa.cps.DatasetComparisonHelper +/** + * Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ **/ + +import za.co.absa.DatasetComparisonHelper import org.apache.spark.sql.{DataFrame, SparkSession} import org.scalatest.funsuite.AnyFunSuite diff --git a/bigfiles/src/test/scala/DatasetComparisonTest.scala b/bigfiles/src/test/scala/DatasetComparisonTest.scala index 86472a6..2fe7752 100644 --- a/bigfiles/src/test/scala/DatasetComparisonTest.scala +++ b/bigfiles/src/test/scala/DatasetComparisonTest.scala @@ -1,4 +1,20 @@ -import africa.absa.cps.DatasetComparison +/** + * Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +import za.co.absa.DatasetComparison import org.apache.spark.sql.{DataFrame, SparkSession} import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} import org.scalatest.funsuite.AnyFunSuite diff --git a/bigfiles/src/test/scala/HashTableTest.scala b/bigfiles/src/test/scala/HashTableTest.scala index da94fde..4dc1a94 100644 --- a/bigfiles/src/test/scala/HashTableTest.scala +++ b/bigfiles/src/test/scala/HashTableTest.scala @@ -1,6 +1,22 @@ -import africa.absa.cps.Comparator -import africa.absa.cps.hash.HashUtils -import africa.absa.cps.hash.HashUtils.HASH_COLUMN_NAME +/** + * Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + +import za.co.absa.Comparator +import za.co.absa.hash.HashUtils +import za.co.absa.hash.HashUtils.HASH_COLUMN_NAME import org.apache.spark.sql.{DataFrame, SparkSession} import org.scalatest.funsuite.AnyFunSuite diff --git a/bigfiles/src/test/scala/IOHandlerTest.scala b/bigfiles/src/test/scala/IOHandlerTest.scala index 311665c..38d3406 100644 --- a/bigfiles/src/test/scala/IOHandlerTest.scala +++ b/bigfiles/src/test/scala/IOHandlerTest.scala @@ -1,6 +1,22 @@ -import africa.absa.cps.analysis.{ColumnsDiff, RowsDiff} -import africa.absa.cps.io.IOHandler -import africa.absa.cps.parser.OutputFormatType +/** + * Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ **/ + +import za.co.absa.analysis.{ColumnsDiff, RowsDiff} +import za.co.absa.io.IOHandler +import za.co.absa.parser.OutputFormatType import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} import org.json4s.native.JsonMethods.{compact, render} import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll} diff --git a/bigfiles/src/test/scala/RowByRowAnalysesTest.scala b/bigfiles/src/test/scala/RowByRowAnalysesTest.scala index 9fe44e2..2a1ac11 100644 --- a/bigfiles/src/test/scala/RowByRowAnalysesTest.scala +++ b/bigfiles/src/test/scala/RowByRowAnalysesTest.scala @@ -1,7 +1,22 @@ - -import africa.absa.cps.analysis.{ColumnsDiff, RowsDiff} -import africa.absa.cps.analysis.RowByRowAnalysis.generateDiffJson -import africa.absa.cps.hash.HashUtils.HASH_COLUMN_NAME +/** + * Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ **/ + +import za.co.absa.analysis.{ColumnsDiff, RowsDiff} +import za.co.absa.analysis.RowByRowAnalysis.generateDiffJson +import za.co.absa.hash.HashUtils.HASH_COLUMN_NAME import org.apache.spark.sql.SparkSession import org.scalatest.funsuite.AnyFunSuite import upickle.default._ diff --git a/bigfiles/src/test/scala/SparkTestSession.scala b/bigfiles/src/test/scala/SparkTestSession.scala index 5265cc7..bb13e95 100644 --- a/bigfiles/src/test/scala/SparkTestSession.scala +++ b/bigfiles/src/test/scala/SparkTestSession.scala @@ -1,3 +1,19 @@ +/** + * Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + **/ + import org.apache.spark.sql.SparkSession import java.io.File diff --git a/bigfiles/src/test/scala/VersionTest.scala b/bigfiles/src/test/scala/VersionTest.scala index 31ae00c..60ebe76 100644 --- a/bigfiles/src/test/scala/VersionTest.scala +++ b/bigfiles/src/test/scala/VersionTest.scala @@ -1,3 +1,19 @@ +/** + * Copyright 2020 ABSA Group Limited + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + **/ + import org.apache.spark._ import org.apache.spark.sql.SparkSession import org.scalatest.funsuite.AnyFunSuite diff --git a/images/find_diff.png b/images/find_diff.png new file mode 100644 index 0000000..2f3a734 Binary files /dev/null and b/images/find_diff.png differ diff --git a/images/find_match.png b/images/find_match.png new file mode 100644 index 0000000..9b5c4de Binary files /dev/null and b/images/find_match.png differ diff --git a/images/remove_id.png b/images/remove_id.png new file mode 100644 index 0000000..b639974 Binary files /dev/null and b/images/remove_id.png differ diff --git a/images/tables.png b/images/tables.png new file mode 100644 index 0000000..a64504d Binary files /dev/null and b/images/tables.png differ diff --git a/smallfiles/main.py b/smallfiles/main.py index ad0fe31..3498397 100644 --- a/smallfiles/main.py +++ b/smallfiles/main.py @@ -1,3 +1,17 @@ +# Copyright 2020 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pandas as pd import numpy as np diff --git a/smallfiles/test/__init__.py b/smallfiles/test/__init__.py index e69de29..c412da6 100644 --- a/smallfiles/test/__init__.py +++ b/smallfiles/test/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2020 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/smallfiles/test/test_version.py b/smallfiles/test/test_version.py index 3829172..54f82c4 100644 --- a/smallfiles/test/test_version.py +++ b/smallfiles/test/test_version.py @@ -1,3 +1,17 @@ +# Copyright 2020 ABSA Group Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import sys