diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..01ecd10 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1 @@ +liberapay: jarbasAI diff --git a/.github/workflows/build_tests.yml b/.github/workflows/build_tests.yml new file mode 100644 index 0000000..bc795f3 --- /dev/null +++ b/.github/workflows/build_tests.yml @@ -0,0 +1,25 @@ +name: Run Build Tests +on: + push: + workflow_dispatch: + +jobs: + build_tests: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + ref: ${{ github.head_ref }} + - name: Setup Python + uses: actions/setup-python@v1 + with: + python-version: "3.11" + - name: Install Build Tools + run: | + python -m pip install build wheel + - name: Build Distribution Packages + run: | + python setup.py bdist_wheel + - name: Install package + run: | + pip install . diff --git a/.github/workflows/conventional-label.yaml b/.github/workflows/conventional-label.yaml new file mode 100644 index 0000000..0a449cb --- /dev/null +++ b/.github/workflows/conventional-label.yaml @@ -0,0 +1,10 @@ +# auto add labels to PRs +on: + pull_request_target: + types: [ opened, edited ] +name: conventional-release-labels +jobs: + label: + runs-on: ubuntu-latest + steps: + - uses: bcoe/conventional-release-labels@v1 \ No newline at end of file diff --git a/.github/workflows/publish_stable.yml b/.github/workflows/publish_stable.yml new file mode 100644 index 0000000..0e5d94e --- /dev/null +++ b/.github/workflows/publish_stable.yml @@ -0,0 +1,58 @@ +name: Stable Release +on: + push: + branches: [master] + workflow_dispatch: + +jobs: + publish_stable: + uses: TigreGotico/gh-automations/.github/workflows/publish-stable.yml@master + secrets: inherit + with: + branch: 'master' + version_file: 'zbase/version.py' + setup_py: 'setup.py' + publish_release: true + + publish_pypi: + needs: publish_stable + if: success() # Ensure this job only runs if the previous job succeeds + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + ref: dev + fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. + - name: Setup Python + uses: actions/setup-python@v1 + with: + python-version: "3.11" + - name: Install Build Tools + run: | + python -m pip install build wheel + - name: version + run: echo "::set-output name=version::$(python setup.py --version)" + id: version + - name: Build Distribution Packages + run: | + python setup.py sdist bdist_wheel + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@master + with: + password: ${{secrets.PYPI_TOKEN}} + + + sync_dev: + needs: publish_stable + if: success() # Ensure this job only runs if the previous job succeeds + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. + ref: master + - name: Push master -> dev + uses: ad-m/github-push-action@master + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + branch: dev \ No newline at end of file diff --git a/.github/workflows/release_workflow.yml b/.github/workflows/release_workflow.yml new file mode 100644 index 0000000..af152a0 --- /dev/null +++ b/.github/workflows/release_workflow.yml @@ -0,0 +1,108 @@ +name: Release Alpha and Propose Stable + +on: + pull_request: + types: [closed] + branches: [dev] + +jobs: + publish_alpha: + if: github.event.pull_request.merged == true + uses: TigreGotico/gh-automations/.github/workflows/publish-alpha.yml@master + secrets: inherit + with: + branch: 'dev' + version_file: 'zbase/version.py' + setup_py: 'setup.py' + update_changelog: true + publish_prerelease: true + changelog_max_issues: 100 + + notify: + if: github.event.pull_request.merged == true + needs: publish_alpha + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Send message to Matrix bots channel + id: matrix-chat-message + uses: fadenb/matrix-chat-message@v0.0.6 + with: + homeserver: 'matrix.org' + token: ${{ secrets.MATRIX_TOKEN }} + channel: '!WjxEKjjINpyBRPFgxl:krbel.duckdns.org' + message: | + new ${{ github.event.repository.name }} PR merged! https://github.com/${{ github.repository }}/pull/${{ github.event.number }} + + publish_pypi: + needs: publish_alpha + if: success() # Ensure this job only runs if the previous job succeeds + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + with: + ref: dev + fetch-depth: 0 # otherwise, there would be errors pushing refs to the destination repository. + - name: Setup Python + uses: actions/setup-python@v1 + with: + python-version: "3.11" + - name: Install Build Tools + run: | + python -m pip install build wheel + - name: version + run: echo "::set-output name=version::$(python setup.py --version)" + id: version + - name: Build Distribution Packages + run: | + python setup.py sdist bdist_wheel + - name: Publish to PyPI + uses: pypa/gh-action-pypi-publish@master + with: + password: ${{secrets.PYPI_TOKEN}} + + + propose_release: + needs: publish_alpha + if: success() # Ensure this job only runs if the previous job succeeds + runs-on: ubuntu-latest + steps: + - name: Checkout dev branch + uses: actions/checkout@v3 + with: + ref: dev + + - name: Setup Python + uses: actions/setup-python@v2 + with: + python-version: "3.11" + + - name: Get version from setup.py + id: get_version + run: | + VERSION=$(python setup.py --version) + echo "VERSION=$VERSION" >> $GITHUB_ENV + + - name: Create and push new branch + run: | + git checkout -b release-${{ env.VERSION }} + git push origin release-${{ env.VERSION }} + + - name: Open Pull Request from dev to master + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + # Variables + BRANCH_NAME="release-${{ env.VERSION }}" + BASE_BRANCH="master" + HEAD_BRANCH="release-${{ env.VERSION }}" + PR_TITLE="Release ${{ env.VERSION }}" + PR_BODY="Human review requested!" + + # Create a PR using GitHub API + curl -X POST \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: token $GITHUB_TOKEN" \ + -d "{\"title\":\"$PR_TITLE\",\"body\":\"$PR_BODY\",\"head\":\"$HEAD_BRANCH\",\"base\":\"$BASE_BRANCH\"}" \ + https://api.github.com/repos/${{ github.repository }}/pulls + diff --git a/README.md b/README.md index 248573c..37aef2d 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,98 @@ +# Zbase - Base91 and Z85 Encodings +This repository provides C and Python implementations of three encoding schemes: **Z85P**, **Base91**, and **Z85B**. -| Encoding | Avg Encoding Time (ns) | Avg Decoding Time (ns) | Avg Size Increase | Encoding Rank | Decoding Rank | Size Increase Rank | -|-------------|-------------------------|-------------------------|-------------------|---------------|---------------|--------------------| -| [pybase64](https://github.com/mayeut/pybase64) | 1131 ns | 2946 ns | 1.35x | 1 🥇 | 1 🥇 | 4 | -| **base91** | 622324 ns | 38632 ns | 1.23x | 5 | 4 | 1 🥇 | -| [base64](https://docs.python.org/3/library/base64.html) | 7113 ns | 7051 ns | 1.35x | 3 🥉 | 3 🥉 | 4 | -| [base16](https://docs.python.org/3/library/binascii.html) | 5953 ns | 5859 ns | 2.00x | 2 🥈 | 2 🥈 | 6 | -| **z85b** | 626214 ns | 871890 ns | 1.25x | 6 | 6 | 2 🥈 | -| **z85p** | 633825 ns | 775821 ns | 1.28x | 7 | 5 | 3 🥉 | -| [base32](https://docs.python.org/3/library/base64.html) | 503698 ns | 882194 ns | 1.62x | 4 | 7 | 5 | -| [z85p_py](https://github.com/JarbasHiveMind/hivemind-websocket-client/blob/dev/hivemind_bus_client/encodings/z85p.py) | 940859 ns | 1159043 ns | 1.28x | 8 | 8 | 3 🥉 | -| [z85b_py](https://github.com/JarbasHiveMind/hivemind-websocket-client/blob/dev/hivemind_bus_client/encodings/z85b.py) | 983796 ns | 1314734 ns | 1.25x | 9 | 9 | 2 🥈 | -| [base91_py](https://github.com/JarbasHiveMind/hivemind-websocket-client/blob/dev/hivemind_bus_client/encodings/b91.py) | 1414374 ns | 2080957 ns | 1.23x | 10 | 10 | 1 🥇 | +The C-based shared libraries are optimized for performance, while Python implementations provide a fallback when the C +libraries are not available. + +The repository contains: + +- **Base91 encoding**: A binary-to-text encoding scheme that uses 91 printable ASCII characters. +- **Z85B encoding**: A variant of Z85 used for efficient binary-to-text encoding. +- **Z85P encoding**: Another variant of Z85, with different padding scheme. + +## Features + +- **C-based implementation** for each encoding scheme for maximum performance. +- **Pure Python fallback** for environments where the C libraries are not available. +- Easy-to-use API for encoding and decoding with detailed error handling and logging. +- Cross-platform support (Linux, macOS, Windows) via system architecture detection. + +## Benchmarks + +| Encoding | Avg Encoding Time (ns) | Avg Decoding Time (ns) | Avg Size Increase | Encoding Rank | Decoding Rank | Size Increase Rank | +|------------------------------------------------------------------------------------------------------------------------|------------------------|------------------------|-------------------|---------------|---------------|--------------------| +| [pybase64](https://github.com/mayeut/pybase64) | 1131 ns | 2946 ns | 1.35x | 1 🥇 | 1 🥇 | 4 | +| **base91** | 622324 ns | 38632 ns | 1.23x | 5 | 4 | 1 🥇 | +| [base64](https://docs.python.org/3/library/base64.html) | 7113 ns | 7051 ns | 1.35x | 3 🥉 | 3 🥉 | 4 | +| [base16](https://docs.python.org/3/library/binascii.html) | 5953 ns | 5859 ns | 2.00x | 2 🥈 | 2 🥈 | 6 | +| **z85b** | 626214 ns | 871890 ns | 1.25x | 6 | 6 | 2 🥈 | +| **z85p** | 633825 ns | 775821 ns | 1.28x | 7 | 5 | 3 🥉 | +| [base32](https://docs.python.org/3/library/base64.html) | 503698 ns | 882194 ns | 1.62x | 4 | 7 | 5 | +| [z85p_py](https://github.com/JarbasHiveMind/hivemind-websocket-client/blob/dev/hivemind_bus_client/encodings/z85p.py) | 940859 ns | 1159043 ns | 1.28x | 8 | 8 | 3 🥉 | +| [z85b_py](https://github.com/JarbasHiveMind/hivemind-websocket-client/blob/dev/hivemind_bus_client/encodings/z85b.py) | 983796 ns | 1314734 ns | 1.25x | 9 | 9 | 2 🥈 | +| [base91_py](https://github.com/JarbasHiveMind/hivemind-websocket-client/blob/dev/hivemind_bus_client/encodings/b91.py) | 1414374 ns | 2080957 ns | 1.23x | 10 | 10 | 1 🥇 | + +## Usage + +You can use the provided classes to encode and decode data using the supported encoding schemes. + +### Z85P Encoding + +```python +from z85p import Z85P + +# Encode data +data = b"Hello, World!" +encoded = Z85P.encode(data) +print("Encoded Z85P:", encoded) + +# Decode data +decoded = Z85P.decode(encoded) +print("Decoded Z85P:", decoded) +``` + +### Base91 Encoding + +```python +from base91 import B91 + +# Encode data +data = b"Hello, World!" +encoded = B91.encode(data) +print("Encoded Base91:", encoded) + +# Decode data +decoded = B91.decode(encoded) +print("Decoded Base91:", decoded) +``` + +### Z85B Encoding + +```python +from z85b import Z85B + +# Encode data +data = b"Hello, World!" +encoded = Z85B.encode(data) +print("Encoded Z85B:", encoded) + +# Decode data +decoded = Z85B.decode(encoded) +print("Decoded Z85B:", decoded) +``` + +## Error Handling + +The library automatically falls back to the Python implementation if the C libraries are not found or fail to load. Any +issues related to encoding or decoding will raise a `ValueError` with a detailed message. + +In the case of missing C libraries, warnings will be logged using Python's built-in `logging` module. + +### Logging + +The library uses the `logging` module to provide useful runtime information: + +```bash +2025-01-08 12:34:56,789 - WARNING - Z85P C library not available: Library load error. Falling back to pure Python implementation. +``` diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..42c9011 --- /dev/null +++ b/setup.py @@ -0,0 +1,50 @@ +import os +from setuptools import setup + +BASEDIR = os.path.abspath(os.path.dirname(__file__)) + + +def get_version(): + """ Find the version of the package""" + version_file = os.path.join(BASEDIR, 'zbase', 'version.py') + major, minor, build, alpha = (None, None, None, None) + with open(version_file) as f: + for line in f: + if 'VERSION_MAJOR' in line: + major = line.split('=')[1].strip() + elif 'VERSION_MINOR' in line: + minor = line.split('=')[1].strip() + elif 'VERSION_BUILD' in line: + build = line.split('=')[1].strip() + elif 'VERSION_ALPHA' in line: + alpha = line.split('=')[1].strip() + + if ((major and minor and build and alpha) or + '# END_VERSION_BLOCK' in line): + break + version = f"{major}.{minor}.{build}" + if int(alpha): + version += f"a{alpha}" + return version + + +def required(requirements_file): + """ Read requirements file and remove comments and empty lines. """ + with open(os.path.join(BASEDIR, requirements_file), 'r') as f: + requirements = f.read().splitlines() + return [pkg for pkg in requirements + if pkg.strip() and not pkg.startswith("#")] + + + +setup( + name='zbase', + version=get_version(), + packages=['zbase'], + url='https://github.com/JarbasHiveMind/zbase', + license='Apache-2.0', + author='jarbasAi', + include_package_data=True, + author_email='jarbasai@mailfence.com', + description='base91, z85b and z85p encodings' +) diff --git a/z85base91/b91.c b/src/b91.c similarity index 77% rename from z85base91/b91.c rename to src/b91.c index a4c0141..19b7091 100644 --- a/z85base91/b91.c +++ b/src/b91.c @@ -6,13 +6,19 @@ #define ALPHABET_SIZE 91 // Base91 Alphabet (same as Python ALPHABET) -const char ALPHABET[ALPHABET_SIZE] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!#$%&()*+,-./:;<=>?@[\\]^_`{|}~\""; +const char ALPHABET[ALPHABET_SIZE] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!#$%&()*+,./:;<=>?@[]^_`{|}~\""; // Decode table (maps Base91 characters to their indices in the alphabet) int DECODE_TABLE[256]; // Function to initialize the DECODE_TABLE void initialize_decode_table() { + // Set all values to -1 (invalid by default) + for (int i = 0; i < 256; i++) { + DECODE_TABLE[i] = -1; + } + + // Now populate the DECODE_TABLE for valid characters for (int i = 0; i < ALPHABET_SIZE; i++) { DECODE_TABLE[(unsigned char)ALPHABET[i]] = i; } @@ -72,15 +78,15 @@ char* encode(const unsigned char* data, size_t data_len, size_t* output_len) { size_t out_idx = 0; for (size_t i = 0; i < data_len; i++) { - b |= data[i] << n; - n += 8; - if (n > 13) { - int v = b & 8191; + b |= data[i] << n; // Shift the bits + n += 8; // Move 8 bits at a time + if (n > 13) { // We need to encode multiple characters + int v = b & 8191; // Get the first 13 bits (Base91 split) if (v > 88) { b >>= 13; n -= 13; } else { - v = b & 16383; + v = b & 16383; // Get 14 bits b >>= 14; n -= 14; } @@ -89,14 +95,15 @@ char* encode(const unsigned char* data, size_t data_len, size_t* output_len) { } } + // Handle any leftover bits if (n) { - out[out_idx++] = ALPHABET[b % 91]; + out[out_idx++] = ALPHABET[b % 91]; // Place the final character if (n > 7 || b > 90) { out[out_idx++] = ALPHABET[b / 91]; } } - out[out_idx] = '\0'; + out[out_idx] = '\0'; // Null-terminate the output string *output_len = out_idx; return out; } diff --git a/src/compile.sh b/src/compile.sh new file mode 100644 index 0000000..1f9a249 --- /dev/null +++ b/src/compile.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# Compile for x86_64 +gcc -shared -o libbase91-x86_64.so -fPIC b91.c +gcc -shared -o libz85p-x86_64.so -fPIC z85p.c +gcc -shared -o libz85b-x86_64.so -fPIC z85b.c + +# Compile for armhf (32-bit ARM) +arm-linux-gnueabihf-gcc -shared -o libbase91-armhf.so -fPIC b91.c +arm-linux-gnueabihf-gcc -shared -o libz85p-armhf.so -fPIC z85p.c +arm-linux-gnueabihf-gcc -shared -o libz85b-armhf.so -fPIC z85b.c + +# Compile for aarch64 (64-bit ARM) +aarch64-linux-gnu-gcc -shared -o libbase91-aarch64.so -fPIC b91.c +aarch64-linux-gnu-gcc -shared -o libz85p-aarch64.so -fPIC z85p.c +aarch64-linux-gnu-gcc -shared -o libz85b-aarch64.so -fPIC z85b.c + +# Compile for i386 (32-bit Intel/AMD) +gcc -m32 -shared -o libbase91-i386.so -fPIC b91.c +gcc -m32 -shared -o libz85p-i386.so -fPIC z85p.c +gcc -m32 -shared -o libz85b-i386.so -fPIC z85b.c + +# Compile for Windows +x86_64-w64-mingw32-gcc -shared -o libbase91.dll -fPIC b91.c +x86_64-w64-mingw32-gcc -shared -o libz85p.dll -fPIC z85p.c +x86_64-w64-mingw32-gcc -shared -o libz85b.dll -fPIC z85b.c + +# Compile for MacOS +osxcross -shared -o libbase91.dylib -fPIC b91.c +osxcross -shared -o libz85p.dylib -fPIC z85p.c +osxcross -shared -o libz85b.dylib -fPIC z85b.c + +echo "Compilation completed!" diff --git a/z85base91/z85b.c b/src/z85b.c similarity index 100% rename from z85base91/z85b.c rename to src/z85b.c diff --git a/z85base91/z85p.c b/src/z85p.c similarity index 100% rename from z85base91/z85p.c rename to src/z85p.c diff --git a/z85base91/__init__.py b/z85base91/__init__.py deleted file mode 100644 index a8240ba..0000000 --- a/z85base91/__init__.py +++ /dev/null @@ -1,202 +0,0 @@ -# TODO - package this properly for pypi -# $ gcc -shared -o libz85p.so -fPIC z85p.c -# $ gcc -shared -o libz85b.so -fPIC z85b.c -# $ gcc -shared -o libbase91.so -fPIC b91.c - -import ctypes -from ctypes import c_char_p, c_void_p, c_size_t, c_ubyte, byref, POINTER -from typing import Union - - -class Z85P: - # Load the shared library - lib = ctypes.CDLL('./libz85p.so') # On Windows, use './z85p.dll' - - # Initialize the Z85 map (this needs to be called first) - lib.initialize_z85_map() - - # Define the encode function prototype - lib.encode_z85p.argtypes = [ctypes.POINTER(ctypes.c_ubyte), c_size_t, POINTER(c_size_t)] - lib.encode_z85p.restype = ctypes.POINTER(ctypes.c_ubyte) - - # Define the decode function prototype - lib.decode_z85p.argtypes = [ctypes.POINTER(ctypes.c_ubyte), c_size_t, POINTER(c_size_t)] - lib.decode_z85p.restype = ctypes.POINTER(ctypes.c_ubyte) - - @classmethod - def encode(cls, data: bytes) -> bytes: - out_len = c_size_t(0) - raw_data = (ctypes.c_ubyte * len(data))(*data) - encoded_data = cls.lib.encode_z85p(raw_data, len(data), ctypes.byref(out_len)) - return bytes(ctypes.string_at(encoded_data, out_len.value)) - - @classmethod - def decode(cls, data: bytes) -> bytes: - out_len = c_size_t(0) - raw_data = (ctypes.c_ubyte * len(data))(*data) - decoded_data = cls.lib.decode_z85p(raw_data, len(data), ctypes.byref(out_len)) - return bytes(ctypes.string_at(decoded_data, out_len.value)) - - -class B91: - # Load the shared library - lib = ctypes.CDLL('./libbase91.so') # On Windows, use './base91.dll' - - # Initialize the decode table (this needs to be called first) - lib.initialize_decode_table() - - # Define the decode function prototype - lib.decode.argtypes = [c_char_p, ctypes.POINTER(c_size_t)] - lib.decode.restype = c_void_p - - # Define the encode function prototype - lib.encode.argtypes = [ctypes.POINTER(ctypes.c_ubyte), c_size_t, ctypes.POINTER(c_size_t)] - lib.encode.restype = c_char_p - - @classmethod - def decode(cls, encoded_data: Union[str, bytes]): - if isinstance(encoded_data, str): - # Convert the encoded data to bytes - encoded_data = encoded_data.encode('utf-8') - output_len = c_size_t(0) - - # Call the C function - decoded_data = cls.lib.decode(encoded_data, ctypes.byref(output_len)) - - if decoded_data: - return ctypes.string_at(decoded_data, output_len.value) - else: - raise ValueError("Invalid Base91 string") - - @classmethod - def encode(cls, data: Union[str, bytes]): - if isinstance(data, str): - # Convert the data to bytes - data = data.encode('utf-8') - output_len = c_size_t(0) - - # Call the C function - encoded_data = cls.lib.encode((ctypes.c_ubyte * len(data))(*data), len(data), ctypes.byref(output_len)) - - if encoded_data: - return ctypes.string_at(encoded_data, output_len.value) - else: - raise ValueError("Encoding failed") - - -class Z85B: - # Load the shared library dynamically - lib = ctypes.CDLL('./libz85b.so') # Update path as needed - - # Define function prototypes - lib.encode_z85b.argtypes = [POINTER(c_ubyte), c_size_t, POINTER(c_size_t)] - lib.encode_z85b.restype = POINTER(c_ubyte) - - lib.decode_z85b.argtypes = [POINTER(c_ubyte), c_size_t, POINTER(c_size_t)] - lib.decode_z85b.restype = POINTER(c_ubyte) - - lib.free.argtypes = [ctypes.c_void_p] # Add free function for memory cleanup - - @classmethod - def encode(cls, data: bytes) -> bytes: - """ - Encode raw bytes into Z85b format. - - Args: - data (bytes): Input data to encode. - - Returns: - bytes: Z85b-encoded data. - - Raises: - ValueError: If encoding fails. - """ - output_len = c_size_t(0) - encoded_data = cls.lib.encode_z85b((c_ubyte * len(data))(*data), len(data), byref(output_len)) - if not encoded_data: - raise ValueError("Encoding failed") - - try: - return ctypes.string_at(encoded_data, output_len.value) - finally: - cls.lib.free(encoded_data) - - @classmethod - def decode(cls, encoded_data: bytes) -> bytes: - """ - Decode Z85b-encoded bytes into raw bytes. - - Args: - encoded_data (bytes): Z85b-encoded input. - - Returns: - bytes: Decoded raw bytes. - - Raises: - ValueError: If decoding fails. - """ - output_len = c_size_t(0) - decoded_data = cls.lib.decode_z85b((c_ubyte * len(encoded_data))(*encoded_data), len(encoded_data), - byref(output_len)) - if not decoded_data: - raise ValueError("Decoding failed") - - try: - return ctypes.string_at(decoded_data, output_len.value) - finally: - cls.lib.free(decoded_data) - - -if __name__ == "__main__": - from hivemind_bus_client.encodings import Z85B as Z85Bpy, B91 as B91py, Z85P as Z85Ppy - - - def test_b91(s=b"Hello, Base91!"): - # Example usage: - try: - encoded = B91py.encode(s) - print("Encoded py:", encoded) - decoded = B91py.decode(encoded) - print("Decoded py:", decoded) - - encoded = B91.encode(s) - print("Encoded:", encoded) - decoded = B91.decode(encoded) - print("Decoded:", decoded) - except Exception as e: - print(f"Error: {e}") - - - def test_z85b(s=b"Hello, Z85B!"): - try: - encoded = Z85Bpy.encode(s) - print("Encoded py:", encoded) - decoded = Z85Bpy.decode(encoded) - print("Decoded py:", decoded) - - encoded = Z85B.encode(s) - print("Encoded:", encoded) - decoded = Z85B.decode(encoded) - print("Decoded:", decoded) - except Exception as e: - print(f"Error: {e}") - - - def test_z85p(s=b"Hello, Z85P!"): - try: - encoded = Z85Ppy.encode(s) - print(f"Encoded py: {encoded}") - decoded = Z85Ppy.decode(encoded) - print(f"Decoded py: {decoded.decode('utf-8')}") - - encoded = Z85P.encode(s) - print(f"Encoded: {encoded}") - decoded = Z85P.decode(encoded) - print(f"Decoded: {decoded.decode('utf-8')}") - except Exception as e: - print(f"Error: {e}") - - - test_b91() - test_z85p() - test_z85b() diff --git a/zbase/__init__.py b/zbase/__init__.py new file mode 100644 index 0000000..f0a022e --- /dev/null +++ b/zbase/__init__.py @@ -0,0 +1,311 @@ +import ctypes +import logging +import os.path +import platform +from ctypes import c_char_p, c_void_p, c_size_t, c_ubyte, byref, POINTER +from typing import Union + +# Set up logging +logging.basicConfig(level=logging.WARNING, format='%(asctime)s - %(levelname)s - %(message)s') + + +# get the appropriate shared library file based on architecture +def get_arch_lib(lib_base_name: str) -> str: + """ + Returns the path to the shared library based on the system's architecture. + + Args: + lib_base_name (str): The base name of the shared library (e.g., 'libz85p'). + + Returns: + str: The path to the shared library for the appropriate system architecture. + + Raises: + ValueError: If the system architecture is unsupported. + """ + arch = platform.machine() # Get system architecture + if arch == "x86_64": + return f"{os.path.dirname(__file__)}/{lib_base_name}-x86_64.so" + elif arch == "aarch64": + return f"{os.path.dirname(__file__)}/{lib_base_name}-aarch64.so" + elif arch == "i386" or arch == "i686": + return f"{os.path.dirname(__file__)}/{lib_base_name}-i386.so" + else: + raise ValueError(f"Unsupported architecture: {arch}") + + +try: + class Z85P: + """ + Class for encoding and decoding Z85P format using a C-based shared library. + If the C library is not available, it falls back to a pure Python implementation. + """ + # Load the correct shared library based on system architecture + lib = ctypes.CDLL(get_arch_lib('libz85p')) + + # Initialize the Z85 map (this needs to be called first) + lib.initialize_z85_map() + + # Define the encode function prototype + lib.encode_z85p.argtypes = [ctypes.POINTER(ctypes.c_ubyte), c_size_t, POINTER(c_size_t)] + lib.encode_z85p.restype = ctypes.POINTER(ctypes.c_ubyte) + + # Define the decode function prototype + lib.decode_z85p.argtypes = [ctypes.POINTER(ctypes.c_ubyte), c_size_t, POINTER(c_size_t)] + lib.decode_z85p.restype = ctypes.POINTER(ctypes.c_ubyte) + + @classmethod + def encode(cls, data: bytes) -> bytes: + """ + Encodes the input data into Z85P format. + + Args: + data (bytes): The raw data to encode. + + Returns: + bytes: The Z85P-encoded data. + + Raises: + ValueError: If encoding fails. + """ + out_len = c_size_t(0) + raw_data = (ctypes.c_ubyte * len(data))(*data) + encoded_data = cls.lib.encode_z85p(raw_data, len(data), ctypes.byref(out_len)) + if not encoded_data: + raise ValueError("Encoding failed") + return bytes(ctypes.string_at(encoded_data, out_len.value)) + + @classmethod + def decode(cls, data: bytes) -> bytes: + """ + Decodes the input Z85P-encoded data into raw bytes. + + Args: + data (bytes): The Z85P-encoded data to decode. + + Returns: + bytes: The decoded raw data. + + Raises: + ValueError: If decoding fails. + """ + out_len = c_size_t(0) + raw_data = (ctypes.c_ubyte * len(data))(*data) + decoded_data = cls.lib.decode_z85p(raw_data, len(data), ctypes.byref(out_len)) + if not decoded_data: + raise ValueError("Decoding failed") + return bytes(ctypes.string_at(decoded_data, out_len.value)) +except Exception as e: + logging.warning(f"Z85P C library not available: {e}. Falling back to pure Python implementation.") + from .z85p import Z85P + +try: + class B91: + """ + Class for encoding and decoding Base91 format using a C-based shared library. + If the C library is not available, it falls back to a pure Python implementation. + """ + # Load the correct shared library based on system architecture + lib = ctypes.CDLL(get_arch_lib('libbase91')) + + # Initialize the decode table (this needs to be called first) + lib.initialize_decode_table() + + # Define the decode function prototype + lib.decode.argtypes = [c_char_p, ctypes.POINTER(c_size_t)] + lib.decode.restype = c_void_p + + # Define the encode function prototype + lib.encode.argtypes = [ctypes.POINTER(ctypes.c_ubyte), c_size_t, ctypes.POINTER(c_size_t)] + lib.encode.restype = c_char_p + + @classmethod + def decode(cls, encoded_data: Union[str, bytes]) -> bytes: + """ + Decodes the input Base91-encoded data into raw bytes. + + Args: + encoded_data (Union[str, bytes]): The Base91-encoded data to decode. + + Returns: + bytes: The decoded raw data. + + Raises: + ValueError: If decoding fails. + """ + if isinstance(encoded_data, str): + # Convert the encoded data to bytes + encoded_data = encoded_data.encode('utf-8') + output_len = c_size_t(0) + + # Call the C function + decoded_data = cls.lib.decode(encoded_data, ctypes.byref(output_len)) + + if not decoded_data: + raise ValueError("Invalid Base91 string") + return ctypes.string_at(decoded_data, output_len.value) + + @classmethod + def encode(cls, data: Union[str, bytes]) -> bytes: + """ + Encodes the input data into Base91 format. + + Args: + data (Union[str, bytes]): The raw data to encode. + + Returns: + bytes: The Base91-encoded data. + + Raises: + ValueError: If encoding fails. + """ + if isinstance(data, str): + # Convert the data to bytes + data = data.encode('utf-8') + output_len = c_size_t(0) + + # Call the C function + encoded_data = cls.lib.encode((ctypes.c_ubyte * len(data))(*data), len(data), ctypes.byref(output_len)) + + if not encoded_data: + raise ValueError("Encoding failed") + return ctypes.string_at(encoded_data, output_len.value) +except Exception as e: + logging.warning(f"Base91 C library not available: {e}. Falling back to pure Python implementation.") + from .b91 import B91 + +try: + class Z85B: + """ + Class for encoding and decoding Z85B format using a C-based shared library. + If the C library is not available, it falls back to a pure Python implementation. + """ + # Load the correct shared library based on system architecture + lib = ctypes.CDLL(get_arch_lib('libz85b')) + + # Define function prototypes + lib.encode_z85b.argtypes = [POINTER(c_ubyte), c_size_t, POINTER(c_size_t)] + lib.encode_z85b.restype = POINTER(c_ubyte) + + lib.decode_z85b.argtypes = [POINTER(c_ubyte), c_size_t, POINTER(c_size_t)] + lib.decode_z85b.restype = POINTER(c_ubyte) + + lib.free.argtypes = [ctypes.c_void_p] # Add free function for memory cleanup + + @classmethod + def encode(cls, data: bytes) -> bytes: + """ + Encodes the input data into Z85B format. + + Args: + data (bytes): The raw data to encode. + + Returns: + bytes: The Z85B-encoded data. + + Raises: + ValueError: If encoding fails. + """ + output_len = c_size_t(0) + encoded_data = cls.lib.encode_z85b((c_ubyte * len(data))(*data), len(data), byref(output_len)) + if not encoded_data: + raise ValueError("Encoding failed") + + try: + return ctypes.string_at(encoded_data, output_len.value) + finally: + cls.lib.free(encoded_data) + + @classmethod + def decode(cls, encoded_data: bytes) -> bytes: + """ + Decodes the input Z85B-encoded data into raw bytes. + + Args: + encoded_data (bytes): The Z85B-encoded data to decode. + + Returns: + bytes: The decoded raw data. + + Raises: + ValueError: If decoding fails. + """ + output_len = c_size_t(0) + decoded_data = cls.lib.decode_z85b((c_ubyte * len(encoded_data))(*encoded_data), len(encoded_data), + byref(output_len)) + if not decoded_data: + raise ValueError("Decoding failed") + + try: + return ctypes.string_at(decoded_data, output_len.value) + finally: + cls.lib.free(decoded_data) +except Exception as e: + logging.warning(f"Z85B C library not available: {e}. Falling back to pure Python implementation.") + from .z85b import Z85B + +if __name__ == "__main__": + + from zbase.b91 import B91 as B91py + from zbase.z85b import Z85B as Z85Bpy + from zbase.z85p import Z85P as Z85Ppy + + + def test_b91(s=b"Hello, Base91!"): + # Example usage: + try: + pencoded = B91py.encode(s) + print("Encoded py:", pencoded) + pdecoded = B91py.decode(pencoded) + print("Decoded py:", pdecoded) + + encoded = B91.encode(s) + print("Encoded:", encoded) + decoded = B91.decode(encoded) + print("Decoded:", decoded) + + assert pdecoded == decoded + assert pencoded == encoded + except Exception as e: + print(f"Error: {e}") + + + def test_z85b(s=b"Hello, Z85B!"): + try: + pencoded = Z85Bpy.encode(s) + print("Encoded py:", pencoded) + pdecoded = Z85Bpy.decode(pencoded) + print("Decoded py:", pdecoded) + + encoded = Z85B.encode(s) + print("Encoded:", encoded) + decoded = Z85B.decode(encoded) + print("Decoded:", decoded) + + assert pdecoded == decoded + assert pencoded == encoded + except Exception as e: + print(f"Error: {e}") + + + def test_z85p(s=b"Hello, Z85P!"): + try: + pencoded = Z85Ppy.encode(s) + print(f"Encoded py: {pencoded}") + pdecoded = Z85Ppy.decode(pencoded) + print(f"Decoded py: {pdecoded.decode('utf-8')}") + + encoded = Z85P.encode(s) + print(f"Encoded: {encoded}") + decoded = Z85P.decode(encoded) + print(f"Decoded: {decoded.decode('utf-8')}") + + assert pdecoded == decoded + assert pencoded == encoded + except Exception as e: + print(f"Error: {e}") + + + test_b91() + test_z85p() + test_z85b() diff --git a/zbase/b91.py b/zbase/b91.py new file mode 100644 index 0000000..6ce0773 --- /dev/null +++ b/zbase/b91.py @@ -0,0 +1,100 @@ +from typing import Union + + +class B91: + ALPHABET = [ + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', + 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', + 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', + 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', + '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '#', '$', + '%', '&', '(', ')', '*', '+', ',', '.', '/', ':', ';', '<', '=', + '>', '?', '@', '[', ']', '^', '_', '`', '{', '|', '}', '~', '"' + ] + + DECODE_TABLE = {char: idx for idx, char in enumerate(ALPHABET)} + + @classmethod + def decode(cls, encoded_data: Union[str, bytes], encoding: str = "utf-8") -> bytes: + """ + Decodes a Base91-encoded string into its original binary form. + + Args: + encoded_data (Union[str, bytes]): Base91-encoded input data. If `bytes`, it is decoded as UTF-8. + encoding (str): The encoding to use if `encoded_data` is provided as a string. Default is 'utf-8'. + + Returns: + bytes: The decoded binary data. + + Raises: + ValueError: If the input contains invalid Base91 characters. + """ + if isinstance(encoded_data, bytes): + encoded_data = encoded_data.decode(encoding) + + v = -1 + b = 0 + n = 0 + out = bytearray() + + for char in encoded_data: + if char not in cls.DECODE_TABLE: + raise ValueError(f"Invalid Base91 character: {char}") + c = cls.DECODE_TABLE[char] + if v < 0: + v = c + else: + v += c * 91 + b |= v << n + n += 13 if (v & 8191) > 88 else 14 + while n >= 8: + out.append(b & 255) + b >>= 8 + n -= 8 + v = -1 + + if v >= 0: + out.append((b | v << n) & 255) + + return bytes(out) + + @classmethod + def encode(cls, data: Union[bytes, str], encoding: str = "utf-8") -> bytes: + """ + Encodes binary data into a Base91-encoded string. + + Args: + data (Union[bytes, str]): Input binary data to encode. If `str`, it is encoded as UTF-8. + encoding (str): The encoding to use if `data` is provided as a string. Default is 'utf-8'. + + Returns: + str: The Base91-encoded string. + """ + if isinstance(data, str): + data = data.encode(encoding) + + b = 0 + n = 0 + out = [] + + for byte in data: + b |= byte << n + n += 8 + if n > 13: + v = b & 8191 + if v > 88: + b >>= 13 + n -= 13 + else: + v = b & 16383 + b >>= 14 + n -= 14 + out.append(cls.ALPHABET[v % 91]) + out.append(cls.ALPHABET[v // 91]) + + if n: + out.append(cls.ALPHABET[b % 91]) + if n > 7 or b > 90: + out.append(cls.ALPHABET[b // 91]) + + return ''.join(out).encode(encoding) diff --git a/zbase/bench.py b/zbase/bench.py new file mode 100644 index 0000000..7cd398d --- /dev/null +++ b/zbase/bench.py @@ -0,0 +1,227 @@ +import base64 +import pybase64 +import random +import string +import time +from typing import Callable, Dict, List, Tuple + +import click +from z85base91 import Z85P, Z85B, B91 +from hivemind_bus_client.encodings import Z85B as Z85Bpy, Z85P as Z85Ppy, B91 as B91py +from tabulate import tabulate + + +def get_encoder(encoding: str) -> Callable[[bytes], bytes]: + """Retrieve the encoder function for the given encoding.""" + encoders = { + "base64": base64.b64encode, + "base64_py": pybase64.b64encode, + "z85b": Z85B.encode, + "z85p": Z85P.encode, + "base91": B91.encode, + "z85b_py": Z85Bpy.encode, + "z85p_py": Z85Ppy.encode, + "base91_py": B91py.encode, + "base32": base64.b32encode + } + return encoders[encoding] + + +def get_decoder(encoding: str) -> Callable[[bytes], bytes]: + """Retrieve the decoder function for the given encoding.""" + decoders = { + "base64": base64.b64decode, + "base64_py": pybase64.b64decode, + "z85b": Z85B.decode, + "z85p": Z85P.decode, + "base91": B91.decode, + "z85b_py": Z85Bpy.decode, + "z85p_py": Z85Ppy.decode, + "base91_py": B91py.decode, + "base32": base64.b32decode + } + return decoders[encoding] + + +def generate_random_data(size: int) -> bytes: + """Generate random binary data of a given size.""" + return ''.join(random.choices(string.ascii_letters + string.digits, k=size)).encode("utf-8") + + +def benchmark_encoding(encoding: str, data: bytes) -> Dict[str, int]: + """Benchmark encoding and decoding for a given encoding.""" + encoder = get_encoder(encoding) + decoder = get_decoder(encoding) + + # Measure encoding time in nanoseconds + start_time = time.perf_counter_ns() + encoded_data = encoder(data) + encoding_time = time.perf_counter_ns() - start_time + + # Measure decoding time in nanoseconds + start_time = time.perf_counter_ns() + decoded_data = decoder(encoded_data) + decoding_time = time.perf_counter_ns() - start_time + + # Validate decoding + if decoded_data != data: + raise ValueError(f"Decoded data does not match for encoding {encoding}.") + + # Calculate size increase + original_size = len(data) + encoded_size = len(encoded_data) + size_increase = encoded_size / original_size + + return { + "encoding_time": encoding_time, + "decoding_time": decoding_time, + "size_increase": size_increase, + } + + +def get_rankings(metric: Dict[str, Dict[str, int]], key: str) -> List[Tuple[str, int]]: + """Rank the encodings based on the provided metric, handling ties.""" + sorted_encodings = sorted(metric.items(), key=lambda x: x[1][key], reverse=False) + rankings = [] + current_rank = 1 # Start from rank 1 + + for i in range(len(sorted_encodings)): + if i > 0 and sorted_encodings[i][1][key] == sorted_encodings[i - 1][1][key]: + # Tie case: Same rank as the previous item + rankings.append((sorted_encodings[i][0], rankings[-1][1])) + else: + # No tie, increase the rank + rankings.append((sorted_encodings[i][0], current_rank)) + current_rank += 1 # Increment rank only when no tie + + return rankings + + +def compare_python_c(encoding: str, python_results: Dict[str, Dict[str, int]], + c_results: Dict[str, Dict[str, int]]) -> float: + """Compare the speed between Python and C encodings and calculate how many times faster or slower it is.""" + if encoding.endswith("_py"): + c_encoding = encoding[:-3] # Remove the _py suffix to get the C counterpart + python_time = python_results[encoding]["encoding_time"] + c_time = c_results[c_encoding]["encoding_time"] + + if c_time == 0: + return float('inf') # Avoid division by zero + return python_time / c_time + return 1.0 # If not a Python version, return 1 (no comparison) + + +@click.command() +@click.option("--sizes", default="100,1000,10000", help="Comma-separated list of data sizes to test.") +@click.option("--iterations", default=10, help="Number of iterations for each test.") +def main(sizes: str, iterations: int): + sizes = list(map(int, sizes.split(","))) + + encodings = [ + "base64", + "base32", + "base64_py", + "z85b", + "z85p", + "base91", + "z85b_py", + "z85p_py", + "base91_py", + ] + + results = {size: {encoding: [] for encoding in encodings} for size in sizes} + + # Run benchmarks + for size in sizes: + print(f"Testing size: {size} bytes") + for _ in range(iterations): + data = generate_random_data(size) + for encoding in encodings: + result = benchmark_encoding(encoding, data) + results[size][encoding].append(result) + + # Calculate averages and print results + global_ranking = {encoding: {"encoding_time": 0, "decoding_time": 0, "size_increase": 0} for encoding in encodings} + table = [] + + # Aggregate results across all sizes + for encoding in encodings: + total_encoding_time = 0 + total_decoding_time = 0 + total_size_increase = 0 + + for size, encoding_results in results.items(): + avg_encoding_time = sum(m["encoding_time"] for m in encoding_results[encoding]) // iterations + avg_decoding_time = sum(m["decoding_time"] for m in encoding_results[encoding]) // iterations + avg_size_increase = sum(m["size_increase"] for m in encoding_results[encoding]) / iterations + + total_encoding_time += avg_encoding_time + total_decoding_time += avg_decoding_time + total_size_increase += avg_size_increase + + table.append([ + encoding, + f"{avg_encoding_time} ns", + f"{avg_decoding_time} ns", + f"{avg_size_increase:.2f}x size increase" + ]) + + # Store global averages + global_ranking[encoding]["encoding_time"] = total_encoding_time // len(sizes) + global_ranking[encoding]["decoding_time"] = total_decoding_time // len(sizes) + global_ranking[encoding]["size_increase"] = total_size_increase / len(sizes) + + # Global ranking (based on average times) + print("\n### Global Ranking (Merged) ###") + + # Get rankings for encoding time, decoding time, and size increase + sorted_by_encoding_time = get_rankings(global_ranking, "encoding_time") + sorted_by_decoding_time = get_rankings(global_ranking, "decoding_time") + sorted_by_size_increase = get_rankings(global_ranking, "size_increase") + + merged_table = [] + for encoding, metrics in global_ranking.items(): + encoding_time_rank = next(rank for enc, rank in sorted_by_encoding_time if enc == encoding) + decoding_time_rank = next(rank for enc, rank in sorted_by_decoding_time if enc == encoding) + size_increase_rank = next(rank for enc, rank in sorted_by_size_increase if enc == encoding) + + # Calculate the average rank + avg_rank = (encoding_time_rank + decoding_time_rank + size_increase_rank) / 3 + + # Medal assignments + encoding_time_medal = "🥇" if encoding_time_rank == 1 else "🥈" if encoding_time_rank == 2 else "🥉" if encoding_time_rank == 3 else "" + decoding_time_medal = "🥇" if decoding_time_rank == 1 else "🥈" if decoding_time_rank == 2 else "🥉" if decoding_time_rank == 3 else "" + size_increase_medal = "🥇" if size_increase_rank == 1 else "🥈" if size_increase_rank == 2 else "🥉" if size_increase_rank == 3 else "" + + # Add the top-ranked emojis + merged_table.append([ + encoding, + f"{metrics['encoding_time']} ns", + f"{metrics['decoding_time']} ns", + f"{metrics['size_increase']:.2f}x", + f"{encoding_time_rank} {encoding_time_medal}", + f"{decoding_time_rank} {decoding_time_medal}", + f"{size_increase_rank} {size_increase_medal}", + avg_rank + ]) + + # Pairwise comparison for Python vs C + if encoding.endswith("_py"): # If it's a Python version + speed_comparison = compare_python_c(encoding, global_ranking, global_ranking) + if float(speed_comparison) > 1: + merged_table[-1].append(f"{speed_comparison:.2f}x slower") + else: + merged_table[-1].append(f"{1/speed_comparison:.2f}x faster") + + # Sort the merged table based on the average rank + merged_table.sort(key=lambda x: float(str(x[-2]).split()[0]), reverse=False) + + # Display the final table + print(tabulate(merged_table, + headers=["Encoding", "Avg Encoding Time (ns)", "Avg Decoding Time (ns)", "Avg Size Increase", + "Encoding Rank", "Decoding Rank", "Size Increase Rank", "Score", "Reference vs Optimized"], + tablefmt="grid")) + + +if __name__ == "__main__": + main() diff --git a/zbase/libbase91-aarch64.so b/zbase/libbase91-aarch64.so new file mode 100755 index 0000000..8a85767 Binary files /dev/null and b/zbase/libbase91-aarch64.so differ diff --git a/zbase/libbase91-i386.so b/zbase/libbase91-i386.so new file mode 100755 index 0000000..a999f33 Binary files /dev/null and b/zbase/libbase91-i386.so differ diff --git a/z85base91/libbase91.so b/zbase/libbase91-x86_64.so similarity index 87% rename from z85base91/libbase91.so rename to zbase/libbase91-x86_64.so index f4e3e26..764107e 100755 Binary files a/z85base91/libbase91.so and b/zbase/libbase91-x86_64.so differ diff --git a/zbase/libz85b-aarch64.so b/zbase/libz85b-aarch64.so new file mode 100755 index 0000000..ff8baff Binary files /dev/null and b/zbase/libz85b-aarch64.so differ diff --git a/zbase/libz85b-i386.so b/zbase/libz85b-i386.so new file mode 100755 index 0000000..494dc39 Binary files /dev/null and b/zbase/libz85b-i386.so differ diff --git a/z85base91/libz85b.so b/zbase/libz85b-x86_64.so similarity index 100% rename from z85base91/libz85b.so rename to zbase/libz85b-x86_64.so diff --git a/zbase/libz85p-aarch64.so b/zbase/libz85p-aarch64.so new file mode 100755 index 0000000..c634c7a Binary files /dev/null and b/zbase/libz85p-aarch64.so differ diff --git a/zbase/libz85p-i386.so b/zbase/libz85p-i386.so new file mode 100755 index 0000000..12eebca Binary files /dev/null and b/zbase/libz85p-i386.so differ diff --git a/z85base91/libz85p.so b/zbase/libz85p-x86_64.so similarity index 100% rename from z85base91/libz85p.so rename to zbase/libz85p-x86_64.so diff --git a/zbase/version.py b/zbase/version.py new file mode 100644 index 0000000..76c4342 --- /dev/null +++ b/zbase/version.py @@ -0,0 +1,6 @@ +# START_VERSION_BLOCK +VERSION_MAJOR = 0 +VERSION_MINOR = 0 +VERSION_BUILD = 1 +VERSION_ALPHA = 0 +# END_VERSION_BLOCK diff --git a/zbase/z85b.py b/zbase/z85b.py new file mode 100644 index 0000000..50c89a0 --- /dev/null +++ b/zbase/z85b.py @@ -0,0 +1,108 @@ +""" +Python implementation of Z85b 85-bit encoding. + +Z85b is a variation of ZMQ RFC 32 Z85 85-bit encoding with the following differences: +1. Little-endian encoding (to facilitate alignment with lower byte indices). +2. No requirement for a multiple of 4/5 length. +3. `decode_z85b()` eliminates whitespace from the input. +4. `decode_z85b()` raises a clear exception if invalid characters are encountered. + +This file is a derivative work of https://gist.github.com/minrk/6357188?permalink_comment_id=2366506#gistcomment-2366506 + +Copyright (c) 2013 Brian Granger, Min Ragan-Kelley +Distributed under the terms of the New BSD License. +""" +import re +import struct +from typing import Union + +from hivemind_bus_client.exceptions import Z85DecodeError + + +class Z85B: + # Z85CHARS is the base 85 symbol table + Z85CHARS = bytearray(b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#") + + # Z85MAP maps integers in [0, 84] to the appropriate character in Z85CHARS + Z85MAP = {char: idx for idx, char in enumerate(Z85CHARS)} + + # Powers of 85 for encoding/decoding + _85s = [85 ** i for i in range(5)] + + # Padding lengths for encoding and decoding + _E_PADDING = [0, 3, 2, 1] + _D_PADDING = [0, 4, 3, 2, 1] + + @classmethod + def encode(cls, data: Union[str, bytes], encoding: str = "utf-8") -> bytes: + """ + Encode raw bytes into Z85b format. + + Args: + data (Union[str, bytes]): Input data to encode. + encoding (str): The encoding to use if `data` is provided as a string. Default is 'utf-8'. + + Returns: + bytes: Z85b-encoded bytes. + """ + if isinstance(data, str): + data = data.encode(encoding) + data = bytearray(data) + padding = cls._E_PADDING[len(data) % 4] + data += b'\x00' * padding + nvalues = len(data) // 4 + + # Pack the raw bytes into little-endian 32-bit integers + values = struct.unpack(f'<{nvalues}I', data) + encoded = bytearray() + + for value in values: + for offset in cls._85s: + encoded.append(cls.Z85CHARS[(value // offset) % 85]) + + # Remove padding characters from the encoded output + if padding: + encoded = encoded[:-padding] + return bytes(encoded) + + @classmethod + def decode(cls, encoded_data: Union[str, bytes], encoding: str = "utf-8") -> bytes: + """ + Decode Z85b-encoded bytes into raw bytes. + + Args: + encoded_data (Union[str, bytes]): Z85b-encoded data. + encoding (str): The encoding to use if `encoded_data` is provided as a string. Default is 'utf-8'. + + Returns: + bytes: Decoded raw bytes. + + Raises: + Z85DecodeError: If invalid characters are encountered during decoding. + """ + # Normalize input by removing whitespace + encoded_data = bytearray(re.sub(rb'\s+', b'', + encoded_data if isinstance(encoded_data, bytes) + else encoded_data.encode(encoding))) + padding = cls._D_PADDING[len(encoded_data) % 5] + nvalues = (len(encoded_data) + padding) // 5 + + values = [] + for i in range(0, len(encoded_data), 5): + value = 0 + for j, offset in enumerate(cls._85s): + try: + value += cls.Z85MAP[encoded_data[i + j]] * offset + except IndexError: + break # End of input reached + except KeyError as e: + raise Z85DecodeError(f"Invalid byte code: {e.args[0]!r}") + values.append(value) + + # Unpack the values back into raw bytes + decoded = struct.pack(f'<{nvalues}I', *values) + + # Remove padding from the decoded output + if padding: + decoded = decoded[:-padding] + return decoded diff --git a/zbase/z85p.py b/zbase/z85p.py new file mode 100644 index 0000000..3298a17 --- /dev/null +++ b/zbase/z85p.py @@ -0,0 +1,88 @@ +from typing import Union +import struct + +class Z85P: + """ + Z85 is a class that provides encoding and decoding methods for transforming raw bytes into the Z85 encoding format. + Z85 encoding represents 32-bit chunks of input bytes into a base85-encoded string with padding applied. + The padding is added to ensure the encoded data's length is a multiple of 4 characters. + The first byte of the encoded data indicates how many padding characters were added, which can be removed during decoding. + """ + Z85CHARS = b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ.-:+=^!/*?&<>()[]{}@%$#" + Z85MAP = {c: idx for idx, c in enumerate(Z85CHARS)} + + _85s = [85 ** i for i in range(5)][::-1] + + @classmethod + def encode(cls, rawbytes: Union[str, bytes]) -> bytes: + """ + Encodes raw bytes into Z85 encoding format with padding, and prepends the padding size. + + Args: + rawbytes (Union[str, bytes]): The input raw bytes to be encoded. + + Returns: + bytes: The Z85-encoded byte sequence with appropriate padding and padding size indication. + + Notes: + The padding is applied to ensure the length of the encoded data is a multiple of 5. The first byte in the + returned byte sequence represents the number of padding characters added. + """ + if isinstance(rawbytes, str): + rawbytes = rawbytes.encode("utf-8") + + padding = (4 - len(rawbytes) % 4) % 4 # Padding to make the length a multiple of 4 + rawbytes += b'\x00' * padding + + # The first byte indicates how many padding characters were added + nvalues = len(rawbytes) // 4 + values = struct.unpack('>%dI' % nvalues, rawbytes) + encoded = [padding] + + for v in values: + for offset in cls._85s: + encoded.append(cls.Z85CHARS[(v // offset) % 85]) + + return bytes(encoded) + + @classmethod + def decode(cls, z85bytes: Union[str, bytes]) -> bytes: + """ + Decodes a Z85-encoded byte sequence back into raw bytes, removing padding as indicated by the first byte. + + Args: + z85bytes (Union[str, bytes]): The Z85-encoded byte sequence to be decoded. + + Returns: + bytes: The decoded raw byte sequence with padding removed. + + Raises: + ValueError: If the length of the input data is not divisible by 5 or contains invalid Z85 encoding. + + Notes: + The first byte of the encoded data indicates the padding size, and this padding is removed during decoding. + """ + if isinstance(z85bytes, str): + z85bytes = z85bytes.encode("utf-8") + + if len(z85bytes) == 0: + return z85bytes + + if len(z85bytes) % 5 != 1: + raise ValueError('Invalid data length, should be divisible by 5 with 1 extra byte for padding indicator.') + + padding = z85bytes[0] # Read the padding size from the first byte + if padding < 0 or padding > 4: + raise ValueError('Padding size must be between 0 and 4.') + + z85bytes = z85bytes[1:] # Remove the first byte (padding size byte) + + values = [] + for i in range(0, len(z85bytes), 5): + value = 0 + for j, offset in enumerate(cls._85s): + value += cls.Z85MAP[z85bytes[i + j]] * offset + values.append(value) + + decoded = struct.pack('>%dI' % len(values), *values) + return decoded[:-padding] if padding else decoded # Remove padding