Skip to content

Commit 2eebc16

Browse files
committed
Update README, document stuff, update gitignore
1 parent a893b38 commit 2eebc16

File tree

3 files changed

+196
-83
lines changed

3 files changed

+196
-83
lines changed

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
# zig compile outputs
12
zig-cache/
23
zig-out/
34

5+
# compressed and decompressed files (testing)
6+
*.bin
7+
*.orig
8+
9+
# executables and object files
10+
gallop
11+
gallop.o

README.md

Lines changed: 37 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,41 @@
11
# Gallop
22

3-
A quick and dirty CM compressor for prototyping built with zig.
3+
A lightweight CM compressor for prototyping built with zig, roughly based on [lpaq](http://mattmahoney.net/dc/#lpaq).
44
It *gallops*!
55

6-
Check out [weath3rb0i](https://github.com/Mitiko/weath3rb0i) as well.
6+
Currently: prediction is bitwise, model is order0-ish (12-bit context) with simple 12-bit counters.
7+
8+
## Usage
9+
10+
```bash
11+
./gallop c /data/book1 book1.bin
12+
./gallop d book1.bin book1.orig
13+
cmp book1.orig /data/book1
14+
```
15+
16+
## Goals
17+
18+
- Compress book1 in <2 s, enwik8 in <3 mins
19+
- Under 1KLoC (but readable, lpaq style)
20+
- Under 1GB memory usage
21+
- Optimized for compression ratio
22+
23+
## TODO
24+
25+
- Add a magic number (I propose b"gllp" = 0x676c_6c70)
26+
- Use (12-bit) states instead of counters
27+
- Add state map
28+
- Match model?
29+
- Delayed counters?
30+
- Entropy hashing?
31+
32+
### Build
33+
34+
```bash
35+
zig build-exe gallop.zig
36+
```
37+
38+
### References
39+
40+
Roughly based on [lpaq](http://mattmahoney.net/dc/#lpaq).
41+
Check out my other compressor [weath3rb0i](https://github.com/Mitiko/weath3rb0i).

gallop.zig

Lines changed: 151 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,13 @@
11
const std = @import("std");
22
const print = std.debug.print;
3-
4-
const Mode = enum { c, d };
5-
6-
pub fn main() !void {
7-
var args = std.process.args();
8-
_ = args.skip(); const arg = args.next();
9-
if (arg == null) { print("Error: No args passed. Pass -c for compression , -d for decompression\n", .{}); std.os.exit(1); }
10-
11-
const mode = if (std.mem.eql(u8, arg.?, "-d")) Mode.d
12-
else if (std.mem.eql(u8, arg.?, "-c")) Mode.c
13-
else null;
14-
if (mode == null) { print("Error: Invalid arg. Pass -c for compression , -d for decompression\n", .{}); std.os.exit(2); }
15-
16-
var bufw = std.io.bufferedWriter(std.io.getStdOut().writer());
17-
var writer = std.io.bitWriter(.Big, bufw.writer());
18-
var model = Model.init();
19-
20-
if (mode.? == .c) {
21-
const fileName = args.next();
22-
if (fileName == null) { print("Error: To compress use -c fileName\n", .{}); std.os.exit(3); }
23-
var path_buffer: [std.fs.MAX_PATH_BYTES]u8 = undefined;
24-
const path = try std.fs.realpathZ(fileName.?, &path_buffer);
25-
const file = try std.fs.openFileAbsolute(path, .{});
26-
defer file.close();
27-
const size = (try file.stat()).size;
28-
var bufr = std.io.bufferedReader(file.reader());
29-
var reader = std.io.bitReader(.Big, bufr.reader());
30-
31-
try writer.writeBits(size, 64);
32-
var ac = initAC(writer, Mode.c);
33-
while (true) {
34-
const bit = reader.readBitsNoEof(u1, 1) catch { break; };
35-
try ac.encode(bit, model.p());
36-
model.update(bit);
37-
}
38-
try ac.flush();
39-
try bufw.flush();
40-
} else {
41-
var bufr = std.io.bufferedReader(std.io.getStdIn().reader());
42-
var reader = std.io.bitReader(.Big, bufr.reader());
43-
const size = try reader.readBitsNoEof(u64, 64);
44-
var ac = initAC(reader, Mode.d);
45-
46-
var i: u64 = 0;
47-
while (i / 8 < size) : (i += 1) {
48-
const bit = ac.decode(model.p());
49-
try writer.writeBits(bit, 1);
50-
model.update(bit);
51-
}
52-
try bufw.flush();
53-
}
54-
}
55-
3+
const assert = std.debug.assert;
4+
const File = std.fs.File;
5+
6+
/// ============================= Model =============================
7+
/// A simple order0-ish model (with 12-bit context)
8+
/// To be replaced with mixer + micromodels
9+
/// MicroModels will share 2 hashtables, and hopefully 2 statetables - for big contexts, and for small ones
10+
/// Mixer should use vectors (since zig is chill like that)
5611
const Model = struct {
5712
ctx: u12,
5813
data: [1 << 12]Counter,
@@ -63,90 +18,205 @@ const Model = struct {
6318
pub fn p(self: Self) u16 { return self.data[self.ctx].p(); }
6419
pub fn update(self: *Self, bit: u1) void {
6520
self.data[self.ctx].update(bit);
66-
self.ctx <<= 1; self.ctx |= bit; self.ctx &= (1 << 12) - 1;
21+
self.ctx <<= 1; self.ctx |= bit;
6722
}
6823
};
6924

25+
/// ============================= Counter =============================
26+
/// A simple u12 counter (takes 3-bytes of memory)
27+
/// To be replaced with a state table + state map
7028
const Counter = struct {
71-
// c0: u16, c1: u16,
72-
c0: u12, c1: u12,
29+
counts: [2]u12,
7330

7431
const Self = @This();
75-
pub fn init() Self { return Self { .c0 = 0, .c1 = 0 }; }
32+
pub fn init() Self { return Self { .counts = [_]u12 { 0, 0 } }; }
7633
pub fn p(self: Self) u16 {
77-
const n0 = @as(u64, self.c0);
78-
const n1 = @as(u64, self.c1);
34+
const n0 = @as(u64, self.counts[0]);
35+
const n1 = @as(u64, self.counts[1]);
7936
return @intCast(u16, (1 << 16) * (n1 + 1) / (n1 + n0 + 2));
8037
}
8138
pub fn update(self: *Self, bit: u1) void {
82-
// const maxCount = (1 << 16) - 1;
8339
const maxCount = (1 << 12) - 1;
84-
if (self.c0 == maxCount or self.c1 == maxCount) {
85-
self.c0 >>= 1;
86-
self.c1 >>= 1;
40+
if (self.counts[0] == maxCount or self.counts[1] == maxCount) {
41+
self.counts[0] >>= 1;
42+
self.counts[1] >>= 1;
8743
}
88-
if (bit == 1) self.c1 += 1 else self.c0 += 1;
44+
self.counts[bit] += 1;
8945
}
9046
};
9147

92-
fn initAC(writer: anytype, comptime mode: Mode) ArithmeticCoder(@TypeOf(writer), mode) { return ArithmeticCoder(@TypeOf(writer), mode).init(writer); }
48+
/// ============================= Arithmetic coder =============================
49+
/// 32-bit (binary) arithmetic coder
50+
/// Use `initAC(writer, Mode.c)` for encoding, and `initAC(reader, Mode.d)` for decoding
51+
/// Initializing in wrong mode wouldn't compile because of the way zig emulates generics
52+
/// To encode: `try ac.encode(bit, p1)`, To decode: `const bit = ac.decode(p1)`
53+
/// Expected io is `std.io.BitReader` or `std.io.BitWriter`
54+
/// `flush()` should be called exactly once
55+
const Mode = enum { c, d }; // (compression, decompression) = (encode, decode)
56+
57+
fn initAC(writer: anytype, comptime mode: Mode) ArithmeticCoder(@TypeOf(writer), mode) {
58+
return ArithmeticCoder(@TypeOf(writer), mode).init(writer);
59+
}
60+
9361
fn ArithmeticCoder(comptime T: type, comptime mode: Mode) type { return struct {
9462
io:T, x: if (mode == Mode.d) u32 else void,
9563
revBits: if (mode == Mode.c) u64 else void,
9664
x1: u32 = 0, x2: u32 = (1 << 32) - 1,
9765

9866
const Self = @This();
99-
pub fn init(io: T) Self {
67+
const Q1: u32 = 1 << 30; const PREC_SHIFT: u32 = 31;
68+
const Q2: u32 = 2 << 30; const RLOW_MOD: u32 = (1 << 31) - 1; // Modify x1 bits in E3 mapping, AND with
69+
const Q3: u32 = 3 << 30; const RHIGH_MOD: u32 = (1 << 31) + 1; // Modify x2 bits in E3 mapping, OR with
70+
71+
pub fn init(io: T) Self { // initialize fields, read state in decode mode
10072
var self = if (mode == .c) Self { .io = io, .revBits = 0, .x = {} }
10173
else if (mode == .d) Self { .io = io, .x = 0, .revBits = {} };
10274
if (mode == .d) self.readState();
10375
return self;
10476
}
105-
pub fn encode(self: *Self, bit: u1, p: u16) !void { return self.code(bit, p); }
106-
pub fn decode(self: *Self, p: u16) u1 { return self.code({}, p); }
107-
pub fn flush(self: *Self) !void {
108-
try self.writeBit(self.x2 >> 31);
77+
pub fn encode(self: *Self, bit: u1, p: u16) !void { return self.proc(bit, p); }
78+
pub fn decode(self: *Self, p: u16) u1 { return self.proc({}, p); }
79+
pub fn flush(self: *Self) !void { // flush leading byte to stream
80+
comptime { assert(mode == .c); }
81+
try self.writeBit(self.x2 >> PREC_SHIFT);
10982
while (self.io.bit_count != 0) {
11083
self.x2 <<= 1; try self.writeBit(self.x2 >> 31);
11184
}
11285
}
11386

114-
fn readBit(self: *Self) u1 { return self.io.readBitsNoEof(u1, 1) catch 0; }
115-
fn incParity(self: *Self) void { self.revBits += 1; }
116-
fn writeBit(self: *Self, bit: u32) !void {
87+
fn readBit(self: *Self) u1 { return self.io.readBitsNoEof(u1, 1) catch 0; } // TODO: return 0 only on EOF, otherwise return error
88+
fn incParity(self: *Self) void { self.revBits += 1; } // for E3 mapping
89+
fn writeBit(self: *Self, bit: u32) !void { // writes bit, conscious of any E3 mappings
11790
try self.io.writeBits(bit, 1);
11891
while (self.revBits > 0) {
11992
try self.io.writeBits(bit ^ 1, 1);
12093
self.revBits -= 1;
12194
}
12295
}
123-
fn readState(self: *Self) void {
96+
fn readState(self: *Self) void { // reads 32-bits into state and pads with zeroes if necessary
12497
var bitsRead: usize = 0;
12598
var state = self.io.readBits(u32, 32, &bitsRead) catch 0;
12699
self.x = state << @intCast(u5, 32 - bitsRead);
127100
}
128101

129-
fn code(self: *Self, bit_: if (mode == .d) void else u1, prob: u16) if (mode == .d) u1 else anyerror!void {
102+
// processes a single bit -> decompresses a bit in decode mode, compresses a bit in encode mode
103+
fn proc(self: *Self, bit_: if (mode == .d) void else u1, prob: u16) if (mode == .d) u1 else anyerror!void {
130104
const p = if (prob == 0) 1 else @as(u64, prob) << 16;
131105
const xmid = @intCast(u32, self.x1 + ((@as(u64, self.x2 - self.x1) * p) >> 32));
132106

133107
const bit = if (mode == .c) bit_ else @boolToInt(self.x <= xmid);
134108
if (bit == 1) self.x2 = xmid else self.x1 = xmid + 1;
135109

136-
while ((self.x1 ^ self.x2) >> 31 == 0) {
137-
if (mode == .c) try self.writeBit(self.x1 >> 31)
110+
while ((self.x1 ^ self.x2) >> PREC_SHIFT == 0) {
111+
if (mode == .c) try self.writeBit(self.x1 >> PREC_SHIFT)
138112
else self.x = (self.x << 1) | self.readBit();
139113
self.x1 <<= 1;
140114
self.x2 = (self.x2 << 1) | 1;
141115
}
142116

143-
while (self.x1 >= (1 << 30) and self.x2 < (3 << 30)) {
117+
while (self.x1 >= Q1 and self.x2 < Q3) {
144118
if (mode == .c) self.incParity()
145-
else self.x = ((self.x << 1) ^ (2 << 30)) | self.readBit();
146-
self.x1 = (self.x1 << 1) & ((1 << 31) - 1);
147-
self.x2 = (self.x2 << 1) | ((1 << 31) + 1);
119+
else self.x = ((self.x << 1) ^ Q2) | self.readBit();
120+
self.x1 = (self.x1 << 1) & RLOW_MOD;
121+
self.x2 = (self.x2 << 1) | RHIGH_MOD;
148122
}
149123

150124
if (mode == .d) return bit;
151125
}
152126
};}
127+
128+
/// ============================ User Interface =============================
129+
pub fn main() !void {
130+
var args = std.process.args();
131+
_ = args.skip(); // skip program invocation
132+
const mode = parseMode(args.next());
133+
const inFile = try parseFile(args.next(), FileOptions.read);
134+
const outFile = try parseFile(args.next(), FileOptions.create);
135+
defer inFile.close(); defer outFile.close();
136+
137+
var timer = try std.time.Timer.start();
138+
139+
var bufr = std.io.bufferedReader(inFile.reader());
140+
var bufw = std.io.bufferedWriter(outFile.writer());
141+
var reader = std.io.bitReader(.Big, bufr.reader());
142+
var writer = std.io.bitWriter(.Big, bufw.writer());
143+
var model = Model.init();
144+
145+
if (mode == .c) { // Compression
146+
const size = try getSize(inFile);
147+
try writer.writeBits(size, 64);
148+
149+
var ac = initAC(writer, Mode.c);
150+
while (true) {
151+
const bit = reader.readBitsNoEof(u1, 1) catch { break; };
152+
try ac.encode(bit, model.p());
153+
model.update(bit);
154+
}
155+
156+
try ac.flush();
157+
try bufw.flush();
158+
} else { // Decompression
159+
const size = try reader.readBitsNoEof(u64, 64);
160+
var i: u64 = 0;
161+
162+
var ac = initAC(reader, Mode.d);
163+
while (i / 8 < size) : (i += 1) {
164+
const bit = ac.decode(model.p());
165+
try writer.writeBits(bit, 1);
166+
model.update(bit);
167+
}
168+
169+
try bufw.flush();
170+
}
171+
172+
const ns = @intToFloat(f64, timer.lap());
173+
const inSize = try getSize(inFile);
174+
const outSize = try getSize(outFile);
175+
reportResult(mode, inSize, outSize, ns);
176+
}
177+
178+
fn parseMode(arg: ?[:0]const u8) Mode {
179+
if (arg == null) exit(1);
180+
const mode = if (std.mem.eql(u8, arg.?, "c")) Mode.c
181+
else if (std.mem.eql(u8, arg.?, "d")) Mode.d
182+
else null;
183+
if (mode == null) exit(2);
184+
return mode.?;
185+
}
186+
187+
const FileOptions = enum { create, read };
188+
fn parseFile(arg: ?[:0]const u8, options: FileOptions) !File {
189+
if (arg == null) exit(3);
190+
if (options == .create) return std.fs.cwd().createFileZ(arg.?, .{});
191+
var pathBuf: [std.fs.MAX_PATH_BYTES]u8 = undefined;
192+
const path = try std.fs.realpathZ(arg.?, &pathBuf);
193+
return std.fs.openFileAbsolute(path, .{});
194+
}
195+
196+
fn getSize(f: File) !u64 { return (try f.stat()).size; }
197+
198+
fn reportResult(mode: Mode, inSize: u64, outSize: u64, ns: f64) void {
199+
switch (mode) {
200+
.c => print("Compressed {} -> {} in ", .{inSize, outSize}),
201+
.d => print("Decompressed {} -> {} in ", .{inSize, outSize})
202+
}
203+
204+
if (ns < 1000) { print("{d:.0} ns\n", .{ns}); return; }
205+
const us = ns / 1000; if (us < 1000) { print("{d:.3} us\n", .{us}); return; }
206+
const ms = us / 1000; if (ms < 1000) { print("{d:.2} ms\n", .{ms}); return; }
207+
const s = ms / 1000; if (s < 300) { print("{d:.2} sec\n", .{s}); return; }
208+
const m = s / 60; if (m < 60) { print("{d:.2} mins\n", .{m}); return; }
209+
const h = m / 60; print("{d:.2} hr\n", .{h});
210+
}
211+
212+
fn exit(status: u8) void {
213+
print(
214+
\\gallop file compressor (C) 2022, Dimitar Rusev (mitiko)
215+
\\
216+
\\To compress: ./gallop c input output
217+
\\To decompress: ./gallop d input output
218+
\\Example: (./gallop c /data/book1 book1.bin) && (./gallop d book1.bin book1.orig) && (cmp book1.orig /data/book1)
219+
\\
220+
,.{});
221+
std.os.exit(status);
222+
}

0 commit comments

Comments
 (0)