34 # include <sys/mman.h>
42 #pragma GCC diagnostic ignored "-Wpedantic"
/* Magic string for the bmzip file header; the header parser reports
 * "bad magic in file header" when validation fails. */
44 #define BMZ_MAGIC "BMZ"
/* Format version of this tool; it is printed as four hex digits ("%04x").
 * NOTE(review): presumably compared against the version stored in the
 * header to raise the incompatible-version error -- confirm in
 * parse_bmz_header. */
45 #define BMZIP_VER 0x0110
/* Size in bytes of the file header. do_unpack rejects any input shorter
 * than this as truncated.
 * NOTE(review): the terms look like magic (3) + 16-bit version (2) +
 * options byte (1) + 48-bit original size (6) + 32-bit checksum (4);
 * confirm the field order against write_bmz_header. */
49 #define BMZ_HEADER_SZ (3 + 2 + 1 + 6 + 4)
/* Action code passed as the `action` argument of do_block.
 * main defaults to BMZ_A_PACK, which is defined outside this excerpt. */
52 #define BMZ_A_UNPACK 1
/* Option flag OR-ed into `options` by --bm-only, which the usage text
 * describes as "skip lz compression". */
55 #define BMZ_O_BM_ONLY 1
/* Second option flag value; it is not referenced in the visible part
 * of the file. */
56 #define BMZ_O_STREAM 2
/* Raw octet type used for header and buffer pointers. */
58 typedef unsigned char Byte;
/* Cast target for arguments printed with "%llu". */
61 typedef long long unsigned Llu;
/* Cast target for arguments printed with "%lu" (typically size_t values). */
62 typedef long unsigned Lu;
68 #define LOG(_lvl_, _fmt_, ...) if (s_verbosity >= _lvl_) do { \
69 fprintf(stderr, "bmzip: %s: " _fmt_, __FUNCTION__, ##__VA_ARGS__); \
70 if (errno) fprintf(stderr, ": %s", strerror(errno)); \
74 #define WARN(_fmt_, ...) do { \
75 LOG(0, "warning: " _fmt_, ##__VA_ARGS__); \
78 #define DIE(_fmt_, ...) do { \
79 LOG(0, "fatal: " _fmt_, ##__VA_ARGS__); \
/**
 * Round the address _mem_ up to the next multiple of _n_ bytes and yield it
 * as a Byte pointer.
 *
 * The result is always strictly greater than _mem_: an address that is
 * already a multiple of _n_ is advanced by a full _n_ bytes, so callers
 * must reserve up to _n_ bytes of slack.
 *
 * Both arguments are evaluated more than once and must be free of side
 * effects. Every argument use and the whole expansion are parenthesized so
 * the macro composes safely with surrounding operators (e.g. `*BMZ_ALIGN(p, n)`)
 * and with expression arguments (e.g. `BMZ_ALIGN(p, 1 << 3)`).
 */
#define BMZ_ALIGN(_mem_, _n_) \
  ((Byte *)(_mem_) + (_n_) - (((size_t)(_mem_)) % (_n_)))
85 #define BMZ_READ_INT16(_p_, _n_) \
86 _n_ = (*_p_++ << 8); \
89 #define BMZ_READ_INT32(_p_, _n_) \
90 _n_ = (*_p_++ << 24); \
91 _n_ |= (*_p_++ << 16); \
92 _n_ |= (*_p_++ << 8); \
95 #define BMZ_READ_INT48(_p_, _n_) \
96 _n_ = ((uint64_t)*_p_++ << 40); \
97 _n_ |= ((uint64_t)*_p_++ << 32); \
98 _n_ |= (*_p_++ << 24); \
99 _n_ |= (*_p_++ << 16); \
100 _n_ |= (*_p_++ << 8); \
103 #define BMZ_WRITE_INT16(_p_, _n_) \
104 *_p_++ = (Byte)(_n_ >> 8); \
107 #define BMZ_WRITE_INT32(_p_, _n_) \
108 *_p_++ = (Byte)(_n_ >> 24); \
109 *_p_++ = (Byte)(_n_ >> 16); \
110 *_p_++ = (Byte)(_n_ >> 8); \
113 #define BMZ_WRITE_INT48(_p_, _n_) \
114 *_p_++ = (Byte)(_n_ >> 40); \
115 *_p_++ = (Byte)(_n_ >> 32); \
116 *_p_++ = (Byte)(_n_ >> 24); \
117 *_p_++ = (Byte)(_n_ >> 16); \
118 *_p_++ = (Byte)(_n_ >> 8); \
129 uint32_t *checksum_p, uint32_t *options) {
130 const Byte *bp = buf;
134 DIE(
"bad magic in file header (%lu bytes)", (
Lu)magic_len);
140 DIE(
"incomaptible version: %04x", *version_p);
150 uint64_t orig_size = in_len;
167 uint64_t orig_size, size;
168 uint32_t checksum, options;
171 if (fstat(fd, &st) != 0)
DIE(
"error getting stat from file (%d)", fd);
176 printf(
"%8s%16s%16s%8s\n",
"version",
"compressed",
"uncompressed",
"ratio");
177 printf(
" %04x%16llu%16llu%7.2f%%\n", version, (
Llu)size,
178 (
Llu)orig_size, orig_size ? size * 100. / orig_size : 1);
182 do_pack(
const void *in,
size_t in_len,
size_t buf_len,
183 size_t offset,
size_t fp_len,
Byte options) {
187 Byte *out, *work_mem;
190 out_len = in_len + 1;
192 if (buf_len > in_len + worklen) {
193 out = (
Byte *)in + in_len;
194 work_mem = out + out_len;
197 out = malloc(worklen);
200 DIE(
"error allocating %lu bytes memory", (
Lu)worklen);
202 work_mem = out + out_len;
207 else if (buf_len > buflen + worklen) {
208 work_mem = (
Byte *)in + buflen;
212 out = malloc(buflen + worklen);
215 DIE(
"error allocating %lu bytes memory", (
Lu)buflen + worklen);
217 work_mem = out + buflen;
224 DIE(
"error encoding bm output (error %d)", ret);
228 WARN(
"error dumping bm encoding (ret=%d)", ret);
233 else if ((ret =
bmz_pack(in, in_len, out, &out_len, offset, fp_len,
236 DIE(
"error compressing input (error %d)", ret);
239 write(1, out, out_len);
243 do_unpack(
const void *in,
size_t in_len,
size_t buf_len) {
247 uint32_t checksum, cs, options;
252 if (in_len < BMZ_HEADER_SZ)
DIE(
"file truncated (size: %lu)", (
Lu)in_len);
256 if (orig_size > INT_MAX &&
sizeof(
size_t) == 4)
257 DIE(
"original file size %llu requires 64-bit version of bmzip",
266 DIE(
"checksum mismatch (expecting %x, got %x).", checksum, cs);
269 out = buf_len > in_len + orig_size ? (
Byte*)bp + len : malloc(outlen);
272 DIE(
"error decoding bm input (error %d)", ret);
276 out = (buf_len > outlen + worklen) ? (
Byte *)bp : malloc(outlen + worklen);
277 workmem = out + outlen;
280 DIE(
"error decompressing (error %d)", ret);
282 if (orig_size != outlen)
283 WARN(
"size mismatch (expecting %llu, got %llu)",
284 (
Llu)orig_size, (
Llu)outlen);
286 write(1, out, outlen);
290 do_block(
const void *in,
size_t len,
size_t buf_len,
size_t offset,
291 size_t fp_len,
int action,
int options) {
294 do_pack(in, len, buf_len, offset, fp_len, options);
300 DIE(
"unknown action: %d", action);
308 int64_t len = 0, size = 0, ret;
310 while ((ret = fread(buf, 1,
sizeof(buf), fp)) > 0) {
313 DIE(
"reading from stdin for data size greater than 2GB "
314 "not yet supported (current size: %lld)", (
long long)len);
317 size = (len + 16) * 5 / 2;
318 data = realloc(data, size);
320 memcpy(data + len - ret, buf, ret);
333 if (fstat(fd, &st) != 0)
DIE(
"cannot stat fd <%d>", fd);
335 if (st.st_size > INT_MAX &&
sizeof(
size_t) == 4)
336 DIE(
"file size %llu requires 64-bit version of bmzip",
339 sz = *len_p = *size_p = st.st_size;
341 if (!sz)
return data;
345 LOG(1,
"mmapping file (size: %lu)...", (
Lu)sz);
346 data = mmap(NULL, sz, PROT_READ, MAP_PRIVATE, fd, 0);
348 if (!data || (
void *)-1 == data) {
349 LOG(1,
"mmap failed on fd %d", fd);
351 LOG(1,
"%s",
"trying alternative");
357 LOG(1,
"reading file (size: %lu) into memory...", (
Lu)sz);
360 if (!data)
DIE(
"cannot allocate %lu bytes memory", (
Lu)sz);
362 if (read(fd, data, sz) != sz)
DIE(
"error reading %lu bytes", (
Lu)sz);
377 do_block(data, len, buf_len, offset, fp_len, action, options);
385 int fd = open(fname, O_RDONLY, 0);
387 if (fd == -1)
DIE(
"cannot open '%s'", fname);
394 do_block(data, len, buf_len, offset, fp_len, action, options);
408 DIE(
"unknown hash: %s", name);
414 fprintf(stderr,
"%s%s",
415 "usage: bmzip [options] [<file>]\n"
416 "-d, --decompress decompress to stdout\n"
417 "--verbose[=level] show some diagnostic messages\n"
418 "-l, --list list compressed file info\n"
419 "-h, --help show this message\n"
420 "--offset <number> expert: bm encoding start offset\n"
421 "--fp-len <number> expert: bm encoding fingerprint size\n"
422 "--bm-thresh <number> expert: bm hash collision threshold\n",
423 "--bm-hash <name> expert: use <name> as bm hash\n"
424 "--bm-only expert: skip lz compression\n"
425 "--bm-dump expert: dump human readable bm encoding\n"
426 "--no-mmap expert: do not use mmap\n");
432 char **ia = av + 1, **a_end = av + ac;
434 size_t fp_len = 64, offset = 0;
435 int bm_thresh = 0, action =
BMZ_A_PACK, options = 0;
437 for (; ia < a_end; ++ia) {
438 if (!strcmp(
"-d", *ia) ||
440 else if (!strcmp(
"--verbose", *ia))
s_verbosity = 1;
441 else if (!strcmp(
"--verbose=", *ia))
s_verbosity = atoi(*ia + 9);
442 else if (!strcmp(
"--offset", *ia)) offset = atoi(*++ia);
443 else if (!strcmp(
"--fp-len", *ia)) fp_len = atoi(*++ia);
444 else if (!strcmp(
"--bm-only", *ia)) options |=
BMZ_O_BM_ONLY;
445 else if (!strcmp(
"--bm-dump", *ia))
s_bm_dump = 1;
446 else if (!strcmp(
"--no-mmap", *ia))
s_no_mmap = 1;
447 else if (!strcmp(
"--bm-thresh", *ia)) bm_thresh = atoi(*++ia);
449 else if (!strcmp(
"-l", *ia) ||
451 else if (!strcmp(
"-h", *ia) ||
452 !strcmp(
"--help", *ia)) {
455 else if (!strcmp(
"--version", *ia)) {
460 else if (!strcmp(
"--", *ia)) {
464 else if (
'-' == **ia)
465 DIE(
"unknown option: %s\n", *ia);
int bmz_pack(const void *in, size_t in_len, void *out, size_t *out_len_p, size_t offset, size_t fp_len, unsigned flags, void *work_mem)
Perform bmz compression.
static void do_list(int fd)
#define BMZ_WRITE_INT48(_p_, _n_)
bmz_bm_unpack(const void *in, size_t in_len, void *out, size_t *out_len_p)
#define BMZ_WRITE_INT32(_p_, _n_)
int main(int ac, char *av[])
static void read_bmz_header(int fd, Byte *buf)
#define BMZ_ALIGN(_mem_, _n_)
static void input_from_file(const char *fname, size_t offset, size_t fp_len, int action, int options)
int bmz_set_verbosity(int verbosity)
Set the verbosity of library for testing and debugging.
#define BMZ_READ_INT16(_p_, _n_)
#define BMZ_READ_INT32(_p_, _n_)
static int bm_hash(const char *name)
#define BMZ_HASH_MASK16X2
static void do_unpack(const void *in, size_t in_len, size_t buf_len)
static void HT_NORETURN show_usage()
bmz_set_collision_thresh(int thresh)
int bmz_unpack(const void *in, size_t in_len, void *out, size_t *out_len_p, void *work_mem)
Perform bmz decompression.
static void do_pack(const void *in, size_t in_len, size_t buf_len, size_t offset, size_t fp_len, Byte options)
static char * read_from_fd(int fd, size_t *len_p, size_t *size_p)
size_t bmz_pack_buflen(size_t in_len)
Compute bmz compression output buffer length.
#define BMZ_WRITE_INT16(_p_, _n_)
size_t bmz_unpack_worklen(size_t out_len)
Return size of work memory for bmz decompression.
#define LOG(_lvl_, _fmt_,...)
static void do_block(const void *in, size_t len, size_t buf_len, size_t offset, size_t fp_len, int action, int options)
static int s_no_mmap
Copyright (C) 2007-2015 Hypertable, Inc.
bmz_bm_dump(const void *in, size_t in_len)
static void write_bmz_header(int fd, size_t in_len, uint32_t checksum, Byte options)
static void parse_bmz_header(const Byte *buf, uint16_t *version_p, uint64_t *orig_size_p, uint32_t *checksum_p, uint32_t *options)
#define BMZ_HASH_MASK32X2
static char * read_from_fp(FILE *fp, size_t *len_p, size_t *size_p)
unsigned bmz_checksum(const void *in, size_t in_len)
A fast checksum (adler32) function that might be useful.
bmz_bm_pack_mask(const void *in, size_t in_len, void *out, size_t *out_len_p, size_t offset, size_t fp_len, void *work_mem, size_t b)
Required portability definitions for all .cc files.
static void input_from_stdin(size_t offset, size_t fp_len, int action, int options)
#define BMZ_READ_INT48(_p_, _n_)
size_t bmz_pack_worklen(size_t in_len, size_t fp_len)
Return size of work memory for bmz compression.