tools: holemap: new tool

Tool to list how a file is 'sparse' and calculate how 'sparse' it
/could/ be.

Signed-off-by: Michel Pollet <buserror@gmail.com>
This commit is contained in:
Michel Pollet
2016-10-12 22:51:32 +01:00
parent 1a768d5e9c
commit c265d7fe3f
2 changed files with 286 additions and 0 deletions

15
tools/Makefile Normal file
View File

@@ -0,0 +1,15 @@
CPPFLAGS = -Wall -I../src/server -I../src/common -I../src/proxy \
-Doff64_t=__off64_t \
-Wunused-const-variable=0 -Wformat=0
CFLAGS = $(CPPFLAGS) -g -O3 -std=gnu99
all: holemap
# holemap requires libmhash-dev for md5 calculation
holemap: holemap.c ../src/server/*.h
$(CC) $(CFLAGS) -o $@ $^ ${shell pkg-config mhash --cflags --libs}
clean:
rm -f holemap

271
tools/holemap.c Normal file
View File

@@ -0,0 +1,271 @@
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#include "bitset.h"
#include <mhash.h>
static int verbose = 0;
static const char * fstr(const char *fname, const char *error) {
char * ret;
asprintf(&ret, "%s:%s", fname, error);
return strdup(ret); // leaks on purpose; it's before we exit() anyway
}
static void
fill_check_pages(
uint8_t *map,
bitfield_p zmap,
uint64_t current,
uint64_t count,
uint64_t blksize)
{
uint64_t * base = (uint64_t*)(map + (current * blksize));
const int wordcount = blksize / sizeof(uint64_t);
while (count--) {
int is_zero = 1;
if (is_zero) {
uint64_t *cur = base;
for (int z = 0; z < wordcount && is_zero; z++)
is_zero = 0 == *cur++;
}
if (!is_zero)
bit_set(zmap, current);
base += wordcount;
current++;
}
}
static int
read_file_data_ranges(
int fd,
bitfield_p bmap,
struct stat st)
{
off_t off = 0;
int whence = SEEK_DATA;
while (off < st.st_size) {
if (verbose > 1)
printf("offset %llu\n", off);
off_t res = lseek(fd, off, whence);
if (res == -1) {
// end of file is OK
if (errno == ENXIO)
return 0;
fprintf(stderr, "lseek fails at %llu/%llu\n", off, st.st_size);
return res;
}
if (res > off && verbose)
printf("%s from %llu to %llu\n",
whence == SEEK_DATA ? "hole" : "data", off, res);
if (whence != SEEK_DATA) {
if (res > off) {
uint64_t from = off / st.st_blksize,
cnt = ((res + st.st_blksize - 1) - off) / st.st_blksize;
bit_set_range(bmap, from, cnt);
if (verbose > 1)
printf("set bits from %u cnt %u\n", from, cnt);
}
}
whence = whence == SEEK_DATA ? SEEK_HOLE : SEEK_DATA;
off = res;
}
return 0;
}
int main(int argc, const char *argv[])
{
int dohash = 0;
for (int i = 1; i < argc; i++) {
if (argv[i][0] == '-') {
if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
verbose++;
continue;
} else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--hash")) {
dohash++;
continue;
}
fprintf(stderr, "%s: invalid argument: %s\n", argv[0], argv[i]);
exit(1);
}
const char *fname = argv[i];
struct stat st;
if (stat(fname, &st)) {
perror(fname);
exit(1);
}
int fd = open(fname, O_RDONLY);
if (fd == -1) {
perror(fname);
exit(1);
}
if (verbose)
printf("%s is %u (%u times %u)\n", fname,
st.st_size, st.st_blocks, st.st_blksize);
/* number of used blocks for this file size, also number of bits in bitmaps */
size_t maxblocks = (st.st_size + st.st_blksize -1) / st.st_blksize;
/* this is the original bitmap made using the operating system */
bitfield_p bmap = (bitfield_p)calloc(
1 + (maxblocks / BITS_PER_WORD), BITFIELD_WORD_SIZE);
if (read_file_data_ranges(fd, bmap, st)) {
perror(fstr(fname, "read_file_data_ranges"));
exit(1);
}
if (verbose > 3) {
printf("original map: ");
for (int z = 0; z <= (maxblocks / BITS_PER_WORD); z++)
printf("%016llx ", bmap[z]);
printf("\n");
}
uint8_t * map = mmap(NULL, maxblocks * st.st_blksize,
PROT_READ, MAP_PRIVATE,
fd, 0);
if (map == MAP_FAILED) {
perror(fname);
exit(1);
}
/* This one bitmap will have a subset, or be a copy of the bmap one when
* we're done
*/
bitfield_p zmap = (bitfield_p)calloc(
1 + (maxblocks / BITS_PER_WORD), BITFIELD_WORD_SIZE);
{
uint64_t current = 0, count;
int which;
while (current < maxblocks &&
(count = bit_run_count(bmap, current, maxblocks - current, &which)) > 0) {
if (verbose)
printf("map is %s from %u to %u\n",
which ? "set" : "zero",
current * st.st_blksize,
(current+count) * st.st_blksize);
/* if bits were sets in the extents, double check them */
if (which)
fill_check_pages(map, zmap, current, count, st.st_blksize);
current += count;
}
}
if (verbose > 2) {
printf("optimized map: ");
for (int z = 0; z <= (maxblocks / BITS_PER_WORD); z++)
printf("%016llx ", zmap[z]);
printf("\n");
}
{
bitfield_p b[2] = { bmap, zmap };
uint64_t set[2] = {0};
for (int bs = 0; bs < 2; bs++) {
for (int z = 0; z <= (maxblocks / BITS_PER_WORD); z++)
set[bs] += __builtin_popcountll(b[bs][z]);
}
printf("%s is %d%% sparse, saving %llu blocks (%lluMB)\n", fname,
100 - ((set[0] * 100) / maxblocks),
100 - (maxblocks - set[0]),
((maxblocks - set[0]) * st.st_blksize) / (1024*1024));
if (set[1] < set[0])
printf("%s could be %d%% sparse, saving %llu blocks (%lluMB)\n", fname,
100 - ((set[1] * 100) / maxblocks),
100 - (maxblocks - set[1]),
((maxblocks - set[1]) * st.st_blksize) / (1024*1024));
}
if (dohash) {
const int hs = mhash_get_block_size(MHASH_MD5);
struct {
char h[hs + 1];
} hash[3];
MHASH td;
if (verbose)
printf("%s: calculating hashes, be patient\n", argv[i]);
td = mhash_init(MHASH_MD5);
mhash(td, map, maxblocks * st.st_blksize);
mhash_deinit(td, hash[0].h);
if (verbose)
printf("%s: hash 1/3 done\n", argv[i]);
{
uint64_t current = 0, count;
int which;
td = mhash_init(MHASH_MD5);
while (current < maxblocks &&
(count = bit_run_count(bmap, current, maxblocks - current, &which)) > 0) {
/* if bits were sets in the extents, double check them */
if (which)
mhash(td, map + (current * st.st_blksize),
count * st.st_blksize);
current += count;
}
mhash_deinit(td, hash[1].h);
}
if (verbose)
printf("%s: hash 2/3 done\n", argv[i]);
{
uint64_t current = 0, count;
int which;
td = mhash_init(MHASH_MD5);
while (current < maxblocks &&
(count = bit_run_count(zmap, current, maxblocks - current, &which)) > 0) {
/* if bits were sets in the extents, double check them */
if (which)
mhash(td, map + (current * st.st_blksize),
count * st.st_blksize);
current += count;
}
mhash_deinit(td, hash[2].h);
}
if (memcmp(hash[0].h, hash[1].h, hs) || memcmp(hash[0].h, hash[2].h, hs)) {
fprintf(stderr, "%s: hash mismatch: ", argv[i]);
for (int h = 0; h < 3; h++) {
for (int hb = 0; hb < hs; hb++)
fprintf(stderr, "%.2x", (uint8_t)hash[h].h[hb]);
fprintf(stderr, " ");
}
fprintf(stderr, "\n");
} else if (verbose) {
fprintf(stdout, "%s: hash 3/3 match: ", argv[i]);
for (int hb = 0; hb < hs; hb++)
fprintf(stdout, "%.2x", (uint8_t)hash[0].h[hb]);
fprintf(stdout, "\n");
}
}
munmap(map, maxblocks * st.st_blksize);
close(fd);
free(bmap);
free(zmap);
}
}