(I asked multiple questions to the implementation of the huffman code here, if I ask too many questions please tell me and I'll stop. Any tips on how to learn to find those mistakes myself are appreciated, I spent hours and couldn't find the problem.)
I try to implement huffman code in C. Right now I can successfully read a file, generate the tree and write the compressed data to the output file.However if my input file is random generated (using something like dd if=/dev/random of=foo bs=100 count=1
) the program aborts with a segfault. The weird thing is, this only happens on some random files, some others work.I'm confident that the null character isn't the problem because I don't use any str*() functions (I know printTree()
does rely on \0
but I don't use the function), also one random file that I generated contained an empty byte and it worked. One more time I am profoundly confused as to why my code doesn't work, more specific why it doesn't treat all input equally (I would have expected that my code either fails all the time or never as long as the input has the same length).
huffman.c:
#include "huffman.h"#include <stdio.h>#include <stdlib.h>#include <string.h>struct node { node* left; node* right; double weight; char* bytes; size_t nmemb;};void printTree(struct node* root, int depth){ if (root == NULL) return; for (int i = 0; i < depth; i++) { printf(" │"); } printf("(%s: %.2f)\n", root->bytes, root->weight); printTree(root->left, depth + 1); printTree(root->right, depth + 1);}// Order nodes in array, smallest weight laststatic int compareFrequency(const void* a, const void* b){ if ((*(node**)a)->weight < (*(node**)b)->weight) return 1; if ((*(node**)a)->weight > (*(node**)b)->weight) return -1; if ((*(node**)a)->nmemb < (*(node**)b)->nmemb) return 1; if ((*(node**)a)->nmemb > (*(node**)b)->nmemb) return -1; return 1;}// Encode input to outputvoid encode(FILE* input, FILE* output){ unsigned int possibleBytes[256] = { 0 }, differentBytes = 0, i; unsigned long long totalBytes = 0; char currChar; // Count number of occurence of each individual byte while ((currChar = fgetc(input)) != EOF) { possibleBytes[(int)currChar]++; totalBytes++; } // Count number of unique bytes for (i = 0; i < 256; i++) { if (possibleBytes[i]) differentBytes++; } // Make array filled with nodes (at the moment only the used bytes) node* nodes = calloc(differentBytes * 2 - 1, sizeof(node)); node* nodesP = nodes; for (i = 0; i < 256; i++) { if (possibleBytes[i]) { nodesP->bytes = calloc(1, sizeof(char)); nodesP->bytes[0] = i; nodesP->weight = (double)possibleBytes[i] / totalBytes; nodesP->nmemb = 1; nodesP++; } } // Fill trees array with nodes node** trees = calloc(differentBytes, sizeof(node*)); node** treesP = trees; nodesP = nodes; unsigned int treesLeft = differentBytes; for (i = 0; i < treesLeft; i++) { *treesP = nodesP; treesP++; nodesP++; } // Build tree node* lastTree; node* secondLastTree; while (treesLeft > 1) { qsort(trees, treesLeft, sizeof(node*), &compareFrequency); lastTree = *(trees + treesLeft - 1); secondLastTree = *(trees + treesLeft - 2); nodesP->left = lastTree; nodesP->right = secondLastTree; nodesP->weight = lastTree->weight + secondLastTree->weight; nodesP->nmemb = lastTree->nmemb + secondLastTree->nmemb; nodesP->bytes = calloc(lastTree->nmemb + secondLastTree->nmemb, sizeof(char)); // str* functions use \0 terminated string -> Might not work if array contains \0 (not at // the end) // strcat(nodesP->bytes, lastTree->bytes); // strcat(nodesP->bytes, secondLastTree->bytes); memcpy(nodesP->bytes, lastTree->bytes, lastTree->nmemb); memcpy(nodesP->bytes + lastTree->nmemb, secondLastTree->bytes, secondLastTree->nmemb); trees[treesLeft - 2] = nodesP; treesLeft--; nodesP++; } // write tree to file // https://stackoverflow.com/a/759766/15833045 //... // write data to file // left is 0, right is 1 rewind(input); unsigned char bitPosition = 1; unsigned char buffer[BUFSIZ]; node* currNode = *trees; currChar = fgetc(input); for (i = 0; currChar != EOF; i++) { // Fill byte of buffer while (bitPosition <= 8) { // Set 0 if going left, set 1 if going right if (memchr(currNode->left->bytes, currChar, currNode->nmemb)) { buffer[i] &= ~(1 << (8 - bitPosition)); currNode = currNode->left; } else { buffer[i] |= (1 << (8 - bitPosition)); currNode = currNode->right; } // If leaf, reset currNode and get next byte if (!currNode->left) { currNode = *trees; currChar = fgetc(input); if (currChar == EOF) { break; } } bitPosition++; } bitPosition = 1; if (i == BUFSIZ - 1) { i = 0; fwrite(buffer, sizeof(char), BUFSIZ, output); } } fwrite(buffer, sizeof(char), i, output);}
main.c:
#include "huffman.h"#include <stdio.h>int main(int argc, char* argv[]){ FILE* inputP = fopen(argv[1], "rb"); FILE* outputP = fopen(argv[2], "w"); if (!(inputP && outputP)) return 1; encode(inputP, outputP); fclose(inputP); fclose(outputP); return 0;}
Makefile:
CC = gccCFLAGS = -gOBJ = main.o huffman.ohuffman: $(OBJ) $(CC) $(OBJ) -o $@%.o: %.c $(CC) $(CFLAGS) -c $<clean: rm *.o
gdb output (is slightly different when using different random files and hence a different core dumps):
Reading symbols from huffman...[New LWP 9843][Thread debugging using libthread_db enabled]Using host libthread_db library "/usr/lib/libthread_db.so.1".Core was generated by `huffman a b'.Program terminated with signal SIGSEGV, Segmentation fault.#0 __memchr_avx2 () at ../sysdeps/x86_64/multiarch/memchr-avx2.S:8282 VPCMPEQ (%rdi), %ymm0, %ymm1
huffman.h
#ifndef HUFFMAN_H#define HUFFMAN_H#include <stdio.h>typedef struct node node;// static int compareFrequency(const void* a, const void* b);void encode(FILE* input, FILE* output);#endif