huffman编码解码 huffman解码算法
* huffman - Encode/Decode files using Huffman encoding.
* Copyright (C) 2003 Douglas Ryan Richardson; Gauss Interprise, Inc
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* 针对可打印字符的huffman编解码.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "huffman.h"
#ifdef WIN32
#include <winsock2.h>
#include <winsock.h>
#include <malloc.h>
#define alloca _alloca
#else
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/types.h>
#endif
// ---------------------------------------------------------------------------------------
/* 描述huffman树的节点 */
typedef struct huffman_node_tag
{
unsigned char isLeaf; // 当前节点是否为叶子节点
unsigned long count; // 权(字符出现的次数)
struct huffman_node_tag *parent; // 当前节点的父亲
union {
/*
* 如果改节点为叶子节点, 则有unsigned char symbol字段,
* 如果该节点不是叶子节点, 则具有左孩子右孩子字段.
*/
struct {
struct huffman_node_tag *zero; // 左孩子
struct huffman_node_tag *one; // 右孩子
};
unsigned char symbol; // 存储的字符.
};
} huffman_node;
/* 编码 */
typedef struct huffman_code_tag
{
/*
* numbits 这个字段是用来记录从leaf--->root一共走了多少步.
* 也就是叶子节点所含字符的编码长度
* bits 是用来存储编码的, 但是它是以Byte为单位的, 而编码是
* 以bit为单位的, 所以需要根据 numbits 去从 bits 里提取出前
* numbits 个bit。 这才是真正的编码.
*
* 比如
* numbits=4
* bits = 0110 1011
* 那么我们应该提取bits的低4位 1011
*
* 注意 numbits 和 bits是有关系的, bits一定不可能超过 numbits/8
*/
/* The length of this code in bits. */
unsigned long numbits;/* 记录从叶子走到root需要多少步, 也就是说需要多少位来对指定的字符进行编码 */
/* The bits that make up this code. The first
bit is at position 0 in bits[0]. The second
bit is at position 1 in bits[0]. The eighth
bit is at position 7 in bits[0]. The ninth
bit is at position 0 in bits[1]. */
unsigned char *bits; /* 用来存储编码String */
} huffman_code;
int
main(int argc, char** argv)
{
/*
char memory = 0;
char compress = 1;
int opt;
const char *file_in = NULL;
const char *file_out = NULL;
*/
//FILE *in = stdin;
//FILE *out = stdout;
FILE *in = NULL;
FILE *out = NULL;
FILE *out2 = NULL;
in = fopen("test.txt", "r+");
out = fopen("test1.txt", "w+");
//out2 = fopen("test2.txt", "w+");
int result = 0;
result = huffman_encode_file(in, out);
//result = huffman_decode_file(out, out2);
printf("result : %d/n", result);
system("pause");
return 1;
}
// ---------------------------------------------------------------------------------------
/**
* bit-->byte转换, 不足8位则补齐8位.
*/
static unsigned long
numbytes_from_numbits(unsigned long numbits)
{
return numbits / 8 + (numbits % 8 ? 1 : 0);
}
/*
* get_bit returns the ith bit in the bits array
* in the 0th position of the return value.
* 取出第i位(从低位往高位排)
*/
static unsigned char
get_bit(unsigned char *bits, unsigned long i)
{
return ( bits[i/8] >> (i%8) ) & 1;
}
/**
* 反转数组的前((numbits/8)+1)个字符
*
*/
static void
reverse_bits(unsigned char* bits, unsigned long numbits)
{
unsigned long numbytes = numbytes_from_numbits(numbits); // 所占字节数.
unsigned char *tmp = (unsigned char*)calloc(numbytes, sizeof(unsigned char)); // 分配内存
unsigned long curbit; // index -- 当前位
long curbyte = 0; // 当前是byte的index
memset(tmp, 0, numbytes); // 将tmp指向的buffer清零
/*
*
*/
for(curbit=0; curbit<numbits; ++curbit) {
unsigned int bitpos = curbit % 8; // 当前byte里的index
if(curbit>0 && curbit%8==0) { //
++curbyte;
}
/*
* 按位 OR
*
*/
tmp[curbyte] |= ( get_bit(bits, numbits-curbit-1) << bitpos );
}
memcpy(bits, tmp, numbytes);
}
/*
* new_code builds a huffman_code from a leaf in
* a Huffman tree.
* 对指定的叶子进行编码.
*/
static huffman_code*
new_code(const huffman_node* leaf)
{
/* Build the huffman code by walking up to
* the root node and then reversing the bits,
* since the Huffman code is calculated by
* walking down the tree. */
unsigned long numbits = 0;
unsigned char *bits = NULL;
huffman_code *p;
while(leaf!=NULL && leaf->parent!=NULL) {
huffman_node *parent = leaf->parent; // 当前活动节点的双亲
unsigned long cur_byte = numbits / 8; // 当前byte
unsigned char cur_bit = (unsigned char)(numbits % 8); // 当前byte里的当前bit
/* If we need another byte to hold the code,
then allocate it. */
if(cur_bit == 0) { // 满了一个byte就需要重新分配内存
size_t newSize = cur_byte + 1;
bits = (char*)realloc(bits, newSize);
bits[newSize - 1] = 0; /* Initialize the new byte. */
}
/*
* ************************************************************************
* 需要注意的是右孩子的处理, 因为左孩子已经是0, 初始分配bits的时候就已经把每位都设置为0了.
* ************************************************************************
*/
/* If a one must be added then or it in. If a zero
* must be added then do nothing, since the byte
* was initialized to zero. */
if(leaf == parent->one) {
/*
* 右孩子的话就需要把当前位置为1
*/
bits[cur_byte] |= (1<<cur_bit);
}
++numbits;
leaf = parent;
}
if( bits != 0) {
/*
* 如果编码里头含有1, 则需要进行反转, 如果全为0, 反转后和反转前是一样的, 就没必要反转了
*/
reverse_bits(bits, numbits);
}
p = (huffman_code*)malloc(sizeof(huffman_code));
p->numbits = numbits; /* 记录从叶子走到root需要多少步, 也就是说需要多少位来对指定的字符进行编码 */
p->bits = bits; /* 用来存储编码的区间 */
return p;
}
/**
* 创建一个孤立的"叶子"节点, 该节点为一个单独的树
* symbol : 该叶子节点的权,即字符
*/
static huffman_node*
new_leaf_node(unsigned char symbol)
{
huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) );
p->isLeaf = 1;
p->symbol = symbol;
p->count = 0;
p->parent = 0;
return p;
}
/**
* 创建节点(该节点不是叶子节点)
*
* count : 字符出现的次数
* zero : 左孩子
* one : 右兄弟
*
* return : 返回一个节点对象
*/
static huffman_node*
new_nonleaf_node(unsigned long count, huffman_node *zero, huffman_node *one)
{
huffman_node *p = (huffman_node*)malloc( sizeof(huffman_node) );
p->isLeaf = 0;
p->count = count;
p->zero = zero;
p->one = one;
p->parent = 0;
return p;
}
/**
* 释放树占用的内存空间
*/
static void
free_huffman_tree(huffman_node *subtree)
{
if(subtree == NULL)
return;
if( !(subtree->isLeaf) ) {
/*
* 先序遍历进行递归调用
*/
free_huffman_tree( subtree->zero );
free_huffman_tree( subtree->one );
}
free( subtree );
}
/**
* free code
*/
static void
free_code(huffman_code* p)
{
free(p->bits);
free(p);
}
// ----------------------------------------------------------------------------------------
#define MAX_SYMBOLS 256
typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS]; /* */
typedef huffman_code* SymbolEncoder[MAX_SYMBOLS]; /* */
/* */
static void
free_encoder(SymbolEncoder *pSE)
{
unsigned long i;
for(i = 0; i < MAX_SYMBOLS; ++i) {
huffman_code *p = (*pSE)[i];
if( p ) free_code(p);
}
}
/* */
static void
init_frequencies(SymbolFrequencies *pSF)
{
memset(*pSF, 0, sizeof(SymbolFrequencies) ); /* 清零 */
}
// ----------------------------------------------------------------------------------------
typedef struct buf_cache_tag
{
/*
* 该结构主要描述了两个部分, 一个是cache, 一个是bufout.
* cache是一个临时存储数据的buffer, cache会将数据写往bufout区间,
* bufout类似一个仓库, 会一直存储cache写入的数据.
* cache可以多次网bufout内写数据, bufout会一直保存这些数据.
* bufout是一个动态的buffer, cache每一次往bufout内写数据的时候bufout都需要realloc一次.
*/
unsigned char *cache; // 指向真正存储数据的buffer
unsigned int cache_len; // buffer的长度, 初始的时候就可以设置 cache的大小的
unsigned int cache_cur; // 数据结尾处(或者说是光标位置)
unsigned char **pbufout; /*
* cache要写数据就往这个空间内写(类似一个动态仓库, 一定是动态的)
* (*pbufout)就是真实的存储区
*/
unsigned int *pbufoutlen; // 仓库的大小
} buf_cache;
/* 初始化一个buf_cache */
static int init_cache(buf_cache *pc,
unsigned int cache_size,
unsigned char **pbufout,
unsigned int *pbufoutlen)
{
assert(pc && pbufout && pbufoutlen);
if(!pbufout || !pbufoutlen) return 1;
pc->cache = (unsigned char*)malloc(cache_size); // 分配存储空间
pc->cache_len = cache_size; //
pc->cache_cur = 0; // 光标从0开始
pc->pbufout = pbufout; //
*pbufout = NULL; //
pc->pbufoutlen = pbufoutlen;
*pbufoutlen = 0; //
return (pc->cache==NULL ? 0 : 1);
}
/* 释放buf_cache */
static void free_cache(buf_cache* pc)
{
assert( pc );
if( pc->cache != NULL)
{
/*
* 我觉得这里没有必要free( pc->cache );, 直接执行pc->cache = NULL;就可以保证
* 逻辑上清cache了, 至于真实的存储区内是否仍然有数据并没有意义.
*/
free( pc->cache );
pc->cache = NULL;
}
}
/*
* 将cache内的数据写到pbufout中去, 并清洗cache
* 成功则返回0, 失败返回1.
*/
static int flush_cache(buf_cache* pc)
{
assert( pc );
if(pc->cache_cur > 0) // 确定cache_cur有平移, 这样才能确定cache内有数据. 才可以flush
{
/* 当前要写的数据长度为pc->cache_cur, 原来的bufout里头本身还有*(pc->pbufoutlen)长度的数据 */
unsigned int newlen = pc->cache_cur + *(pc->pbufoutlen);
/* 需要重新为*(pc->pbufout)分配空间, tmp为这个新buffer的首地址 */
unsigned char* tmp = realloc(*(pc->pbufout), newlen);
if( !tmp ) return 1;
/* 追加到pbufout结尾处, 而不是覆盖 */
memcpy(tmp + *(pc->pbufoutlen), pc->cache, pc->cache_cur);
*pc->pbufout = tmp; // pbufout指针重定位到新的扩大了的内存区.
*pc->pbufoutlen = newlen; // 重新计算pbufoutlen
pc->cache_cur = 0; // cache逻辑上清零
}
return 0;
}
/* 写cache */
static int write_cache(buf_cache* pc,
const void *to_write,
unsigned int to_write_len)
{
unsigned char* tmp;
assert(pc && to_write);
assert(pc->cache_len >= pc->cache_cur);
/* If trying to write more than the cache will hold
* flush the cache and allocate enough space immediately,
* that is, don't use the cache. */
if(to_write_len > pc->cache_len - pc->cache_cur)
{
/*
* to_write_len : 需要往cache内写的数据长度
* pc->cache_len-pc->cache_cur : cache内剩余空间
* 如果cache存储能力不够则先清洗cache, 再将数据直接写到
* pbufout中去, 不使用cache.
*/
unsigned int newlen;
flush_cache( pc );
newlen = *pc->pbufoutlen + to_write_len;
tmp = realloc(*pc->pbufout, newlen);
if( !tmp ) return 1;
memcpy(tmp + *pc->pbufoutlen, to_write, to_write_len);
*pc->pbufout = tmp;
*pc->pbufoutlen = newlen;
}
else
{
/*
* Write the data to the cache
* 如果cache存储能力足够,则往cache内追加数据且将当前光标移动到新的数据尾部.
*/
memcpy(pc->cache+pc->cache_cur, to_write, to_write_len);
pc->cache_cur += to_write_len;
}
return 0;
}
// -------------------------------------------------------------------------------------
/*
* 计算FILE对象内的各个字符出现的频率
* 扫描FILE对象,
*/
static unsigned int
get_symbol_frequencies(SymbolFrequencies *pSF, FILE *in)
{
int c;
unsigned int total_count = 0; // FILE对象内的字符总数
init_frequencies( pSF ); /* Set all frequencies to 0. */
/* Count the frequency of each symbol in the input file. */
while( (c=fgetc(in)) != EOF )
{
unsigned char uc = c;
if( !(*pSF)[uc] ) // 如果这个字符没有出现过则为这个字符建立一个叶子
{
/*
* 这里设计的相当有意思,
* fgetc会获得当前光标所在的字符, 这个字符必然是一个char, 也就是一个
* unsinged int型数.
* 我们前面有定义
* #define MAX_SYMBOLS 256
* typedef huffman_node* SymbolFrequencies[MAX_SYMBOLS];
* typedef huffman_code* SymbolEncoder[MAX_SYMBOLS];
* 这里我们采用SymbolFrequencies[uc]就存储c这个字符.
* 也就是说
* SymbolFrequencies[0]==0x00
* SymbolFrequencies[1]==0x01
* ......
* SymbolFrequencies[65]==A ( A==65 :) )
* SymbolFrequencies[66]==B ( B==66 :) )
* ......
* 起初在typedf SymbolFrequencies的时候, 我着实没看懂, 实在太巧妙了.
*/
(*pSF)[uc] = new_leaf_node( uc );
}
/*
* 这个自加也非常巧妙, 遇到uc字符则将第uc个huffman_node的count自加
* 实际就是一个字符一个字符读下去,,
* 读到了A就 ++(pSF['A'].count) == ++(pSF[65].count)
* 读到了B就 ++(pSF['B'].count) == ++(pSF[66].count)
* 真TNND的牛X!
*/
++( (*pSF)[uc]->count );
++total_count;
}
return total_count;
}
/* 计算buffer内各个字符的频率,和get_symbol_frequencies函数同理 */
static unsigned int
get_symbol_frequencies_from_memory(SymbolFrequencies *pSF,
const unsigned char *bufin,
unsigned int bufinlen)
{
unsigned int i;
unsigned int total_count = 0;
/* Set all frequencies to 0. */
init_frequencies(pSF);
/* Count the frequency of each symbol in the input file. */
for(i = 0; i < bufinlen; ++i)
{
unsigned char uc = bufin[i];
if( !(*pSF)[uc] )
{
(*pSF)[uc] = new_leaf_node(uc);
}
++(*pSF)[uc]->count;
++total_count;
}
return total_count;
}
/*
* When used by qsort, SFComp sorts the array so that
* the symbol with the lowest frequency is first. Any
* NULL entries will be sorted to the end of the list.
*
* 两个huffman_node进行对比, 以count作为比较依据.
* 即对比两个不同 symbol 出现的频率
* 非叶子节点通通排在后面
*/
static int
SFComp(const void *p1, const void *p2)
{
const huffman_node *hn1 = *(const huffman_node**)p1;
const huffman_node *hn2 = *(const huffman_node**)p2;
/* Sort all NULLs to the end. */
if(hn1 == NULL && hn2 == NULL) return 0;
if(hn1 == NULL) return 1;
if(hn2 == NULL) return -1;
if(hn1->count > hn2->count) return 1;
else if(hn1->count < hn2->count) return -1;
return 0;
}
/*
* build_symbol_encoder builds a SymbolEncoder by walking
* down to the leaves of the Huffman tree and then,
* for each leaf, determines its code.
*
* 递归方式为每个叶子节点编码
*/
static void
build_symbol_encoder(huffman_node *subtree, SymbolEncoder *pSE)
{
if(subtree == NULL) return;
if( subtree->isLeaf )
{
(*pSE)[subtree->symbol] = new_code( subtree );
}
else
{
build_symbol_encoder(subtree->zero, pSE);
build_symbol_encoder(subtree->one, pSE);
}
}
/*
* calculate_huffman_codes turns pSF into an array
* with a single entry that is the root of the
* huffman tree. The return value is a SymbolEncoder,
* which is an array of huffman codes index by symbol value.
*
* 为每个node编码. 这个函数比较重要, 精华就是在这个函数里头的for循环. 哈哈
* 整个tree的建立全都依赖这个函数
*/
static SymbolEncoder*
calculate_huffman_codes(SymbolFrequencies * pSF)
{
unsigned int i = 0;
unsigned int n = 0;
huffman_node *m1 = NULL, *m2 = NULL;
SymbolEncoder *pSE = NULL;
/*
* Sort the symbol frequency array by ascending frequency.
* 快速排序例程进行排序
* 以symbol频率为关键字做升序排列
* 有symbol的节点都会按升序排列, 没有symbol的节点会统一排在后面,
* 通过一个for就能计算出symbol的个数了.
*/
qsort((*pSF), MAX_SYMBOLS, sizeof((*pSF)[0]), SFComp);
/*
* Get the number of symbols.
* 计算huffman树中的字符数, 这个实现可读性不够好
*/
for(n = 0; (n<MAX_SYMBOLS) && (*pSF)[n]; ++n)
;
/*
* Construct a Huffman tree. This code is based
* on the algorithm given in Managing Gigabytes
* by Ian Witten et al, 2nd edition, page 34.
* Note that this implementation uses a simple
* count instead of probability.
*/
for(i = 0; i < (n-1); ++i)
{
/* Set m1 and m2 to the two subsets of least probability. */
m1 = (*pSF)[0];
m2 = (*pSF)[1];
/* Replace m1 and m2 with a set {m1, m2} whose probability
* is the sum of that of m1 and m2.
* 这个算法有优化的余地的, 因为n在一直减小.
* 将最小的两个元素合并后得到一个一个节点为m12, 此时m1,m2已经建立起来了关系.
* 这个m12的地址又被pSF[0]存储, 循环直至整个Tree建立成功.
* 指针在这里运用的实在太巧妙了.
* 这一行代码就是建树, 靠,NBA!
*/
(*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count+m2->count, m1, m2);
(*pSF)[1] = NULL;
/*
* Put newSet into the correct count position in pSF.
* 这里应该可以再进行优化, 是否有必要再进行排序, 或者被排序的数组过长了.
* 实际上每循环一次n都减少了一次
*/
qsort((*pSF), n, sizeof((*pSF)[0]), SFComp);
}/* for完毕的时候就求出了root, pSF[0]就是root, 后面的元素都是NULL
* 而树通过for循环里头的
* (*pSF)[0] = m1->parent = m2->parent = new_nonleaf_node(m1->count+m2->count, m1, m2);
* 已经建立完成了*/
/* Build the SymbolEncoder array from the tree. */
pSE = (SymbolEncoder*)malloc(sizeof(SymbolEncoder));
memset(pSE, 0, sizeof(SymbolEncoder));
build_symbol_encoder((*pSF)[0], pSE);
return pSE;
}
/*
* Write the huffman code table. The format is:
* 4 byte code count in network byte order.
* 4 byte number of bytes encoded
* (if you decode the data, you should get this number of bytes)
* code1
* ...
* codeN, where N is the count read at the begginning of the file.
* Each codeI has the following format:
* 1 byte symbol, 1 byte code bit length, code bytes.
* Each entry has numbytes_from_numbits code bytes.
* The last byte of each code may have extra bits, if the number of
* bits in the code is not a multiple of 8.
*
* 编码后的格式 :
* 0-3个byte是FILE内出现的不同字符个数(几不同的字符个数)
* 4-7个byte是FILE内出现的全部字符个数(所有的字符)
* 8-X是真正的编码后值
*
*/
static int
write_code_table(FILE* out, SymbolEncoder *se, unsigned int symbol_count)
{
unsigned long i, count = 0;
/*
* Determine the number of entries in se
* 计算 SymbolEncoder 内具有编码值的元素个数.
* 即有几种字符
*/
for(i = 0; i < MAX_SYMBOLS; ++i)
if( (*se)[i] )
++count;
/*
* Write the number of entries in network byte order.
* 将字符种数写入到文件头部, 即[0, 3]一共4个字节
*/
//i = htonl( count );
i = count;
if(fwrite(&i, sizeof(i), 1, out) != 1) return 1;
/*
* Write the number of bytes that will be encoded.
* 将字符个数追加到[4,7]一共4个字节
*/
//symbol_count = htonl(symbol_count);
symbol_count = symbol_count;
if(fwrite(&symbol_count, sizeof(symbol_count), 1, out) != 1) return 1;
/*
* Write the entries.
*/
for(i = 0; i < MAX_SYMBOLS; ++i)
{
huffman_code *p = (*se)[i];
if( p != NULL )
{ /*
* 每个单元分为三个部分 :
* symbol -- 字符
* numbits -- 叶子走到root需要的步数
* bits -- 叶子走到root的方式(即最终的编码, 比如说0101)
*/
unsigned int numbytes;
/* Write the 1 byte symbol. */
fputc((unsigned char)i, out);
/* Write the 1 byte code bit length. */
fputc(p->numbits, out);
/* Write the code bytes. 她这个注释就没有说是几byte, 值得思考一下 */
numbytes = numbytes_from_numbits( p->numbits );
/* 将叶子走到root的方式写进去, 这个方式会被整理为byte格式, 不够就补0 */
if(fwrite(p->bits, 1, numbytes, out) != numbytes) return 1;
}
}
return 0;
}
/*
* Allocates memory and sets *pbufout to point to it. The memory
* contains the code table.
*
* 以指定的格式将编码后的数据写入到cache中去, 实际是写到pbufout中去了.
*
*/
static int
write_code_table_to_memory(buf_cache *pc,
SymbolEncoder *se,
unsigned int symbol_count)
{
unsigned long i, count = 0;
/* Determine the number of entries in se. */
for(i = 0; i < MAX_SYMBOLS; ++i)
{
if((*se)[i])
{
++count; // 计算不同字符的个数
}
}
/* Write the number of entries in network byte order. */
//i = htonl(count);
i = count;
if( write_cache(pc, &i, sizeof(i)) ) // 前四个字节是memory内所有字符数
return 1;
/* Write the number of bytes that will be encoded. */
//symbol_count = htonl(symbol_count);
symbol_count = symbol_count;
if( write_cache(pc, &symbol_count, sizeof(symbol_count)) ) // 4-8字节是不同字符个数
return 1;
/* Write the entries. */
for(i = 0; i < MAX_SYMBOLS; ++i)
{
huffman_code *p = (*se)[i];
if( p )
{
/*
* 对于每次循环来说, 如果p不为NULL, 则将该字符对应的编码写入到cache内.
* 存储格式为三个字节作为一个单位.
* byte0 --- 字符本身
* byte1 --- 该字符编码后的码值长度(即2进制的位数)
* byte2 --- 该字符对应的码值
*/
unsigned int numbytes;
/*
* The value of i is < MAX_SYMBOLS (256), so it can
* be stored in an unsigned char.
* 将i转换为char型, 可以对应到字符集
*/
unsigned char uc = (unsigned char)i;
/*
* Write the 1 byte symbol.
* 将字符写到cache内
*/
if(write_cache(pc, &uc, sizeof(uc))) return 1;
/*
* Write the 1 byte code bit length.
* 将叶子节点到root所需要经过的步数写到cache内, 也就是编码的长度
* 这个数据是为了解码使用的.
*/
uc = (unsigned char)p->numbits;
if(write_cache(pc, &uc, sizeof(uc))) return 1;
/*
* Write the code bytes.
* 将编码值对齐并写如到cache内
* 事先必须知道编码由几位组成, 如果编码为9位, 那么就需要2个byte来存储这个码值
* 如果编码为4位, 那么就需要1个byte来存储了,
*/
numbytes = numbytes_from_numbits(p->numbits);
if(write_cache(pc, p->bits, numbytes)) return 1;
}
}
return 0;
}
/*
* read_code_table builds a Huffman tree from the code
* in the in file. This function returns NULL on error.
* The returned value should be freed with free_huffman_tree.
*
*
*/
static huffman_node*
read_code_table(FILE* in, unsigned int *pDataBytes)
{
huffman_node *root = new_nonleaf_node(0, NULL, NULL);
unsigned int count;
/*
* Read the number of entries.
* (it is stored in network byte order).
* 获得字符种数, 前2个byte就是出现的字符种数
*/
if( fread(&count, sizeof(count), 1, in) != 1 )
{
free_huffman_tree( root );
return NULL;
}
//count = ntohl(count);
count = count;
/*
* Read the number of data bytes this encoding represents.
* 一个有多少个字符
*/
if( fread(pDataBytes, sizeof(*pDataBytes), 1, in) != 1 )
{
free_huffman_tree(root);
return NULL;
}
//*pDataBytes = ntohl(*pDataBytes);
*pDataBytes = *pDataBytes;
/* Read the entries. */
while(count-- > 0)
{
int c;
unsigned int curbit;
unsigned char symbol;
unsigned char numbits;
unsigned char numbytes;
unsigned char *bytes;
huffman_node *p = root;
if( (c=fgetc(in)) == EOF )
{
free_huffman_tree( root );
return NULL;
}
symbol = (unsigned char)c;
if( (c=fgetc(in)) == EOF )
{
free_huffman_tree( root );
return NULL;
}
numbits = (unsigned char)c;
numbytes = (unsigned char)numbytes_from_numbits( numbits );
bytes = (unsigned char*)malloc( numbytes );
if( fread(bytes, 1, numbytes, in) != numbytes )
{
free(bytes);
free_huffman_tree(root);
return NULL;
}
/*
* Add the entry to the Huffman tree. The value
* of the current bit is used switch between
* zero and one child nodes in the tree. New nodes
* are added as needed in the tree.
*/
for(curbit = 0; curbit < numbits; ++curbit)
{
if(get_bit(bytes, curbit))
{
if(p->one == NULL)
{
p->one =
curbit == (unsigned char)(numbits-1) ?
new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);
p->one->parent = p;
}
p = p->one;
}
else
{
if(p->zero == NULL)
{
p->zero =
curbit == (unsigned char)(numbits - 1) ?
new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);
p->zero->parent = p;
}
p = p->zero;
}
}
free(bytes);
}
return root;
}
/*
* 将数据从buf读到bufout中, 成功则返回0, 其他则返回1.
* pindex -- 拷贝的起点
*/
static int
memread(const unsigned char* buf,
unsigned int buflen,
unsigned int *pindex,
void* bufout,
unsigned int readlen)
{
assert(buf && pindex && bufout);
assert(buflen >= *pindex);
// 错误
if(buflen < *pindex) return 1;
if(readlen + *pindex >= buflen) return 1;
memcpy(bufout, buf + *pindex, readlen);
*pindex += readlen;
return 0;
}
/*
* 从编码后的buf内读数据.
*/
static huffman_node*
read_code_table_from_memory(const unsigned char* bufin,
unsigned int bufinlen,
unsigned int *pindex,
unsigned int *pDataBytes)
{
huffman_node *root = new_nonleaf_node(0, NULL, NULL);
unsigned int count;
/*
* Read the number of entries.
* (it is stored in network byte order).
* 读取
*/
if( memread(bufin, bufinlen, pindex, &count, sizeof(count)) )
{
free_huffman_tree(root);
return NULL;
}
//count = ntohl(count);
count = count;
/* Read the number of data bytes this encoding represents. */
if(memread(bufin, bufinlen, pindex, pDataBytes, sizeof(*pDataBytes)))
{
free_huffman_tree(root);
return NULL;
}
//*pDataBytes = ntohl(*pDataBytes);
*pDataBytes = *pDataBytes;
/* Read the entries. */
while( (count--) > 0 )
{
unsigned int curbit;
unsigned char symbol;
unsigned char numbits;
unsigned char numbytes;
unsigned char *bytes;
huffman_node *p = root;
if(memread(bufin, bufinlen, pindex, &symbol, sizeof(symbol)))
{
free_huffman_tree(root);
return NULL;
}
if(memread(bufin, bufinlen, pindex, &numbits, sizeof(numbits)))
{
free_huffman_tree(root);
return NULL;
}
numbytes = (unsigned char)numbytes_from_numbits(numbits);
bytes = (unsigned char*)malloc(numbytes);
if(memread(bufin, bufinlen, pindex, bytes, numbytes))
{
free(bytes);
free_huffman_tree(root);
return NULL;
}
/*
* Add the entry to the Huffman tree. The value
* of the current bit is used switch between
* zero and one child nodes in the tree. New nodes
* are added as needed in the tree.
*/
for(curbit = 0; curbit < numbits; ++curbit)
{
if(get_bit(bytes, curbit))
{
if(p->one == NULL)
{
p->one = ( curbit==(unsigned char)(numbits - 1) ) ?
new_leaf_node(symbol) : new_nonleaf_node(0, NULL, NULL);
p->one->parent = p;
}
p = p->one;
}
else
{
if(p->zero == NULL)
{
p->zero = curbit == (unsigned char)(numbits - 1)
? new_leaf_node(symbol)
: new_nonleaf_node(0, NULL, NULL);
p->zero->parent = p;
}
p = p->zero;
}
}
free(bytes);
}
return root;
}
/*
* 依次将各个字符的编码写入到out中, 这次是直接写, 不对编码进行整齐工作
* 也就是不将编码强制为byte类型了, 而是直接写入到out中.
*/
static int
do_file_encode(FILE* in, FILE* out, SymbolEncoder *se)
{
unsigned char curbyte = 0;
unsigned char curbit = 0;
int c;
while( (c = fgetc(in)) != EOF)
{
//printf("c=%c /n", c);
unsigned char uc = (unsigned char)c;
huffman_code *code = (*se)[uc];
unsigned long i;
for(i = 0; i < code->numbits; ++i)
{
//printf("i=%d /n", i);
/* Add the current bit to curbyte. */
curbyte |= get_bit(code->bits, i) << curbit;
/* If this byte is filled up then write it
* out and reset the curbit and curbyte. */
if(++curbit == 8)
{
/*
* 依次将各个字符的编码写入到out中, 这次是直接写, 不对编码进行整齐工作
* 也就是不将编码强制为byte类型了, 而是直接写入到out中.
*/
fputc(curbyte, out);
printf("curbyte=%d/n", curbyte);
// printf("curbyte=%d /n", curbyte);
curbyte = 0;
curbit = 0;
}
}
printf("/n");
}
/*
* If there is data in curbyte that has not been
* output yet, which means that the last encoded
* character did not fall on a byte boundary,
* then output it.
*/
if(curbit > 0)
fputc(curbyte, out);
return 0;
}
/*
* 对memory进行编码.
*
* pc -- cache
* bufin -- 待编码的buffer
* bufinlen -- buffer长, 即字符个数
* se --
*/
static int
do_memory_encode(buf_cache *pc,
const unsigned char *bufin,
unsigned int bufinlen,
SymbolEncoder *se)
{
unsigned char curbyte = 0; //
unsigned char curbit = 0;
unsigned int i;
/* 对 bufin 内的字符依次循环 */
for(i = 0; i < bufinlen; ++i)
{
unsigned char uc = bufin[i];
huffman_code *code = (*se)[uc]; // 取出第i个字符的编码
unsigned long i;
/* 对第i个字符编码长度进行循环 */
for(i = 0; i < code->numbits; ++i)
{
/*
* Add the current bit to curbyte.
* 依次取出
*/
curbyte |= ( get_bit(code->bits, i) << curbit );
/*
* If this byte is filled up then write it
* out and reset the curbit and curbyte
*/
if(++curbit == 8)
{
/*
* 满了一个字节则写cache
*
*/
if(write_cache(pc, &curbyte, sizeof(curbyte))) return 1;
curbyte = 0;
curbit = 0;
}
}
}
/*
* If there is data in curbyte that has not been
* output yet, which means that the last encoded
* character did not fall on a byte boundary,
* then output it.
*/
return curbit > 0 ? write_cache(pc, &curbyte, sizeof(curbyte)) : 0;
}
/*
* huffman_encode_file huffman encodes in to out.
* 对FILE对象进行编码, 将*in编码后写入*out.
*/
int
huffman_encode_file(FILE *in, FILE *out)
{
SymbolFrequencies sf;
SymbolEncoder *se;
huffman_node *root = NULL;
int rc;
unsigned int symbol_count;
/*
* Get the frequency of each symbol in the input file.
* 将FILE中的数据写入到SymbolFrequencies中, 计算出了各个字符出现频率,
* 也计算出了FILE对象内的总字符数.
*/
symbol_count = get_symbol_frequencies(&sf, in);
/*
* Build an optimal table from the symbolCount.
* 建树完毕, root就是sf[0], sf[1]=sf[2]=...sf[MAX_SYMBOLS]=NULL了
*/
se = calculate_huffman_codes( &sf );
root = sf[0];
/*
* Scan the file again and, using the table
* previously built, encode it into the output file.
*/
rewind( in ); // 将文件指针重新指向一个流的开头
/*
* 将编码写如文件流
*/
rc = write_code_table(out, se, symbol_count);
if(rc == 0)
rc = do_file_encode(in, out, se);
/* Free the Huffman tree. */
free_huffman_tree( root );
free_encoder(se);
return rc;
}
/**
* 对FILE进行解码
*/
int
huffman_decode_file(FILE *in, FILE *out)
{
huffman_node *root;
huffman_node *p;
int c;
unsigned int data_count;
/* Read the Huffman code table. */
root = read_code_table(in, &data_count);
if( !root ) return 1;
/* Decode the file. */
p = root;
while(data_count>0 && (c=fgetc(in))!=EOF)
{
unsigned char byte = (unsigned char)c;
unsigned char mask = 1;
while(data_count > 0 && mask)
{
p = ( (byte&mask)? p->one : p->zero );
mask <<= 1;
if( p->isLeaf )
{
fputc(p->symbol, out);
p = root;
--data_count;
}
}
}
free_huffman_tree( root );
return 0;
}
// --------------------------------------------------------------------------------------
#define CACHE_SIZE 1024 /*
* 对memory编码需要考虑到数据源的速度和bufout的接收速度不匹配, cache
* cache是为了连接不同速度的设备而设计的
*/
/**
* 对buffer进行编码
*
*/
int huffman_encode_memory(const unsigned char *bufin,
unsigned int bufinlen,
unsigned char **pbufout,
unsigned int *pbufoutlen)
{
SymbolFrequencies sf;
SymbolEncoder *se;
huffman_node *root = NULL;
int rc;
unsigned int symbol_count; // memory中的字符个数
buf_cache cache; //
/* Ensure the arguments are valid. 检测参数合法性 */
if(!pbufout || !pbufoutlen) return 1;
if( init_cache(&cache, CACHE_SIZE, pbufout, pbufoutlen) ) return 1;
/*
* Get the frequency of each symbol in the input memory
* 计算bufin内各个字符出现的频率, 并求得bufin内存储的字符个数.
*/
symbol_count = get_symbol_frequencies_from_memory(&sf, bufin, bufinlen);
/*
* Build an optimal table from the symbolCount.
* 为每个Node编码, 如果这个Node的symbol为NULL, 则不编码了.
*/
se = calculate_huffman_codes( &sf );
root = sf[0]; // root来拉, 哈哈, 逻辑树出来了.
/*
* Scan the memory again and, using the table
* previously built, encode it into the output memory.
* 将se内的数据统统的写入到cache中克.
*/
rc = write_code_table_to_memory(&cache, se, symbol_count);
if(rc == 0)
{
/*
* 为什么write_code_table_to_memory成功之后还要要执行一次do_memory_encode?
*
*/
rc = do_memory_encode(&cache, bufin, bufinlen, se);
}
/* Flush the cache. */
flush_cache( &cache );
/* Free the Huffman tree. */
free_huffman_tree( root );
free_encoder( se );
free_cache( &cache );
return rc;
}
/**
* 对bufin进行解码. 将解码后的数据写入到bufout中.
*/
int huffman_decode_memory(const unsigned char *bufin,
unsigned int bufinlen,
unsigned char **pbufout,
unsigned int *pbufoutlen)
{
huffman_node *root, *p;
unsigned int data_count;
unsigned int i = 0;
unsigned char *buf;
unsigned int bufcur = 0;
/* Ensure the arguments are valid. */
if(!pbufout || !pbufoutlen) return 1;
/* Read the Huffman code table. */
root = read_code_table_from_memory(bufin, bufinlen, &i, &data_count);
if(!root) return 1;
buf = (unsigned char*)malloc(data_count);
/* Decode the memory. */
p = root;
for(; i < bufinlen && data_count > 0; ++i)
{
unsigned char byte = bufin[i];
unsigned char mask = 1;
while(data_count > 0 && mask)
{
p = byte & mask ? p->one : p->zero;
mask <<= 1;
if(p->isLeaf)
{
buf[bufcur++] = p->symbol;
p = root;
--data_count;
}
}
}
free_huffman_tree(root);
*pbufout = buf;
*pbufoutlen = bufcur;
return 0;
}
/*
--------------------------------------------------------------------------------
不妨假设待编码的buffer为 "ABCAADC"
手工分析可得该树的形状 :
当然也可以也可以将这个树沿y方向翻转180度
root
//
/ /
A *
//
/ /
C *
//
/ /
B D
现在我们知道的两个事实是 :
这个buffer内的字符数为 : symbol_count = 7 ( "ABCAADC"一个有7个字符 )
这个buffer内出现的字符种数为 : count = 4 ( 只出现了ABCD四种字符 )
接下来人工分析各个字符 :
symbol | count | bits
--------|-----------|---------------------
A | 3 | 0000 0000
B | 1 | 0000 0110
C | 2 | 0000 0010
D | 1 | 0000 0111
我们设置左边为0, 右边为1. bits为从叶子节点走到root的路径.
分析完毕后, 需要实现整个编码过程. 编码过程暂时跳过.
假设成功编码完毕, 需要把编码后的数据写入到bufout内.
bufout内的
0-3个byte为字符种数 count
4-7个byte为字符个数 symbol_count
然后是遍历SymbolEncoder, 依次对每种字符进行编码(我们这个例子只进行4次编码)
我们对每种字符都会进行编码, 每个字符编码后的输出不妨称为frame
那么这个frame是由三个部分组成的:
(这个我们可以肯定一个char肯定是1byte的)
symbol (1byte) -- 字符
(这个我们可以肯定就算这个树根本没有分支, 永远只有左/右孩子, 那也了不起是是256的深度)
numbits (1byte) -- 叶子走到root需要的步数
bits (1byte) -- 叶子走到root的方式(即最终的编码, 比如说011)
开始我对这个bites到底占了多少个byte很是怀疑, 因为我不知道从叶子走到root
到底耗费了几步. 这里需要好好研究一下, 最好和最差情况. 暂时假设是个变化的byte吧.
但是有一点bites跟numbits是有关系的, 所以只要知道numbits还是可以知道bites占据了
多少byte的, 也知道bits到底是有几位.
byte content
---------------------------------------------
0-3(4) count
4-7(4) symbol_count
A占xa个byte frame_struct
B占xb个byte frame_struct
C占xc个byte frame_struct
D占xd个byte frame_struct
X X
这个X是do_file_encode函数写到bufout中去的数据, 那么这个数据是什么呢?
实际上它是循环的把出现的字符的bits写到bufout中,
根据这个数据,解码的时候就可以依次的找到第0,1,2...个位置出现的是什么字符了
--------------------------------------------------------------------------------
*/
更多阅读
暴风影音如何下载安装解码器 暴风影音解码器
暴风影音如何下载安装解码器——简介暴风影音需要安装一些解码器之后才能有效的播放一些视频格式,所以安装解码器时必要的。通过这篇经验,教教大家如何下载安装暴风影音的解码器,希望能够帮助到大家。暴风影音如何下载安装解码器——
霍夫曼编码及解码算法 霍夫曼编码 贪心算法
Code:#include<iostream>#include<string>using namespace std;typedef struct{int weight;int parent,lchild,rchild;}HFNode;void Select(HFNode *HT,int n,int &s1,int&s2){//在前n个数组中,选择parent为
毕业设计:卷积码编码及其解码的程序设计_之&凡
1绪论信息论作为现代科学技术中具有重大意义的崭新学科的诞生标志是美国数学家香农1948年发表的著名论述《通信的数学理论》,至此以后编码技术已发展了半个多世纪,编码理论起源于现代通信技术与电子计算机技术中差错控制研究的需要,是
浅谈视频格式2-固定码率CBR 与可变码率VBR cbr和vbr的区别
一般在我们输出视频文件的时候都会碰到一个选择即 CBR与VBR,CBR的英文全称是Constant BitRate翻译过来是固定码率就是说每一秒种的画面如果看做是一个静止的图片文件的话(实际上是每一帧的画面大小加起来)它大小是固定的,VBR的英文全称
MPEG-4视频编解码知识点 mpeg4视频下载
最近在泰瑞达找到工作,以后也不做嵌入式的东西了。现在把研究生几年来写的东西一点一点贴上来,留给有需要的人。对于有着巨大信息量的视频处理来说,需要研究出更高压缩比、更低码率、更清晰画质的编解码算法。到目前为止,视频编解码的