Welcome to mirror list, hosted at ThFree Co, Russian Federation.

HuffmanEncoder.cpp « Huffman « Compress « 7zip - github.com/kornelski/7z.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 5876bae8dd9d865735306bf92d53eacf2762bcc2 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
// Compression/HuffmanEncoder.cpp

#include "StdAfx.h"

#include "HuffmanEncoder.h"
#include "Common/Defs.h"
#include "Common/Alloc.h"

namespace NCompression {
namespace NHuffman {

// Value thrown (as a const char *) by GenerateCodes() when the per-length code
// counts in m_BitLenCounters fail its consistency check.
static const char *kIncorrectBitLenCountsMessage = "Incorrect bit len counts";

// Constructs an encoder that owns no work arrays yet: the three array
// pointers start out null and are filled in by Create().
CEncoder::CEncoder()
  : m_Items(0)
  , m_Heap(0)
  , m_Depth(0)
{
}

// Releases the three work arrays and resets each pointer to null, so a later
// Free() never hands the same allocation to MyFree twice.
// NOTE(review): Free() is also reached while the pointers are still null
// (for example from Create() right after construction), so MyFree is expected
// to accept a null pointer.
void CEncoder::Free()
{
  MyFree(m_Items);
  m_Items = 0;

  MyFree(m_Heap);
  m_Heap = 0;

  MyFree(m_Depth);
  m_Depth = 0;
}

// Configures the encoder for an alphabet and (re)allocates its work arrays.
//   numSymbols - number of symbols in the alphabet.
//   extraBits  - optional table (may be null) of extra bits per symbol; it is
//                read as extraBits[symbol - extraBase] for symbols >= extraBase.
//                Only the pointer is stored, the table is not copied.
//   extraBase  - first symbol that has an entry in extraBits.
//   maxLength  - longest code length the encoder is allowed to assign; must be
//                in the range [1, kNumBitsInLongestCode].
// Returns true on success. Returns false, leaving the encoder without work
// arrays, if maxLength is out of range or if any allocation fails.
bool CEncoder::Create(UInt32 numSymbols,
    const Byte *extraBits, UInt32 extraBase, UInt32 maxLength)
{
  m_NumSymbols = numSymbols;
  m_ExtraBits = extraBits;
  m_ExtraBase = extraBase;
  m_MaxLength = maxLength;
  // BuildTree() numbers internal nodes from numSymbols upwards and leaves
  // m_Heap[0] unused, hence 2 * numSymbols + 1 entries per array.
  m_HeapSize = numSymbols * 2 + 1;

  // Drop the arrays of any previous Create() before (re)allocating.
  Free();

  // Code lengths are used as array indices: GenerateCodes() keeps a
  // nextCodes[kNumBitsInLongestCode + 1] table indexed by length, and
  // GenerateBitLen() indexes m_BitLenCounters[] by lengths up to m_MaxLength
  // (and starts its overflow search at m_MaxLength - 1). A limit of 0 or one
  // above kNumBitsInLongestCode would therefore index out of bounds, so it is
  // rejected here instead of corrupting memory later.
  if (maxLength == 0 || maxLength > static_cast<UInt32>(kNumBitsInLongestCode))
    return false;

  m_Items = (CItem *)MyAlloc(m_HeapSize * sizeof(CItem));
  m_Heap = (UInt32 *)MyAlloc(m_HeapSize * sizeof(UInt32));
  m_Depth = (Byte *)MyAlloc(m_HeapSize * sizeof(Byte));
  if (m_Items == 0 || m_Heap == 0 || m_Depth == 0)
  {
    // Partial allocation: release whatever succeeded so nothing leaks.
    Free();
    return false;
  }
  return true;
}

// Releases the work arrays owned by the encoder.
CEncoder::~CEncoder() { Free(); }

void CEncoder::StartNewBlock()
{
  for (UInt32 i = 0; i < m_NumSymbols; i++)
    m_Items[i].Freq = 0;
}

void CEncoder::SetFreqs(const UInt32 *freqs)
{
  for (UInt32 i = 0; i < m_NumSymbols; i++)
    m_Items[i].Freq = freqs[i];
}

// Index of the heap root in m_Heap, i.e. of the node with the least frequency.
// The heap is 1-based: BuildTree() never stores anything in m_Heap[0].
static const int kSmallest = 1;

// ===========================================================================
// Pops the root (least frequent node) off the heap: the last heap entry is
// moved to the root, m_HeapLength shrinks by one and the heap order is
// restored with DownHeap(). Returns the node that was removed.

UInt32 CEncoder::RemoveSmallest()
{
  const UInt32 removed = m_Heap[kSmallest];
  const UInt32 lastEntry = m_Heap[m_HeapLength];
  m_HeapLength--;
  m_Heap[kSmallest] = lastEntry;
  DownHeap(kSmallest);
  return removed;
}

// ===========================================================================
// Heap ordering predicate for two nodes n and m (indices into m_Items).
// A node is "smaller" when its frequency is lower; on equal frequencies the
// subtree depth in m_Depth breaks the tie (n wins when it is not deeper than
// m), which keeps the worst-case code length down.

bool CEncoder::Smaller(int n, int m)
{
  if (m_Items[n].Freq < m_Items[m].Freq)
    return true;
  if (m_Items[n].Freq == m_Items[m].Freq)
    return m_Depth[n] <= m_Depth[m];
  return false;
}

// ===========================================================================
// Restores the heap property below position k: the node stored at k sinks
// down, each time swapping places with the smaller of its two sons, until it
// is smaller than both sons or has reached the bottom of the heap.
// The sons of heap position p are positions 2*p and 2*p + 1.

void CEncoder::DownHeap(UInt32 k)
{
  const UInt32 sinking = m_Heap[k];   // node looking for its final position
  UInt32 son = k << 1;                // left son of k
  while (son <= m_HeapLength)
  {
    // Prefer the right son when it exists and is the smaller of the two.
    if (son < m_HeapLength && Smaller(m_Heap[son + 1], m_Heap[son]))
      son++;
    const UInt32 sonNode = m_Heap[son];
    if (Smaller(sinking, sonNode))
      break;                          // already smaller than both sons
    m_Heap[k] = sonNode;              // pull the smaller son one level up
    k = son;
    son <<= 1;                        // descend to the left son of the new k
  }
  m_Heap[k] = sinking;
}

// ===========================================================================
// Computes the code length (field Len) of every node of the tree and adds the
// cost of the coded symbols to m_BlockBitLength.
//   maxCode - largest leaf symbol; nodes with a greater index are not counted
//             as leaves.
//   heapMax - position in m_Heap of the tree root; positions heapMax + 1 up to
//             m_HeapSize - 1 hold the remaining nodes as stored by BuildTree().
// IN assertion: the fields Freq and Dad are set for all nodes of the tree.
// OUT assertions: Len holds the bit length of every leaf, limited to
//    m_MaxLength, and m_BitLenCounters[len] holds the number of leaves that
//    use each length.
// Assumes 1 <= m_MaxLength <= kNumBitsInLongestCode (not checked here).

void CEncoder::GenerateBitLen(UInt32 maxCode, UInt32 heapMax)
{
  for (UInt32 len = 0; len <= kNumBitsInLongestCode; len++)
    m_BitLenCounters[len] = 0;

  // Pass 1: every node is one level below its father. Lengths that exceed
  // m_MaxLength are clamped and counted so they can be repaired below.
  int numTooLong = 0;
  m_Items[m_Heap[heapMax]].Len = 0;   // the root has length 0
  UInt32 heapPos;
  for (heapPos = heapMax + 1; heapPos < m_HeapSize; heapPos++)
  {
    const UInt32 node = m_Heap[heapPos];
    UInt32 len = m_Items[m_Items[node].Dad].Len + 1;
    if (len > m_MaxLength)
    {
      numTooLong++;
      len = m_MaxLength;
    }
    // Stores the length; the Dad value of this node is not needed any more.
    m_Items[node].Len = len;
    if (node <= maxCode)              // only leaves are counted and costed
    {
      m_BitLenCounters[len]++;
      UInt32 extra = 0;
      if (m_ExtraBits != 0 && node >= m_ExtraBase)
        extra = m_ExtraBits[node - m_ExtraBase];
      m_BlockBitLength += (m_Items[node].Freq * (len + extra));
    }
  }
  if (numTooLong == 0)
    return;

  // Pass 2: fix up the length counts. Each round takes the deepest leaf that
  // is still shorter than m_MaxLength, moves it one level down and puts one
  // clamped item next to it as its brother. That removes two items from the
  // overflow count; m_BitLenCounters[m_MaxLength] drops by one only, because
  // the old brother of the clamped item moves up without leaving that level.
  while (numTooLong > 0)
  {
    UInt32 len = m_MaxLength - 1;
    while (m_BitLenCounters[len] == 0)
      len--;
    m_BitLenCounters[len]--;
    m_BitLenCounters[len + 1] += 2;
    m_BitLenCounters[m_MaxLength]--;
    numTooLong -= 2;
  }

  // Pass 3: hand the corrected lengths out again. heapPos is back at the end
  // of the node list after pass 1, so walking it backwards visits the nodes
  // in the reverse order, and the longest lengths are assigned first.
  // Reassigning every leaf is simpler than patching only the wrong ones.
  for (UInt32 len = m_MaxLength; len != 0; len--)
  {
    UInt32 remaining = m_BitLenCounters[len];
    while (remaining != 0)
    {
      const UInt32 node = m_Heap[--heapPos];
      if (node <= maxCode)            // internal nodes do not use up a length
      {
        if (m_Items[node].Len != static_cast<unsigned>(len))
        {
          const long delta =
              static_cast<long>(len) - static_cast<long>(m_Items[node].Len);
          m_BlockBitLength += delta * static_cast<long>(m_Items[node].Freq);
          m_Items[node].Len = len;
        }
        remaining--;
      }
    }
  }
}


// ===========================================================================
// Assigns the code values for the bit lengths already stored in the items
// (the lengths need not be optimal).
//   maxCode - largest symbol that receives a code.
// IN assertion: m_BitLenCounters holds the number of codes of each length and
//     the field Len is set for symbols 0 .. maxCode.
// OUT assertion: the field Code is set for every symbol 0 .. maxCode whose
//     length is not zero.
// Throws kIncorrectBitLenCountsMessage (a const char *) when the length
// counts are inconsistent.

void CEncoder::GenerateCodes(UInt32 maxCode)
{
  // First code value of each bit length; codes are generated without bit
  // reversal. Entry 0 is never used because zero-length symbols get no code.
  UInt32 nextCodes[kNumBitsInLongestCode + 1];
  UInt32 firstCode = 0;
  for (UInt32 len = 1; len <= kNumBitsInLongestCode; len++)
  {
    firstCode = (firstCode + m_BitLenCounters[len - 1]) << 1;
    nextCodes[len] = firstCode;
  }

  // Consistency check: the last code of the longest length must be all ones.
  if (firstCode + m_BitLenCounters[kNumBitsInLongestCode] - 1 != (1 << kNumBitsInLongestCode) - 1)
    throw kIncorrectBitLenCountsMessage;

  // Symbols of one length receive consecutive codes in symbol order.
  for (UInt32 symbol = 0; symbol <= maxCode; symbol++)
  {
    const int len = m_Items[symbol].Len;
    if (len != 0)
      m_Items[symbol].Code = nextCodes[len]++;
  }
}


// ===========================================================================
// Builds one Huffman tree from the symbol frequencies and assigns the code
// lengths and code values.
//   levels - output array with one entry per symbol (m_NumSymbols entries);
//            receives the code length of each symbol.
// IN assertion: the field Freq is set for symbols 0 .. m_NumSymbols - 1.
// OUT assertions: the fields Len and Code are set (via GenerateBitLen() and
//     GenerateCodes()), and m_BlockBitLength, which is reset to zero here,
//     has been updated for this tree.
// GenerateCodes() may throw a const char * if the length counts are
// inconsistent.

void CEncoder::BuildTree(Byte *levels)
{
  m_BlockBitLength = 0;
  m_HeapLength = 0;

  // Fill the heap array with every symbol that occurs. The heap is 1-based:
  // the sons of m_Heap[p] are m_Heap[2*p] and m_Heap[2*p+1], m_Heap[0] is not
  // used, and the least frequent node ends up in m_Heap[kSmallest].
  int maxCode = -1;   // largest symbol with non zero frequency
  UInt32 n;
  for (n = 0; n < m_NumSymbols; n++)
  {
    if (m_Items[n].Freq == 0)
    {
      m_Items[n].Len = 0;
      continue;
    }
    maxCode = n;
    m_Heap[++m_HeapLength] = maxCode;
    m_Depth[n] = 0;
  }

  // The pkzip format requires that at least one distance code exists, and
  // that at least one bit is sent even if there is only one possible code.
  // To avoid special checks later on, at least two codes of non zero
  // frequency are forced. A forced symbol is 0 or 1 unless maxCode is already
  // 1 (then it is 2); it is given frequency 1.
  while (m_HeapLength < 2)
  {
    int forced = 0;
    if (maxCode < 2)
      forced = ++maxCode;
    m_Heap[++m_HeapLength] = forced;
    m_Items[forced].Freq = 1;
    m_Depth[forced] = 0;
    m_BlockBitLength--;
  }

  // Positions m_HeapLength/2+1 .. m_HeapLength are leaves of the heap, so
  // sifting down the positions before them, last one first, orders the heap.
  n = m_HeapLength / 2;
  while (n != 0)
    DownHeap(n--);

  // Build the tree by repeatedly joining the two least frequent nodes under
  // a new father. Internal nodes are numbered from m_NumSymbols upwards, and
  // the joined nodes are parked at the top end of m_Heap, filled downwards.
  int father = m_NumSymbols;
  UInt32 sortedPos = m_NumSymbols * 2 + 1;
  do
  {
    const UInt32 least = RemoveSmallest();      // node of least frequency
    const UInt32 second = m_Heap[kSmallest];    // node of next least frequency

    m_Heap[--sortedPos] = least;
    m_Heap[--sortedPos] = second;

    m_Items[father].Freq = m_Items[least].Freq + m_Items[second].Freq;
    m_Depth[father] = static_cast<Byte>(MyMax(m_Depth[least], m_Depth[second]) + 1);
    m_Items[second].Dad = father;
    m_Items[least].Dad = father;

    // The father takes the place of the second node at the heap root.
    m_Heap[kSmallest] = father;
    father++;
    DownHeap(kSmallest);
  }
  while (m_HeapLength >= 2);

  // The single node left in the heap is the root of the tree.
  m_Heap[--sortedPos] = m_Heap[kSmallest];

  // Freq and Dad are set now: derive the bit lengths, then the codes.
  GenerateBitLen(maxCode, sortedPos);
  GenerateCodes(maxCode);

  for (UInt32 symbol = 0; symbol < m_NumSymbols; symbol++)
    levels[symbol] = Byte(m_Items[symbol].Len);
}

}}