Current Dev State

This commit is contained in:
Tim Lorsbach
2025-06-23 20:13:54 +02:00
parent b4f9bb277d
commit ded50edaa2
22617 changed files with 4345095 additions and 174 deletions

View File

@ -0,0 +1,374 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Function to find backward reference copies.
#include "./backward_references.h"
#include <algorithm>
#include <vector>
#include "./command.h"
namespace brotli {
// Core backward-reference search: walks the new input bytes, asks the Hasher
// for the best match at each position, and appends (insert, copy) Commands.
//
// Template parameters:
//   Hasher         - hash-table implementation used for match finding.
//   kUseCostModel  - if true, per-literal cost estimates (literal_cost) bias
//                    the match scoring; otherwise fixed heuristics are used.
//   kUseDictionary - if true, enables the "M1" one-literal-shift re-matching
//                    described below.
//
// num_bytes / position  - number of new bytes to process and their absolute
//                         stream position.
// ringbuffer (+ mask)   - input data, indexed modulo the ringbuffer size.
// literal_cost (+ mask) - per-byte literal cost estimates; only read when
//                         kUseCostModel is true.
// max_backward_limit    - largest allowed backward copy distance.
// base_min_score        - minimum score a match must reach to be emitted.
// quality               - compression effort level; tunes the heuristics.
// dist_cache            - last four used distances (for short distance codes).
// last_insert_len       - in/out: pending literal run carried across calls.
// commands/num_commands - output command array and its running count.
template<typename Hasher, bool kUseCostModel, bool kUseDictionary>
void CreateBackwardReferences(size_t num_bytes,
                              size_t position,
                              const uint8_t* ringbuffer,
                              size_t ringbuffer_mask,
                              const float* literal_cost,
                              size_t literal_cost_mask,
                              const size_t max_backward_limit,
                              const double base_min_score,
                              const int quality,
                              Hasher* hasher,
                              int* dist_cache,
                              int* last_insert_len,
                              Command* commands,
                              int* num_commands) {
  if (num_bytes >= 3 && position >= 3) {
    // Prepare the hashes for three last bytes of the last write.
    // These could not be calculated before, since they require knowledge
    // of both the previous and the current block.
    hasher->Store(&ringbuffer[(position - 3) & ringbuffer_mask],
                  position - 3);
    hasher->Store(&ringbuffer[(position - 2) & ringbuffer_mask],
                  position - 2);
    hasher->Store(&ringbuffer[(position - 1) & ringbuffer_mask],
                  position - 1);
  }
  const Command * const orig_commands = commands;
  int insert_length = *last_insert_len;
  // i walks the ringbuffer; i + i_diff is the absolute stream position.
  size_t i = position & ringbuffer_mask;
  const int i_diff = position - i;
  const size_t i_end = i + num_bytes;
  // For speed up heuristics for random data.
  const int random_heuristics_window_size = quality < 9 ? 64 : 512;
  int apply_random_heuristics = i + random_heuristics_window_size;
  // Average cost of one literal; 5.4 bits is the fixed fallback when no cost
  // model is in use.
  double average_cost = 5.4;
  if (kUseCostModel) {
    average_cost = 0.0;
    // NOTE(review): "int k" iterates over size_t bounds; truncates for
    // positions beyond INT_MAX -- confirm inputs stay below that.
    for (int k = position; k < position + num_bytes; ++k) {
      average_cost += literal_cost[k & literal_cost_mask];
    }
    if (num_bytes > 0) {
      average_cost /= num_bytes;
    }
  }
  // M1 match is for considering for two repeated copies, if moving
  // one literal from the previous copy to the current one allows the
  // current copy to be more efficient (because of the way the static
  // dictionary codes words). M1 matching improves text compression
  // density by ~0.15 %.
  bool match_found_M1 = false;
  int best_len_M1 = 0;
  int best_len_code_M1 = 0;
  int best_dist_M1 = 0;
  double best_score_M1 = 0;
  while (i + 3 < i_end) {
    int max_length = i_end - i;
    size_t max_distance = std::min(i + i_diff, max_backward_limit);
    double min_score = base_min_score;
    // Short pending insert runs raise the bar for accepting a match.
    if (kUseCostModel && insert_length < 8) {
      double cost_diff[8] =
          { 0.1, 0.038, 0.019, 0.013, 0.001, 0.001, 0.001, 0.001 };
      min_score += cost_diff[insert_length];
    }
    int best_len = 0;
    int best_len_code = 0;
    int best_dist = 0;
    double best_score = min_score;
    bool match_found = hasher->FindLongestMatch(
        ringbuffer, ringbuffer_mask,
        literal_cost, literal_cost_mask, average_cost,
        dist_cache, i + i_diff, max_length, max_distance,
        &best_len, &best_len_code, &best_dist, &best_score);
    if (match_found) {
      if (kUseDictionary && match_found_M1 && best_score_M1 > best_score) {
        // Two copies after each other. Take the last literal from the
        // last copy, and use it as the first of this one.
        // (Rewrites the previously emitted command with a one-shorter copy.)
        Command prev_cmd = commands[-1];
        commands[-1] = Command(prev_cmd.insert_len_,
                               prev_cmd.copy_len_ - 1,
                               prev_cmd.copy_len_ - 1,
                               prev_cmd.DistanceCode());
        hasher->Store(ringbuffer + i, i + i_diff);
        --i;
        best_len = best_len_M1;
        best_len_code = best_len_code_M1;
        best_dist = best_dist_M1;
        best_score = best_score_M1;
      } else {
        // Found a match. Let's look for something even better ahead
        // (lazy matching): repeatedly see if starting the copy one byte
        // later yields a sufficiently better match.
        int delayed_backward_references_in_row = 0;
        for (;;) {
          --max_length;
          int best_len_2 = quality < 4 ? std::min(best_len - 1, max_length) : 0;
          int best_len_code_2 = 0;
          int best_dist_2 = 0;
          double best_score_2 = min_score;
          max_distance = std::min(i + i_diff + 1, max_backward_limit);
          hasher->Store(ringbuffer + i, i + i_diff);
          match_found = hasher->FindLongestMatch(
              ringbuffer, ringbuffer_mask,
              literal_cost, literal_cost_mask, average_cost,
              dist_cache, i + i_diff + 1, max_length, max_distance,
              &best_len_2, &best_len_code_2, &best_dist_2, &best_score_2);
          // How much better the delayed match must score to be preferred.
          double cost_diff_lazy = 7.0;
          if (kUseCostModel) {
            cost_diff_lazy = 0.0;
            if (best_len >= 4) {
              cost_diff_lazy +=
                  literal_cost[(i + 4) & literal_cost_mask] - average_cost;
            }
            {
              const int tail_length = best_len_2 - best_len + 1;
              for (int k = 0; k < tail_length; ++k) {
                cost_diff_lazy -=
                    literal_cost[(i + best_len + k) & literal_cost_mask] -
                    average_cost;
              }
            }
            // If we are not inserting any symbols, inserting one is more
            // expensive than if we were inserting symbols anyways.
            if (insert_length < 1) {
              cost_diff_lazy += 0.97;
            }
            // Add bias to slightly avoid lazy matching.
            cost_diff_lazy += 2.0 + delayed_backward_references_in_row * 0.2;
            cost_diff_lazy += 0.04 * literal_cost[i & literal_cost_mask];
          }
          if (match_found && best_score_2 >= best_score + cost_diff_lazy) {
            // Ok, let's just write one byte for now and start a match from the
            // next byte.
            ++i;
            ++insert_length;
            best_len = best_len_2;
            best_len_code = best_len_code_2;
            best_dist = best_dist_2;
            best_score = best_score_2;
            // At most 4 delays in a row, then commit to the match.
            if (++delayed_backward_references_in_row < 4) {
              continue;
            }
          }
          break;
        }
      }
      apply_random_heuristics =
          i + 2 * best_len + random_heuristics_window_size;
      max_distance = std::min(i + i_diff, max_backward_limit);
      // Translate the distance into a distance code: 16 + dist by default,
      // or a short code (1..16) if it matches the recent-distance cache.
      int distance_code = best_dist + 16;
      if (best_dist <= max_distance) {
        if (best_dist == dist_cache[0]) {
          distance_code = 1;
        } else if (best_dist == dist_cache[1]) {
          distance_code = 2;
        } else if (best_dist == dist_cache[2]) {
          distance_code = 3;
        } else if (best_dist == dist_cache[3]) {
          distance_code = 4;
        } else if (quality > 1 && best_dist >= 6) {
          // Try cached-distance +/- small-offset short codes.
          for (int k = 4; k < kNumDistanceShortCodes; ++k) {
            int idx = kDistanceCacheIndex[k];
            int candidate = dist_cache[idx] + kDistanceCacheOffset[k];
            static const int kLimits[16] = {  0,  0,  0,  0,
                                              6,  6, 11, 11,
                                             11, 11, 11, 11,
                                             12, 12, 12, 12 };
            if (best_dist == candidate && best_dist >= kLimits[k]) {
              distance_code = k + 1;
              break;
            }
          }
        }
        // Update the recent-distance cache unless the distance was an
        // exact repeat of the most recent one.
        if (distance_code > 1) {
          dist_cache[3] = dist_cache[2];
          dist_cache[2] = dist_cache[1];
          dist_cache[1] = dist_cache[0];
          dist_cache[0] = best_dist;
        }
      }
      Command cmd(insert_length, best_len, best_len_code, distance_code);
      *commands++ = cmd;
      insert_length = 0;
      if (kUseDictionary) {
        ++i;
        // Copy all copied literals to the hasher, except the last one.
        // We cannot store the last one yet, otherwise we couldn't find
        // the possible M1 match.
        for (int j = 1; j < best_len - 1; ++j) {
          if (i + 3 < i_end) {
            hasher->Store(ringbuffer + i, i + i_diff);
          }
          ++i;
        }
        // Prepare M1 match.
        if (hasher->HasStaticDictionary() &&
            best_len >= 4 && i + 20 < i_end && best_dist <= max_distance) {
          max_distance = std::min(i + i_diff, max_backward_limit);
          best_score_M1 = min_score;
          match_found_M1 = hasher->FindLongestMatch(
              ringbuffer, ringbuffer_mask,
              literal_cost, literal_cost_mask, average_cost,
              dist_cache, i + i_diff, i_end - i, max_distance,
              &best_len_M1, &best_len_code_M1, &best_dist_M1, &best_score_M1);
        } else {
          match_found_M1 = false;
        }
        if (kUseCostModel) {
          // This byte is just moved from the previous copy to the current,
          // that is no gain.
          best_score_M1 -= literal_cost[i & literal_cost_mask];
          // Adjust for losing the opportunity for lazy matching.
          best_score_M1 -= 3.75;
        }
        // Store the last one of the match.
        if (i + 3 < i_end) {
          hasher->Store(ringbuffer + i, i + i_diff);
        }
        ++i;
      } else {
        // Put the hash keys into the table, if there are enough
        // bytes left.
        for (int j = 1; j < best_len; ++j) {
          hasher->Store(&ringbuffer[i + j], i + i_diff + j);
        }
        i += best_len;
      }
    } else {
      match_found_M1 = false;
      ++insert_length;
      hasher->Store(ringbuffer + i, i + i_diff);
      ++i;
      // If we have not seen matches for a long time, we can skip some
      // match lookups. Unsuccessful match lookups are very very expensive
      // and this kind of a heuristic speeds up compression quite
      // a lot.
      if (i > apply_random_heuristics) {
        // Going through uncompressible data, jump.
        if (i > apply_random_heuristics + 4 * random_heuristics_window_size) {
          // It is quite a long time since we saw a copy, so we assume
          // that this data is not compressible, and store hashes less
          // often. Hashes of non compressible data are less likely to
          // turn out to be useful in the future, too, so we store less of
          // them to not to flood out the hash table of good compressible
          // data.
          int i_jump = std::min(i + 16, i_end - 4);
          for (; i < i_jump; i += 4) {
            hasher->Store(ringbuffer + i, i + i_diff);
            insert_length += 4;
          }
        } else {
          int i_jump = std::min(i + 8, i_end - 3);
          for (; i < i_jump; i += 2) {
            hasher->Store(ringbuffer + i, i + i_diff);
            insert_length += 2;
          }
        }
      }
    }
  }
  // Whatever tail we did not cover with a copy stays as pending literals.
  insert_length += (i_end - i);
  *last_insert_len = insert_length;
  *num_commands += (commands - orig_commands);
}
// Non-template entry point: dispatches on hash_type to the matching
// template instantiation of CreateBackwardReferences. Hash types 1-7 use
// plain heuristics; type 8 enables the cost model and the static dictionary;
// type 9 enables only the cost model. Unknown types are silently ignored.
void CreateBackwardReferences(size_t num_bytes,
                              size_t position,
                              const uint8_t* ringbuffer,
                              size_t ringbuffer_mask,
                              const float* literal_cost,
                              size_t literal_cost_mask,
                              const size_t max_backward_limit,
                              const double base_min_score,
                              const int quality,
                              Hashers* hashers,
                              int hash_type,
                              int* dist_cache,
                              int* last_insert_len,
                              Command* commands,
                              int* num_commands) {
// Every case forwards the same argument list; only the hasher type, hasher
// instance and the two policy flags differ, so generate the cases.
#define BROTLI_DISPATCH_HASHER(N, COST_MODEL, DICTIONARY)                  \
  case N:                                                                  \
    CreateBackwardReferences<Hashers::H ## N, COST_MODEL, DICTIONARY>(     \
        num_bytes, position, ringbuffer, ringbuffer_mask,                  \
        literal_cost, literal_cost_mask, max_backward_limit,               \
        base_min_score, quality, hashers->hash_h ## N.get(), dist_cache,   \
        last_insert_len, commands, num_commands);                          \
    break;
  switch (hash_type) {
    BROTLI_DISPATCH_HASHER(1, false, false)
    BROTLI_DISPATCH_HASHER(2, false, false)
    BROTLI_DISPATCH_HASHER(3, false, false)
    BROTLI_DISPATCH_HASHER(4, false, false)
    BROTLI_DISPATCH_HASHER(5, false, false)
    BROTLI_DISPATCH_HASHER(6, false, false)
    BROTLI_DISPATCH_HASHER(7, false, false)
    BROTLI_DISPATCH_HASHER(8, true, true)
    BROTLI_DISPATCH_HASHER(9, true, false)
    default:
      break;
  }
#undef BROTLI_DISPATCH_HASHER
}
} // namespace brotli

View File

@ -0,0 +1,46 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Function to find backward reference copies.
#ifndef BROTLI_ENC_BACKWARD_REFERENCES_H_
#define BROTLI_ENC_BACKWARD_REFERENCES_H_
#include <stdint.h>
#include <vector>
#include "./hash.h"
#include "./command.h"
namespace brotli {
// Finds backward-reference copies in the "num_bytes" new input bytes starting
// at absolute stream "position" and appends them as Commands. hash_type
// selects which hash table in *hashers is used; dist_cache and
// last_insert_len carry state across successive calls. See the definition in
// backward_references.cc for the detailed parameter contract.
void CreateBackwardReferences(size_t num_bytes,
                              size_t position,
                              const uint8_t* ringbuffer,
                              size_t ringbuffer_mask,
                              const float* literal_cost,
                              size_t literal_cost_mask,
                              const size_t max_backward_limit,
                              const double base_min_score,
                              const int quality,
                              Hashers* hashers,
                              int hash_type,
                              int* dist_cache,
                              int* last_insert_len,
                              Command* commands,
                              int* num_commands);
} // namespace brotli
#endif // BROTLI_ENC_BACKWARD_REFERENCES_H_

View File

@ -0,0 +1,168 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Functions to estimate the bit cost of Huffman trees.
#ifndef BROTLI_ENC_BIT_COST_H_
#define BROTLI_ENC_BIT_COST_H_
#include <stdint.h>
#include "./entropy_encode.h"
#include "./fast_log.h"
namespace brotli {
// Shannon entropy (in bits) of the given symbol population, floored at one
// bit per symbol occurrence.
static inline double BitsEntropy(const int *population, int size) {
  int sum = 0;
  double retval = 0;
  for (int i = 0; i < size; ++i) {
    const int p = population[i];
    sum += p;
    retval -= p * FastLog2(p);
  }
  if (sum) retval += sum * FastLog2(sum);
  if (retval < sum) {
    // At least one bit per literal is needed.
    retval = sum;
  }
  return retval;
}
static const int kHuffmanExtraBits[kCodeLengthCodes] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3,
};
// Total bits needed to emit the code length stream: for each symbol of the
// code length alphabet, its count times (code depth + RLE extra bits).
static inline int HuffmanTreeBitCost(const int* counts, const uint8_t* depth) {
  int total_bits = 0;
  for (int symbol = 0; symbol < kCodeLengthCodes; ++symbol) {
    total_bits += counts[symbol] * (depth[symbol] + kHuffmanExtraBits[symbol]);
  }
  return total_bits;
}
// Convenience overload: computes the tree bit cost directly from a histogram
// over the code length alphabet and the entropy code built for it.
static inline int HuffmanTreeBitCost(
    const Histogram<kCodeLengthCodes>& histogram,
    const EntropyCode<kCodeLengthCodes>& entropy) {
  return HuffmanTreeBitCost(&histogram.data_[0], &entropy.depth_[0]);
}
// Estimates the number of bits needed to store the Huffman code whose code
// lengths are depth[0..length), using the run-length-compacted tree
// representation (code 16 repeats the previous non-zero length, code 17
// repeats zeros; trailing zeros are not stored at all).
// Fix: removed the locals tail_start and tree_size, which were written but
// never read.
static inline int HuffmanBitCost(const uint8_t* depth, int length) {
  int max_depth = 1;
  int histogram[kCodeLengthCodes] = { 0 };
  int prev_value = 8;  // Initial "previous code length" value.
  // Compute histogram of the compacted (run-length coded) code length stream.
  for (int i = 0; i < length;) {
    const int value = depth[i];
    if (value > max_depth) {
      max_depth = value;
    }
    // Length of the run of equal code lengths starting at i.
    int reps = 1;
    for (int k = i + 1; k < length && depth[k] == value; ++k) {
      ++reps;
    }
    i += reps;
    if (i == length && value == 0)
      break;  // Trailing zeros are omitted.
    if (value == 0) {
      if (reps < 3) {
        histogram[0] += reps;
      } else {
        // Approximate the number of code-17 (zero repeat) symbols needed.
        reps -= 2;
        while (reps > 0) {
          ++histogram[17];
          reps >>= 3;
        }
      }
    } else {
      if (value != prev_value) {
        // A run with a new length always emits one literal length code.
        ++histogram[value];
        --reps;
      }
      prev_value = value;
      if (reps < 3) {
        histogram[value] += reps;
      } else {
        // Approximate the number of code-16 (repeat) symbols needed.
        reps -= 2;
        while (reps > 0) {
          ++histogram[16];
          reps >>= 2;
        }
      }
    }
  }
  // Create the Huffman code over the code length alphabet ("huffman tree of
  // the huffman tree").
  uint8_t cost[kCodeLengthCodes] = { 0 };
  CreateHuffmanTree(histogram, kCodeLengthCodes, 7, cost);
  // Account for RLE extra bits.
  cost[16] += 2;
  cost[17] += 3;
  int bits = 18 + 2 * max_depth;  // Cost of storing the code length code.
  for (int i = 0; i < kCodeLengthCodes; ++i) {
    bits += histogram[i] * cost[i];  // Huffman-coded symbol stream cost.
  }
  return bits;
}
// Estimates the total bit cost of encoding the histogram's population with a
// Huffman code, including the cost of transmitting the code itself.
// NOTE(review): the constants 12, 20, 28 and 37 appear to be the header
// costs of simple Huffman codes with 0/1, 2, 3 and 4 symbols respectively --
// confirm against the bit stream writer.
template<int kSize>
double PopulationCost(const Histogram<kSize>& histogram) {
  if (histogram.total_count_ == 0) {
    return 12;
  }
  // Count distinct symbols; five or more are all handled the same way.
  int count = 0;
  for (int i = 0; i < kSize && count < 5; ++i) {
    if (histogram.data_[i] > 0) {
      ++count;
    }
  }
  if (count == 1) {
    return 12;
  }
  if (count == 2) {
    // One bit per symbol plus the simple-code header.
    return 20 + histogram.total_count_;
  }
  uint8_t depth[kSize] = { 0 };
  CreateHuffmanTree(&histogram.data_[0], kSize, 15, depth);
  // Bits for the symbol stream itself under the computed code.
  int bits = 0;
  for (int i = 0; i < kSize; ++i) {
    bits += histogram.data_[i] * depth[i];
  }
  if (count == 3) {
    bits += 28;
  } else if (count == 4) {
    bits += 37;
  } else {
    // General case: add the cost of storing the code lengths.
    bits += HuffmanBitCost(depth, kSize);
  }
  return bits;
}
} // namespace brotli
#endif // BROTLI_ENC_BIT_COST_H_

View File

@ -0,0 +1,402 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Block split point selection utilities.
#include "./block_splitter.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <map>
#include <vector>
#include "./cluster.h"
#include "./command.h"
#include "./fast_log.h"
#include "./histogram.h"
namespace brotli {
static const int kMaxLiteralHistograms = 100;
static const int kMaxCommandHistograms = 50;
static const double kLiteralBlockSwitchCost = 28.1;
static const double kCommandBlockSwitchCost = 13.5;
static const double kDistanceBlockSwitchCost = 14.6;
static const int kLiteralStrideLength = 70;
static const int kCommandStrideLength = 40;
static const int kSymbolsPerLiteralHistogram = 544;
static const int kSymbolsPerCommandHistogram = 530;
static const int kSymbolsPerDistanceHistogram = 544;
static const int kMinLengthForBlockSplitting = 128;
static const int kIterMulForRefining = 2;
static const int kMinItersForRefining = 100;
void CopyLiteralsToByteArray(const Command* cmds,
const size_t num_commands,
const uint8_t* data,
std::vector<uint8_t>* literals) {
// Count how many we have.
size_t total_length = 0;
for (int i = 0; i < num_commands; ++i) {
total_length += cmds[i].insert_len_;
}
if (total_length == 0) {
return;
}
// Allocate.
literals->resize(total_length);
// Loop again, and copy this time.
size_t pos = 0;
size_t from_pos = 0;
for (int i = 0; i < num_commands && pos < total_length; ++i) {
memcpy(&(*literals)[pos], data + from_pos, cmds[i].insert_len_);
pos += cmds[i].insert_len_;
from_pos += cmds[i].insert_len_ + cmds[i].copy_len_;
}
}
void CopyCommandsToByteArray(const Command* cmds,
const size_t num_commands,
std::vector<uint16_t>* insert_and_copy_codes,
std::vector<uint8_t>* distance_prefixes) {
for (int i = 0; i < num_commands; ++i) {
const Command& cmd = cmds[i];
insert_and_copy_codes->push_back(cmd.cmd_prefix_);
if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
distance_prefixes->push_back(cmd.dist_prefix_);
}
}
}
// Multiplicative congruential pseudo-random generator (Park-Miller
// multiplier, wrapping modulo 2^32). The all-zero state would be absorbing,
// so it is bumped to 1. Updates *seed in place and returns the new value.
inline static unsigned int MyRand(unsigned int* seed) {
  unsigned int next = *seed * 16807U;
  if (next == 0) {
    next = 1;
  }
  *seed = next;
  return next;
}
// Seeds *vec with histograms sampled at roughly evenly spaced, slightly
// jittered positions of the data. Produces at most max_histograms entries,
// aiming for about literals_per_histogram symbols per histogram; each sample
// covers "stride" consecutive symbols.
template<typename HistogramType, typename DataType>
void InitialEntropyCodes(const DataType* data, size_t length,
                         int literals_per_histogram,
                         int max_histograms,
                         size_t stride,
                         std::vector<HistogramType>* vec) {
  int total_histograms = length / literals_per_histogram + 1;
  if (total_histograms > max_histograms) {
    total_histograms = max_histograms;
  }
  unsigned int seed = 7;
  const int block_length = length / total_histograms;
  for (int histo_ix = 0; histo_ix < total_histograms; ++histo_ix) {
    // Evenly spaced base position, jittered for all but the first histogram.
    int pos = length * histo_ix / total_histograms;
    if (histo_ix != 0) {
      pos += MyRand(&seed) % block_length;
    }
    // Keep the sampled window inside the data.
    if (pos + stride >= length) {
      pos = length - stride - 1;
    }
    HistogramType sample;
    sample.Add(data + pos, stride);
    vec->push_back(sample);
  }
}
// Adds "stride" consecutive symbols starting at a random offset into
// *sample. If the data is shorter than the stride, the whole data is used.
template<typename HistogramType, typename DataType>
void RandomSample(unsigned int* seed,
                  const DataType* data,
                  size_t length,
                  size_t stride,
                  HistogramType* sample) {
  size_t start = 0;
  if (stride >= length) {
    stride = length;
  } else {
    start = MyRand(seed) % (length - stride + 1);
  }
  sample->Add(data + start, stride);
}
// Refines the histograms in *vec by mixing in randomly sampled slices of
// the data, distributed round-robin over the histograms.
template<typename HistogramType, typename DataType>
void RefineEntropyCodes(const DataType* data, size_t length,
                        size_t stride,
                        std::vector<HistogramType>* vec) {
  int iters = kIterMulForRefining * length / stride + kMinItersForRefining;
  // Round up to a multiple of vec->size() so every histogram receives the
  // same number of samples.
  iters = ((iters + vec->size() - 1) / vec->size()) * vec->size();
  unsigned int seed = 7;
  for (int iter = 0; iter < iters; ++iter) {
    HistogramType sample;
    RandomSample(&seed, data, length, stride, &sample);
    (*vec)[iter % vec->size()].AddHistogram(sample);
  }
}
// Bits needed to code one symbol occurring "count" times out of "total"
// under an ideal entropy code; a symbol the histogram never saw (count == 0)
// is charged two bits on top of log2(total).
inline static float BitCost(int total, int count) {
  if (count == 0) {
    return FastLog2(total) + 2;
  }
  return FastLog2(total) - FastLog2(count);
}
// Assigns a block id (an index into "vec") to every position of "data" by
// dynamic programming: coding each symbol with one of the given entropy
// codes, with a penalty of block_switch_bitcost for switching codes.
// Fixes: the three new[]/memset/delete[] buffers are replaced by
// std::vector so they are released even if an exception propagates (RAII),
// and a guard for length == 0 avoids reading block_id[-1] in the traceback.
template<typename DataType, int kSize>
void FindBlocks(const DataType* data, const size_t length,
                const double block_switch_bitcost,
                const std::vector<Histogram<kSize> > &vec,
                uint8_t *block_id) {
  if (vec.size() <= 1) {
    // Only one entropy code: everything is block 0.
    for (size_t i = 0; i < length; ++i) {
      block_id[i] = 0;
    }
    return;
  }
  if (length == 0) {
    return;
  }
  const int vecsize = vec.size();
  // insert_cost[symbol * vecsize + k]: bits to code "symbol" with code k.
  std::vector<double> insert_cost(kSize * vecsize, 0.0);
  for (int i = 0; i < kSize; ++i) {
    for (int j = 0; j < vecsize; ++j) {
      insert_cost[i * vecsize + j] =
          BitCost(vec[j].total_count_, vec[j].data_[i]);
    }
  }
  std::vector<double> cost(vecsize, 0.0);
  std::vector<char> switch_signal(length * vecsize, 0);
  // After each iteration of this loop, cost[k] will contain the difference
  // between the minimum cost of arriving at the current byte position using
  // entropy code k, and the minimum cost of arriving at the current byte
  // position. This difference is capped at the block switch cost, and if it
  // reaches block switch cost, it means that when we trace back from the last
  // position, we need to switch here.
  for (size_t byte_ix = 0; byte_ix < length; ++byte_ix) {
    int ix = byte_ix * vecsize;
    int insert_cost_ix = data[byte_ix] * vecsize;
    double min_cost = 1e99;
    for (int k = 0; k < vecsize; ++k) {
      // We are coding the symbol in data[byte_ix] with entropy code k.
      cost[k] += insert_cost[insert_cost_ix + k];
      if (cost[k] < min_cost) {
        min_cost = cost[k];
        block_id[byte_ix] = k;
      }
    }
    double block_switch_cost = block_switch_bitcost;
    // More blocks for the beginning.
    if (byte_ix < 2000) {
      block_switch_cost *= 0.77 + 0.07 * byte_ix / 2000;
    }
    for (int k = 0; k < vecsize; ++k) {
      cost[k] -= min_cost;
      if (cost[k] >= block_switch_cost) {
        cost[k] = block_switch_cost;
        switch_signal[ix + k] = 1;
      }
    }
  }
  // Now trace back from the last position and switch at the marked places.
  size_t byte_ix = length - 1;
  size_t ix = byte_ix * vecsize;
  int cur_id = block_id[byte_ix];
  while (byte_ix > 0) {
    --byte_ix;
    ix -= vecsize;
    if (switch_signal[ix + cur_id]) {
      cur_id = block_id[byte_ix];
    }
    block_id[byte_ix] = cur_id;
  }
}
// Renumbers the block ids in block_ids[0..length) to a dense range
// 0..n-1 in order of first appearance and returns n, the number of
// distinct ids.
// Fix: loop indices are size_t to match length (previously int,
// a signed/unsigned mismatch that breaks for very long inputs).
int RemapBlockIds(uint8_t* block_ids, const size_t length) {
  std::map<uint8_t, uint8_t> new_id;
  int next_id = 0;
  for (size_t i = 0; i < length; ++i) {
    if (new_id.find(block_ids[i]) == new_id.end()) {
      new_id[block_ids[i]] = next_id;
      ++next_id;
    }
  }
  for (size_t i = 0; i < length; ++i) {
    block_ids[i] = new_id[block_ids[i]];
  }
  return next_id;
}
// Rebuilds one histogram per block type from the current position-to-type
// assignment. Block ids are first remapped to a dense 0..n-1 range.
template<typename HistogramType, typename DataType>
void BuildBlockHistograms(const DataType* data, const size_t length,
                          uint8_t* block_ids,
                          std::vector<HistogramType>* histograms) {
  const int num_types = RemapBlockIds(block_ids, length);
  histograms->clear();
  histograms->resize(num_types);
  for (size_t pos = 0; pos < length; ++pos) {
    (*histograms)[block_ids[pos]].Add(data[pos]);
  }
}
// Merges similar blocks: builds one histogram per contiguous run of equal
// block ids, clusters those histograms, and relabels every position with
// the cluster symbol of its run.
template<typename HistogramType, typename DataType>
void ClusterBlocks(const DataType* data, const size_t length,
                   uint8_t* block_ids) {
  std::vector<HistogramType> run_histograms;
  std::vector<int> run_index(length);
  HistogramType accumulator;
  int run = 0;
  for (size_t pos = 0; pos < length; ++pos) {
    run_index[pos] = run;
    accumulator.Add(data[pos]);
    const bool last_of_run =
        (pos + 1 == length) || (block_ids[pos] != block_ids[pos + 1]);
    if (last_of_run) {
      run_histograms.push_back(accumulator);
      accumulator.Clear();
      ++run;
    }
  }
  // Block ids need to fit in one byte.
  static const int kMaxNumberOfBlockTypes = 256;
  std::vector<HistogramType> clustered_histograms;
  std::vector<int> histogram_symbols;
  ClusterHistograms(run_histograms, 1, run_histograms.size(),
                    kMaxNumberOfBlockTypes,
                    &clustered_histograms,
                    &histogram_symbols);
  for (size_t pos = 0; pos < length; ++pos) {
    block_ids[pos] = histogram_symbols[run_index[pos]];
  }
}
// Converts a per-position block-id array into run-length form: a list of
// (type, length) pairs in split->types/lengths, plus split->num_types, the
// number of distinct types (largest id + 1). block_ids must be non-empty.
void BuildBlockSplit(const std::vector<uint8_t>& block_ids, BlockSplit* split) {
  int run_type = block_ids[0];
  int run_length = 1;
  int max_type = -1;
  for (size_t i = 1; i < block_ids.size(); ++i) {
    if (block_ids[i] == run_type) {
      ++run_length;
    } else {
      // Close the current run and start a new one.
      split->types.push_back(run_type);
      split->lengths.push_back(run_length);
      max_type = std::max(max_type, run_type);
      run_type = block_ids[i];
      run_length = 1;
    }
  }
  // Close the final run.
  split->types.push_back(run_type);
  split->lengths.push_back(run_length);
  max_type = std::max(max_type, run_type);
  split->num_types = max_type + 1;
}
// Splits "data" into blocks of similar entropy and records the result in
// *split. literals_per_histogram and max_histograms size the initial set of
// candidate entropy codes, sampling_stride_length is the sampling window,
// and block_switch_cost is the bit penalty for switching blocks.
template<typename HistogramType, typename DataType>
void SplitByteVector(const std::vector<DataType>& data,
                     const int literals_per_histogram,
                     const int max_histograms,
                     const int sampling_stride_length,
                     const double block_switch_cost,
                     BlockSplit* split) {
  if (data.empty()) {
    // No symbols at all: one (implicit) block type, no runs recorded.
    split->num_types = 1;
    return;
  } else if (data.size() < kMinLengthForBlockSplitting) {
    // Too little data to benefit from splitting: one block covering it all.
    split->num_types = 1;
    split->types.push_back(0);
    split->lengths.push_back(data.size());
    return;
  }
  std::vector<HistogramType> histograms;
  // Find good entropy codes.
  InitialEntropyCodes(data.data(), data.size(),
                      literals_per_histogram,
                      max_histograms,
                      sampling_stride_length,
                      &histograms);
  RefineEntropyCodes(data.data(), data.size(),
                     sampling_stride_length,
                     &histograms);
  // Find a good path through literals with the good entropy codes.
  // Alternates between assigning positions to codes and rebuilding the
  // codes from the assignment (EM-style refinement, 10 rounds).
  std::vector<uint8_t> block_ids(data.size());
  for (int i = 0; i < 10; ++i) {
    FindBlocks(data.data(), data.size(),
               block_switch_cost,
               histograms,
               &block_ids[0]);
    BuildBlockHistograms(data.data(), data.size(), &block_ids[0], &histograms);
  }
  // Merge similar blocks and emit the run-length form of the split.
  ClusterBlocks<HistogramType>(data.data(), data.size(), &block_ids[0]);
  BuildBlockSplit(block_ids, split);
}
// Computes the three block splits of a meta-block -- literals,
// insert-and-copy codes, and distance prefixes -- from the command stream.
void SplitBlock(const Command* cmds,
                const size_t num_commands,
                const uint8_t* data,
                BlockSplit* literal_split,
                BlockSplit* insert_and_copy_split,
                BlockSplit* dist_split) {
  // Create a continuous array of literals.
  std::vector<uint8_t> literals;
  CopyLiteralsToByteArray(cmds, num_commands, data, &literals);
  // Compute prefix codes for commands.
  std::vector<uint16_t> insert_and_copy_codes;
  std::vector<uint8_t> distance_prefixes;
  CopyCommandsToByteArray(cmds, num_commands,
                          &insert_and_copy_codes,
                          &distance_prefixes);
  // Split each of the three symbol streams with its own tuning constants.
  SplitByteVector<HistogramLiteral>(
      literals,
      kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
      kLiteralStrideLength, kLiteralBlockSwitchCost,
      literal_split);
  SplitByteVector<HistogramCommand>(
      insert_and_copy_codes,
      kSymbolsPerCommandHistogram, kMaxCommandHistograms,
      kCommandStrideLength, kCommandBlockSwitchCost,
      insert_and_copy_split);
  SplitByteVector<HistogramDistance>(
      distance_prefixes,
      kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
      kCommandStrideLength, kDistanceBlockSwitchCost,
      dist_split);
}
// Partitions the command stream into consecutive groups covering roughly
// target_length uncompressed bytes each, appending every group to *blocks.
// Fix: loop index is size_t to match num_commands (previously int,
// a signed/unsigned mismatch).
void SplitBlockByTotalLength(const Command* all_commands,
                             const size_t num_commands,
                             int input_size,
                             int target_length,
                             std::vector<std::vector<Command> >* blocks) {
  const int num_blocks = input_size / target_length + 1;
  // Spread the input evenly so the last block is not tiny.
  const int length_limit = input_size / num_blocks + 1;
  int total_length = 0;
  std::vector<Command> cur_block;
  for (size_t i = 0; i < num_commands; ++i) {
    const Command& cmd = all_commands[i];
    const int cmd_length = cmd.insert_len_ + cmd.copy_len_;
    // The limit is checked before adding the current command, so a block
    // may exceed length_limit by up to one command's length.
    if (total_length > length_limit) {
      blocks->push_back(cur_block);
      cur_block.clear();
      total_length = 0;
    }
    cur_block.push_back(cmd);
    total_length += cmd_length;
  }
  blocks->push_back(cur_block);
}
} // namespace brotli

View File

@ -0,0 +1,74 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Block split point selection utilities.
#ifndef BROTLI_ENC_BLOCK_SPLITTER_H_
#define BROTLI_ENC_BLOCK_SPLITTER_H_
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <vector>
#include <utility>
#include "./command.h"
#include "./metablock.h"
namespace brotli {
// Iterates over a BlockSplit one symbol position at a time, exposing the
// block type of the current position in type_. Call Next() once per symbol.
struct BlockSplitIterator {
  explicit BlockSplitIterator(const BlockSplit& split)
      : split_(split), idx_(0), type_(0), length_(0) {
    if (!split.lengths.empty()) {
      length_ = split.lengths[0];
    }
  }
  void Next() {
    // Advance to the next (type, length) run when the current one is used up.
    // NOTE(review): assumes idx_ + 1 is always a valid run index here, i.e.
    // Next() is never called more times than the total split length --
    // confirm against callers.
    if (length_ == 0) {
      ++idx_;
      type_ = split_.types[idx_];
      length_ = split_.lengths[idx_];
    }
    --length_;
  }
  const BlockSplit& split_;  // Underlying split (not owned).
  int idx_;                  // Index of the current run.
  int type_;                 // Block type at the current position.
  int length_;               // Symbols remaining in the current run.
};
// Concatenates the literal (insert) bytes of all commands, taken from
// "data", into *literals.
void CopyLiteralsToByteArray(const Command* cmds,
                             const size_t num_commands,
                             const uint8_t* data,
                             std::vector<uint8_t>* literals);
// Computes the literal, insert-and-copy and distance block splits of a
// meta-block from its command stream.
void SplitBlock(const Command* cmds,
                const size_t num_commands,
                const uint8_t* data,
                BlockSplit* literal_split,
                BlockSplit* insert_and_copy_split,
                BlockSplit* dist_split);
// Partitions the command stream into consecutive groups of roughly
// target_length uncompressed bytes each.
void SplitBlockByTotalLength(const Command* all_commands,
                             const size_t num_commands,
                             int input_size,
                             int target_length,
                             std::vector<std::vector<Command> >* blocks);
} // namespace brotli
#endif // BROTLI_ENC_BLOCK_SPLITTER_H_

View File

@ -0,0 +1,830 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Brotli bit stream functions to support the low level format. There are no
// compression algorithms here, just the right ordering of bits to match the
// specs.
#include "./brotli_bit_stream.h"
#include <algorithm>
#include <limits>
#include <vector>
#include "./bit_cost.h"
#include "./context.h"
#include "./entropy_encode.h"
#include "./fast_log.h"
#include "./prefix.h"
#include "./write_bits.h"
namespace brotli {
// returns false if fail
// nibblesbits represents the 2 bits to encode MNIBBLES (0-3)
// Computes the MLEN encoding for a meta-block of "length" bytes. On success
// writes the value bits (*bits), how many of them there are (*numbits), and
// the 2-bit MNIBBLES field (*nibblesbits); returns false if the length needs
// more than 24 bits and so cannot be represented.
bool EncodeMlen(size_t length, int* bits, int* numbits, int* nibblesbits) {
  --length;  // The format stores MLEN - 1.
  const int lg = (length == 0) ? 1 : Log2Floor(length) + 1;
  if (lg > 24) {
    return false;
  }
  // Round the bit count up to whole nibbles, with a minimum of 4 nibbles.
  const int mnibbles = (lg < 16 ? 16 : lg + 3) / 4;
  *nibblesbits = mnibbles - 4;
  *numbits = mnibbles * 4;
  *bits = length;
  return true;
}
// Stores a value in [0..255] with a variable-length code: a single 0 bit
// for n == 0; otherwise a 1 bit, 3 bits for Log2Floor(n), and then the
// low bits of n below its leading one.
void StoreVarLenUint8(int n, int* storage_ix, uint8_t* storage) {
  if (n == 0) {
    WriteBits(1, 0, storage_ix, storage);
    return;
  }
  WriteBits(1, 1, storage_ix, storage);
  const int value_bits = Log2Floor(n);
  WriteBits(3, value_bits, storage_ix, storage);
  WriteBits(value_bits, n - (1 << value_bits), storage_ix, storage);
}
// Stores the header bits of a compressed meta-block: ISLAST, the optional
// empty-block bit for the final block, MNIBBLES and MLEN - 1, and (for
// non-final blocks) ISUNCOMPRESSED = 0. Returns false when the length
// cannot be encoded (zero for a non-final block, or too large for 24 bits).
bool StoreCompressedMetaBlockHeader(bool final_block,
                                    size_t length,
                                    int* storage_ix,
                                    uint8_t* storage) {
  // Write ISLAST bit.
  WriteBits(1, final_block, storage_ix, storage);
  // Write ISEMPTY bit.
  if (final_block) {
    WriteBits(1, length == 0, storage_ix, storage);
    if (length == 0) {
      return true;
    }
  }
  if (length == 0) {
    // Only the last meta-block can be empty.
    return false;
  }
  int lenbits;
  int nlenbits;
  int nibblesbits;
  if (!EncodeMlen(length, &lenbits, &nlenbits, &nibblesbits)) {
    return false;
  }
  // MNIBBLES - 4, then MLEN - 1 in MNIBBLES nibbles.
  WriteBits(2, nibblesbits, storage_ix, storage);
  WriteBits(nlenbits, lenbits, storage_ix, storage);
  if (!final_block) {
    // Write ISUNCOMPRESSED bit.
    WriteBits(1, 0, storage_ix, storage);
  }
  return true;
}
// Stores the header bits of an uncompressed meta-block: ISLAST = 0,
// MNIBBLES and MLEN - 1, then ISUNCOMPRESSED = 1. Returns false when the
// length cannot be encoded.
bool StoreUncompressedMetaBlockHeader(size_t length,
                                      int* storage_ix,
                                      uint8_t* storage) {
  // Write ISLAST bit. Uncompressed block cannot be the last one, so set to 0.
  WriteBits(1, 0, storage_ix, storage);
  int lenbits;
  int nlenbits;
  int nibblesbits;
  if (!EncodeMlen(length, &lenbits, &nlenbits, &nibblesbits)) {
    return false;
  }
  WriteBits(2, nibblesbits, storage_ix, storage);
  WriteBits(nlenbits, lenbits, storage_ix, storage);
  // Write ISUNCOMPRESSED bit.
  WriteBits(1, 1, storage_ix, storage);
  return true;
}
// Stores the Huffman code over the code-length alphabet (the "tree of the
// tree") using the static code fixed by the format, preceded by the 2-bit
// skip count and with trailing zero-depth entries trimmed.
void StoreHuffmanTreeOfHuffmanTreeToBitMask(
    const int num_codes,
    const uint8_t *code_length_bitdepth,
    int *storage_ix,
    uint8_t *storage) {
  // Code-length symbols are stored in this fixed order.
  static const uint8_t kStorageOrder[kCodeLengthCodes] = {
    1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15
  };
  // The bit lengths of the Huffman code over the code length alphabet
  // are compressed with the following static Huffman code:
  //   Symbol   Code
  //   ------   ----
  //   0          00
  //   1        1110
  //   2         110
  //   3          01
  //   4          10
  //   5        1111
  static const uint8_t kHuffmanBitLengthHuffmanCodeSymbols[6] = {
    0, 7, 3, 2, 1, 15
  };
  static const uint8_t kHuffmanBitLengthHuffmanCodeBitLengths[6] = {
    2, 4, 3, 2, 2, 4
  };
  // Throw away trailing zeros:
  int codes_to_store = kCodeLengthCodes;
  if (num_codes > 1) {
    for (; codes_to_store > 0; --codes_to_store) {
      if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
        break;
      }
    }
  }
  // Up to three leading zero-depth entries can be skipped via a 2-bit count.
  int skip_some = 0;  // skips none.
  if (code_length_bitdepth[kStorageOrder[0]] == 0 &&
      code_length_bitdepth[kStorageOrder[1]] == 0) {
    skip_some = 2;  // skips two.
    if (code_length_bitdepth[kStorageOrder[2]] == 0) {
      skip_some = 3;  // skips three.
    }
  }
  WriteBits(2, skip_some, storage_ix, storage);
  // Emit each stored depth with the static code above.
  for (int i = skip_some; i < codes_to_store; ++i) {
    uint8_t l = code_length_bitdepth[kStorageOrder[i]];
    WriteBits(kHuffmanBitLengthHuffmanCodeBitLengths[l],
              kHuffmanBitLengthHuffmanCodeSymbols[l], storage_ix, storage);
  }
}
// Emits the run-length-coded tree symbols in huffman_tree using the
// code-length code given by code_length_bitdepth / _symbols, appending the
// extra bits carried by the repeat symbols 16 and 17.
void StoreHuffmanTreeToBitMask(
    const std::vector<uint8_t> &huffman_tree,
    const std::vector<uint8_t> &huffman_tree_extra_bits,
    const uint8_t *code_length_bitdepth,
    const std::vector<uint16_t> &code_length_bitdepth_symbols,
    int * __restrict storage_ix,
    uint8_t * __restrict storage) {
  for (int i = 0; i < huffman_tree.size(); ++i) {
    int ix = huffman_tree[i];
    WriteBits(code_length_bitdepth[ix], code_length_bitdepth_symbols[ix],
              storage_ix, storage);
    // Extra bits
    switch (ix) {
      case 16:
        // Repeat-previous symbol: 2 extra bits of repeat count.
        WriteBits(2, huffman_tree_extra_bits[i], storage_ix, storage);
        break;
      case 17:
        // Repeat-zero symbol: 3 extra bits of repeat count.
        WriteBits(3, huffman_tree_extra_bits[i], storage_ix, storage);
        break;
    }
  }
}
// Stores a simple (at most four symbols) Huffman tree: the 2-bit simple-tree
// marker, NSYM - 1, the symbol ids sorted by depth, and for four symbols the
// tree-select bit. `depths` holds the code length per alphabet symbol,
// `symbols` the (reordered in place) used symbol ids, and `max_bits` the
// width used to store each symbol id.
//
// Fix: the original duplicated the symbol writes per NSYM in an if/else
// chain whose final branch also ran for num_symbols == 1, emitting four
// symbols (three of them garbage). The single loop below writes exactly
// num_symbols symbols for every count.
void StoreSimpleHuffmanTree(const uint8_t* depths,
                            int symbols[4],
                            int num_symbols,
                            int max_bits,
                            int *storage_ix, uint8_t *storage) {
  // value of 1 indicates a simple Huffman code
  WriteBits(2, 1, storage_ix, storage);
  WriteBits(2, num_symbols - 1, storage_ix, storage);  // NSYM - 1
  // Sort the symbols by depth; equal depths keep their relative order.
  for (int i = 0; i < num_symbols; i++) {
    for (int j = i + 1; j < num_symbols; j++) {
      if (depths[symbols[j]] < depths[symbols[i]]) {
        std::swap(symbols[j], symbols[i]);
      }
    }
  }
  // Every NSYM stores the symbol ids back-to-back in max_bits each.
  for (int i = 0; i < num_symbols; ++i) {
    WriteBits(max_bits, symbols[i], storage_ix, storage);
  }
  if (num_symbols == 4) {
    // tree-select: 1 iff the first symbol has depth 1 (code lengths
    // 1,2,3,3 rather than 2,2,2,2).
    WriteBits(1, depths[symbols[0]] == 1 ? 1 : 0, storage_ix, storage);
  }
}
// num = alphabet size
// depths = symbol depths
// Stores the depths as a brotli-format Huffman tree: converts them to the
// run-length-coded representation, builds a second Huffman code over that
// representation, and writes both to the bit stream.
void StoreHuffmanTree(const uint8_t* depths, size_t num,
                      int *storage_ix, uint8_t *storage) {
  // Write the Huffman tree into the brotli-representation.
  std::vector<uint8_t> huffman_tree;
  std::vector<uint8_t> huffman_tree_extra_bits;
  // TODO: Consider allocating these from stack.
  huffman_tree.reserve(256);
  huffman_tree_extra_bits.reserve(256);
  WriteHuffmanTree(depths, num, &huffman_tree, &huffman_tree_extra_bits);
  // Calculate the statistics of the Huffman tree in brotli-representation.
  int huffman_tree_histogram[kCodeLengthCodes] = { 0 };
  for (int i = 0; i < huffman_tree.size(); ++i) {
    ++huffman_tree_histogram[huffman_tree[i]];
  }
  // Detect whether only one code-length symbol occurs; num_codes stays 1
  // in that case and its depth entry is cleared below before emission.
  int num_codes = 0;
  int code = 0;
  for (int i = 0; i < kCodeLengthCodes; ++i) {
    if (huffman_tree_histogram[i]) {
      if (num_codes == 0) {
        code = i;
        num_codes = 1;
      } else if (num_codes == 1) {
        num_codes = 2;
        break;
      }
    }
  }
  // Calculate another Huffman tree to use for compressing both the
  // earlier Huffman tree with.
  // TODO: Consider allocating these from stack.
  uint8_t code_length_bitdepth[kCodeLengthCodes] = { 0 };
  std::vector<uint16_t> code_length_bitdepth_symbols(kCodeLengthCodes);
  CreateHuffmanTree(&huffman_tree_histogram[0], kCodeLengthCodes,
                    5, &code_length_bitdepth[0]);
  ConvertBitDepthsToSymbols(code_length_bitdepth, kCodeLengthCodes,
                            code_length_bitdepth_symbols.data());
  // Now, we have all the data, let's start storing it
  StoreHuffmanTreeOfHuffmanTreeToBitMask(num_codes, code_length_bitdepth,
                                         storage_ix, storage);
  if (num_codes == 1) {
    code_length_bitdepth[code] = 0;
  }
  // Store the real huffman tree now.
  StoreHuffmanTreeToBitMask(huffman_tree,
                            huffman_tree_extra_bits,
                            &code_length_bitdepth[0],
                            code_length_bitdepth_symbols,
                            storage_ix, storage);
}
// Builds a Huffman tree from histogram[0:length] into depth[0:length] and
// bits[0:length] and stores the encoded tree to the bit stream. Alphabets
// with at most four used symbols use the simple-tree encoding; zero or one
// used symbol stores a single-symbol code without building a tree at all.
//
// Fix: the symbol-collection loop used `size_t i` against the `int` length
// and narrowed i into `int s4[]` — signed/unsigned mix; the index is now int.
void BuildAndStoreHuffmanTree(const int *histogram,
                              const int length,
                              uint8_t* depth,
                              uint16_t* bits,
                              int* storage_ix,
                              uint8_t* storage) {
  // Collect up to the first four used symbols; count ends > 4 when a fifth
  // (and sixth) used symbol exists, which selects the complex encoding.
  int count = 0;
  int s4[4] = { 0 };
  for (int i = 0; i < length; i++) {
    if (histogram[i]) {
      if (count < 4) {
        s4[count] = i;
      } else if (count > 4) {
        break;
      }
      count++;
    }
  }
  // Number of bits needed to address any symbol of the alphabet.
  int max_bits_counter = length - 1;
  int max_bits = 0;
  while (max_bits_counter) {
    max_bits_counter >>= 1;
    ++max_bits;
  }
  if (count <= 1) {
    // Single-symbol code: 4 header bits, then the symbol id.
    WriteBits(4, 1, storage_ix, storage);
    WriteBits(max_bits, s4[0], storage_ix, storage);
    return;
  }
  CreateHuffmanTree(histogram, length, 15, depth);
  ConvertBitDepthsToSymbols(depth, length, bits);
  if (count <= 4) {
    StoreSimpleHuffmanTree(depth, s4, count, max_bits, storage_ix, storage);
  } else {
    StoreHuffmanTree(depth, length, storage_ix, storage);
  }
}
// Returns the position of the first occurrence of value in v, or -1 when
// value does not occur.
int IndexOf(const std::vector<int>& v, int value) {
  const std::vector<int>::const_iterator pos =
      std::find(v.begin(), v.end(), value);
  if (pos == v.end()) {
    return -1;
  }
  return static_cast<int>(pos - v.begin());
}
// Moves the element at index to the front of v, shifting the elements
// before it one slot toward the back.
void MoveToFront(std::vector<int>* v, int index) {
  std::rotate(v->begin(), v->begin() + index, v->begin() + index + 1);
}
std::vector<int> MoveToFrontTransform(const std::vector<int>& v) {
if (v.empty()) return v;
std::vector<int> mtf(*std::max_element(v.begin(), v.end()) + 1);
for (int i = 0; i < mtf.size(); ++i) mtf[i] = i;
std::vector<int> result(v.size());
for (int i = 0; i < v.size(); ++i) {
int index = IndexOf(mtf, v[i]);
result[i] = index;
MoveToFront(&mtf, index);
}
return result;
}
// Finds runs of zeros in v_in and replaces them with a prefix code of the run
// length plus extra bits in *v_out and *extra_bits. Non-zero values in v_in are
// shifted by *max_length_prefix. Will not create prefix codes bigger than the
// initial value of *max_run_length_prefix. The prefix code of run length L is
// simply Log2Floor(L) and the number of extra bits is the same as the prefix
// code.
void RunLengthCodeZeros(const std::vector<int>& v_in,
                        int* max_run_length_prefix,
                        std::vector<int>* v_out,
                        std::vector<int>* extra_bits) {
  // First pass: find the longest zero run so the prefix cap can be lowered
  // to what is actually needed.
  int max_reps = 0;
  for (int i = 0; i < v_in.size();) {
    for (; i < v_in.size() && v_in[i] != 0; ++i) ;
    int reps = 0;
    for (; i < v_in.size() && v_in[i] == 0; ++i) {
      ++reps;
    }
    max_reps = std::max(reps, max_reps);
  }
  int max_prefix = max_reps > 0 ? Log2Floor(max_reps) : 0;
  *max_run_length_prefix = std::min(max_prefix, *max_run_length_prefix);
  // Second pass: emit the transformed symbols.
  for (int i = 0; i < v_in.size();) {
    if (v_in[i] != 0) {
      // Non-zero values are shifted past the run-length prefix alphabet.
      v_out->push_back(v_in[i] + *max_run_length_prefix);
      extra_bits->push_back(0);
      ++i;
    } else {
      // Measure the zero run starting at i.
      int reps = 1;
      for (uint32_t k = i + 1; k < v_in.size() && v_in[k] == 0; ++k) {
        ++reps;
      }
      i += reps;
      // Encode the run, splitting it whenever it exceeds the longest
      // representable run (2 << *max_run_length_prefix) - 1.
      while (reps) {
        if (reps < (2 << *max_run_length_prefix)) {
          int run_length_prefix = Log2Floor(reps);
          v_out->push_back(run_length_prefix);
          extra_bits->push_back(reps - (1 << run_length_prefix));
          break;
        } else {
          v_out->push_back(*max_run_length_prefix);
          extra_bits->push_back((1 << *max_run_length_prefix) - 1);
          reps -= (2 << *max_run_length_prefix) - 1;
        }
      }
    }
  }
}
// Returns a maximum zero-run-length-prefix value such that run-length coding
// zeros in v with this maximum prefix value and then encoding the resulting
// histogram and entropy-coding v produces the least amount of bits.
int BestMaxZeroRunLengthPrefix(const std::vector<int>& v) {
  int min_cost = std::numeric_limits<int>::max();
  int best_max_prefix = 0;
  // Try every candidate prefix cap and keep the cheapest encoding.
  for (int max_prefix = 0; max_prefix <= 16; ++max_prefix) {
    std::vector<int> rle_symbols;
    std::vector<int> extra_bits;
    int max_run_length_prefix = max_prefix;
    RunLengthCodeZeros(v, &max_run_length_prefix, &rle_symbols, &extra_bits);
    // The cap was lowered by the data itself; larger candidates would
    // produce the same encoding, so stop here.
    if (max_run_length_prefix < max_prefix) break;
    HistogramContextMap histogram;
    for (int i = 0; i < rle_symbols.size(); ++i) {
      histogram.Add(rle_symbols[i]);
    }
    int bit_cost = PopulationCost(histogram);
    if (max_prefix > 0) {
      // Account for the 4-bit field that stores the run-length prefix cap.
      bit_cost += 4;
    }
    for (int i = 1; i <= max_prefix; ++i) {
      bit_cost += histogram.data_[i] * i;  // extra bits
    }
    if (bit_cost < min_cost) {
      min_cost = bit_cost;
      best_max_prefix = max_prefix;
    }
  }
  return best_max_prefix;
}
// Encodes the given context map to the bit stream: move-to-front transform,
// zero-run-length coding, then an entropy code over the result. The number
// of different histogram ids is given by num_clusters.
void EncodeContextMap(const std::vector<int>& context_map,
                      int num_clusters,
                      int* storage_ix, uint8_t* storage) {
  StoreVarLenUint8(num_clusters - 1, storage_ix, storage);
  if (num_clusters == 1) {
    // A single-cluster map needs no further data.
    return;
  }
  std::vector<int> transformed_symbols = MoveToFrontTransform(context_map);
  std::vector<int> rle_symbols;
  std::vector<int> extra_bits;
  int max_run_length_prefix = BestMaxZeroRunLengthPrefix(transformed_symbols);
  RunLengthCodeZeros(transformed_symbols, &max_run_length_prefix,
                     &rle_symbols, &extra_bits);
  HistogramContextMap symbol_histogram;
  for (int i = 0; i < rle_symbols.size(); ++i) {
    symbol_histogram.Add(rle_symbols[i]);
  }
  bool use_rle = max_run_length_prefix > 0;
  WriteBits(1, use_rle, storage_ix, storage);
  if (use_rle) {
    // RLEMAX - 1.
    WriteBits(4, max_run_length_prefix - 1, storage_ix, storage);
  }
  EntropyCodeContextMap symbol_code;
  memset(symbol_code.depth_, 0, sizeof(symbol_code.depth_));
  memset(symbol_code.bits_, 0, sizeof(symbol_code.bits_));
  // The alphabet covers the run-length prefixes plus the shifted cluster ids.
  BuildAndStoreHuffmanTree(symbol_histogram.data_,
                           num_clusters + max_run_length_prefix,
                           symbol_code.depth_, symbol_code.bits_,
                           storage_ix, storage);
  for (int i = 0; i < rle_symbols.size(); ++i) {
    WriteBits(symbol_code.depth_[rle_symbols[i]],
              symbol_code.bits_[rle_symbols[i]],
              storage_ix, storage);
    // Run-length symbols carry their repeat count in extra bits.
    if (rle_symbols[i] > 0 && rle_symbols[i] <= max_run_length_prefix) {
      WriteBits(rle_symbols[i], extra_bits[i], storage_ix, storage);
    }
  }
  WriteBits(1, 1, storage_ix, storage);  // use move-to-front
}
// Stores the block switch command with index block_ix to the bit stream:
// the block type code (omitted for the very first block) followed by the
// block length prefix code and its extra bits.
void StoreBlockSwitch(const BlockSplitCode& code,
                      const int block_ix,
                      int* storage_ix,
                      uint8_t* storage) {
  if (block_ix > 0) {
    // The first block's type is implicit; only later switches store it.
    int typecode = code.type_code[block_ix];
    WriteBits(code.type_depths[typecode], code.type_bits[typecode],
              storage_ix, storage);
  }
  int lencode = code.length_prefix[block_ix];
  WriteBits(code.length_depths[lencode], code.length_bits[lencode],
            storage_ix, storage);
  WriteBits(code.length_nextra[block_ix], code.length_extra[block_ix],
            storage_ix, storage);
}
// Builds a BlockSplitCode data structure from the block split given by the
// vector of block types and block lengths, stores its entropy codes (and the
// first block switch) to the bit stream.
void BuildAndStoreBlockSplitCode(const std::vector<int>& types,
                                 const std::vector<int>& lengths,
                                 const int num_types,
                                 BlockSplitCode* code,
                                 int* storage_ix,
                                 uint8_t* storage) {
  const int num_blocks = types.size();
  // Two extra type symbols: code 0 ("repeat second-to-last type") and
  // code 1 ("last type + 1").
  std::vector<int> type_histo(num_types + 2);
  std::vector<int> length_histo(26);
  int last_type = 1;
  int second_last_type = 0;
  code->type_code.resize(num_blocks);
  code->length_prefix.resize(num_blocks);
  code->length_nextra.resize(num_blocks);
  code->length_extra.resize(num_blocks);
  code->type_depths.resize(num_types + 2);
  code->type_bits.resize(num_types + 2);
  code->length_depths.resize(26);
  code->length_bits.resize(26);
  for (int i = 0; i < num_blocks; ++i) {
    int type = types[i];
    // Plain types are shifted by 2 past the two special codes.
    int type_code = (type == last_type + 1 ? 1 :
                     type == second_last_type ? 0 :
                     type + 2);
    second_last_type = last_type;
    last_type = type;
    code->type_code[i] = type_code;
    // The first block's type code is never stored, so keep it out of the
    // histogram.
    if (i > 0) ++type_histo[type_code];
    GetBlockLengthPrefixCode(lengths[i],
                             &code->length_prefix[i],
                             &code->length_nextra[i],
                             &code->length_extra[i]);
    ++length_histo[code->length_prefix[i]];
  }
  StoreVarLenUint8(num_types - 1, storage_ix, storage);
  if (num_types > 1) {
    BuildAndStoreHuffmanTree(&type_histo[0], num_types + 2,
                             &code->type_depths[0], &code->type_bits[0],
                             storage_ix, storage);
    BuildAndStoreHuffmanTree(&length_histo[0], 26,
                             &code->length_depths[0], &code->length_bits[0],
                             storage_ix, storage);
    // The first block's length is stored immediately after the codes.
    StoreBlockSwitch(*code, 0, storage_ix, storage);
  }
}
// Stores a context map where the histogram type is always the block type,
// i.e. all (1 << context_bits) contexts of block type t map to histogram t.
// Encoded as one symbol plus a maximal repeat run per type.
void StoreTrivialContextMap(int num_types,
                            int context_bits,
                            int* storage_ix,
                            uint8_t* storage) {
  StoreVarLenUint8(num_types - 1, storage_ix, storage);
  if (num_types > 1) {
    int repeat_code = context_bits - 1;
    int repeat_bits = (1 << repeat_code) - 1;
    // Alphabet: run-length prefixes 1..repeat_code, then the shifted
    // type ids.
    int alphabet_size = num_types + repeat_code;
    std::vector<int> histogram(alphabet_size);
    std::vector<uint8_t> depths(alphabet_size);
    std::vector<uint16_t> bits(alphabet_size);
    // Write RLEMAX.
    WriteBits(1, 1, storage_ix, storage);
    WriteBits(4, repeat_code - 1, storage_ix, storage);
    histogram[repeat_code] = num_types;
    histogram[0] = 1;
    for (int i = context_bits; i < alphabet_size; ++i) {
      histogram[i] = 1;
    }
    BuildAndStoreHuffmanTree(&histogram[0], alphabet_size,
                             &depths[0], &bits[0],
                             storage_ix, storage);
    for (int i = 0; i < num_types; ++i) {
      // Each type: its (shifted) id, a maximal repeat symbol, and the
      // repeat count's extra bits.
      int code = (i == 0 ? 0 : i + context_bits - 1);
      WriteBits(depths[code], bits[code], storage_ix, storage);
      WriteBits(depths[repeat_code], bits[repeat_code], storage_ix, storage);
      WriteBits(repeat_code, repeat_bits, storage_ix, storage);
    }
    // Write IMTF (inverse-move-to-front) bit.
    WriteBits(1, 1, storage_ix, storage);
  }
}
// Manages the encoding of one block category (literal, command or distance).
class BlockEncoder {
 public:
  // alphabet_size: symbol alphabet size of this category.
  // block_types / block_lengths: the block split, one entry per block.
  BlockEncoder(int alphabet_size,
               int num_block_types,
               const std::vector<int>& block_types,
               const std::vector<int>& block_lengths)
      : alphabet_size_(alphabet_size),
        num_block_types_(num_block_types),
        block_types_(block_types),
        block_lengths_(block_lengths),
        block_ix_(0),
        block_len_(block_lengths.empty() ? 0 : block_lengths[0]),
        entropy_ix_(0) {}

  // Creates entropy codes of block lengths and block types and stores them
  // to the bit stream.
  void BuildAndStoreBlockSwitchEntropyCodes(int* storage_ix, uint8_t* storage) {
    BuildAndStoreBlockSplitCode(
        block_types_, block_lengths_, num_block_types_,
        &block_split_code_, storage_ix, storage);
  }

  // Creates entropy codes for all block types and stores them to the bit
  // stream.
  template<int kSize>
  void BuildAndStoreEntropyCodes(
      const std::vector<Histogram<kSize> >& histograms,
      int* storage_ix, uint8_t* storage) {
    // depths_/bits_ hold one code table per histogram, laid out
    // contiguously in alphabet_size_ chunks.
    depths_.resize(histograms.size() * alphabet_size_);
    bits_.resize(histograms.size() * alphabet_size_);
    for (int i = 0; i < histograms.size(); ++i) {
      int ix = i * alphabet_size_;
      BuildAndStoreHuffmanTree(&histograms[i].data_[0], alphabet_size_,
                               &depths_[ix], &bits_[ix],
                               storage_ix, storage);
    }
  }

  // Stores the next symbol with the entropy code of the current block type.
  // Updates the block type and block length at block boundaries.
  void StoreSymbol(int symbol, int* storage_ix, uint8_t* storage) {
    if (block_len_ == 0) {
      // Block boundary: advance to the next block and emit the switch.
      ++block_ix_;
      block_len_ = block_lengths_[block_ix_];
      entropy_ix_ = block_types_[block_ix_] * alphabet_size_;
      StoreBlockSwitch(block_split_code_, block_ix_, storage_ix, storage);
    }
    --block_len_;
    int ix = entropy_ix_ + symbol;
    WriteBits(depths_[ix], bits_[ix], storage_ix, storage);
  }

  // Stores the next symbol with the entropy code of the current block type and
  // context value.
  // Updates the block type and block length at block boundaries.
  template<int kContextBits>
  void StoreSymbolWithContext(int symbol, int context,
                              const std::vector<int>& context_map,
                              int* storage_ix, uint8_t* storage) {
    if (block_len_ == 0) {
      ++block_ix_;
      block_len_ = block_lengths_[block_ix_];
      // Here entropy_ix_ indexes the context map row of the block type.
      entropy_ix_ = block_types_[block_ix_] << kContextBits;
      StoreBlockSwitch(block_split_code_, block_ix_, storage_ix, storage);
    }
    --block_len_;
    // The context map selects the histogram for (block type, context).
    int histo_ix = context_map[entropy_ix_ + context];
    int ix = histo_ix * alphabet_size_ + symbol;
    WriteBits(depths_[ix], bits_[ix], storage_ix, storage);
  }

 private:
  const int alphabet_size_;
  const int num_block_types_;
  const std::vector<int>& block_types_;    // Type of each block, in order.
  const std::vector<int>& block_lengths_;  // Length of each block, in order.
  BlockSplitCode block_split_code_;
  int block_ix_;    // Index of the current block.
  int block_len_;   // Symbols remaining in the current block.
  int entropy_ix_;  // Offset of the current code table / context-map row.
  std::vector<uint8_t> depths_;
  std::vector<uint16_t> bits_;
};
// Advances *storage_ix to the next byte boundary and zeroes the byte there
// so later writers can OR bits into it.
void JumpToByteBoundary(int* storage_ix, uint8_t* storage) {
  const int padded_ix = (*storage_ix + 7) & ~7;
  *storage_ix = padded_ix;
  storage[padded_ix >> 3] = 0;
}
// Stores a complete compressed meta-block: header, block split codes,
// distance parameters, context modes, context maps, entropy codes, and then
// every command with its literals, copy length and distance. Returns false
// only when the header cannot be encoded.
bool StoreMetaBlock(const uint8_t* input,
                    size_t start_pos,
                    size_t length,
                    size_t mask,
                    uint8_t prev_byte,
                    uint8_t prev_byte2,
                    bool is_last,
                    int num_direct_distance_codes,
                    int distance_postfix_bits,
                    int literal_context_mode,
                    const brotli::Command *commands,
                    size_t n_commands,
                    const MetaBlockSplit& mb,
                    int *storage_ix,
                    uint8_t *storage) {
  if (!StoreCompressedMetaBlockHeader(is_last, length, storage_ix, storage)) {
    return false;
  }
  if (length == 0) {
    // Only the last meta-block can be empty, so jump to next byte.
    JumpToByteBoundary(storage_ix, storage);
    return true;
  }
  int num_distance_codes =
      kNumDistanceShortCodes + num_direct_distance_codes +
      (48 << distance_postfix_bits);
  // One encoder per block category, driven by its own block split.
  BlockEncoder literal_enc(256,
                           mb.literal_split.num_types,
                           mb.literal_split.types,
                           mb.literal_split.lengths);
  BlockEncoder command_enc(kNumCommandPrefixes,
                           mb.command_split.num_types,
                           mb.command_split.types,
                           mb.command_split.lengths);
  BlockEncoder distance_enc(num_distance_codes,
                            mb.distance_split.num_types,
                            mb.distance_split.types,
                            mb.distance_split.lengths);
  literal_enc.BuildAndStoreBlockSwitchEntropyCodes(storage_ix, storage);
  command_enc.BuildAndStoreBlockSwitchEntropyCodes(storage_ix, storage);
  distance_enc.BuildAndStoreBlockSwitchEntropyCodes(storage_ix, storage);
  // Distance coding parameters: NPOSTFIX and NDIRECT.
  WriteBits(2, distance_postfix_bits, storage_ix, storage);
  WriteBits(4, num_direct_distance_codes >> distance_postfix_bits,
            storage_ix, storage);
  // One context mode per literal block type.
  for (int i = 0; i < mb.literal_split.num_types; ++i) {
    WriteBits(2, literal_context_mode, storage_ix, storage);
  }
  // An empty context map means the trivial block-type-to-histogram mapping.
  if (mb.literal_context_map.empty()) {
    StoreTrivialContextMap(mb.literal_histograms.size(), kLiteralContextBits,
                           storage_ix, storage);
  } else {
    EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(),
                     storage_ix, storage);
  }
  if (mb.distance_context_map.empty()) {
    StoreTrivialContextMap(mb.distance_histograms.size(), kDistanceContextBits,
                           storage_ix, storage);
  } else {
    EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(),
                     storage_ix, storage);
  }
  literal_enc.BuildAndStoreEntropyCodes(mb.literal_histograms,
                                        storage_ix, storage);
  command_enc.BuildAndStoreEntropyCodes(mb.command_histograms,
                                        storage_ix, storage);
  distance_enc.BuildAndStoreEntropyCodes(mb.distance_histograms,
                                         storage_ix, storage);
  size_t pos = start_pos;
  for (int i = 0; i < n_commands; ++i) {
    const Command cmd = commands[i];
    int cmd_code = cmd.cmd_prefix_;
    // cmd_extra_ packs the extra-bit count in the top 16 bits and the
    // extra-bit values in the low 48 bits.
    int lennumextra = cmd.cmd_extra_ >> 48;
    uint64_t lenextra = cmd.cmd_extra_ & 0xffffffffffffULL;
    command_enc.StoreSymbol(cmd_code, storage_ix, storage);
    WriteBits(lennumextra, lenextra, storage_ix, storage);
    if (mb.literal_context_map.empty()) {
      for (int j = 0; j < cmd.insert_len_; j++) {
        literal_enc.StoreSymbol(input[pos & mask], storage_ix, storage);
        ++pos;
      }
    } else {
      for (int j = 0; j < cmd.insert_len_; ++j) {
        int context = Context(prev_byte, prev_byte2,
                              literal_context_mode);
        int literal = input[pos & mask];
        literal_enc.StoreSymbolWithContext<kLiteralContextBits>(
            literal, context, mb.literal_context_map, storage_ix, storage);
        prev_byte2 = prev_byte;
        prev_byte = literal;
        ++pos;
      }
    }
    pos += cmd.copy_len_;
    if (cmd.copy_len_ > 0) {
      // Track the last two bytes for the next literal context.
      prev_byte2 = input[(pos - 2) & mask];
      prev_byte = input[(pos - 1) & mask];
      // Only commands with prefix >= 128 carry an explicit distance.
      if (cmd.cmd_prefix_ >= 128) {
        int dist_code = cmd.dist_prefix_;
        // dist_extra_: extra-bit count in the top 8 bits, values in the
        // low 24 bits.
        int distnumextra = cmd.dist_extra_ >> 24;
        int distextra = cmd.dist_extra_ & 0xffffff;
        if (mb.distance_context_map.empty()) {
          distance_enc.StoreSymbol(dist_code, storage_ix, storage);
        } else {
          int context = cmd.DistanceContext();
          distance_enc.StoreSymbolWithContext<kDistanceContextBits>(
              dist_code, context, mb.distance_context_map, storage_ix, storage);
        }
        brotli::WriteBits(distnumextra, distextra, storage_ix, storage);
      }
    }
  }
  if (is_last) {
    JumpToByteBoundary(storage_ix, storage);
  }
  return true;
}
// This is for storing uncompressed blocks (simple raw storage of
// bytes-as-bytes). `position`/`mask` address the data inside the ring
// buffer `input`; the copy is split in two when it wraps around.
bool StoreUncompressedMetaBlock(bool final_block,
                                const uint8_t * __restrict input,
                                size_t position, size_t mask,
                                size_t len,
                                int * __restrict storage_ix,
                                uint8_t * __restrict storage) {
  if (!brotli::StoreUncompressedMetaBlockHeader(len, storage_ix, storage)) {
    return false;
  }
  // The raw bytes must start on a byte boundary.
  JumpToByteBoundary(storage_ix, storage);
  size_t masked_pos = position & mask;
  if (masked_pos + len > mask + 1) {
    // The range wraps around the ring buffer: copy the tail first.
    size_t len1 = mask + 1 - masked_pos;
    memcpy(&storage[*storage_ix >> 3], &input[masked_pos], len1);
    *storage_ix += len1 << 3;
    len -= len1;
    masked_pos = 0;
  }
  memcpy(&storage[*storage_ix >> 3], &input[masked_pos], len);
  *storage_ix += len << 3;
  // We need to clear the next 4 bytes to continue to be
  // compatible with WriteBits.
  brotli::WriteBitsPrepareStorage(*storage_ix, storage);
  // Since the uncompressed block itself may not be the final block, add an
  // empty one after this.
  if (final_block) {
    brotli::WriteBits(1, 1, storage_ix, storage);  // islast
    brotli::WriteBits(1, 1, storage_ix, storage);  // isempty
    JumpToByteBoundary(storage_ix, storage);
  }
  return true;
}
// Stores an empty metadata meta-block and pads to a byte boundary, so that
// all previously written bits are flushed into whole output bytes.
void StoreSyncMetaBlock(int * __restrict storage_ix,
                        uint8_t * __restrict storage) {
  // Empty metadata meta-block bit pattern:
  //   1 bit:  is_last (0)
  //   2 bits: num nibbles (3)
  //   1 bit:  reserved (0)
  //   2 bits: metadata length bytes (0)
  WriteBits(6, 6, storage_ix, storage);
  JumpToByteBoundary(storage_ix, storage);
}
} // namespace brotli

View File

@ -0,0 +1,137 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Functions to convert brotli-related data structures into the
// brotli bit stream. The functions here operate under
// assumption that there is enough space in the storage, i.e., there are
// no out-of-range checks anywhere.
//
// These functions do bit addressing into a byte array. The byte array
// is called "storage" and the index to the bit is called storage_ix
// in function arguments.
#ifndef BROTLI_ENC_BROTLI_BIT_STREAM_H_
#define BROTLI_ENC_BROTLI_BIT_STREAM_H_
#include <stddef.h>
#include <stdint.h>
#include <vector>
#include "./metablock.h"
namespace brotli {
// All Store functions here will use a storage_ix, which is always the bit
// position for the current storage.
// Stores a number between 0 and 255.
void StoreVarLenUint8(int n, int* storage_ix, uint8_t* storage);
// Stores the compressed meta-block header.
bool StoreCompressedMetaBlockHeader(bool final_block,
size_t length,
int* storage_ix,
uint8_t* storage);
// Stores the uncompressed meta-block header.
bool StoreUncompressedMetaBlockHeader(size_t length,
int* storage_ix,
uint8_t* storage);
// Stores a context map where the histogram type is always the block type.
void StoreTrivialContextMap(int num_types,
int context_bits,
int* storage_ix,
uint8_t* storage);
void StoreHuffmanTreeOfHuffmanTreeToBitMask(
const int num_codes,
const uint8_t *code_length_bitdepth,
int *storage_ix,
uint8_t *storage);
// Builds a Huffman tree from histogram[0:length] into depth[0:length] and
// bits[0:length] and stores the encoded tree to the bit stream.
void BuildAndStoreHuffmanTree(const int *histogram,
const int length,
uint8_t* depth,
uint16_t* bits,
int* storage_ix,
uint8_t* storage);
// Encodes the given context map to the bit stream. The number of different
// histogram ids is given by num_clusters.
void EncodeContextMap(const std::vector<int>& context_map,
int num_clusters,
int* storage_ix, uint8_t* storage);
// Data structure that stores everything that is needed to encode each block
// switch command.
struct BlockSplitCode {
  std::vector<int> type_code;      // Per block: the type symbol to store.
  std::vector<int> length_prefix;  // Per block: length prefix code.
  std::vector<int> length_nextra;  // Per block: number of length extra bits.
  std::vector<int> length_extra;   // Per block: length extra-bit values.
  std::vector<uint8_t> type_depths;    // Entropy code over type symbols.
  std::vector<uint16_t> type_bits;
  std::vector<uint8_t> length_depths;  // Entropy code over length prefixes.
  std::vector<uint16_t> length_bits;
};
// Builds a BlockSplitCode data structure from the block split given by the
// vector of block types and block lengths and stores it to the bit stream.
void BuildAndStoreBlockSplitCode(const std::vector<int>& types,
const std::vector<int>& lengths,
const int num_types,
BlockSplitCode* code,
int* storage_ix,
uint8_t* storage);
// Stores the block switch command with index block_ix to the bit stream.
void StoreBlockSwitch(const BlockSplitCode& code,
const int block_ix,
int* storage_ix,
uint8_t* storage);
bool StoreMetaBlock(const uint8_t* input,
size_t start_pos,
size_t length,
size_t mask,
uint8_t prev_byte,
uint8_t prev_byte2,
bool final_block,
int num_direct_distance_codes,
int distance_postfix_bits,
int literal_context_mode,
const brotli::Command *commands,
size_t n_commands,
const MetaBlockSplit& mb,
int *storage_ix,
uint8_t *storage);
// This is for storing uncompressed blocks (simple raw storage of
// bytes-as-bytes).
bool StoreUncompressedMetaBlock(bool final_block,
const uint8_t* input,
size_t position, size_t mask,
size_t len,
int* storage_ix,
uint8_t* storage);
// Stores an empty metadata meta-block and syncs to a byte boundary.
void StoreSyncMetaBlock(int* storage_ix, uint8_t* storage);
} // namespace brotli
#endif // BROTLI_ENC_BROTLI_BIT_STREAM_H_

View File

@ -0,0 +1,305 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Functions for clustering similar histograms together.
#ifndef BROTLI_ENC_CLUSTER_H_
#define BROTLI_ENC_CLUSTER_H_
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <algorithm>
#include <complex>
#include <map>
#include <set>
#include <utility>
#include <vector>
#include "./bit_cost.h"
#include "./entropy_encode.h"
#include "./fast_log.h"
#include "./histogram.h"
namespace brotli {
// A candidate merge of two histogram clusters, kept in a max-heap ordered
// so that the cheapest merge (smallest cost_diff) sits on top.
struct HistogramPair {
  int idx1;           // Smaller cluster index of the pair.
  int idx2;           // Larger cluster index of the pair.
  bool valid;         // Cleared when either cluster has been merged away.
  double cost_combo;  // Bit cost of the combined histogram.
  double cost_diff;   // Cost change caused by merging (lower is better).
};

// Heap comparator: orders pairs so the smallest cost_diff is at the top;
// ties are broken in favor of the pair whose indices are farther apart.
struct HistogramPairComparator {
  bool operator()(const HistogramPair& a, const HistogramPair& b) const {
    if (a.cost_diff != b.cost_diff) {
      return a.cost_diff > b.cost_diff;
    }
    const int span_a = abs(a.idx1 - a.idx2);
    const int span_b = abs(b.idx1 - b.idx2);
    return span_a > span_b;
  }
};
// Returns entropy reduction of the context map when we combine two clusters.
inline double ClusterCostDiff(int size_a, int size_b) {
  const int size_c = size_a + size_b;
  const double separate =
      size_a * FastLog2(size_a) + size_b * FastLog2(size_b);
  return separate - size_c * FastLog2(size_c);
}
// Computes the bit cost reduction by combining out[idx1] and out[idx2] and if
// it is below a threshold, stores the pair (idx1, idx2) in the *pairs heap.
template<typename HistogramType>
void CompareAndPushToHeap(const HistogramType* out,
                          const int* cluster_size,
                          int idx1, int idx2,
                          std::vector<HistogramPair>* pairs) {
  if (idx1 == idx2) {
    return;
  }
  // Normalize the pair so that idx1 < idx2.
  if (idx2 < idx1) {
    int t = idx2;
    idx2 = idx1;
    idx1 = t;
  }
  bool store_pair = false;
  HistogramPair p;
  p.idx1 = idx1;
  p.idx2 = idx2;
  p.valid = true;
  // cost_diff starts as the context-map entropy change minus the cost of
  // the separate histograms; the combined cost is added at the end, so a
  // negative final value means merging saves bits.
  p.cost_diff = 0.5 * ClusterCostDiff(cluster_size[idx1], cluster_size[idx2]);
  p.cost_diff -= out[idx1].bit_cost_;
  p.cost_diff -= out[idx2].bit_cost_;
  if (out[idx1].total_count_ == 0) {
    // Merging with an empty histogram is free: reuse the other's cost.
    p.cost_combo = out[idx2].bit_cost_;
    store_pair = true;
  } else if (out[idx2].total_count_ == 0) {
    p.cost_combo = out[idx1].bit_cost_;
    store_pair = true;
  } else {
    // Only keep the pair when the (expensive) combined population cost can
    // beat the pair currently on top of the heap.
    double threshold = pairs->empty() ? 1e99 :
        std::max(0.0, (*pairs)[0].cost_diff);
    HistogramType combo = out[idx1];
    combo.AddHistogram(out[idx2]);
    double cost_combo = PopulationCost(combo);
    if (cost_combo < threshold - p.cost_diff) {
      p.cost_combo = cost_combo;
      store_pair = true;
    }
  }
  if (store_pair) {
    p.cost_diff += p.cost_combo;
    pairs->push_back(p);
    std::push_heap(pairs->begin(), pairs->end(), HistogramPairComparator());
  }
}
// Greedily merges similar histograms in out[] as long as a merge reduces the
// total bit cost, then keeps merging unconditionally until at most
// max_clusters clusters remain. cluster_size tracks how many symbols map to
// each cluster; symbols[i] is remapped in place to the surviving cluster ids.
template<typename HistogramType>
void HistogramCombine(HistogramType* out,
                      int* cluster_size,
                      int* symbols,
                      int symbols_size,
                      int max_clusters) {
  double cost_diff_threshold = 0.0;
  int min_cluster_size = 1;
  // Collect the distinct cluster ids currently in use.
  std::set<int> all_symbols;
  std::vector<int> clusters;
  for (int i = 0; i < symbols_size; ++i) {
    if (all_symbols.find(symbols[i]) == all_symbols.end()) {
      all_symbols.insert(symbols[i]);
      clusters.push_back(symbols[i]);
    }
  }
  // We maintain a heap of histogram pairs, ordered by the bit cost reduction.
  std::vector<HistogramPair> pairs;
  for (int idx1 = 0; idx1 < clusters.size(); ++idx1) {
    for (int idx2 = idx1 + 1; idx2 < clusters.size(); ++idx2) {
      CompareAndPushToHeap(out, cluster_size, clusters[idx1], clusters[idx2],
                           &pairs);
    }
  }
  while (clusters.size() > min_cluster_size) {
    if (pairs[0].cost_diff >= cost_diff_threshold) {
      // No pair saves bits any more: switch to unconditional merging until
      // the max_clusters limit is reached.
      cost_diff_threshold = 1e99;
      min_cluster_size = max_clusters;
      continue;
    }
    // Take the best pair from the top of heap.
    int best_idx1 = pairs[0].idx1;
    int best_idx2 = pairs[0].idx2;
    out[best_idx1].AddHistogram(out[best_idx2]);
    out[best_idx1].bit_cost_ = pairs[0].cost_combo;
    cluster_size[best_idx1] += cluster_size[best_idx2];
    // Redirect all symbols of the absorbed cluster to the surviving one.
    for (int i = 0; i < symbols_size; ++i) {
      if (symbols[i] == best_idx2) {
        symbols[i] = best_idx1;
      }
    }
    // Drop best_idx2 from the cluster list by shifting entries at or after
    // it down one slot.
    for (int i = 0; i + 1 < clusters.size(); ++i) {
      if (clusters[i] >= best_idx2) {
        clusters[i] = clusters[i + 1];
      }
    }
    clusters.pop_back();
    // Invalidate pairs intersecting the just combined best pair.
    for (int i = 0; i < pairs.size(); ++i) {
      HistogramPair& p = pairs[i];
      if (p.idx1 == best_idx1 || p.idx2 == best_idx1 ||
          p.idx1 == best_idx2 || p.idx2 == best_idx2) {
        p.valid = false;
      }
    }
    // Pop invalid pairs from the top of the heap.
    while (!pairs.empty() && !pairs[0].valid) {
      std::pop_heap(pairs.begin(), pairs.end(), HistogramPairComparator());
      pairs.pop_back();
    }
    // Push new pairs formed with the combined histogram to the heap.
    for (int i = 0; i < clusters.size(); ++i) {
      CompareAndPushToHeap(out, cluster_size, best_idx1, clusters[i], &pairs);
    }
  }
}
// -----------------------------------------------------------------------------
// Histogram refinement

// Cost (in bits) of moving `histogram` into the cluster represented by
// `candidate`: the population cost of the merged histogram minus the bits
// the candidate already pays for itself.
template<typename HistogramType>
double HistogramBitCostDistance(const HistogramType& histogram,
                                const HistogramType& candidate) {
  // An empty histogram can be merged anywhere at no cost.
  if (histogram.total_count_ != 0) {
    HistogramType merged = histogram;
    merged.AddHistogram(candidate);
    return PopulationCost(merged) - candidate.bit_cost_;
  }
  return 0.0;
}
// Find the best 'out' histogram for each of the 'in' histograms.
// Note: we assume that out[]->bit_cost_ is already up-to-date.
template<typename HistogramType>
void HistogramRemap(const HistogramType* in, int in_size,
                    HistogramType* out, int* symbols) {
  // Candidate set: every output histogram currently referenced.
  std::set<int> candidates;
  for (int i = 0; i < in_size; ++i) {
    candidates.insert(symbols[i]);
  }
  for (int i = 0; i < in_size; ++i) {
    // Seed the search with the previous element's choice (our own for i==0),
    // which wins ties in favor of longer runs of equal symbols.
    int winner = (i == 0) ? symbols[0] : symbols[i - 1];
    double winner_bits = HistogramBitCostDistance(in[i], out[winner]);
    for (std::set<int>::const_iterator it = candidates.begin();
         it != candidates.end(); ++it) {
      double bits = HistogramBitCostDistance(in[i], out[*it]);
      if (bits < winner_bits) {
        winner_bits = bits;
        winner = *it;
      }
    }
    symbols[i] = winner;
  }
  // Rebuild each referenced output histogram from the inputs now mapped
  // to it, so the outputs exactly reflect the final assignment.
  for (std::set<int>::const_iterator it = candidates.begin();
       it != candidates.end(); ++it) {
    out[*it].Clear();
  }
  for (int i = 0; i < in_size; ++i) {
    out[symbols[i]].AddHistogram(in[i]);
  }
}
// Reorder histograms in *out so that the new symbols in *symbols come in
// increasing order.
template<typename HistogramType>
void HistogramReindex(std::vector<HistogramType>* out,
std::vector<int>* symbols) {
std::vector<HistogramType> tmp(*out);
std::map<int, int> new_index;
int next_index = 0;
for (int i = 0; i < symbols->size(); ++i) {
if (new_index.find((*symbols)[i]) == new_index.end()) {
new_index[(*symbols)[i]] = next_index;
(*out)[next_index] = tmp[(*symbols)[i]];
++next_index;
}
}
out->resize(next_index);
for (int i = 0; i < symbols->size(); ++i) {
(*symbols)[i] = new_index[(*symbols)[i]];
}
}
// Trivial clustering: merges all context histograms of each block into a
// single histogram per block, and maps every input histogram of block i to
// output i. max_histograms is accepted for signature parity but unused.
template<typename HistogramType>
void ClusterHistogramsTrivial(const std::vector<HistogramType>& in,
                              int num_contexts, int num_blocks,
                              int max_histograms,
                              std::vector<HistogramType>* out,
                              std::vector<int>* histogram_symbols) {
  out->resize(num_blocks);
  for (int block = 0; block < num_blocks; ++block) {
    HistogramType& merged = (*out)[block];
    merged.Clear();
    for (int context = 0; context < num_contexts; ++context) {
      merged.AddHistogram(in[block * num_contexts + context]);
      histogram_symbols->push_back(block);
    }
  }
}
// Clusters similar histograms in 'in' together, the selected histograms are
// placed in 'out', and for each index in 'in', *histogram_symbols will
// indicate which of the 'out' histograms is the best approximation.
// 'in' holds num_blocks * num_contexts histograms, context-major within
// each block.
template<typename HistogramType>
void ClusterHistograms(const std::vector<HistogramType>& in,
                       int num_contexts, int num_blocks,
                       int max_histograms,
                       std::vector<HistogramType>* out,
                       std::vector<int>* histogram_symbols) {
  const int in_size = num_contexts * num_blocks;
  // Start from one singleton cluster per input histogram.
  std::vector<int> cluster_size(in_size, 1);
  out->resize(in_size);
  histogram_symbols->resize(in_size);
  for (int i = 0; i < in_size; ++i) {
    (*out)[i] = in[i];
    (*out)[i].bit_cost_ = PopulationCost(in[i]);
    (*histogram_symbols)[i] = i;
  }
  // Collapse similar histograms within a block type.
  if (num_contexts > 1) {
    for (int i = 0; i < num_blocks; ++i) {
      HistogramCombine(&(*out)[0], &cluster_size[0],
                       &(*histogram_symbols)[i * num_contexts], num_contexts,
                       max_histograms);
    }
  }
  // Collapse similar histograms.
  HistogramCombine(&(*out)[0], &cluster_size[0],
                   &(*histogram_symbols)[0], in_size,
                   max_histograms);
  // Find the optimal map from original histograms to the final ones.
  HistogramRemap(&in[0], in_size, &(*out)[0], &(*histogram_symbols)[0]);
  // Convert the context map to a canonical form.
  HistogramReindex(out, histogram_symbols);
}
} // namespace brotli
#endif // BROTLI_ENC_CLUSTER_H_

View File

@ -0,0 +1,150 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// This class models a sequence of literals and a backward reference copy.
#ifndef BROTLI_ENC_COMMAND_H_
#define BROTLI_ENC_COMMAND_H_
#include <stdint.h>
#include "./fast_log.h"
namespace brotli {
// Splits a 1-based distance code into its prefix code and packed extra
// bits. For *extra, the number of extra bits is stored in the top byte and
// their value in the low 24 bits.
static inline void GetDistCode(int distance_code,
                               uint16_t* code, uint32_t* extra) {
  int dcode = distance_code - 1;
  if (dcode < 16) {
    // Short codes (distance cache and smallest distances): no extra bits.
    *code = dcode;
    *extra = 0;
    return;
  }
  dcode -= 12;
  const int nbits = Log2FloorNonZero(dcode) - 1;
  const int msb_pair = dcode >> nbits;  // top two bits of dcode: 2 or 3
  *code = 12 + 2 * nbits + msb_pair;
  *extra = (nbits << 24) | (dcode - (msb_pair << nbits));
}
// Base values and extra-bit counts for the 24 insert-length and 24
// copy-length codes. These are read-only lookup tables (used by
// GetLengthCode), so declare them const — previously they were mutable
// globals, which invited accidental writes and kept them out of read-only
// storage.
static const int insbase[] = { 0, 1, 2, 3, 4, 5, 6, 8, 10, 14, 18, 26, 34, 50, 66,
                               98, 130, 194, 322, 578, 1090, 2114, 6210, 22594 };
static const int insextra[] = { 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5,
                                5, 6, 7, 8, 9, 10, 12, 14, 24 };
static const int copybase[] = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 18, 22, 30, 38,
                                54, 70, 102, 134, 198, 326, 582, 1094, 2118 };
static const int copyextra[] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4,
                                 4, 5, 5, 6, 7, 8, 9, 10, 24 };
// Maps an insert length to its insert-length code (0..23).
static inline int GetInsertLengthCode(int insertlen) {
  if (insertlen < 6) {
    return insertlen;  // codes 0-5: exact lengths, no extra bits
  }
  if (insertlen < 130) {
    const int tail = insertlen - 2;
    const int nbits = Log2FloorNonZero(tail) - 1;
    return (nbits << 1) + (tail >> nbits) + 2;
  }
  if (insertlen < 2114) {
    return Log2FloorNonZero(insertlen - 66) + 10;
  }
  // The last three codes cover wide fixed ranges.
  if (insertlen < 6210) {
    return 21;
  }
  if (insertlen < 22594) {
    return 22;
  }
  return 23;
}
// Maps a copy length (>= 2) to its copy-length code (0..23).
static inline int GetCopyLengthCode(int copylen) {
  if (copylen < 10) {
    return copylen - 2;  // codes 0-7: exact lengths, no extra bits
  }
  if (copylen < 134) {
    const int tail = copylen - 6;
    const int nbits = Log2FloorNonZero(tail) - 1;
    return (nbits << 1) + (tail >> nbits) + 4;
  }
  if (copylen < 2118) {
    return Log2FloorNonZero(copylen - 70) + 12;
  }
  return 23;
}
// Combines an insert-length code and a copy-length code (plus whether the
// distance code is the implicit "last distance", distancecode == 0) into
// one insert-and-copy command prefix code.
static inline int CombineLengthCodes(
    int inscode, int copycode, int distancecode) {
  // Low six bits: insert code in bits 3-5, copy code in bits 0-2.
  const int packed = (copycode & 0x7u) | ((inscode & 0x7u) << 3);
  const bool implicit_distance =
      distancecode == 0 && inscode < 8 && copycode < 16;
  if (implicit_distance) {
    return (copycode < 8) ? packed : (packed | 64);
  }
  // "To convert an insert-and-copy length code to an insert length code and
  // a copy length code, the following table can be used"
  static const int cells[9] = { 2, 3, 6, 4, 5, 8, 7, 9, 10 };
  return (cells[(copycode >> 3) + 3 * (inscode >> 3)] << 6) | packed;
}
// Computes the combined command prefix code for an (insert, copy) length
// pair and packs the extra bits into *extra: total extra-bit count in the
// top 16 bits, copy extra bits shifted above the insert extra bits below.
static inline void GetLengthCode(int insertlen, int copylen, int distancecode,
                                 uint16_t* code, uint64_t* extra) {
  int inscode = GetInsertLengthCode(insertlen);
  int copycode = GetCopyLengthCode(copylen);
  uint64_t insnumextra = insextra[inscode];
  uint64_t numextra = insnumextra + copyextra[copycode];
  // Extra-bit values are the offsets from each code's base length.
  uint64_t insextraval = insertlen - insbase[inscode];
  uint64_t copyextraval = copylen - copybase[copycode];
  *code = CombineLengthCodes(inscode, copycode, distancecode);
  *extra = (numextra << 48) | (copyextraval << insnumextra) | insextraval;
}
// Models one compression command: insert_len_ literal bytes followed by a
// backward copy of copy_len_ bytes (copy_len_ == 0 for the trailing
// insert-only command).
struct Command {
  Command() {}
  // copylen_code is the length used for the copy-length code and may
  // differ from the actual copylen; distance_code is the 1-based code
  // consumed by GetDistCode.
  Command(int insertlen, int copylen, int copylen_code, int distance_code)
      : insert_len_(insertlen), copy_len_(copylen) {
    GetDistCode(distance_code, &dist_prefix_, &dist_extra_);
    GetLengthCode(insertlen, copylen_code, dist_prefix_,
                  &cmd_prefix_, &cmd_extra_);
  }
  // Insert-only command: copy_len_ is 0; a fixed copy length code (4) and
  // distance prefix (16) are used to form a well-defined combined code.
  Command(int insertlen)
      : insert_len_(insertlen), copy_len_(0), dist_prefix_(16), dist_extra_(0) {
    GetLengthCode(insertlen, 4, dist_prefix_, &cmd_prefix_, &cmd_extra_);
  }
  // Inverse of GetDistCode: reconstructs the 1-based distance code from
  // the stored prefix and packed extra bits.
  int DistanceCode() const {
    if (dist_prefix_ < 16) {
      return dist_prefix_ + 1;
    }
    int nbits = dist_extra_ >> 24;      // extra-bit count (top byte)
    int extra = dist_extra_ & 0xffffff; // extra-bit value
    int prefix = dist_prefix_ - 12 - 2 * nbits;
    return (prefix << nbits) + extra + 13;
  }
  // Context id (0-3) for distance coding, derived from the combined
  // command prefix code.
  int DistanceContext() const {
    int r = cmd_prefix_ >> 6;
    int c = cmd_prefix_ & 7;
    if ((r == 0 || r == 2 || r == 4 || r == 7) && (c <= 2)) {
      return c;
    }
    return 3;
  }
  int insert_len_;        // number of literals before the copy
  int copy_len_;          // copy length; 0 means insert-only
  uint16_t cmd_prefix_;   // combined insert-and-copy prefix code
  uint16_t dist_prefix_;  // distance prefix code
  uint64_t cmd_extra_;    // packed length extra bits (see GetLengthCode)
  uint32_t dist_extra_;   // packed distance extra bits (see GetDistCode)
};
} // namespace brotli
#endif // BROTLI_ENC_COMMAND_H_

View File

@ -0,0 +1,185 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Functions to map previous bytes into a context id.
#ifndef BROTLI_ENC_CONTEXT_H_
#define BROTLI_ENC_CONTEXT_H_
#include <stdint.h>
namespace brotli {
// Second-order context lookup table for UTF8 byte streams.
//
// If p1 and p2 are the previous two bytes, we calculate the context as
//
// context = kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256].
//
// If the previous two bytes are ASCII characters (i.e. < 128), this will be
// equivalent to
//
// context = 4 * context1(p1) + context2(p2),
//
// where context1 is based on the previous byte in the following way:
//
// 0 : non-ASCII control
// 1 : \t, \n, \r
// 2 : space
// 3 : other punctuation
// 4 : " '
// 5 : %
// 6 : ( < [ {
// 7 : ) > ] }
// 8 : , ; :
// 9 : .
// 10 : =
// 11 : number
// 12 : upper-case vowel
// 13 : upper-case consonant
// 14 : lower-case vowel
// 15 : lower-case consonant
//
// and context2 is based on the second last byte:
//
// 0 : control, space
// 1 : punctuation
// 2 : upper-case letter, number
// 3 : lower-case letter
//
// If the last byte is ASCII, and the second last byte is not (in a valid UTF8
// stream it will be a continuation byte, value between 128 and 191), the
// context is the same as if the second last byte was an ASCII control or space.
//
// If the last byte is a UTF8 lead byte (value >= 192), then the next byte will
// be a continuation byte and the context id is 2 or 3 depending on the LSB of
// the last byte and to a lesser extent on the second last byte if it is ASCII.
//
// If the last byte is a UTF8 continuation byte, the second last byte can be:
// - continuation byte: the next byte is probably ASCII or lead byte (assuming
// 4-byte UTF8 characters are rare) and the context id is 0 or 1.
// - lead byte (192 - 207): next byte is ASCII or lead byte, context is 0 or 1
// - lead byte (208 - 255): next byte is continuation byte, context is 2 or 3
//
// The possible value combinations of the previous two bytes, the range of
// context ids and the type of the next byte is summarized in the table below:
//
// |--------\-----------------------------------------------------------------|
// | \ Last byte |
// | Second \---------------------------------------------------------------|
// | last byte \ ASCII | cont. byte | lead byte |
// | \ (0-127) | (128-191) | (192-) |
// |=============|===================|=====================|==================|
// | ASCII | next: ASCII/lead | not valid | next: cont. |
// | (0-127) | context: 4 - 63 | | context: 2 - 3 |
// |-------------|-------------------|---------------------|------------------|
// | cont. byte | next: ASCII/lead | next: ASCII/lead | next: cont. |
// | (128-191) | context: 4 - 63 | context: 0 - 1 | context: 2 - 3 |
// |-------------|-------------------|---------------------|------------------|
// | lead byte | not valid | next: ASCII/lead | not valid |
// | (192-207) | | context: 0 - 1 | |
// |-------------|-------------------|---------------------|------------------|
// | lead byte | not valid | next: cont. | not valid |
// | (208-) | | context: 2 - 3 | |
// |-------------|-------------------|---------------------|------------------|
// Entries [0, 256) give the context contribution of the last byte
// (pre-multiplied by 4 in the ASCII range); entries [256, 512) give the
// contribution of the second last byte. The two halves are OR-ed together.
static const uint8_t kUTF8ContextLookup[512] = {
  // Last byte.
  //
  // ASCII range.
  0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
  8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
  44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
  12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
  52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
  12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
  60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0,
  // UTF8 continuation byte range.
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
  // UTF8 lead byte range.
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3,
  // Second last byte.
  //
  // ASCII range.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
  1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
  1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0,
  // UTF8 continuation byte range.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // UTF8 lead byte range.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
};
// Context lookup table for small signed integers.
// Indexed by a byte interpreted as two's complement: 0 maps to 0, small
// positive values to low contexts, values near 255 (small negatives) to
// high contexts, with 255 itself mapping to 7.
static const uint8_t kSigned3BitContextLookup[] = {
  0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7,
};
// Literal context modes: LSB6/MSB6 take six bits of the previous byte;
// UTF8 and SIGNED use the lookup tables above on the previous two bytes.
enum ContextType {
  CONTEXT_LSB6 = 0,
  CONTEXT_MSB6 = 1,
  CONTEXT_UTF8 = 2,
  CONTEXT_SIGNED = 3
};
// Computes the literal context id from the previous two bytes (p1 is the
// immediately preceding byte) under the given context mode. Unknown modes
// yield context 0.
static inline uint8_t Context(uint8_t p1, uint8_t p2, int mode) {
  if (mode == CONTEXT_LSB6) {
    return p1 & 0x3f;
  }
  if (mode == CONTEXT_MSB6) {
    return p1 >> 2;
  }
  if (mode == CONTEXT_UTF8) {
    return kUTF8ContextLookup[p1] | kUTF8ContextLookup[p2 + 256];
  }
  if (mode == CONTEXT_SIGNED) {
    return (kSigned3BitContextLookup[p1] << 3) + kSigned3BitContextLookup[p2];
  }
  return 0;
}
} // namespace brotli
#endif // BROTLI_ENC_CONTEXT_H_

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,589 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Implementation of Brotli compressor.
#include "./encode.h"
#include <algorithm>
#include <limits>
#include "./backward_references.h"
#include "./bit_cost.h"
#include "./block_splitter.h"
#include "./brotli_bit_stream.h"
#include "./cluster.h"
#include "./context.h"
#include "./metablock.h"
#include "./transform.h"
#include "./entropy_encode.h"
#include "./fast_log.h"
#include "./hash.h"
#include "./histogram.h"
#include "./literal_cost.h"
#include "./prefix.h"
#include "./write_bits.h"
namespace brotli {
static const double kMinUTF8Ratio = 0.75;
// Decodes one symbol from a UTF8 byte stream. On success writes the code
// point to *symbol and returns the number of bytes consumed (1-4). If the
// bytes do not form valid (non-overlong) UTF8, writes a sentinel above the
// UTF8 code space (0x110000 | first byte) and consumes one byte.
int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
  // 1-byte ASCII (NUL is treated as invalid).
  if ((input[0] & 0x80) == 0) {
    *symbol = input[0];
    if (*symbol > 0) {
      return 1;
    }
  }
  // 2-byte sequence: 110xxxxx 10xxxxxx, must encode > 0x7f.
  if (size > 1 &&
      (input[0] & 0xe0) == 0xc0 &&
      (input[1] & 0xc0) == 0x80) {
    *symbol = ((input[0] & 0x1f) << 6) | (input[1] & 0x3f);
    if (*symbol > 0x7f) {
      return 2;
    }
  }
  // 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx, must encode > 0x7ff.
  if (size > 2 &&
      (input[0] & 0xf0) == 0xe0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80) {
    *symbol = ((input[0] & 0x0f) << 12) |
              ((input[1] & 0x3f) << 6) |
              (input[2] & 0x3f);
    if (*symbol > 0x7ff) {
      return 3;
    }
  }
  // 4-byte sequence: 11110xxx followed by three continuation bytes; must
  // encode > 0xffff and stay within the Unicode range.
  if (size > 3 &&
      (input[0] & 0xf8) == 0xf0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80 &&
      (input[3] & 0xc0) == 0x80) {
    *symbol = ((input[0] & 0x07) << 18) |
              ((input[1] & 0x3f) << 12) |
              ((input[2] & 0x3f) << 6) |
              (input[3] & 0x3f);
    if (*symbol > 0xffff && *symbol <= 0x10ffff) {
      return 4;
    }
  }
  // Not UTF8, emit a special symbol above the UTF8-code space.
  *symbol = 0x110000 | input[0];
  return 1;
}
// Returns true if at least min_fraction of the data is UTF8-encoded.
// Bytes consumed by invalid sequences (symbols >= 0x110000 from
// ParseAsUTF8) do not count toward the UTF8 total.
bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) {
  size_t utf8_bytes = 0;
  for (size_t pos = 0; pos < length; ) {
    int symbol;
    const int consumed = ParseAsUTF8(&symbol, data + pos, length - pos);
    if (symbol < 0x110000) {
      utf8_bytes += consumed;
    }
    pos += consumed;
  }
  return utf8_bytes > min_fraction * length;
}
// Re-derives dist_prefix_/dist_extra_ for every copy command after the
// distance parameters (direct distance codes, postfix bits) have been
// changed from their defaults; with default parameters the commands are
// already correct and this is a no-op.
// Fix: the loop index is now size_t to match num_commands — the previous
// int index was a signed/unsigned mismatch that would truncate for very
// large command counts.
void RecomputeDistancePrefixes(Command* cmds,
                               size_t num_commands,
                               int num_direct_distance_codes,
                               int distance_postfix_bits) {
  if (num_direct_distance_codes == 0 &&
      distance_postfix_bits == 0) {
    return;
  }
  for (size_t i = 0; i < num_commands; ++i) {
    Command* cmd = &cmds[i];
    // Only commands with an explicit distance (cmd_prefix_ >= 128) carry
    // distance codes to recompute.
    if (cmd->copy_len_ > 0 && cmd->cmd_prefix_ >= 128) {
      PrefixEncodeCopyDistance(cmd->DistanceCode(),
                               num_direct_distance_codes,
                               distance_postfix_bits,
                               &cmd->dist_prefix_,
                               &cmd->dist_extra_);
    }
  }
}
// Returns a scratch output buffer of at least `size` bytes. The buffer
// grows on demand and never shrinks; contents are not preserved across a
// reallocation.
uint8_t* BrotliCompressor::GetBrotliStorage(size_t size) {
  if (size > storage_size_) {
    storage_.reset(new uint8_t[size]);
    storage_size_ = size;
  }
  return &storage_[0];
}
// Sets up all encoder state from the (sanitized) parameters: window and
// block sizes, ring buffer, literal-cost buffer, command buffer, stream
// header bits, distance cache and hashers.
BrotliCompressor::BrotliCompressor(BrotliParams params)
    : params_(params),
      hashers_(new Hashers()),
      input_pos_(0),
      num_commands_(0),
      last_insert_len_(0),
      last_flush_pos_(0),
      last_processed_pos_(0),
      prev_byte_(0),
      prev_byte2_(0),
      storage_size_(0) {
  // Sanitize params.
  params_.quality = std::max(0, params_.quality);
  if (params_.lgwin < kMinWindowBits) {
    params_.lgwin = kMinWindowBits;
  } else if (params_.lgwin > kMaxWindowBits) {
    params_.lgwin = kMaxWindowBits;
  }
  // Pick an input block size automatically unless the caller forced one.
  if (params_.lgblock == 0) {
    params_.lgblock = 16;
    if (params_.quality >= 9 && params_.lgwin > params_.lgblock) {
      params_.lgblock = std::min(21, params_.lgwin);
    }
  } else {
    params_.lgblock = std::min(kMaxInputBlockBits,
                               std::max(kMinInputBlockBits, params_.lgblock));
  }
  // Lower qualities disable the expensive features for speed.
  if (params_.quality <= 9) {
    params_.enable_dictionary = false;
    params_.enable_transforms = false;
    params_.greedy_block_split = true;
    params_.enable_context_modeling = false;
  }
  // Set maximum distance, see section 9.1. of the spec.
  max_backward_distance_ = (1 << params_.lgwin) - 16;
  // Initialize input and literal cost ring buffers.
  // We allocate at least lgwin + 1 bits for the ring buffer so that the newly
  // added block fits there completely and we still get lgwin bits and at least
  // read_block_size_bits + 1 bits because the copy tail length needs to be
  // smaller than ringbuffer size.
  int ringbuffer_bits = std::max(params_.lgwin + 1, params_.lgblock + 1);
  ringbuffer_.reset(new RingBuffer(ringbuffer_bits, params_.lgblock));
  // Literal cost estimation is only used above quality 9.
  if (params_.quality > 9) {
    literal_cost_mask_ = (1 << params_.lgblock) - 1;
    literal_cost_.reset(new float[literal_cost_mask_ + 1]);
  }
  // Allocate command buffer.
  cmd_buffer_size_ = std::max(1 << 18, 1 << params_.lgblock);
  commands_.reset(new brotli::Command[cmd_buffer_size_]);
  // Initialize last byte with stream header.
  if (params_.lgwin == 16) {
    last_byte_ = 0;
    last_byte_bits_ = 1;
  } else {
    last_byte_ = ((params_.lgwin - 17) << 1) | 1;
    last_byte_bits_ = 4;
  }
  // Initialize distance cache.
  dist_cache_[0] = 4;
  dist_cache_[1] = 11;
  dist_cache_[2] = 15;
  dist_cache_[3] = 16;
  // Initialize hashers: higher qualities map to stronger hasher types.
  switch (params_.quality) {
    case 0:
    case 1: hash_type_ = 1; break;
    case 2:
    case 3: hash_type_ = 2; break;
    case 4: hash_type_ = 3; break;
    case 5:
    case 6: hash_type_ = 4; break;
    case 7: hash_type_ = 5; break;
    case 8: hash_type_ = 6; break;
    case 9: hash_type_ = 7; break;
    default: // quality > 9
      hash_type_ = (params_.mode == BrotliParams::MODE_TEXT) ? 8 : 9;
  }
  hashers_->Init(hash_type_);
  if (params_.mode == BrotliParams::MODE_TEXT &&
      params_.enable_dictionary) {
    StoreDictionaryWordHashes(params_.enable_transforms);
  }
}
// All owned resources are released by smart-pointer members.
BrotliCompressor::~BrotliCompressor() {
}
// Shared by all compressor instances; built lazily on first use and
// intentionally never freed.
StaticDictionary* BrotliCompressor::static_dictionary_ = NULL;
// Builds the process-wide static dictionary on first call and registers it
// with this compressor's hashers.
// NOTE(review): the lazy initialization of static_dictionary_ is not
// synchronized — confirm compressors are never constructed concurrently
// from multiple threads.
void BrotliCompressor::StoreDictionaryWordHashes(bool enable_transforms) {
  if (static_dictionary_ == NULL) {
    static_dictionary_ = new StaticDictionary;
    static_dictionary_->Fill(enable_transforms);
  }
  hashers_->SetStaticDictionary(static_dictionary_);
}
// Appends input_size bytes to the ring buffer and advances input_pos_.
// Also zero-fills a few bytes past the written region (first round only)
// so hashing never reads uninitialized memory.
void BrotliCompressor::CopyInputToRingBuffer(const size_t input_size,
                                             const uint8_t* input_buffer) {
  ringbuffer_->Write(input_buffer, input_size);
  input_pos_ += input_size;
  // Erase a few more bytes in the ring buffer to make hashing not
  // depend on uninitialized data. This makes compression deterministic
  // and it prevents uninitialized memory warnings in Valgrind. Even
  // without erasing, the output would be valid (but nondeterministic).
  //
  // Background information: The compressor stores short (at most 8 bytes)
  // substrings of the input already read in a hash table, and detects
  // repetitions by looking up such substrings in the hash table. If it
  // can find a substring, it checks whether the substring is really there
  // in the ring buffer (or it's just a hash collision). Should the hash
  // table become corrupt, this check makes sure that the output is
  // still valid, albeit the compression ratio would be bad.
  //
  // The compressor populates the hash table from the ring buffer as it's
  // reading new bytes from the input. However, at the last few indexes of
  // the ring buffer, there are not enough bytes to build full-length
  // substrings from. Since the hash table always contains full-length
  // substrings, we erase with dummy 0s here to make sure that those
  // substrings will contain 0s at the end instead of uninitialized
  // data.
  //
  // Please note that erasing is not necessary (because the
  // memory region is already initialized since the ring buffer
  // has a `tail' that holds a copy of the beginning,) so we
  // skip erasing if we have already gone around at least once in
  // the ring buffer.
  size_t pos = ringbuffer_->position();
  // Only clear during the first round of ringbuffer writes. On
  // subsequent rounds data in the ringbuffer would be affected.
  if (pos <= ringbuffer_->mask()) {
    // This is the first time when the ring buffer is being written.
    // We clear 3 bytes just after the bytes that have been copied from
    // the input buffer.
    //
    // The ringbuffer has a "tail" that holds a copy of the beginning,
    // but only once the ring buffer has been fully written once, i.e.,
    // pos <= mask. For the first time, we need to write values
    // in this tail (where index may be larger than mask), so that
    // we have exactly defined behavior and don't read un-initialized
    // memory. Due to performance reasons, hashing reads data using a
    // LOAD32, which can go 3 bytes beyond the bytes written in the
    // ringbuffer.
    memset(ringbuffer_->start() + pos, 0, 3);
  }
}
// Runs the backward-reference search over the bytes added to the ring
// buffer since the last call, and emits a meta-block unless output can be
// deferred (not last, no forced flush, and buffers have room). Returns
// false if more than one input block was buffered; on success *out_size
// is 0 when all output was deferred.
bool BrotliCompressor::WriteBrotliData(const bool is_last,
                                       const bool force_flush,
                                       size_t* out_size,
                                       uint8_t** output) {
  const size_t bytes = input_pos_ - last_processed_pos_;
  const uint8_t* data = ringbuffer_->start();
  const size_t mask = ringbuffer_->mask();
  if (bytes > input_block_size()) {
    return false;
  }
  // Context modeling is only applied to data that looks like UTF8.
  bool utf8_mode =
      params_.enable_context_modeling &&
      IsMostlyUTF8(&data[last_processed_pos_ & mask], bytes, kMinUTF8Ratio);
  // Literal-cost estimates guide the backward reference search at high
  // qualities (buffer allocated only for quality > 9).
  if (literal_cost_.get()) {
    if (utf8_mode) {
      EstimateBitCostsForLiteralsUTF8(last_processed_pos_, bytes, mask,
                                      literal_cost_mask_, data,
                                      literal_cost_.get());
    } else {
      EstimateBitCostsForLiterals(last_processed_pos_, bytes, mask,
                                  literal_cost_mask_,
                                  data, literal_cost_.get());
    }
  }
  double base_min_score = params_.enable_context_modeling ? 8.115 : 4.0;
  CreateBackwardReferences(bytes, last_processed_pos_, data, mask,
                           literal_cost_.get(),
                           literal_cost_mask_,
                           max_backward_distance_,
                           base_min_score,
                           params_.quality,
                           hashers_.get(),
                           hash_type_,
                           dist_cache_,
                           &last_insert_len_,
                           &commands_[num_commands_],
                           &num_commands_);
  // Defer output while the command buffer and ring buffer can still absorb
  // another full input block.
  if (!is_last && !force_flush &&
      num_commands_ + (input_block_size() >> 1) < cmd_buffer_size_ &&
      input_pos_ + input_block_size() <= last_flush_pos_ + mask + 1) {
    // Everything will happen later.
    last_processed_pos_ = input_pos_;
    *out_size = 0;
    return true;
  }
  // Create the last insert-only command.
  if (last_insert_len_ > 0) {
    brotli::Command cmd(last_insert_len_);
    commands_[num_commands_++] = cmd;
    last_insert_len_ = 0;
  }
  return WriteMetaBlockInternal(is_last, utf8_mode, out_size, output);
}
// Encodes all pending commands as one meta-block into the internal storage
// buffer and updates the flush bookkeeping. Falls back to an uncompressed
// meta-block when the data looks incompressible or when the compressed
// form ends up larger than the raw bytes.
//
// Fixes:
//  - The incompressibility threshold was declared `static const double`
//    while being initialized from the runtime value `bytes`; a function-
//    local static is initialized only once, so every later meta-block
//    reused the first meta-block's threshold. It is now a plain local.
//  - The sampling loop index was `int` while positions are size_t, which
//    truncates for streams beyond 2 GiB; it is now size_t.
bool BrotliCompressor::WriteMetaBlockInternal(const bool is_last,
                                              const bool utf8_mode,
                                              size_t* out_size,
                                              uint8_t** output) {
  const size_t bytes = input_pos_ - last_flush_pos_;
  const uint8_t* data = ringbuffer_->start();
  const size_t mask = ringbuffer_->mask();
  // Worst-case output size for this meta-block, including headers.
  const size_t max_out_size = 2 * bytes + 500;
  uint8_t* storage = GetBrotliStorage(max_out_size);
  storage[0] = last_byte_;
  int storage_ix = last_byte_bits_;
  // Heuristic: very few commands + almost all literals + high sampled
  // literal entropy => emit the block uncompressed.
  bool uncompressed = false;
  if (num_commands_ < (bytes >> 8) + 2) {
    int num_literals = 0;
    for (int i = 0; i < num_commands_; ++i) {
      num_literals += commands_[i].insert_len_;
    }
    if (num_literals > 0.99 * bytes) {
      int literal_histo[256] = { 0 };
      static const int kSampleRate = 13;
      static const double kMinEntropy = 7.92;
      // Must be recomputed per meta-block (depends on `bytes`), hence not
      // a function-local static.
      const double bit_cost_threshold = bytes * kMinEntropy / kSampleRate;
      for (size_t i = last_flush_pos_; i < input_pos_; i += kSampleRate) {
        ++literal_histo[data[i & mask]];
      }
      if (BitsEntropy(literal_histo, 256) > bit_cost_threshold) {
        uncompressed = true;
      }
    }
  }
  if (bytes == 0) {
    // Nothing to encode: emit an empty compressed meta-block header so the
    // pending bits are flushed, then byte-align.
    if (!StoreCompressedMetaBlockHeader(is_last, 0, &storage_ix, &storage[0])) {
      return false;
    }
    storage_ix = (storage_ix + 7) & ~7;
  } else if (uncompressed) {
    if (!StoreUncompressedMetaBlock(is_last,
                                    data, last_flush_pos_, mask, bytes,
                                    &storage_ix,
                                    &storage[0])) {
      return false;
    }
  } else {
    // Save the state of the distance cache in case we need to restore it for
    // emitting an uncompressed block.
    int saved_dist_cache[4];
    memcpy(saved_dist_cache, dist_cache_, sizeof(dist_cache_));
    int num_direct_distance_codes = 0;
    int distance_postfix_bits = 0;
    if (params_.quality > 9 && params_.mode == BrotliParams::MODE_FONT) {
      num_direct_distance_codes = 12;
      distance_postfix_bits = 1;
      RecomputeDistancePrefixes(commands_.get(),
                                num_commands_,
                                num_direct_distance_codes,
                                distance_postfix_bits);
    }
    int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
    MetaBlockSplit mb;
    if (params_.greedy_block_split) {
      BuildMetaBlockGreedy(data, last_flush_pos_, mask,
                           commands_.get(), num_commands_,
                           &mb);
    } else {
      BuildMetaBlock(data, last_flush_pos_, mask,
                     prev_byte_, prev_byte2_,
                     commands_.get(), num_commands_,
                     literal_context_mode,
                     params_.enable_context_modeling,
                     &mb);
    }
    if (params_.quality >= 3) {
      OptimizeHistograms(num_direct_distance_codes,
                         distance_postfix_bits,
                         &mb);
    }
    if (!StoreMetaBlock(data, last_flush_pos_, bytes, mask,
                        prev_byte_, prev_byte2_,
                        is_last,
                        num_direct_distance_codes,
                        distance_postfix_bits,
                        literal_context_mode,
                        commands_.get(), num_commands_,
                        mb,
                        &storage_ix,
                        &storage[0])) {
      return false;
    }
    // If the compressed form came out larger than the raw bytes, redo the
    // meta-block as uncompressed.
    if (bytes + 4 < (storage_ix >> 3)) {
      // Restore the distance cache and last byte.
      memcpy(dist_cache_, saved_dist_cache, sizeof(dist_cache_));
      storage[0] = last_byte_;
      storage_ix = last_byte_bits_;
      if (!StoreUncompressedMetaBlock(is_last, data, last_flush_pos_, mask,
                                      bytes, &storage_ix, &storage[0])) {
        return false;
      }
    }
  }
  // Carry the trailing partial byte into the next meta-block and advance
  // all flush bookkeeping.
  last_byte_ = storage[storage_ix >> 3];
  last_byte_bits_ = storage_ix & 7;
  last_flush_pos_ = input_pos_;
  last_processed_pos_ = input_pos_;
  prev_byte_ = data[(last_flush_pos_ - 1) & mask];
  prev_byte2_ = data[(last_flush_pos_ - 2) & mask];
  num_commands_ = 0;
  *output = &storage[0];
  *out_size = storage_ix >> 3;
  return true;
}
// Compresses input_buffer as one meta-block into encoded_buffer.
// On entry *encoded_size is the output capacity; on success it holds the
// number of bytes written. Returns false on internal failure or if the
// output does not fit.
bool BrotliCompressor::WriteMetaBlock(const size_t input_size,
                                      const uint8_t* input_buffer,
                                      const bool is_last,
                                      size_t* encoded_size,
                                      uint8_t* encoded_buffer) {
  // Stage the input, then force a flush so a complete meta-block comes out.
  CopyInputToRingBuffer(input_size, input_buffer);
  size_t bytes_written = 0;
  uint8_t* compressed = NULL;
  if (!WriteBrotliData(is_last, /* force_flush = */ true,
                       &bytes_written, &compressed) ||
      bytes_written > *encoded_size) {
    return false;
  }
  if (bytes_written > 0) {
    memcpy(encoded_buffer, compressed, bytes_written);
  }
  *encoded_size = bytes_written;
  return true;
}
// Emits input_size bytes as a metadata meta-block, which decoders skip.
// Requires input_size <= 2^24 and room in *encoded_size for the payload
// plus up to 6 header/trailer bytes. Leaves the output byte-aligned
// (last_byte_bits_ becomes 0).
bool BrotliCompressor::WriteMetadata(const size_t input_size,
                                     const uint8_t* input_buffer,
                                     const bool is_last,
                                     size_t* encoded_size,
                                     uint8_t* encoded_buffer) {
  if (input_size > (1 << 24) || input_size + 6 > *encoded_size) {
    return false;
  }
  int storage_ix = last_byte_bits_;
  encoded_buffer[0] = last_byte_;
  // Metadata meta-block header bits (see the brotli stream format).
  WriteBits(1, 0, &storage_ix, encoded_buffer);
  WriteBits(2, 3, &storage_ix, encoded_buffer);
  WriteBits(1, 0, &storage_ix, encoded_buffer);
  if (input_size == 0) {
    // Zero skip bytes: empty metadata block, just byte-align.
    WriteBits(2, 0, &storage_ix, encoded_buffer);
    *encoded_size = (storage_ix + 7) >> 3;
  } else {
    // Length field: number of length bytes, then (input_size - 1), then
    // the byte-aligned payload.
    size_t nbits = Log2Floor(input_size - 1) + 1;
    size_t nbytes = (nbits + 7) / 8;
    WriteBits(2, nbytes, &storage_ix, encoded_buffer);
    WriteBits(8 * nbytes, input_size - 1, &storage_ix, encoded_buffer);
    size_t hdr_size = (storage_ix + 7) >> 3;
    memcpy(&encoded_buffer[hdr_size], input_buffer, input_size);
    *encoded_size = hdr_size + input_size;
  }
  if (is_last) {
    // Terminate the stream with a final empty meta-block byte.
    encoded_buffer[(*encoded_size)++] = 3;
  }
  last_byte_ = 0;
  last_byte_bits_ = 0;
  return true;
}
// Flushes any remaining output by emitting a final, empty meta-block.
bool BrotliCompressor::FinishStream(
    size_t* encoded_size, uint8_t* encoded_buffer) {
  return WriteMetaBlock(0, NULL, true, encoded_size, encoded_buffer);
}
// One-shot compression of input_buffer into encoded_buffer.
// On entry *encoded_size is the output capacity; on success it is set to
// the number of bytes produced. Returns 1 on success, 0 on failure.
// Fix: removed an unused local `BrotliCompressor compressor(params);` —
// BrotliCompress constructs its own compressor, so the extra instance only
// wasted a full set of ring-buffer, command-buffer and hasher allocations.
int BrotliCompressBuffer(BrotliParams params,
                         size_t input_size,
                         const uint8_t* input_buffer,
                         size_t* encoded_size,
                         uint8_t* encoded_buffer) {
  if (*encoded_size == 0) {
    // Output buffer needs at least one byte.
    return 0;
  }
  BrotliMemIn in(input_buffer, input_size);
  BrotliMemOut out(encoded_buffer, *encoded_size);
  if (!BrotliCompress(params, &in, &out)) {
    return 0;
  }
  *encoded_size = out.position();
  return 1;
}
// Reads up to one input block from r into the compressor's ring buffer.
// Keeps reading until the block is full or the reader returns NULL (EOF),
// so the compressed output does not depend on how r->Read chunks the
// input. Returns the total number of bytes copied.
size_t CopyOneBlockToRingBuffer(BrotliIn* r, BrotliCompressor* compressor) {
  const size_t block_size = compressor->input_block_size();
  size_t chunk = 0;
  const uint8_t* data =
      reinterpret_cast<const uint8_t*>(r->Read(block_size, &chunk));
  if (data == NULL) {
    return 0;
  }
  compressor->CopyInputToRingBuffer(chunk, data);
  size_t total_read = chunk;
  while (total_read < block_size) {
    chunk = 0;
    data = reinterpret_cast<const uint8_t*>(
        r->Read(block_size - total_read, &chunk));
    if (data == NULL) {
      break;
    }
    compressor->CopyInputToRingBuffer(chunk, data);
    total_read += chunk;
  }
  return total_read;
}
// A BrotliIn signals EOF by returning NULL from a zero-length read.
bool BrotliInIsFinished(BrotliIn* r) {
  size_t bytes_ignored;
  return r->Read(0, &bytes_ignored) == NULL;
}
// Streams data from `in` to `out`, compressing one input block at a time.
// Returns 1 on success, 0 on a compression or write error.
int BrotliCompress(BrotliParams params, BrotliIn* in, BrotliOut* out) {
  size_t in_bytes = 0;
  size_t out_bytes = 0;
  uint8_t* output = NULL;  // initialized; only read when out_bytes > 0
  bool final_block = false;
  BrotliCompressor compressor(params);
  while (!final_block) {
    in_bytes = CopyOneBlockToRingBuffer(in, &compressor);
    // The last block is reached on a zero-length copy or when the reader
    // reports EOF after a (possibly partial) block.
    final_block = in_bytes == 0 || BrotliInIsFinished(in);
    out_bytes = 0;
    if (!compressor.WriteBrotliData(final_block,
                                    /* force_flush = */ false,
                                    &out_bytes, &output)) {
      return 0;  // was `return false` -- the function returns int
    }
    if (out_bytes > 0 && !out->Write(output, out_bytes)) {
      return 0;
    }
  }
  return 1;
}
} // namespace brotli

View File

@ -0,0 +1,179 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// API for Brotli compression
#ifndef BROTLI_ENC_ENCODE_H_
#define BROTLI_ENC_ENCODE_H_
#include <stddef.h>
#include <stdint.h>
#include <string>
#include <vector>
#include "./command.h"
#include "./hash.h"
#include "./ringbuffer.h"
#include "./static_dict.h"
#include "./streams.h"
namespace brotli {
static const int kMaxWindowBits = 24;
static const int kMinWindowBits = 16;
static const int kMinInputBlockBits = 16;
static const int kMaxInputBlockBits = 24;
// Compression parameters for the BrotliCompressor / BrotliCompress* entry
// points. The defaults select the highest quality (11) with a 2^22-byte
// sliding window.
struct BrotliParams {
  BrotliParams()
      : mode(MODE_TEXT),
        quality(11),
        lgwin(22),
        lgblock(0),
        enable_dictionary(true),
        enable_transforms(false),
        greedy_block_split(false),
        enable_context_modeling(true) {}

  enum Mode {
    // Generic text input; selects text-oriented heuristics elsewhere in
    // the encoder (e.g. hash table choice).
    MODE_TEXT = 0,
    // Font input; selects font-tuned distance coding parameters
    // (12 direct distance codes, 1 postfix bit -- see encode_parallel).
    MODE_FONT = 1,
  };
  Mode mode;

  // Controls the compression-speed vs compression-density tradeoffs. The higher
  // the quality, the slower the compression. Range is 0 to 11.
  int quality;
  // Base 2 logarithm of the sliding window size. Range is 16 to 24.
  int lgwin;
  // Base 2 logarithm of the maximum input block size. Range is 16 to 24.
  // If set to 0, the value will be set based on the quality.
  int lgblock;

  // These settings will be respected only if quality > 9.
  bool enable_dictionary;
  bool enable_transforms;
  bool greedy_block_split;
  bool enable_context_modeling;
};
// Incremental brotli stream compressor.
// An instance can not be reused for multiple brotli streams.
class BrotliCompressor {
 public:
  explicit BrotliCompressor(BrotliParams params);
  ~BrotliCompressor();

  // The maximum input size that can be processed at once.
  size_t input_block_size() const { return 1 << params_.lgblock; }

  // Encodes the data in input_buffer as a meta-block and writes it to
  // encoded_buffer (*encoded_size should be set to the size of
  // encoded_buffer) and sets *encoded_size to the number of bytes that
  // was written. Returns 0 if there was an error and 1 otherwise.
  bool WriteMetaBlock(const size_t input_size,
                      const uint8_t* input_buffer,
                      const bool is_last,
                      size_t* encoded_size,
                      uint8_t* encoded_buffer);

  // Writes a metadata meta-block containing the given input to encoded_buffer.
  // *encoded_size should be set to the size of the encoded_buffer.
  // Sets *encoded_size to the number of bytes that was written.
  // Note that the given input data will not be part of the sliding window and
  // thus no backward references can be made to this data from subsequent
  // metablocks.
  bool WriteMetadata(const size_t input_size,
                     const uint8_t* input_buffer,
                     const bool is_last,
                     size_t* encoded_size,
                     uint8_t* encoded_buffer);

  // Writes a zero-length meta-block with end-of-input bit set to the
  // internal output buffer and copies the output buffer to encoded_buffer
  // (*encoded_size should be set to the size of encoded_buffer) and sets
  // *encoded_size to the number of bytes written. Returns false if there was
  // an error and true otherwise.
  bool FinishStream(size_t* encoded_size, uint8_t* encoded_buffer);

  // Copies the given input data to the internal ring buffer of the compressor.
  // No processing of the data occurs at this time and this function can be
  // called multiple times before calling WriteBrotliData() to process the
  // accumulated input. At most input_block_size() bytes of input data can be
  // copied to the ring buffer, otherwise the next WriteBrotliData() will fail.
  void CopyInputToRingBuffer(const size_t input_size,
                             const uint8_t* input_buffer);

  // Processes the accumulated input data and sets *out_size to the length of
  // the new output meta-block, or to zero if no new output meta-block was
  // created (in this case the processed input data is buffered internally).
  // If *out_size is positive, *output points to the start of the output data.
  // Returns false if the size of the input data is larger than
  // input_block_size() or if there was an error during writing the output.
  // If is_last or force_flush is true, an output meta-block is always created.
  bool WriteBrotliData(const bool is_last, const bool force_flush,
                       size_t* out_size, uint8_t** output);

  // No-op, but we keep it here for API backward-compatibility.
  void WriteStreamHeader() {}

 private:
  // Initializes the hasher with the hashes of dictionary words.
  void StoreDictionaryWordHashes(bool enable_transforms);

  // NOTE(review): presumably grows the internal storage_ buffer to at
  // least `size` bytes and returns it -- confirm against the definition.
  uint8_t* GetBrotliStorage(size_t size);

  bool WriteMetaBlockInternal(const bool is_last,
                              const bool utf8_mode,
                              size_t* out_size,
                              uint8_t** output);

  BrotliParams params_;         // Settings, fixed at construction.
  int max_backward_distance_;   // Largest allowed backward-reference distance.
  std::unique_ptr<Hashers> hashers_;  // Match-finding hash tables.
  int hash_type_;               // Which hasher variant is active.
  size_t input_pos_;            // Position counter for incoming input.
  std::unique_ptr<RingBuffer> ringbuffer_;  // Sliding window over input.
  std::unique_ptr<float[]> literal_cost_;   // Per-position literal cost model.
  size_t literal_cost_mask_;
  size_t cmd_buffer_size_;
  std::unique_ptr<Command[]> commands_;     // Backward-reference commands.
  int num_commands_;
  int last_insert_len_;         // Pending literal run not yet in a command.
  size_t last_flush_pos_;
  size_t last_processed_pos_;
  int dist_cache_[4];           // Recently used distances (for short codes).
  uint8_t last_byte_;           // Partial byte of bit-stream output.
  uint8_t last_byte_bits_;      // Number of valid bits in last_byte_.
  uint8_t prev_byte_;           // Context bytes preceding the next block.
  uint8_t prev_byte2_;
  int storage_size_;
  std::unique_ptr<uint8_t[]> storage_;      // Internal output buffer.
  // Shared across all instances.
  static StaticDictionary *static_dictionary_;
};
// Compresses the data in input_buffer into encoded_buffer, and sets
// *encoded_size to the compressed length.
// Returns 0 if there was an error and 1 otherwise.
int BrotliCompressBuffer(BrotliParams params,
size_t input_size,
const uint8_t* input_buffer,
size_t* encoded_size,
uint8_t* encoded_buffer);
// Same as above, but uses the specified input and output classes instead
// of reading from and writing to pre-allocated memory buffers.
int BrotliCompress(BrotliParams params, BrotliIn* in, BrotliOut* out);
} // namespace brotli
#endif // BROTLI_ENC_ENCODE_H_

View File

@ -0,0 +1,356 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Implementation of parallel Brotli compressor.
#include "./encode_parallel.h"
#include <algorithm>
#include <limits>
#include "./backward_references.h"
#include "./bit_cost.h"
#include "./block_splitter.h"
#include "./brotli_bit_stream.h"
#include "./cluster.h"
#include "./context.h"
#include "./metablock.h"
#include "./transform.h"
#include "./entropy_encode.h"
#include "./fast_log.h"
#include "./hash.h"
#include "./histogram.h"
#include "./literal_cost.h"
#include "./prefix.h"
#include "./write_bits.h"
namespace brotli {
namespace {
// Decodes one symbol starting at `input` (at most `size` bytes available).
// On success stores the Unicode code point in *symbol and returns the
// number of bytes consumed (1-4). Over-long encodings and malformed bytes
// are rejected: *symbol is set above the Unicode range (0x110000 | first
// byte) and 1 is returned.
int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
  // 1-byte sequence (ASCII).
  if ((input[0] & 0x80) == 0) {
    *symbol = input[0];
    if (*symbol > 0) {
      return 1;
    }
  }
  // 2-byte UTF-8 sequence.
  if (size > 1 && (input[0] & 0xe0) == 0xc0 && (input[1] & 0xc0) == 0x80) {
    *symbol = ((input[0] & 0x1f) << 6) | (input[1] & 0x3f);
    if (*symbol > 0x7f) {  // reject over-long encodings
      return 2;
    }
  }
  // 3-byte UTF-8 sequence.
  if (size > 2 &&
      (input[0] & 0xf0) == 0xe0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80) {
    *symbol = ((input[0] & 0x0f) << 12) |
              ((input[1] & 0x3f) << 6) |
              (input[2] & 0x3f);
    if (*symbol > 0x7ff) {  // reject over-long encodings
      return 3;
    }
  }
  // 4-byte UTF-8 sequence.
  if (size > 3 &&
      (input[0] & 0xf8) == 0xf0 &&
      (input[1] & 0xc0) == 0x80 &&
      (input[2] & 0xc0) == 0x80 &&
      (input[3] & 0xc0) == 0x80) {
    *symbol = ((input[0] & 0x07) << 18) |
              ((input[1] & 0x3f) << 12) |
              ((input[2] & 0x3f) << 6) |
              (input[3] & 0x3f);
    if (*symbol > 0xffff && *symbol <= 0x10ffff) {  // in-range, not over-long
      return 4;
    }
  }
  // Not UTF-8: emit a special symbol above the UTF-8 code space.
  *symbol = 0x110000 | input[0];
  return 1;
}
// Returns true if more than min_fraction of the `length` bytes decode as
// valid UTF-8 (as judged by ParseAsUTF8).
bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) {
  size_t utf8_bytes = 0;
  size_t pos = 0;
  while (pos < length) {
    int symbol;
    const int consumed = ParseAsUTF8(&symbol, data + pos, length - pos);
    // ParseAsUTF8 marks invalid sequences with a value above the Unicode
    // code-point range.
    if (symbol < 0x110000) {
      utf8_bytes += consumed;
    }
    pos += consumed;
  }
  return utf8_bytes > min_fraction * length;
}
void RecomputeDistancePrefixes(std::vector<Command>* cmds,
int num_direct_distance_codes,
int distance_postfix_bits) {
if (num_direct_distance_codes == 0 &&
distance_postfix_bits == 0) {
return;
}
for (int i = 0; i < cmds->size(); ++i) {
Command* cmd = &(*cmds)[i];
if (cmd->copy_len_ > 0 && cmd->cmd_prefix_ >= 128) {
PrefixEncodeCopyDistance(cmd->DistanceCode(),
num_direct_distance_codes,
distance_postfix_bits,
&cmd->dist_prefix_,
&cmd->dist_extra_);
}
}
}
// Compresses one input block into a single, self-contained brotli
// meta-block, independently of any persistent compressor state (the basis
// of the parallel API). prefix_buffer holds the prefix_size bytes of input
// preceding this block; they provide context and backward-reference
// material but are not themselves emitted. On entry *encoded_size is the
// capacity of encoded_buffer; on success it is set to the bytes written.
// Returns false on empty input, a bit-stream writing error, or if the
// output does not fit.
bool WriteMetaBlockParallel(const BrotliParams& params,
                            const size_t block_size,
                            const uint8_t* input_buffer,
                            const size_t prefix_size,
                            const uint8_t* prefix_buffer,
                            const StaticDictionary* static_dict,
                            const bool is_first,
                            const bool is_last,
                            size_t* encoded_size,
                            uint8_t* encoded_buffer) {
  if (block_size == 0) {
    return false;
  }
  const size_t input_size = block_size;
  // Copy prefix + next input block into a continuous area.
  size_t input_pos = prefix_size;
  std::vector<uint8_t> input(prefix_size + input_size);
  memcpy(&input[0], prefix_buffer, prefix_size);
  memcpy(&input[input_pos], input_buffer, input_size);
  // Since we don't have a ringbuffer, masking is a no-op.
  // We use one less bit than the full range because some of the code uses
  // mask + 1 as the size of the ringbuffer.
  const size_t mask = std::numeric_limits<size_t>::max() >> 1;
  // Context bytes for the first literals of this block.
  uint8_t prev_byte = input_pos > 0 ? input[(input_pos - 1) & mask] : 0;
  uint8_t prev_byte2 = input_pos > 1 ? input[(input_pos - 2) & mask] : 0;
  // Decide about UTF8 mode.
  static const double kMinUTF8Ratio = 0.75;
  bool utf8_mode = IsMostlyUTF8(&input[input_pos], input_size, kMinUTF8Ratio);
  // Compute literal costs.
  std::vector<float> literal_cost(prefix_size + input_size);
  if (utf8_mode) {
    EstimateBitCostsForLiteralsUTF8(input_pos, input_size, mask, mask,
                                    &input[0], &literal_cost[0]);
  } else {
    EstimateBitCostsForLiterals(input_pos, input_size, mask, mask,
                                &input[0], &literal_cost[0]);
  }
  // Initialize hashers.
  int hash_type = 9;
  switch (params.mode) {
    case BrotliParams::MODE_TEXT: hash_type = 8; break;
    case BrotliParams::MODE_FONT: hash_type = 9; break;
    default: break;
  }
  std::unique_ptr<Hashers> hashers(new Hashers());
  hashers->Init(hash_type);
  hashers->SetStaticDictionary(static_dict);
  // Compute backward references.
  int last_insert_len = 0;
  int num_commands = 0;
  double base_min_score = 8.115;
  int max_backward_distance = (1 << params.lgwin) - 16;
  int dist_cache[4] = { -4, -4, -4, -4 };
  // NOTE(review): the command buffer is sized at one command per two input
  // bytes; presumably an upper bound guaranteed by CreateBackwardReferences
  // -- confirm against its implementation.
  std::vector<Command> commands((input_size + 1) >> 1);
  CreateBackwardReferences(
      input_size, input_pos,
      &input[0], mask,
      &literal_cost[0], mask,
      max_backward_distance,
      base_min_score,
      params.quality,
      hashers.get(),
      hash_type,
      dist_cache,
      &last_insert_len,
      &commands[0],
      &num_commands);
  commands.resize(num_commands);
  if (last_insert_len > 0) {
    // Flush the literals left over after the last copy command.
    commands.push_back(Command(last_insert_len));
  }
  // Build the meta-block.
  MetaBlockSplit mb;
  int num_direct_distance_codes =
      params.mode == BrotliParams::MODE_FONT ? 12 : 0;
  int distance_postfix_bits = params.mode == BrotliParams::MODE_FONT ? 1 : 0;
  int literal_context_mode = utf8_mode ? CONTEXT_UTF8 : CONTEXT_SIGNED;
  RecomputeDistancePrefixes(&commands,
                            num_direct_distance_codes,
                            distance_postfix_bits);
  if (params.greedy_block_split) {
    BuildMetaBlockGreedy(&input[0], input_pos, mask,
                         commands.data(), commands.size(),
                         &mb);
  } else {
    BuildMetaBlock(&input[0], input_pos, mask,
                   prev_byte, prev_byte2,
                   commands.data(), commands.size(),
                   literal_context_mode,
                   true,
                   &mb);
  }
  // Set up the temporary output storage.
  const size_t max_out_size = 2 * input_size + 500;
  std::vector<uint8_t> storage(max_out_size);
  // The very first meta-block of the stream is preceded by the stream
  // header, which encodes the sliding window size (lgwin).
  int first_byte = 0;
  int first_byte_bits = 0;
  if (is_first) {
    if (params.lgwin == 16) {
      first_byte = 0;
      first_byte_bits = 1;
    } else {
      first_byte = ((params.lgwin - 17) << 1) | 1;
      first_byte_bits = 4;
    }
  }
  storage[0] = first_byte;
  int storage_ix = first_byte_bits;
  // Store the meta-block to the temporary output.
  if (!StoreMetaBlock(&input[0], input_pos, input_size, mask,
                      prev_byte, prev_byte2,
                      is_last,
                      num_direct_distance_codes,
                      distance_postfix_bits,
                      literal_context_mode,
                      commands.data(), commands.size(),
                      mb,
                      &storage_ix, &storage[0])) {
    return false;
  }
  // If this is not the last meta-block, store an empty metadata
  // meta-block so that the meta-block will end at a byte boundary.
  if (!is_last) {
    StoreSyncMetaBlock(&storage_ix, &storage[0]);
  }
  // If the compressed data is too large, fall back to an uncompressed
  // meta-block.
  size_t output_size = storage_ix >> 3;
  if (input_size + 4 < output_size) {
    // Rewind to just after the stream header and re-emit uncompressed.
    storage[0] = first_byte;
    storage_ix = first_byte_bits;
    if (!StoreUncompressedMetaBlock(is_last, &input[0], input_pos, mask,
                                    input_size,
                                    &storage_ix, &storage[0])) {
      return false;
    }
    output_size = storage_ix >> 3;
  }
  // Copy the temporary output with size-check to the output.
  if (output_size > *encoded_size) {
    return false;
  }
  memcpy(encoded_buffer, &storage[0], output_size);
  *encoded_size = output_size;
  return true;
}
} // namespace
// Parallel-friendly variant of BrotliCompressBuffer: splits the input into
// independent blocks, compresses each into its own meta-block, and
// concatenates the results. (The per-block compression here still runs
// sequentially; the point is that blocks are independent.)
// On entry *encoded_size is the capacity of encoded_buffer; on success it
// is set to the number of bytes written. Returns 1 on success, 0 on error.
int BrotliCompressBufferParallel(BrotliParams params,
                                 size_t input_size,
                                 const uint8_t* input_buffer,
                                 size_t* encoded_size,
                                 uint8_t* encoded_buffer) {
  if (*encoded_size == 0) {
    // Output buffer needs at least one byte.
    return 0;
  } else if (input_size == 0) {
    // A one-byte empty last meta-block is a complete stream.
    encoded_buffer[0] = 6;
    *encoded_size = 1;
    return 1;
  }
  // Sanitize params.
  if (params.lgwin < kMinWindowBits) {
    params.lgwin = kMinWindowBits;
  } else if (params.lgwin > kMaxWindowBits) {
    params.lgwin = kMaxWindowBits;
  }
  if (params.lgblock == 0) {
    params.lgblock = 16;
    if (params.quality >= 9 && params.lgwin > params.lgblock) {
      params.lgblock = std::min(21, params.lgwin);
    }
  } else if (params.lgblock < kMinInputBlockBits) {
    params.lgblock = kMinInputBlockBits;
  } else if (params.lgblock > kMaxInputBlockBits) {
    params.lgblock = kMaxInputBlockBits;
  }
  size_t max_input_block_size = 1 << params.lgblock;
  std::vector<std::vector<uint8_t> > compressed_pieces;
  StaticDictionary dict;
  dict.Fill(params.enable_transforms);
  // Compress block-by-block independently.
  for (size_t pos = 0; pos < input_size; ) {
    size_t input_block_size = std::min(max_input_block_size, input_size - pos);
    // ~1.2x the input plus slack; integer math avoids the nondeterministic
    // truncation of the previous `1.2 * input_block_size` double estimate.
    size_t out_size = input_block_size + input_block_size / 5 + 1024;
    std::vector<uint8_t> out(out_size);
    if (!WriteMetaBlockParallel(params,
                                input_block_size,
                                &input_buffer[pos],
                                pos,             // everything so far is prefix
                                input_buffer,
                                &dict,
                                pos == 0,
                                pos + input_block_size == input_size,
                                &out_size,
                                &out[0])) {
      return 0;  // was `return false` -- the function returns int
    }
    out.resize(out_size);
    // Swap instead of push_back(out) to avoid copying the whole piece.
    compressed_pieces.push_back(std::vector<uint8_t>());
    compressed_pieces.back().swap(out);
    pos += input_block_size;
  }
  // Piece together the output.
  size_t out_pos = 0;
  for (size_t i = 0; i < compressed_pieces.size(); ++i) {
    const std::vector<uint8_t>& out = compressed_pieces[i];
    if (out_pos + out.size() > *encoded_size) {
      return 0;
    }
    memcpy(&encoded_buffer[out_pos], &out[0], out.size());
    out_pos += out.size();
  }
  *encoded_size = out_pos;
  return 1;
}
} // namespace brotli

View File

@ -0,0 +1,37 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// API for parallel Brotli compression
// Note that this is only a proof of concept currently and not part of the
// final API yet.
#ifndef BROTLI_ENC_ENCODE_PARALLEL_H_
#define BROTLI_ENC_ENCODE_PARALLEL_H_
#include <stddef.h>
#include <stdint.h>
#include "./encode.h"
namespace brotli {
int BrotliCompressBufferParallel(BrotliParams params,
size_t input_size,
const uint8_t* input_buffer,
size_t* encoded_size,
uint8_t* encoded_buffer);
} // namespace brotli
#endif // BROTLI_ENC_ENCODE_PARALLEL_H_

View File

@ -0,0 +1,492 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Entropy encoding (Huffman) utilities.
#include "./entropy_encode.h"
#include <stdint.h>
#include <algorithm>
#include <limits>
#include <vector>
#include <cstdlib>
#include "./histogram.h"
namespace brotli {
namespace {
// One node of the Huffman tree built during code construction. A leaf has
// index_left_ == -1 and carries its symbol in index_right_or_value_; an
// internal node stores the pool indices of both children.
struct HuffmanTree {
  // Deliberately leaves the members uninitialized: nodes are always
  // overwritten before they are read.
  HuffmanTree() {}
  HuffmanTree(int count, int16_t left, int16_t right)
      : total_count_(count),
        index_left_(left),
        index_right_or_value_(right) {}
  int total_count_;
  int16_t index_left_;
  int16_t index_right_or_value_;
};
// Sort the root nodes, least popular first.
bool SortHuffmanTree(const HuffmanTree &v0, const HuffmanTree &v1) {
return v0.total_count_ < v1.total_count_;
}
// Recursively walks the tree rooted at p (whose nodes live in `pool`) and
// records, for every leaf, its distance from the root into depth[symbol].
void SetDepth(const HuffmanTree &p,
              HuffmanTree *pool,
              uint8_t *depth,
              int level) {
  if (p.index_left_ < 0) {
    // Leaf node: index_right_or_value_ holds the symbol.
    depth[p.index_right_or_value_] = level;
    return;
  }
  ++level;
  SetDepth(pool[p.index_left_], pool, depth, level);
  SetDepth(pool[p.index_right_or_value_], pool, depth, level);
}
} // namespace
// This function will create a Huffman tree.
//
// The catch here is that the tree cannot be arbitrarily deep.
// Brotli specifies a maximum depth of 15 bits for "code trees"
// and 7 bits for "code length code trees."
//
// count_limit is the value that is to be faked as the minimum value
// and this minimum value is raised until the tree matches the
// maximum length requirement.
//
// This algorithm is not of excellent performance for very long data blocks,
// especially when population counts are longer than 2**tree_limit, but
// we are not planning to use this with extremely long blocks.
//
// See http://en.wikipedia.org/wiki/Huffman_coding
void CreateHuffmanTree(const int *data,
                       const int length,
                       const int tree_limit,
                       uint8_t *depth) {
  // For block sizes below 64 kB, we never need to do a second iteration
  // of this loop. Probably all of our block sizes will be smaller than
  // that, so this loop is mostly of academic interest. If we actually
  // would need this, we would be better off with the Katajainen algorithm.
  for (int count_limit = 1; ; count_limit *= 2) {
    std::vector<HuffmanTree> tree;
    tree.reserve(2 * length + 1);
    // One leaf per symbol with a non-zero count, its count clamped from
    // below by count_limit.
    // NOTE(review): assumes at least one count is non-zero; an all-zero
    // histogram would make n == 0 and index tree[-1] below -- confirm
    // that callers guarantee this.
    for (int i = length - 1; i >= 0; --i) {
      if (data[i]) {
        const int count = std::max(data[i], count_limit);
        tree.push_back(HuffmanTree(count, -1, i));
      }
    }
    const int n = tree.size();
    if (n == 1) {
      depth[tree[0].index_right_or_value_] = 1;      // Only one element.
      break;
    }
    // stable_sort keeps equal-count leaves in their original (descending
    // symbol) order, making the resulting code deterministic.
    std::stable_sort(tree.begin(), tree.end(), SortHuffmanTree);
    // The nodes are:
    // [0, n): the sorted leaf nodes that we start with.
    // [n]: we add a sentinel here.
    // [n + 1, 2n): new parent nodes are added here, starting from
    //              (n+1). These are naturally in ascending order.
    // [2n]: we add a sentinel at the end as well.
    // There will be (2n+1) elements at the end.
    const HuffmanTree sentinel(std::numeric_limits<int>::max(), -1, -1);
    tree.push_back(sentinel);
    tree.push_back(sentinel);
    int i = 0;      // Points to the next leaf node.
    int j = n + 1;  // Points to the next non-leaf node.
    for (int k = n - 1; k > 0; --k) {
      // Pick the two smallest-count nodes; the max-count sentinels keep
      // both reads in bounds without explicit end checks.
      int left, right;
      if (tree[i].total_count_ <= tree[j].total_count_) {
        left = i;
        ++i;
      } else {
        left = j;
        ++j;
      }
      if (tree[i].total_count_ <= tree[j].total_count_) {
        right = i;
        ++i;
      } else {
        right = j;
        ++j;
      }
      // The sentinel node becomes the parent node.
      int j_end = tree.size() - 1;
      tree[j_end].total_count_ =
          tree[left].total_count_ + tree[right].total_count_;
      tree[j_end].index_left_ = left;
      tree[j_end].index_right_or_value_ = right;
      // Add back the last sentinel node.
      tree.push_back(sentinel);
    }
    // The final parent written (index 2n - 1) is the root.
    SetDepth(tree[2 * n - 1], &tree[0], depth, 0);
    // We need to pack the Huffman tree in tree_limit bits.
    // If this was not successful, add fake entities to the lowest values
    // and retry.
    if (*std::max_element(&depth[0], &depth[length]) <= tree_limit) {
      break;
    }
  }
}
// Reverses the elements of (*v)[start..end) in place.
void Reverse(std::vector<uint8_t>* v, int start, int end) {
  std::reverse(v->begin() + start, v->begin() + end);
}
// Appends code-length symbols that encode `repetitions` occurrences of the
// non-zero code length `value` to *tree, with the matching extra bits in
// *extra_bits_data. Uses the repeat symbol 16 ("repeat previous non-zero
// code length", 2 extra bits; see RFC 7932) where that is cheaper than
// writing the value literally.
void WriteHuffmanTreeRepetitions(
    const int previous_value,
    const int value,
    int repetitions,
    std::vector<uint8_t> *tree,
    std::vector<uint8_t> *extra_bits_data) {
  if (previous_value != value) {
    // Symbol 16 repeats the *previous* code length, so the first
    // occurrence of a new value must be emitted literally.
    tree->push_back(value);
    extra_bits_data->push_back(0);
    --repetitions;
  }
  if (repetitions == 7) {
    // A run of exactly 7 would need two repeat symbols; one extra literal
    // plus a run of 6 (a single repeat symbol) is cheaper.
    tree->push_back(value);
    extra_bits_data->push_back(0);
    --repetitions;
  }
  if (repetitions < 3) {
    // Too short for a repeat symbol: emit literally.
    for (int i = 0; i < repetitions; ++i) {
      tree->push_back(value);
      extra_bits_data->push_back(0);
    }
  } else {
    // Encode (repetitions - 3) as base-4 digits, one symbol 16 per digit;
    // digits are produced least-significant first, hence the Reverse calls.
    repetitions -= 3;
    int start = tree->size();
    while (repetitions >= 0) {
      tree->push_back(16);
      extra_bits_data->push_back(repetitions & 0x3);
      repetitions >>= 2;
      --repetitions;
    }
    Reverse(tree, start, tree->size());
    Reverse(extra_bits_data, start, tree->size());
  }
}
// Appends code-length symbols that encode `repetitions` zero code lengths
// (unused symbols) to *tree, with the matching extra bits in
// *extra_bits_data. Uses the repeat symbol 17 ("repeat zero code length",
// 3 extra bits; see RFC 7932) where that is cheaper than literal zeros.
void WriteHuffmanTreeRepetitionsZeros(
    int repetitions,
    std::vector<uint8_t> *tree,
    std::vector<uint8_t> *extra_bits_data) {
  if (repetitions == 11) {
    // A run of exactly 11 would need two repeat symbols; one literal zero
    // plus a run of 10 (a single repeat symbol) is cheaper.
    tree->push_back(0);
    extra_bits_data->push_back(0);
    --repetitions;
  }
  if (repetitions < 3) {
    // Too short for a repeat symbol: emit literal zeros.
    for (int i = 0; i < repetitions; ++i) {
      tree->push_back(0);
      extra_bits_data->push_back(0);
    }
  } else {
    // Encode (repetitions - 3) as base-8 digits, one symbol 17 per digit;
    // digits are produced least-significant first, hence the Reverse calls.
    repetitions -= 3;
    int start = tree->size();
    while (repetitions >= 0) {
      tree->push_back(17);
      extra_bits_data->push_back(repetitions & 0x7);
      repetitions >>= 3;
      --repetitions;
    }
    Reverse(tree, start, tree->size());
    Reverse(extra_bits_data, start, tree->size());
  }
}
// Rewrites the population counts in counts[0..length) so that the later
// Huffman-tree serialization (WriteHuffmanTree) sees longer runs and thus
// compresses better, while distorting the histogram as little as possible.
// Returns 1 on success (including "nothing to do"), 0 on allocation
// failure.
int OptimizeHuffmanCountsForRle(int length, int* counts) {
  int nonzero_count = 0;
  int stride;
  int limit;
  int sum;
  uint8_t* good_for_rle;
  // Let's make the Huffman code more compatible with rle encoding.
  int i;
  // 1) Cheap bail-outs: sparse or tiny histograms model well as-is.
  for (i = 0; i < length; i++) {
    if (counts[i]) {
      ++nonzero_count;
    }
  }
  if (nonzero_count < 16) {
    return 1;
  }
  // Trim trailing zeros (they need no symbols at all).
  for (; length >= 0; --length) {
    if (length == 0) {
      return 1;  // All zeros.
    }
    if (counts[length - 1] != 0) {
      // Now counts[0..length - 1] does not have trailing zeros.
      break;
    }
  }
  {
    int nonzeros = 0;
    int smallest_nonzero = 1 << 30;
    for (i = 0; i < length; ++i) {
      if (counts[i] != 0) {
        ++nonzeros;
        if (smallest_nonzero > counts[i]) {
          smallest_nonzero = counts[i];
        }
      }
    }
    if (nonzeros < 5) {
      // Small histogram will model it well.
      return 1;
    }
    int zeros = length - nonzeros;
    if (smallest_nonzero < 4) {
      if (zeros < 6) {
        // Fill in isolated zero gaps between small counts so that runs of
        // non-zeros are not interrupted.
        for (i = 1; i < length - 1; ++i) {
          if (counts[i - 1] != 0 && counts[i] == 0 && counts[i + 1] != 0) {
            counts[i] = 1;
          }
        }
      }
    }
    if (nonzeros < 28) {
      return 1;
    }
  }
  // 2) Let's mark all population counts that already can be encoded
  // with an rle code.
  good_for_rle = (uint8_t*)calloc(length, 1);
  if (good_for_rle == NULL) {
    return 0;
  }
  {
    // Let's not spoil any of the existing good rle codes.
    // Mark any seq of 0's that is longer as 5 as a good_for_rle.
    // Mark any seq of non-0's that is longer as 7 as a good_for_rle.
    int symbol = counts[0];
    int stride = 0;  // (shadows the function-level `stride` on purpose)
    for (i = 0; i < length + 1; ++i) {
      if (i == length || counts[i] != symbol) {
        if ((symbol == 0 && stride >= 5) ||
            (symbol != 0 && stride >= 7)) {
          int k;
          for (k = 0; k < stride; ++k) {
            good_for_rle[i - k - 1] = 1;
          }
        }
        stride = 1;
        if (i != length) {
          symbol = counts[i];
        }
      } else {
        ++stride;
      }
    }
  }
  // 3) Let's replace those population counts that lead to more rle codes.
  // Math here is in 24.8 fixed point representation.
  const int streak_limit = 1240;
  stride = 0;
  limit = 256 * (counts[0] + counts[1] + counts[2]) / 3 + 420;
  sum = 0;
  for (i = 0; i < length + 1; ++i) {
    // A stride ends at the array end, at a count already marked good for
    // rle, or when the current count strays too far from the running
    // average (`limit`).
    if (i == length || good_for_rle[i] ||
        (i != 0 && good_for_rle[i - 1]) ||
        abs(256 * counts[i] - limit) >= streak_limit) {
      if (stride >= 4 || (stride >= 3 && sum == 0)) {
        int k;
        // The stride must end, collapse what we have, if we have enough (4).
        int count = (sum + stride / 2) / stride;  // rounded average
        if (count < 1) {
          count = 1;
        }
        if (sum == 0) {
          // Don't make an all zeros stride to be upgraded to ones.
          count = 0;
        }
        for (k = 0; k < stride; ++k) {
          // We don't want to change value at counts[i],
          // that is already belonging to the next stride. Thus - 1.
          counts[i - k - 1] = count;
        }
      }
      stride = 0;
      sum = 0;
      if (i < length - 2) {
        // All interesting strides have a count of at least 4,
        // at least when non-zeros.
        limit = 256 * (counts[i] + counts[i + 1] + counts[i + 2]) / 3 + 420;
      } else if (i < length) {
        limit = 256 * counts[i];
      } else {
        limit = 0;
      }
    }
    ++stride;
    if (i != length) {
      sum += counts[i];
      if (stride >= 4) {
        // Track the rounded running average of the current stride.
        limit = (256 * sum + stride / 2) / stride;
      }
      if (stride == 4) {
        limit += 120;
      }
    }
  }
  free(good_for_rle);
  return 1;
}
// Scans the code-length array depth[0..length) and decides whether
// run-length coding pays off separately for zero and non-zero runs.
// A zero run qualifies at length >= 3, a non-zero run at length >= 4
// (matching the brotli repeat symbols 17 and 16); each emitted repeat
// symbol costs roughly 2 units, so a net win of more than 2 is required.
static void DecideOverRleUse(const uint8_t* depth, const int length,
                             bool *use_rle_for_non_zero,
                             bool *use_rle_for_zero) {
  int total_reps_zero = 0;
  int total_reps_non_zero = 0;
  int count_reps_zero = 0;
  int count_reps_non_zero = 0;
  // `int` indices: the previous `uint32_t i < length` mixed signedness
  // with the int parameter (and looped ~4G times for a negative length).
  for (int i = 0; i < length;) {
    const int value = depth[i];
    int reps = 1;
    for (int k = i + 1; k < length && depth[k] == value; ++k) {
      ++reps;
    }
    if (reps >= 3 && value == 0) {
      total_reps_zero += reps;
      ++count_reps_zero;
    }
    if (reps >= 4 && value != 0) {
      total_reps_non_zero += reps;
      ++count_reps_non_zero;
    }
    i += reps;
  }
  total_reps_non_zero -= count_reps_non_zero * 2;
  total_reps_zero -= count_reps_zero * 2;
  *use_rle_for_non_zero = total_reps_non_zero > 2;
  *use_rle_for_zero = total_reps_zero > 2;
}
// Serializes the code-length array depth[0..length) into the brotli
// code-length-code alphabet: literal lengths 0..15, symbol 16 ("repeat
// previous non-zero length") and symbol 17 ("repeat zero"). Symbols are
// appended to *tree and their extra bits to *extra_bits_data.
void WriteHuffmanTree(const uint8_t* depth,
                      uint32_t length,
                      std::vector<uint8_t> *tree,
                      std::vector<uint8_t> *extra_bits_data) {
  // Per the brotli format, the initial "previous code length" is 8
  // (RFC 7932), so a leading run of 8s can use repeat symbols directly.
  int previous_value = 8;
  // Throw away trailing zeros.
  // uint32_t throughout: the previous code mixed `int` counters with the
  // uint32_t `length`, triggering sign-compare issues.
  uint32_t new_length = length;
  for (uint32_t i = 0; i < length; ++i) {
    if (depth[length - i - 1] == 0) {
      --new_length;
    } else {
      break;
    }
  }
  // First gather statistics on if it is a good idea to do rle.
  bool use_rle_for_non_zero = false;
  bool use_rle_for_zero = false;
  if (length > 50) {
    // Find rle coding for longer codes.
    // Shorter codes seem not to benefit from rle.
    DecideOverRleUse(depth, new_length,
                     &use_rle_for_non_zero, &use_rle_for_zero);
  }
  // Actual rle coding.
  for (uint32_t i = 0; i < new_length;) {
    const int value = depth[i];
    int reps = 1;
    if ((value != 0 && use_rle_for_non_zero) ||
        (value == 0 && use_rle_for_zero)) {
      // Extend the run only when RLE was judged profitable for this kind
      // of value; otherwise every position is emitted individually.
      for (uint32_t k = i + 1; k < new_length && depth[k] == value; ++k) {
        ++reps;
      }
    }
    if (value == 0) {
      WriteHuffmanTreeRepetitionsZeros(reps, tree, extra_bits_data);
    } else {
      WriteHuffmanTreeRepetitions(previous_value,
                                  value, reps, tree, extra_bits_data);
      previous_value = value;
    }
    i += reps;
  }
}
namespace {
// Returns `bits` with its lowest num_bits bits mirrored (bit 0 becomes
// bit num_bits-1 and so on), using a nibble-reversal lookup table.
uint16_t ReverseBits(int num_bits, uint16_t bits) {
  static const size_t kNibbleReversed[16] = {  // Pre-reversed 4-bit values.
    0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
    0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
  };
  size_t reversed = kNibbleReversed[bits & 0xf];
  for (int shift = 4; shift < num_bits; shift += 4) {
    reversed <<= 4;
    bits >>= 4;
    reversed |= kNibbleReversed[bits & 0xf];
  }
  // A whole number of nibbles was reversed; drop the excess low bits when
  // num_bits is not a multiple of 4.
  reversed >>= (-num_bits & 0x3);
  return reversed;
}
} // namespace
// Computes canonical Huffman code words from the code lengths in depth[]
// and stores them bit-reversed (the bit stream is written LSB-first) into
// bits[]. Entries with depth 0 (unused symbols) are left untouched.
void ConvertBitDepthsToSymbols(const uint8_t *depth, int len, uint16_t *bits) {
  // In Brotli, all bit depths are [1..15]
  // 0 bit depth means that the symbol does not exist.
  const int kMaxBits = 16;  // 0..15 are values for bits
  // Histogram of code lengths.
  uint16_t bl_count[kMaxBits] = { 0 };
  for (int i = 0; i < len; ++i) {
    ++bl_count[depth[i]];
  }
  bl_count[0] = 0;  // depth 0 gets no codes
  // First canonical code of each length: previous first code plus the
  // previous length's population, shifted up one bit.
  uint16_t next_code[kMaxBits];
  next_code[0] = 0;
  int code = 0;
  for (int nbits = 1; nbits < kMaxBits; ++nbits) {
    code = (code + bl_count[nbits - 1]) << 1;
    next_code[nbits] = code;
  }
  // Hand out consecutive codes per length, reversed for LSB-first output.
  for (int i = 0; i < len; ++i) {
    if (depth[i]) {
      bits[i] = ReverseBits(depth[i], next_code[depth[i]]++);
    }
  }
}
} // namespace brotli

View File

@ -0,0 +1,88 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Entropy encoding (Huffman) utilities.
#ifndef BROTLI_ENC_ENTROPY_ENCODE_H_
#define BROTLI_ENC_ENTROPY_ENCODE_H_
#include <stdint.h>
#include <string.h>
#include <vector>
#include "./histogram.h"
#include "./prefix.h"
namespace brotli {
// This function will create a Huffman tree.
//
// The (data,length) contains the population counts.
// The tree_limit is the maximum bit depth of the Huffman codes.
//
// The depth contains the tree, i.e., how many bits are used for
// the symbol.
//
// See http://en.wikipedia.org/wiki/Huffman_coding
void CreateHuffmanTree(const int *data,
const int length,
const int tree_limit,
uint8_t *depth);
// Change the population counts in a way that the consequent
// Hufmann tree compression, especially its rle-part will be more
// likely to compress this data more efficiently.
//
// length contains the size of the histogram.
// counts contains the population counts.
int OptimizeHuffmanCountsForRle(int length, int* counts);
// Write a huffman tree from bit depths into the bitstream representation
// of a Huffman tree. The generated Huffman tree is to be compressed once
// more using a Huffman tree
void WriteHuffmanTree(const uint8_t* depth,
uint32_t num,
std::vector<uint8_t> *tree,
std::vector<uint8_t> *extra_bits_data);
// Get the actual bit values for a tree of bit depths.
void ConvertBitDepthsToSymbols(const uint8_t *depth, int len, uint16_t *bits);
// A Huffman code over an alphabet of kSize symbols: per-symbol bit depths
// and code words, plus summary data about which symbols are used.
template<int kSize>
struct EntropyCode {
  // How many bits for symbol.
  uint8_t depth_[kSize];
  // Actual bits used to represent the symbol.
  uint16_t bits_[kSize];
  // How many non-zero depth.
  int count_;
  // First four symbols with non-zero depth.
  int symbols_[4];
};
static const int kCodeLengthCodes = 18;
// Literal entropy code.
typedef EntropyCode<256> EntropyCodeLiteral;
// Prefix entropy codes.
typedef EntropyCode<kNumCommandPrefixes> EntropyCodeCommand;
typedef EntropyCode<kNumDistancePrefixes> EntropyCodeDistance;
typedef EntropyCode<kNumBlockLenPrefixes> EntropyCodeBlockLength;
// Context map entropy code, 256 Huffman tree indexes + 16 run length codes.
typedef EntropyCode<272> EntropyCodeContextMap;
// Block type entropy code, 256 block types + 2 special symbols.
typedef EntropyCode<258> EntropyCodeBlockType;
} // namespace brotli
#endif // BROTLI_ENC_ENTROPY_ENCODE_H_

View File

@ -0,0 +1,179 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Utilities for fast computation of logarithms.
#ifndef BROTLI_ENC_FAST_LOG_H_
#define BROTLI_ENC_FAST_LOG_H_
#include <assert.h>
#include <math.h>
#include <stdint.h>
namespace brotli {
// Returns floor(log2(n)) for positive integer n, and -1 iff n == 0.
inline int Log2Floor(uint32_t n) {
#if defined(__clang__) || \
    (defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4))
  // __builtin_clz is undefined for 0, so handle that case explicitly.
  return n == 0 ? -1 : 31 ^ __builtin_clz(n);
#else
  // Portable fallback: binary search over the bit width, shifting the
  // value down by 16, 8, 4, 2, 1 whenever the upper part is non-empty.
  if (n == 0)
    return -1;
  int result = 0;
  uint32_t remaining = n;
  for (int step = 4; step >= 0; --step) {
    const int shift = 1 << step;
    const uint32_t upper = remaining >> shift;
    if (upper != 0) {
      remaining = upper;
      result += shift;
    }
  }
  assert(remaining == 1);
  return result;
#endif
}
// Returns floor(log2(n)); n must be non-zero (undefined for n == 0).
static inline int Log2FloorNonZero(uint32_t n) {
#ifdef __GNUC__
  return 31 ^ __builtin_clz(n);
#else
  // Count how many times n can be halved before reaching 1.
  int log = 0;
  for (uint32_t v = n >> 1; v != 0; v >>= 1) {
    ++log;
  }
  return log;
#endif
}
// Returns ceiling(log2(n)) for positive integer n, and -1 iff n == 0.
inline int Log2Ceiling(uint32_t n) {
  const int floor_log = Log2Floor(n);
  // (n & (n - 1)) == 0 holds exactly for zero and for powers of two;
  // in both cases the floor already equals the ceiling.
  const bool zero_or_pow2 = (n & (n - 1)) == 0;
  return zero_or_pow2 ? floor_log : floor_log + 1;
}
// A lookup table for small values of log2(int) to be used in entropy
// computation.
//
// ", ".join(["%.16ff" % x for x in [0.0]+[log2(x) for x in range(1, 256)]])
static const float kLog2Table[] = {
0.0000000000000000f, 0.0000000000000000f, 1.0000000000000000f,
1.5849625007211563f, 2.0000000000000000f, 2.3219280948873622f,
2.5849625007211561f, 2.8073549220576042f, 3.0000000000000000f,
3.1699250014423126f, 3.3219280948873626f, 3.4594316186372978f,
3.5849625007211565f, 3.7004397181410922f, 3.8073549220576037f,
3.9068905956085187f, 4.0000000000000000f, 4.0874628412503400f,
4.1699250014423122f, 4.2479275134435852f, 4.3219280948873626f,
4.3923174227787607f, 4.4594316186372973f, 4.5235619560570131f,
4.5849625007211570f, 4.6438561897747244f, 4.7004397181410926f,
4.7548875021634691f, 4.8073549220576037f, 4.8579809951275728f,
4.9068905956085187f, 4.9541963103868758f, 5.0000000000000000f,
5.0443941193584534f, 5.0874628412503400f, 5.1292830169449664f,
5.1699250014423122f, 5.2094533656289501f, 5.2479275134435852f,
5.2854022188622487f, 5.3219280948873626f, 5.3575520046180838f,
5.3923174227787607f, 5.4262647547020979f, 5.4594316186372973f,
5.4918530963296748f, 5.5235619560570131f, 5.5545888516776376f,
5.5849625007211570f, 5.6147098441152083f, 5.6438561897747244f,
5.6724253419714961f, 5.7004397181410926f, 5.7279204545631996f,
5.7548875021634691f, 5.7813597135246599f, 5.8073549220576046f,
5.8328900141647422f, 5.8579809951275719f, 5.8826430493618416f,
5.9068905956085187f, 5.9307373375628867f, 5.9541963103868758f,
5.9772799234999168f, 6.0000000000000000f, 6.0223678130284544f,
6.0443941193584534f, 6.0660891904577721f, 6.0874628412503400f,
6.1085244567781700f, 6.1292830169449672f, 6.1497471195046822f,
6.1699250014423122f, 6.1898245588800176f, 6.2094533656289510f,
6.2288186904958804f, 6.2479275134435861f, 6.2667865406949019f,
6.2854022188622487f, 6.3037807481771031f, 6.3219280948873617f,
6.3398500028846252f, 6.3575520046180847f, 6.3750394313469254f,
6.3923174227787598f, 6.4093909361377026f, 6.4262647547020979f,
6.4429434958487288f, 6.4594316186372982f, 6.4757334309663976f,
6.4918530963296748f, 6.5077946401986964f, 6.5235619560570131f,
6.5391588111080319f, 6.5545888516776376f, 6.5698556083309478f,
6.5849625007211561f, 6.5999128421871278f, 6.6147098441152092f,
6.6293566200796095f, 6.6438561897747253f, 6.6582114827517955f,
6.6724253419714952f, 6.6865005271832185f, 6.7004397181410917f,
6.7142455176661224f, 6.7279204545631988f, 6.7414669864011465f,
6.7548875021634691f, 6.7681843247769260f, 6.7813597135246599f,
6.7944158663501062f, 6.8073549220576037f, 6.8201789624151887f,
6.8328900141647422f, 6.8454900509443757f, 6.8579809951275719f,
6.8703647195834048f, 6.8826430493618416f, 6.8948177633079437f,
6.9068905956085187f, 6.9188632372745955f, 6.9307373375628867f,
6.9425145053392399f, 6.9541963103868758f, 6.9657842846620879f,
6.9772799234999168f, 6.9886846867721664f, 7.0000000000000000f,
7.0112272554232540f, 7.0223678130284544f, 7.0334230015374501f,
7.0443941193584534f, 7.0552824355011898f, 7.0660891904577721f,
7.0768155970508317f, 7.0874628412503400f, 7.0980320829605272f,
7.1085244567781700f, 7.1189410727235076f, 7.1292830169449664f,
7.1395513523987937f, 7.1497471195046822f, 7.1598713367783891f,
7.1699250014423130f, 7.1799090900149345f, 7.1898245588800176f,
7.1996723448363644f, 7.2094533656289492f, 7.2191685204621621f,
7.2288186904958804f, 7.2384047393250794f, 7.2479275134435861f,
7.2573878426926521f, 7.2667865406949019f, 7.2761244052742384f,
7.2854022188622487f, 7.2946207488916270f, 7.3037807481771031f,
7.3128829552843557f, 7.3219280948873617f, 7.3309168781146177f,
7.3398500028846243f, 7.3487281542310781f, 7.3575520046180847f,
7.3663222142458151f, 7.3750394313469254f, 7.3837042924740528f,
7.3923174227787607f, 7.4008794362821844f, 7.4093909361377026f,
7.4178525148858991f, 7.4262647547020979f, 7.4346282276367255f,
7.4429434958487288f, 7.4512111118323299f, 7.4594316186372973f,
7.4676055500829976f, 7.4757334309663976f, 7.4838157772642564f,
7.4918530963296748f, 7.4998458870832057f, 7.5077946401986964f,
7.5156998382840436f, 7.5235619560570131f, 7.5313814605163119f,
7.5391588111080319f, 7.5468944598876373f, 7.5545888516776376f,
7.5622424242210728f, 7.5698556083309478f, 7.5774288280357487f,
7.5849625007211561f, 7.5924570372680806f, 7.5999128421871278f,
7.6073303137496113f, 7.6147098441152075f, 7.6220518194563764f,
7.6293566200796095f, 7.6366246205436488f, 7.6438561897747244f,
7.6510516911789290f, 7.6582114827517955f, 7.6653359171851765f,
7.6724253419714952f, 7.6794800995054464f, 7.6865005271832185f,
7.6934869574993252f, 7.7004397181410926f, 7.7073591320808825f,
7.7142455176661224f, 7.7210991887071856f, 7.7279204545631996f,
7.7347096202258392f, 7.7414669864011465f, 7.7481928495894596f,
7.7548875021634691f, 7.7615512324444795f, 7.7681843247769260f,
7.7747870596011737f, 7.7813597135246608f, 7.7879025593914317f,
7.7944158663501062f, 7.8008998999203047f, 7.8073549220576037f,
7.8137811912170374f, 7.8201789624151887f, 7.8265484872909159f,
7.8328900141647422f, 7.8392037880969445f, 7.8454900509443757f,
7.8517490414160571f, 7.8579809951275719f, 7.8641861446542798f,
7.8703647195834048f, 7.8765169465650002f, 7.8826430493618425f,
7.8887432488982601f, 7.8948177633079446f, 7.9008668079807496f,
7.9068905956085187f, 7.9128893362299619f, 7.9188632372745955f,
7.9248125036057813f, 7.9307373375628867f, 7.9366379390025719f,
7.9425145053392399f, 7.9483672315846778f, 7.9541963103868758f,
7.9600019320680806f, 7.9657842846620870f, 7.9715435539507720f,
7.9772799234999168f, 7.9829935746943104f, 7.9886846867721664f,
7.9943534368588578f
};
// Faster logarithm for small integers, with the property of log2(0) == 0.
//
// Values below 256 are answered from the kLog2Table lookup table; larger
// values fall back to the C library.
// NOTE(review): a negative v would index kLog2Table out of bounds —
// callers are presumably expected to pass v >= 0; confirm at call sites.
static inline double FastLog2(int v) {
  if (v < (int)(sizeof(kLog2Table) / sizeof(kLog2Table[0]))) {
    return kLog2Table[v];
  }
#if defined(_MSC_VER) && _MSC_VER <= 1600
  // Visual Studio 2010 does not have the log2() function defined, so we use
  // log() and a multiplication instead.
  static const double kLog2Inv = 1.4426950408889634f;
  return log(static_cast<double>(v)) * kLog2Inv;
#else
  return log2(static_cast<double>(v));
#endif
}
} // namespace brotli
#endif // BROTLI_ENC_FAST_LOG_H_

View File

@ -0,0 +1,87 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Function to find maximal matching prefixes of strings.
#ifndef BROTLI_ENC_FIND_MATCH_LENGTH_H_
#define BROTLI_ENC_FIND_MATCH_LENGTH_H_
#include <stdint.h>
#include <stddef.h>
#include "./port.h"
namespace brotli {
// Separate implementation for little-endian 64-bit targets, for speed.
#if defined(__GNUC__) && defined(_LP64) && defined(IS_LITTLE_ENDIAN)
// Returns the number of leading bytes that s1 and s2 have in common,
// comparing at most `limit` bytes.
//
// Fast path: compare 8 bytes at a time; when a differing 64-bit block is
// found, the number of matching bytes within it equals the number of
// trailing zero bits of the XOR divided by 8 (little-endian only).
static inline int FindMatchLengthWithLimit(const uint8_t* s1,
                                           const uint8_t* s2,
                                           size_t limit) {
  int matched = 0;
  size_t limit2 = (limit >> 3) + 1;  // + 1 is for pre-decrement in while
  while (PREDICT_TRUE(--limit2)) {
    if (PREDICT_FALSE(BROTLI_UNALIGNED_LOAD64(s2) ==
                      BROTLI_UNALIGNED_LOAD64(s1 + matched))) {
      s2 += 8;
      matched += 8;
    } else {
      // First mismatching 64-bit block: locate the first differing byte.
      uint64_t x =
          BROTLI_UNALIGNED_LOAD64(s2) ^ BROTLI_UNALIGNED_LOAD64(s1 + matched);
      int matching_bits = __builtin_ctzll(x);
      matched += matching_bits >> 3;
      return matched;
    }
  }
  // Compare the remaining (limit % 8) tail bytes one at a time.
  limit = (limit & 7) + 1;  // + 1 is for pre-decrement in while
  while (--limit) {
    if (PREDICT_TRUE(s1[matched] == *s2)) {
      ++s2;
      ++matched;
    } else {
      return matched;
    }
  }
  return matched;
}
#else
// Returns the number of leading bytes that s1 and s2 have in common,
// comparing at most `limit` bytes (portable variant).
static inline int FindMatchLengthWithLimit(const uint8_t* s1,
                                           const uint8_t* s2,
                                           size_t limit) {
  int matched = 0;
  const uint8_t* s2_limit = s2 + limit;
  const uint8_t* s2_ptr = s2;
  // Find out how long the match is. We loop over the data 32 bits at a
  // time until we find a 32-bit block that doesn't match; then we find
  // the first non-matching bit and use that to calculate the total
  // length of the match.
  while (s2_ptr <= s2_limit - 4 &&
         BROTLI_UNALIGNED_LOAD32(s2_ptr) ==
         BROTLI_UNALIGNED_LOAD32(s1 + matched)) {
    s2_ptr += 4;
    matched += 4;
  }
  // Finish the tail (fewer than 4 remaining bytes) one byte at a time.
  while ((s2_ptr < s2_limit) && (s1[matched] == *s2_ptr)) {
    ++s2_ptr;
    ++matched;
  }
  return matched;
}
#endif
} // namespace brotli
#endif // BROTLI_ENC_FIND_MATCH_LENGTH_H_

View File

@ -0,0 +1,634 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// A (forgetful) hash table to the data seen by the compressor, to
// help create backward references to previous data.
#ifndef BROTLI_ENC_HASH_H_
#define BROTLI_ENC_HASH_H_
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <algorithm>
#include <cstdlib>
#include <memory>
#include <string>
#include "./dictionary_hash.h"
#include "./fast_log.h"
#include "./find_match_length.h"
#include "./port.h"
#include "./prefix.h"
#include "./static_dict.h"
#include "./transform.h"
namespace brotli {
static const int kDistanceCacheIndex[] = {
0, 1, 2, 3, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
};
static const int kDistanceCacheOffset[] = {
0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
};
// kHashMul32 multiplier has these properties:
// * The multiplier must be odd. Otherwise we may lose the highest bit.
// * No long streaks of 1s or 0s.
// * There is no effort to ensure that it is a prime, the oddity is enough
// for this use.
// * The number has been tuned heuristically against compression benchmarks.
static const uint32_t kHashMul32 = 0x1e35a7bd;
// Computes a kShiftBits-wide hash of the bytes at data.
// For kMinLength of 2 or 3 only the first 3 bytes participate in the
// hash; for longer minimum match lengths all 4 loaded bytes are used.
template<int kShiftBits, int kMinLength>
inline uint32_t Hash(const uint8_t *data) {
  uint32_t word = BROTLI_UNALIGNED_LOAD32(data);
  if (kMinLength <= 3) {
    word &= 0xffffff;
  }
  const uint32_t h = word * kHashMul32;
  // The higher bits contain more mixture from the multiplication,
  // so we take our results from there.
  return h >> (32 - kShiftBits);
}
// Usually, we always choose the longest backward reference. This function
// allows for the exception of that rule.
//
// If we choose a backward reference that is further away, it will
// usually be coded with more bits. We approximate this by assuming
// log2(distance). If the distance can be expressed in terms of the
// last four distances, we use some heuristic constants to estimate
// the bits cost. For the first up to four literals we use the bit
// cost of the literals from the literal cost model, after that we
// use the average bit cost of the cost model.
//
// This function is used to sometimes discard a longer backward reference
// when it is not much longer and the bit cost for encoding it is more
// than the saved literals.
inline double BackwardReferenceScore(double average_cost,
                                     int copy_length,
                                     int backward_reference_offset) {
  // Saved literal cost minus an approximate cost of coding the distance.
  const double literal_gain = average_cost * copy_length;
  const double distance_penalty =
      1.20 * Log2Floor(backward_reference_offset);
  return literal_gain - distance_penalty;
}
// Scores a backward reference whose distance is one of the 16 short
// distance codes (reuse of, or small delta against, a recent distance).
inline double BackwardReferenceScoreUsingLastDistance(double average_cost,
                                                      int copy_length,
                                                      int distance_short_code) {
  // Heuristic bit-cost estimates for each of the 16 short distance codes;
  // code 0 (exact repeat of the last distance) is actively rewarded.
  static const double kDistanceShortCodeBitCost[16] = {
    -0.6, 0.95, 1.17, 1.27,
    0.93, 0.93, 0.96, 0.96, 0.99, 0.99,
    1.05, 1.05, 1.15, 1.15, 1.25, 1.25
  };
  const double literal_gain = average_cost * copy_length;
  return literal_gain - kDistanceShortCodeBitCost[distance_short_code];
}
// A (forgetful) hash table to the data seen by the compressor, to
// help create backward references to previous data.
//
// This is a hash map of fixed size (kBucketSize). Starting from the
// given index, kBucketSweep buckets are used to store values of a key.
template <int kBucketBits, int kBucketSweep>
class HashLongestMatchQuickly {
 public:
  HashLongestMatchQuickly() {
    Reset();
  }
  void Reset() {
    // It is not strictly necessary to fill this buffer here, but
    // not filling will make the results of the compression stochastic
    // (but correct). This is because random data would cause the
    // system to find accidentally good backward references here and there.
    std::fill(&buckets_[0],
              &buckets_[sizeof(buckets_) / sizeof(buckets_[0])],
              0);
    num_dict_lookups_ = 0;
    num_dict_matches_ = 0;
  }
  // Look at 4 bytes at data.
  // Compute a hash from these, and store the value somewhere within
  // [ix .. ix+3].
  inline void Store(const uint8_t *data, const int ix) {
    const uint32_t key = Hash<kBucketBits, 4>(data);
    // Wiggle the value with the bucket sweep range.
    const uint32_t off = (static_cast<uint32_t>(ix) >> 3) % kBucketSweep;
    buckets_[key + off] = ix;
  }
  // Store hashes for a range of data.
  void StoreHashes(const uint8_t *data, size_t len, int startix, int mask) {
    for (int p = 0; p < len; ++p) {
      Store(&data[p & mask], startix + p);
    }
  }
  // This hasher never uses an external static dictionary object.
  bool HasStaticDictionary() const { return false; }
  // Find a longest backward match of &ring_buffer[cur_ix & ring_buffer_mask]
  // up to the length of max_length.
  //
  // Does not look for matches longer than max_length.
  // Does not look for matches further away than max_backward.
  // Writes the best found match length into best_len_out.
  // Writes the index (&data[index]) of the start of the best match into
  // best_distance_out.
  inline bool FindLongestMatch(const uint8_t * __restrict ring_buffer,
                               const size_t ring_buffer_mask,
                               const float* __restrict literal_cost,
                               const size_t literal_cost_mask,
                               const double average_cost,
                               const int* __restrict distance_cache,
                               const uint32_t cur_ix,
                               const uint32_t max_length,
                               const uint32_t max_backward,
                               int * __restrict best_len_out,
                               int * __restrict best_len_code_out,
                               int * __restrict best_distance_out,
                               double* __restrict best_score_out) {
    const int best_len_in = *best_len_out;
    const int cur_ix_masked = cur_ix & ring_buffer_mask;
    // Byte just past the current best match; a cheap pre-filter, since any
    // longer match must agree at this position.
    int compare_char = ring_buffer[cur_ix_masked + best_len_in];
    double best_score = *best_score_out;
    int best_len = best_len_in;
    // First try a repeat of the most recent distance (distance_cache[0]),
    // which is cheaper to encode than an arbitrary distance.
    int backward = distance_cache[0];
    size_t prev_ix = cur_ix - backward;
    bool match_found = false;
    if (prev_ix < cur_ix) {
      prev_ix &= ring_buffer_mask;
      if (compare_char == ring_buffer[prev_ix + best_len]) {
        int len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                           &ring_buffer[cur_ix_masked],
                                           max_length);
        if (len >= 4) {
          best_score = BackwardReferenceScoreUsingLastDistance(average_cost,
                                                               len, 0);
          best_len = len;
          *best_len_out = len;
          *best_len_code_out = len;
          *best_distance_out = backward;
          *best_score_out = best_score;
          compare_char = ring_buffer[cur_ix_masked + best_len];
          if (kBucketSweep == 1) {
            // Single-candidate configuration: accept immediately.
            return true;
          } else {
            match_found = true;
          }
        }
      }
    }
    const uint32_t key = Hash<kBucketBits, 4>(&ring_buffer[cur_ix_masked]);
    if (kBucketSweep == 1) {
      // Only one to look for, don't bother to prepare for a loop.
      prev_ix = buckets_[key];
      backward = cur_ix - prev_ix;
      prev_ix &= ring_buffer_mask;
      if (compare_char != ring_buffer[prev_ix + best_len_in]) {
        return false;
      }
      if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
        return false;
      }
      const int len = FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                               &ring_buffer[cur_ix_masked],
                                               max_length);
      if (len >= 4) {
        *best_len_out = len;
        *best_len_code_out = len;
        *best_distance_out = backward;
        *best_score_out = BackwardReferenceScore(average_cost, len, backward);
        return true;
      } else {
        return false;
      }
    } else {
      // Sweep over the kBucketSweep candidate positions stored for this key.
      uint32_t *bucket = buckets_ + key;
      prev_ix = *bucket++;
      for (int i = 0; i < kBucketSweep; ++i, prev_ix = *bucket++) {
        const int backward = cur_ix - prev_ix;
        prev_ix &= ring_buffer_mask;
        if (compare_char != ring_buffer[prev_ix + best_len]) {
          continue;
        }
        if (PREDICT_FALSE(backward == 0 || backward > max_backward)) {
          continue;
        }
        const int len =
            FindMatchLengthWithLimit(&ring_buffer[prev_ix],
                                     &ring_buffer[cur_ix_masked],
                                     max_length);
        if (len >= 4) {
          const double score = BackwardReferenceScore(average_cost,
                                                      len, backward);
          if (best_score < score) {
            best_score = score;
            best_len = len;
            *best_len_out = best_len;
            *best_len_code_out = best_len;
            *best_distance_out = backward;
            *best_score_out = score;
            compare_char = ring_buffer[cur_ix_masked + best_len];
            match_found = true;
          }
        }
      }
      // If nothing was found, try the built-in static dictionary, but only
      // while the observed hit rate stays above ~1/128 of the lookups.
      if (!match_found && num_dict_matches_ >= (num_dict_lookups_ >> 7)) {
        ++num_dict_lookups_;
        const uint32_t key = Hash<14, 4>(&ring_buffer[cur_ix_masked]) << 1;
        const uint16_t v = kStaticDictionaryHash[key];
        if (v > 0) {
          const int len = v & 31;   // low 5 bits: dictionary word length
          const int dist = v >> 5;  // remaining bits: index within that length
          const int offset = kBrotliDictionaryOffsetsByLength[len] + len * dist;
          if (len <= max_length) {
            const int matchlen =
                FindMatchLengthWithLimit(&ring_buffer[cur_ix_masked],
                                         &kBrotliDictionary[offset], len);
            if (matchlen == len) {
              // Dictionary words are addressed just beyond max_backward.
              const size_t backward = max_backward + dist + 1;
              const double score = BackwardReferenceScore(average_cost,
                                                          len, backward);
              if (best_score < score) {
                ++num_dict_matches_;
                best_score = score;
                best_len = len;
                *best_len_out = best_len;
                *best_len_code_out = best_len;
                *best_distance_out = backward;
                *best_score_out = best_score;
                return true;
              }
            }
          }
        }
      }
      return match_found;
    }
  }
 private:
  // Number of hash buckets.
  static const uint32_t kBucketSize = 1 << kBucketBits;
  // One stored position per bucket; the extra kBucketSweep slots keep the
  // sweep in Store()/FindLongestMatch() within bounds.
  uint32_t buckets_[kBucketSize + kBucketSweep];
  // Counters used to throttle static dictionary lookups (see above).
  size_t num_dict_lookups_;
  size_t num_dict_matches_;
};
// A (forgetful) hash table to the data seen by the compressor, to
// help create backward references to previous data.
//
// This is a hash map of fixed size (kBucketSize) to a ring buffer of
// fixed size (kBlockSize). The ring buffer contains the last kBlockSize
// index positions of the given hash key in the compressed data.
template <int kBucketBits,
          int kBlockBits,
          int kMinLength,
          int kNumLastDistancesToCheck,
          bool kUseCostModel,
          bool kUseDictionary>
class HashLongestMatch {
 public:
  HashLongestMatch() : static_dict_(NULL) {
    Reset();
  }
  // Clears the per-bucket entry counters; bucket contents become
  // unreachable and are thereby forgotten.
  void Reset() {
    std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
    num_dict_lookups_ = 0;
    num_dict_matches_ = 0;
  }
  void SetStaticDictionary(const StaticDictionary *dict) {
    static_dict_ = dict;
  }
  bool HasStaticDictionary() const {
    return static_dict_ != NULL;
  }
  // Look at 3 bytes at data.
  // Compute a hash from these, and store the value of ix at that position.
  inline void Store(const uint8_t *data, const int ix) {
    const uint32_t key = Hash<kBucketBits, kMinLength>(data);
    const int minor_ix = num_[key] & kBlockMask;
    buckets_[key][minor_ix] = ix;
    ++num_[key];
  }
  // Store hashes for a range of data.
  void StoreHashes(const uint8_t *data, size_t len, int startix, int mask) {
    for (int p = 0; p < len; ++p) {
      Store(&data[p & mask], startix + p);
    }
  }
  // Find a longest backward match of &data[cur_ix] up to the length of
  // max_length.
  //
  // Does not look for matches longer than max_length.
  // Does not look for matches further away than max_backward.
  // Writes the best found match length into best_len_out.
  // Writes the index (&data[index]) offset from the start of the best match
  // into best_distance_out.
  // Write the score of the best match into best_score_out.
  bool FindLongestMatch(const uint8_t * __restrict data,
                        const size_t ring_buffer_mask,
                        const float * __restrict literal_cost,
                        const size_t literal_cost_mask,
                        const double average_cost,
                        const int* __restrict distance_cache,
                        const uint32_t cur_ix,
                        uint32_t max_length,
                        const uint32_t max_backward,
                        int * __restrict best_len_out,
                        int * __restrict best_len_code_out,
                        int * __restrict best_distance_out,
                        double * __restrict best_score_out) {
    *best_len_code_out = 0;
    const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
    // When the cost model is enabled, these hold the difference between the
    // modeled cost of the first 2/3/4 literals at cur_ix and the average
    // cost; they bias the score of short matches that start here.
    double start_cost_diff4 = 0.0;
    double start_cost_diff3 = 0.0;
    double start_cost_diff2 = 0.0;
    if (kUseCostModel) {
      start_cost_diff4 = literal_cost == NULL ? 0 :
          literal_cost[cur_ix & literal_cost_mask] +
          literal_cost[(cur_ix + 1) & literal_cost_mask] +
          literal_cost[(cur_ix + 2) & literal_cost_mask] +
          literal_cost[(cur_ix + 3) & literal_cost_mask] -
          4 * average_cost;
      start_cost_diff3 = literal_cost == NULL ? 0 :
          literal_cost[cur_ix & literal_cost_mask] +
          literal_cost[(cur_ix + 1) & literal_cost_mask] +
          literal_cost[(cur_ix + 2) & literal_cost_mask] -
          3 * average_cost + 0.3;
      start_cost_diff2 = literal_cost == NULL ? 0 :
          literal_cost[cur_ix & literal_cost_mask] +
          literal_cost[(cur_ix + 1) & literal_cost_mask] -
          2 * average_cost + 1.2;
    }
    bool match_found = false;
    // Don't accept a short copy from far away.
    double best_score = *best_score_out;
    int best_len = *best_len_out;
    *best_len_out = 0;
    // Try last distance first.
    for (int i = 0; i < kNumLastDistancesToCheck; ++i) {
      const int idx = kDistanceCacheIndex[i];
      const int backward = distance_cache[idx] + kDistanceCacheOffset[i];
      size_t prev_ix = cur_ix - backward;
      // Also rejects backward <= 0 (prev_ix wraps to >= cur_ix).
      if (prev_ix >= cur_ix) {
        continue;
      }
      if (PREDICT_FALSE(backward > max_backward)) {
        continue;
      }
      prev_ix &= ring_buffer_mask;
      if (cur_ix_masked + best_len > ring_buffer_mask ||
          prev_ix + best_len > ring_buffer_mask ||
          data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
        continue;
      }
      const size_t len =
          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                   max_length);
      if (len >= std::max(kMinLength, 3) ||
          (kMinLength == 2 && len == 2 && i < 2)) {
        // Comparing for >= 2 does not change the semantics, but just saves for
        // a few unnecessary binary logarithms in backward reference score,
        // since we are not interested in such short matches.
        double score = BackwardReferenceScoreUsingLastDistance(
            average_cost, len, i);
        if (kUseCostModel) {
          switch (len) {
            case 2: score += start_cost_diff2; break;
            case 3: score += start_cost_diff3; break;
            default: score += start_cost_diff4;
          }
        }
        if (best_score < score) {
          best_score = score;
          best_len = len;
          *best_len_out = best_len;
          *best_len_code_out = best_len;
          *best_distance_out = backward;
          *best_score_out = best_score;
          match_found = true;
        }
      }
    }
    if (kMinLength == 2) {
      // Exhaustively scan the previous 63 positions for 2-byte matches.
      int stop = int(cur_ix) - 64;
      if (stop < 0) { stop = 0; }
      start_cost_diff2 -= 1.0;
      for (int i = cur_ix - 1; i > stop; --i) {
        size_t prev_ix = i;
        const size_t backward = cur_ix - prev_ix;
        if (PREDICT_FALSE(backward > max_backward)) {
          break;
        }
        prev_ix &= ring_buffer_mask;
        if (data[cur_ix_masked] != data[prev_ix] ||
            data[cur_ix_masked + 1] != data[prev_ix + 1]) {
          continue;
        }
        int len = 2;
        const double score =
            average_cost * 2 - 2.3 * Log2Floor(backward) + start_cost_diff2;
        if (best_score < score) {
          best_score = score;
          best_len = len;
          *best_len_out = best_len;
          *best_len_code_out = best_len;
          *best_distance_out = backward;
          // Fix: keep the reported score in sync with the accepted match,
          // as every other accept-branch in this function does; otherwise
          // the caller's lazy-matching comparison can see a stale score.
          *best_score_out = best_score;
          match_found = true;
        }
      }
    }
    // Scan the (up to kBlockSize) most recent positions with this hash key,
    // newest first.
    const uint32_t key = Hash<kBucketBits, kMinLength>(&data[cur_ix_masked]);
    const int * __restrict const bucket = &buckets_[key][0];
    const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
    for (int i = num_[key] - 1; i >= down; --i) {
      int prev_ix = bucket[i & kBlockMask];
      if (prev_ix >= 0) {
        const size_t backward = cur_ix - prev_ix;
        if (PREDICT_FALSE(backward > max_backward)) {
          break;
        }
        prev_ix &= ring_buffer_mask;
        if (cur_ix_masked + best_len > ring_buffer_mask ||
            prev_ix + best_len > ring_buffer_mask ||
            data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
          continue;
        }
        const size_t len =
            FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                     max_length);
        if (len >= std::max(kMinLength, 3)) {
          // Comparing for >= 3 does not change the semantics, but just saves
          // for a few unnecessary binary logarithms in backward reference
          // score, since we are not interested in such short matches.
          double score = BackwardReferenceScore(average_cost,
                                                len, backward);
          if (kUseCostModel) {
            score += (len >= 4) ? start_cost_diff4 : start_cost_diff3;
          }
          if (best_score < score) {
            best_score = score;
            best_len = len;
            *best_len_out = best_len;
            *best_len_code_out = best_len;
            *best_distance_out = backward;
            *best_score_out = best_score;
            match_found = true;
          }
        }
      }
    }
    // If nothing was found, probe the built-in static dictionary hash, but
    // only while the observed hit rate stays above ~1/128 of the lookups.
    if (!match_found && num_dict_matches_ >= (num_dict_lookups_ >> 7)) {
      uint32_t key = Hash<14, 4>(&data[cur_ix_masked]) << 1;
      for (int k = 0; k < 2; ++k, ++key) {
        ++num_dict_lookups_;
        const uint16_t v = kStaticDictionaryHash[key];
        if (v > 0) {
          const int len = v & 31;   // low 5 bits: dictionary word length
          const int dist = v >> 5;  // remaining bits: index within that length
          const int offset = kBrotliDictionaryOffsetsByLength[len] + len * dist;
          if (len <= max_length) {
            const int matchlen =
                FindMatchLengthWithLimit(&data[cur_ix_masked],
                                         &kBrotliDictionary[offset], len);
            if (matchlen == len) {
              // Dictionary words are addressed just beyond max_backward.
              const size_t backward = max_backward + dist + 1;
              double score = BackwardReferenceScore(average_cost,
                                                    len, backward);
              if (kUseCostModel) {
                score += start_cost_diff4;
              }
              if (best_score < score) {
                ++num_dict_matches_;
                best_score = score;
                best_len = len;
                *best_len_out = best_len;
                *best_len_code_out = best_len;
                *best_distance_out = backward;
                *best_score_out = best_score;
                match_found = true;
                break;
              }
            }
          }
        }
      }
    }
    // Finally, try the external word dictionary, including transformed
    // words (the stored copy-length code may differ from len).
    if (kUseDictionary && static_dict_ != NULL) {
      // We decide based on first 4 bytes how many bytes to test for.
      uint32_t prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]);
      int maxlen = static_dict_->GetLength(prefix);
      for (int len = std::min<size_t>(maxlen, max_length);
           len > best_len && len >= 4; --len) {
        std::string snippet((const char *)&data[cur_ix_masked], len);
        int copy_len_code;
        int word_id;
        if (static_dict_->Get(snippet, &copy_len_code, &word_id)) {
          const size_t backward = max_backward + word_id + 1;
          const double score = (BackwardReferenceScore(average_cost,
                                                       len, backward) +
                                start_cost_diff4);
          if (best_score < score) {
            best_score = score;
            best_len = len;
            *best_len_out = best_len;
            *best_len_code_out = copy_len_code;
            *best_distance_out = backward;
            *best_score_out = best_score;
            match_found = true;
          }
        }
      }
    }
    return match_found;
  }
 private:
  // Number of hash buckets.
  static const uint32_t kBucketSize = 1 << kBucketBits;
  // Only kBlockSize newest backward references are kept,
  // and the older are forgotten.
  static const uint32_t kBlockSize = 1 << kBlockBits;
  // Mask for accessing entries in a block (in a ringbuffer manner).
  static const uint32_t kBlockMask = (1 << kBlockBits) - 1;
  // Number of entries in a particular bucket.
  uint16_t num_[kBucketSize];
  // Buckets containing kBlockSize of backward references.
  int buckets_[kBucketSize][kBlockSize];
  // Counters used to throttle static dictionary lookups (see above).
  size_t num_dict_lookups_;
  size_t num_dict_matches_;
  const StaticDictionary *static_dict_;
};
// Holder for the nine alternative hasher configurations; Init() allocates
// exactly one of them, selected by the given hash type.
struct Hashers {
  // HashLongestMatchQuickly<bucket bits, bucket sweep>.
  typedef HashLongestMatchQuickly<16, 1> H1;
  typedef HashLongestMatchQuickly<17, 4> H2;
  // HashLongestMatch<bucket bits, block bits, min length,
  //                  last distances to check, cost model, dictionary>.
  typedef HashLongestMatch<14, 4, 4, 4, false, false> H3;
  typedef HashLongestMatch<14, 5, 4, 4, false, false> H4;
  typedef HashLongestMatch<15, 6, 4, 10, false, false> H5;
  typedef HashLongestMatch<15, 7, 4, 10, false, false> H6;
  typedef HashLongestMatch<15, 8, 4, 16, false, false> H7;
  typedef HashLongestMatch<15, 8, 4, 16, true, true> H8;
  typedef HashLongestMatch<15, 8, 2, 16, true, false> H9;
  // Allocates the hasher for the given type (1..9); other values are
  // silently ignored and leave all hashers unset.
  void Init(int type) {
    switch (type) {
      case 1: hash_h1.reset(new H1); break;
      case 2: hash_h2.reset(new H2); break;
      case 3: hash_h3.reset(new H3); break;
      case 4: hash_h4.reset(new H4); break;
      case 5: hash_h5.reset(new H5); break;
      case 6: hash_h6.reset(new H6); break;
      case 7: hash_h7.reset(new H7); break;
      case 8: hash_h8.reset(new H8); break;
      case 9: hash_h9.reset(new H9); break;
      default: break;
    }
  }
  // Only H8 is parameterized with kUseDictionary, so only it receives the
  // static dictionary.
  void SetStaticDictionary(const StaticDictionary *dict) {
    if (hash_h8.get() != NULL) hash_h8->SetStaticDictionary(dict);
  }
  std::unique_ptr<H1> hash_h1;
  std::unique_ptr<H2> hash_h2;
  std::unique_ptr<H3> hash_h3;
  std::unique_ptr<H4> hash_h4;
  std::unique_ptr<H5> hash_h5;
  std::unique_ptr<H6> hash_h6;
  std::unique_ptr<H7> hash_h7;
  std::unique_ptr<H8> hash_h8;
  std::unique_ptr<H9> hash_h9;
};
} // namespace brotli
#endif // BROTLI_ENC_HASH_H_

View File

@ -0,0 +1,76 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Build per-context histograms of literals, commands and distance codes.
#include "./histogram.h"
#include <stdint.h>
#include <cmath>
#include "./block_splitter.h"
#include "./command.h"
#include "./context.h"
#include "./prefix.h"
namespace brotli {
// Builds per-context histograms of literals, insert-and-copy codes and
// distance codes by replaying the command stream over the ring buffer.
//
// cmds, num_commands: the command stream to replay.
// literal_split, insert_and_copy_split, dist_split: block splits whose
//     iterators yield the block type of each symbol in stream order.
// ringbuffer, mask, start_pos: the input data the commands refer to.
// prev_byte, prev_byte2: the two bytes preceding start_pos, seeding the
//     literal context computation.
// context_modes: context mode per literal block type.
// The three output histogram vectors are indexed by
// (block type << context bits) + context id; the caller pre-sizes them.
void BuildHistograms(
    const Command* cmds,
    const size_t num_commands,
    const BlockSplit& literal_split,
    const BlockSplit& insert_and_copy_split,
    const BlockSplit& dist_split,
    const uint8_t* ringbuffer,
    size_t start_pos,
    size_t mask,
    uint8_t prev_byte,
    uint8_t prev_byte2,
    const std::vector<int>& context_modes,
    std::vector<HistogramLiteral>* literal_histograms,
    std::vector<HistogramCommand>* insert_and_copy_histograms,
    std::vector<HistogramDistance>* copy_dist_histograms) {
  size_t pos = start_pos;
  BlockSplitIterator literal_it(literal_split);
  BlockSplitIterator insert_and_copy_it(insert_and_copy_split);
  BlockSplitIterator dist_it(dist_split);
  // size_t index matches num_commands and avoids a signed/unsigned
  // comparison (and int overflow for very long command streams).
  for (size_t i = 0; i < num_commands; ++i) {
    const Command &cmd = cmds[i];
    insert_and_copy_it.Next();
    (*insert_and_copy_histograms)[insert_and_copy_it.type_].Add(
        cmd.cmd_prefix_);
    // Count each inserted literal in the histogram selected by its block
    // type and its context, derived from the two preceding bytes.
    for (int j = 0; j < cmd.insert_len_; ++j) {
      literal_it.Next();
      int context = (literal_it.type_ << kLiteralContextBits) +
          Context(prev_byte, prev_byte2, context_modes[literal_it.type_]);
      (*literal_histograms)[context].Add(ringbuffer[pos & mask]);
      prev_byte2 = prev_byte;
      prev_byte = ringbuffer[pos & mask];
      ++pos;
    }
    pos += cmd.copy_len_;
    if (cmd.copy_len_ > 0) {
      // After a copy, the context bytes are the last two copied bytes.
      prev_byte2 = ringbuffer[(pos - 2) & mask];
      prev_byte = ringbuffer[(pos - 1) & mask];
      // Only commands with cmd_prefix_ >= 128 contribute a distance code.
      if (cmd.cmd_prefix_ >= 128) {
        dist_it.Next();
        int context = (dist_it.type_ << kDistanceContextBits) +
            cmd.DistanceContext();
        (*copy_dist_histograms)[context].Add(cmd.dist_prefix_);
      }
    }
  }
}
} // namespace brotli

View File

@ -0,0 +1,107 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Models the histograms of literals, commands and distance codes.
#ifndef BROTLI_ENC_HISTOGRAM_H_
#define BROTLI_ENC_HISTOGRAM_H_
#include <stdint.h>
#include <string.h>
#include <vector>
#include <utility>
#include "./command.h"
#include "./fast_log.h"
#include "./prefix.h"
namespace brotli {
class BlockSplit;
// A simple container for histograms of data in blocks.
// A simple container for histograms of data in blocks.
template<int kDataSize>
struct Histogram {
  Histogram() {
    Clear();
  }
  // Resets all symbol counts and the cached bit cost to zero.
  void Clear() {
    memset(data_, 0, sizeof(data_));
    total_count_ = 0;
    bit_cost_ = 0;  // Previously left uninitialized; zero it defensively.
  }
  void Add(int val) {
    ++data_[val];
    ++total_count_;
  }
  void Remove(int val) {
    --data_[val];
    --total_count_;
  }
  // Adds n symbols read from p.
  template<typename DataType>
  void Add(const DataType *p, size_t n) {
    total_count_ += n;
    for (size_t i = 0; i < n; ++i) {
      ++data_[p[i]];
    }
  }
  // Merges the counts of v into this histogram.
  void AddHistogram(const Histogram& v) {
    total_count_ += v.total_count_;
    for (int i = 0; i < kDataSize; ++i) {
      data_[i] += v.data_[i];
    }
  }
  // Shannon entropy of the histogram in bits:
  // total * log2(total) - sum(count_i * log2(count_i)).
  double EntropyBitCost() const {
    double retval = total_count_ * FastLog2(total_count_);
    for (int i = 0; i < kDataSize; ++i) {
      retval -= data_[i] * FastLog2(data_[i]);
    }
    return retval;
  }
  int data_[kDataSize];      // Symbol counts.
  int total_count_;          // Sum of all counts.
  double bit_cost_;          // Cached cost, maintained by external code.
};
// Literal histogram.
typedef Histogram<256> HistogramLiteral;
// Prefix histograms.
typedef Histogram<kNumCommandPrefixes> HistogramCommand;
typedef Histogram<kNumDistancePrefixes> HistogramDistance;
typedef Histogram<kNumBlockLenPrefixes> HistogramBlockLength;
// Context map histogram, 256 Huffman tree indexes + 16 run length codes.
typedef Histogram<272> HistogramContextMap;
// Block type histogram, 256 block types + 2 special symbols.
typedef Histogram<258> HistogramBlockType;
static const int kLiteralContextBits = 6;
static const int kDistanceContextBits = 2;
void BuildHistograms(
const Command* cmds,
const size_t num_commands,
const BlockSplit& literal_split,
const BlockSplit& insert_and_copy_split,
const BlockSplit& dist_split,
const uint8_t* ringbuffer,
size_t pos,
size_t mask,
uint8_t prev_byte,
uint8_t prev_byte2,
const std::vector<int>& context_modes,
std::vector<HistogramLiteral>* literal_histograms,
std::vector<HistogramCommand>* insert_and_copy_histograms,
std::vector<HistogramDistance>* copy_dist_histograms);
} // namespace brotli
#endif // BROTLI_ENC_HISTOGRAM_H_

View File

@ -0,0 +1,172 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Literal cost model to allow backward reference replacement to be efficient.
#include "./literal_cost.h"
#include <math.h>
#include <stdint.h>
#include <algorithm>
#include "./fast_log.h"
namespace brotli {
// Classifies the position of the byte following `c` within a UTF-8
// sequence, given the byte `last` that preceded `c`. Returns 0 for a
// lead/ASCII position, 1 for the second byte of a multi-byte sequence,
// 2 for the third byte -- clamped to at most `clamp`.
static int UTF8Position(int last, int c, int clamp) {
  if (c < 128) {
    // ASCII: the next byte starts a fresh sequence ('Byte 1').
    return 0;
  }
  if (c >= 192) {
    // Lead byte of a multi-byte sequence: next one is 'Byte 2'.
    return std::min(1, clamp);
  }
  // Continuation byte: decide over the previous byte whether the
  // sequence has already ended.
  return (last < 0xe0) ? 0 : std::min(2, clamp);
}
// Decides the UTF-8 modeling depth for the range [pos, pos + len):
// returns 0 (single-byte/ASCII model), 1 (2-byte model) or 2 (3-byte
// model), based on how many bytes fall into each UTF-8 position class.
static int DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
                                     const uint8_t *data) {
  int counts[3] = { 0 };
  int max_utf8 = 1;  // should be 2, but 1 compresses better.
  int last_c = 0;
  int utf8_pos = 0;
  // size_t index matches len and avoids a signed/unsigned comparison.
  for (size_t i = 0; i < len; ++i) {
    int c = data[(pos + i) & mask];
    utf8_pos = UTF8Position(last_c, c, 2);
    ++counts[utf8_pos];
    last_c = c;
  }
  // Require enough evidence before enabling the deeper context models.
  if (counts[2] < 500) {
    max_utf8 = 1;
  }
  if (counts[1] + counts[2] < 25) {
    max_utf8 = 0;
  }
  return max_utf8;
}
// Estimates per-literal bit costs over [pos, pos + len) with a UTF-8 aware
// model: one sliding-window histogram is kept per UTF-8 byte-position class
// (up to max_utf8 + 1 classes) and each literal is costed against the
// histogram of its own class. Results are written to cost[(pos+i)&cost_mask].
void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
                                     size_t cost_mask, const uint8_t *data,
                                     float *cost) {
  // max_utf8 is 0 (normal ascii single byte modeling),
  // 1 (for 2-byte utf-8 modeling), or 2 (for 3-byte utf-8 modeling).
  const int max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
  int histogram[3][256] = { { 0 } };
  int window_half = 495;
  int in_window = std::min(static_cast<size_t>(window_half), len);
  int in_window_utf8[3] = { 0 };
  // Bootstrap histograms.
  int last_c = 0;
  int utf8_pos = 0;
  for (int i = 0; i < in_window; ++i) {
    int c = data[(pos + i) & mask];
    ++histogram[utf8_pos][c];
    ++in_window_utf8[utf8_pos];
    utf8_pos = UTF8Position(last_c, c, max_utf8);
    last_c = c;
  }
  // Compute bit costs with sliding window.
  for (int i = 0; i < len; ++i) {
    if (i - window_half >= 0) {
      // Remove a byte in the past. Its class is recomputed from the two
      // bytes that preceded it (0 substituted before the range start).
      int c = (i - window_half - 1) < 0 ?
          0 : data[(pos + i - window_half - 1) & mask];
      int last_c = (i - window_half - 2) < 0 ?
          0 : data[(pos + i - window_half - 2) & mask];
      int utf8_pos2 = UTF8Position(last_c, c, max_utf8);
      --histogram[utf8_pos2][data[(pos + i - window_half) & mask]];
      --in_window_utf8[utf8_pos2];
    }
    if (i + window_half < len) {
      // Add a byte in the future.
      int c = (i + window_half - 1) < 0 ?
          0 : data[(pos + i + window_half - 1) & mask];
      int last_c = (i + window_half - 2) < 0 ?
          0 : data[(pos + i + window_half - 2) & mask];
      int utf8_pos2 = UTF8Position(last_c, c, max_utf8);
      ++histogram[utf8_pos2][data[(pos + i + window_half) & mask]];
      ++in_window_utf8[utf8_pos2];
    }
    // Class of the current byte, derived from the two preceding bytes.
    int c = i < 1 ? 0 : data[(pos + i - 1) & mask];
    int last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
    int utf8_pos = UTF8Position(last_c, c, max_utf8);
    int masked_pos = (pos + i) & mask;
    int histo = histogram[utf8_pos][data[masked_pos]];
    if (histo == 0) {
      histo = 1;  // Avoid log2(0) for symbols unseen in the window.
    }
    // Entropy-style cost: log2(window population) - log2(symbol count).
    float lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo);
    lit_cost += 0.02905;
    if (lit_cost < 1.0) {
      // Squash costs below one bit towards 1 (0.5 + cost/2).
      lit_cost *= 0.5;
      lit_cost += 0.5;
    }
    // Make the first bytes more expensive -- seems to help, not sure why.
    // Perhaps because the entropy source is changing its properties
    // rapidly in the beginning of the file, perhaps because the beginning
    // of the data is a statistical "anomaly".
    if (i < 2000) {
      lit_cost += 0.7 - ((2000 - i) / 2000.0 * 0.35);
    }
    cost[(pos + i) & cost_mask] = lit_cost;
  }
}
// Estimates per-literal bit costs over [pos, pos + len) using one sliding
// window byte histogram (no UTF-8 context modeling). Results are written
// to cost[(pos + i) & cost_mask].
void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
                                 size_t cost_mask, const uint8_t *data,
                                 float *cost) {
  int histogram[256] = { 0 };
  int window_half = 2000;
  int in_window = std::min(static_cast<size_t>(window_half), len);
  // Bootstrap histogram.
  for (int i = 0; i < in_window; ++i) {
    ++histogram[data[(pos + i) & mask]];
  }
  // Compute bit costs with sliding window.
  for (int i = 0; i < len; ++i) {
    if (i - window_half >= 0) {
      // Remove a byte in the past.
      --histogram[data[(pos + i - window_half) & mask]];
      --in_window;
    }
    if (i + window_half < len) {
      // Add a byte in the future.
      ++histogram[data[(pos + i + window_half) & mask]];
      ++in_window;
    }
    int histo = histogram[data[(pos + i) & mask]];
    if (histo == 0) {
      histo = 1;  // Avoid log2(0) for symbols unseen in the window.
    }
    // Entropy-style cost: log2(window population) - log2(symbol count).
    float lit_cost = FastLog2(in_window) - FastLog2(histo);
    lit_cost += 0.029;
    if (lit_cost < 1.0) {
      // Squash costs below one bit towards 1 (0.5 + cost/2).
      lit_cost *= 0.5;
      lit_cost += 0.5;
    }
    cost[(pos + i) & cost_mask] = lit_cost;
  }
}
} // namespace brotli

View File

@ -0,0 +1,38 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Literal cost model to allow backward reference replacement to be efficient.
#ifndef BROTLI_ENC_LITERAL_COST_H_
#define BROTLI_ENC_LITERAL_COST_H_
#include <stddef.h>
#include <stdint.h>
namespace brotli {
// Estimates how many bits the literals in the interval [pos, pos + len) in the
// ringbuffer (data, mask) will take entropy coded and writes these estimates
// to the ringbuffer (cost, mask).
void EstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
size_t cost_mask, const uint8_t *data,
float *cost);
void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
size_t cost_mask, const uint8_t *data,
float *cost);
} // namespace brotli
#endif // BROTLI_ENC_LITERAL_COST_H_

View File

@ -0,0 +1,319 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Algorithms for distributing the literals and commands of a metablock between
// block types and contexts.
#include "./metablock.h"
#include "./block_splitter.h"
#include "./cluster.h"
#include "./histogram.h"
namespace brotli {
// Computes the full segmentation of one meta-block: splits the literal,
// command and distance streams into block types, builds per-context
// histograms and clusters them (with context maps) into at most 256
// histograms per category.
void BuildMetaBlock(const uint8_t* ringbuffer,
                    const size_t pos,
                    const size_t mask,
                    uint8_t prev_byte,
                    uint8_t prev_byte2,
                    const Command* cmds,
                    size_t num_commands,
                    int literal_context_mode,
                    bool enable_context_modeling,
                    MetaBlockSplit* mb) {
  SplitBlock(cmds, num_commands,
             &ringbuffer[pos & mask],
             &mb->literal_split,
             &mb->command_split,
             &mb->distance_split);
  // Every literal block type uses the same caller-provided context mode.
  std::vector<int> literal_context_modes(mb->literal_split.num_types,
                                         literal_context_mode);
  int num_literal_contexts =
      mb->literal_split.num_types << kLiteralContextBits;
  int num_distance_contexts =
      mb->distance_split.num_types << kDistanceContextBits;
  std::vector<HistogramLiteral> literal_histograms(num_literal_contexts);
  mb->command_histograms.resize(mb->command_split.num_types);
  std::vector<HistogramDistance> distance_histograms(num_distance_contexts);
  BuildHistograms(cmds, num_commands,
                  mb->literal_split,
                  mb->command_split,
                  mb->distance_split,
                  ringbuffer,
                  pos,
                  mask,
                  prev_byte,
                  prev_byte2,
                  literal_context_modes,
                  &literal_histograms,
                  &mb->command_histograms,
                  &distance_histograms);
  // Histogram ids need to fit in one byte.
  static const int kMaxNumberOfHistograms = 256;
  mb->literal_histograms = literal_histograms;
  // With context modeling enabled the per-context histograms are clustered
  // across contexts; otherwise the trivial variant is used (presumably a
  // 1:1 context-to-histogram mapping -- confirm in cluster.h).
  if (enable_context_modeling) {
    ClusterHistograms(literal_histograms,
                      1 << kLiteralContextBits,
                      mb->literal_split.num_types,
                      kMaxNumberOfHistograms,
                      &mb->literal_histograms,
                      &mb->literal_context_map);
  } else {
    ClusterHistogramsTrivial(literal_histograms,
                             1 << kLiteralContextBits,
                             mb->literal_split.num_types,
                             kMaxNumberOfHistograms,
                             &mb->literal_histograms,
                             &mb->literal_context_map);
  }
  mb->distance_histograms = distance_histograms;
  if (enable_context_modeling) {
    ClusterHistograms(distance_histograms,
                      1 << kDistanceContextBits,
                      mb->distance_split.num_types,
                      kMaxNumberOfHistograms,
                      &mb->distance_histograms,
                      &mb->distance_context_map);
  } else {
    ClusterHistogramsTrivial(distance_histograms,
                             1 << kDistanceContextBits,
                             mb->distance_split.num_types,
                             kMaxNumberOfHistograms,
                             &mb->distance_histograms,
                             &mb->distance_context_map);
  }
}
// Greedy block splitter for one block category (literal, command or distance).
// Greedy block splitter for one block category (literal, command or distance).
// Symbols are fed in stream order via AddSymbol(); whenever a candidate block
// is full, FinishBlock() decides (by comparing entropies) whether to emit it
// as a new type, reuse the second-last type, or merge it into the last block.
template<typename HistogramType>
class BlockSplitter {
 public:
  BlockSplitter(int alphabet_size,
                int min_block_size,
                double split_threshold,
                int num_symbols,
                BlockSplit* split,
                std::vector<HistogramType>* histograms)
      : alphabet_size_(alphabet_size),
        min_block_size_(min_block_size),
        split_threshold_(split_threshold),
        num_blocks_(0),
        split_(split),
        histograms_(histograms),
        target_block_size_(min_block_size),
        block_size_(0),
        curr_histogram_ix_(0),
        merge_last_count_(0) {
    int max_num_blocks = num_symbols / min_block_size + 1;
    // We have to allocate one more histogram than the maximum number of block
    // types for the current histogram when the meta-block is too big.
    int max_num_types = std::min(max_num_blocks, kMaxBlockTypes + 1);
    split_->lengths.resize(max_num_blocks);
    split_->types.resize(max_num_blocks);
    histograms_->resize(max_num_types);
    last_histogram_ix_[0] = last_histogram_ix_[1] = 0;
  }
  // Adds the next symbol to the current histogram. When the current histogram
  // reaches the target size, decides on merging the block.
  void AddSymbol(int symbol) {
    (*histograms_)[curr_histogram_ix_].Add(symbol);
    ++block_size_;
    if (block_size_ == target_block_size_) {
      FinishBlock(/* is_final = */ false);
    }
  }
  // Does either of three things:
  // (1) emits the current block with a new block type;
  // (2) emits the current block with the type of the second last block;
  // (3) merges the current block with the last block.
  void FinishBlock(bool is_final) {
    if (block_size_ < min_block_size_) {
      block_size_ = min_block_size_;
    }
    if (num_blocks_ == 0) {
      // Create first block.
      split_->lengths[0] = block_size_;
      split_->types[0] = 0;
      last_entropy_[0] =
          BitsEntropy(&(*histograms_)[0].data_[0], alphabet_size_);
      last_entropy_[1] = last_entropy_[0];
      ++num_blocks_;
      ++split_->num_types;
      ++curr_histogram_ix_;
      block_size_ = 0;
    } else if (block_size_ > 0) {
      double entropy = BitsEntropy(&(*histograms_)[curr_histogram_ix_].data_[0],
                                   alphabet_size_);
      // diff[j] is the entropy cost of merging the current block with the
      // last (j = 0) or second-last (j = 1) block type, relative to keeping
      // them separate.
      HistogramType combined_histo[2];
      double combined_entropy[2];
      double diff[2];
      for (int j = 0; j < 2; ++j) {
        int last_histogram_ix = last_histogram_ix_[j];
        combined_histo[j] = (*histograms_)[curr_histogram_ix_];
        combined_histo[j].AddHistogram((*histograms_)[last_histogram_ix]);
        combined_entropy[j] = BitsEntropy(
            &combined_histo[j].data_[0], alphabet_size_);
        diff[j] = combined_entropy[j] - entropy - last_entropy_[j];
      }
      if (split_->num_types < kMaxBlockTypes &&
          diff[0] > split_threshold_ &&
          diff[1] > split_threshold_) {
        // Create new block.
        split_->lengths[num_blocks_] = block_size_;
        split_->types[num_blocks_] = split_->num_types;
        last_histogram_ix_[1] = last_histogram_ix_[0];
        last_histogram_ix_[0] = split_->num_types;
        last_entropy_[1] = last_entropy_[0];
        last_entropy_[0] = entropy;
        ++num_blocks_;
        ++split_->num_types;
        ++curr_histogram_ix_;
        block_size_ = 0;
        merge_last_count_ = 0;
        target_block_size_ = min_block_size_;
      } else if (diff[1] < diff[0] - 20.0) {
        // Combine this block with second last block.
        split_->lengths[num_blocks_] = block_size_;
        split_->types[num_blocks_] = split_->types[num_blocks_ - 2];
        std::swap(last_histogram_ix_[0], last_histogram_ix_[1]);
        (*histograms_)[last_histogram_ix_[0]] = combined_histo[1];
        last_entropy_[1] = last_entropy_[0];
        last_entropy_[0] = combined_entropy[1];
        ++num_blocks_;
        block_size_ = 0;
        (*histograms_)[curr_histogram_ix_].Clear();
        merge_last_count_ = 0;
        target_block_size_ = min_block_size_;
      } else {
        // Combine this block with last block.
        split_->lengths[num_blocks_ - 1] += block_size_;
        (*histograms_)[last_histogram_ix_[0]] = combined_histo[0];
        last_entropy_[0] = combined_entropy[0];
        if (split_->num_types == 1) {
          last_entropy_[1] = last_entropy_[0];
        }
        block_size_ = 0;
        (*histograms_)[curr_histogram_ix_].Clear();
        // Repeated merges into the last block grow the sampling window so
        // the decision is re-evaluated less often on homogeneous data.
        if (++merge_last_count_ > 1) {
          target_block_size_ += min_block_size_;
        }
      }
    }
    if (is_final) {
      // Trim the over-allocated arrays down to what was actually used.
      (*histograms_).resize(split_->num_types);
      split_->types.resize(num_blocks_);
      split_->lengths.resize(num_blocks_);
    }
  }
 private:
  static const int kMaxBlockTypes = 256;
  // Alphabet size of particular block category.
  const int alphabet_size_;
  // We collect at least this many symbols for each block.
  const int min_block_size_;
  // We merge histograms A and B if
  // entropy(A+B) < entropy(A) + entropy(B) + split_threshold_,
  // where A is the current histogram and B is the histogram of the last or the
  // second last block type.
  const double split_threshold_;
  int num_blocks_;
  BlockSplit* split_;  // not owned
  std::vector<HistogramType>* histograms_;  // not owned
  // The number of symbols that we want to collect before deciding on whether
  // or not to merge the block with a previous one or emit a new block.
  int target_block_size_;
  // The number of symbols in the current histogram.
  int block_size_;
  // Offset of the current histogram.
  int curr_histogram_ix_;
  // Offset of the histograms of the previous two block types.
  int last_histogram_ix_[2];
  // Entropy of the previous two block types.
  double last_entropy_[2];
  // The number of times we merged the current block with the last one.
  int merge_last_count_;
};
void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
size_t pos,
size_t mask,
const Command *commands,
size_t n_commands,
MetaBlockSplit* mb) {
int num_literals = 0;
for (int i = 0; i < n_commands; ++i) {
num_literals += commands[i].insert_len_;
}
BlockSplitter<HistogramLiteral> lit_blocks(
256, 512, 400.0, num_literals,
&mb->literal_split, &mb->literal_histograms);
BlockSplitter<HistogramCommand> cmd_blocks(
kNumCommandPrefixes, 1024, 500.0, n_commands,
&mb->command_split, &mb->command_histograms);
BlockSplitter<HistogramDistance> dist_blocks(
64, 512, 100.0, n_commands,
&mb->distance_split, &mb->distance_histograms);
for (int i = 0; i < n_commands; ++i) {
const Command cmd = commands[i];
cmd_blocks.AddSymbol(cmd.cmd_prefix_);
for (int j = 0; j < cmd.insert_len_; ++j) {
lit_blocks.AddSymbol(ringbuffer[pos & mask]);
++pos;
}
pos += cmd.copy_len_;
if (cmd.copy_len_ > 0 && cmd.cmd_prefix_ >= 128) {
dist_blocks.AddSymbol(cmd.dist_prefix_);
}
}
lit_blocks.FinishBlock(/* is_final = */ true);
cmd_blocks.FinishBlock(/* is_final = */ true);
dist_blocks.FinishBlock(/* is_final = */ true);
}
// Post-processes all histograms of the meta-block with
// OptimizeHuffmanCountsForRle (presumably smoothing counts so the
// run-length coded Huffman tree descriptions compress better).
void OptimizeHistograms(int num_direct_distance_codes,
                        int distance_postfix_bits,
                        MetaBlockSplit* mb) {
  // size_t indices match the vectors' size() type (signed/unsigned fix).
  for (size_t i = 0; i < mb->literal_histograms.size(); ++i) {
    OptimizeHuffmanCountsForRle(256, &mb->literal_histograms[i].data_[0]);
  }
  for (size_t i = 0; i < mb->command_histograms.size(); ++i) {
    OptimizeHuffmanCountsForRle(kNumCommandPrefixes,
                                &mb->command_histograms[i].data_[0]);
  }
  // The distance alphabet size depends on the direct-code and postfix
  // settings of the meta-block header.
  int num_distance_codes =
      kNumDistanceShortCodes + num_direct_distance_codes +
      (48 << distance_postfix_bits);
  for (size_t i = 0; i < mb->distance_histograms.size(); ++i) {
    OptimizeHuffmanCountsForRle(num_distance_codes,
                                &mb->distance_histograms[i].data_[0]);
  }
}
} // namespace brotli

View File

@ -0,0 +1,71 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Algorithms for distributing the literals and commands of a metablock between
// block types and contexts.
#ifndef BROTLI_ENC_METABLOCK_H_
#define BROTLI_ENC_METABLOCK_H_
#include <vector>
#include "./command.h"
#include "./histogram.h"
namespace brotli {
// A segmentation of one symbol stream into blocks: block i has type
// types[i] (a value below num_types) and covers lengths[i] consecutive
// symbols.
struct BlockSplit {
  BlockSplit() : num_types(0) {}
  int num_types;             // Number of distinct block types in use.
  std::vector<int> types;    // Block type of each block, in stream order.
  std::vector<int> lengths;  // Number of symbols in each block.
};
// Full segmentation of one meta-block: block splits, context maps and
// the clustered histograms for the literal, command and distance streams.
struct MetaBlockSplit {
  BlockSplit literal_split;
  BlockSplit command_split;
  BlockSplit distance_split;
  // Maps (block type, context) pairs to indices into the histogram
  // vectors below.
  std::vector<int> literal_context_map;
  std::vector<int> distance_context_map;
  // Clustered histograms referenced by the context maps.
  std::vector<HistogramLiteral> literal_histograms;
  std::vector<HistogramCommand> command_histograms;
  std::vector<HistogramDistance> distance_histograms;
};
// Computes block splits, histograms and context maps for one meta-block
// (defined in metablock.cc).
void BuildMetaBlock(const uint8_t* ringbuffer,
                    const size_t pos,
                    const size_t mask,
                    uint8_t prev_byte,
                    uint8_t prev_byte2,
                    const Command* cmds,
                    size_t num_commands,
                    int literal_context_mode,
                    // Renamed from the misspelled "enable_context_modleing"
                    // to match the definition.
                    bool enable_context_modeling,
                    MetaBlockSplit* mb);
void BuildMetaBlockGreedy(const uint8_t* ringbuffer,
size_t pos,
size_t mask,
const Command *commands,
size_t n_commands,
MetaBlockSplit* mb);
void OptimizeHistograms(int num_direct_distance_codes,
int distance_postfix_bits,
MetaBlockSplit* mb);
} // namespace brotli
#endif // BROTLI_ENC_METABLOCK_H_

View File

@ -0,0 +1,143 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Macros for endianness, branch prediction and unaligned loads and stores.
#ifndef BROTLI_ENC_PORT_H_
#define BROTLI_ENC_PORT_H_

// This header defines memcpy-based unaligned load/store helpers and uses
// the fixed-width integer types below; include their headers explicitly
// instead of relying on transitive includes from the including file.
#include <stdint.h>
#include <string.h>

#if defined OS_LINUX || defined OS_CYGWIN
#include <endian.h>
#elif defined OS_FREEBSD
#include <machine/endian.h>
#elif defined OS_MACOSX
#include <machine/endian.h>
/* Let's try and follow the Linux convention */
#define __BYTE_ORDER BYTE_ORDER
#define __LITTLE_ENDIAN LITTLE_ENDIAN
#define __BIG_ENDIAN BIG_ENDIAN
#endif
// define the macros IS_LITTLE_ENDIAN or IS_BIG_ENDIAN
// using the above endian definitions from endian.h if
// endian.h was included
#ifdef __BYTE_ORDER
#if __BYTE_ORDER == __LITTLE_ENDIAN
#define IS_LITTLE_ENDIAN
#endif
#if __BYTE_ORDER == __BIG_ENDIAN
#define IS_BIG_ENDIAN
#endif
#else
#if defined(__LITTLE_ENDIAN__)
#define IS_LITTLE_ENDIAN
#elif defined(__BIG_ENDIAN__)
#define IS_BIG_ENDIAN
#endif
#endif // __BYTE_ORDER
// Enable little-endian optimization for x64 architecture on Windows.
#if (defined(_WIN32) || defined(_WIN64)) && defined(_M_X64)
#define IS_LITTLE_ENDIAN
#endif
#if defined(COMPILER_GCC3)
#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
#else
#define PREDICT_FALSE(x) x
#define PREDICT_TRUE(x) x
#endif
// Portable handling of unaligned loads, stores, and copies.
// On some platforms, like ARM, the copy functions can be more efficient
// then a load and a store.
#if defined(ARCH_PIII) || defined(ARCH_ATHLON) || \
defined(ARCH_K8) || defined(_ARCH_PPC)
// x86 and x86-64 can perform unaligned loads/stores directly;
// modern PowerPC hardware can also do unaligned integer loads and stores;
// but note: the FPU still sends unaligned loads and stores to a trap handler!
#define BROTLI_UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32_t *>(_p))
#define BROTLI_UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64_t *>(_p))
#define BROTLI_UNALIGNED_STORE32(_p, _val) \
(*reinterpret_cast<uint32_t *>(_p) = (_val))
#define BROTLI_UNALIGNED_STORE64(_p, _val) \
(*reinterpret_cast<uint64_t *>(_p) = (_val))
#elif defined(__arm__) && \
!defined(__ARM_ARCH_5__) && \
!defined(__ARM_ARCH_5T__) && \
!defined(__ARM_ARCH_5TE__) && \
!defined(__ARM_ARCH_5TEJ__) && \
!defined(__ARM_ARCH_6__) && \
!defined(__ARM_ARCH_6J__) && \
!defined(__ARM_ARCH_6K__) && \
!defined(__ARM_ARCH_6Z__) && \
!defined(__ARM_ARCH_6ZK__) && \
!defined(__ARM_ARCH_6T2__)
// ARMv7 and newer support native unaligned accesses, but only of 16-bit
// and 32-bit values (not 64-bit); older versions either raise a fatal signal,
// do an unaligned read and rotate the words around a bit, or do the reads very
// slowly (trip through kernel mode).
#define BROTLI_UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32_t *>(_p))
#define BROTLI_UNALIGNED_STORE32(_p, _val) \
(*reinterpret_cast<uint32_t *>(_p) = (_val))
inline uint64_t BROTLI_UNALIGNED_LOAD64(const void *p) {
uint64_t t;
memcpy(&t, p, sizeof t);
return t;
}
inline void BROTLI_UNALIGNED_STORE64(void *p, uint64_t v) {
memcpy(p, &v, sizeof v);
}
#else
// These functions are provided for architectures that don't support
// unaligned loads and stores.
// Byte-wise (memcpy-based) accessors: valid at any alignment; compilers
// lower the fixed-size memcpy to a plain load/store where that is safe.
inline uint32_t BROTLI_UNALIGNED_LOAD32(const void *p) {
  uint32_t result;
  memcpy(&result, p, sizeof(result));
  return result;
}
inline uint64_t BROTLI_UNALIGNED_LOAD64(const void *p) {
  uint64_t result;
  memcpy(&result, p, sizeof(result));
  return result;
}
inline void BROTLI_UNALIGNED_STORE32(void *p, uint32_t v) {
  memcpy(p, &v, sizeof(v));
}
inline void BROTLI_UNALIGNED_STORE64(void *p, uint64_t v) {
  memcpy(p, &v, sizeof(v));
}
#endif
#endif // BROTLI_ENC_PORT_H_

View File

@ -0,0 +1,86 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Functions for encoding of integers into prefix codes the amount of extra
// bits, and the actual values of the extra bits.
#ifndef BROTLI_ENC_PREFIX_H_
#define BROTLI_ENC_PREFIX_H_
#include <stdint.h>
#include "./fast_log.h"
namespace brotli {
static const int kNumInsertLenPrefixes = 24;
static const int kNumCopyLenPrefixes = 24;
static const int kNumCommandPrefixes = 704;
static const int kNumBlockLenPrefixes = 26;
static const int kNumDistanceShortCodes = 16;
static const int kNumDistancePrefixes = 520;
// Represents the range of values belonging to a prefix code:
// [offset, offset + 2^nbits)
struct PrefixCodeRange {
  int offset;
  int nbits;
};
static const PrefixCodeRange kBlockLengthPrefixCode[kNumBlockLenPrefixes] = {
  {   1,  2}, {    5,  2}, {  9,  2}, {  13,  2},
  {  17,  3}, {   25,  3}, { 33,  3}, {  41,  3},
  {  49,  4}, {   65,  4}, { 81,  4}, {  97,  4},
  { 113,  5}, {  145,  5}, {177,  5}, { 209,  5},
  { 241,  6}, {  305,  6}, {369,  7}, { 497,  8},
  { 753,  9}, { 1265, 10}, {2289, 11}, {4337, 12},
  {8433, 13}, {16625, 24}
};
// Maps a block length to its prefix code, the number of extra bits of the
// bucket, and the extra-bit value (len minus the bucket's base offset).
inline void GetBlockLengthPrefixCode(int len,
                                     int* code, int* n_extra, int* extra) {
  // Advance to the last bucket whose base offset does not exceed len.
  int bucket = 0;
  while (bucket + 1 < kNumBlockLenPrefixes &&
         len >= kBlockLengthPrefixCode[bucket + 1].offset) {
    ++bucket;
  }
  *code = bucket;
  *n_extra = kBlockLengthPrefixCode[bucket].nbits;
  *extra = len - kBlockLengthPrefixCode[bucket].offset;
}
// Encodes a (1-based) distance code into its prefix symbol and the packed
// extra bits: the number of extra bits goes into the high byte of
// *extra_bits and their value into the low 24 bits.
inline void PrefixEncodeCopyDistance(int distance_code,
                                     int num_direct_codes,
                                     int postfix_bits,
                                     uint16_t* code,
                                     uint32_t* extra_bits) {
  distance_code -= 1;  // Convert to a 0-based code.
  if (distance_code < kNumDistanceShortCodes + num_direct_codes) {
    // Short codes and direct codes are emitted verbatim, no extra bits.
    *code = distance_code;
    *extra_bits = 0;
    return;
  }
  distance_code -= kNumDistanceShortCodes + num_direct_codes;
  distance_code += (1 << (postfix_bits + 2));
  // Decompose the remaining code into magnitude bucket, prefix bit and
  // postfix bits.
  int bucket = Log2Floor(distance_code) - 1;
  int postfix_mask = (1 << postfix_bits) - 1;
  int postfix = distance_code & postfix_mask;
  int prefix = (distance_code >> bucket) & 1;
  int offset = (2 + prefix) << bucket;
  int nbits = bucket - postfix_bits;
  *code = kNumDistanceShortCodes + num_direct_codes +
      ((2 * (nbits - 1) + prefix) << postfix_bits) + postfix;
  *extra_bits = (nbits << 24) | ((distance_code - offset) >> postfix_bits);
}
} // namespace brotli
#endif // BROTLI_ENC_PREFIX_H_

View File

@ -0,0 +1,108 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Sliding window over the input data.
#ifndef BROTLI_ENC_RINGBUFFER_H_
#define BROTLI_ENC_RINGBUFFER_H_
#include <stddef.h>
#include <stdint.h>
#include "./port.h"
namespace brotli {
// A RingBuffer(window_bits, tail_bits) contains `1 << window_bits' bytes of
// data in a circular manner: writing a byte writes it to
// `position() % (1 << window_bits)'. For convenience, the RingBuffer array
// contains another copy of the first `1 << tail_bits' bytes:
// buffer_[i] == buffer_[i + (1 << window_bits)] if i < (1 << tail_bits).
// A RingBuffer(window_bits, tail_bits) contains `1 << window_bits' bytes of
// data in a circular manner: writing a byte writes it to
// `position() % (1 << window_bits)'. For convenience, the RingBuffer array
// contains another copy of the first `1 << tail_bits' bytes:
// buffer_[i] == buffer_[i + (1 << window_bits)] if i < (1 << tail_bits).
// NOTE(review): owns buffer_ but declares no copy ctor/assignment, so
// copying would double-delete -- confirm no caller copies a RingBuffer.
class RingBuffer {
 public:
  RingBuffer(int window_bits, int tail_bits)
      : window_bits_(window_bits),
        mask_((1 << window_bits) - 1),
        tail_size_(1 << tail_bits),
        pos_(0) {
    // Extra zeroed slack so 4-byte hashing may read past the last valid
    // byte without leaving the allocation.
    static const int kSlackForFourByteHashingEverywhere = 3;
    const int buflen = (1 << window_bits_) + tail_size_;
    buffer_ = new uint8_t[buflen + kSlackForFourByteHashingEverywhere];
    for (int i = 0; i < kSlackForFourByteHashingEverywhere; ++i) {
      buffer_[buflen + i] = 0;
    }
  }
  ~RingBuffer() {
    delete [] buffer_;
  }
  // Push bytes into the ring buffer.
  void Write(const uint8_t *bytes, size_t n) {
    const size_t masked_pos = pos_ & mask_;
    // The length of the writes is limited so that we do not need to worry
    // about a write wrapping around the ring buffer more than once per
    // call -- TODO confirm against the callers' write sizes.
    WriteTail(bytes, n);
    if (PREDICT_TRUE(masked_pos + n <= (1 << window_bits_))) {
      // A single write fits.
      memcpy(&buffer_[masked_pos], bytes, n);
    } else {
      // Split into two writes.
      // Copy into the end of the buffer, including the tail buffer.
      memcpy(&buffer_[masked_pos], bytes,
             std::min(n, ((1 << window_bits_) + tail_size_) - masked_pos));
      // Copy into the begining of the buffer
      memcpy(&buffer_[0], bytes + ((1 << window_bits_) - masked_pos),
             n - ((1 << window_bits_) - masked_pos));
    }
    pos_ += n;
  }
  void Reset() {
    pos_ = 0;
  }
  // Logical cursor position in the ring buffer.
  size_t position() const { return pos_; }
  // Bit mask for getting the physical position for a logical position.
  size_t mask() const { return mask_; }
  uint8_t *start() { return &buffer_[0]; }
  const uint8_t *start() const { return &buffer_[0]; }
 private:
  // Mirrors the first bytes of a write into the tail copy at the end of
  // the buffer, keeping the tail invariant documented on the class.
  void WriteTail(const uint8_t *bytes, size_t n) {
    const size_t masked_pos = pos_ & mask_;
    if (PREDICT_FALSE(masked_pos < tail_size_)) {
      // Just fill the tail buffer with the beginning data.
      const size_t p = (1 << window_bits_) + masked_pos;
      memcpy(&buffer_[p], bytes, std::min(n, tail_size_ - masked_pos));
    }
  }
  // Size of the ringbuffer is (1 << window_bits) + tail_size_.
  const int window_bits_;
  const size_t mask_;
  const size_t tail_size_;
  // Position to write in the ring buffer.
  size_t pos_;
  // The actual ring buffer containing the data and the copy of the beginning
  // as a tail.
  uint8_t *buffer_;
};
} // namespace brotli
#endif // BROTLI_ENC_RINGBUFFER_H_

View File

@ -0,0 +1,87 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Class to model the static dictionary.
#ifndef BROTLI_ENC_STATIC_DICT_H_
#define BROTLI_ENC_STATIC_DICT_H_
#include <algorithm>
#include <unordered_map>
#include <string>
#include "./dictionary.h"
#include "./transform.h"
namespace brotli {
// In-memory index over the (transformed) static dictionary.
//
// map_ maps each dictionary string to a packed (dist << 6) + len code;
// prefix_map_ maps the little-endian 32-bit value of a string's first
// four bytes to the longest indexed string starting with those bytes.
class StaticDictionary {
 public:
  StaticDictionary() {}
  // Populates the index with every dictionary word of usable length
  // (>= 4 bytes), optionally expanded through all word transforms.
  // Iterates ids from high to low so that for duplicate strings the
  // smallest packed code wins (see Insert).
  void Fill(bool enable_transforms) {
    const int num_transforms = enable_transforms ? kNumTransforms : 1;
    for (int t = num_transforms - 1; t >= 0; --t) {
      for (int i = kMaxDictionaryWordLength;
           i >= kMinDictionaryWordLength; --i) {
        const int num_words = 1 << kBrotliDictionarySizeBitsByLength[i];
        for (int j = num_words - 1; j >= 0; --j) {
          int word_id = t * num_words + j;
          std::string word = GetTransformedDictionaryWord(i, word_id);
          if (word.size() >= 4) {
            Insert(word, i, word_id);
          }
        }
      }
    }
  }
  // Records str with packed code (dist << 6) + len, keeping the smallest
  // code if str is already present, and updates the longest known string
  // length for str's 4-byte prefix.
  void Insert(const std::string &str, int len, int dist) {
    int ix = (dist << 6) + len;
    std::unordered_map<std::string, int>::const_iterator it = map_.find(str);
    if (it != map_.end() && ix >= it->second) {
      return;
    }
    map_[str] = ix;
    uint32_t v = 0;
    // Build the prefix key from raw byte values.  Casting through
    // unsigned char avoids sign-extending bytes >= 0x80, which would
    // yield a key that can never match a plain little-endian load of
    // the input bytes.
    for (size_t k = 0; k < 4 && k < str.size(); ++k) {
      v += static_cast<uint32_t>(static_cast<unsigned char>(str[k])) << (8 * k);
    }
    // Single lookup; operator[] default-inserts 0 for a fresh prefix.
    int &prefix_len = prefix_map_[v];
    if (prefix_len < static_cast<int>(str.size())) {
      prefix_len = static_cast<int>(str.size());
    }
  }
  // Returns the length of the longest indexed string whose first four
  // bytes equal v (little-endian), or 0 if there is none.
  int GetLength(uint32_t v) const {
    std::unordered_map<uint32_t, int>::const_iterator it = prefix_map_.find(v);
    if (it == prefix_map_.end()) {
      return 0;
    }
    return it->second;
  }
  // Looks up str; on success unpacks its code into *len and *dist and
  // returns true, otherwise returns false.
  bool Get(const std::string &str, int *len, int *dist) const {
    std::unordered_map<std::string, int>::const_iterator it = map_.find(str);
    if (it == map_.end()) {
      return false;
    }
    int v = it->second;
    *len = v & 63;
    *dist = v >> 6;
    return true;
  }
 private:
  std::unordered_map<std::string, int> map_;
  std::unordered_map<uint32_t, int> prefix_map_;
};
} // namespace brotli
#endif // BROTLI_ENC_STATIC_DICT_H_

View File

@ -0,0 +1,126 @@
// Copyright 2009 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Convience routines to make Brotli I/O classes from some memory containers and
// files.
#include "./streams.h"
#include <assert.h>
#include <string.h>
namespace brotli {
// Creates a writer over a caller-owned memory region of len bytes.
// The region is not copied and must outlive this object.
BrotliMemOut::BrotliMemOut(void* buf, int len)
: buf_(buf),
len_(len),
pos_(0) {}
// Re-targets the writer to a new region and rewinds the write position.
void BrotliMemOut::Reset(void* buf, int len) {
buf_ = buf;
len_ = len;
pos_ = 0;
}
// Brotli output routine: copy n bytes to the output buffer.
// Brotli output routine: copy n bytes to the output buffer.
// Returns false (writing nothing) if the data does not fit.
bool BrotliMemOut::Write(const void *buf, size_t n) {
  // Compare against the remaining space rather than computing n + pos_,
  // which mixes signedness and can wrap around for very large n.
  if (n > static_cast<size_t>(len_ - pos_))
    return false;
  char* p = reinterpret_cast<char*>(buf_) + pos_;
  memcpy(p, buf, n);
  pos_ += n;
  return true;
}
// Creates a writer that appends to *buf, growing it to at most max_size
// bytes.  buf must be empty at construction and must outlive this object.
BrotliStringOut::BrotliStringOut(std::string* buf, int max_size)
: buf_(buf),
max_size_(max_size) {
assert(buf->empty());
}
// Re-targets the writer to a new destination string.
// NOTE(review): unlike the constructor, Reset does not assert that the
// new buffer is empty — confirm callers rely on that.
void BrotliStringOut::Reset(std::string* buf, int max_size) {
buf_ = buf;
max_size_ = max_size;
}
// Brotli output routine: add n bytes to a string.
// Brotli output routine: add n bytes to a string.
// Returns false (appending nothing) if that would grow the string past
// max_size_.
bool BrotliStringOut::Write(const void *buf, size_t n) {
  const size_t limit = static_cast<size_t>(max_size_);
  // Overflow-safe form of "buf_->size() + n > limit"; also avoids the
  // signed/unsigned comparison of the original expression.
  if (n > limit || buf_->size() > limit - n)
    return false;
  buf_->append(static_cast<const char*>(buf), n);
  return true;
}
// Creates a reader over a caller-owned memory region of len bytes.
// The region is not copied and must outlive this object.
BrotliMemIn::BrotliMemIn(const void* buf, int len)
: buf_(buf),
len_(len),
pos_(0) {}
// Re-targets the reader to a new region and rewinds the read position.
void BrotliMemIn::Reset(const void* buf, int len) {
buf_ = buf;
len_ = len;
pos_ = 0;
}
// Brotli input routine: read the next chunk of memory.
// Brotli input routine: hand out the next chunk of memory.
// Returns NULL when the input is exhausted; otherwise returns a pointer
// into the buffer and stores the chunk length (<= n) in *output.
const void* BrotliMemIn::Read(size_t n, size_t* output) {
  const size_t available = static_cast<size_t>(len_ - pos_);
  if (available == 0) {
    return NULL;
  }
  if (n > available) {
    n = available;
  }
  const char* chunk = reinterpret_cast<const char*>(buf_) + pos_;
  pos_ += n;
  *output = n;
  return chunk;
}
// Creates a reader that pulls data from f through an internal buffer of
// max_read_size bytes.  If the allocation fails, buf_ is NULL and Read()
// reports end of input.  f is not owned and is never closed here.
BrotliFileIn::BrotliFileIn(FILE* f, size_t max_read_size)
: f_(f),
buf_(malloc(max_read_size)),
buf_size_(max_read_size) {}
// Releases the internal buffer (if the allocation succeeded).
BrotliFileIn::~BrotliFileIn() {
if (buf_) free(buf_);
}
// Brotli input routine: read up to n bytes (capped at buf_size_) from the
// file into the internal buffer.
// Returns NULL at end of input (or when the constructor's malloc failed);
// otherwise returns the buffer and stores the byte count in *bytes_read.
// The returned pointer is invalidated by the next Read call.
const void* BrotliFileIn::Read(size_t n, size_t* bytes_read) {
  if (buf_ == NULL) {
    *bytes_read = 0;
    return NULL;
  }
  if (n > buf_size_) {
    n = buf_size_;
  } else if (n == 0) {
    // Probe call: report whether more data is available without reading.
    // Set *bytes_read so the caller never sees an uninitialized count
    // (the original left it unset on this path).
    *bytes_read = 0;
    return feof(f_) ? NULL : buf_;
  }
  *bytes_read = fread(buf_, 1, n, f_);
  if (*bytes_read == 0) {
    return NULL;
  }
  return buf_;
}
// Wraps an already-open FILE*; the stream is not owned and not closed.
BrotliFileOut::BrotliFileOut(FILE* f) : f_(f) {}
// Brotli output routine: write n bytes to the file.
// Returns false if the write could not be completed.
bool BrotliFileOut::Write(const void* buf, size_t n) {
  if (n == 0) {
    // fwrite() with a zero element size returns 0, which the original
    // misreported as failure; an empty write trivially succeeds.
    return true;
  }
  if (fwrite(buf, n, 1, f_) != 1) {
    return false;
  }
  return true;
}
} // namespace brotli

View File

@ -0,0 +1,130 @@
// Copyright 2009 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Input and output classes for streaming brotli compression.
#ifndef BROTLI_ENC_STREAMS_H_
#define BROTLI_ENC_STREAMS_H_
#include <stdlib.h>
#include <stddef.h>
#include <stdio.h>
#include <string>
namespace brotli {
// Input interface for the compression routines.
class BrotliIn {
public:
virtual ~BrotliIn() {}
// Return a pointer to the next block of input of at most n bytes.
// Return the actual length in *nread.
// At end of data, return NULL. Don't return NULL if there is more data
// to read, even if called with n == 0.
// Read will only be called if some of its bytes are needed.
// NOTE: the returned block is owned by the reader and may be
// invalidated by the next call to Read (e.g. file-backed readers reuse
// an internal buffer).
virtual const void* Read(size_t n, size_t* nread) = 0;
};
// Output interface for the compression routines.
class BrotliOut {
public:
virtual ~BrotliOut() {}
// Write n bytes of data from buf.
// Return true if all written, false otherwise.
// Implementations do not take ownership of buf.
virtual bool Write(const void *buf, size_t n) = 0;
};
// Adapter class to make BrotliIn objects from raw memory.
class BrotliMemIn : public BrotliIn {
public:
// The memory at buf is not copied; it must outlive this reader.
BrotliMemIn(const void* buf, int len);
void Reset(const void* buf, int len);
// returns the amount of data consumed
int position() const { return pos_; }
const void* Read(size_t n, size_t* OUTPUT) override;
private:
const void* buf_; // start of input buffer
int len_; // length of input
int pos_; // current read position within input
};
// Adapter class to make BrotliOut objects from raw memory.
class BrotliMemOut : public BrotliOut {
public:
// The memory at buf is not copied; it must outlive this writer.
BrotliMemOut(void* buf, int len);
void Reset(void* buf, int len);
// returns the amount of data written
int position() const { return pos_; }
// Write fails (returns false) if the data does not fit in the buffer.
bool Write(const void* buf, size_t n) override;
private:
void* buf_; // start of output buffer
int len_; // length of output
int pos_; // current write position within output
};
// Adapter class to make BrotliOut objects from a string.
class BrotliStringOut : public BrotliOut {
public:
// Create a writer that appends its data to buf.
// buf->size() will grow to at most max_size
// buf is expected to be empty when constructing BrotliStringOut.
BrotliStringOut(std::string* buf, int max_size);
void Reset(std::string* buf, int max_len);
// Write fails (returns false) if it would grow buf past max_size.
bool Write(const void* buf, size_t n) override;
private:
std::string* buf_; // start of output buffer
int max_size_; // max length of output
};
// Adapter class to make BrotliIn object from a file.
class BrotliFileIn : public BrotliIn {
public:
// Reads from f through an internal buffer of max_read_size bytes.
// f is not owned and is not closed by this class.
BrotliFileIn(FILE* f, size_t max_read_size);
~BrotliFileIn();
const void* Read(size_t n, size_t* bytes_read) override;
private:
FILE* f_;
void* buf_; // internal read buffer, or NULL if allocation failed
size_t buf_size_;
};
// Adapter class to make BrotliOut object from a file.
class BrotliFileOut : public BrotliOut {
public:
// f is not owned and is not closed by this class.
explicit BrotliFileOut(FILE* f);
bool Write(const void* buf, size_t n) override;
private:
FILE* f_;
};
} // namespace brotli
#endif // BROTLI_ENC_STREAMS_H_

Binary file not shown.

View File

@ -0,0 +1,242 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Transformations on dictionary words.
#ifndef BROTLI_ENC_TRANSFORM_H_
#define BROTLI_ENC_TRANSFORM_H_
#include <string>
#include "./dictionary.h"
namespace brotli {
// Kinds of word transforms.  The numeric values are load-bearing:
// ApplyTransform relies on kOmitLastN == N, on kUppercase* sitting
// between the omit-last and omit-first ranges, and on kOmitFirstN
// == N + (kOmitFirst1 - 1).  Do not renumber.
enum WordTransformType {
kIdentity = 0,
kOmitLast1 = 1,
kOmitLast2 = 2,
kOmitLast3 = 3,
kOmitLast4 = 4,
kOmitLast5 = 5,
kOmitLast6 = 6,
kOmitLast7 = 7,
kOmitLast8 = 8,
kOmitLast9 = 9,
kUppercaseFirst = 10,
kUppercaseAll = 11,
kOmitFirst1 = 12,
kOmitFirst2 = 13,
kOmitFirst3 = 14,
kOmitFirst4 = 15,
kOmitFirst5 = 16,
kOmitFirst6 = 17,
kOmitFirst7 = 18,
kOmitFirst8 = 19,
kOmitFirst9 = 20,
};
// One dictionary-word transform: the word is rewritten via word_transform
// and then wrapped as prefix + word + suffix (see ApplyTransform).
struct Transform {
const char* prefix;
WordTransformType word_transform;
const char* suffix;
};
// The full transform table; indexed by transform id in
// GetTransformedDictionaryWord.  NOTE(review): the order presumably
// matches the brotli format's transform numbering — do not reorder.
static const Transform kTransforms[] = {
{ "", kIdentity, "" },
{ "", kIdentity, " " },
{ " ", kIdentity, " " },
{ "", kOmitFirst1, "" },
{ "", kUppercaseFirst, " " },
{ "", kIdentity, " the " },
{ " ", kIdentity, "" },
{ "s ", kIdentity, " " },
{ "", kIdentity, " of " },
{ "", kUppercaseFirst, "" },
{ "", kIdentity, " and " },
{ "", kOmitFirst2, "" },
{ "", kOmitLast1, "" },
{ ", ", kIdentity, " " },
{ "", kIdentity, ", " },
{ " ", kUppercaseFirst, " " },
{ "", kIdentity, " in " },
{ "", kIdentity, " to " },
{ "e ", kIdentity, " " },
{ "", kIdentity, "\"" },
{ "", kIdentity, "." },
{ "", kIdentity, "\">" },
{ "", kIdentity, "\n" },
{ "", kOmitLast3, "" },
{ "", kIdentity, "]" },
{ "", kIdentity, " for " },
{ "", kOmitFirst3, "" },
{ "", kOmitLast2, "" },
{ "", kIdentity, " a " },
{ "", kIdentity, " that " },
{ " ", kUppercaseFirst, "" },
{ "", kIdentity, ". " },
{ ".", kIdentity, "" },
{ " ", kIdentity, ", " },
{ "", kOmitFirst4, "" },
{ "", kIdentity, " with " },
{ "", kIdentity, "'" },
{ "", kIdentity, " from " },
{ "", kIdentity, " by " },
{ "", kOmitFirst5, "" },
{ "", kOmitFirst6, "" },
{ " the ", kIdentity, "" },
{ "", kOmitLast4, "" },
{ "", kIdentity, ". The " },
{ "", kUppercaseAll, "" },
{ "", kIdentity, " on " },
{ "", kIdentity, " as " },
{ "", kIdentity, " is " },
{ "", kOmitLast7, "" },
{ "", kOmitLast1, "ing " },
{ "", kIdentity, "\n\t" },
{ "", kIdentity, ":" },
{ " ", kIdentity, ". " },
{ "", kIdentity, "ed " },
{ "", kOmitFirst9, "" },
{ "", kOmitFirst7, "" },
{ "", kOmitLast6, "" },
{ "", kIdentity, "(" },
{ "", kUppercaseFirst, ", " },
{ "", kOmitLast8, "" },
{ "", kIdentity, " at " },
{ "", kIdentity, "ly " },
{ " the ", kIdentity, " of " },
{ "", kOmitLast5, "" },
{ "", kOmitLast9, "" },
{ " ", kUppercaseFirst, ", " },
{ "", kUppercaseFirst, "\"" },
{ ".", kIdentity, "(" },
{ "", kUppercaseAll, " " },
{ "", kUppercaseFirst, "\">" },
{ "", kIdentity, "=\"" },
{ " ", kIdentity, "." },
{ ".com/", kIdentity, "" },
{ " the ", kIdentity, " of the " },
{ "", kUppercaseFirst, "'" },
{ "", kIdentity, ". This " },
{ "", kIdentity, "," },
{ ".", kIdentity, " " },
{ "", kUppercaseFirst, "(" },
{ "", kUppercaseFirst, "." },
{ "", kIdentity, " not " },
{ " ", kIdentity, "=\"" },
{ "", kIdentity, "er " },
{ " ", kUppercaseAll, " " },
{ "", kIdentity, "al " },
{ " ", kUppercaseAll, "" },
{ "", kIdentity, "='" },
{ "", kUppercaseAll, "\"" },
{ "", kUppercaseFirst, ". " },
{ " ", kIdentity, "(" },
{ "", kIdentity, "ful " },
{ " ", kUppercaseFirst, ". " },
{ "", kIdentity, "ive " },
{ "", kIdentity, "less " },
{ "", kUppercaseAll, "'" },
{ "", kIdentity, "est " },
{ " ", kUppercaseFirst, "." },
{ "", kUppercaseAll, "\">" },
{ " ", kIdentity, "='" },
{ "", kUppercaseFirst, "," },
{ "", kIdentity, "ize " },
{ "", kUppercaseAll, "." },
{ "\xc2\xa0", kIdentity, "" },
{ " ", kIdentity, "," },
{ "", kUppercaseFirst, "=\"" },
{ "", kUppercaseAll, "=\"" },
{ "", kIdentity, "ous " },
{ "", kUppercaseAll, ", " },
{ "", kUppercaseFirst, "='" },
{ " ", kUppercaseFirst, "," },
{ " ", kUppercaseAll, "=\"" },
{ " ", kUppercaseAll, ", " },
{ "", kUppercaseAll, "," },
{ "", kUppercaseAll, "(" },
{ "", kUppercaseAll, ". " },
{ " ", kUppercaseAll, "." },
{ "", kUppercaseAll, "='" },
{ " ", kUppercaseAll, ". " },
{ " ", kUppercaseFirst, "=\"" },
{ " ", kUppercaseAll, "='" },
{ " ", kUppercaseFirst, "='" },
};
// Number of entries in kTransforms.
static const int kNumTransforms = sizeof(kTransforms) / sizeof(kTransforms[0]);
// Upper-cases the first "character" starting at p (at most len bytes) and
// returns the number of bytes consumed (1-3).  ASCII letters a-z have
// their case bit flipped; for multi-byte sequences (lead byte >= 0xc0)
// selected bits of a later byte are flipped instead — an approximation
// rather than true Unicode case mapping.
static int ToUpperCase(uint8_t *p, int len) {
  const uint8_t lead = p[0];
  if (len == 1 || lead < 0xc0) {
    // Single-byte character: only a-z changes.
    const bool is_lower_ascii = (lead >= 'a') && (lead <= 'z');
    if (is_lower_ascii) {
      p[0] = lead ^ 32;
    }
    return 1;
  }
  if (lead < 0xe0) {
    // Two-byte sequence: toggle bit 5 of the continuation byte.
    p[1] ^= 32;
    return 2;
  }
  if (len == 2) {
    // Truncated three-byte sequence: consume what is there, unchanged.
    return 2;
  }
  // Three-byte (or longer) sequence: tweak the third byte.
  p[2] ^= 5;
  return 3;
}
// Materializes one dictionary word through transform t: emits t.prefix,
// then the word (len raw bytes at `word`) with the requested omission or
// re-casing applied, then t.suffix.
inline std::string ApplyTransform(
const Transform& t, const uint8_t* word, int len) {
std::string ret(t.prefix);
// kOmitLastN has numeric value N, so this trims N trailing bytes.
if (t.word_transform <= kOmitLast9) {
len -= t.word_transform;
}
if (len > 0) {
if (t.word_transform >= kOmitFirst1) {
// kOmitFirst1 is 12, so the number of bytes to skip is value - 11.
const int skip = t.word_transform - (kOmitFirst1 - 1);
if (len > skip) {
ret += std::string(word + skip, word + len);
}
} else {
ret += std::string(word, word + len);
// Re-case the bytes just appended, in place inside ret.
uint8_t *uppercase = reinterpret_cast<uint8_t*>(&ret[ret.size() - len]);
if (t.word_transform == kUppercaseFirst) {
ToUpperCase(uppercase, len);
} else if (t.word_transform == kUppercaseAll) {
// ToUpperCase returns the byte width of each character it handled.
while (len > 0) {
int step = ToUpperCase(uppercase, len);
uppercase += step;
len -= step;
}
}
}
}
ret += std::string(t.suffix);
return ret;
}
// Decodes word_id for the given length code into a transformed dictionary
// word: word_id % num_words selects the raw word within the length
// bucket, word_id / num_words selects the transform.
inline std::string GetTransformedDictionaryWord(int len_code, int word_id) {
int num_words = 1 << kBrotliDictionarySizeBitsByLength[len_code];
int offset = kBrotliDictionaryOffsetsByLength[len_code];
int t = word_id / num_words;
int word_idx = word_id % num_words;
// Words of a given length are stored back-to-back, len_code bytes each.
offset += len_code * word_idx;
const uint8_t* word = &kBrotliDictionary[offset];
return ApplyTransform(kTransforms[t], word, len_code);
}
} // namespace brotli
#endif // BROTLI_ENC_TRANSFORM_H_

View File

@ -0,0 +1,91 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Write bits into a byte array.
#ifndef BROTLI_ENC_WRITE_BITS_H_
#define BROTLI_ENC_WRITE_BITS_H_
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include "./port.h"
namespace brotli {
//#define BIT_WRITER_DEBUG
// This function writes bits into bytes in increasing addresses, and within
// a byte least-significant-bit first.
//
// The function can write up to 56 bits in one go with WriteBits
// Example: let's assume that 3 bits (Rs below) have been written already:
//
// BYTE-0 BYTE+1 BYTE+2
//
// 0000 0RRR 0000 0000 0000 0000
//
// Now, we could write 5 or less bits in MSB by just sifting by 3
// and OR'ing to BYTE-0.
//
// For n bits, we take the last 5 bits, OR that with high bits in BYTE-0,
// and locate the rest in BYTE+1, BYTE+2, etc.
// Appends the n_bits low bits of `bits` at bit offset *pos in `array`,
// advancing *pos; bits are packed LSB-first within each byte.  At most
// 56 bits may be written per call (see the little-endian branch below),
// and the byte at the current write position must have its unused high
// bits zeroed (WriteBitsPrepareStorage / the big-endian branch maintain
// this), since new bits are OR-ed in.
inline void WriteBits(int n_bits,
uint64_t bits,
int * __restrict pos,
uint8_t * __restrict array) {
#ifdef BIT_WRITER_DEBUG
printf("WriteBits %2d 0x%016llx %10d\n", n_bits, bits, *pos);
#endif
assert(bits < 1ULL << n_bits);
#ifdef IS_LITTLE_ENDIAN
// This branch of the code can write up to 56 bits at a time,
// 7 bits are lost by being perhaps already in *p and at least
// 1 bit is needed to initialize the bit-stream ahead (i.e. if 7
// bits are in *p and we write 57 bits, then the next write will
// access a byte that was never initialized).
uint8_t *p = &array[*pos >> 3];
uint64_t v = *p;
v |= bits << (*pos & 7);
BROTLI_UNALIGNED_STORE64(p, v); // Set some bits.
*pos += n_bits;
#else
// implicit & 0xff is assumed for uint8_t arithmetics
uint8_t *array_pos = &array[*pos >> 3];
const int bits_reserved_in_first_byte = (*pos & 7);
bits <<= bits_reserved_in_first_byte;
*array_pos++ |= bits;
for (int bits_left_to_write = n_bits - 8 + bits_reserved_in_first_byte;
bits_left_to_write >= 1;
bits_left_to_write -= 8) {
bits >>= 8;
*array_pos++ = bits;
}
// Zero the byte past the written bits so the next WriteBits can OR
// into it safely.
*array_pos = 0;
*pos += n_bits;
#endif
}
// Zeroes the byte at (byte-aligned) bit position pos so that subsequent
// WriteBits calls can OR their bits into it.
inline void WriteBitsPrepareStorage(int pos, uint8_t *array) {
#ifdef BIT_WRITER_DEBUG
printf("WriteBitsPrepareStorage %10d\n", pos);
#endif
// The position must sit exactly on a byte boundary.
assert((pos & 7) == 0);
const int byte_index = pos >> 3;
array[byte_index] = 0;
}
} // namespace brotli
#endif // BROTLI_ENC_WRITE_BITS_H_