diff options
Diffstat (limited to 'include/consteval_huffman')
-rw-r--r-- | include/consteval_huffman/consteval_huffman.hpp | 379 |
1 files changed, 379 insertions, 0 deletions
diff --git a/include/consteval_huffman/consteval_huffman.hpp b/include/consteval_huffman/consteval_huffman.hpp new file mode 100644 index 0000000..b7ea57f --- /dev/null +++ b/include/consteval_huffman/consteval_huffman.hpp @@ -0,0 +1,379 @@ +/** + * consteval_huffman.hpp - Provides compile-time text compression. + * Written by Clyne Sullivan. + * https://github.com/tcsullivan/consteval-huffman + */ + +#ifndef TCSULLIVAN_CONSTEVAL_HUFFMAN_HPP_ +#define TCSULLIVAN_CONSTEVAL_HUFFMAN_HPP_ + +#include <algorithm> +#include <concepts> +#include <span> +#include <type_traits> + +namespace detail +{ + // Provides a string container for the huffman compressor. + // Using this allows for automatic string data length measurement, as + // well as implementation of the _huffman suffix. + template<unsigned long int N> + struct huffman_string_container { + char data[N]; + consteval huffman_string_container(const char (&s)[N]) noexcept { + std::copy(s, s + N, data); + } + consteval operator const char *() const noexcept { + return data; + } + consteval auto size() const noexcept { + return N; + } + }; +} + +/** + * Compresses the given data string using Huffman coding, providing a + * minimal run-time interface for decompressing the data. + * @tparam data The string of data to be compressed. + */ +template<auto raw_data> + requires( + std::same_as<std::remove_cvref_t<decltype(raw_data)>, + detail::huffman_string_container<raw_data.size()>> && + raw_data.size() > 0) +class huffman_compressor +{ + using size_t = long int; + using usize_t = unsigned long int; + + // Note: class internals need to be defined before the public interface. + // See the bottom of the class definition for usage. +private: + // Node structure used to build a tree for calculating Huffman codes. + struct node { + int value = 0; + size_t freq = 0; + + // Below values are indices into the node list + int parent = -1; + int left = -1; + int right = -1; + }; + + /** + * Builds a list of nodes for every character that appears in the given data. + * This list is sorted by increasing frequency. + * @return Compile-time allocated array of nodes + */ + consteval static auto build_node_list() noexcept { + // Build a list for counting every occuring value + auto list = std::span(new node[256] {}, 256); + for (int i = 0; i < 256; i++) + list[i].value = i; + for (usize_t i = 0; i < raw_data.size(); i++) + list[raw_data[i]].freq++; + + std::sort(list.begin(), list.end(), + [](const auto& a, const auto& b) { return a.freq < b.freq; }); + + // Filter out the non-occuring values, and build a compact list to return + auto first_valid_node = std::find_if(list.begin(), list.end(), + [](const auto& n) { return n.freq != 0; }); + auto fit_size = std::distance(first_valid_node, list.end()); + if (fit_size < 2) + fit_size = 2; + auto fit_list = std::span(new node[fit_size] {}, fit_size); + std::copy(first_valid_node, list.end(), fit_list.begin()); + delete[] list.data(); + return fit_list; + } + + /** + * Returns the count of how many nodes are in the node tree. + */ + consteval static auto tree_count() noexcept { + auto list = build_node_list(); + auto count = list.size() * 2 - 1; + delete[] list.data(); + return count; + } + + /** + * Builds a tree out of the node list, allowing for the calculation of + * Huffman codes. + * @return Compile-time allocated tree of nodes, root node at index zero. + */ + consteval static auto build_node_tree() noexcept { + auto list = build_node_list(); + auto tree = std::span(new node[tree_count()] {}, tree_count()); + + auto list_end = list.end(); // Track end of list as it shrinks + auto tree_begin = tree.end(); // Build tree from bottom + int next_parent_node_value = 0x100; // Give parent nodes unique ids + while (1) { + // Create parent node for two least-occuring values + node new_node { + next_parent_node_value++, + list[0].freq + list[1].freq, + -1, + list[0].value, + list[1].value + }; + + // Move the two nodes into the tree and remove them from the list + *--tree_begin = list[0]; + *--tree_begin = list[1]; + std::copy(list.begin() + 2, list_end--, list.begin()); + if (std::distance(list.begin(), list_end) == 1) { + list.front() = new_node; + break; + } + + // Insert the parent node back into the list + auto insertion_point = std::find_if(list.begin(), list_end - 1, + [&new_node](const auto& n) { return n.freq >= new_node.freq; }); + if (insertion_point != list_end - 1) { + *(list_end - 1) = node(); + std::copy_backward(insertion_point, list_end - 1, list_end); + } + + *insertion_point = new_node; + } + + // Connect child nodes to their parents + tree[0] = list[0]; + for (auto iter = tree.begin(); ++iter != tree.end();) { + if (iter->parent == -1) { + auto parent = std::find_if(tree.begin(), iter, + [&iter](const auto& n) { + return n.left == iter->value || n.right == iter->value; + }); + if (parent != iter) + iter->parent = std::distance(tree.begin(), parent); + } + } + + delete[] list.data(); + return tree; + } + + /** + * Determines the size of the compressed data. + * @return A pair of total bytes used, and bits used in last byte. + */ + consteval static auto compressed_size_info() noexcept { + auto tree = build_node_tree(); + size_t bytes = 1, bits = 0; + + for (usize_t i = 0; i < raw_data.size(); i++) { + auto leaf = std::find_if(tree.begin(), tree.end(), + [c = raw_data[i]](const auto& n) { return n.value == c; }); + + while (leaf->parent != -1) { + if (++bits == 8) + bits = 0, bytes++; + leaf = tree.begin() + leaf->parent; + } + } + + delete[] tree.data(); + return std::make_pair(bytes, bits); + } + + /** + * Compresses the input data, storing the result in the object instance. + */ + consteval void compress() noexcept { + auto tree = build_node_tree(); + + // Set up byte and bit count (note, we're compressing the data backwards) + auto [bytes, bits] = compressed_size_info(); + if (bits > 0) + bits = 8 - bits; + else + bits = 0, bytes--; + + // Compress data backwards, because we obtain the Huffman codes backwards + // as we traverse towards the parent node. + for (auto i = raw_data.size(); i > 0; i--) { + auto leaf = std::find_if(tree.begin(), tree.end(), + [c = raw_data[i - 1]](auto& n) { return n.value == c; }); + + while (leaf->parent != -1) { + auto parent = tree.begin() + leaf->parent; + if (parent->right == leaf->value) + compressed_data[bytes - 1] |= (1 << bits); + if (++bits == 8) + bits = 0, --bytes; + leaf = parent; + } + } + + delete[] tree.data(); + } + + /** + * Builds the decode tree, used to decompress the data. + * Format: three bytes per node. + * 1. Node value, 2. Distance to left child, 3. Distance to right child. + */ + consteval void build_decode_tree() noexcept { + auto tree = build_node_tree(); + auto decode_tree = compressed_data + compressed_size_info().first; + + for (usize_t i = 0; i < tree_count(); i++) { + // Only store node value if it represents a data value + decode_tree[i * 3] = tree[i].value <= 0xFF ? tree[i].value : 0; + + usize_t j; + // Find the left child of this node + for (j = i + 1; j < tree_count(); j++) { + if (tree[i].left == tree[j].value) + break; + } + decode_tree[i * 3 + 1] = j < tree_count() ? j - i : 0; + // Find the right child of this node + for (j = i + 1; j < tree_count(); j++) { + if (tree[i].right == tree[j].value) + break; + } + decode_tree[i * 3 + 2] = j < tree_count() ? j - i : 0; + } + + delete[] tree.data(); + } + +public: + consteval static auto compressed_size() noexcept { + return compressed_size_info().first + 3 * tree_count(); + } + consteval static auto uncompressed_size() noexcept { + return raw_data.size(); + } + consteval static size_t bytes_saved() noexcept { + size_t diff = uncompressed_size() - compressed_size(); + return diff > 0 ? diff : 0; + } + + // Utility for decoding compressed data. + class decoder { + public: + using difference_type = std::ptrdiff_t; + using value_type = int; + + decoder(const unsigned char *comp_data) noexcept + : m_data(comp_data), + m_table(comp_data + compressed_size_info().first) { get_next(); } + decoder() = default; + + constexpr static decoder end(const unsigned char *comp_data) noexcept { + decoder ender; + ender.m_data = comp_data; + if constexpr (bytes_saved() > 0) { + const auto [size_bytes, last_bits] = compressed_size_info(); + ender.m_data += size_bytes - 1; + ender.m_bit = 1 << (7 - last_bits); + } else { + ender.m_data += raw_data.size() + 1; + } + + return ender; + } + + bool operator==(const decoder& other) const noexcept { + return m_data == other.m_data && m_bit == other.m_bit; + } + auto operator*() const noexcept { + return m_current; + } + decoder& operator++() noexcept { + get_next(); + return *this; + } + decoder operator++(int) noexcept { + auto old = *this; + get_next(); + return old; + } + + private: + void get_next() noexcept { + if (*this == end(m_data)) + return; + if constexpr (bytes_saved() > 0) { + auto *node = m_table; + int data = *m_data; + auto bit = m_bit; + do { + node += (data & bit) ? node[2] * 3u : node[1] * 3u; + bit >>= 1; + if (!bit) + bit = 0x80, data = *++m_data; + } while (node[1] != 0); + m_bit = bit; + m_current = *node; + } else { + m_current = *m_data++; + } + } + + const unsigned char *m_data = nullptr; + const unsigned char *m_table = nullptr; + unsigned char m_bit = 0x80; + int m_current = -1; + + friend class huffman_compressor; + }; + + // Stick the forward_iterator check here just so it's run + consteval huffman_compressor() noexcept + requires (std::forward_iterator<decoder>) + { + if constexpr (bytes_saved() > 0) { + build_decode_tree(); + compress(); + } else { + std::copy(raw_data.data, raw_data.data + raw_data.size(), + compressed_data); + } + } + + auto begin() const noexcept { + return decoder(compressed_data); + } + auto end() const noexcept { + return decoder::end(compressed_data); + } + auto cbegin() const noexcept { return begin(); } + auto cend() const noexcept { return end(); } + + // For accessing the compressed data + auto data() const noexcept { + if constexpr (bytes_saved() > 0) + return compressed_data; + else + return raw_data; + } + + auto size() const noexcept { + if constexpr (bytes_saved() > 0) + return compressed_size(); + else + return uncompressed_size(); + } + +private: + // Contains the compressed data, followed by the decoding tree. + unsigned char compressed_data[ + bytes_saved() > 0 ? compressed_size_info().first + 3 * tree_count() + : raw_data.size()] = {0}; +}; + +template <detail::huffman_string_container hsc> +constexpr auto operator ""_huffman() +{ + return huffman_compressor<hsc>(); +} + +#endif // TCSULLIVAN_CONSTEVAL_HUFFMAN_HPP_ |