// Copyright (c) 2011 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "google/bdict_writer.h" #include #include #include "base/logging.h" #include "base/strings/stringprintf.h" #include "google/bdict.h" namespace hunspell { // Represents one node the word trie in memory. This does not have to be very // efficient since it is only used when building. class DicNode { public: enum StorageType { UNDEFINED, // Uninitialized storage type. LEAF, // Word with no additional string data. LEAFMORE, // Word with additional suffix following. LIST16, // List of sub-nodes with 16-bit relative offsets. LIST8, // List of sub-nodes with 8-bit relative offsets. LOOKUP32, // Lookup table with 32-bit absolute offsets. LOOKUP16, // LOokup table with 16-bit relative offsets. }; DicNode() : addition(0), storage(UNDEFINED) { } ~DicNode() { for (size_t i = 0; i < children.size(); i++) delete children[i]; } bool is_leaf() const { return children.empty(); } // When non-zero, this character is the additional level that this // node represents. This will be 0 for some leaf nodes when there is no // addition and for the root node. char addition; std::vector children; // When there are no children, this is a leaf node and this "addition string" // is appended to the result. When there are children, this will be empty. std::string leaf_addition; // For leaf nodes, this are the indices into the affix table. std::vector affix_indices; // Initially uninitialized, ComputeStorage() will fill this in with the // desired serialization method. StorageType storage; }; namespace { void SerializeTrie(const DicNode* node, std::string* output); // Returns true if the nth character in the given word is |ch|. Will return // false when there is no nth character. Note that this will also match an // implicit NULL at the end of the string. bool NthCharacterIs(const std::string& word, size_t n, char ch) { if (word.length() < n) // Want to allow n == length() to catch the NULL. return false; return word.c_str()[n] == ch; // Use c_str() to get NULL terminator. } // Recursively build the trie data structure for the range in the |words| list // in [begin, end). It is assumed that all words in that range will have the // same |node_depth - 2| characters at the beginning. This node will key off of // the |node_depth - 1| character, with a special case for the root. // // |prefix_chars| is how deep this node is in the trie (and corresponds to how // many letters of the word we will skip). The root level will have // |prefix_chars| of 0. // // The given |node| will be filled with the data. The return value is the // index into the |words| vector of the next word to process. It will be // equal to |end| when all words have been consumed. size_t BuildTrie(const BDictWriter::WordList& words, size_t begin, size_t end, size_t node_depth, DicNode* node) { // Find the prefix that this node represents. const std::string& begin_str = words[begin].first; if (begin_str.length() < node_depth) { // Singleton. node->addition = 0; node->affix_indices = words[begin].second; return begin + 1; } // Now find the range of words sharing this prefix. size_t match_count; if (node_depth == 0 && begin == 0) { // Special case the root node. match_count = end - begin; node->addition = 0; } else { match_count = 0; node->addition = begin_str[node_depth - 1]; // We know the strings should have [0, node_depth-1) characters at the // beginning already matching, so we only need to check the new one. while (begin + match_count < end && NthCharacterIs(words[begin + match_count].first, node_depth - 1, node->addition)) match_count++; } if (match_count == 1) { // Just found a leaf node with no other words sharing its prefix. Save any // remaining characters and we're done. node->affix_indices = words[begin].second; node->leaf_addition = begin_str.substr(node_depth); return begin + 1; } // We have a range of words, add them as children of this node. size_t i = begin; while (i < begin + match_count) { DicNode* cur = new DicNode; i = BuildTrie(words, i, begin + match_count, node_depth + 1, cur); node->children.push_back(cur); } return begin + match_count; } // Lookup tables are complicated. They can have a magic 0th entry not counted // in the table dimensions, and also have indices only for the used sub-range. // This function will compute the starting point and size of a lookup table, // in addition to whether it should have the magic 0th entry for the given // list of child nodes. void ComputeLookupStrategyDetails(const std::vector& children, bool* has_0th_entry, int* first_item, int* list_size) { *has_0th_entry = false; *first_item = 0; *list_size = 0; if (children.empty()) return; size_t first_offset = 0; if (children[0]->addition == 0) { *has_0th_entry = true; first_offset++; } if (children.size() == first_offset) return; *first_item = static_cast(children[first_offset]->addition); unsigned char last_item = children[children.size() - 1]->addition; *list_size = last_item - *first_item + 1; } // Recursively fills in the storage strategy for this node and each of its // children. This must be done before actually serializing because the storage // mode will depend on the size of the children. size_t ComputeTrieStorage(DicNode* node) { if (node->is_leaf()) { // The additional affix list holds affixes when there is more than one. Each // entry is two bytes, plus an additional FFFF terminator. size_t supplimentary_size = 0; if (node->affix_indices[0] > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) { // We cannot store the first affix ID of the affix list into a leaf node. // In this case, we have to store all the affix IDs and a terminator // into a supplimentary list. supplimentary_size = node->affix_indices.size() * 2 + 2; } else if (node->affix_indices.size() > 1) { // We can store the first affix ID of the affix list into a leaf node. // In this case, we need to store the remaining affix IDs and a // terminator into a supplimentary list. supplimentary_size = node->affix_indices.size() * 2; } if (node->leaf_addition.empty()) { node->storage = DicNode::LEAF; return 2 + supplimentary_size; } node->storage = DicNode::LEAFMORE; // Signature & affix (2) + null for leaf_addition (1) = 3 return 3 + node->leaf_addition.size() + supplimentary_size; } // Recursively compute the size of the children for non-leaf nodes. size_t child_size = 0; for (size_t i = 0; i < node->children.size(); i++) child_size += ComputeTrieStorage(node->children[i]); // Fixed size is only 1 byte which is the ID byte and the count combined. static const int kListHeaderSize = 1; // Lists can only store up to 16 items. static const size_t kListThreshold = 16; if (node->children.size() < kListThreshold && child_size <= 0xFF) { node->storage = DicNode::LIST8; return kListHeaderSize + node->children.size() * 2 + child_size; } if (node->children.size() < kListThreshold && child_size <= 0xFFFF) { node->storage = DicNode::LIST16; // Fixed size is one byte plus 3 for each table entry. return kListHeaderSize + node->children.size() * 3 + child_size; } static const int kTableHeaderSize = 2; // Type + table size. bool has_0th_item; int first_table_item, table_item_count; ComputeLookupStrategyDetails(node->children, &has_0th_item, &first_table_item, &table_item_count); if (child_size + kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 < 0xFFFF) { // Use 16-bit addressing since the children will fit. node->storage = DicNode::LOOKUP16; return kTableHeaderSize + (has_0th_item ? 2 : 0) + table_item_count * 2 + child_size; } // Use 32-bit addressing as a last resort. node->storage = DicNode::LOOKUP32; return kTableHeaderSize + (has_0th_item ? 4 : 0) + table_item_count * 4 + child_size; } // Serializes the given node when it is DicNode::LEAF* to the output. void SerializeLeaf(const DicNode* node, std::string* output) { // The low 6 bits of the ID byte are the high 6 bits of the first affix ID. int first_affix = node->affix_indices.size() ? node->affix_indices[0] : 0; // We may store the first value with the node or in the supplimentary list. size_t first_affix_in_supplimentary_list = 1; if (first_affix > BDict::LEAF_NODE_MAX_FIRST_AFFIX_ID) { // There are not enough bits for this value, move it to the supplimentary // list where there are more bits per value. first_affix_in_supplimentary_list = 0; first_affix = BDict::FIRST_AFFIX_IS_UNUSED; } unsigned char id_byte = (first_affix >> 8) & BDict::LEAF_NODE_FIRST_BYTE_AFFIX_MASK; // The next two bits indicates an additional string and more affixes. if (node->storage == DicNode::LEAFMORE) id_byte |= BDict::LEAF_NODE_ADDITIONAL_VALUE; if (node->affix_indices.size() > 1 || first_affix_in_supplimentary_list == 0) id_byte |= BDict::LEAF_NODE_FOLLOWING_VALUE; output->push_back(id_byte); // Following is the low 8 bits of the affix index. output->push_back(first_affix & 0xff); // Handle the optional addition with NULL terminator. if (node->storage == DicNode::LEAFMORE) { for (size_t i = 0; i < node->leaf_addition.size() + 1; i++) output->push_back(node->leaf_addition.c_str()[i]); } // Handle any following affixes. We already wrote the 0th one. if (node->affix_indices.size() > first_affix_in_supplimentary_list) { for (size_t i = first_affix_in_supplimentary_list; i < node->affix_indices.size() && i < BDict::MAX_AFFIXES_PER_WORD; i++) { output->push_back(static_cast(node->affix_indices[i] & 0xFF)); output->push_back( static_cast((node->affix_indices[i] >> 8) & 0xFF)); } // Terminator for affix list. We use 0xFFFF. output->push_back(static_cast(0xFF)); output->push_back(static_cast(0xFF)); } } // Serializes the given node when it is DicNode::LIST* to the output. void SerializeList(const DicNode* node, std::string* output) { bool is_8_bit = node->storage == DicNode::LIST8; unsigned char id_byte = BDict::LIST_NODE_TYPE_VALUE | (is_8_bit ? 0 : BDict::LIST_NODE_16BIT_VALUE); id_byte |= node->children.size(); // We assume the size is < 4 bits. output->push_back(id_byte); // Reserve enough room for the lookup table (either 2 or 3 bytes per entry). int bytes_per_entry = (is_8_bit ? 2 : 3); size_t table_begin = output->size(); output->resize(output->size() + node->children.size() * bytes_per_entry); size_t children_begin = output->size(); for (size_t i = 0; i < node->children.size(); i++) { // First is the character this entry represents. (*output)[table_begin + i * bytes_per_entry] = node->children[i]->addition; // Next is the 8- or 16-bit offset. size_t offset = output->size() - children_begin; if (is_8_bit) { DCHECK(offset <= 0xFF); (*output)[table_begin + i * bytes_per_entry + 1] = static_cast(offset & 0xFF); } else { unsigned short* output16 = reinterpret_cast( &(*output)[table_begin + i * bytes_per_entry + 1]); *output16 = static_cast(offset); } // Now append the children's data. SerializeTrie(node->children[i], output); } } // Serializes the given node when it is DicNode::LOOKUP* to the output. void SerializeLookup(const DicNode* node, std::string* output) { unsigned char id_byte = BDict::LOOKUP_NODE_TYPE_VALUE; bool has_0th_item; int first_table_item, table_item_count; ComputeLookupStrategyDetails(node->children, &has_0th_item, &first_table_item, &table_item_count); // Set the extra bits in the ID byte. bool is_32_bit = (node->storage == DicNode::LOOKUP32); if (is_32_bit) id_byte |= BDict::LOOKUP_NODE_32BIT_VALUE; if (has_0th_item) id_byte |= BDict::LOOKUP_NODE_0TH_VALUE; size_t begin_offset = output->size(); output->push_back(id_byte); output->push_back(static_cast(first_table_item)); output->push_back(static_cast(table_item_count)); // Save room for the lookup table and the optional 0th item. int bytes_per_entry = (is_32_bit ? 4 : 2); size_t zeroth_item_offset = output->size(); if (has_0th_item) output->resize(output->size() + bytes_per_entry); size_t table_begin = output->size(); output->resize(output->size() + table_item_count * bytes_per_entry); // Append the children. for (size_t i = 0; i < node->children.size(); i++) { size_t offset = output->size(); // Compute the location at which we'll store the offset of the child data. // We may be writing the magic 0th item. size_t offset_offset; if (i == 0 && has_0th_item) { offset_offset = zeroth_item_offset; } else { int table_index = static_cast(node->children[i]->addition) - first_table_item; offset_offset = table_begin + table_index * bytes_per_entry; } // Write the offset. if (is_32_bit) { // Use 32-bit absolute offsets. // FIXME(brettw) use bit cast. unsigned* offset32 = reinterpret_cast(&(*output)[offset_offset]); *offset32 = static_cast(output->size()); } else { // Use 16-bit relative offsets. unsigned short* offset16 = reinterpret_cast(&(*output)[offset_offset]); *offset16 = static_cast(output->size() - begin_offset); } SerializeTrie(node->children[i], output); } } // Recursively serializes this node and all of its children to the output. void SerializeTrie(const DicNode* node, std::string* output) { if (node->storage == DicNode::LEAF || node->storage == DicNode::LEAFMORE) { SerializeLeaf(node, output); } else if (node->storage == DicNode::LIST8 || node->storage == DicNode::LIST16) { SerializeList(node, output); } else if (node->storage == DicNode::LOOKUP16 || node->storage == DicNode::LOOKUP32) { SerializeLookup(node, output); } } // Serializes the given list of strings with 0 bytes separating them. The end // will be marked by a double-0. void SerializeStringListNullTerm(const std::vector& strings, std::string* output) { for (size_t i = 0; i < strings.size(); i++) { // Can't tolerate empty strings since the'll mark the end. if (strings[i].empty()) output->push_back(' '); else output->append(strings[i]); output->push_back(0); } output->push_back(0); } void SerializeReplacements( const std::vector< std::pair >& repl, std::string* output) { for (size_t i = 0; i < repl.size(); i++) { output->append(repl[i].first); output->push_back(0); output->append(repl[i].second); output->push_back(0); } output->push_back(0); } } // namespace BDictWriter::BDictWriter() : trie_root_(NULL) { } BDictWriter::~BDictWriter() { delete trie_root_; } void BDictWriter::SetWords(const WordList& words) { trie_root_ = new DicNode; BuildTrie(words, 0, words.size(), 0, trie_root_); } std::string BDictWriter::GetBDict() const { std::string ret; // Save room for the header. This will be populated at the end. ret.resize(sizeof(hunspell::BDict::Header)); // Serialize the affix portion. size_t aff_offset = ret.size(); SerializeAff(&ret); // Serialize the dictionary words. size_t dic_offset = ret.size(); ret.reserve(ret.size() + ComputeTrieStorage(trie_root_)); SerializeTrie(trie_root_, &ret); // Fill the header last, now that we have the data. hunspell::BDict::Header* header = reinterpret_cast(&ret[0]); header->signature = hunspell::BDict::SIGNATURE; header->major_version = hunspell::BDict::MAJOR_VERSION; header->minor_version = hunspell::BDict::MINOR_VERSION; header->aff_offset = static_cast(aff_offset); header->dic_offset = static_cast(dic_offset); // Write the MD5 digest of the affix information and the dictionary words at // the end of the BDic header. if (header->major_version >= 2) base::MD5Sum(&ret[aff_offset], ret.size() - aff_offset, &header->digest); return ret; } void BDictWriter::SerializeAff(std::string* output) const { // Reserve enough room for the header. size_t header_offset = output->size(); output->resize(output->size() + sizeof(hunspell::BDict::AffHeader)); // Write the comment. output->push_back('\n'); output->append(comment_); output->push_back('\n'); // We need a magic first AF line that lists the number of following ones. size_t affix_group_offset = output->size(); output->append(base::StringPrintf("AF %d", static_cast(affix_groups_.size()))); output->push_back(0); SerializeStringListNullTerm(affix_groups_, output); size_t affix_rule_offset = output->size(); SerializeStringListNullTerm(affix_rules_, output); size_t rep_offset = output->size(); SerializeReplacements(replacements_, output); size_t other_offset = output->size(); SerializeStringListNullTerm(other_commands_, output); // Add the header now that we know the offsets. hunspell::BDict::AffHeader* header = reinterpret_cast(&(*output)[header_offset]); header->affix_group_offset = static_cast(affix_group_offset); header->affix_rule_offset = static_cast(affix_rule_offset); header->rep_offset = static_cast(rep_offset); header->other_offset = static_cast(other_offset); } } // namespace hunspell