diff -r 000000000000 -r 1fb32624e06b fontservices/textshaperplugin/IcuSource/common/brkdict.h --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fontservices/textshaperplugin/IcuSource/common/brkdict.h Tue Feb 02 02:02:46 2010 +0200 @@ -0,0 +1,174 @@ +/* +********************************************************************** +* Copyright (C) 1999-2004 IBM and others. All rights reserved. +********************************************************************** +* Date Name Description +* 12/1/99 rtg Ported from Java +* 01/13/2000 helena Added UErrorCode to ctors. +********************************************************************** +*/ + +#ifndef BRKDICT_H +#define BRKDICT_H + +#include "unicode/utypes.h" +#include "unicode/uobject.h" +#include "ucmp8.h" + +U_NAMESPACE_BEGIN + +/** + * This is the class that represents the list of known words used by + * DictionaryBasedBreakIterator. The conceptual data structure used + * here is a trie: there is a node hanging off the root node for every + * letter that can start a word. Each of these nodes has a node hanging + * off of it for every letter that can be the second letter of a word + * if this node is the first letter, and so on. The trie is represented + * as a two-dimensional array that can be treated as a table of state + * transitions. Indexes are used to compress this array, taking + * advantage of the fact that this array will always be very sparse. + */ +class BreakDictionary : public UMemory { + //================================================================================= + // data members + //================================================================================= +private: + + /** + * Maps from characters to column numbers. The main use of this is to + * avoid making room in the array for empty columns. + */ + CompactByteArray* columnMap; + + /** + * The number of actual columns in the table + */ + int32_t numCols; + + /** + * Columns are organized into groups of 32. This says how many + * column groups. (We could calculate this, but we store the + * value to avoid having to repeatedly calculate it.) + */ + int32_t numColGroups; + + /** + * The actual compressed state table. Each conceptual row represents + * a state, and the cells in it contain the row numbers of the states + * to transition to for each possible letter. 0 is used to indicate + * an illegal combination of letters (i.e., the error state). The + * table is compressed by eliminating all the unpopulated (i.e., zero) + * cells. Multiple conceptual rows can then be doubled up in a single + * physical row by sliding them up and possibly shifting them to one + * side or the other so the populated cells don't collide. Indexes + * are used to identify unpopulated cells and to locate populated cells. + */ + int16_t* table; + + /** + * This index maps logical row numbers to physical row numbers + */ + int16_t* rowIndex; + + /** + * A bitmap is used to tell which cells in the comceptual table are + * populated. This array contains all the unique bit combinations + * in that bitmap. If the table is more than 32 columns wide, + * successive entries in this array are used for a single row. + */ + int32_t* rowIndexFlags; + + /** + * This index maps from a logical row number into the bitmap table above. + * (This keeps us from storing duplicate bitmap combinations.) Since there + * are a lot of rows with only one populated cell, instead of wasting space + * in the bitmap table, we just store a negative number in this index for + * rows with one populated cell. The absolute value of that number is + * the column number of the populated cell. + */ + int16_t* rowIndexFlagsIndex; + + /** + * For each logical row, this index contains a constant that is added to + * the logical column number to get the physical column number + */ + int8_t* rowIndexShifts; + + //================================================================================= + // deserialization + //================================================================================= + +public: + /** + * Constructor. Creates the BreakDictionary by using readDictionaryFile() to + * load the dictionary tables from the disk. + * @param dictionaryFilename The name of the dictionary file + * @param status for errors if it occurs + */ + BreakDictionary(const char* dictionaryFilename, UErrorCode& status); + + /** + * Destructor. + */ + ~BreakDictionary(); + + /** + * Reads the dictionary file on the disk and constructs the appropriate in-memory + * representation. + * @param in The given memory stream + */ + void readDictionaryFile(const uint8_t * in); + + //================================================================================= + // access to the words + //================================================================================= + + /** + * Uses the column map to map the character to a column number, then + * passes the row and column number to the other version of at() + * @param row The current state + * @param ch The character whose column we're interested in + * @return The new state to transition to + */ + int16_t at(int32_t row, UChar ch) const; + + /** + * Returns the value in the cell with the specified (logical) row and + * column numbers. In DictionaryBasedBreakIterator, the row number is + * a state number, the column number is an input, and the return value + * is the row number of the new state to transition to. (0 is the + * "error" state, and -1 is the "end of word" state in a dictionary) + * @param row The row number of the current state + * @param col The column number of the input character (0 means "not a + * dictionary character") + * @return The row number of the new state to transition to + */ + int16_t at(int32_t row, int32_t col) const; + +private: + /** + * Given (logical) row and column numbers, returns true if the + * cell in that position is populated + * @param row The LOGICAL row number of the cell + * @param col The PHYSICAL row number of the cell + * @return true if the cell in that position is populated + */ + UBool cellIsPopulated(int32_t row, int32_t col) const; + + /** + * Implementation of at() when we know the specified cell is populated. + * @param row The PHYSICAL row number of the cell + * @param col The PHYSICAL column number of the cell + * @return The value stored in the cell + */ + int16_t internalAt(int32_t row, int32_t col) const; + + // the following methods are never meant to be called and so are not defined + // (if you don't declare them, you get default implementations) + BreakDictionary(const BreakDictionary& that); + BreakDictionary& operator=(const BreakDictionary& that); +}; + +U_NAMESPACE_END + +#endif