fontservices/textshaperplugin/IcuSource/common/brkdict.h
changeset 0 1fb32624e06b
equal deleted inserted replaced
-1:000000000000 0:1fb32624e06b
       
     1 /*
       
     2 **********************************************************************
       
     3 *   Copyright (C) 1999-2004 IBM and others. All rights reserved.
       
     4 **********************************************************************
       
     5 *   Date        Name        Description
       
     6 *   12/1/99     rtg         Ported from Java
       
     7 *   01/13/2000  helena      Added UErrorCode to ctors.
       
     8 **********************************************************************
       
     9 */
       
    10 
       
    11 #ifndef BRKDICT_H
       
    12 #define BRKDICT_H
       
    13 
       
    14 #include "unicode/utypes.h"
       
    15 #include "unicode/uobject.h"
       
    16 #include "ucmp8.h"
       
    17 
       
    18 U_NAMESPACE_BEGIN
       
    19 
       
    20 /**
       
    21  * This is the class that represents the list of known words used by
       
    22  * DictionaryBasedBreakIterator.  The conceptual data structure used
       
    23  * here is a trie: there is a node hanging off the root node for every
       
    24  * letter that can start a word.  Each of these nodes has a node hanging
       
    25  * off of it for every letter that can be the second letter of a word
       
    26  * if this node is the first letter, and so on.  The trie is represented
       
    27  * as a two-dimensional array that can be treated as a table of state
       
    28  * transitions.  Indexes are used to compress this array, taking
       
    29  * advantage of the fact that this array will always be very sparse.
       
    30  */
       
    31 class BreakDictionary : public UMemory {
       
    32     //=================================================================================
       
    33     // data members
       
    34     //=================================================================================
       
    35 private:
       
    36 
       
    37     /**
       
    38      * Maps from characters to column numbers.  The main use of this is to
       
    39      * avoid making room in the array for empty columns.
       
    40      */
       
    41     CompactByteArray* columnMap;
       
    42 
       
    43     /**
       
    44      * The number of actual columns in the table
       
    45      */
       
    46     int32_t numCols;
       
    47 
       
    48     /**
       
    49      * Columns are organized into groups of 32.  This says how many
       
    50      * column groups.  (We could calculate this, but we store the
       
    51      * value to avoid having to repeatedly calculate it.)
       
    52      */
       
    53     int32_t numColGroups;
       
    54 
       
    55     /**
       
    56      * The actual compressed state table.  Each conceptual row represents
       
    57      * a state, and the cells in it contain the row numbers of the states
       
    58      * to transition to for each possible letter.  0 is used to indicate
       
    59      * an illegal combination of letters (i.e., the error state).  The
       
    60      * table is compressed by eliminating all the unpopulated (i.e., zero)
       
    61      * cells.  Multiple conceptual rows can then be doubled up in a single
       
    62      * physical row by sliding them up and possibly shifting them to one
       
    63      * side or the other so the populated cells don't collide.  Indexes
       
    64      * are used to identify unpopulated cells and to locate populated cells.
       
    65      */
       
    66     int16_t* table;
       
    67 
       
    68     /**
       
    69      * This index maps logical row numbers to physical row numbers
       
    70      */
       
    71     int16_t* rowIndex;
       
    72 
       
    73     /**
       
    74      * A bitmap is used to tell which cells in the comceptual table are
       
    75      * populated.  This array contains all the unique bit combinations
       
    76      * in that bitmap.  If the table is more than 32 columns wide,
       
    77      * successive entries in this array are used for a single row.
       
    78      */
       
    79     int32_t* rowIndexFlags;
       
    80 
       
    81     /**
       
    82      * This index maps from a logical row number into the bitmap table above.
       
    83      * (This keeps us from storing duplicate bitmap combinations.)  Since there
       
    84      * are a lot of rows with only one populated cell, instead of wasting space
       
    85      * in the bitmap table, we just store a negative number in this index for
       
    86      * rows with one populated cell.  The absolute value of that number is
       
    87      * the column number of the populated cell.
       
    88      */
       
    89     int16_t* rowIndexFlagsIndex;
       
    90 
       
    91     /**
       
    92      * For each logical row, this index contains a constant that is added to
       
    93      * the logical column number to get the physical column number
       
    94      */
       
    95     int8_t* rowIndexShifts;
       
    96 
       
    97     //=================================================================================
       
    98     // deserialization
       
    99     //=================================================================================
       
   100 
       
   101 public:
       
   102     /**
       
   103      * Constructor.  Creates the BreakDictionary by using readDictionaryFile() to
       
   104      * load the dictionary tables from the disk.
       
   105      * @param dictionaryFilename The name of the dictionary file
       
   106      * @param status for errors if it occurs
       
   107      */
       
   108     BreakDictionary(const char* dictionaryFilename, UErrorCode& status);
       
   109 
       
   110     /**
       
   111      * Destructor.
       
   112      */
       
   113     ~BreakDictionary();
       
   114 
       
   115     /**
       
   116      * Reads the dictionary file on the disk and constructs the appropriate in-memory
       
   117      * representation.
       
   118      * @param in The given memory stream
       
   119      */
       
   120     void readDictionaryFile(const uint8_t * in);
       
   121 
       
   122     //=================================================================================
       
   123     // access to the words
       
   124     //=================================================================================
       
   125 
       
   126     /**
       
   127      * Uses the column map to map the character to a column number, then
       
   128      * passes the row and column number to the other version of at()
       
   129      * @param row The current state
       
   130      * @param ch The character whose column we're interested in
       
   131      * @return The new state to transition to
       
   132      */
       
   133     int16_t at(int32_t row, UChar ch) const;
       
   134 
       
   135     /**
       
   136      * Returns the value in the cell with the specified (logical) row and
       
   137      * column numbers.  In DictionaryBasedBreakIterator, the row number is
       
   138      * a state number, the column number is an input, and the return value
       
   139      * is the row number of the new state to transition to.  (0 is the
       
   140      * "error" state, and -1 is the "end of word" state in a dictionary)
       
   141      * @param row The row number of the current state
       
   142      * @param col The column number of the input character (0 means "not a
       
   143      * dictionary character")
       
   144      * @return The row number of the new state to transition to
       
   145      */
       
   146     int16_t at(int32_t row, int32_t col) const;
       
   147 
       
   148 private:
       
   149     /**
       
   150      * Given (logical) row and column numbers, returns true if the
       
   151      * cell in that position is populated
       
   152      * @param row The LOGICAL row number of the cell
       
   153      * @param col The PHYSICAL row number of the cell
       
   154      * @return true if the cell in that position is populated
       
   155      */
       
   156     UBool cellIsPopulated(int32_t row, int32_t col) const;
       
   157 
       
   158     /**
       
   159      * Implementation of at() when we know the specified cell is populated.
       
   160      * @param row The PHYSICAL row number of the cell
       
   161      * @param col The PHYSICAL column number of the cell
       
   162      * @return The value stored in the cell
       
   163      */
       
   164     int16_t internalAt(int32_t row, int32_t col) const;
       
   165 
       
   166     // the following methods are never meant to be called and so are not defined
       
   167     // (if you don't declare them, you get default implementations)
       
   168     BreakDictionary(const BreakDictionary& that);
       
   169     BreakDictionary& operator=(const BreakDictionary& that);
       
   170 };
       
   171 
       
   172 U_NAMESPACE_END
       
   173 
       
   174 #endif