|
1 /* |
|
2 ********************************************************************** |
|
3 * Copyright (C) 1999-2004 IBM and others. All rights reserved. |
|
4 ********************************************************************** |
|
5 * Date Name Description |
|
6 * 12/1/99 rtg Ported from Java |
|
7 * 01/13/2000 helena Added UErrorCode to ctors. |
|
8 ********************************************************************** |
|
9 */ |
|
10 |
|
11 #ifndef BRKDICT_H |
|
12 #define BRKDICT_H |
|
13 |
|
14 #include "unicode/utypes.h" |
|
15 #include "unicode/uobject.h" |
|
16 #include "ucmp8.h" |
|
17 |
|
18 U_NAMESPACE_BEGIN |
|
19 |
|
20 /** |
|
21 * This is the class that represents the list of known words used by |
|
22 * DictionaryBasedBreakIterator. The conceptual data structure used |
|
23 * here is a trie: there is a node hanging off the root node for every |
|
24 * letter that can start a word. Each of these nodes has a node hanging |
|
25 * off of it for every letter that can be the second letter of a word |
|
26 * if this node is the first letter, and so on. The trie is represented |
|
27 * as a two-dimensional array that can be treated as a table of state |
|
28 * transitions. Indexes are used to compress this array, taking |
|
29 * advantage of the fact that this array will always be very sparse. |
|
30 */ |
|
31 class BreakDictionary : public UMemory { |
|
32 //================================================================================= |
|
33 // data members |
|
34 //================================================================================= |
|
35 private: |
|
36 |
|
37 /** |
|
38 * Maps from characters to column numbers. The main use of this is to |
|
39 * avoid making room in the array for empty columns. |
|
40 */ |
|
41 CompactByteArray* columnMap; |
|
42 |
|
43 /** |
|
44 * The number of actual columns in the table |
|
45 */ |
|
46 int32_t numCols; |
|
47 |
|
48 /** |
|
49 * Columns are organized into groups of 32. This says how many |
|
50 * column groups. (We could calculate this, but we store the |
|
51 * value to avoid having to repeatedly calculate it.) |
|
52 */ |
|
53 int32_t numColGroups; |
|
54 |
|
55 /** |
|
56 * The actual compressed state table. Each conceptual row represents |
|
57 * a state, and the cells in it contain the row numbers of the states |
|
58 * to transition to for each possible letter. 0 is used to indicate |
|
59 * an illegal combination of letters (i.e., the error state). The |
|
60 * table is compressed by eliminating all the unpopulated (i.e., zero) |
|
61 * cells. Multiple conceptual rows can then be doubled up in a single |
|
62 * physical row by sliding them up and possibly shifting them to one |
|
63 * side or the other so the populated cells don't collide. Indexes |
|
64 * are used to identify unpopulated cells and to locate populated cells. |
|
65 */ |
|
66 int16_t* table; |
|
67 |
|
68 /** |
|
69 * This index maps logical row numbers to physical row numbers |
|
70 */ |
|
71 int16_t* rowIndex; |
|
72 |
|
73 /** |
|
74 * A bitmap is used to tell which cells in the comceptual table are |
|
75 * populated. This array contains all the unique bit combinations |
|
76 * in that bitmap. If the table is more than 32 columns wide, |
|
77 * successive entries in this array are used for a single row. |
|
78 */ |
|
79 int32_t* rowIndexFlags; |
|
80 |
|
81 /** |
|
82 * This index maps from a logical row number into the bitmap table above. |
|
83 * (This keeps us from storing duplicate bitmap combinations.) Since there |
|
84 * are a lot of rows with only one populated cell, instead of wasting space |
|
85 * in the bitmap table, we just store a negative number in this index for |
|
86 * rows with one populated cell. The absolute value of that number is |
|
87 * the column number of the populated cell. |
|
88 */ |
|
89 int16_t* rowIndexFlagsIndex; |
|
90 |
|
91 /** |
|
92 * For each logical row, this index contains a constant that is added to |
|
93 * the logical column number to get the physical column number |
|
94 */ |
|
95 int8_t* rowIndexShifts; |
|
96 |
|
97 //================================================================================= |
|
98 // deserialization |
|
99 //================================================================================= |
|
100 |
|
101 public: |
|
102 /** |
|
103 * Constructor. Creates the BreakDictionary by using readDictionaryFile() to |
|
104 * load the dictionary tables from the disk. |
|
105 * @param dictionaryFilename The name of the dictionary file |
|
106 * @param status for errors if it occurs |
|
107 */ |
|
108 BreakDictionary(const char* dictionaryFilename, UErrorCode& status); |
|
109 |
|
110 /** |
|
111 * Destructor. |
|
112 */ |
|
113 ~BreakDictionary(); |
|
114 |
|
115 /** |
|
116 * Reads the dictionary file on the disk and constructs the appropriate in-memory |
|
117 * representation. |
|
118 * @param in The given memory stream |
|
119 */ |
|
120 void readDictionaryFile(const uint8_t * in); |
|
121 |
|
122 //================================================================================= |
|
123 // access to the words |
|
124 //================================================================================= |
|
125 |
|
126 /** |
|
127 * Uses the column map to map the character to a column number, then |
|
128 * passes the row and column number to the other version of at() |
|
129 * @param row The current state |
|
130 * @param ch The character whose column we're interested in |
|
131 * @return The new state to transition to |
|
132 */ |
|
133 int16_t at(int32_t row, UChar ch) const; |
|
134 |
|
135 /** |
|
136 * Returns the value in the cell with the specified (logical) row and |
|
137 * column numbers. In DictionaryBasedBreakIterator, the row number is |
|
138 * a state number, the column number is an input, and the return value |
|
139 * is the row number of the new state to transition to. (0 is the |
|
140 * "error" state, and -1 is the "end of word" state in a dictionary) |
|
141 * @param row The row number of the current state |
|
142 * @param col The column number of the input character (0 means "not a |
|
143 * dictionary character") |
|
144 * @return The row number of the new state to transition to |
|
145 */ |
|
146 int16_t at(int32_t row, int32_t col) const; |
|
147 |
|
148 private: |
|
149 /** |
|
150 * Given (logical) row and column numbers, returns true if the |
|
151 * cell in that position is populated |
|
152 * @param row The LOGICAL row number of the cell |
|
153 * @param col The PHYSICAL row number of the cell |
|
154 * @return true if the cell in that position is populated |
|
155 */ |
|
156 UBool cellIsPopulated(int32_t row, int32_t col) const; |
|
157 |
|
158 /** |
|
159 * Implementation of at() when we know the specified cell is populated. |
|
160 * @param row The PHYSICAL row number of the cell |
|
161 * @param col The PHYSICAL column number of the cell |
|
162 * @return The value stored in the cell |
|
163 */ |
|
164 int16_t internalAt(int32_t row, int32_t col) const; |
|
165 |
|
166 // the following methods are never meant to be called and so are not defined |
|
167 // (if you don't declare them, you get default implementations) |
|
168 BreakDictionary(const BreakDictionary& that); |
|
169 BreakDictionary& operator=(const BreakDictionary& that); |
|
170 }; |
|
171 |
|
172 U_NAMESPACE_END |
|
173 |
|
174 #endif |