|
1 // |
|
2 // rbbirb.h |
|
3 // |
|
4 // Copyright (C) 2002-2004, International Business Machines Corporation and others. |
|
5 // All Rights Reserved. |
|
6 // |
|
7 // This file contains declarations for several classes from the |
|
8 // Rule Based Break Iterator rule builder. |
|
9 // |
|
10 |
|
11 |
|
12 #ifndef RBBIRB_H |
|
13 #define RBBIRB_H |
|
14 |
|
15 #include "unicode/utypes.h" |
|
16 #include "unicode/uobject.h" |
|
17 #include "unicode/rbbi.h" |
|
18 #include "unicode/uniset.h" |
|
19 #include "unicode/parseerr.h" |
|
20 #include "uhash.h" |
|
21 #include "uvector.h" |
|
22 #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that |
|
23 // looks up references to $variables within a set. |
|
24 |
|
25 |
|
26 |
|
27 U_NAMESPACE_BEGIN |
|
28 |
|
// Forward declarations of rule-builder types. Within this header they are
// used only through pointers, so the full definitions are not needed here.
// (RBBIRuleTableEl is declared but not otherwise referenced in this header.)
class   RBBIRuleScanner;
struct  RBBIRuleTableEl;
class   RBBISetBuilder;
class   RBBINode;
class   RBBITableBuilder;
|
34 |
|
35 |
|
36 |
|
37 //-------------------------------------------------------------------------------- |
|
38 // |
|
39 // RBBISymbolTable. Implements SymbolTable interface that is used by the |
|
40 // UnicodeSet parser to resolve references to $variables. |
|
41 // |
|
42 //-------------------------------------------------------------------------------- |
|
43 class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one |
|
44 public: // of these structs for each entry. |
|
45 RBBISymbolTableEntry(); |
|
46 UnicodeString key; |
|
47 RBBINode *val; |
|
48 ~RBBISymbolTableEntry(); |
|
49 |
|
50 private: |
|
51 RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class |
|
52 RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class |
|
53 }; |
|
54 |
|
55 |
|
56 class RBBISymbolTable : public UMemory, public SymbolTable { |
|
57 private: |
|
58 const UnicodeString &fRules; |
|
59 UHashtable *fHashTable; |
|
60 RBBIRuleScanner *fRuleScanner; |
|
61 |
|
62 // These next two fields are part of the mechanism for passing references to |
|
63 // already-constructed UnicodeSets back to the UnicodeSet constructor |
|
64 // when the pattern includes $variable references. |
|
65 const UnicodeString ffffString; // = "/uffff" |
|
66 UnicodeSet *fCachedSetLookup; |
|
67 |
|
68 public: |
|
69 // API inherited from class SymbolTable |
|
70 virtual const UnicodeString* lookup(const UnicodeString& s) const; |
|
71 virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const; |
|
72 virtual UnicodeString parseReference(const UnicodeString& text, |
|
73 ParsePosition& pos, int32_t limit) const; |
|
74 |
|
75 // Additional Functions |
|
76 RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status); |
|
77 virtual ~RBBISymbolTable(); |
|
78 |
|
79 virtual RBBINode *lookupNode(const UnicodeString &key) const; |
|
80 virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err); |
|
81 |
|
82 #ifdef RBBI_DEBUG |
|
83 virtual void rbbiSymtablePrint() const; |
|
84 #else |
|
85 // A do-nothing inline function for non-debug builds. Member funcs can't be empty |
|
86 // or the call sites won't compile. |
|
87 int fFakeField; |
|
88 #define rbbiSymtablePrint() fFakeField=0; |
|
89 #endif |
|
90 |
|
91 private: |
|
92 RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class |
|
93 RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class |
|
94 }; |
|
95 |
|
96 |
|
97 //-------------------------------------------------------------------------------- |
|
98 // |
|
99 // class RBBIRuleBuilder The top-level class handling RBBI rule compiling. |
|
100 // |
|
101 //-------------------------------------------------------------------------------- |
|
102 class RBBIRuleBuilder : public UMemory { |
|
103 public: |
|
104 |
|
105 // Create a rule based break iterator from a set of rules. |
|
106 // This function is the main entry point into the rule builder. The |
|
107 // public ICU API for creating RBBIs uses this function to do the actual work. |
|
108 // |
|
109 static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules, |
|
110 UParseError &parseError, |
|
111 UErrorCode &status); |
|
112 |
|
113 public: |
|
114 // The "public" functions and data members that appear below are accessed |
|
115 // (and shared) by the various parts that make up the rule builder. They |
|
116 // are NOT intended to be accessed by anything outside of the |
|
117 // rule builder implementation. |
|
118 RBBIRuleBuilder(const UnicodeString &rules, |
|
119 UParseError &parseErr, |
|
120 UErrorCode &status |
|
121 ); |
|
122 |
|
123 virtual ~RBBIRuleBuilder(); |
|
124 char *fDebugEnv; // controls debug trace output |
|
125 UErrorCode *fStatus; // Error reporting. Keeping status |
|
126 UParseError *fParseError; // here avoids passing it everywhere. |
|
127 const UnicodeString &fRules; // The rule string that we are compiling |
|
128 |
|
129 RBBIRuleScanner *fScanner; // The scanner. |
|
130 RBBINode *fForwardTree; // The parse trees, generated by the scanner, |
|
131 RBBINode *fReverseTree; // then manipulated by subsequent steps. |
|
132 RBBINode *fSafeFwdTree; |
|
133 RBBINode *fSafeRevTree; |
|
134 |
|
135 RBBINode **fDefaultTree; // For rules not qualified with a ! |
|
136 // the tree to which they belong to. |
|
137 |
|
138 UBool fChainRules; // True for chained Unicode TR style rules. |
|
139 // False for traditional regexp rules. |
|
140 |
|
141 UBool fLBCMNoChain; // True: suppress chaining of rules on |
|
142 // chars with LineBreak property == CM. |
|
143 |
|
144 UBool fLookAheadHardBreak; // True: Look ahead matches cause an |
|
145 // immediate break, no continuing for the |
|
146 // longest match. |
|
147 |
|
148 RBBISetBuilder *fSetBuilder; // Set and Character Category builder. |
|
149 UVector *fUSetNodes; // Vector of all uset nodes. |
|
150 |
|
151 RBBITableBuilder *fForwardTables; // State transition tables |
|
152 RBBITableBuilder *fReverseTables; |
|
153 RBBITableBuilder *fSafeFwdTables; |
|
154 RBBITableBuilder *fSafeRevTables; |
|
155 |
|
156 UVector *fRuleStatusVals; // The values that can be returned |
|
157 // from getRuleStatus(). |
|
158 |
|
159 RBBIDataHeader *flattenData(); // Create the flattened (runtime format) |
|
160 // data tables.. |
|
161 private: |
|
162 RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class |
|
163 RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class |
|
164 }; |
|
165 |
|
166 |
|
167 |
|
168 |
|
169 //---------------------------------------------------------------------------- |
|
170 // |
|
171 // RBBISetTableEl is an entry in the hash table of UnicodeSets that have |
|
172 // been encountered. The val Node will be of nodetype uset |
|
173 // and contain pointers to the actual UnicodeSets. |
|
174 // The Key is the source string for initializing the set. |
|
175 // |
|
176 // The hash table is used to avoid creating duplicate |
|
177 // unnamed (not $var references) UnicodeSets. |
|
178 // |
|
179 // Memory Management: |
|
180 // The Hash Table owns these RBBISetTableEl structs and |
|
181 // the key strings. It does NOT own the val nodes. |
|
182 // |
|
183 //---------------------------------------------------------------------------- |
|
184 struct RBBISetTableEl { |
|
185 UnicodeString *key; |
|
186 RBBINode *val; |
|
187 }; |
|
188 |
|
189 |
|
190 //---------------------------------------------------------------------------- |
|
191 // |
|
192 // RBBIDebugPrintf Printf equivalent, for debugging output. |
|
193 // Conditional compilation of the implementation lets us |
|
194 // get rid of the stdio dependency in environments where it |
|
195 // is unavailable. |
|
196 // |
|
197 //---------------------------------------------------------------------------- |
|
#ifdef RBBI_DEBUG
// Debug builds: route the debug output macros straight to stdio.
// NOTE(review): this #include sits between U_NAMESPACE_BEGIN and
//   U_NAMESPACE_END.  If U_NAMESPACE_BEGIN opens a namespace and <stdio.h>
//   has not already been included by this point, its declarations would land
//   inside that namespace.  Consider moving the #include above
//   U_NAMESPACE_BEGIN.
#include <stdio.h>
#define RBBIDebugPrintf printf
#define RBBIDebugPuts puts
#else
// Non-debug builds: RBBIDebugPuts(arg) expands to nothing, so its call sites
//   compile away.  RBBIDebugPrintf is left undefined, so every use of it must
//   itself be enclosed in #ifdef RBBI_DEBUG.
#undef RBBIDebugPrintf
#define RBBIDebugPuts(arg)
#endif
|
206 |
|
207 U_NAMESPACE_END |
|
208 #endif |
|
209 |
|
210 |
|
211 |