|
1 // |
|
2 // rbbisetb.h |
|
3 /* |
|
4 ********************************************************************** |
|
5 * Copyright (c) 2001-2005, International Business Machines |
|
6 * Corporation and others. All Rights Reserved. |
|
7 ********************************************************************** |
|
8 */ |
|
9 |
|
10 #ifndef RBBISETB_H |
|
11 #define RBBISETB_H |
|
12 |
|
13 #include "unicode/utypes.h" |
|
14 #include "unicode/uobject.h" |
|
15 #include "rbbirb.h" |
|
16 #include "uvector.h" |
|
17 |
|
18 struct UNewTrie; |
|
19 |
|
20 U_NAMESPACE_BEGIN |
|
21 |
|
22 // |
|
23 // RBBISetBuilder Derives the character categories used by the runtime RBBI engine |
|
24 // from the Unicode Sets appearing in the source RBBI rules, and |
|
25 // creates the TRIE table used to map from Unicode to the |
|
26 // character categories. |
|
27 // |
|
28 |
|
29 |
|
30 // |
|
31 // RangeDescriptor |
|
32 // |
|
33 // Each of the non-overlapping character ranges gets one of these descriptors. |
|
34 // All of them are strung together in a linked list, which is kept in order |
|
35 // (by character) |
|
36 // |
|
37 class RangeDescriptor : public UMemory { |
|
38 public: |
|
39 UChar32 fStartChar; // Start of range, Unicode 32 bit value. |
|
40 UChar32 fEndChar; // End of range, Unicode 32 bit value (presumably inclusive - TODO confirm). |
|
41 int32_t fNum; // Runtime-mapped input value for this range. |
|
42 UVector *fIncludesSets; // Vector of the original |
|
43 // Unicode sets that include this range. |
|
44 // (Contains ptrs to uset nodes) |
|
45 RangeDescriptor *fNext; // Next RangeDescriptor in the linked list. |
|
46 |
|
47 RangeDescriptor(UErrorCode &status); |
|
48 RangeDescriptor(const RangeDescriptor &other, UErrorCode &status); |
|
49 ~RangeDescriptor(); |
|
50 void split(UChar32 where, UErrorCode &status); // Split this range in two at "where", with |
|
51 // "where" appearing in the second (higher) part. |
|
52 void setDictionaryFlag(); // Check whether this range appears as part of |
|
53 // the Unicode set named "dictionary". |
|
54 |
|
55 private: |
|
56 RangeDescriptor(const RangeDescriptor &other); // Declared private to forbid copying of this class. |
|
57 RangeDescriptor &operator=(const RangeDescriptor &other); // Declared private to forbid assignment of this class. |
|
58 }; |
|
59 |
|
60 |
|
61 // |
|
62 // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules. |
|
63 // |
|
64 // Starting with the rules parse tree from the scanner, |
|
65 // |
|
66 // - Enumerate the set of UnicodeSets that are referenced |
|
67 // by the RBBI rules. |
|
68 // - compute a derived set of non-overlapping UnicodeSets |
|
69 // that will correspond to columns in the state table for |
|
70 // the RBBI execution engine. |
|
71 // - construct the trie table that maps input characters |
|
72 // to set numbers in the non-overlapping set of sets. |
|
73 // |
|
74 |
|
75 |
|
76 class RBBISetBuilder : public UMemory { |
|
77 public: |
|
78 RBBISetBuilder(RBBIRuleBuilder *rb); |
|
79 ~RBBISetBuilder(); |
|
80 |
|
81 void build(); |
|
82 void addValToSets(UVector *sets, uint32_t val); |
|
83 void addValToSet (RBBINode *usetNode, uint32_t val); |
|
84 int32_t getNumCharCategories() const; // CharCategories are the same as the input symbol set of the |
|
85 // runtime state machine, which in turn are the same as the |
|
86 // columns in the DFA state table. |
|
87 int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie. NOTE(review): returns int32_t while fTrieSize is uint32_t - confirm the conversion is intended. |
|
88 void serializeTrie(uint8_t *where); // Write out the serialized Trie (presumably into the buffer at "where" - TODO confirm). |
|
89 UChar32 getFirstChar(int32_t val) const; |
|
90 #ifdef RBBI_DEBUG |
|
91 void printSets(); |
|
92 void printRanges(); |
|
93 void printRangeGroups(); |
|
// Without RBBI_DEBUG the three print functions are replaced by function-like macros that
// expand to nothing, so calls to them compile away. Being preprocessor macros, they are not
// scoped to this class and remain defined for all code that follows this header.
94 #else |
|
95 #define printSets() |
|
96 #define printRanges() |
|
97 #define printRangeGroups() |
|
98 #endif |
|
99 |
|
100 private: |
|
101 void numberSets(); |
|
102 |
|
103 RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us. |
|
104 UErrorCode *fStatus; |
|
105 |
|
106 RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors. |
|
107 |
|
108 UNewTrie *fTrie; // The mapping TRIE that is the end result of processing the Unicode Sets. |
|
109 uint32_t fTrieSize; // Presumably the size reported by getTrieSize() - TODO confirm against the implementation. |
|
110 |
|
111 // Groups correspond to character categories - |
|
112 // groups of ranges that are in the same original UnicodeSets. |
|
113 // fGroupCount is the index of the last used group. |
|
114 // fGroupCount+1 is also the number of columns in the RBBI state table being compiled. |
|
115 // State table column 0 is not used. Column 1 is for end-of-input. |
|
116 // Column 2 is for group 0. Funny counting. NOTE(review): if column 2 holds group 0, the column count would exceed fGroupCount+1 - verify against numberSets() and getNumCharCategories(). |
|
117 int32_t fGroupCount; |
|
118 |
|
119 RBBISetBuilder(const RBBISetBuilder &other); // Declared private to forbid copying of this class. |
|
120 RBBISetBuilder &operator=(const RBBISetBuilder &other); // Declared private to forbid assignment of this class. |
|
121 }; |
|
122 |
|
123 |
|
124 |
|
125 U_NAMESPACE_END |
|
126 #endif |