/*
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
* All rights reserved.
* This component and the accompanying materials are made available
* under the terms of "Eclipse Public License v1.0"
* which accompanies this distribution, and is available
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
*
* Initial Contributors:
* Nokia Corporation - initial contribution.
*
* Contributors:
*
* Description:
*
*/
17 namespace analysis { |
|
18 |
|
19 template<class Encoding> |
|
20 StateMachineBreakIterator<Encoding>::StateMachineBreakIterator( |
|
21 StateMachine<Encoding>& machine) |
|
22 : machine_( machine ), |
|
23 state_(), |
|
24 current_(-1), |
|
25 next_(-1), |
|
26 text_(0) |
|
27 { |
|
28 } |
|
29 |
|
30 template<class Encoding> |
|
31 StateMachineBreakIterator<Encoding>::~StateMachineBreakIterator() {} |
|
32 |
|
33 template<class Encoding> |
|
34 void StateMachineBreakIterator<Encoding>::setText(const wchar_t* text) |
|
35 { |
|
36 // Let's point to the begining of new text |
|
37 text_ = text; |
|
38 cursor_ = 0; |
|
39 |
|
40 // First boundary is in the beginning of buffer |
|
41 current_ = 0; |
|
42 // We haven't searched for next boundary yet |
|
43 next_ = -1; |
|
44 } |
|
45 |
|
46 template<class Encoding> |
|
47 bool StateMachineBreakIterator<Encoding>::hasNext() |
|
48 { |
|
49 prepareNext(); |
|
50 return next_ != -1; |
|
51 } |
|
52 |
|
53 template<class Encoding> |
|
54 int StateMachineBreakIterator<Encoding>::current() |
|
55 { |
|
56 return current_; |
|
57 } |
|
58 |
|
59 template<class Encoding> |
|
60 int StateMachineBreakIterator<Encoding>::next() |
|
61 { |
|
62 prepareNext(); |
|
63 current_ = next_; |
|
64 next_ = -1; |
|
65 return current_; |
|
66 } |
|
67 |
|
68 template<class Encoding> |
|
69 void StateMachineBreakIterator<Encoding>::prepareNext() |
|
70 { |
|
71 // Implements longest matching word algorithm. The used |
|
72 // state machine contains an entire dictionary. Each state |
|
73 // transition interprets as incremental search in dictionary. |
|
74 // Each final (or terminal) state, marks location, where the |
|
75 // consumed states form a valid word. We try to find the |
|
76 // longest matching word. |
|
77 // |
|
78 |
|
79 // Prepare next_ only, if new next_ hasn't been prepared before |
|
80 if (next_ == -1 && text_ && text_[cursor_]) { |
|
81 // Reset state machine |
|
82 machine_.rootState(state_); |
|
83 |
|
84 // lastBreak points to the end of last recognized word |
|
85 int lastBreak = -1; |
|
86 // Continue until EOF |
|
87 while (text_[cursor_]) { |
|
88 // Feed next character to the state machine |
|
89 // and try to transit the state |
|
90 if (!state_.next(text_[cursor_++])) { |
|
91 // Check last final state |
|
92 if (lastBreak != -1) { |
|
93 // Final state marked a valid word |
|
94 // This is word boundary we were lookign |
|
95 cursor_ = lastBreak; |
|
96 } |
|
97 break; |
|
98 } else if (state_.isFinal()) { |
|
99 // Found a valid word! Mark the location |
|
100 lastBreak = cursor_; |
|
101 // Still, continue and try to find even a longer word |
|
102 } |
|
103 } |
|
104 next_ = cursor_; |
|
105 } |
|
106 } |
|
107 |
|
108 } |