searchengine/oss/loc/analysis/inc/private/breakiterator.inl
changeset 24 65456528cac2
equal deleted inserted replaced
23:d4d56f5e7c55 24:65456528cac2
       
     1 /*
       
     2 * Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 namespace analysis {
       
    18 	
       
    19 	template<class Encoding>
       
    20 	StateMachineBreakIterator<Encoding>::StateMachineBreakIterator(
       
    21 		StateMachine<Encoding>& machine)
       
    22 	:	machine_( machine ),
       
    23 		state_(), 
       
    24 		current_(-1),
       
    25 		next_(-1), 
       
    26 		text_(0)
       
    27 	{
       
    28 	}
       
    29 	
       
    30 	template<class Encoding>
       
    31 	StateMachineBreakIterator<Encoding>::~StateMachineBreakIterator() {}
       
    32 	
       
    33 	template<class Encoding>
       
    34 	void StateMachineBreakIterator<Encoding>::setText(const wchar_t* text) 
       
    35 	{
       
    36 		// Let's point to the begining of new text
       
    37 		text_ = text;
       
    38 		cursor_ = 0; 
       
    39 		
       
    40 		// First boundary is in the beginning of buffer 
       
    41 		current_ = 0; 
       
    42 		// We haven't searched for next boundary yet
       
    43 		next_ = -1; 
       
    44 	}
       
    45 	
       
    46 	template<class Encoding>
       
    47 	bool StateMachineBreakIterator<Encoding>::hasNext()
       
    48 	{
       
    49 		prepareNext(); 
       
    50 		return next_ != -1; 
       
    51 	}
       
    52 	
       
    53 	template<class Encoding>
       
    54 	int StateMachineBreakIterator<Encoding>::current()
       
    55 	{
       
    56 		return current_; 
       
    57 	}
       
    58 	
       
    59 	template<class Encoding>
       
    60 	int StateMachineBreakIterator<Encoding>::next()
       
    61 	{
       
    62 		prepareNext();
       
    63 		current_ = next_; 
       
    64 		next_ = -1; 
       
    65 		return current_;
       
    66 	}
       
    67 	
       
    68 	template<class Encoding>
       
    69 	void StateMachineBreakIterator<Encoding>::prepareNext()
       
    70 	{
       
    71 		// Implements longest matching word algorithm. The used 
       
    72 		// state machine contains an entire dictionary. Each state
       
    73 		// transition interprets as incremental search in dictionary. 
       
    74 		// Each final (or terminal) state, marks location, where the 
       
    75 		// consumed states form a valid word. We try to find the 
       
    76 		// longest matching word. 
       
    77 		// 
       
    78 		
       
    79 		// Prepare next_ only, if new next_ hasn't been prepared before 
       
    80 		if (next_ == -1 && text_ && text_[cursor_]) {
       
    81 			// Reset state machine
       
    82 			machine_.rootState(state_);
       
    83 
       
    84 			// lastBreak points to the end of last recognized word
       
    85 			int lastBreak = -1;  
       
    86 			// Continue until EOF
       
    87 			while (text_[cursor_]) {
       
    88 				// Feed next character to the state machine 
       
    89 				// and try to transit the state
       
    90 				if (!state_.next(text_[cursor_++])) {
       
    91 					// Check last final state
       
    92 					if (lastBreak != -1) { 
       
    93 						// Final state marked a valid word
       
    94 						// This is word boundary we were lookign
       
    95 						cursor_ = lastBreak; 
       
    96 					}
       
    97 					break;
       
    98 				} else if (state_.isFinal()) {
       
    99 					// Found a valid word! Mark the location 
       
   100 					lastBreak = cursor_; 
       
   101 					// Still, continue and try to find even a longer word
       
   102 				}
       
   103 			}
       
   104 			next_ = cursor_;
       
   105 		}
       
   106 	}
       
   107 
       
   108 }