searchengine/oss/loc/analysis/inc/private/breakiterator.inl
changeset 10 afe194b6b1cd
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/searchengine/oss/loc/analysis/inc/private/breakiterator.inl	Tue Jul 06 15:30:04 2010 +0300
@@ -0,0 +1,108 @@
+/*
+* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
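+* Description: Inline implementation of StateMachineBreakIterator, a break iterator that locates boundaries by longest-match lookup in a state machine.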
+*
+*/
+namespace analysis {
+	
+	// Constructs a break iterator driven by the given state machine.
+	//
+	// The iterator starts without any text attached (text_ is null) and
+	// with both the current boundary and the prepared lookahead boundary
+	// set to -1. Call setText() before iterating.
+	//
+	// machine - state machine handed to machine_; prepareNext() asks it
+	//           for the root state each time a new search starts.
+	template<class Encoding>
+	StateMachineBreakIterator<Encoding>::StateMachineBreakIterator(
+		StateMachine<Encoding>& machine)
+	:	machine_( machine ),
+		state_(), 
+		current_(-1),
+		next_(-1), 
+		text_(0)
+	{
+		// cursor_ is not covered by the initializer list above. Give it a
+		// defined value so it never holds an indeterminate value, even if
+		// the iterator is used before the first setText() call.
+		cursor_ = 0;
+	}
+	
+	// Destructor. The body is empty: no explicit cleanup is performed here.
+	template<class Encoding>
+	StateMachineBreakIterator<Encoding>::~StateMachineBreakIterator()
+	{
+	}
+	
+	// Attaches a new zero-terminated text buffer and rewinds the iterator.
+	//
+	// Only the pointer is stored, the characters are not copied, so the
+	// buffer has to stay valid for as long as the iterator reads from it.
+	//
+	// text - buffer to iterate over; a null pointer leaves the iterator
+	//        without text, in which case prepareNext() does nothing.
+	template<class Encoding>
+	void StateMachineBreakIterator<Encoding>::setText(const wchar_t* text)
+	{
+		// No lookahead boundary has been computed for the new text yet
+		next_ = -1;
+		// The start of the buffer counts as the first boundary
+		current_ = 0;
+
+		// Scanning restarts at the first character of the new buffer
+		cursor_ = 0;
+		text_ = text;
+	}
+	
+	// Tells whether another boundary is available.
+	//
+	// The lookahead boundary is computed on demand by prepareNext(); it
+	// stays at -1 when there is no text or the cursor has reached the
+	// terminating zero.
+	//
+	// Returns true if a following boundary exists, false otherwise.
+	template<class Encoding>
+	bool StateMachineBreakIterator<Encoding>::hasNext()
+	{
+		prepareNext();
+		const bool exhausted = ( next_ == -1 );
+		return !exhausted;
+	}
+	
+	// Returns the most recently reported boundary without advancing.
+	//
+	// The value is 0 right after setText(), and -1 before any text has
+	// been set or after next() has run out of boundaries.
+	template<class Encoding>
+	int StateMachineBreakIterator<Encoding>::current()
+	{
+		return this->current_;
+	}
+	
+	// Advances to the following boundary and returns it.
+	//
+	// The lookahead boundary is computed if necessary, becomes the
+	// current boundary, and the lookahead slot is cleared so that the
+	// next call searches again.
+	//
+	// Returns the new current boundary, or -1 when no boundary is left.
+	template<class Encoding>
+	int StateMachineBreakIterator<Encoding>::next()
+	{
+		prepareNext();
+
+		// Consume the lookahead
+		current_ = next_;
+		next_ = -1;
+
+		return current();
+	}
+	
+	// Computes the lookahead boundary next_ if it has not been computed
+	// yet and there is still text left to scan.
+	//
+	// Implements the longest matching word algorithm. The state machine
+	// contains an entire dictionary, and each state transition acts as
+	// one step of an incremental dictionary search. Each final (or
+	// terminal) state marks a location where the characters consumed so
+	// far form a valid word. The longest such word wins.
+	//
+	// Outcome, starting from cursor_:
+	//  - if at least one final state was reached, the boundary is placed
+	//    at the end of the longest recognized word, whether the search
+	//    stopped on a failed transition or on the end of the text;
+	//  - otherwise the boundary is placed after the last character fed
+	//    to the machine (the one whose transition failed, or the last
+	//    character of the text).
+	// In every case cursor_ moves forward by at least one character, and
+	// both cursor_ and next_ end up on the new boundary.
+	template<class Encoding>
+	void StateMachineBreakIterator<Encoding>::prepareNext()
+	{
+		// Nothing to do if a boundary is already prepared, no text is
+		// attached, or the cursor sits on the terminating zero
+		if (next_ != -1 || !text_ || !text_[cursor_]) {
+			return;
+		}
+
+		// Reset state machine
+		machine_.rootState(state_);
+
+		// lastBreak points to the end of the last recognized word
+		int lastBreak = -1;
+		// Continue until EOF
+		while (text_[cursor_]) {
+			// Feed next character to the state machine
+			// and try to transit the state
+			if (!state_.next(text_[cursor_++])) {
+				// No word in the dictionary continues this way
+				break;
+			}
+			if (state_.isFinal()) {
+				// Found a valid word! Mark the location, but continue
+				// and try to find an even longer word
+				lastBreak = cursor_;
+			}
+		}
+
+		// Back off to the end of the longest recognized word. This is
+		// done after the loop so that running into the end of the text
+		// in the middle of a longer dictionary entry is handled the
+		// same way as a failed transition.
+		if (lastBreak != -1) {
+			// This is the word boundary we were looking for
+			cursor_ = lastBreak;
+		}
+		next_ = cursor_;
+	}
+
+}