searchengine/oss/loc/analysis/inc/private/breakiterator.inl
author hgs
Fri, 15 Oct 2010 12:09:28 +0530
changeset 24 65456528cac2
permissions -rw-r--r--
201041
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
24
hgs
parents:
diff changeset
     1
/*
hgs
parents:
diff changeset
     2
* Copyright (c) 2010 Nokia Corporation and/or its subsidiary(-ies).
hgs
parents:
diff changeset
     3
* All rights reserved.
hgs
parents:
diff changeset
     4
* This component and the accompanying materials are made available
hgs
parents:
diff changeset
     5
* under the terms of "Eclipse Public License v1.0"
hgs
parents:
diff changeset
     6
* which accompanies this distribution, and is available
hgs
parents:
diff changeset
     7
* at the URL "http://www.eclipse.org/legal/epl-v10.html".
hgs
parents:
diff changeset
     8
*
hgs
parents:
diff changeset
     9
* Initial Contributors:
hgs
parents:
diff changeset
    10
* Nokia Corporation - initial contribution.
hgs
parents:
diff changeset
    11
*
hgs
parents:
diff changeset
    12
* Contributors:
hgs
parents:
diff changeset
    13
*
hgs
parents:
diff changeset
    14
* Description: 
hgs
parents:
diff changeset
    15
*
hgs
parents:
diff changeset
    16
*/
hgs
parents:
diff changeset
    17
namespace analysis {
hgs
parents:
diff changeset
    18
	
hgs
parents:
diff changeset
    19
	template<class Encoding>
hgs
parents:
diff changeset
    20
	StateMachineBreakIterator<Encoding>::StateMachineBreakIterator(
hgs
parents:
diff changeset
    21
		StateMachine<Encoding>& machine)
hgs
parents:
diff changeset
    22
	:	machine_( machine ),
hgs
parents:
diff changeset
    23
		state_(), 
hgs
parents:
diff changeset
    24
		current_(-1),
hgs
parents:
diff changeset
    25
		next_(-1), 
hgs
parents:
diff changeset
    26
		text_(0)
hgs
parents:
diff changeset
    27
	{
hgs
parents:
diff changeset
    28
	}
hgs
parents:
diff changeset
    29
	
hgs
parents:
diff changeset
    30
	template<class Encoding>
hgs
parents:
diff changeset
    31
	StateMachineBreakIterator<Encoding>::~StateMachineBreakIterator() {}
hgs
parents:
diff changeset
    32
	
hgs
parents:
diff changeset
    33
	template<class Encoding>
hgs
parents:
diff changeset
    34
	void StateMachineBreakIterator<Encoding>::setText(const wchar_t* text) 
hgs
parents:
diff changeset
    35
	{
hgs
parents:
diff changeset
    36
		// Let's point to the begining of new text
hgs
parents:
diff changeset
    37
		text_ = text;
hgs
parents:
diff changeset
    38
		cursor_ = 0; 
hgs
parents:
diff changeset
    39
		
hgs
parents:
diff changeset
    40
		// First boundary is in the beginning of buffer 
hgs
parents:
diff changeset
    41
		current_ = 0; 
hgs
parents:
diff changeset
    42
		// We haven't searched for next boundary yet
hgs
parents:
diff changeset
    43
		next_ = -1; 
hgs
parents:
diff changeset
    44
	}
hgs
parents:
diff changeset
    45
	
hgs
parents:
diff changeset
    46
	template<class Encoding>
hgs
parents:
diff changeset
    47
	bool StateMachineBreakIterator<Encoding>::hasNext()
hgs
parents:
diff changeset
    48
	{
hgs
parents:
diff changeset
    49
		prepareNext(); 
hgs
parents:
diff changeset
    50
		return next_ != -1; 
hgs
parents:
diff changeset
    51
	}
hgs
parents:
diff changeset
    52
	
hgs
parents:
diff changeset
    53
	template<class Encoding>
hgs
parents:
diff changeset
    54
	int StateMachineBreakIterator<Encoding>::current()
hgs
parents:
diff changeset
    55
	{
hgs
parents:
diff changeset
    56
		return current_; 
hgs
parents:
diff changeset
    57
	}
hgs
parents:
diff changeset
    58
	
hgs
parents:
diff changeset
    59
	template<class Encoding>
hgs
parents:
diff changeset
    60
	int StateMachineBreakIterator<Encoding>::next()
hgs
parents:
diff changeset
    61
	{
hgs
parents:
diff changeset
    62
		prepareNext();
hgs
parents:
diff changeset
    63
		current_ = next_; 
hgs
parents:
diff changeset
    64
		next_ = -1; 
hgs
parents:
diff changeset
    65
		return current_;
hgs
parents:
diff changeset
    66
	}
hgs
parents:
diff changeset
    67
	
hgs
parents:
diff changeset
    68
	template<class Encoding>
hgs
parents:
diff changeset
    69
	void StateMachineBreakIterator<Encoding>::prepareNext()
hgs
parents:
diff changeset
    70
	{
hgs
parents:
diff changeset
    71
		// Implements longest matching word algorithm. The used 
hgs
parents:
diff changeset
    72
		// state machine contains an entire dictionary. Each state
hgs
parents:
diff changeset
    73
		// transition interprets as incremental search in dictionary. 
hgs
parents:
diff changeset
    74
		// Each final (or terminal) state, marks location, where the 
hgs
parents:
diff changeset
    75
		// consumed states form a valid word. We try to find the 
hgs
parents:
diff changeset
    76
		// longest matching word. 
hgs
parents:
diff changeset
    77
		// 
hgs
parents:
diff changeset
    78
		
hgs
parents:
diff changeset
    79
		// Prepare next_ only, if new next_ hasn't been prepared before 
hgs
parents:
diff changeset
    80
		if (next_ == -1 && text_ && text_[cursor_]) {
hgs
parents:
diff changeset
    81
			// Reset state machine
hgs
parents:
diff changeset
    82
			machine_.rootState(state_);
hgs
parents:
diff changeset
    83
hgs
parents:
diff changeset
    84
			// lastBreak points to the end of last recognized word
hgs
parents:
diff changeset
    85
			int lastBreak = -1;  
hgs
parents:
diff changeset
    86
			// Continue until EOF
hgs
parents:
diff changeset
    87
			while (text_[cursor_]) {
hgs
parents:
diff changeset
    88
				// Feed next character to the state machine 
hgs
parents:
diff changeset
    89
				// and try to transit the state
hgs
parents:
diff changeset
    90
				if (!state_.next(text_[cursor_++])) {
hgs
parents:
diff changeset
    91
					// Check last final state
hgs
parents:
diff changeset
    92
					if (lastBreak != -1) { 
hgs
parents:
diff changeset
    93
						// Final state marked a valid word
hgs
parents:
diff changeset
    94
						// This is word boundary we were lookign
hgs
parents:
diff changeset
    95
						cursor_ = lastBreak; 
hgs
parents:
diff changeset
    96
					}
hgs
parents:
diff changeset
    97
					break;
hgs
parents:
diff changeset
    98
				} else if (state_.isFinal()) {
hgs
parents:
diff changeset
    99
					// Found a valid word! Mark the location 
hgs
parents:
diff changeset
   100
					lastBreak = cursor_; 
hgs
parents:
diff changeset
   101
					// Still, continue and try to find even a longer word
hgs
parents:
diff changeset
   102
				}
hgs
parents:
diff changeset
   103
			}
hgs
parents:
diff changeset
   104
			next_ = cursor_;
hgs
parents:
diff changeset
   105
		}
hgs
parents:
diff changeset
   106
	}
hgs
parents:
diff changeset
   107
hgs
parents:
diff changeset
   108
}