genericopenlibs/openenvcore/libc/src/charcnv.cpp
author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
Mon, 04 Oct 2010 02:56:42 +0300
changeset 68 ff3fc7722556
parent 0 e4d67989cc36
permissions -rw-r--r--
Revision: 201039 Kit: 201039

// Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies).
// All rights reserved.
// This component and the accompanying materials are made available
// under the terms of "Eclipse Public License v1.0"
// which accompanies this distribution, and is available
// at the URL "http://www.eclipse.org/legal/epl-v10.html".
//
// Initial Contributors:
// Nokia Corporation - initial contribution.
//
// Contributors:
//
// Description:
// Name        : MRT_WCHARCNVT.CPP
// Part of     : MRT LIBC
// Contains the source for the helper functions used by the wchar
// restartable conversion APIs in libc
// Version     : 1.0
//



// Copyright (c) 1997-2003 Symbian Ltd.  All rights reserved.

// system includes
#include <e32std.h>
#include <e32base.h>
#include <utf.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <wchar.h>

#include "wcharcnv.h"

// Bias that turns (lead << 10) + trail into the code point of a UTF-16
// surrogate pair. Parenthesised so the macro expands to a single value
// wherever it is used, not only as a trailing additive term.
#define  KSURROGATE_OFFSET (0x10000 - (0xD800 << 10) - 0xDC00)

//-----------------------------------------------------------------------------
//Function Name : TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const 
//                           TDesC8& aUtf8, mbstate_t *state)
//Description   : Decodes UTF-8 bytes from aUtf8 into UTF-16 code units stored
//                in aUnicode (code points above 0xFFFF become a surrogate
//                pair). Bytes of an incomplete character left in *state by a
//                previous call are decoded in front of aUtf8. A byte that
//                cannot start a sequence (a stray continuation byte or
//                0xF8..0xFF) is replaced by U+FFFD.
//Parameters    : aUnicode - output descriptor; its length is reset on entry
//                           and set to the number of code units produced.
//                aUtf8    - input bytes.
//                state    - conversion state; receives the bytes of a
//                           trailing incomplete character.
//Return Value  : The number of bytes of aUtf8 that were consumed.
//                0 if aUtf8 is empty; aUtf8.Length() if aUnicode has no
//                capacity.
//                -2 if the input ends inside a multi-byte character and
//                nothing was decoded (the bytes are kept in *state).
//                -1 with errno set to EILSEQ if a continuation byte is
//                malformed.
//-----------------------------------------------------------------------------
TInt ConvertToUnicodeFromUtf8(TDes16& aUnicode, const TDesC8& aUtf8, mbstate_t *state)
{
	aUnicode.SetLength(0);
	if (aUtf8.Length()==0)
	{
		return 0;
	}
	if (aUnicode.MaxLength()==0)
	{
		return aUtf8.Length();
	}

	// Number of bytes of an incomplete character carried over in *state.
	const TInt carriedBytes = (state->__count > 0) ? state->__count : 0;

	HBufC8* utf8 = NULL;
	if (carriedBytes > 0)
	{
		// Decode from a temporary buffer holding the carried bytes followed
		// by the new input.
		// NOTE(review): NewLC leaves on allocation failure although this
		// function is not named as a leaving function - confirm callers.
		utf8 = HBufC8::NewLC(carriedBytes + aUtf8.Length());
		TPtr8 tempBuf = utf8->Des();
		// The three-argument form is required: TPtr8(ptr, maxLength) creates
		// a descriptor of length zero, so Copy() would copy nothing.
		TPtr8 temp((TUint8*)state->__value.__wchb, carriedBytes, carriedBytes);
		tempBuf.Copy(temp);
		tempBuf.Append(aUtf8);
	}

	// Start and size of the buffer that is actually decoded.
	const TUint8* const inputStart = utf8 ? utf8->Des().Ptr() : aUtf8.Ptr();
	const TInt inputLength = utf8 ? utf8->Des().Length() : aUtf8.Length();

	TUint16* pointerToCurrentUnicodeCharacter=CONST_CAST(TUint16*, aUnicode.Ptr());
	const TUint16* pointerToLastUnicodeCharacter=pointerToCurrentUnicodeCharacter+(aUnicode.MaxLength()-1);
	const TUint8* pointerToCurrentUtf8Byte=inputStart;
	const TUint8* pointerToPendingUtf8Byte=inputStart;
	const TUint8* pointerToLastUtf8Byte=inputStart+(inputLength-1);
	const TUint16 replacementcharacter = 0xFFFD;
	TUint8 currentUtf8Byte;
	TUint currentUnicodeCharacter;
	TInt sequenceLength;

	FOREVER
	{
		currentUtf8Byte=*pointerToCurrentUtf8Byte;
		// remember where this character starts in case it has to be re-read
		pointerToPendingUtf8Byte = pointerToCurrentUtf8Byte;
		sequenceLength=100;

		// Classify the lead byte by its high bits:
		//   i=0 11110xxx -> 4,  i=1 1110xxxx -> 3,  i=2 110xxxxx -> 2,
		//   i=3 10xxxxxx -> 1 (stray continuation byte),
		//   i=4 0xxxxxxx -> 0 (single byte, ASCII),
		//   0xF8..0xFF match at i=5 and give -1.
		for(TInt i=0;i<7;i++)
		{
			if ((currentUtf8Byte&(0xf8<<i))==(static_cast<TUint8>(0xF0<<i)))
			{
				sequenceLength = 4-i;
				break;
			}
		}

		if ((sequenceLength<2 || sequenceLength>6) && sequenceLength!=0)
		{
			// not a valid lead byte
			currentUnicodeCharacter=replacementcharacter;
		}
		else
		{
			if ((pointerToLastUtf8Byte-pointerToCurrentUtf8Byte+1)<sequenceLength)
			{
				// Not enough bytes left to complete the multi-byte character:
				// store what is there in the state.
				state->__count = 0;
				while (pointerToCurrentUtf8Byte <= pointerToLastUtf8Byte)
				{
					state->__value.__wchb[state->__count++] = *(pointerToCurrentUtf8Byte++);
				}
				// step back so the stored bytes are not reported as consumed
				// NOTE(review): the bytes are both kept in *state and left
				// uncounted here - confirm how callers resume after this.
				pointerToCurrentUtf8Byte -= state->__count;
				if((pointerToCurrentUnicodeCharacter-aUnicode.Ptr())==0)
				{
					// nothing has been decoded at all
					if ( utf8 )
					{
						CleanupStack::PopAndDestroy(); // utf8
					}
					return -2;
				}
				// something was decoded already; report the bytes used for it
				break;
			}

			// a complete character is available: reset the state
			state->__count = 0;
			// payload bits of the lead byte
			currentUnicodeCharacter = currentUtf8Byte&(0x7F>>sequenceLength);

			for(TInt i=sequenceLength;i>1; i--)
			{
				currentUtf8Byte = *(++pointerToCurrentUtf8Byte);
				if ((currentUtf8Byte&0xc0)==0x80)
				{
					currentUnicodeCharacter = (currentUnicodeCharacter<<6)|(currentUtf8Byte&0x3F);
				}
				else
				{
					// Encoding error: not a continuation byte.
					if ( utf8 )
					{
						CleanupStack::PopAndDestroy(); // utf8
					}
					errno = EILSEQ;
					return -1;
				}
			}
		}

		if (currentUnicodeCharacter > 0xFFFF)
		{
			if(pointerToCurrentUnicodeCharacter>=pointerToLastUnicodeCharacter)
			{
				// no room for the two code units of a surrogate pair;
				// leave this character unconsumed
				pointerToCurrentUtf8Byte=pointerToPendingUtf8Byte;
				break;
			}

			// high surrogate
			TUint surrogate = (currentUnicodeCharacter>>10) + 0xD7C0;
			*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);
			++pointerToCurrentUnicodeCharacter;

			// low surrogate
			surrogate = (currentUnicodeCharacter&0x3FF)+0xDC00;
			*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(surrogate);
			++pointerToCurrentUnicodeCharacter;
			++pointerToCurrentUtf8Byte;
		}
		else
		{
			*pointerToCurrentUnicodeCharacter=static_cast<TUint16>(currentUnicodeCharacter);
			++pointerToCurrentUnicodeCharacter;
			++pointerToCurrentUtf8Byte;
		}

		if ((pointerToCurrentUtf8Byte>pointerToLastUtf8Byte) || (pointerToCurrentUnicodeCharacter>pointerToLastUnicodeCharacter))
		{
			// either the input or the output descriptor is exhausted
			break;
		}
	} // forever

	// decoding finished
	aUnicode.SetLength(pointerToCurrentUnicodeCharacter-aUnicode.Ptr());

	// Bytes consumed from aUtf8: measured from the start of the buffer that
	// was decoded (which may be the temporary one, so it must still be alive
	// here), minus the bytes that came from the state.
	TInt consumed = (pointerToCurrentUtf8Byte - inputStart) - carriedBytes;
	if (consumed < 0)
	{
		if (pointerToCurrentUtf8Byte == inputStart)
		{
			// The first character did not fit into aUnicode, so nothing was
			// used; keep the carried bytes pending for the next call.
			state->__count = carriedBytes;
		}
		consumed = 0;
	}

	if ( utf8 )
	{
		CleanupStack::PopAndDestroy(); // utf8
	}
	return consumed;
} //end of function

//-----------------------------------------------------------------------------
//Function Name : TInt _Utf16ToUtf8(char* dst, wchar_t aSrc, mbstate_t* ps, int aLen )
//Description   : Converts one UTF-16 code unit to UTF-8. A high surrogate is
//                only remembered in *ps; the four UTF-8 bytes are written when
//                the matching low surrogate arrives in the next call.
//Parameters    : dst  - output buffer for the UTF-8 bytes.
//                aSrc - the code unit to convert.
//                ps   - conversion state (initial, or waiting for a low
//                       surrogate).
//                aLen - number of bytes available at dst.
//Return Value  : The number of bytes written to dst (1 to 4), or 0 when a high
//                surrogate was stored in *ps and nothing was written.
//                -1 with errno = EINVAL if *ps holds an unknown state.
//                -1 with errno = EILSEQ if aSrc is not legal in the current
//                state.
//                -2 if aLen is too small for the encoded character.
//-----------------------------------------------------------------------------
TInt _Utf16ToUtf8(char* dst, wchar_t aSrc, mbstate_t* ps, int aLen)
{
	const bool awaitingLowSurrogate = (ps->__count == _EUTF16_21BitExtensionState);

	// only the two states produced by this function are accepted
	if (!awaitingLowSurrogate && ps->__count != _EUTF16InitialState)
	{
		errno = EINVAL;
		return -1;
	}

	// these values are illegal in either state
	// see http://www.unicode.org/faq/utf_bom.html#40
	const bool illegalValue = (aSrc == 0xFFFE) || (aSrc == 0xFFFF)
							|| (aSrc >= 0xFDD0 && aSrc <= 0xFDEF);
	if (illegalValue)
	{
		errno = EILSEQ;
		return -1;
	}

	if (awaitingLowSurrogate)
	{
		// a stored high surrogate may only be followed by a low surrogate
		if (aSrc < 0xDC00 || aSrc > 0xDFFF)
		{
			errno = EILSEQ;
			return -1;
		}
		if (aLen < 4)
		{
			// the state is left as it is
			return -2;
		}

		// combine the pair into a code point, see
		// http://www.unicode.org/faq/utf_bom.html#39
		unsigned long  codepoint = (ps->__value.lead << 10) + aSrc + KSURROGATE_OFFSET;

		dst[0] = static_cast<TUint8>(0xf0|(codepoint>>18));
		dst[1] = static_cast<TUint8>(0x80|((codepoint>>12)&0x3f));
		dst[2] = static_cast<TUint8>(0x80|((codepoint>>6)&0x3f));
		dst[3] = static_cast<TUint8>(0x80|(codepoint&0x3f));

		ps->__count = _EUTF16InitialState;
		return 4;
	}

	// ---- initial state ----

	// a low surrogate without a preceding high surrogate is illegal
	if (aSrc >= 0xDC00 && aSrc <= 0xDFFF)
	{
		errno = EILSEQ;
		return -1;
	}

	// up to 7 bits: one byte
	if ((aSrc & 0xff80) == 0x0000)
	{
		if (aLen < 1)
		{
			return -2;
		}
		dst[0] = static_cast<TUint8>(aSrc);
		return 1;
	}

	// up to 11 bits: two bytes
	if ((aSrc & 0xf800) == 0x0000)
	{
		if (aLen < 2)
		{
			return -2;
		}
		dst[0] = static_cast<TUint8>(0xc0|(aSrc>>6));
		dst[1] = static_cast<TUint8>(0x80|(aSrc&0x3f));
		return 2;
	}

	// high surrogate: remember it, nothing is written out yet
	if ((aSrc & 0xfc00) == 0xd800)
	{
		ps->__value.lead = aSrc;
		ps->__count = _EUTF16_21BitExtensionState;
		return 0;
	}

	// everything else: three bytes
	if (aLen < 3)
	{
		return -2;
	}
	dst[0] = static_cast<TUint8>(0xe0|(aSrc>>12));
	dst[1] = static_cast<TUint8>(0x80|((aSrc>>6)&0x3f));
	dst[2] = static_cast<TUint8>(0x80|(aSrc&0x3f));
	return 3;
}//end of function