bintools/rcomp/src/UNICODE_COMPRESSOR.CPP
changeset 0 044383f39525
equal deleted inserted replaced
-1:000000000000 0:044383f39525
       
     1 /*
       
     2 * Copyright (c) 2001-2009 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of the License "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: 
       
    15 *
       
    16 */
       
    17 
       
    18                                
       
    19 #include "UNICODE_COMPRESSOR.H"
       
    20 
       
    21 void CompressUnicode(unsigned char* aOutputBuffer, int& aOutputLength, int aMaximumOutputLength, const UTF16* aInputBuffer, int aInputLength)
       
    22 	{
       
    23 	TUnicodeCompressor unicodeCompressor;
       
    24 	TMemoryUnicodeSource decompressedUnicode(aInputBuffer);
       
    25 	TInt numberOfInputElementsConsumed;
       
    26 	unicodeCompressor.CompressL(aOutputBuffer, decompressedUnicode, aMaximumOutputLength, aInputLength, &aOutputLength, &numberOfInputElementsConsumed);
       
    27 	TInt temp;
       
    28 	unicodeCompressor.FlushL(aOutputBuffer, aMaximumOutputLength, temp);
       
    29 	aOutputLength+=temp;
       
    30 	if (aOutputLength<aMaximumOutputLength && numberOfInputElementsConsumed!=aInputLength)
       
    31 		{
       
    32 		::Panic(1);
       
    33 		}
       
    34 	}
       
    35 
       
     36 // The rest of this file is a selective copy of base\store\ustrm\US_UCMP.CPP
       
    37 
       
    38 const TUint32 TUnicodeCompressionState::iStaticWindow[EStaticWindows] =
       
    39 	{
       
    40 	0x0000,		// tags
       
    41 	0x0080,		// Latin-1 supplement
       
    42 	0x0100,		// Latin Extended-A
       
    43 	0x0300,		// Combining Diacritics
       
    44 	0x2000,		// General Punctuation
       
    45 	0x2080,		// Currency Symbols
       
    46 	0x2100,		// Letterlike Symbols and Number Forms
       
    47 	0x3000		// CJK Symbols and Punctuation
       
    48 	};
       
    49 
       
    50 const TUint32 TUnicodeCompressionState::iDynamicWindowDefault[EDynamicWindows] =
       
    51 	{
       
    52 	0x0080,		// Latin-1 supplement
       
    53 	0x00C0,		// parts of Latin-1 supplement and Latin Extended-A
       
    54 	0x0400,		// Cyrillic
       
    55 	0x0600,		// Arabic
       
    56 	0x0900,		// Devanagari
       
    57 	0x3040,		// Hiragana
       
    58 	0x30A0,		// Katakana
       
    59 	0xFF00		// Fullwidth ASCII
       
    60 	};
       
    61 
       
    62 const TUint16 TUnicodeCompressionState::iSpecialBase[ESpecialBases] =
       
    63 	{
       
    64 	0x00C0,		// Latin 1 letters (not symbols) and some of Extended-A
       
    65 	0x0250,		// IPA extensions
       
    66 	0x0370,		// Greek
       
    67 	0x0530,		// Armenian
       
    68 	0x3040,		// Hiragana
       
    69 	0x30A0,		// Katakana
       
    70 	0xFF60		// Halfwidth katakana
       
    71 	};
       
    72 
       
    73 // Single-byte mode tag values
       
    74 const TUint8 SQ0 = 0x01;	// <byte>				quote from window 0
       
    75 const TUint8 SDX = 0x0B;	// <hbyte> <lbyte>		define window in expansion area
       
    76 const TUint8 SQU = 0x0E;	// <hbyte> <lbyte>		quote Unicode value
       
    77 const TUint8 SCU = 0x0F;	//						switch to Unicode mode
       
    78 const TUint8 SC0 = 0x10;	//						select dynamic window 0
       
    79 const TUint8 SD0 = 0x18;	// <byte>				set dynamic window 0 index to <byte> and select it
       
    80 
       
    81 // Unicode mode tag values
       
    82 const TUint8 UC0 = 0xE0;	//						select dynamic window 0 and switch to single-byte mode
       
    83 const TUint8 UD0 = 0xE8;	// <byte>				set dynamic window 0 index to <byte>, select it and switch to
       
    84 							//						single-byte mode
       
    85 const TUint8 UQU = 0xF0;	// <hbyte>, <lbyte>		quote Unicode value
       
    86 const TUint8 UDX = 0xF1;	// <hbyte>, <lbyte>		define window in expansion area and switch to single-byte mode
       
    87 	
       
    88 TUnicodeCompressionState::TUnicodeCompressionState():
       
    89 	iUnicodeWords(0),
       
    90 	iMaxUnicodeWords(0),
       
    91 	iCompressedBytes(0),
       
    92 	iMaxCompressedBytes(0)
       
    93 	{
       
    94 	Reset();
       
    95 	}
       
    96 
       
    97 void TUnicodeCompressionState::Reset()
       
    98 	{
       
    99 	iUnicodeMode = FALSE;
       
   100 	iActiveWindowBase = 0x0080;
       
   101 	for (int i = 0; i < EDynamicWindows; i++)
       
   102 		iDynamicWindow[i] = iDynamicWindowDefault[i];
       
   103 	}
       
   104 
       
   105 
       
   106 // Return the index of the static window that contains this code, if any, or -1 if there is none.
       
   107 TInt TUnicodeCompressionState::StaticWindowIndex(TUint16 aCode)
       
   108 	{
       
   109 	for (TInt i = 0; i < EStaticWindows; i++)
       
   110 		if (aCode >= iStaticWindow[i] && aCode < iStaticWindow[i] + 128)
       
   111 			return i;
       
   112 	return -1;
       
   113 	}
       
   114 
       
   115 /*
       
   116 If aCode can be accommodated in one of the legal dynamic windows, return the index of that window
       
   117 in the offset table. If not return KErrNotFound.
       
   118 */
       
   119 TInt TUnicodeCompressionState::DynamicWindowOffsetIndex(TUint16 aCode)
       
   120 	{
       
   121 	if (aCode < 0x0080)
       
   122 		return KErrNotFound;
       
   123 	if (aCode >= 0x3400 && aCode <= 0xDFFF)
       
   124 		return KErrNotFound;
       
   125 
       
   126 	/*
       
   127 	Prefer sections that cross half-block boundaries. These are better adapted to actual text.
       
   128 	They are represented by offset indices 0xf9..0xff.
       
   129 	*/
       
   130 	for (int i = 0; i < ESpecialBases; i++)
       
   131 		if (aCode >= iSpecialBase[i] && aCode < iSpecialBase[i] + 128)
       
   132 			return 0xF9 + i;
       
   133 
       
   134 	/*
       
   135 	Offset indices 0x01..0x67 represent half blocks from 0x0080 to 0x3380 and
       
   136 	0x68..0xA7 represent half blocks from 0xE000 to 0xFF80.
       
   137 	*/
       
   138 	if (aCode >= 0xE000)
       
   139 		aCode -= 0xAC00;
       
   140 	return aCode / 0x80;
       
   141 	}
       
   142 
       
   143 // Return the base of the window represented by offset index <n>. Return 0 if the offset index is illegal.
       
   144 TUint32 TUnicodeCompressionState::DynamicWindowBase(TInt aOffsetIndex)
       
   145 	{
       
   146 	if (aOffsetIndex >= 0xF9 && aOffsetIndex <= 0xFF)
       
   147 		{
       
   148 		/*
       
   149 		WARNING: don't optimise the following two lines by replacing them with
       
   150 		'return iSpecialBase[aOffsetIndex - 0xF9];'. To do so would re-introduce a defect
       
   151 		in ARM builds caused by optimisation and consequent erroneous fixing up
       
   152 		of the array base: see defect EDNGASR-4AGJQX in ER5U defects.
       
   153 		*/
       
   154 		int special_base_index = aOffsetIndex - 0xF9;
       
   155 		return iSpecialBase[special_base_index];
       
   156 		}
       
   157 	if (aOffsetIndex >= 0x01 && aOffsetIndex <= 0x67)
       
   158 		return aOffsetIndex * 0x80;
       
   159 	if (aOffsetIndex >= 0x68 && aOffsetIndex <= 0xA7)
       
   160 		return aOffsetIndex * 0x80 + 0xAC00;
       
   161 	return 0;
       
   162 	}
       
   163 
       
   164 TBool TUnicodeCompressionState::EncodeAsIs(TUint16 aCode)
       
   165 	{
       
   166 	return aCode == 0x0000 || aCode == 0x0009 || aCode == 0x000A || aCode == 0x000D ||
       
   167 		   (aCode >= 0x0020 && aCode <= 0x007F);
       
   168 	}
       
   169 
       
   170 void TUnicodeCompressionState::Panic(TPanic aPanic)
       
   171 	{
       
   172 	::Panic(100+aPanic);
       
   173 	}
       
   174 
       
   175 EXPORT_C TUnicodeCompressor::TUnicodeCompressor():
       
   176 	iInputBufferStart(0),
       
   177 	iInputBufferSize(0),
       
   178 	iOutputBufferStart(0),
       
   179 	iOutputBufferSize(0),
       
   180 	iDynamicWindowIndex(0),
       
   181 	iOutputStream(NULL),
       
   182 	iOutputPointer(NULL),
       
   183 	iInput(NULL)
       
   184 	{
       
   185 	}
       
   186 
       
   187 EXPORT_C void TUnicodeCompressor::CompressL(TUint8* aOutput,MUnicodeSource& aInput,
       
   188 											TInt aMaxOutputBytes,TInt aMaxInputWords,
       
   189 											TInt* aOutputBytes,TInt* aInputWords)
       
   190 	{
       
   191 	DoCompressL(NULL,aOutput,&aInput,aMaxOutputBytes,aMaxInputWords,aOutputBytes,aInputWords);
       
   192 	}
       
   193 
       
   194 EXPORT_C TInt TUnicodeCompressor::FlushL(TUint8* aOutput,TInt aMaxOutputBytes,TInt& aOutputBytes)
       
   195 	{
       
   196 	DoCompressL(NULL,aOutput,NULL,aMaxOutputBytes,0,&aOutputBytes,NULL);
       
   197 	return iOutputBufferSize;
       
   198 	}
       
   199 
       
   200 EXPORT_C TInt TUnicodeCompressor::CompressedSizeL(MUnicodeSource& aInput,TInt aInputWords)
       
   201 	{
       
   202 	TInt bytes;
       
   203 	TUnicodeCompressor c;
       
   204 	c.DoCompressL(NULL,NULL,&aInput,KMaxTInt,aInputWords,&bytes,NULL);
       
   205 	return bytes;
       
   206 	}
       
   207 
       
   208 // Compress until input or output is exhausted or an exception occurs.
       
   209 void TUnicodeCompressor::DoCompressL(RWriteStream* aOutputStream,TUint8* aOutputPointer,MUnicodeSource* aInput,
       
   210 									 TInt aMaxOutputBytes,TInt aMaxInputWords,
       
   211 									 TInt* aOutputBytes,TInt* aInputWords)
       
   212 	{
       
   213 	iOutputStream = aOutputStream;
       
   214 	iOutputPointer = aOutputPointer;
       
   215 	iInput = aInput;
       
   216 	iMaxCompressedBytes = aMaxOutputBytes;
       
   217 	iMaxUnicodeWords = aMaxInputWords;
       
   218 	iCompressedBytes = iUnicodeWords = 0;
       
   219 	FlushOutputBufferL();
       
   220 	if (iInput)
       
   221 		{
       
   222 		while (iUnicodeWords < iMaxUnicodeWords && iCompressedBytes < iMaxCompressedBytes)
       
   223 			{
       
   224 			TUint16 x = iInput->ReadUnicodeValueL();
       
   225 			TAction action(x);
       
   226 			iInputBuffer[(iInputBufferStart + iInputBufferSize) % EMaxInputBufferSize] = action;
       
   227 			iInputBufferSize++;
       
   228 			iUnicodeWords++;
       
   229 			if (iInputBufferSize == EMaxInputBufferSize)
       
   230 				WriteRunL();
       
   231 			}
       
   232 		}
       
   233 	FlushInputBufferL();
       
   234 	if (aOutputBytes)
       
   235 		*aOutputBytes = iCompressedBytes;
       
   236 	if (aInputWords)
       
   237 		*aInputWords = iUnicodeWords;
       
   238 	}
       
   239 
       
   240 TUnicodeCompressor::TAction::TAction(TUint16 aCode):
       
   241 	iCode(aCode)
       
   242 	{
       
   243 	if (TUnicodeCompressionState::EncodeAsIs(aCode))
       
   244 		iTreatment = EPlainASCII;
       
   245 	else
       
   246 		{
       
   247 		iTreatment = TUnicodeCompressionState::DynamicWindowOffsetIndex(aCode);
       
   248 		if (iTreatment == -1)
       
   249 			{
       
   250 			iTreatment = TUnicodeCompressionState::StaticWindowIndex(aCode);
       
   251 			if (iTreatment == -1)
       
   252 				iTreatment = EPlainUnicode;
       
   253 			else
       
   254 				iTreatment += EFirstStatic;
       
   255 			}
       
   256 		}
       
   257 	}
       
   258 
       
   259 void TUnicodeCompressor::WriteCharacterFromBuffer()
       
   260 	{
       
   261 	const TAction& action = iInputBuffer[iInputBufferStart];
       
   262 	iInputBufferSize--;
       
   263 	iInputBufferStart = (iInputBufferStart + 1) % EMaxInputBufferSize;
       
   264 	WriteCharacter(action);
       
   265 	}
       
   266 
       
   267 void TUnicodeCompressor::FlushInputBufferL()
       
   268 	{
       
   269 	while (iInputBufferSize > 0 && iCompressedBytes < iMaxCompressedBytes)
       
   270 		WriteRunL();
       
   271 	}
       
   272 
       
   273 void TUnicodeCompressor::WriteRunL()
       
   274 	{
       
   275 	// Write out any leading characters that can be passed through.
       
   276 	if (!iUnicodeMode)
       
   277 		while (iInputBufferSize > 0)
       
   278 			{
       
   279 			const TAction& action = iInputBuffer[iInputBufferStart];
       
   280 			if (action.iTreatment == TAction::EPlainASCII ||
       
   281 				(action.iCode >= iActiveWindowBase && action.iCode < iActiveWindowBase + 128))
       
   282 				WriteCharacterFromBuffer();
       
   283 			else
       
   284 				break;
       
   285 			}
       
   286 
       
   287 	// Write a run of characters that cannot be passed through.
       
   288 	int i;
       
   289 	if (iInputBufferSize > 0)
       
   290 		{
       
   291 		/*
       
   292 		Find a run of characters with the same treatment and select that treatment
       
   293 		if the run has more than one character.
       
   294 		*/
       
   295 		int treatment = iInputBuffer[iInputBufferStart].iTreatment;
       
   296 		int next_treatment = treatment;
       
   297 		int run_size = 1;
       
   298 		for (i = 1; i < iInputBufferSize; i++)
       
   299 			{
       
   300 			int index = (iInputBufferStart + i) % EMaxInputBufferSize;
       
   301 			next_treatment = iInputBuffer[index].iTreatment;
       
   302 			if (next_treatment != treatment)
       
   303 				break;
       
   304 			run_size++;
       
   305 			}
       
   306 		if (run_size > 1)
       
   307 			SelectTreatment(treatment);
       
   308 		for (i = 0; i < run_size; i++)
       
   309 			WriteCharacterFromBuffer();
       
   310 		}
       
   311 
       
   312 	FlushOutputBufferL();
       
   313 	}
       
   314 
       
   315 void TUnicodeCompressor::FlushOutputBufferL()
       
   316 	{
       
   317 	while (iOutputBufferSize > 0 &&	iCompressedBytes < iMaxCompressedBytes)
       
   318 		{
       
   319 		TUint8 byte = iOutputBuffer[iOutputBufferStart];
       
   320 		if (iOutputPointer)
       
   321 			*iOutputPointer++ = byte;
       
   322 		else if (iOutputStream)
       
   323 			Panic(ECannotUseStreams);
       
   324 		iCompressedBytes++;
       
   325 		iOutputBufferSize--;
       
   326 		iOutputBufferStart = (iOutputBufferStart + 1) % EMaxOutputBufferSize;
       
   327 		}
       
   328 	}
       
   329 
       
   330 void TUnicodeCompressor::SelectTreatment(TInt aTreatment)
       
   331 	{
       
   332 	if (aTreatment == TAction::EPlainUnicode)
       
   333 		{
       
   334 		// Switch to Unicode mode if not there already.
       
   335 		if (!iUnicodeMode)
       
   336 			{
       
   337 			WriteByte(SCU);
       
   338 			iUnicodeMode = TRUE;
       
   339 			}
       
   340 		return;
       
   341 		}
       
   342 
       
   343 	if (aTreatment == TAction::EPlainASCII)
       
   344 		{
       
   345 		// Switch to single-byte mode, using the current dynamic window, if not there already.
       
   346 		if (iUnicodeMode)
       
   347 			{
       
   348 			WriteByte(UC0 + iDynamicWindowIndex);
       
   349 			iUnicodeMode = FALSE;
       
   350 			}
       
   351 		return;
       
   352 		}
       
   353 
       
   354 	if (aTreatment >= TAction::EFirstDynamic && aTreatment <= TAction::ELastDynamic)
       
   355 		{
       
   356 		TUint32 base = DynamicWindowBase(aTreatment);
       
   357 
       
   358 		// Switch to the appropriate dynamic window if it is available; if not, redefine and select dynamic window 4.
       
   359 		for (int i = 0; i < EDynamicWindows; i++)
       
   360 			if (base == iDynamicWindow[i])
       
   361 				{
       
   362 				if (iUnicodeMode)
       
   363 					WriteByte(UC0 + i);
       
   364 				else if (i != iDynamicWindowIndex)
       
   365 					WriteByte(SC0 + i);
       
   366 				iUnicodeMode = FALSE;
       
   367 				iDynamicWindowIndex = i;
       
   368 				iActiveWindowBase = base;
       
   369 				return;
       
   370 				}
       
   371 		if (iUnicodeMode)
       
   372 			WriteByte(UD0 + 4);
       
   373 		else
       
   374 			WriteByte(SD0 + 4);
       
   375 		iDynamicWindowIndex = 4;
       
   376 		iUnicodeMode = FALSE;
       
   377 		WriteByte(aTreatment);
       
   378 		iDynamicWindow[4] = base;
       
   379 		iActiveWindowBase = base;
       
   380 		return;
       
   381 		}
       
   382 	}
       
   383 
       
   384 // Write a character without changing mode or window.
       
   385 void TUnicodeCompressor::WriteCharacter(const TAction& aAction)
       
   386 	{
       
   387 	if (iUnicodeMode)
       
   388 		WriteUCharacter(aAction.iCode);
       
   389 	else
       
   390 		WriteSCharacter(aAction);
       
   391 	}
       
   392 
       
   393 void TUnicodeCompressor::WriteUCharacter(TUint16 aCode)
       
   394 	{
       
   395 	// Emit the 'quote Unicode' tag if the character would conflict with a tag.
       
   396 	if (aCode >= 0xE000 && aCode <= 0xF2FF)
       
   397 		WriteByte(UQU);
       
   398 
       
   399 	// Write the Unicode value big-end first.
       
   400 	WriteByte((aCode >> 8) & 0xFF);
       
   401 	WriteByte(aCode & 0xFF);
       
   402 	}
       
   403 
       
   404 void TUnicodeCompressor::WriteByte(TUint aByte)
       
   405 	{
       
   406 	if (iOutputBufferSize >= EMaxOutputBufferSize)
       
   407 		Panic(EOutputBufferOverflow);
       
   408 	iOutputBuffer[(iOutputBufferStart + iOutputBufferSize) % EMaxOutputBufferSize] = (TUint8)aByte;
       
   409 	iOutputBufferSize++;
       
   410 	}
       
   411 
       
   412 void TUnicodeCompressor::WriteSCharacter(const TAction& aAction)
       
   413 	{
       
   414 	// Characters in the range 0x0020..0x007F, plus nul, tab, cr, and lf, can be emitted as their low bytes.
       
   415 	if (aAction.iTreatment == TAction::EPlainASCII)
       
   416 		{
       
   417 		WriteByte(aAction.iCode);
       
   418 		return;
       
   419 		}
       
   420 
       
   421 	// Characters in a static window can be written using SQ<n> plus a byte in the range 0x00-0x7F
       
   422 	if (aAction.iTreatment >= TAction::EFirstStatic && aAction.iTreatment <= TAction::ELastStatic)
       
   423 		{
       
   424 		int window = aAction.iTreatment - TAction::EFirstStatic;
       
   425 		WriteByte(SQ0 + window);
       
   426 		WriteByte(aAction.iCode);
       
   427 		return;
       
   428 		}
       
   429 
       
   430 	// Characters in the current dynamic window can be written as a byte in the range 0x80-0xFF.
       
   431 	if (aAction.iCode >= iActiveWindowBase && aAction.iCode < iActiveWindowBase + 128)
       
   432 		{
       
   433 		WriteByte(aAction.iCode - iActiveWindowBase + 0x80);
       
   434 		return;
       
   435 		}
       
   436 
       
   437 	// Characters in another dynamic window can be written using SQ<n> plus a byte in the range 0x80-0xFF
       
   438 	int i;
       
   439 	for (i = 0; i < EDynamicWindows; i++)
       
   440 		if (aAction.iCode >= iDynamicWindow[i] && aAction.iCode < iDynamicWindow[i] + 128)
       
   441 			{
       
   442 			WriteByte(SQ0 + i);
       
   443 			WriteByte(aAction.iCode - iDynamicWindow[i] + 0x80);
       
   444 			return;
       
   445 			}
       
   446 
       
   447 	// Other characters can be quoted.
       
   448 	WriteByte(SQU);
       
   449 	WriteByte((aAction.iCode >> 8) & 0xFF);
       
   450 	WriteByte(aAction.iCode & 0xFF);
       
   451 	return;
       
   452 	}
       
   453