libraries/ltkutils/tsrc/tutf8.cpp
changeset 0 7f656887cf89
equal deleted inserted replaced
-1:000000000000 0:7f656887cf89
       
     1 // tutf8.cpp
       
     2 // 
       
     3 // Copyright (c) 2010 Accenture. All rights reserved.
       
     4 // This component and the accompanying materials are made available
       
     5 // under the terms of the "Eclipse Public License v1.0"
       
     6 // which accompanies this distribution, and is available
       
     7 // at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 // 
       
     9 // Initial Contributors:
       
    10 // Accenture - Initial contribution
       
    11 //
       
    12 
       
    13 #include <fshell/ioutils.h>
       
    14 #include <fshell/common.mmh>
       
    15 #include <fshell/descriptorutils.h>
       
    16 
       
    17 using namespace IoUtils;
       
    18 using namespace LtkUtils;
       
    19 
       
    20 class CCmdtutf8 : public CCommandBase
       
    21 	{
       
    22 public:
       
    23 	static CCommandBase* NewLC();
       
    24 	~CCmdtutf8();
       
    25 private:
       
    26 	CCmdtutf8();
       
    27 private: // From CCommandBase.
       
    28 	virtual const TDesC& Name() const;
       
    29 	virtual const TDesC& Description() const;
       
    30 	virtual void DoRunL();
       
    31 	virtual void ArgumentsL(RCommandArgumentList& aArguments);
       
    32 	virtual void OptionsL(RCommandOptionList& aOptions);
       
    33 private:
       
    34 	TFileName2 iFile;
       
    35 	TInt iBlockSize;
       
    36 	};
       
    37 
       
    38 EXE_BOILER_PLATE(CCmdtutf8)
       
    39 
       
    40 CCommandBase* CCmdtutf8::NewLC()
       
    41 	{
       
    42 	CCmdtutf8* self = new(ELeave) CCmdtutf8();
       
    43 	CleanupStack::PushL(self);
       
    44 	self->BaseConstructL();
       
    45 	return self;
       
    46 	}
       
    47 
       
    48 CCmdtutf8::~CCmdtutf8()
       
    49 	{
       
    50 	}
       
    51 
       
    52 CCmdtutf8::CCmdtutf8()
       
    53 	{
       
    54 	}
       
    55 
       
    56 const TDesC& CCmdtutf8::Name() const
       
    57 	{
       
    58 	_LIT(KName, "tutf8");	
       
    59 	return KName;
       
    60 	}
       
    61 
       
    62 const TDesC& CCmdtutf8::Description() const
       
    63 	{
       
    64 	_LIT(KDescription, "Test for RLtkBuf8::CopyAsUtf8L() and RLtkBuf16::AppendUtf8L().");
       
    65 	return KDescription;
       
    66 	}
       
    67 
       
    68 void CCmdtutf8::ArgumentsL(RCommandArgumentList& aArguments)
       
    69 	{
       
    70 	aArguments.AppendFileNameL(iFile, _L("filename"), _L("If specified, parse this file as UTF-8 and print the results"), KValueTypeFlagOptional);
       
    71 	}
       
    72 
       
    73 void CCmdtutf8::OptionsL(RCommandOptionList& aOptions)
       
    74 	{
       
    75 	aOptions.AppendIntL(iBlockSize, 'b', _L("blocksize"), _L("block size for parsing the specified file"));
       
    76 	}
       
    77 
       
    78 void CCmdtutf8::DoRunL()
       
    79 	{
       
    80 	if (iFile.Length() == 0)
       
    81 		{
       
    82 		CleanupStack::PushL((CBase*)1); // Panicker
       
    83 		_LIT(KTest, "A \u03A9 \u8A9E \uFFFD \uFEFF \uD800 "); // The original UTF-16 string: A LowercaseOmega SomeGlyphOrOther ReplacementChar ZWNBSP UnmatchedLeadingSurrogate
       
    84 		_LIT8(KOut, "A \xCE\xA9 \xE8\xAA\x9E \xEF\xBF\xBD \xEF\xBB\xBF \xEF\xBF\xBD "); // What it should be in UTF-8
       
    85 		_LIT(KOutInUnicode, "A \u03A9 \u8A9E \uFFFD \uFEFF \uFFFD "); // Almost the same as the original, except that the UnmatchedSurrogate was transformed into ReplacementChar in UTF-8 so the last char here is U+FFFD
       
    86 		RLtkBuf8 buf;
       
    87 		buf.CopyAsUtf8L(KTest);
       
    88 		ASSERT(buf == KOut());
       
    89 		buf.Close();
       
    90 
       
    91 		RLtkBuf16 wbuf;
       
    92 		wbuf.AppendUtf8L(KOut().Left(6));
       
    93 		ASSERT(wbuf.Length() == 4); // Testing that only the 4 complete characters are in there
       
    94 		wbuf.AppendUtf8L(KNullDesC8());
       
    95 		ASSERT(wbuf.Length() == 4); // Testing that appending a null descriptor hasn't changed the length (or crashed)
       
    96 		TInt firstprob;
       
    97 		wbuf.FinalizeUtf8(firstprob);
       
    98 		ASSERT(firstprob == 5); // Correctly indentified the first invalid bit
       
    99 		_LIT(KFirstFrag, "A \u03A9 \uFFFD");
       
   100 		ASSERT(wbuf == KFirstFrag());
       
   101 		wbuf.SetLength(4);
       
   102 		wbuf.AppendUtf8L(KOut().Mid(5,1));
       
   103 		wbuf.ReAllocL(256); // Be really evil and realloc the buffer while we have fragmented bytes cached
       
   104 		ASSERT(wbuf == KOutInUnicode().Left(4));
       
   105 		wbuf.AppendUtf8L(KOut().Mid(6,1));
       
   106 		ASSERT(wbuf == KOutInUnicode().Left(4));
       
   107 		wbuf.AppendUtf8L(KOut().Mid(7,3));
       
   108 		ASSERT(wbuf == KOutInUnicode().Left(6));
       
   109 		wbuf.AppendUtf8L(KOut().Mid(10));
       
   110 		ASSERT(wbuf == KOutInUnicode());
       
   111 		wbuf.FinalizeUtf8(firstprob);
       
   112 		ASSERT(firstprob == KErrNotFound);
       
   113 		wbuf.Close();
       
   114 
       
   115 		_LIT8(KBomTest, "\xEF\xBB\xBF BB \xEF\xBB\xBF");
       
   116 		_LIT(KBomOutput, " BB \uFEFF");
       
   117 		wbuf.AppendUtf8L(KBomTest);
       
   118 		wbuf.FinalizeUtf8(firstprob);
       
   119 		ASSERT(wbuf == KBomOutput());
       
   120 		ASSERT(firstprob == KErrNotFound);
       
   121 		wbuf.Close();
       
   122 
       
   123 		wbuf.AppendUtf8L(KBomTest().Left(2));
       
   124 		wbuf.AppendUtf8L(KBomTest().Mid(2, 5));
       
   125 		wbuf.AppendUtf8L(KBomTest().Mid(7));
       
   126 		wbuf.FinalizeUtf8(firstprob);
       
   127 		ASSERT(wbuf == KBomOutput());
       
   128 		ASSERT(firstprob == KErrNotFound);
       
   129 		wbuf.Close();
       
   130 
       
   131 		// Maximal subexpression replacement test - Example taken from unicode standard section 3.9, table 3-8.
       
   132 		_LIT8(KInvalid, "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64");
       
   133 		_LIT(KInvalidOutput, "\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064"); // And this is how the standard recommends it is processed
       
   134 		wbuf.AppendUtf8L(KInvalid);
       
   135 		wbuf.FinalizeUtf8(firstprob);
       
   136 		ASSERT(wbuf == KInvalidOutput());
       
   137 		ASSERT(firstprob == 1);
       
   138 		wbuf.Close();
       
   139 
       
   140 		// Check that the first bad byte calculations are right
       
   141 		wbuf.AppendUtf8L(_L8(" \x61\xF1"), firstprob);
       
   142 		ASSERT(firstprob == KErrNotFound); // F1 is potentially valid
       
   143 		wbuf.AppendUtf8L(_L8("\x80\x80\xE1"), firstprob);
       
   144 		ASSERT(firstprob == 0); // Technically it's the -1th byte of what we just passed in, but we can only say zero
       
   145 		wbuf.FinalizeUtf8(firstprob);
       
   146 		ASSERT(firstprob == 2); // The overall first invalid byte was byte 1, the 0xF1
       
   147 		wbuf.Close();
       
   148 		CleanupStack::Pop(); // Panicker
       
   149 		}
       
   150 	else
       
   151 		{
       
   152 		RFile file;
       
   153 		LeaveIfErr(file.Open(FsL(), iFile, EFileRead), _L("Couldn't open file %S"), &iFile);
       
   154 		CleanupClosePushL(file);
       
   155 		TInt fileSize;
       
   156 		LeaveIfErr(file.Size(fileSize), _L("Couldn't get file size"));
       
   157 		RBuf8 nbuf;
       
   158 		nbuf.CreateL(iBlockSize ? iBlockSize : fileSize);
       
   159 		CleanupClosePushL(nbuf);
       
   160 		RLtkBuf buf;
       
   161 		CleanupClosePushL(buf);
       
   162 		TInt read = 0;
       
   163 		while (read < fileSize)
       
   164 			{
       
   165 			nbuf.Zero();
       
   166 			LeaveIfErr(file.Read(nbuf), _L("Couldn't read file"));
       
   167 			read += nbuf.Length();
       
   168 			buf.Zero();
       
   169 			buf.AppendUtf8L(nbuf);
       
   170 			Write(buf);
       
   171 			}
       
   172 		TInt unconverted;
       
   173 		buf.FinalizeUtf8(unconverted);
       
   174 		Printf(_L("First bad byte: %d\r\n"), unconverted);
       
   175 		Write(buf);
       
   176 
       
   177 		CleanupStack::PopAndDestroy(3, &file); // buf, nbuf, file
       
   178 		}
       
   179 	}
       
   180 
       
   181 /*
       
   182 This turned out to not actually make much difference, but it might be useful in future...
       
   183 
       
   184 static const TInt8 KFirstByteSequenceLengths[] = { // 40 chars per line
       
   185 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
       
   186 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
       
   187 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 
       
   188 	1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
       
   189 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 
       
   190 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
       
   191 	4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       
   192 	};
       
   193 __ASSERT_COMPILE(sizeof(KFirstByteSequenceLengths) == 256);
       
   194 ASSERT(KFirstByteSequenceLengths[0x7F] == 1);
       
   195 ASSERT(KFirstByteSequenceLengths[0x80] == 0);
       
   196 ASSERT(KFirstByteSequenceLengths[0xC1] == 0);
       
   197 ASSERT(KFirstByteSequenceLengths[0xC2] == 2);
       
   198 ASSERT(KFirstByteSequenceLengths[0xDF] == 2);
       
   199 ASSERT(KFirstByteSequenceLengths[0xE0] == 3);
       
   200 ASSERT(KFirstByteSequenceLengths[0xEF] == 3);
       
   201 ASSERT(KFirstByteSequenceLengths[0xF0] == 4);
       
   202 ASSERT(KFirstByteSequenceLengths[0xF4] == 4);
       
   203 ASSERT(KFirstByteSequenceLengths[0xF5] == 0);
       
   204 ASSERT(KFirstByteSequenceLengths[0xFF] == 0);
       
   205 */