diff -r 000000000000 -r 7f656887cf89 libraries/ltkutils/tsrc/tutf8.cpp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libraries/ltkutils/tsrc/tutf8.cpp Wed Jun 23 15:52:26 2010 +0100 @@ -0,0 +1,205 @@ +// tutf8.cpp +// +// Copyright (c) 2010 Accenture. All rights reserved. +// This component and the accompanying materials are made available +// under the terms of the "Eclipse Public License v1.0" +// which accompanies this distribution, and is available +// at the URL "http://www.eclipse.org/legal/epl-v10.html". +// +// Initial Contributors: +// Accenture - Initial contribution +// + +#include +#include +#include + +using namespace IoUtils; +using namespace LtkUtils; + +class CCmdtutf8 : public CCommandBase + { +public: + static CCommandBase* NewLC(); + ~CCmdtutf8(); +private: + CCmdtutf8(); +private: // From CCommandBase. + virtual const TDesC& Name() const; + virtual const TDesC& Description() const; + virtual void DoRunL(); + virtual void ArgumentsL(RCommandArgumentList& aArguments); + virtual void OptionsL(RCommandOptionList& aOptions); +private: + TFileName2 iFile; + TInt iBlockSize; + }; + +EXE_BOILER_PLATE(CCmdtutf8) + +CCommandBase* CCmdtutf8::NewLC() + { + CCmdtutf8* self = new(ELeave) CCmdtutf8(); + CleanupStack::PushL(self); + self->BaseConstructL(); + return self; + } + +CCmdtutf8::~CCmdtutf8() + { + } + +CCmdtutf8::CCmdtutf8() + { + } + +const TDesC& CCmdtutf8::Name() const + { + _LIT(KName, "tutf8"); + return KName; + } + +const TDesC& CCmdtutf8::Description() const + { + _LIT(KDescription, "Test for RLtkBuf8::CopyAsUtf8L() and RLtkBuf16::AppendUtf8L()."); + return KDescription; + } + +void CCmdtutf8::ArgumentsL(RCommandArgumentList& aArguments) + { + aArguments.AppendFileNameL(iFile, _L("filename"), _L("If specified, parse this file as UTF-8 and print the results"), KValueTypeFlagOptional); + } + +void CCmdtutf8::OptionsL(RCommandOptionList& aOptions) + { + aOptions.AppendIntL(iBlockSize, 'b', _L("blocksize"), _L("block size for parsing the specified file")); + } + +void CCmdtutf8::DoRunL() + { + if (iFile.Length() == 0) + { + CleanupStack::PushL((CBase*)1); // Panicker + _LIT(KTest, "A \u03A9 \u8A9E \uFFFD \uFEFF \uD800 "); // The original UTF-16 string: A LowercaseOmega SomeGlyphOrOther ReplacementChar ZWNBSP UnmatchedLeadingSurrogate + _LIT8(KOut, "A \xCE\xA9 \xE8\xAA\x9E \xEF\xBF\xBD \xEF\xBB\xBF \xEF\xBF\xBD "); // What it should be in UTF-8 + _LIT(KOutInUnicode, "A \u03A9 \u8A9E \uFFFD \uFEFF \uFFFD "); // Almost the same as the original, except that the UnmatchedSurrogate was transformed into ReplacementChar in UTF-8 so the last char here is U+FFFD + RLtkBuf8 buf; + buf.CopyAsUtf8L(KTest); + ASSERT(buf == KOut()); + buf.Close(); + + RLtkBuf16 wbuf; + wbuf.AppendUtf8L(KOut().Left(6)); + ASSERT(wbuf.Length() == 4); // Testing that only the 4 complete characters are in there + wbuf.AppendUtf8L(KNullDesC8()); + ASSERT(wbuf.Length() == 4); // Testing that appending a null descriptor hasn't changed the length (or crashed) + TInt firstprob; + wbuf.FinalizeUtf8(firstprob); + ASSERT(firstprob == 5); // Correctly indentified the first invalid bit + _LIT(KFirstFrag, "A \u03A9 \uFFFD"); + ASSERT(wbuf == KFirstFrag()); + wbuf.SetLength(4); + wbuf.AppendUtf8L(KOut().Mid(5,1)); + wbuf.ReAllocL(256); // Be really evil and realloc the buffer while we have fragmented bytes cached + ASSERT(wbuf == KOutInUnicode().Left(4)); + wbuf.AppendUtf8L(KOut().Mid(6,1)); + ASSERT(wbuf == KOutInUnicode().Left(4)); + wbuf.AppendUtf8L(KOut().Mid(7,3)); + ASSERT(wbuf == KOutInUnicode().Left(6)); + wbuf.AppendUtf8L(KOut().Mid(10)); + ASSERT(wbuf == KOutInUnicode()); + wbuf.FinalizeUtf8(firstprob); + ASSERT(firstprob == KErrNotFound); + wbuf.Close(); + + _LIT8(KBomTest, "\xEF\xBB\xBF BB \xEF\xBB\xBF"); + _LIT(KBomOutput, " BB \uFEFF"); + wbuf.AppendUtf8L(KBomTest); + wbuf.FinalizeUtf8(firstprob); + ASSERT(wbuf == KBomOutput()); + ASSERT(firstprob == KErrNotFound); + wbuf.Close(); + + wbuf.AppendUtf8L(KBomTest().Left(2)); + wbuf.AppendUtf8L(KBomTest().Mid(2, 5)); + wbuf.AppendUtf8L(KBomTest().Mid(7)); + wbuf.FinalizeUtf8(firstprob); + ASSERT(wbuf == KBomOutput()); + ASSERT(firstprob == KErrNotFound); + wbuf.Close(); + + // Maximal subexpression replacement test - Example taken from unicode standard section 3.9, table 3-8. + _LIT8(KInvalid, "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64"); + _LIT(KInvalidOutput, "\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064"); // And this is how the standard recommends it is processed + wbuf.AppendUtf8L(KInvalid); + wbuf.FinalizeUtf8(firstprob); + ASSERT(wbuf == KInvalidOutput()); + ASSERT(firstprob == 1); + wbuf.Close(); + + // Check that the first bad byte calculations are right + wbuf.AppendUtf8L(_L8(" \x61\xF1"), firstprob); + ASSERT(firstprob == KErrNotFound); // F1 is potentially valid + wbuf.AppendUtf8L(_L8("\x80\x80\xE1"), firstprob); + ASSERT(firstprob == 0); // Technically it's the -1th byte of what we just passed in, but we can only say zero + wbuf.FinalizeUtf8(firstprob); + ASSERT(firstprob == 2); // The overall first invalid byte was byte 1, the 0xF1 + wbuf.Close(); + CleanupStack::Pop(); // Panicker + } + else + { + RFile file; + LeaveIfErr(file.Open(FsL(), iFile, EFileRead), _L("Couldn't open file %S"), &iFile); + CleanupClosePushL(file); + TInt fileSize; + LeaveIfErr(file.Size(fileSize), _L("Couldn't get file size")); + RBuf8 nbuf; + nbuf.CreateL(iBlockSize ? iBlockSize : fileSize); + CleanupClosePushL(nbuf); + RLtkBuf buf; + CleanupClosePushL(buf); + TInt read = 0; + while (read < fileSize) + { + nbuf.Zero(); + LeaveIfErr(file.Read(nbuf), _L("Couldn't read file")); + read += nbuf.Length(); + buf.Zero(); + buf.AppendUtf8L(nbuf); + Write(buf); + } + TInt unconverted; + buf.FinalizeUtf8(unconverted); + Printf(_L("First bad byte: %d\r\n"), unconverted); + Write(buf); + + CleanupStack::PopAndDestroy(3, &file); // buf, nbuf, file + } + } + +/* +This turned out to not actually make much difference, but it might be useful in future... + +static const TInt8 KFirstByteSequenceLengths[] = { // 40 chars per line + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }; +__ASSERT_COMPILE(sizeof(KFirstByteSequenceLengths) == 256); +ASSERT(KFirstByteSequenceLengths[0x7F] == 1); +ASSERT(KFirstByteSequenceLengths[0x80] == 0); +ASSERT(KFirstByteSequenceLengths[0xC1] == 0); +ASSERT(KFirstByteSequenceLengths[0xC2] == 2); +ASSERT(KFirstByteSequenceLengths[0xDF] == 2); +ASSERT(KFirstByteSequenceLengths[0xE0] == 3); +ASSERT(KFirstByteSequenceLengths[0xEF] == 3); +ASSERT(KFirstByteSequenceLengths[0xF0] == 4); +ASSERT(KFirstByteSequenceLengths[0xF4] == 4); +ASSERT(KFirstByteSequenceLengths[0xF5] == 0); +ASSERT(KFirstByteSequenceLengths[0xFF] == 0); +*/