--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/libraries/ltkutils/tsrc/tutf8.cpp Wed Jun 23 15:52:26 2010 +0100
@@ -0,0 +1,205 @@
+// tutf8.cpp
+//
+// Copyright (c) 2010 Accenture. All rights reserved.
+// This component and the accompanying materials are made available
+// under the terms of the "Eclipse Public License v1.0"
+// which accompanies this distribution, and is available
+// at the URL "http://www.eclipse.org/legal/epl-v10.html".
+//
+// Initial Contributors:
+// Accenture - Initial contribution
+//
+
+#include <fshell/ioutils.h>
+#include <fshell/common.mmh>
+#include <fshell/descriptorutils.h>
+
+using namespace IoUtils;
+using namespace LtkUtils;
+
+class CCmdtutf8 : public CCommandBase
+ {
+public:
+ static CCommandBase* NewLC();
+ ~CCmdtutf8();
+private:
+ CCmdtutf8();
+private: // From CCommandBase.
+ virtual const TDesC& Name() const;
+ virtual const TDesC& Description() const;
+ virtual void DoRunL();
+ virtual void ArgumentsL(RCommandArgumentList& aArguments);
+ virtual void OptionsL(RCommandOptionList& aOptions);
+private:
+ TFileName2 iFile;
+ TInt iBlockSize;
+ };
+
+EXE_BOILER_PLATE(CCmdtutf8)
+
+CCommandBase* CCmdtutf8::NewLC()
+ {
+ CCmdtutf8* self = new(ELeave) CCmdtutf8();
+ CleanupStack::PushL(self);
+ self->BaseConstructL();
+ return self;
+ }
+
+CCmdtutf8::~CCmdtutf8()
+ {
+ }
+
+CCmdtutf8::CCmdtutf8()
+ {
+ }
+
+const TDesC& CCmdtutf8::Name() const
+ {
+ _LIT(KName, "tutf8");
+ return KName;
+ }
+
+const TDesC& CCmdtutf8::Description() const
+ {
+ _LIT(KDescription, "Test for RLtkBuf8::CopyAsUtf8L() and RLtkBuf16::AppendUtf8L().");
+ return KDescription;
+ }
+
+void CCmdtutf8::ArgumentsL(RCommandArgumentList& aArguments)
+ {
+ aArguments.AppendFileNameL(iFile, _L("filename"), _L("If specified, parse this file as UTF-8 and print the results"), KValueTypeFlagOptional);
+ }
+
+void CCmdtutf8::OptionsL(RCommandOptionList& aOptions)
+ {
+ aOptions.AppendIntL(iBlockSize, 'b', _L("blocksize"), _L("block size for parsing the specified file"));
+ }
+
+void CCmdtutf8::DoRunL()
+ {
+ if (iFile.Length() == 0)
+ {
+ CleanupStack::PushL((CBase*)1); // Panicker
+ _LIT(KTest, "A \u03A9 \u8A9E \uFFFD \uFEFF \uD800 "); // The original UTF-16 string: A LowercaseOmega SomeGlyphOrOther ReplacementChar ZWNBSP UnmatchedLeadingSurrogate
+ _LIT8(KOut, "A \xCE\xA9 \xE8\xAA\x9E \xEF\xBF\xBD \xEF\xBB\xBF \xEF\xBF\xBD "); // What it should be in UTF-8
+ _LIT(KOutInUnicode, "A \u03A9 \u8A9E \uFFFD \uFEFF \uFFFD "); // Almost the same as the original, except that the UnmatchedSurrogate was transformed into ReplacementChar in UTF-8 so the last char here is U+FFFD
+ RLtkBuf8 buf;
+ buf.CopyAsUtf8L(KTest);
+ ASSERT(buf == KOut());
+ buf.Close();
+
+ RLtkBuf16 wbuf;
+ wbuf.AppendUtf8L(KOut().Left(6));
+ ASSERT(wbuf.Length() == 4); // Testing that only the 4 complete characters are in there
+ wbuf.AppendUtf8L(KNullDesC8());
+ ASSERT(wbuf.Length() == 4); // Testing that appending a null descriptor hasn't changed the length (or crashed)
+ TInt firstprob;
+ wbuf.FinalizeUtf8(firstprob);
+ ASSERT(firstprob == 5); // Correctly indentified the first invalid bit
+ _LIT(KFirstFrag, "A \u03A9 \uFFFD");
+ ASSERT(wbuf == KFirstFrag());
+ wbuf.SetLength(4);
+ wbuf.AppendUtf8L(KOut().Mid(5,1));
+ wbuf.ReAllocL(256); // Be really evil and realloc the buffer while we have fragmented bytes cached
+ ASSERT(wbuf == KOutInUnicode().Left(4));
+ wbuf.AppendUtf8L(KOut().Mid(6,1));
+ ASSERT(wbuf == KOutInUnicode().Left(4));
+ wbuf.AppendUtf8L(KOut().Mid(7,3));
+ ASSERT(wbuf == KOutInUnicode().Left(6));
+ wbuf.AppendUtf8L(KOut().Mid(10));
+ ASSERT(wbuf == KOutInUnicode());
+ wbuf.FinalizeUtf8(firstprob);
+ ASSERT(firstprob == KErrNotFound);
+ wbuf.Close();
+
+ _LIT8(KBomTest, "\xEF\xBB\xBF BB \xEF\xBB\xBF");
+ _LIT(KBomOutput, " BB \uFEFF");
+ wbuf.AppendUtf8L(KBomTest);
+ wbuf.FinalizeUtf8(firstprob);
+ ASSERT(wbuf == KBomOutput());
+ ASSERT(firstprob == KErrNotFound);
+ wbuf.Close();
+
+ wbuf.AppendUtf8L(KBomTest().Left(2));
+ wbuf.AppendUtf8L(KBomTest().Mid(2, 5));
+ wbuf.AppendUtf8L(KBomTest().Mid(7));
+ wbuf.FinalizeUtf8(firstprob);
+ ASSERT(wbuf == KBomOutput());
+ ASSERT(firstprob == KErrNotFound);
+ wbuf.Close();
+
+ // Maximal subexpression replacement test - Example taken from unicode standard section 3.9, table 3-8.
+ _LIT8(KInvalid, "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64");
+ _LIT(KInvalidOutput, "\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064"); // And this is how the standard recommends it is processed
+ wbuf.AppendUtf8L(KInvalid);
+ wbuf.FinalizeUtf8(firstprob);
+ ASSERT(wbuf == KInvalidOutput());
+ ASSERT(firstprob == 1);
+ wbuf.Close();
+
+ // Check that the first bad byte calculations are right
+ wbuf.AppendUtf8L(_L8(" \x61\xF1"), firstprob);
+ ASSERT(firstprob == KErrNotFound); // F1 is potentially valid
+ wbuf.AppendUtf8L(_L8("\x80\x80\xE1"), firstprob);
+ ASSERT(firstprob == 0); // Technically it's the -1th byte of what we just passed in, but we can only say zero
+ wbuf.FinalizeUtf8(firstprob);
+ ASSERT(firstprob == 2); // The overall first invalid byte was byte 1, the 0xF1
+ wbuf.Close();
+ CleanupStack::Pop(); // Panicker
+ }
+ else
+ {
+ RFile file;
+ LeaveIfErr(file.Open(FsL(), iFile, EFileRead), _L("Couldn't open file %S"), &iFile);
+ CleanupClosePushL(file);
+ TInt fileSize;
+ LeaveIfErr(file.Size(fileSize), _L("Couldn't get file size"));
+ RBuf8 nbuf;
+ nbuf.CreateL(iBlockSize ? iBlockSize : fileSize);
+ CleanupClosePushL(nbuf);
+ RLtkBuf buf;
+ CleanupClosePushL(buf);
+ TInt read = 0;
+ while (read < fileSize)
+ {
+ nbuf.Zero();
+ LeaveIfErr(file.Read(nbuf), _L("Couldn't read file"));
+ read += nbuf.Length();
+ buf.Zero();
+ buf.AppendUtf8L(nbuf);
+ Write(buf);
+ }
+ TInt unconverted;
+ buf.FinalizeUtf8(unconverted);
+ Printf(_L("First bad byte: %d\r\n"), unconverted);
+ Write(buf);
+
+ CleanupStack::PopAndDestroy(3, &file); // buf, nbuf, file
+ }
+ }
+
+/*
+This turned out to not actually make much difference, but it might be useful in future...
+
+static const TInt8 KFirstByteSequenceLengths[] = { // 40 chars per line
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ };
+__ASSERT_COMPILE(sizeof(KFirstByteSequenceLengths) == 256);
+ASSERT(KFirstByteSequenceLengths[0x7F] == 1);
+ASSERT(KFirstByteSequenceLengths[0x80] == 0);
+ASSERT(KFirstByteSequenceLengths[0xC1] == 0);
+ASSERT(KFirstByteSequenceLengths[0xC2] == 2);
+ASSERT(KFirstByteSequenceLengths[0xDF] == 2);
+ASSERT(KFirstByteSequenceLengths[0xE0] == 3);
+ASSERT(KFirstByteSequenceLengths[0xEF] == 3);
+ASSERT(KFirstByteSequenceLengths[0xF0] == 4);
+ASSERT(KFirstByteSequenceLengths[0xF4] == 4);
+ASSERT(KFirstByteSequenceLengths[0xF5] == 0);
+ASSERT(KFirstByteSequenceLengths[0xFF] == 0);
+*/