|
1 // tutf8.cpp |
|
2 // |
|
3 // Copyright (c) 2010 Accenture. All rights reserved. |
|
4 // This component and the accompanying materials are made available |
|
5 // under the terms of the "Eclipse Public License v1.0" |
|
6 // which accompanies this distribution, and is available |
|
7 // at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 // |
|
9 // Initial Contributors: |
|
10 // Accenture - Initial contribution |
|
11 // |
|
12 |
|
13 #include <fshell/ioutils.h> |
|
14 #include <fshell/common.mmh> |
|
15 #include <fshell/descriptorutils.h> |
|
16 |
|
17 using namespace IoUtils; |
|
18 using namespace LtkUtils; |
|
19 |
|
20 class CCmdtutf8 : public CCommandBase |
|
21 { |
|
22 public: |
|
23 static CCommandBase* NewLC(); |
|
24 ~CCmdtutf8(); |
|
25 private: |
|
26 CCmdtutf8(); |
|
27 private: // From CCommandBase. |
|
28 virtual const TDesC& Name() const; |
|
29 virtual const TDesC& Description() const; |
|
30 virtual void DoRunL(); |
|
31 virtual void ArgumentsL(RCommandArgumentList& aArguments); |
|
32 virtual void OptionsL(RCommandOptionList& aOptions); |
|
33 private: |
|
34 TFileName2 iFile; |
|
35 TInt iBlockSize; |
|
36 }; |
|
37 |
|
// NOTE(review): EXE_BOILER_PLATE is defined outside this file; presumably it expands to the
// executable's entry point that constructs and runs CCmdtutf8 - confirm against the fshell headers.
EXE_BOILER_PLATE(CCmdtutf8)
|
39 |
|
40 CCommandBase* CCmdtutf8::NewLC() |
|
41 { |
|
42 CCmdtutf8* self = new(ELeave) CCmdtutf8(); |
|
43 CleanupStack::PushL(self); |
|
44 self->BaseConstructL(); |
|
45 return self; |
|
46 } |
|
47 |
|
48 CCmdtutf8::~CCmdtutf8() |
|
49 { |
|
50 } |
|
51 |
|
52 CCmdtutf8::CCmdtutf8() |
|
53 { |
|
54 } |
|
55 |
|
56 const TDesC& CCmdtutf8::Name() const |
|
57 { |
|
58 _LIT(KName, "tutf8"); |
|
59 return KName; |
|
60 } |
|
61 |
|
62 const TDesC& CCmdtutf8::Description() const |
|
63 { |
|
64 _LIT(KDescription, "Test for RLtkBuf8::CopyAsUtf8L() and RLtkBuf16::AppendUtf8L()."); |
|
65 return KDescription; |
|
66 } |
|
67 |
|
68 void CCmdtutf8::ArgumentsL(RCommandArgumentList& aArguments) |
|
69 { |
|
70 aArguments.AppendFileNameL(iFile, _L("filename"), _L("If specified, parse this file as UTF-8 and print the results"), KValueTypeFlagOptional); |
|
71 } |
|
72 |
|
73 void CCmdtutf8::OptionsL(RCommandOptionList& aOptions) |
|
74 { |
|
75 aOptions.AppendIntL(iBlockSize, 'b', _L("blocksize"), _L("block size for parsing the specified file")); |
|
76 } |
|
77 |
|
78 void CCmdtutf8::DoRunL() |
|
79 { |
|
80 if (iFile.Length() == 0) |
|
81 { |
|
82 CleanupStack::PushL((CBase*)1); // Panicker |
|
83 _LIT(KTest, "A \u03A9 \u8A9E \uFFFD \uFEFF \uD800 "); // The original UTF-16 string: A LowercaseOmega SomeGlyphOrOther ReplacementChar ZWNBSP UnmatchedLeadingSurrogate |
|
84 _LIT8(KOut, "A \xCE\xA9 \xE8\xAA\x9E \xEF\xBF\xBD \xEF\xBB\xBF \xEF\xBF\xBD "); // What it should be in UTF-8 |
|
85 _LIT(KOutInUnicode, "A \u03A9 \u8A9E \uFFFD \uFEFF \uFFFD "); // Almost the same as the original, except that the UnmatchedSurrogate was transformed into ReplacementChar in UTF-8 so the last char here is U+FFFD |
|
86 RLtkBuf8 buf; |
|
87 buf.CopyAsUtf8L(KTest); |
|
88 ASSERT(buf == KOut()); |
|
89 buf.Close(); |
|
90 |
|
91 RLtkBuf16 wbuf; |
|
92 wbuf.AppendUtf8L(KOut().Left(6)); |
|
93 ASSERT(wbuf.Length() == 4); // Testing that only the 4 complete characters are in there |
|
94 wbuf.AppendUtf8L(KNullDesC8()); |
|
95 ASSERT(wbuf.Length() == 4); // Testing that appending a null descriptor hasn't changed the length (or crashed) |
|
96 TInt firstprob; |
|
97 wbuf.FinalizeUtf8(firstprob); |
|
98 ASSERT(firstprob == 5); // Correctly indentified the first invalid bit |
|
99 _LIT(KFirstFrag, "A \u03A9 \uFFFD"); |
|
100 ASSERT(wbuf == KFirstFrag()); |
|
101 wbuf.SetLength(4); |
|
102 wbuf.AppendUtf8L(KOut().Mid(5,1)); |
|
103 wbuf.ReAllocL(256); // Be really evil and realloc the buffer while we have fragmented bytes cached |
|
104 ASSERT(wbuf == KOutInUnicode().Left(4)); |
|
105 wbuf.AppendUtf8L(KOut().Mid(6,1)); |
|
106 ASSERT(wbuf == KOutInUnicode().Left(4)); |
|
107 wbuf.AppendUtf8L(KOut().Mid(7,3)); |
|
108 ASSERT(wbuf == KOutInUnicode().Left(6)); |
|
109 wbuf.AppendUtf8L(KOut().Mid(10)); |
|
110 ASSERT(wbuf == KOutInUnicode()); |
|
111 wbuf.FinalizeUtf8(firstprob); |
|
112 ASSERT(firstprob == KErrNotFound); |
|
113 wbuf.Close(); |
|
114 |
|
115 _LIT8(KBomTest, "\xEF\xBB\xBF BB \xEF\xBB\xBF"); |
|
116 _LIT(KBomOutput, " BB \uFEFF"); |
|
117 wbuf.AppendUtf8L(KBomTest); |
|
118 wbuf.FinalizeUtf8(firstprob); |
|
119 ASSERT(wbuf == KBomOutput()); |
|
120 ASSERT(firstprob == KErrNotFound); |
|
121 wbuf.Close(); |
|
122 |
|
123 wbuf.AppendUtf8L(KBomTest().Left(2)); |
|
124 wbuf.AppendUtf8L(KBomTest().Mid(2, 5)); |
|
125 wbuf.AppendUtf8L(KBomTest().Mid(7)); |
|
126 wbuf.FinalizeUtf8(firstprob); |
|
127 ASSERT(wbuf == KBomOutput()); |
|
128 ASSERT(firstprob == KErrNotFound); |
|
129 wbuf.Close(); |
|
130 |
|
131 // Maximal subexpression replacement test - Example taken from unicode standard section 3.9, table 3-8. |
|
132 _LIT8(KInvalid, "\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64"); |
|
133 _LIT(KInvalidOutput, "\u0061\uFFFD\uFFFD\uFFFD\u0062\uFFFD\u0063\uFFFD\uFFFD\u0064"); // And this is how the standard recommends it is processed |
|
134 wbuf.AppendUtf8L(KInvalid); |
|
135 wbuf.FinalizeUtf8(firstprob); |
|
136 ASSERT(wbuf == KInvalidOutput()); |
|
137 ASSERT(firstprob == 1); |
|
138 wbuf.Close(); |
|
139 |
|
140 // Check that the first bad byte calculations are right |
|
141 wbuf.AppendUtf8L(_L8(" \x61\xF1"), firstprob); |
|
142 ASSERT(firstprob == KErrNotFound); // F1 is potentially valid |
|
143 wbuf.AppendUtf8L(_L8("\x80\x80\xE1"), firstprob); |
|
144 ASSERT(firstprob == 0); // Technically it's the -1th byte of what we just passed in, but we can only say zero |
|
145 wbuf.FinalizeUtf8(firstprob); |
|
146 ASSERT(firstprob == 2); // The overall first invalid byte was byte 1, the 0xF1 |
|
147 wbuf.Close(); |
|
148 CleanupStack::Pop(); // Panicker |
|
149 } |
|
150 else |
|
151 { |
|
152 RFile file; |
|
153 LeaveIfErr(file.Open(FsL(), iFile, EFileRead), _L("Couldn't open file %S"), &iFile); |
|
154 CleanupClosePushL(file); |
|
155 TInt fileSize; |
|
156 LeaveIfErr(file.Size(fileSize), _L("Couldn't get file size")); |
|
157 RBuf8 nbuf; |
|
158 nbuf.CreateL(iBlockSize ? iBlockSize : fileSize); |
|
159 CleanupClosePushL(nbuf); |
|
160 RLtkBuf buf; |
|
161 CleanupClosePushL(buf); |
|
162 TInt read = 0; |
|
163 while (read < fileSize) |
|
164 { |
|
165 nbuf.Zero(); |
|
166 LeaveIfErr(file.Read(nbuf), _L("Couldn't read file")); |
|
167 read += nbuf.Length(); |
|
168 buf.Zero(); |
|
169 buf.AppendUtf8L(nbuf); |
|
170 Write(buf); |
|
171 } |
|
172 TInt unconverted; |
|
173 buf.FinalizeUtf8(unconverted); |
|
174 Printf(_L("First bad byte: %d\r\n"), unconverted); |
|
175 Write(buf); |
|
176 |
|
177 CleanupStack::PopAndDestroy(3, &file); // buf, nbuf, file |
|
178 } |
|
179 } |
|
180 |
|
181 /* |
|
182 This turned out to not actually make much difference, but it might be useful in future... |
|
183 |
|
184 static const TInt8 KFirstByteSequenceLengths[] = { // 40 chars per line |
|
185 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
|
186 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
|
187 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
|
188 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
189 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, |
|
190 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
|
191 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
192 }; |
|
193 __ASSERT_COMPILE(sizeof(KFirstByteSequenceLengths) == 256); |
|
194 ASSERT(KFirstByteSequenceLengths[0x7F] == 1); |
|
195 ASSERT(KFirstByteSequenceLengths[0x80] == 0); |
|
196 ASSERT(KFirstByteSequenceLengths[0xC1] == 0); |
|
197 ASSERT(KFirstByteSequenceLengths[0xC2] == 2); |
|
198 ASSERT(KFirstByteSequenceLengths[0xDF] == 2); |
|
199 ASSERT(KFirstByteSequenceLengths[0xE0] == 3); |
|
200 ASSERT(KFirstByteSequenceLengths[0xEF] == 3); |
|
201 ASSERT(KFirstByteSequenceLengths[0xF0] == 4); |
|
202 ASSERT(KFirstByteSequenceLengths[0xF4] == 4); |
|
203 ASSERT(KFirstByteSequenceLengths[0xF5] == 0); |
|
204 ASSERT(KFirstByteSequenceLengths[0xFF] == 0); |
|
205 */ |