|
1 /* |
|
2 * Copyright (c) 2000 - 2001 Nokia Corporation and/or its subsidiary(-ies). |
|
3 * All rights reserved. |
|
4 * This component and the accompanying materials are made available |
|
5 * under the terms of the License "Eclipse Public License v1.0" |
|
6 * which accompanies this distribution, and is available |
|
7 * at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
8 * |
|
9 * Initial Contributors: |
|
10 * Nokia Corporation - initial contribution. |
|
11 * |
|
12 * Contributors: |
|
13 * |
|
14 * Description: |
|
15 * |
|
16 */ |
|
17 |
|
18 |
|
19 /* |
|
20 This module provides a text (character) stream, pointers into the stream |
|
21 and operations on segments of the stream as though they were strings. |
|
22 The goal is to isolate the client from stream buffers, cross buffer |
|
23 issues and some character set encoding concerns. |
|
24 |
|
25 This particular version is for input from a single buffer. |
|
26 */ |
|
27 |
|
28 #include "cxml_internal.h" |
|
29 #include <xml/cxml/nw_string_char.h> |
|
30 #include <xml/cxml/nw_xmlp_xmlreader.h> |
|
31 |
|
32 static |
|
33 NW_Status_t |
|
34 NW_XML_Reader_ReadAsciiChar(NW_Uint32 c, NW_Uint32* pReturnChar) |
|
35 { |
|
36 /* This looks a bit weird but the idea is to force the conversion |
|
37 of the ASCII character through the same function that is used |
|
38 to read a character from the text. This will impose the same conversion |
|
39 limitations and the same result encoding. */ |
|
40 NW_Int32 byteCount; |
|
41 NW_Uint8 buf[2]; |
|
42 NW_Ucs2 c_ucs2; |
|
43 buf[0] = (NW_Uint8)(c & 0xff); |
|
44 buf[1] = 0; |
|
45 /* should only use this function for ASCII */ |
|
46 if (c > 127) { |
|
47 return NW_STAT_FAILURE; |
|
48 } |
|
49 /* call it UTF-8 because ASCII doesn't work with NW_String_readChar() |
|
50 at the moment */ |
|
51 byteCount = NW_String_readChar((NW_Byte*)buf, &c_ucs2, HTTP_utf_8); |
|
52 if (byteCount != 1) { |
|
53 return NW_STAT_FAILURE; |
|
54 } |
|
55 *pReturnChar = c_ucs2; |
|
56 return NW_STAT_SUCCESS; |
|
57 } |
|
58 |
|
59 /* assumes this is just a handoff of the buffer (i.e., won't make a copy) */ |
|
60 EXPORT_C NW_Status_t |
|
61 NW_XML_Reader_InitFromBuffer(NW_XML_Reader_t* pT, NW_Uint32 length, unsigned char* pBuf) |
|
62 { |
|
63 pT->encoding = 0; |
|
64 pT->endianness = NW_NATIVE_ENDIAN; |
|
65 pT->index = 0; |
|
66 pT->charIndex = 0; |
|
67 pT->lineColumn.crCount = 0; |
|
68 pT->lineColumn.lfCount = 0; |
|
69 pT->lineColumn.charsSinceLastCR = 0; |
|
70 pT->lineColumn.charsSinceLastLF = 0; |
|
71 pT->end = 0; |
|
72 pT->length = length; |
|
73 pT->pBuf = pBuf; |
|
74 return NW_STAT_SUCCESS; |
|
75 } |
|
76 |
|
77 EXPORT_C NW_Status_t |
|
78 NW_XML_Reader_DataAddressFromBuffer(NW_XML_Reader_t* pT, |
|
79 NW_Uint32 start, NW_Uint32* length, |
|
80 unsigned char** ppData) |
|
81 { |
|
82 NW_ASSERT(start < pT->length); |
|
83 NW_ASSERT(*length <= pT->length); |
|
84 NW_ASSERT((start + *length) <= pT->length); |
|
85 *ppData = NULL; |
|
86 if (start < pT->length) { |
|
87 *ppData = pT->pBuf + start; |
|
88 *length = (((start + *length) <= pT->length) ? |
|
89 *length : (pT->length - start)); |
|
90 return NW_STAT_SUCCESS; |
|
91 } |
|
92 return NW_STAT_FAILURE; |
|
93 } |
|
94 |
|
95 /* peekOrAdvance: first arg "advance": peek = 0, advance = 1 */ |
|
96 static |
|
97 NW_Status_t |
|
98 NW_XML_Reader_PeekOrAdvanceOffset(NW_Bool advance, NW_XML_Reader_t* pT, |
|
99 NW_Uint32 offsetCharCount, NW_Uint32* pC) |
|
100 { |
|
101 NW_Ucs2 c_ucs2; |
|
102 NW_Uint32 i; |
|
103 NW_Uint32 charCount = 0; |
|
104 NW_Int32 byteCount = 0; |
|
105 NW_Uint32 crCount = 0; |
|
106 NW_Uint32 lfCount = 0; |
|
107 NW_Uint32 charsPastCR = 0; |
|
108 NW_Uint32 charsPastLF = 0; |
|
109 NW_Bool resetPastCR = 0; |
|
110 NW_Bool resetPastLF = 0; |
|
111 |
|
112 NW_ASSERT(!(advance && !offsetCharCount)); |
|
113 |
|
114 if (pT->end) { |
|
115 return NW_STAT_FAILURE; |
|
116 } |
|
117 for (i = pT->index; i < pT->length; i += (NW_Uint32)byteCount) { |
|
118 NW_ASSERT(charCount <= offsetCharCount); |
|
119 |
|
120 /* It is assumed that this func returns UNICODE code points. */ |
|
121 byteCount = NW_String_readChar((NW_Byte*)&(pT->pBuf[i]), |
|
122 &c_ucs2, pT->encoding); |
|
123 *pC = c_ucs2; |
|
124 if (byteCount == -1) { |
|
125 return NW_STAT_FAILURE; |
|
126 } |
|
127 if (charCount == offsetCharCount) { |
|
128 /* This catches NW_String_readChar() reading past buffer end |
|
129 and can be removed when the readChar function does proper |
|
130 error checking. */ |
|
131 if ((i + (NW_Uint32)byteCount) > pT->length) { |
|
132 return NW_STAT_FAILURE; |
|
133 } |
|
134 break; |
|
135 } |
|
136 charCount++; |
|
137 charsPastCR++; |
|
138 charsPastLF++; |
|
139 if (c_ucs2 == 0xd /* CR */) { |
|
140 crCount++; |
|
141 resetPastCR = 1; |
|
142 charsPastCR = 0; |
|
143 } else if (c_ucs2 == 0xa /* LF */) { |
|
144 lfCount++; |
|
145 resetPastLF = 1; |
|
146 charsPastLF = 0; |
|
147 } |
|
148 } |
|
149 if (i >= pT->length) { |
|
150 pT->end = 1; |
|
151 } |
|
152 /* This catches NW_String_readChar() reading past buffer end and can be |
|
153 removed when the readChar function does proper error checking. */ |
|
154 if (i > pT->length) { |
|
155 return NW_STAT_FAILURE; |
|
156 } |
|
157 if (advance) { |
|
158 pT->index = i; |
|
159 pT->charIndex += charCount; |
|
160 pT->lineColumn.crCount += crCount; |
|
161 pT->lineColumn.lfCount += lfCount; |
|
162 if (resetPastCR) { |
|
163 pT->lineColumn.charsSinceLastCR = charsPastCR; |
|
164 } else { |
|
165 pT->lineColumn.charsSinceLastCR += charsPastCR; |
|
166 } |
|
167 if (resetPastLF) { |
|
168 pT->lineColumn.charsSinceLastLF = charsPastLF; |
|
169 } else { |
|
170 pT->lineColumn.charsSinceLastLF += charsPastLF; |
|
171 } |
|
172 } |
|
173 return NW_STAT_SUCCESS; |
|
174 } |
|
175 |
|
176 EXPORT_C NW_Status_t |
|
177 NW_XML_Reader_PeekOffset(NW_XML_Reader_t* pT, NW_Uint32 nChars, NW_Uint32* pC) |
|
178 { |
|
179 return NW_XML_Reader_PeekOrAdvanceOffset(0, pT, nChars, pC); |
|
180 } |
|
181 |
|
182 EXPORT_C NW_Status_t |
|
183 NW_XML_Reader_AdvanceOffset(NW_XML_Reader_t* pT, NW_Uint32 nChars) |
|
184 { |
|
185 NW_Uint32 c; |
|
186 return NW_XML_Reader_PeekOrAdvanceOffset(1, pT, nChars, &c); |
|
187 } |
|
188 |
|
189 EXPORT_C |
|
190 void NW_XML_Reader_GetPosition(NW_XML_Reader_t* pT, NW_Uint32* pByteIndex, |
|
191 NW_Uint32* pCharIndex, |
|
192 NW_XML_Reader_LineColumn_t* pLineColumn) |
|
193 { |
|
194 *pByteIndex = pT->index; |
|
195 *pCharIndex = pT->charIndex; |
|
196 pLineColumn->crCount = pT->lineColumn.crCount; |
|
197 pLineColumn->lfCount = pT->lineColumn.lfCount; |
|
198 pLineColumn->charsSinceLastCR = pT->lineColumn.charsSinceLastCR; |
|
199 pLineColumn->charsSinceLastLF = pT->lineColumn.charsSinceLastLF; |
|
200 } |
|
201 |
|
202 /* Note: Setting the position (similar to seeking in a file) is in general |
|
203 not possible without reading the characters (usually reading forward) because |
|
204 character encoding may use a variable numbers of bytes per character. This is |
|
205 here so that if you have defined a valid interval, then you can reposition to |
|
206 the beginning of the interval. Setting to the position to a bad value will |
|
207 not always be caught immediately. Don't forget to also save and set line |
|
208 and column with position. */ |
|
209 EXPORT_C void |
|
210 NW_XML_Reader_SetPosition(NW_XML_Reader_t* pT, NW_Uint32 byteIndex, |
|
211 NW_Uint32 charIndex, |
|
212 const NW_XML_Reader_LineColumn_t* pLineColumn) |
|
213 { |
|
214 pT->index = byteIndex; |
|
215 pT->charIndex = charIndex; |
|
216 pT->lineColumn.crCount = pLineColumn->crCount; |
|
217 pT->lineColumn.lfCount = pLineColumn->lfCount; |
|
218 pT->lineColumn.charsSinceLastCR = pLineColumn->charsSinceLastCR; |
|
219 pT->lineColumn.charsSinceLastLF = pLineColumn->charsSinceLastLF; |
|
220 } |
|
221 |
|
222 /* |
|
223 Reader Interval Functions |
|
224 */ |
|
225 |
|
226 EXPORT_C void |
|
227 NW_XML_Reader_Interval_Start(NW_XML_Reader_Interval_t* pI, NW_XML_Reader_t* pT) |
|
228 { |
|
229 /* set both start and stop for safety in later use */ |
|
230 pI->start = pI->stop = pT->index; |
|
231 pI->charStart = pI->charStop = pT->charIndex; |
|
232 } |
|
233 |
|
234 EXPORT_C void |
|
235 NW_XML_Reader_Interval_Stop(NW_XML_Reader_Interval_t* pI, NW_XML_Reader_t* pT) |
|
236 { |
|
237 pI->stop = pT->index; |
|
238 pI->charStop = pT->charIndex; |
|
239 } |
|
240 |
|
241 /* BEGIN GENERIC Reader CHARACTER AND STRING FUNCTIONS */ |
|
242 |
|
243 /* pMatch is 1 if ASCII character c matches Reader char in its encoding */ |
|
244 EXPORT_C NW_Status_t |
|
245 NW_XML_Reader_AsciiCharMatch(NW_XML_Reader_t* pT, NW_Uint32 asciiC, NW_Uint32* pMatch) |
|
246 { |
|
247 NW_Uint32 c_text, c_ascii; |
|
248 NW_Status_t s = NW_XML_Reader_Peek(pT, &c_text); |
|
249 *pMatch = 0; |
|
250 if (NW_STAT_IS_SUCCESS(s)) { |
|
251 s = NW_XML_Reader_ReadAsciiChar(asciiC, &c_ascii); |
|
252 if (NW_STAT_IS_SUCCESS(s)) { |
|
253 *pMatch = (c_text == c_ascii); |
|
254 } |
|
255 } |
|
256 return s; |
|
257 } |
|
258 |
|
259 /* pMatch is 1 if ASCII string matches Reader sequence in its encoding */ |
|
260 EXPORT_C NW_Status_t |
|
261 NW_XML_Reader_AsciiStringMatch(NW_XML_Reader_t* pT, NW_Uint32 length, const NW_Uint8* pString, |
|
262 NW_Uint32* pMatch) |
|
263 { |
|
264 NW_Uint32 c_text, c_ascii; |
|
265 NW_Uint32 i; |
|
266 NW_Status_t s = NW_STAT_SUCCESS; |
|
267 *pMatch = 0; |
|
268 NW_ASSERT(length); |
|
269 for (i = 0; i < length; i++) { |
|
270 s = NW_XML_Reader_PeekOffset(pT, i, &c_text); |
|
271 if (NW_STAT_IS_FAILURE(s)) { |
|
272 break; |
|
273 } |
|
274 s = NW_XML_Reader_ReadAsciiChar(pString[i], &c_ascii); |
|
275 if (NW_STAT_IS_FAILURE(s)) { |
|
276 break; |
|
277 } |
|
278 if (c_text != c_ascii) { |
|
279 break; |
|
280 } |
|
281 } |
|
282 if (i == length) { |
|
283 *pMatch = 1; |
|
284 } |
|
285 return s; |
|
286 } |
|
287 |
|
288 /* Note: For XML, whitespace is only ASCII 0x20 (space), |
|
289 0x09 (tab), 0x0d (CR), 0x0a (LF). The base test used here, |
|
290 CXML_Str_Isspace(), includes two other forms of whitespace. */ |
|
291 EXPORT_C NW_Status_t |
|
292 NW_XML_Reader_SkipSpace(NW_XML_Reader_t* pT) |
|
293 { |
|
294 NW_Uint32 c; |
|
295 NW_Status_t s = NW_STAT_SUCCESS; |
|
296 for (;;) { |
|
297 s = NW_XML_Reader_Peek(pT, &c); |
|
298 if (NW_STAT_IS_FAILURE(s)) { |
|
299 break; |
|
300 } |
|
301 if (c > 0xffff) { /* validate casting */ |
|
302 break; |
|
303 } |
|
304 if (!CXML_Str_Isspace((NW_Ucs2)(c & 0xffff))) { |
|
305 break; |
|
306 } |
|
307 s = NW_XML_Reader_Advance(pT); |
|
308 if (NW_STAT_IS_FAILURE(s)) { |
|
309 break; |
|
310 } |
|
311 if (pT->end){ |
|
312 /* At the end so break */ |
|
313 break; |
|
314 } |
|
315 } |
|
316 return s; |
|
317 } |
|
318 |
|
319 /* Note: For XML, whitespace is only ASCII 0x20 (space), |
|
320 0x09 (tab), 0x0d (CR), 0x0a (LF). The base test used here, |
|
321 CXML_Str_Isspace(), includes two other forms of whitespace. */ |
|
322 EXPORT_C NW_Status_t |
|
323 NW_XML_Reader_IsSpace(NW_XML_Reader_t* pT, NW_Uint32* pMatch) |
|
324 { |
|
325 NW_Uint32 c; |
|
326 NW_Status_t s; |
|
327 |
|
328 *pMatch = 0; |
|
329 s = NW_XML_Reader_Peek(pT, &c); |
|
330 if (NW_STAT_IS_FAILURE(s)) { |
|
331 return s; |
|
332 } |
|
333 if (c > 0xffff) { /* validate casting */ |
|
334 return NW_STAT_FAILURE; |
|
335 } |
|
336 if (CXML_Str_Isspace((NW_Ucs2)(c & 0xffff))) { |
|
337 *pMatch = 1; |
|
338 } |
|
339 return NW_STAT_SUCCESS; |
|
340 } |
|
341 |
|
342 EXPORT_C NW_Status_t |
|
343 NW_XML_Reader_IsLetter(NW_XML_Reader_t* pT, NW_Uint32* pMatch) |
|
344 { |
|
345 NW_Uint32 c; |
|
346 NW_Status_t s; |
|
347 |
|
348 *pMatch = 0; |
|
349 s = NW_XML_Reader_Peek(pT, &c); |
|
350 if (NW_STAT_IS_FAILURE(s)) { |
|
351 return s; |
|
352 } |
|
353 /* This is an approximation to what XML charaters are "letter". |
|
354 Everything above the 8-bit range is considered to be a "letter".*/ |
|
355 if (c >= 0x41 && c <= 0x5a) { |
|
356 *pMatch = 1; |
|
357 } |
|
358 else if (c >= 0x61 && c <= 0x7a) { |
|
359 *pMatch = 1; |
|
360 } |
|
361 else if (c >= 0xc0 && c <= 0xd6) { |
|
362 *pMatch = 1; |
|
363 } |
|
364 else if (c >= 0xd8 && c <= 0xf6) { |
|
365 *pMatch = 1; |
|
366 } |
|
367 else if (c >= 0xf8) {/* letters become anything above 0xf8 */ |
|
368 *pMatch = 1; |
|
369 } |
|
370 return NW_STAT_SUCCESS; |
|
371 } |
|
372 |
|
373 /* Note: For XML, digits include not only the ASCII digits but |
|
374 other language forms of digits. The base test used here, |
|
375 CXML_Str_Isdigit() only tests for ASCII digits. */ |
|
376 EXPORT_C NW_Status_t |
|
377 NW_XML_Reader_IsDigit(NW_XML_Reader_t* pT, NW_Uint32* pMatch) |
|
378 { |
|
379 NW_Uint32 c; |
|
380 NW_Status_t s; |
|
381 |
|
382 *pMatch = 0; |
|
383 s = NW_XML_Reader_Peek(pT, &c); |
|
384 if (NW_STAT_IS_FAILURE(s)) { |
|
385 return s; |
|
386 } |
|
387 if (c > 0xffff) {/* validate casting */ |
|
388 return NW_STAT_SUCCESS; |
|
389 } |
|
390 if (CXML_Str_Isdigit((NW_Ucs2)(c & 0xffff))) { |
|
391 *pMatch = 1; |
|
392 } |
|
393 return NW_STAT_SUCCESS; |
|
394 } |
|
395 |
|
396 /* Returns an estimate of the current line and column position in the text. |
|
397 It is an estimate because it has to guess at what the intended line ending |
|
398 sequence is using a count of CR and LF characters. Line and Column indices |
|
399 are 1-based not 0-based. */ |
|
400 EXPORT_C void |
|
401 NW_XML_Reader_GetLineColumn(NW_XML_Reader_t* pT, NW_Uint32* pLine, |
|
402 NW_Uint32* pColumn) |
|
403 { |
|
404 NW_Uint32 crCount, lfCount, charsSinceCR, charsSinceLF; |
|
405 crCount = pT->lineColumn.crCount; |
|
406 lfCount = pT->lineColumn.lfCount; |
|
407 charsSinceCR = pT->lineColumn.charsSinceLastCR; |
|
408 charsSinceLF = pT->lineColumn.charsSinceLastLF; |
|
409 if (crCount == lfCount) { |
|
410 /* assume CR, LF, DOS style */ |
|
411 /* use a bias in favor of CR followed by LF |
|
412 which will give the correct column for DOS */ |
|
413 *pLine = lfCount + 1; |
|
414 *pColumn = charsSinceLF + 1; |
|
415 } else if (lfCount == 0) { |
|
416 /* assume CR only, Unix style */ |
|
417 *pLine = crCount + 1; |
|
418 *pColumn = charsSinceCR + 1; |
|
419 } else if (crCount == 0) { |
|
420 /* assume LF only, Mac style */ |
|
421 *pLine = lfCount + 1; |
|
422 *pColumn = charsSinceLF + 1; |
|
423 } else { |
|
424 /* an unclear situation so use |
|
425 thresholds on the ratio to guess */ |
|
426 NW_Uint32 ratio; |
|
427 ratio = ((crCount * 100) / lfCount); |
|
428 if (ratio > 300) {/* more than 3 to 1 crCount to lfCount */ |
|
429 /* assume CR only, Unix style */ |
|
430 *pLine = crCount + 1; |
|
431 *pColumn = charsSinceCR + 1; |
|
432 } else if (ratio < 33) {/* less than 1 to 3 crCount to lfCount */ |
|
433 /* assume LF only, Mac style */ |
|
434 *pLine = lfCount + 1; |
|
435 *pColumn = charsSinceLF + 1; |
|
436 } else { |
|
437 /* assume CR, LF, DOS style */ |
|
438 /* use a bias in favor of CR, LF sequence (DOS style) |
|
439 which will give the correct column */ |
|
440 *pLine = lfCount + 1; |
|
441 *pColumn = charsSinceLF + 1; |
|
442 } |
|
443 } |
|
444 } |
|
445 |