|
1 // Copyright (c) 2005-2009 Nokia Corporation and/or its subsidiary(-ies). |
|
2 // All rights reserved. |
|
3 // This component and the accompanying materials are made available |
|
4 // under the terms of "Eclipse Public License v1.0" |
|
5 // which accompanies this distribution, and is available |
|
6 // at the URL "http://www.eclipse.org/legal/epl-v10.html". |
|
7 // |
|
8 // Initial Contributors: |
|
9 // Nokia Corporation - initial contribution. |
|
10 // |
|
11 // Contributors: |
|
12 // |
|
13 // Description: |
|
14 // |
|
15 |
|
16 #include <xml/xmlparsererrors.h> |
|
17 #include <xml/taginfo.h> |
|
18 #include <xml/documentparameters.h> |
|
19 #include <xml/contenthandler.h> |
|
20 #include <e32test.h> |
|
21 |
|
22 #include "HtmlParser.h" |
|
23 #include "StringParser.h" |
|
24 #include "badesca.h" |
|
25 #include "HtmlTagsTable.h" |
|
26 |
|
27 using namespace Xml; |
|
28 |
|
29 const TInt KPartialContenttSize = 256; |
|
30 const TInt KDefaultGranularity = 4; |
|
31 const TInt KStartTag = '<' ; |
|
32 const TInt KEndTag = '>'; |
|
33 |
|
34 _LIT8 ( KMetaName, "meta" ); |
|
35 _LIT8 ( KDefaultCharset, "UTF-8" ); |
|
36 _LIT8 ( KXMLName, "?xml" ); |
|
37 |
|
38 LOCAL_C void AttributeArrayDelete( TAny* aPtr ); |
|
39 |
|
40 |
|
41 CHtmlParser::CHtmlParser ( Xml::MContentHandler& aContentHandler, RStringPool& aStringPool ) |
|
42 :iContentHandler ( &aContentHandler ), |
|
43 iStringPool ( aStringPool ), |
|
44 iTagState ( ENoTag ), |
|
45 iParserState ( EInitialState ), |
|
46 iFirst ( ETrue ), |
|
47 iEndDocCalled ( EFalse ) |
|
48 { |
|
49 } |
|
50 |
|
51 CHtmlParser::~CHtmlParser () |
|
52 { |
|
53 delete iPartialContent; |
|
54 delete iTagAttribute; |
|
55 delete iTagAttributeValue; |
|
56 delete iTagQueue; |
|
57 } |
|
58 |
|
59 CHtmlParser* CHtmlParser::NewL ( Xml::MContentHandler& aContentHandler, RStringPool& aStringPool ) |
|
60 { |
|
61 CHtmlParser* self = new (ELeave) CHtmlParser ( aContentHandler, aStringPool ); |
|
62 CleanupStack::PushL ( self ); |
|
63 |
|
64 self->ConstructL (); |
|
65 |
|
66 CleanupStack::Pop (); |
|
67 return self; |
|
68 } |
|
69 |
|
70 |
|
71 void CHtmlParser::ConstructL () |
|
72 { |
|
73 iStringPool.OpenL ( HtmlTagsStringTable::Table ); |
|
74 iTagAttribute = new (ELeave) CDesC8ArrayFlat ( KDefaultGranularity ); |
|
75 iTagAttributeValue = new (ELeave) CDesC8ArrayFlat ( KDefaultGranularity ); |
|
76 iTagQueue = new (ELeave) CDesC8ArrayFlat ( KDefaultGranularity ); |
|
77 |
|
78 iPartialContent = HBufC8::NewL ( KPartialContenttSize ); |
|
79 } |
|
80 |
|
81 void CHtmlParser::ParseL ( const TDesC8& aBuffer, TBool aLastChunk /* = EFalse */ ) |
|
82 { |
|
83 ParseContentL ( aBuffer, aLastChunk ); |
|
84 } |
|
85 |
|
86 void CHtmlParser::CreateTagInfoLC ( RTagInfo& aTagInfo, const TDesC8& aTag ) |
|
87 { |
|
88 RString uriString; |
|
89 RString nameString; |
|
90 RString prefixString; |
|
91 |
|
92 // For HTML there is no uri and prefix. So make it null |
|
93 uriString = iStringPool.OpenStringL( KNullDesC8 ); |
|
94 CleanupClosePushL ( uriString ); |
|
95 prefixString = iStringPool.OpenStringL( KNullDesC8 ); |
|
96 CleanupClosePushL ( prefixString ); |
|
97 |
|
98 // name string ( tag ) |
|
99 nameString = iStringPool.OpenStringL ( aTag ); |
|
100 CleanupClosePushL ( nameString ); |
|
101 |
|
102 // Create tag info with the uri, localname and prefix values. |
|
103 aTagInfo.Open( uriString, prefixString, nameString ); |
|
104 |
|
105 CleanupStack::Pop ( 3 ); |
|
106 |
|
107 CleanupClosePushL( aTagInfo ); |
|
108 } |
|
109 |
|
110 void CHtmlParser::SetContentSink( class Xml::MContentHandler& aContentHandler ) |
|
111 { |
|
112 iContentHandler = &aContentHandler; |
|
113 } |
|
114 |
|
115 // InspectCurrentCharacterL - updates iParserState, iTagState and iInTag |
|
116 TBool CHtmlParser::InspectCurrentCharacter( TChar aChar ) |
|
117 { |
|
118 TBool processCharacter = ETrue; |
|
119 |
|
120 if ( iParserState == EReadingJavascript ) |
|
121 { |
|
122 if ( aChar == '>' ) |
|
123 { |
|
124 iParserState = ESeeEndOfTagWhileReadingJavascript; |
|
125 } |
|
126 |
|
127 } |
|
128 else if ( aChar == '<' ) |
|
129 { |
|
130 iTagState =EOpeningTag; |
|
131 iParserState = ESeeStartOfTag; |
|
132 iInTag =ETrue; |
|
133 } |
|
134 else if ( iInTag ) |
|
135 { |
|
136 if ( aChar == '/' || aChar == '=' || aChar == '"' || aChar == '>' || aChar == '!' || aChar == '\'' ) |
|
137 { |
|
138 SeeSpecialCharactersInTag( aChar, processCharacter ); |
|
139 } |
|
140 else if ( aChar.IsSpace() ) |
|
141 { |
|
142 SeeWhiteSpaceCharacterInTag( processCharacter ); |
|
143 } |
|
144 else if ( iParserState == ESeeExclamationMark && aChar=='-' ) |
|
145 { |
|
146 iParserState = EReadingJavascript; |
|
147 processCharacter = EFalse; |
|
148 } |
|
149 else |
|
150 { |
|
151 SeeOtherCharactersInTag(); |
|
152 } |
|
153 } |
|
154 else |
|
155 { |
|
156 SeeOtherCharactersNotInTag(processCharacter); |
|
157 } |
|
158 |
|
159 return processCharacter; |
|
160 } |
|
161 |
|
162 /** |
|
163 Set the new parser state depends on the previous, when the parser finds a |
|
164 space in between the tags. |
|
165 eg: <a href="/AudioVideo/TT/" class="navlink" shape="rect"> |
|
166 1. Read a -> State EReadingOpeningTag |
|
167 2. Read space -> State EFinishedReadingTag |
|
168 so on... |
|
169 |
|
170 @param aBool& [out] |
|
171 */ |
|
172 void CHtmlParser::SeeWhiteSpaceCharacterInTag( TBool& aBool ) |
|
173 { |
|
174 switch (iParserState) |
|
175 { |
|
176 case EReadingOpeningTag: |
|
177 iParserState = EFinishedReadingTag; |
|
178 aBool = EFalse; |
|
179 break; |
|
180 case EReadingClosingTag: |
|
181 iParserState = EFinishedReadingTag; |
|
182 aBool = EFalse; |
|
183 break; |
|
184 case EReadingAttribute: |
|
185 iParserState = EFinishedReadingAttribute; |
|
186 aBool = EFalse; |
|
187 break; |
|
188 case EReadingAttributeValue: |
|
189 iParserState = EFinishedReadingAttributeValue; |
|
190 break; |
|
191 case EReadingAttributeValueWithinQuot: |
|
192 break; |
|
193 default: |
|
194 break; |
|
195 } |
|
196 } |
|
197 |
|
198 void CHtmlParser::SeeSpecialCharactersInTag( TChar aChar, TBool& aBool ) |
|
199 { |
|
200 switch (aChar) |
|
201 { |
|
202 case '/': |
|
203 if ( ( iParserState == EReadingAttributeValue ) || ( iParserState == EReadingAttributeValueWithinQuot ) || ( iParserState == EReadingAttributeValueWithinApos ) ) |
|
204 { |
|
205 aBool = ETrue; |
|
206 } |
|
207 else if ( iParserState == ESeeStartOfTag ) |
|
208 { |
|
209 iTagState = EClosingTag; |
|
210 iParserState = ESeeClosingTagIndicator; |
|
211 aBool = ETrue; |
|
212 } |
|
213 else if ( iParserState == ESeeEquals ) |
|
214 { |
|
215 // special case. '/' comming after =. The parser will not understand this situation and |
|
216 // will not parse the element. |
|
217 // eg: <form action=/search name=f ... > |
|
218 iParserState = EReadingAttributeValue; |
|
219 } |
|
220 else if ( ( iParserState == EFinishedReadingAttributeValue ) || ( iParserState == EFinishedReadingTag ) ) |
|
221 { |
|
222 iParserState = ESeeClosingTagIndicator; |
|
223 } |
|
224 break; |
|
225 case '=': |
|
226 if( ( iParserState != EReadingAttributeValue ) && ( iParserState != EReadingAttributeValueWithinQuot ) && ( iParserState != EReadingAttributeValueWithinApos ) ) |
|
227 { |
|
228 iParserState = ESeeEquals; |
|
229 aBool = EFalse; |
|
230 } |
|
231 else |
|
232 { |
|
233 aBool = ETrue; |
|
234 } |
|
235 break; |
|
236 case '"': |
|
237 if ( iParserState == ESeeEquals ) |
|
238 { |
|
239 iParserState = EReadingAttributeValueWithinQuot; |
|
240 aBool = EFalse; |
|
241 } |
|
242 else |
|
243 { |
|
244 iParserState = EFinishedReadingAttributeValue; |
|
245 } |
|
246 break; |
|
247 case '\'': |
|
248 if ( iParserState == ESeeEquals ) |
|
249 { |
|
250 iParserState = EReadingAttributeValueWithinApos; |
|
251 aBool = EFalse; |
|
252 } |
|
253 else |
|
254 { |
|
255 iParserState = EFinishedReadingAttributeValue; |
|
256 } |
|
257 break; |
|
258 |
|
259 case '>': |
|
260 iParserState = ESeeEndOfTag; |
|
261 iTagState = EClosingTag; |
|
262 iInTag = EFalse; |
|
263 break; |
|
264 case '!': |
|
265 if(iParserState == ESeeStartOfTag) |
|
266 { |
|
267 iParserState = ESeeExclamationMark; |
|
268 aBool = EFalse; |
|
269 } |
|
270 break; |
|
271 |
|
272 default: |
|
273 break; |
|
274 } |
|
275 } |
|
276 |
|
277 void CHtmlParser::SeeOtherCharactersInTag() |
|
278 { |
|
279 switch (iParserState) |
|
280 { |
|
281 case ESeeStartOfTag: |
|
282 iParserState = EReadingOpeningTag; |
|
283 break; |
|
284 case ESeeClosingTagIndicator: |
|
285 iParserState = EReadingClosingTag; |
|
286 break; |
|
287 case ESeeEquals: |
|
288 iParserState = EReadingAttributeValue; |
|
289 break; |
|
290 case EFinishedReadingTag: |
|
291 iParserState = EReadingAttribute; |
|
292 break; |
|
293 case EFinishedReadingAttribute: |
|
294 iParserState = EReadingAttribute; |
|
295 break; |
|
296 case EFinishedReadingAttributeValue: |
|
297 iParserState = EReadingAttribute; |
|
298 break; |
|
299 default: |
|
300 break; |
|
301 } |
|
302 } |
|
303 |
|
304 void CHtmlParser::SeeOtherCharactersNotInTag( TBool& aBool ) |
|
305 { |
|
306 switch (iParserState) |
|
307 { |
|
308 case EReadingText: |
|
309 aBool = EFalse; |
|
310 break; |
|
311 case ESeeEndOfTag: |
|
312 iParserState = EReadingText; |
|
313 iTagState = EText; |
|
314 break; |
|
315 case EInitialState: |
|
316 iParserState = EReadingText; |
|
317 break; |
|
318 default: |
|
319 break; |
|
320 } |
|
321 } |
|
322 |
|
323 /** |
|
324 Parses the content and fetches the buffer in between tags or HTML text. |
|
325 The string can be partial ie unable to find the end tag or unable to find |
|
326 the opening tag in case of HTML text. In this case the pending buffer will be |
|
327 saved and will be parsed during the next call. |
|
328 |
|
329 @param aContent [out] Content to be parsed. |
|
330 */ |
|
331 void CHtmlParser::ParseContentL ( const TDesC8& aContent, TBool aLastChunk /* = EFalse */ ) |
|
332 { |
|
333 |
|
334 CStringParser* parser = CStringParser::NewL ( aContent ); |
|
335 CleanupStack::PushL ( parser ); |
|
336 |
|
337 if ( iFirst ) |
|
338 { |
|
339 TBool xmlFound; |
|
340 TPtrC8 charset ( KDefaultCharset ); |
|
341 TRAPD ( err, ExtractCharsetL ( aContent, charset, xmlFound ) ); |
|
342 TChar ch; |
|
343 parser->GetCurrentCharacter ( ch ); |
|
344 if ( ( err == KErrNotFound ) && ( ch != KStartTag ) ) |
|
345 { |
|
346 CleanupStack::PopAndDestroy ( parser ); |
|
347 // Maybe an invalid html document. First character must be '<' |
|
348 return; |
|
349 } |
|
350 |
|
351 CallStartDocumentL ( charset ); |
|
352 iFirst = EFalse; |
|
353 |
|
354 if ( xmlFound ) |
|
355 { |
|
356 // ignore that line ( <?xml ... ?> ) |
|
357 parser->ParseTill ( charset, KEndTag ); |
|
358 parser->SkipLength ( 1 ); // skip '>' |
|
359 } |
|
360 } |
|
361 |
|
362 |
|
363 // Process the partial content which is saved in the last call. |
|
364 TRAPD ( err, ProcessPartialContentL ( *parser ) ); |
|
365 if ( err != KErrNone ) |
|
366 { |
|
367 AddToPartialContentL ( aContent, ETrue ); |
|
368 CleanupStack::PopAndDestroy ( parser ); |
|
369 return; |
|
370 } |
|
371 |
|
372 TChar ch; |
|
373 while ( parser->GetCurrentCharacter( ch ) ) |
|
374 { |
|
375 |
|
376 if ( InspectCurrentCharacter ( ch ) ) |
|
377 { |
|
378 switch ( iTagState ) |
|
379 { |
|
380 case EOpeningTag: |
|
381 { |
|
382 TPtrC8 TaggedBuffer; |
|
383 |
|
384 if ( ch == KStartTag ) |
|
385 { |
|
386 parser->SkipLength ( 1 ); // skip '<' |
|
387 } |
|
388 // The parser relies on the assumption that there will not be any '>' in between the |
|
389 // tagged content. But there is a chance that attribute value can contain a '<'. The |
|
390 // parsing will not happen properly in that case. |
|
391 // eg. <tag name="value>abc" > |
|
392 TBool success = parser->ParseTill ( TaggedBuffer, KEndTag ); |
|
393 ParseTaggedBufferL ( TaggedBuffer, !success ); // handle tagged buffer |
|
394 break; |
|
395 } |
|
396 case EClosingTag: |
|
397 parser->SkipLength ( 1 ); // skip '>' |
|
398 break; |
|
399 case EText: |
|
400 { |
|
401 TPtrC8 Text; |
|
402 if ( parser->ParseTill ( Text, KStartTag ) ) |
|
403 { |
|
404 ProcessTextL ( Text ); // Handle text. |
|
405 } |
|
406 else |
|
407 { |
|
408 AddToPartialContentL ( Text ); |
|
409 } |
|
410 } |
|
411 break; |
|
412 default: |
|
413 parser->SkipLength ( 1 ); |
|
414 break; |
|
415 } |
|
416 } |
|
417 else |
|
418 { |
|
419 parser->SkipLength ( 1 ); |
|
420 } |
|
421 |
|
422 if ( iEndDocCalled ) |
|
423 { |
|
424 break; // </html> has already been reached. No further processing. |
|
425 } |
|
426 } |
|
427 |
|
428 // This is the last chunk. OnEndDocumentL may have been called when it arrives for </html> |
|
429 // In case if it cannot find </html> call OnEndDocumentL here. |
|
430 if ( aLastChunk && !iEndDocCalled ) |
|
431 { |
|
432 iContentHandler->OnEndDocumentL ( KErrNone ); |
|
433 } |
|
434 CleanupStack::Pop ( parser ); |
|
435 delete parser; |
|
436 } |
|
437 |
|
438 /** |
|
439 Parses the string inbetween the start tag and end tag. |
|
440 Fetches the HTML tags, attribute, values or java content. |
|
441 |
|
442 @param aTaggedBuffer [in] Content inbetween start and end tag. |
|
443 @Param aPartial [in] ETrue for partial tagged content. |
|
444 */ |
|
445 void CHtmlParser::ParseTaggedBufferL ( const TPtrC8& aTaggedBuffer, TBool aPartial /* = EFalse */ ) |
|
446 { |
|
447 |
|
448 // Will add to the partial content variable and return. |
|
449 if ( aPartial ) |
|
450 { |
|
451 AddToPartialContentL ( aTaggedBuffer, ETrue ); |
|
452 return; |
|
453 } |
|
454 |
|
455 CStringParser* parser = CStringParser::NewL ( aTaggedBuffer ); |
|
456 CleanupStack::PushL ( parser ); |
|
457 |
|
458 TPtrC8 htmltag; |
|
459 TBool callEndElement = EFalse; |
|
460 TBool callStartElement = EFalse; |
|
461 TChar ch; |
|
462 |
|
463 while ( parser->GetCurrentCharacter( ch ) ) |
|
464 { |
|
465 InspectCurrentCharacter ( ch ); |
|
466 |
|
467 switch ( iParserState ) |
|
468 { |
|
469 case ESeeExclamationMark: |
|
470 { |
|
471 TPtrC8 text; |
|
472 |
|
473 TChar ch; |
|
474 parser->GetNextCharacter( ch ); |
|
475 |
|
476 if ( ch == '-' ) |
|
477 { |
|
478 parser->SkipLength ( 2 ); // skip -- |
|
479 parser->ParseTill ( text, _L8 ("/-") ); |
|
480 } |
|
481 else |
|
482 { |
|
483 parser->GetRemainder ( text ); |
|
484 } |
|
485 |
|
486 iContentHandler->OnContentL( text, KErrNone ); // This is a java script. |
|
487 |
|
488 parser->GetRemainder ( text ); |
|
489 } |
|
490 break; |
|
491 |
|
492 case ESeeClosingTagIndicator: |
|
493 { |
|
494 TPtrC8 tag; |
|
495 parser->SkipLength ( 1 ); // skip '/' |
|
496 parser->GetRemainder ( tag ); |
|
497 // check the tag in the array and remove it. |
|
498 if ( !CheckAndRemoveTagL ( tag ) ) |
|
499 { |
|
500 iContentHandler->OnError( EXmlTagMismatch ); |
|
501 } |
|
502 else |
|
503 { |
|
504 if ( tag.Length () ) |
|
505 { |
|
506 htmltag.Set ( tag ); |
|
507 } |
|
508 callEndElement = ETrue; |
|
509 } |
|
510 } |
|
511 break; |
|
512 |
|
513 case EReadingOpeningTag: |
|
514 { |
|
515 parser->ParseTill( htmltag, _L8 ("\t\n ") ); |
|
516 |
|
517 // check whether it is an optional tag. |
|
518 // Only the tags which is defined as optional in the HTML 4.01 specs is |
|
519 // considered. Any other tag will not be considered optional. |
|
520 // In case of any other tag ( other than defined in specs ) which has optional behaviour |
|
521 // and if no end tag specified for it, the end tag will can get nested. |
|
522 if ( IsOptionalTagL ( htmltag ) ) |
|
523 { |
|
524 CheckAndProcessLastOptionalTagL ( htmltag ); |
|
525 } |
|
526 |
|
527 iTagQueue->AppendL ( htmltag ); // append to tag array |
|
528 callStartElement = ETrue; |
|
529 } |
|
530 break; |
|
531 |
|
532 case EReadingAttribute: |
|
533 { |
|
534 TPtrC8 attribute; |
|
535 parser->ParseTill( attribute, _L8 (" \n\t\r=") ); |
|
536 iTagAttribute->AppendL ( attribute ); // append to attribute array. |
|
537 |
|
538 TChar ch; |
|
539 TBool success = parser->GetCurrentCharacter ( ch ); |
|
540 |
|
541 if ( !success || ( ch != '=') ) |
|
542 { |
|
543 // This attribute doesn't have an attribute value. |
|
544 // The attribute value will be same as attribute name. |
|
545 iTagAttributeValue->AppendL ( attribute ); |
|
546 } |
|
547 } |
|
548 break; |
|
549 |
|
550 case EReadingAttributeValue: |
|
551 { |
|
552 TPtrC8 attributeval; |
|
553 parser->ParseTill( attributeval, _L8 ( " \t\r\n" ) ); |
|
554 iTagAttributeValue->AppendL ( attributeval ); // append to attribute value array |
|
555 } |
|
556 break; |
|
557 |
|
558 case EReadingAttributeValueWithinQuot: |
|
559 { |
|
560 parser->SkipLength ( 1 ); // skip '"' |
|
561 TPtrC8 attributeval; |
|
562 parser->ParseTill( attributeval, '"' ); |
|
563 iTagAttributeValue->AppendL ( attributeval ); // append to attribute value array |
|
564 } |
|
565 break; |
|
566 |
|
567 case EReadingAttributeValueWithinApos: |
|
568 { |
|
569 parser->SkipLength ( 1 ); // skip ''' |
|
570 TPtrC8 attributeval; |
|
571 parser->ParseTill( attributeval, '\'' ); |
|
572 iTagAttributeValue->AppendL ( attributeval ); // append to attribute value array |
|
573 } |
|
574 break; |
|
575 |
|
576 default: |
|
577 parser->SkipLength ( 1 ); |
|
578 break; |
|
579 } |
|
580 }; |
|
581 |
|
582 // Forbidden tags need to be closed immediatly. Check whether it |
|
583 // has already closed otherwise close it. |
|
584 if ( !callEndElement && IsForbiddenTagL ( htmltag ) ) |
|
585 { |
|
586 CheckAndRemoveTagL ( htmltag ); |
|
587 callEndElement = ETrue; |
|
588 } |
|
589 |
|
590 // Tell the XML framework about the tag and attribute/value. |
|
591 if ( callStartElement ) |
|
592 { |
|
593 CallStartElementL ( htmltag ); |
|
594 } |
|
595 |
|
596 // Calling end element... ( End tag ) </html> |
|
597 if ( callEndElement ) |
|
598 { |
|
599 CallEndElementL ( htmltag ); |
|
600 |
|
601 if ( !htmltag.CompareF ( _L8 ("html" ) ) ) |
|
602 { |
|
603 iContentHandler->OnEndDocumentL ( KErrNone ); |
|
604 iEndDocCalled = ETrue; |
|
605 } |
|
606 } |
|
607 |
|
608 |
|
609 iTagAttribute->Delete ( 0, iTagAttribute->Count () ); |
|
610 iTagAttributeValue->Delete ( 0, iTagAttributeValue->Count () ); |
|
611 |
|
612 CleanupStack::Pop ( parser ); |
|
613 delete parser; |
|
614 } |
|
615 |
|
616 /** |
|
617 Inform the CHtmlParser about the HTML text. |
|
618 |
|
619 @param aText [in] Document text. |
|
620 */ |
|
621 void CHtmlParser::ProcessTextL ( const TPtrC8& aText ) |
|
622 { |
|
623 iContentHandler->OnContentL ( aText, KErrNone ); |
|
624 } |
|
625 |
|
626 /** |
|
627 Process the partial content, which is saved in the last call with the new content. |
|
628 |
|
629 @param aParser [in,out] String parser object. |
|
630 */ |
|
631 void CHtmlParser::ProcessPartialContentL ( CStringParser& aParser ) |
|
632 { |
|
633 if ( iPartialContent->Des().Length () > 0 ) |
|
634 { |
|
635 switch ( iTagState ) |
|
636 { |
|
637 case EOpeningTag: |
|
638 { |
|
639 TPtrC8 TaggedBuffer; |
|
640 TBool success = aParser.ParseTill ( TaggedBuffer, KEndTag ); |
|
641 if ( success ) |
|
642 { |
|
643 AddToPartialContentL ( TaggedBuffer, ETrue ); |
|
644 ParseTaggedBufferL ( iPartialContent->Des(), EFalse ); |
|
645 } |
|
646 else |
|
647 { |
|
648 User::Leave ( KErrNotFound ); |
|
649 } |
|
650 } |
|
651 break; |
|
652 case EText: |
|
653 { |
|
654 TPtrC8 Text; |
|
655 if ( aParser.ParseTill ( Text, KStartTag ) ) |
|
656 { |
|
657 AddToPartialContentL ( Text, ETrue ); |
|
658 ProcessTextL ( iPartialContent->Des() ); // Handle text. |
|
659 } |
|
660 else |
|
661 { |
|
662 User::Leave ( KErrNotFound ); |
|
663 } |
|
664 } |
|
665 break; |
|
666 default: |
|
667 // shouldn't come here. |
|
668 User::Panic ( _L ("Html parser - partial content processing error."), KErrUnknown ); |
|
669 break; |
|
670 } |
|
671 } |
|
672 iPartialContent->Des().Zero (); // zero the partial content length. |
|
673 } |
|
674 |
|
675 /** |
|
676 Add or append to the partial buffer. |
|
677 |
|
678 @param aContent [in] Partial content to save. |
|
679 @param aAppend [in] ETrue to append to the existing partial buffer. |
|
680 */ |
|
681 void CHtmlParser::AddToPartialContentL ( const TPtrC8& aContent, TBool aAppend /* = EFalse */ ) |
|
682 { |
|
683 if ( aContent.Length () == 0 ) |
|
684 { |
|
685 return; |
|
686 } |
|
687 |
|
688 TInt len = aContent.Length () + iPartialContent->Des().Length(); |
|
689 if ( iPartialContent->Des().MaxSize () < len ) |
|
690 { |
|
691 iPartialContent = iPartialContent->ReAllocL ( len ); |
|
692 } |
|
693 |
|
694 |
|
695 if ( aAppend ) |
|
696 { |
|
697 iPartialContent->Des().Append( aContent ); |
|
698 } |
|
699 else |
|
700 { |
|
701 *iPartialContent = aContent; // Partial text. Save the text. |
|
702 } |
|
703 } |
|
704 |
|
705 /** |
|
706 Check the aTag ( if it is not empty ) in the tag array and will remove it from the array. |
|
707 |
|
708 @param aTag [in] Tag value need to be checked. |
|
709 |
|
710 @return TBool ETrue Found and successfully removed. |
|
711 */ |
|
712 TBool CHtmlParser::CheckAndRemoveTagL ( const TPtrC8& aTag ) |
|
713 { |
|
714 |
|
715 TInt count = iTagQueue->Count (); |
|
716 if ( !count ) |
|
717 { |
|
718 return EFalse; |
|
719 } |
|
720 |
|
721 if ( !aTag.Length () ) |
|
722 { |
|
723 // Remove the last tag added. |
|
724 // For eg: <img src="picture.jpg"/> |
|
725 // "img" tag has been added and need to remove it from the array. |
|
726 TPtrC8 temp ( iTagQueue->MdcaPoint ( count - 1 ) ); |
|
727 iTagQueue->Delete ( count - 1 ); |
|
728 return ETrue; |
|
729 } |
|
730 |
|
731 // Iterate through the tag array from end till it finds the tag. |
|
732 // Remove the elements that is mis-matching from the array. If the tag array |
|
733 // count is zero and if it is unable to find the tag, then there is a tag |
|
734 // mis-match. |
|
735 while ( count ) |
|
736 { |
|
737 TBool found = ETrue; |
|
738 TPtrC8 temp ( iTagQueue->MdcaPoint ( count - 1 ) ); |
|
739 if ( aTag.Compare ( temp ) ) |
|
740 { |
|
741 // Tag mis-match. |
|
742 found = EFalse; |
|
743 CallEndElementL ( temp ); // no end tag. Client should know about this. |
|
744 } |
|
745 // Remove the tag. |
|
746 iTagQueue->Delete ( count - 1 ); |
|
747 |
|
748 if ( found ) |
|
749 { |
|
750 return ETrue; |
|
751 } |
|
752 count = iTagQueue->Count (); |
|
753 } |
|
754 |
|
755 return EFalse; // Tag is not present. |
|
756 } |
|
757 |
|
758 /** |
|
759 Creates RTagInfo & RAttributeArray and tell XML framework |
|
760 |
|
761 @param aTag [in] Tag value. |
|
762 */ |
|
763 void CHtmlParser::CallStartElementL ( const TDesC8& aTag ) |
|
764 { |
|
765 // Create tag info with the uri, localname and prefix values. |
|
766 RTagInfo tagInfo; |
|
767 CreateTagInfoLC ( tagInfo, aTag ); |
|
768 |
|
769 RAttributeArray attributes; |
|
770 CleanupStack::PushL( TCleanupItem( AttributeArrayDelete, &attributes ) ); |
|
771 |
|
772 for ( TInt i = 0; i < iTagAttribute->Count(); ++i ) |
|
773 { |
|
774 RString nameString; |
|
775 RString valueString; |
|
776 RString uriString; |
|
777 RString prefixString; |
|
778 |
|
779 // Create RString's for attribute name/value |
|
780 nameString = iStringPool.OpenStringL ( (*iTagAttribute)[i] ); |
|
781 CleanupClosePushL ( nameString ); |
|
782 valueString = iStringPool.OpenStringL ( (*iTagAttributeValue)[i] ); |
|
783 CleanupClosePushL ( valueString ); |
|
784 |
|
785 // For HTML there is no uri and prefix. So make it null |
|
786 uriString = iStringPool.OpenStringL( KNullDesC8 ); |
|
787 CleanupClosePushL ( uriString ); |
|
788 prefixString = iStringPool.OpenStringL( KNullDesC8 ); |
|
789 CleanupClosePushL ( prefixString ); |
|
790 |
|
791 RAttribute attribute; |
|
792 attribute.Open( uriString, prefixString, nameString, valueString ); |
|
793 |
|
794 CleanupStack::Pop ( 4 ); |
|
795 CleanupClosePushL(attribute); |
|
796 |
|
797 User::LeaveIfError(attributes.Append(attribute)); |
|
798 |
|
799 CleanupStack::Pop(); |
|
800 } |
|
801 |
|
802 |
|
803 // Tell the framework. |
|
804 iContentHandler->OnStartElementL ( tagInfo, attributes, KErrNone ); |
|
805 CleanupStack::PopAndDestroy ( &attributes ); |
|
806 CleanupStack::PopAndDestroy ( &tagInfo ); |
|
807 } |
|
808 |
|
809 /** |
|
810 Creates RTagInfo and tells the framework. |
|
811 |
|
812 @param aTag [in] Tag value. |
|
813 */ |
|
814 void CHtmlParser::CallEndElementL ( const TDesC8& aTag ) |
|
815 { |
|
816 |
|
817 if ( !aTag.Length () ) |
|
818 { |
|
819 return; |
|
820 } |
|
821 |
|
822 // Create tag info with the uri, localname and prefix values. |
|
823 RTagInfo tagInfo; |
|
824 |
|
825 CreateTagInfoLC( tagInfo, aTag ); |
|
826 |
|
827 // Tell the framework |
|
828 iContentHandler->OnEndElementL ( tagInfo, KErrNone ); |
|
829 |
|
830 CleanupStack::PopAndDestroy ( &tagInfo ); |
|
831 } |
|
832 |
|
833 /** |
|
834 Creates RDocumentParams and tells the framework. |
|
835 */ |
|
836 void CHtmlParser::CallStartDocumentL ( const TDesC8& aCharset ) |
|
837 { |
|
838 RString encodingString = iStringPool.OpenStringL ( aCharset ); |
|
839 CleanupClosePushL ( encodingString ); |
|
840 |
|
841 RDocumentParameters docparams; |
|
842 |
|
843 docparams.Open ( encodingString ); |
|
844 CleanupStack::Pop (); // pop encodingString |
|
845 CleanupClosePushL ( docparams ); |
|
846 |
|
847 // Tell the framework. |
|
848 iContentHandler->OnStartDocumentL( docparams, KErrNone ); |
|
849 |
|
850 CleanupStack::PopAndDestroy (); |
|
851 } |
|
852 |
|
853 void CHtmlParser::CheckAndProcessLastOptionalTagL ( const TDesC8& aTag ) |
|
854 { |
|
855 TInt count = iTagQueue->Count(); |
|
856 // Tag queue is empty. Nothing to do. |
|
857 if ( !count ) |
|
858 { |
|
859 return; |
|
860 } |
|
861 |
|
862 TPtrC8 temp( iTagQueue->MdcaPoint ( count - 1 ) ); |
|
863 |
|
864 if ( !temp.Compare ( aTag ) ) |
|
865 { |
|
866 CallEndElementL ( aTag ); // Optional tag without an end tag. Should be closed. |
|
867 iTagQueue->Delete( count - 1 );// delete it from the array. |
|
868 } |
|
869 } |
|
870 |
|
871 TBool CHtmlParser::IsOptionalTagL ( const TDesC8& aTag ) |
|
872 { |
|
873 for ( TInt i = HtmlTagsStringTable::EBody; i < HtmlTagsStringTable::EArea; ++i ) |
|
874 { |
|
875 RString string = iStringPool.String( i, HtmlTagsStringTable::Table ); |
|
876 CleanupClosePushL ( string ); |
|
877 TInt result = aTag.Compare ( string.DesC() ); |
|
878 CleanupStack::PopAndDestroy ( 1 ); |
|
879 if ( !result ) |
|
880 { |
|
881 return ETrue; |
|
882 } |
|
883 } |
|
884 return EFalse; |
|
885 } |
|
886 |
|
887 TBool CHtmlParser::IsForbiddenTagL ( const TDesC8& aTag ) |
|
888 { |
|
889 |
|
890 for ( TInt i = HtmlTagsStringTable::EArea; i <= HtmlTagsStringTable::EParam; ++i ) |
|
891 { |
|
892 RString string = iStringPool.String( i, HtmlTagsStringTable::Table ); |
|
893 CleanupClosePushL ( string ); |
|
894 TInt result = aTag.Compare ( string.DesC() ); |
|
895 CleanupStack::PopAndDestroy ( 1 ); |
|
896 if ( !result ) |
|
897 { |
|
898 return ETrue; |
|
899 } |
|
900 |
|
901 } |
|
902 return EFalse; |
|
903 } |
|
904 |
|
905 void CHtmlParser::CheckAndProcessForbiddenTagL ( const TDesC8& aTag ) |
|
906 { |
|
907 TInt count = iTagQueue->Count(); |
|
908 if ( count ) |
|
909 { |
|
910 CallEndElementL ( aTag ); // Forbidden tag without an end tag. Should be closed. |
|
911 iTagQueue->Delete( count - 1 );// delete it from the array. |
|
912 } |
|
913 } |
|
914 |
|
915 |
|
916 void CHtmlParser::ExtractCharsetL ( const TDesC8& aContent, TPtrC8& aCharset, TBool& aXMLFound ) |
|
917 { |
|
918 aXMLFound = EFalse; |
|
919 TBool meta = ETrue; |
|
920 TInt pos = aContent.FindF ( KMetaName ); |
|
921 if ( pos == KErrNotFound ) |
|
922 { |
|
923 pos = aContent.FindF ( KXMLName ); |
|
924 if ( pos == KErrNotFound ) |
|
925 { |
|
926 User::Leave ( pos ); |
|
927 } |
|
928 meta = EFalse; |
|
929 aXMLFound = ETrue; |
|
930 } |
|
931 |
|
932 TPtrC8 contentPtr ( aContent.Mid ( pos ) ); |
|
933 ExtractCharsetValueL ( contentPtr, meta ? _L8 ( "charset" ) : _L8( "encoding" ), aCharset ); |
|
934 } |
|
935 |
|
936 /** |
|
937 Extract the charset value from the HTML content. This function will be called only |
|
938 if the document is of HTML type. Function will leave if is unable to find "charset" or "encoding". |
|
939 |
|
940 */ |
|
941 void CHtmlParser::ExtractCharsetValueL ( const TDesC8& aContent, const TDesC8& aSearchValue, TPtrC8& aCharset ) |
|
942 { |
|
943 |
|
944 TInt pos = aContent.FindF ( aSearchValue ); |
|
945 if ( pos == KErrNotFound ) |
|
946 { |
|
947 User::Leave ( pos ); |
|
948 } |
|
949 |
|
950 TPtrC8 contentPtr ( aContent.Mid ( pos ) ); |
|
951 CStringParser* parser = CStringParser::NewLC ( contentPtr ); |
|
952 TPtrC8 value; |
|
953 |
|
954 if ( parser->ParseTill ( value, '=' ) ) |
|
955 { |
|
956 parser->SkipLength ( 1 ); // skip '=' |
|
957 TChar ch; |
|
958 parser->GetCurrentCharacter ( ch ); |
|
959 if ( ch == '\"' ) |
|
960 { |
|
961 parser->SkipLength ( 1 ); // skip '\"' |
|
962 } |
|
963 |
|
964 parser->ParseTill ( aCharset, _L8 ("\"\r\n" ) ); |
|
965 } |
|
966 else |
|
967 { |
|
968 CleanupStack::PopAndDestroy ( parser ); |
|
969 User::Leave ( pos ); |
|
970 } |
|
971 |
|
972 CleanupStack::PopAndDestroy ( parser ); |
|
973 } |
|
974 |
|
975 LOCAL_C void AttributeArrayDelete( TAny* aPtr ) |
|
976 { |
|
977 RAttributeArray& attributes = *( RAttributeArray* )aPtr; |
|
978 |
|
979 TInt nAttributes = attributes.Count(); |
|
980 for( TInt i=0; i < nAttributes; ++i ) |
|
981 { |
|
982 attributes[i].Close(); |
|
983 } |
|
984 attributes.Close(); |
|
985 } |
|
986 |
|
987 |
|
988 |