|
1 /* GStreamer SAMI subtitle parser |
|
2 * Copyright (c) 2006 Young-Ho Cha <ganadist at chollian net> |
|
3 * |
|
4 * This library is free software; you can redistribute it and/or |
|
5 * modify it under the terms of the GNU Library General Public |
|
6 * License as published by the Free Software Foundation; either |
|
7 * version 2 of the License, or (at your option) any later version. |
|
8 * |
|
9 * This library is distributed in the hope that it will be useful, |
|
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
12 * Library General Public License for more details. |
|
13 * |
|
14 * You should have received a copy of the GNU Library General Public |
|
15 * License along with this library; if not, write to the |
|
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
|
17 * Boston, MA 02111-1307, USA. |
|
18 */ |
|
19 |
|
20 #include "samiparse.h" |
|
21 |
|
22 /* FIXME: use Makefile stuff */ |
|
23 #ifndef GST_DISABLE_XML |
|
24 #include <libxml/HTMLparser.h> |
|
25 #include <string.h> |
|
26 |
|
/* One-character flags stored in GstSamiContext::state, one per open tag. */
#define ITALIC_TAG 'i'          /* pushed for <i> */
#define SPAN_TAG 's'            /* pushed for <font>, which is emitted as <span> */
#define RUBY_TAG 'r'            /* pushed for <ruby> */
#define RT_TAG 't'              /* pushed for <rt> */
#define CLEAR_TAG '0'           /* never pushed; asks pop_state to close everything */
|
32 |
|
33 typedef struct _GstSamiContext GstSamiContext; |
|
34 |
|
35 struct _GstSamiContext |
|
36 { |
|
37 GString *buf; /* buffer to collect content */ |
|
38 GString *rubybuf; /* buffer to collect ruby content */ |
|
39 GString *resultbuf; /* when opening the next 'sync' tag, move |
|
40 * from 'buf' to avoid to append following |
|
41 * content */ |
|
42 GString *state; /* in many sami files there are tags that |
|
43 * are not closed, so for each open tag the |
|
44 * parser will append a tag flag here so |
|
45 * that tags can be closed properly on |
|
46 * 'sync' tags. See _context_push_state() |
|
47 * and _context_pop_state(). */ |
|
48 htmlParserCtxtPtr htmlctxt; /* html parser context */ |
|
49 gboolean has_result; /* set when ready to push out result */ |
|
50 gboolean in_title; /* flag to avoid appending the title content |
|
51 * to buf */ |
|
52 guint64 time1; /* previous start attribute in sync tag */ |
|
53 guint64 time2; /* current start attribute in sync tag */ |
|
54 }; |
|
55 |
|
56 static gchar * |
|
57 has_tag (GString * str, const gchar tag) |
|
58 { |
|
59 return strrchr (str->str, tag); |
|
60 } |
|
61 |
|
62 static void |
|
63 sami_context_push_state (GstSamiContext * sctx, char state) |
|
64 { |
|
65 g_string_append_c (sctx->state, state); |
|
66 } |
|
67 |
|
68 static void |
|
69 sami_context_pop_state (GstSamiContext * sctx, char state) |
|
70 { |
|
71 GString *str = g_string_new (""); |
|
72 GString *context_state = sctx->state; |
|
73 int i; |
|
74 |
|
75 for (i = context_state->len - 1; i >= 0; i--) { |
|
76 switch (context_state->str[i]) { |
|
77 case ITALIC_TAG: /* <i> */ |
|
78 { |
|
79 g_string_append (str, "</i>"); |
|
80 break; |
|
81 } |
|
82 case SPAN_TAG: /* <span foreground= > */ |
|
83 { |
|
84 g_string_append (str, "</span>"); |
|
85 break; |
|
86 } |
|
87 case RUBY_TAG: /* <span size= > -- ruby */ |
|
88 { |
|
89 break; |
|
90 } |
|
91 case RT_TAG: /* ruby */ |
|
92 { |
|
93 /* FIXME: support for furigana/ruby once implemented in pango */ |
|
94 g_string_append (sctx->rubybuf, "</span>"); |
|
95 if (has_tag (context_state, ITALIC_TAG)) { |
|
96 g_string_append (sctx->rubybuf, "</i>"); |
|
97 } |
|
98 |
|
99 break; |
|
100 } |
|
101 default: |
|
102 break; |
|
103 } |
|
104 if (context_state->str[i] == state) { |
|
105 g_string_append (sctx->buf, str->str); |
|
106 g_string_free (str, TRUE); |
|
107 g_string_truncate (context_state, i); |
|
108 return; |
|
109 } |
|
110 } |
|
111 if (state == CLEAR_TAG) { |
|
112 g_string_append (sctx->buf, str->str); |
|
113 g_string_truncate (context_state, 0); |
|
114 } |
|
115 g_string_free (str, TRUE); |
|
116 } |
|
117 |
|
118 static void |
|
119 handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts) |
|
120 { |
|
121 int i; |
|
122 |
|
123 sami_context_pop_state (sctx, CLEAR_TAG); |
|
124 if (atts != NULL) { |
|
125 for (i = 0; (atts[i] != NULL); i += 2) { |
|
126 const xmlChar *key, *value; |
|
127 |
|
128 key = atts[i]; |
|
129 value = atts[i + 1]; |
|
130 |
|
131 if (!value) |
|
132 continue; |
|
133 if (!xmlStrncmp ((const xmlChar *) "start", key, 5)) { |
|
134 sctx->time1 = sctx->time2; |
|
135 sctx->time2 = atoi ((const char *) value) * GST_MSECOND; |
|
136 sctx->has_result = TRUE; |
|
137 g_string_append (sctx->resultbuf, sctx->buf->str); |
|
138 g_string_truncate (sctx->buf, 0); |
|
139 } |
|
140 } |
|
141 } |
|
142 } |
|
143 |
|
144 static void |
|
145 handle_start_font (GstSamiContext * sctx, const xmlChar ** atts) |
|
146 { |
|
147 int i; |
|
148 |
|
149 sami_context_pop_state (sctx, SPAN_TAG); |
|
150 if (atts != NULL) { |
|
151 g_string_append (sctx->buf, "<span"); |
|
152 for (i = 0; (atts[i] != NULL); i += 2) { |
|
153 const xmlChar *key, *value; |
|
154 |
|
155 key = atts[i]; |
|
156 value = atts[i + 1]; |
|
157 |
|
158 if (!value) |
|
159 continue; |
|
160 if (!xmlStrncmp ((const xmlChar *) "color", key, 5)) { |
|
161 /* |
|
162 * There are invalid color value in many |
|
163 * sami files. |
|
164 * It will fix hex color value that start without '#' |
|
165 */ |
|
166 gchar *sharp = ""; |
|
167 int len = xmlStrlen (value); |
|
168 |
|
169 if (!(*value == '#' && len == 7)) { |
|
170 gchar *r; |
|
171 |
|
172 /* check if it looks like hex */ |
|
173 if (strtol ((const char *) value, &r, 16) >= 0 && |
|
174 ((xmlChar *) r == (value + 6) && len == 6)) { |
|
175 sharp = "#"; |
|
176 } |
|
177 } |
|
178 /* some colours can be found in many sami files, but X RGB database |
|
179 * doesn't contain a colour by this name, so map explicitly */ |
|
180 if (!xmlStrncasecmp (value, (const xmlChar *) "aqua", len)) { |
|
181 value = (const xmlChar *) "#00ffff"; |
|
182 } else if (!xmlStrncasecmp (value, (const xmlChar *) "crimson", len)) { |
|
183 value = (const xmlChar *) "#dc143c"; |
|
184 } else if (!xmlStrncasecmp (value, (const xmlChar *) "fuchsia", len)) { |
|
185 value = (const xmlChar *) "#ff00ff"; |
|
186 } else if (!xmlStrncasecmp (value, (const xmlChar *) "indigo", len)) { |
|
187 value = (const xmlChar *) "#4b0082"; |
|
188 } else if (!xmlStrncasecmp (value, (const xmlChar *) "lime", len)) { |
|
189 value = (const xmlChar *) "#00ff00"; |
|
190 } else if (!xmlStrncasecmp (value, (const xmlChar *) "olive", len)) { |
|
191 value = (const xmlChar *) "#808000"; |
|
192 } else if (!xmlStrncasecmp (value, (const xmlChar *) "silver", len)) { |
|
193 value = (const xmlChar *) "#c0c0c0"; |
|
194 } else if (!xmlStrncasecmp (value, (const xmlChar *) "teal", len)) { |
|
195 value = (const xmlChar *) "#008080"; |
|
196 } |
|
197 g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp, |
|
198 value); |
|
199 } else if (!xmlStrncasecmp ((const xmlChar *) "face", key, 4)) { |
|
200 g_string_append_printf (sctx->buf, " font_family=\"%s\"", value); |
|
201 } |
|
202 } |
|
203 g_string_append_c (sctx->buf, '>'); |
|
204 sami_context_push_state (sctx, SPAN_TAG); |
|
205 } |
|
206 } |
|
207 |
|
208 static void |
|
209 start_sami_element (void *ctx, const xmlChar * name, const xmlChar ** atts) |
|
210 { |
|
211 GstSamiContext *sctx = (GstSamiContext *) ctx; |
|
212 |
|
213 if (!xmlStrncmp ((const xmlChar *) "title", name, 5)) { |
|
214 sctx->in_title = TRUE; |
|
215 } else if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) { |
|
216 handle_start_sync (sctx, atts); |
|
217 } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) { |
|
218 handle_start_font (sctx, atts); |
|
219 } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) { |
|
220 sami_context_push_state (sctx, RUBY_TAG); |
|
221 } else if (!xmlStrncmp ((const xmlChar *) "br", name, 2)) { |
|
222 g_string_append_c (sctx->buf, '\n'); |
|
223 /* FIXME: support for furigana/ruby once implemented in pango */ |
|
224 } else if (!xmlStrncmp ((const xmlChar *) "rt", name, 2)) { |
|
225 if (has_tag (sctx->state, ITALIC_TAG)) { |
|
226 g_string_append (sctx->rubybuf, "<i>"); |
|
227 } |
|
228 g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>"); |
|
229 sami_context_push_state (sctx, RT_TAG); |
|
230 } else if (!xmlStrncmp ((const xmlChar *) "p", name, 1)) { |
|
231 } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) { |
|
232 g_string_append (sctx->buf, "<i>"); |
|
233 sami_context_push_state (sctx, ITALIC_TAG); |
|
234 } |
|
235 } |
|
236 |
|
237 static void |
|
238 end_sami_element (void *ctx, const xmlChar * name) |
|
239 { |
|
240 GstSamiContext *sctx = (GstSamiContext *) ctx; |
|
241 |
|
242 if (!xmlStrncmp ((const xmlChar *) "title", name, 5)) { |
|
243 sctx->in_title = FALSE; |
|
244 } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) { |
|
245 sami_context_pop_state (sctx, SPAN_TAG); |
|
246 } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) { |
|
247 sami_context_pop_state (sctx, RUBY_TAG); |
|
248 } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) { |
|
249 sami_context_pop_state (sctx, ITALIC_TAG); |
|
250 } |
|
251 } |
|
252 |
|
253 static void |
|
254 characters_sami (void *ctx, const xmlChar * ch, int len) |
|
255 { |
|
256 GstSamiContext *sctx = (GstSamiContext *) ctx; |
|
257 gchar *escaped; |
|
258 |
|
259 /* skip title */ |
|
260 if (sctx->in_title) |
|
261 return; |
|
262 |
|
263 escaped = g_markup_escape_text ((const gchar *) ch, len); |
|
264 if (has_tag (sctx->state, RT_TAG)) { |
|
265 g_string_append_c (sctx->rubybuf, ' '); |
|
266 g_string_append (sctx->rubybuf, escaped); |
|
267 g_string_append_c (sctx->rubybuf, ' '); |
|
268 } else { |
|
269 g_string_append (sctx->buf, escaped); |
|
270 } |
|
271 g_free (escaped); |
|
272 } |
|
273 |
|
274 static xmlSAXHandler samiSAXHandlerStruct = { |
|
275 NULL, /* internalSubset */ |
|
276 NULL, /* isStandalone */ |
|
277 NULL, /* hasInternalSubset */ |
|
278 NULL, /* hasExternalSubset */ |
|
279 NULL, /* resolveEntity */ |
|
280 NULL, /* getEntity */ |
|
281 NULL, /* entityDecl */ |
|
282 NULL, /* notationDecl */ |
|
283 NULL, /* attributeDecl */ |
|
284 NULL, /* elementDecl */ |
|
285 NULL, /* unparsedEntityDecl */ |
|
286 NULL, /* setDocumentLocator */ |
|
287 NULL, /* startDocument */ |
|
288 NULL, /* endDocument */ |
|
289 start_sami_element, /* startElement */ |
|
290 end_sami_element, /* endElement */ |
|
291 NULL, /* reference */ |
|
292 characters_sami, /* characters */ |
|
293 NULL, /* ignorableWhitespace */ |
|
294 NULL, /* processingInstruction */ |
|
295 NULL, /* comment */ |
|
296 NULL, /* xmlParserWarning */ |
|
297 NULL, /* xmlParserError */ |
|
298 NULL, /* xmlParserError */ |
|
299 NULL, /* getParameterEntity */ |
|
300 NULL, /* cdataBlock */ |
|
301 NULL, /* externalSubset */ |
|
302 1, /* initialized */ |
|
303 NULL, /* private */ |
|
304 NULL, /* startElementNsSAX2Func */ |
|
305 NULL, /* endElementNsSAX2Func */ |
|
306 NULL /* xmlStructuredErrorFunc */ |
|
307 }; |
|
308 static xmlSAXHandlerPtr samiSAXHandler = &samiSAXHandlerStruct; |
|
309 #ifdef __SYMBIAN32__ |
|
310 EXPORT_C |
|
311 #endif |
|
312 |
|
313 |
|
314 void |
|
315 sami_context_init (ParserState * state) |
|
316 { |
|
317 GstSamiContext *context; |
|
318 |
|
319 g_assert (state->user_data == NULL); |
|
320 state->user_data = (gpointer) g_new0 (GstSamiContext, 1); |
|
321 context = (GstSamiContext *) state->user_data; |
|
322 |
|
323 context->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, context, |
|
324 "", 0, NULL, XML_CHAR_ENCODING_UTF8); |
|
325 context->buf = g_string_new (""); |
|
326 context->rubybuf = g_string_new (""); |
|
327 context->resultbuf = g_string_new (""); |
|
328 context->state = g_string_new (""); |
|
329 } |
|
330 #ifdef __SYMBIAN32__ |
|
331 EXPORT_C |
|
332 #endif |
|
333 |
|
334 |
|
335 void |
|
336 sami_context_deinit (ParserState * state) |
|
337 { |
|
338 GstSamiContext *context = (GstSamiContext *) state->user_data; |
|
339 |
|
340 if (context) { |
|
341 htmlParserCtxtPtr htmlctxt = context->htmlctxt; |
|
342 |
|
343 /* destroy sax context */ |
|
344 htmlDocPtr doc; |
|
345 |
|
346 htmlParseChunk (htmlctxt, "", 0, 1); |
|
347 doc = htmlctxt->myDoc; |
|
348 htmlFreeParserCtxt (htmlctxt); |
|
349 context->htmlctxt = NULL; |
|
350 if (doc) |
|
351 xmlFreeDoc (doc); |
|
352 g_string_free (context->buf, TRUE); |
|
353 g_string_free (context->rubybuf, TRUE); |
|
354 g_string_free (context->resultbuf, TRUE); |
|
355 g_string_free (context->state, TRUE); |
|
356 g_free (context); |
|
357 state->user_data = NULL; |
|
358 } |
|
359 } |
|
360 #ifdef __SYMBIAN32__ |
|
361 EXPORT_C |
|
362 #endif |
|
363 |
|
364 |
|
365 void |
|
366 sami_context_reset (ParserState * state) |
|
367 { |
|
368 GstSamiContext *context = (GstSamiContext *) state->user_data; |
|
369 |
|
370 if (context) { |
|
371 g_string_truncate (context->buf, 0); |
|
372 g_string_truncate (context->rubybuf, 0); |
|
373 g_string_truncate (context->resultbuf, 0); |
|
374 g_string_truncate (context->state, 0); |
|
375 context->has_result = FALSE; |
|
376 context->in_title = FALSE; |
|
377 context->time1 = 0; |
|
378 context->time2 = 0; |
|
379 } |
|
380 } |
|
381 |
|
382 static gchar * |
|
383 fix_invalid_entities (const gchar * line) |
|
384 { |
|
385 const gchar *cp, *pp; /* current pointer, previous pointer */ |
|
386 gssize size; |
|
387 GString *ret = g_string_new (NULL); |
|
388 |
|
389 pp = line; |
|
390 cp = strchr (line, '&'); |
|
391 while (cp) { |
|
392 size = cp - pp; |
|
393 ret = g_string_append_len (ret, pp, size); |
|
394 cp++; |
|
395 if (g_ascii_strncasecmp (cp, "nbsp;", 5) |
|
396 && (!g_ascii_strncasecmp (cp, "nbsp", 4))) { |
|
397 /* translate " " to " " */ |
|
398 ret = g_string_append_len (ret, " ", 6); |
|
399 cp += 4; |
|
400 } else if (g_ascii_strncasecmp (cp, "quot;", 5) |
|
401 && g_ascii_strncasecmp (cp, "amp;", 4) |
|
402 && g_ascii_strncasecmp (cp, "apos;", 5) |
|
403 && g_ascii_strncasecmp (cp, "lt;", 3) |
|
404 && g_ascii_strncasecmp (cp, "gt;", 3) |
|
405 && g_ascii_strncasecmp (cp, "nbsp;", 5) |
|
406 && cp[0] != '#') { |
|
407 /* translate "&" to "&" */ |
|
408 ret = g_string_append_len (ret, "&", 5); |
|
409 } else { |
|
410 /* do not translate */ |
|
411 ret = g_string_append_c (ret, '&'); |
|
412 } |
|
413 |
|
414 pp = cp; |
|
415 cp = strchr (pp, '&'); |
|
416 } |
|
417 ret = g_string_append (ret, pp); |
|
418 return g_string_free (ret, FALSE); |
|
419 } |
|
420 #ifdef __SYMBIAN32__ |
|
421 EXPORT_C |
|
422 #endif |
|
423 |
|
424 |
|
425 gchar * |
|
426 parse_sami (ParserState * state, const gchar * line) |
|
427 { |
|
428 gchar *fixed_line; |
|
429 GstSamiContext *context = (GstSamiContext *) state->user_data; |
|
430 |
|
431 fixed_line = fix_invalid_entities (line); |
|
432 htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0); |
|
433 g_free (fixed_line); |
|
434 |
|
435 if (context->has_result) { |
|
436 gchar *r; |
|
437 |
|
438 if (context->rubybuf->len) { |
|
439 context->rubybuf = g_string_append_c (context->rubybuf, '\n'); |
|
440 g_string_prepend (context->resultbuf, context->rubybuf->str); |
|
441 context->rubybuf = g_string_truncate (context->rubybuf, 0); |
|
442 } |
|
443 |
|
444 r = g_string_free (context->resultbuf, FALSE); |
|
445 context->resultbuf = g_string_new (""); |
|
446 state->start_time = context->time1; |
|
447 state->duration = context->time2 - context->time1; |
|
448 context->has_result = FALSE; |
|
449 return r; |
|
450 } |
|
451 return NULL; |
|
452 } |
|
453 |
|
454 #else /* GST_DISABLE_XML */ |
|
455 #ifdef __SYMBIAN32__ |
|
456 EXPORT_C |
|
457 #endif |
|
458 gchar * |
|
459 parse_sami (ParserState * state, const gchar * line) |
|
460 { |
|
461 /* our template caps should not include sami in this case */ |
|
462 g_assert_not_reached (); |
|
463 } |
|
464 |
|
465 #ifdef __SYMBIAN32__ |
|
466 EXPORT_C |
|
467 #endif |
|
468 void |
|
469 sami_context_init (ParserState * state) |
|
470 { |
|
471 return; |
|
472 } |
|
473 |
|
474 #ifdef __SYMBIAN32__ |
|
475 EXPORT_C |
|
476 #endif |
|
477 void |
|
478 sami_context_deinit (ParserState * state) |
|
479 { |
|
480 return; |
|
481 } |
|
482 |
|
483 #ifdef __SYMBIAN32__ |
|
484 EXPORT_C |
|
485 #endif |
|
486 void |
|
487 sami_context_reset (ParserState * state) |
|
488 { |
|
489 return; |
|
490 } |
|
491 |
|
492 #endif /* GST_DISABLE_XML */ |