gst_plugins_base/gst/subparse/samiparse.c
changeset 2 5505e8908944
parent 0 0e761a78d257
child 7 567bb019e3e3
equal deleted inserted replaced
1:4c282e7dd6d3 2:5505e8908944
       
     1 /* GStreamer SAMI subtitle parser
       
     2  * Copyright (c) 2006 Young-Ho Cha <ganadist at chollian net>
       
     3  *
       
     4  * This library is free software; you can redistribute it and/or
       
     5  * modify it under the terms of the GNU Library General Public
       
     6  * License as published by the Free Software Foundation; either
       
     7  * version 2 of the License, or (at your option) any later version.
       
     8  *
       
     9  * This library is distributed in the hope that it will be useful,
       
    10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
       
    11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
       
    12  * Library General Public License for more details.
       
    13  *
       
    14  * You should have received a copy of the GNU Library General Public
       
    15  * License along with this library; if not, write to the
       
    16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
       
    17  * Boston, MA 02111-1307, USA.
       
    18  */
       
    19 
       
    20 #include "samiparse.h"
       
    21 
       
    22 /* FIXME: use Makefile stuff */
       
    23 #ifndef GST_DISABLE_XML
       
    24 #include <libxml/HTMLparser.h>
       
    25 #include <string.h>
       
    26 
       
    27 #define ITALIC_TAG 'i'
       
    28 #define SPAN_TAG   's'
       
    29 #define RUBY_TAG   'r'
       
    30 #define RT_TAG     't'
       
    31 #define CLEAR_TAG  '0'
       
    32 
       
    33 typedef struct _GstSamiContext GstSamiContext;
       
    34 
       
    35 struct _GstSamiContext
       
    36 {
       
    37   GString *buf;                 /* buffer to collect content */
       
    38   GString *rubybuf;             /* buffer to collect ruby content */
       
    39   GString *resultbuf;           /* when opening the next 'sync' tag, move
       
    40                                  * from 'buf' to avoid to append following
       
    41                                  * content */
       
    42   GString *state;               /* in many sami files there are tags that
       
    43                                  * are not closed, so for each open tag the
       
    44                                  * parser will append a tag flag here so
       
    45                                  * that tags can be closed properly on
       
    46                                  * 'sync' tags. See _context_push_state()
       
    47                                  * and _context_pop_state(). */
       
    48   htmlParserCtxtPtr htmlctxt;   /* html parser context */
       
    49   gboolean has_result;          /* set when ready to push out result */
       
    50   gboolean in_title;            /* flag to avoid appending the title content
       
    51                                  * to buf */
       
    52   guint64 time1;                /* previous start attribute in sync tag */
       
    53   guint64 time2;                /* current start attribute in sync tag  */
       
    54 };
       
    55 
       
    56 static gchar *
       
    57 has_tag (GString * str, const gchar tag)
       
    58 {
       
    59   return strrchr (str->str, tag);
       
    60 }
       
    61 
       
    62 static void
       
    63 sami_context_push_state (GstSamiContext * sctx, char state)
       
    64 {
       
    65   g_string_append_c (sctx->state, state);
       
    66 }
       
    67 
       
    68 static void
       
    69 sami_context_pop_state (GstSamiContext * sctx, char state)
       
    70 {
       
    71   GString *str = g_string_new ("");
       
    72   GString *context_state = sctx->state;
       
    73   int i;
       
    74 
       
    75   for (i = context_state->len - 1; i >= 0; i--) {
       
    76     switch (context_state->str[i]) {
       
    77       case ITALIC_TAG:         /* <i> */
       
    78       {
       
    79         g_string_append (str, "</i>");
       
    80         break;
       
    81       }
       
    82       case SPAN_TAG:           /* <span foreground= > */
       
    83       {
       
    84         g_string_append (str, "</span>");
       
    85         break;
       
    86       }
       
    87       case RUBY_TAG:           /* <span size= >  -- ruby */
       
    88       {
       
    89         break;
       
    90       }
       
    91       case RT_TAG:             /*  ruby */
       
    92       {
       
    93         /* FIXME: support for furigana/ruby once implemented in pango */
       
    94         g_string_append (sctx->rubybuf, "</span>");
       
    95         if (has_tag (context_state, ITALIC_TAG)) {
       
    96           g_string_append (sctx->rubybuf, "</i>");
       
    97         }
       
    98 
       
    99         break;
       
   100       }
       
   101       default:
       
   102         break;
       
   103     }
       
   104     if (context_state->str[i] == state) {
       
   105       g_string_append (sctx->buf, str->str);
       
   106       g_string_free (str, TRUE);
       
   107       g_string_truncate (context_state, i);
       
   108       return;
       
   109     }
       
   110   }
       
   111   if (state == CLEAR_TAG) {
       
   112     g_string_append (sctx->buf, str->str);
       
   113     g_string_truncate (context_state, 0);
       
   114   }
       
   115   g_string_free (str, TRUE);
       
   116 }
       
   117 
       
   118 static void
       
   119 handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts)
       
   120 {
       
   121   int i;
       
   122 
       
   123   sami_context_pop_state (sctx, CLEAR_TAG);
       
   124   if (atts != NULL) {
       
   125     for (i = 0; (atts[i] != NULL); i += 2) {
       
   126       const xmlChar *key, *value;
       
   127 
       
   128       key = atts[i];
       
   129       value = atts[i + 1];
       
   130 
       
   131       if (!value)
       
   132         continue;
       
   133       if (!xmlStrncmp ((const xmlChar *) "start", key, 5)) {
       
   134         sctx->time1 = sctx->time2;
       
   135         sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
       
   136         sctx->has_result = TRUE;
       
   137         g_string_append (sctx->resultbuf, sctx->buf->str);
       
   138         g_string_truncate (sctx->buf, 0);
       
   139       }
       
   140     }
       
   141   }
       
   142 }
       
   143 
       
   144 static void
       
   145 handle_start_font (GstSamiContext * sctx, const xmlChar ** atts)
       
   146 {
       
   147   int i;
       
   148 
       
   149   sami_context_pop_state (sctx, SPAN_TAG);
       
   150   if (atts != NULL) {
       
   151     g_string_append (sctx->buf, "<span");
       
   152     for (i = 0; (atts[i] != NULL); i += 2) {
       
   153       const xmlChar *key, *value;
       
   154 
       
   155       key = atts[i];
       
   156       value = atts[i + 1];
       
   157 
       
   158       if (!value)
       
   159         continue;
       
   160       if (!xmlStrncmp ((const xmlChar *) "color", key, 5)) {
       
   161         /*
       
   162          * There are invalid color value in many
       
   163          * sami files.
       
   164          * It will fix hex color value that start without '#'
       
   165          */
       
   166         gchar *sharp = "";
       
   167         int len = xmlStrlen (value);
       
   168 
       
   169         if (!(*value == '#' && len == 7)) {
       
   170           gchar *r;
       
   171 
       
   172           /* check if it looks like hex */
       
   173           if (strtol ((const char *) value, &r, 16) >= 0 &&
       
   174               ((xmlChar *) r == (value + 6) && len == 6)) {
       
   175             sharp = "#";
       
   176           }
       
   177         }
       
   178         /* some colours can be found in many sami files, but X RGB database
       
   179          * doesn't contain a colour by this name, so map explicitly */
       
   180         if (!xmlStrncasecmp (value, (const xmlChar *) "aqua", len)) {
       
   181           value = (const xmlChar *) "#00ffff";
       
   182         } else if (!xmlStrncasecmp (value, (const xmlChar *) "crimson", len)) {
       
   183           value = (const xmlChar *) "#dc143c";
       
   184         } else if (!xmlStrncasecmp (value, (const xmlChar *) "fuchsia", len)) {
       
   185           value = (const xmlChar *) "#ff00ff";
       
   186         } else if (!xmlStrncasecmp (value, (const xmlChar *) "indigo", len)) {
       
   187           value = (const xmlChar *) "#4b0082";
       
   188         } else if (!xmlStrncasecmp (value, (const xmlChar *) "lime", len)) {
       
   189           value = (const xmlChar *) "#00ff00";
       
   190         } else if (!xmlStrncasecmp (value, (const xmlChar *) "olive", len)) {
       
   191           value = (const xmlChar *) "#808000";
       
   192         } else if (!xmlStrncasecmp (value, (const xmlChar *) "silver", len)) {
       
   193           value = (const xmlChar *) "#c0c0c0";
       
   194         } else if (!xmlStrncasecmp (value, (const xmlChar *) "teal", len)) {
       
   195           value = (const xmlChar *) "#008080";
       
   196         }
       
   197         g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
       
   198             value);
       
   199       } else if (!xmlStrncasecmp ((const xmlChar *) "face", key, 4)) {
       
   200         g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
       
   201       }
       
   202     }
       
   203     g_string_append_c (sctx->buf, '>');
       
   204     sami_context_push_state (sctx, SPAN_TAG);
       
   205   }
       
   206 }
       
   207 
       
   208 static void
       
   209 start_sami_element (void *ctx, const xmlChar * name, const xmlChar ** atts)
       
   210 {
       
   211   GstSamiContext *sctx = (GstSamiContext *) ctx;
       
   212 
       
   213   if (!xmlStrncmp ((const xmlChar *) "title", name, 5)) {
       
   214     sctx->in_title = TRUE;
       
   215   } else if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
       
   216     handle_start_sync (sctx, atts);
       
   217   } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
       
   218     handle_start_font (sctx, atts);
       
   219   } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
       
   220     sami_context_push_state (sctx, RUBY_TAG);
       
   221   } else if (!xmlStrncmp ((const xmlChar *) "br", name, 2)) {
       
   222     g_string_append_c (sctx->buf, '\n');
       
   223     /* FIXME: support for furigana/ruby once implemented in pango */
       
   224   } else if (!xmlStrncmp ((const xmlChar *) "rt", name, 2)) {
       
   225     if (has_tag (sctx->state, ITALIC_TAG)) {
       
   226       g_string_append (sctx->rubybuf, "<i>");
       
   227     }
       
   228     g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
       
   229     sami_context_push_state (sctx, RT_TAG);
       
   230   } else if (!xmlStrncmp ((const xmlChar *) "p", name, 1)) {
       
   231   } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
       
   232     g_string_append (sctx->buf, "<i>");
       
   233     sami_context_push_state (sctx, ITALIC_TAG);
       
   234   }
       
   235 }
       
   236 
       
   237 static void
       
   238 end_sami_element (void *ctx, const xmlChar * name)
       
   239 {
       
   240   GstSamiContext *sctx = (GstSamiContext *) ctx;
       
   241 
       
   242   if (!xmlStrncmp ((const xmlChar *) "title", name, 5)) {
       
   243     sctx->in_title = FALSE;
       
   244   } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
       
   245     sami_context_pop_state (sctx, SPAN_TAG);
       
   246   } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
       
   247     sami_context_pop_state (sctx, RUBY_TAG);
       
   248   } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
       
   249     sami_context_pop_state (sctx, ITALIC_TAG);
       
   250   }
       
   251 }
       
   252 
       
   253 static void
       
   254 characters_sami (void *ctx, const xmlChar * ch, int len)
       
   255 {
       
   256   GstSamiContext *sctx = (GstSamiContext *) ctx;
       
   257   gchar *escaped;
       
   258 
       
   259   /* skip title */
       
   260   if (sctx->in_title)
       
   261     return;
       
   262 
       
   263   escaped = g_markup_escape_text ((const gchar *) ch, len);
       
   264   if (has_tag (sctx->state, RT_TAG)) {
       
   265     g_string_append_c (sctx->rubybuf, ' ');
       
   266     g_string_append (sctx->rubybuf, escaped);
       
   267     g_string_append_c (sctx->rubybuf, ' ');
       
   268   } else {
       
   269     g_string_append (sctx->buf, escaped);
       
   270   }
       
   271   g_free (escaped);
       
   272 }
       
   273 
       
   274 static xmlSAXHandler samiSAXHandlerStruct = {
       
   275   NULL,                         /* internalSubset */
       
   276   NULL,                         /* isStandalone */
       
   277   NULL,                         /* hasInternalSubset */
       
   278   NULL,                         /* hasExternalSubset */
       
   279   NULL,                         /* resolveEntity */
       
   280   NULL,                         /* getEntity */
       
   281   NULL,                         /* entityDecl */
       
   282   NULL,                         /* notationDecl */
       
   283   NULL,                         /* attributeDecl */
       
   284   NULL,                         /* elementDecl */
       
   285   NULL,                         /* unparsedEntityDecl */
       
   286   NULL,                         /* setDocumentLocator */
       
   287   NULL,                         /* startDocument */
       
   288   NULL,                         /* endDocument */
       
   289   start_sami_element,           /* startElement */
       
   290   end_sami_element,             /* endElement */
       
   291   NULL,                         /* reference */
       
   292   characters_sami,              /* characters */
       
   293   NULL,                         /* ignorableWhitespace */
       
   294   NULL,                         /* processingInstruction */
       
   295   NULL,                         /* comment */
       
   296   NULL,                         /* xmlParserWarning */
       
   297   NULL,                         /* xmlParserError */
       
   298   NULL,                         /* xmlParserError */
       
   299   NULL,                         /* getParameterEntity */
       
   300   NULL,                         /* cdataBlock */
       
   301   NULL,                         /* externalSubset */
       
   302   1,                            /* initialized */
       
   303   NULL,                         /* private */
       
   304   NULL,                         /* startElementNsSAX2Func */
       
   305   NULL,                         /* endElementNsSAX2Func */
       
   306   NULL                          /* xmlStructuredErrorFunc */
       
   307 };
       
   308 static xmlSAXHandlerPtr samiSAXHandler = &samiSAXHandlerStruct;
       
   309 #ifdef __SYMBIAN32__
       
   310 EXPORT_C
       
   311 #endif
       
   312 
       
   313 
       
   314 void
       
   315 sami_context_init (ParserState * state)
       
   316 {
       
   317   GstSamiContext *context;
       
   318 
       
   319   g_assert (state->user_data == NULL);
       
   320   state->user_data = (gpointer) g_new0 (GstSamiContext, 1);
       
   321   context = (GstSamiContext *) state->user_data;
       
   322 
       
   323   context->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, context,
       
   324       "", 0, NULL, XML_CHAR_ENCODING_UTF8);
       
   325   context->buf = g_string_new ("");
       
   326   context->rubybuf = g_string_new ("");
       
   327   context->resultbuf = g_string_new ("");
       
   328   context->state = g_string_new ("");
       
   329 }
       
   330 #ifdef __SYMBIAN32__
       
   331 EXPORT_C
       
   332 #endif
       
   333 
       
   334 
       
   335 void
       
   336 sami_context_deinit (ParserState * state)
       
   337 {
       
   338   GstSamiContext *context = (GstSamiContext *) state->user_data;
       
   339 
       
   340   if (context) {
       
   341     htmlParserCtxtPtr htmlctxt = context->htmlctxt;
       
   342 
       
   343     /* destroy sax context */
       
   344     htmlDocPtr doc;
       
   345 
       
   346     htmlParseChunk (htmlctxt, "", 0, 1);
       
   347     doc = htmlctxt->myDoc;
       
   348     htmlFreeParserCtxt (htmlctxt);
       
   349     context->htmlctxt = NULL;
       
   350     if (doc)
       
   351       xmlFreeDoc (doc);
       
   352     g_string_free (context->buf, TRUE);
       
   353     g_string_free (context->rubybuf, TRUE);
       
   354     g_string_free (context->resultbuf, TRUE);
       
   355     g_string_free (context->state, TRUE);
       
   356     g_free (context);
       
   357     state->user_data = NULL;
       
   358   }
       
   359 }
       
   360 #ifdef __SYMBIAN32__
       
   361 EXPORT_C
       
   362 #endif
       
   363 
       
   364 
       
   365 void
       
   366 sami_context_reset (ParserState * state)
       
   367 {
       
   368   GstSamiContext *context = (GstSamiContext *) state->user_data;
       
   369 
       
   370   if (context) {
       
   371     g_string_truncate (context->buf, 0);
       
   372     g_string_truncate (context->rubybuf, 0);
       
   373     g_string_truncate (context->resultbuf, 0);
       
   374     g_string_truncate (context->state, 0);
       
   375     context->has_result = FALSE;
       
   376     context->in_title = FALSE;
       
   377     context->time1 = 0;
       
   378     context->time2 = 0;
       
   379   }
       
   380 }
       
   381 
       
   382 static gchar *
       
   383 fix_invalid_entities (const gchar * line)
       
   384 {
       
   385   const gchar *cp, *pp;         /* current pointer, previous pointer */
       
   386   gssize size;
       
   387   GString *ret = g_string_new (NULL);
       
   388 
       
   389   pp = line;
       
   390   cp = strchr (line, '&');
       
   391   while (cp) {
       
   392     size = cp - pp;
       
   393     ret = g_string_append_len (ret, pp, size);
       
   394     cp++;
       
   395     if (g_ascii_strncasecmp (cp, "nbsp;", 5)
       
   396         && (!g_ascii_strncasecmp (cp, "nbsp", 4))) {
       
   397       /* translate "&nbsp" to "&nbsp;" */
       
   398       ret = g_string_append_len (ret, "&nbsp;", 6);
       
   399       cp += 4;
       
   400     } else if (g_ascii_strncasecmp (cp, "quot;", 5)
       
   401         && g_ascii_strncasecmp (cp, "amp;", 4)
       
   402         && g_ascii_strncasecmp (cp, "apos;", 5)
       
   403         && g_ascii_strncasecmp (cp, "lt;", 3)
       
   404         && g_ascii_strncasecmp (cp, "gt;", 3)
       
   405         && g_ascii_strncasecmp (cp, "nbsp;", 5)
       
   406         && cp[0] != '#') {
       
   407       /* translate "&" to "&amp;" */
       
   408       ret = g_string_append_len (ret, "&amp;", 5);
       
   409     } else {
       
   410       /* do not translate */
       
   411       ret = g_string_append_c (ret, '&');
       
   412     }
       
   413 
       
   414     pp = cp;
       
   415     cp = strchr (pp, '&');
       
   416   }
       
   417   ret = g_string_append (ret, pp);
       
   418   return g_string_free (ret, FALSE);
       
   419 }
       
   420 #ifdef __SYMBIAN32__
       
   421 EXPORT_C
       
   422 #endif
       
   423 
       
   424 
       
   425 gchar *
       
   426 parse_sami (ParserState * state, const gchar * line)
       
   427 {
       
   428   gchar *fixed_line;
       
   429   GstSamiContext *context = (GstSamiContext *) state->user_data;
       
   430 
       
   431   fixed_line = fix_invalid_entities (line);
       
   432   htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0);
       
   433   g_free (fixed_line);
       
   434 
       
   435   if (context->has_result) {
       
   436     gchar *r;
       
   437 
       
   438     if (context->rubybuf->len) {
       
   439       context->rubybuf = g_string_append_c (context->rubybuf, '\n');
       
   440       g_string_prepend (context->resultbuf, context->rubybuf->str);
       
   441       context->rubybuf = g_string_truncate (context->rubybuf, 0);
       
   442     }
       
   443 
       
   444     r = g_string_free (context->resultbuf, FALSE);
       
   445     context->resultbuf = g_string_new ("");
       
   446     state->start_time = context->time1;
       
   447     state->duration = context->time2 - context->time1;
       
   448     context->has_result = FALSE;
       
   449     return r;
       
   450   }
       
   451   return NULL;
       
   452 }
       
   453 
       
   454 #else /* GST_DISABLE_XML */
       
   455 #ifdef __SYMBIAN32__
       
   456 EXPORT_C
       
   457 #endif
       
   458 gchar *
       
   459 parse_sami (ParserState * state, const gchar * line)
       
   460 {
       
   461   /* our template caps should not include sami in this case */
       
   462   g_assert_not_reached ();
       
   463 }
       
   464 
       
   465 #ifdef __SYMBIAN32__
       
   466 EXPORT_C
       
   467 #endif
       
   468 void
       
   469 sami_context_init (ParserState * state)
       
   470 {
       
   471   return;
       
   472 }
       
   473 
       
   474 #ifdef __SYMBIAN32__
       
   475 EXPORT_C
       
   476 #endif
       
   477 void
       
   478 sami_context_deinit (ParserState * state)
       
   479 {
       
   480   return;
       
   481 }
       
   482 
       
   483 #ifdef __SYMBIAN32__
       
   484 EXPORT_C
       
   485 #endif
       
   486 void
       
   487 sami_context_reset (ParserState * state)
       
   488 {
       
   489   return;
       
   490 }
       
   491 
       
   492 #endif /* GST_DISABLE_XML */