147 |
145 |
148 if (subparse->encoding) { |
146 if (subparse->encoding) { |
149 g_free (subparse->encoding); |
147 g_free (subparse->encoding); |
150 subparse->encoding = NULL; |
148 subparse->encoding = NULL; |
151 } |
149 } |
|
150 |
|
151 if (subparse->detected_encoding) { |
|
152 g_free (subparse->detected_encoding); |
|
153 subparse->detected_encoding = NULL; |
|
154 } |
|
155 |
|
156 if (subparse->adapter) { |
|
157 gst_object_unref (subparse->adapter); |
|
158 subparse->adapter = NULL; |
|
159 } |
|
160 |
152 if (subparse->textbuf) { |
161 if (subparse->textbuf) { |
153 g_string_free (subparse->textbuf, TRUE); |
162 g_string_free (subparse->textbuf, TRUE); |
154 subparse->textbuf = NULL; |
163 subparse->textbuf = NULL; |
155 } |
164 } |
|
165 #ifndef GST_DISABLE_XML |
156 sami_context_deinit (&subparse->state); |
166 sami_context_deinit (&subparse->state); |
|
167 #endif |
157 |
168 |
158 GST_CALL_PARENT (G_OBJECT_CLASS, dispose, (object)); |
169 GST_CALL_PARENT (G_OBJECT_CLASS, dispose, (object)); |
159 } |
170 } |
160 |
171 |
161 static void |
172 static void |
172 |
183 |
173 element_class->change_state = gst_sub_parse_change_state; |
184 element_class->change_state = gst_sub_parse_change_state; |
174 |
185 |
175 g_object_class_install_property (object_class, PROP_ENCODING, |
186 g_object_class_install_property (object_class, PROP_ENCODING, |
176 g_param_spec_string ("subtitle-encoding", "subtitle charset encoding", |
187 g_param_spec_string ("subtitle-encoding", "subtitle charset encoding", |
177 "Encoding to assume if input subtitles are not in UTF-8 encoding. " |
188 "Encoding to assume if input subtitles are not in UTF-8 or any other " |
178 "If not set, the GST_SUBTITLE_ENCODING environment variable will " |
189 "Unicode encoding. If not set, the GST_SUBTITLE_ENCODING environment " |
179 "be checked for an encoding to use. If that is not set either, " |
190 "variable will be checked for an encoding to use. If that is not set " |
180 "ISO-8859-15 will be assumed.", DEFAULT_ENCODING, G_PARAM_READWRITE)); |
191 "either, ISO-8859-15 will be assumed.", DEFAULT_ENCODING, |
|
192 G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)); |
181 } |
193 } |
182 |
194 |
183 static void |
195 static void |
184 gst_sub_parse_init (GstSubParse * subparse) |
196 gst_sub_parse_init (GstSubParse * subparse) |
185 { |
197 { |
306 } |
320 } |
307 GST_OBJECT_UNLOCK (subparse); |
321 GST_OBJECT_UNLOCK (subparse); |
308 } |
322 } |
309 |
323 |
310 static gchar * |
/* Maps a detected subtitle format to a short human-readable name
 * ("MicroDVD", "SubRip", ...). Returns NULL for the unknown format and for
 * any value not listed below. The returned pointer refers to a string
 * literal, so callers must not modify or free it.
 * NOTE(review): the return type is a non-const gchar * although only
 * literals are returned - consider const gchar * if callers permit. */
324 static gchar * |
311 convert_encoding (GstSubParse * self, const gchar * str, gsize len) |
325 gst_sub_parse_get_format_description (GstSubParseFormat format) |

326 { |

327 switch (format) { |

328 case GST_SUB_PARSE_FORMAT_MDVDSUB: |

329 return "MicroDVD"; |

330 case GST_SUB_PARSE_FORMAT_SUBRIP: |

331 return "SubRip"; |

332 case GST_SUB_PARSE_FORMAT_MPSUB: |

333 return "MPSub"; |

334 case GST_SUB_PARSE_FORMAT_SAMI: |

335 return "SAMI"; |

336 case GST_SUB_PARSE_FORMAT_TMPLAYER: |

337 return "TMPlayer"; |

338 case GST_SUB_PARSE_FORMAT_MPL2: |

339 return "MPL2"; |

340 case GST_SUB_PARSE_FORMAT_SUBVIEWER: |

341 return "SubViewer"; |

/* default shares the UNKNOWN case: both leave the switch and return NULL */
342 default: |

343 case GST_SUB_PARSE_FORMAT_UNKNOWN: |

344 break; |

345 } |

346 return NULL; |

347 } |
|
348 |
|
349 static gchar * |
|
350 gst_convert_to_utf8 (const gchar * str, gsize len, const gchar * encoding, |
|
351 gsize * consumed, GError ** err) |
|
352 { |
|
353 gchar *ret = NULL; |
|
354 |
|
355 *consumed = 0; |
|
356 ret = |
|
357 g_convert_with_fallback (str, len, "UTF-8", encoding, "*", consumed, NULL, |
|
358 err); |
|
359 if (ret == NULL) |
|
360 return ret; |
|
361 |
|
362 /* + 3 to skip UTF-8 BOM if it was added */ |
|
363 len = strlen (ret); |
|
364 if (len >= 3 && (guint8) ret[0] == 0xEF && (guint8) ret[1] == 0xBB |
|
365 && (guint8) ret[2] == 0xBF) |
|
366 g_memmove (ret, ret + 3, len + 1 - 3); |
|
367 |
|
368 return ret; |
|
369 } |
|
370 |
|
371 static gchar * |
|
372 detect_encoding (const gchar * str, gsize len) |
|
373 { |
|
374 if (len >= 3 && (guint8) str[0] == 0xEF && (guint8) str[1] == 0xBB |
|
375 && (guint8) str[2] == 0xBF) |
|
376 return g_strdup ("UTF-8"); |
|
377 |
|
378 if (len >= 2 && (guint8) str[0] == 0xFE && (guint8) str[1] == 0xFF) |
|
379 return g_strdup ("UTF-16BE"); |
|
380 |
|
381 if (len >= 2 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE) |
|
382 return g_strdup ("UTF-16LE"); |
|
383 |
|
384 if (len >= 4 && (guint8) str[0] == 0x00 && (guint8) str[1] == 0x00 |
|
385 && (guint8) str[2] == 0xFE && (guint8) str[3] == 0xFF) |
|
386 return g_strdup ("UTF-32BE"); |
|
387 |
|
388 if (len >= 4 && (guint8) str[0] == 0xFF && (guint8) str[1] == 0xFE |
|
389 && (guint8) str[2] == 0x00 && (guint8) str[3] == 0x00) |
|
390 return g_strdup ("UTF-32LE"); |
|
391 |
|
392 return NULL; |
|
393 } |
|
394 |
|
395 static gchar * |
|
396 convert_encoding (GstSubParse * self, const gchar * str, gsize len, |
|
397 gsize * consumed) |
312 { |
398 { |
313 const gchar *encoding; |
399 const gchar *encoding; |
314 GError *err = NULL; |
400 GError *err = NULL; |
315 gchar *ret; |
401 gchar *ret = NULL; |
316 |
402 |
|
403 *consumed = 0; |
|
404 |
|
405 /* First try any detected encoding */ |
|
406 if (self->detected_encoding) { |
|
407 ret = |
|
408 gst_convert_to_utf8 (str, len, self->detected_encoding, consumed, &err); |
|
409 |
|
410 if (!err) |
|
411 return ret; |
|
412 |
|
413 GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s", |
|
414 self->detected_encoding, err->message); |
|
415 g_free (self->detected_encoding); |
|
416 self->detected_encoding = NULL; |
|
417 g_error_free (err); |
|
418 } |
|
419 |
|
420 /* Otherwise check if it's UTF8 */ |
317 if (self->valid_utf8) { |
421 if (self->valid_utf8) { |
318 if (g_utf8_validate (str, len, NULL)) { |
422 if (g_utf8_validate (str, len, NULL)) { |
319 GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed"); |
423 GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed"); |
|
424 *consumed = len; |
320 return g_strndup (str, len); |
425 return g_strndup (str, len); |
321 } |
426 } |
322 GST_INFO_OBJECT (self, "invalid UTF-8!"); |
427 GST_INFO_OBJECT (self, "invalid UTF-8!"); |
323 self->valid_utf8 = FALSE; |
428 self->valid_utf8 = FALSE; |
324 } |
429 } |
325 |
430 |
|
431 /* Else try fallback */ |
326 encoding = self->encoding; |
432 encoding = self->encoding; |
327 if (encoding == NULL || *encoding == '\0') { |
433 if (encoding == NULL || *encoding == '\0') { |
328 encoding = g_getenv ("GST_SUBTITLE_ENCODING"); |
434 encoding = g_getenv ("GST_SUBTITLE_ENCODING"); |
329 } |
435 } |
330 if (encoding == NULL || *encoding == '\0') { |
436 if (encoding == NULL || *encoding == '\0') { |
333 if (g_get_charset (&encoding)) { |
439 if (g_get_charset (&encoding)) { |
334 encoding = "ISO-8859-15"; |
440 encoding = "ISO-8859-15"; |
335 } |
441 } |
336 } |
442 } |
337 |
443 |
338 ret = g_convert_with_fallback (str, len, "UTF-8", encoding, "*", NULL, |
444 ret = gst_convert_to_utf8 (str, len, encoding, consumed, &err); |
339 NULL, &err); |
|
340 |
445 |
341 if (err) { |
446 if (err) { |
342 GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s", |
447 GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s", |
343 encoding, err->message); |
448 encoding, err->message); |
344 g_error_free (err); |
449 g_error_free (err); |
345 |
450 |
346 /* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */ |
451 /* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */ |
347 ret = g_convert_with_fallback (str, len, "UTF-8", "ISO-8859-15", "*", |
452 ret = gst_convert_to_utf8 (str, len, "ISO-8859-15", consumed, NULL); |
348 NULL, NULL, NULL); |
|
349 } |
453 } |
350 |
454 |
351 GST_LOG_OBJECT (self, |
455 GST_LOG_OBJECT (self, |
352 "successfully converted %" G_GSIZE_FORMAT " characters from %s to UTF-8" |
456 "successfully converted %" G_GSIZE_FORMAT " characters from %s to UTF-8" |
353 "%s", len, encoding, (err) ? " , using ISO-8859-15 as fallback" : ""); |
457 "%s", len, encoding, (err) ? " , using ISO-8859-15 as fallback" : ""); |
653 g_free (*p_txt); |
757 g_free (*p_txt); |
654 *p_txt = g_string_free (s, FALSE); |
758 *p_txt = g_string_free (s, FALSE); |
655 } |
759 } |
656 } |
760 } |
657 |
761 |
|
762 static gboolean |
|
763 parse_subrip_time (const gchar * ts_string, GstClockTime * t) |
|
764 { |
|
765 gchar s[128] = { '\0', }; |
|
766 gchar *end, *p; |
|
767 guint hour, min, sec, msec, len; |
|
768 |
|
769 while (*ts_string == ' ') |
|
770 ++ts_string; |
|
771 |
|
772 g_strlcpy (s, ts_string, sizeof (s)); |
|
773 if ((end = strstr (s, "-->"))) |
|
774 *end = '\0'; |
|
775 g_strchomp (s); |
|
776 |
|
777 /* ms may be in these formats: |
|
778 * hh:mm:ss,500 = 500ms |
|
779 * hh:mm:ss, 5 = 5ms |
|
780 * hh:mm:ss, 5 = 50ms |
|
781 * hh:mm:ss, 50 = 50ms |
|
782 * hh:mm:ss,5 = 500ms |
|
783 * and sscanf() doesn't differentiate between ' 5' and '5' so munge |
|
784 * the white spaces within the timestamp to '0' (I'm sure there's a |
|
785 * way to make sscanf() do this for us, but how?) |
|
786 */ |
|
787 g_strdelimit (s, " ", '0'); |
|
788 |
|
789 /* make sure we have exactly three digits after he comma */ |
|
790 p = strchr (s, ','); |
|
791 g_assert (p != NULL); |
|
792 ++p; |
|
793 len = strlen (p); |
|
794 if (len > 3) { |
|
795 p[3] = '\0'; |
|
796 } else |
|
797 while (len < 3) { |
|
798 g_strlcat (&p[len], "0", 2); |
|
799 ++len; |
|
800 } |
|
801 |
|
802 GST_LOG ("parsing timestamp '%s'", s); |
|
803 if (sscanf (s, "%u:%u:%u,%u", &hour, &min, &sec, &msec) != 4) { |
|
804 GST_WARNING ("failed to parse subrip timestamp string '%s'", s); |
|
805 return FALSE; |
|
806 } |
|
807 |
|
808 *t = ((hour * 3600) + (min * 60) + sec) * GST_SECOND + msec * GST_MSECOND; |
|
809 return TRUE; |
|
810 } |
|
811 |
658 static gchar * |
812 static gchar * |
659 parse_subrip (ParserState * state, const gchar * line) |
813 parse_subrip (ParserState * state, const gchar * line) |
660 { |
814 { |
661 guint h1, m1, s1, ms1; |
|
662 guint h2, m2, s2, ms2; |
|
663 int subnum; |
815 int subnum; |
664 gchar *ret; |
816 gchar *ret; |
665 |
817 |
666 switch (state->state) { |
818 switch (state->state) { |
667 case 0: |
819 case 0: |
668 /* looking for a single integer */ |
820 /* looking for a single integer */ |
669 if (sscanf (line, "%u", &subnum) == 1) |
821 if (sscanf (line, "%u", &subnum) == 1) |
670 state->state = 1; |
822 state->state = 1; |
671 return NULL; |
823 return NULL; |
672 case 1: |
824 case 1: |
|
825 { |
|
826 GstClockTime ts_start, ts_end; |
|
827 gchar *end_time; |
|
828 |
673 /* looking for start_time --> end_time */ |
829 /* looking for start_time --> end_time */ |
674 if (sscanf (line, "%u:%u:%u,%u --> %u:%u:%u,%u", |
830 if ((end_time = strstr (line, " --> ")) && |
675 &h1, &m1, &s1, &ms1, &h2, &m2, &s2, &ms2) == 8) { |
831 parse_subrip_time (line, &ts_start) && |
|
832 parse_subrip_time (end_time + strlen (" --> "), &ts_end) && |
|
833 state->start_time <= ts_end) { |
676 state->state = 2; |
834 state->state = 2; |
677 state->start_time = |
835 state->start_time = ts_start; |
678 (((guint64) h1) * 3600 + m1 * 60 + s1) * GST_SECOND + |
836 state->duration = ts_end - ts_start; |
679 ms1 * GST_MSECOND; |
|
680 state->duration = |
|
681 (((guint64) h2) * 3600 + m2 * 60 + s2) * GST_SECOND + |
|
682 ms2 * GST_MSECOND - state->start_time; |
|
683 } else { |
837 } else { |
684 GST_DEBUG ("error parsing subrip time line"); |
838 GST_DEBUG ("error parsing subrip time line '%s'", line); |
685 state->state = 0; |
839 state->state = 0; |
686 } |
840 } |
687 return NULL; |
841 return NULL; |
|
842 } |
688 case 2: |
843 case 2: |
689 { |
844 { |
690 /* No need to parse that text if it's out of segment */ |
845 /* No need to parse that text if it's out of segment */ |
691 gint64 clip_start = 0, clip_stop = 0; |
846 gint64 clip_start = 0, clip_stop = 0; |
692 gboolean in_seg = FALSE; |
847 gboolean in_seg = FALSE; |
885 { |
1041 { |
886 if (state->buf) { |
1042 if (state->buf) { |
887 g_string_free (state->buf, TRUE); |
1043 g_string_free (state->buf, TRUE); |
888 state->buf = NULL; |
1044 state->buf = NULL; |
889 } |
1045 } |
|
1046 #ifndef GST_DISABLE_XML |
890 if (state->user_data) { |
1047 if (state->user_data) { |
891 sami_context_reset (state); |
1048 sami_context_reset (state); |
892 } |
1049 } |
|
1050 #endif |
|
1051 } |
|
1052 |
|
1053 /* regex type enum */ |

/* Identifies which format-autodetection regex
 * gst_sub_parse_data_format_autodetect_regex_once() should compile. The
 * value is handed to that function through g_once() as its pointer-sized
 * user-data argument. */
1054 typedef enum |

1055 { |

1056 GST_SUB_PARSE_REGEX_UNKNOWN = 0, |

1057 GST_SUB_PARSE_REGEX_MDVDSUB = 1, |

1058 GST_SUB_PARSE_REGEX_SUBRIP = 2, |

1059 } GstSubParseRegex; |
|
1060 |
|
/* Compiles the GRegex used to recognise one subtitle format.
 *
 * regtype selects the pattern (MicroDVD or SubRip). The result is the
 * compiled GRegex returned as a gpointer, or NULL when compilation fails
 * (a g_warning is emitted) or when regtype is not a known pattern.
 *
 * The caller runs this through g_once(), casting it to GThreadFunc and
 * passing the enum value as the pointer argument.
 * NOTE(review): this relies on a pointer argument being readable as a
 * GstSubParseRegex parameter - confirm this holds on all target ABIs. */
1061 static gpointer |

1062 gst_sub_parse_data_format_autodetect_regex_once (GstSubParseRegex regtype) |

1063 { |

1064 gpointer result = NULL; |

1065 GError *gerr = NULL; |

1066 switch (regtype) { |

1067 case GST_SUB_PARSE_REGEX_MDVDSUB: |

/* MicroDVD: line starts with two brace-enclosed numbers, e.g. {1}{25} */
1068 result = |

1069 (gpointer) g_regex_new ("^\\{[0-9]+\\}\\{[0-9]+\\}", 0, 0, &gerr); |

1070 if (result == NULL) { |

1071 g_warning ("Compilation of mdvd regex failed: %s", gerr->message); |

1072 g_error_free (gerr); |

1073 } |

1074 break; |

1075 case GST_SUB_PARSE_REGEX_SUBRIP: |

/* SubRip: a counter line followed by "start --> end" timestamps; spaces
 * are accepted in place of leading digits and around the arrow */
1076 result = (gpointer) g_regex_new ("^([ 0-9]){0,3}[0-9]\\s*(\x0d)?\x0a" |

1077 "[ 0-9][0-9]:[ 0-9][0-9]:[ 0-9][0-9],[ 0-9]{0,2}[0-9]" |

1078 " +--> +([ 0-9])?[0-9]:[ 0-9][0-9]:[ 0-9][0-9],[ 0-9]{0,2}[0-9]", |

1079 0, 0, &gerr); |

1080 if (result == NULL) { |

1081 g_warning ("Compilation of subrip regex failed: %s", gerr->message); |

1082 g_error_free (gerr); |

1083 } |

1084 break; |

/* unknown type: log it and fall through to return the initial NULL */
1085 default: |

1086 GST_WARNING ("Trying to allocate regex of unknown type %u", regtype); |

1087 } |

1088 return result; |
893 } |
1089 } |
894 |
1090 |
895 /* |
1091 /* |
896 * FIXME: maybe we should pass along a second argument, the preceding |
1092 * FIXME: maybe we should pass along a second argument, the preceding |
897 * text buffer, because that is how this originally worked, even though |
1093 * text buffer, because that is how this originally worked, even though |
899 */ |
1095 */ |
900 |
1096 |
901 static GstSubParseFormat |
1097 static GstSubParseFormat |
902 gst_sub_parse_data_format_autodetect (gchar * match_str) |
1098 gst_sub_parse_data_format_autodetect (gchar * match_str) |
903 { |
1099 { |
904 static gboolean need_init_regexps = TRUE; |
|
905 static regex_t mdvd_rx; |
|
906 static regex_t subrip_rx; |
|
907 guint n1, n2, n3; |
1100 guint n1, n2, n3; |
908 |
1101 |
909 /* initialize the regexps used the first time around */ |
1102 static GOnce mdvd_rx_once = G_ONCE_INIT; |
910 if (need_init_regexps) { |
1103 static GOnce subrip_rx_once = G_ONCE_INIT; |
911 int err; |
1104 |
912 char errstr[128]; |
1105 GRegex *mdvd_grx; |
913 |
1106 GRegex *subrip_grx; |
914 need_init_regexps = FALSE; |
1107 |
915 if ((err = regcomp (&mdvd_rx, "^\\{[0-9]+\\}\\{[0-9]+\\}", |
1108 g_once (&mdvd_rx_once, |
916 REG_EXTENDED | REG_NEWLINE | REG_NOSUB) != 0) || |
1109 (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, |
917 (err = regcomp (&subrip_rx, "^[0-9]([0-9]){0,3}(\x0d)?\x0a" |
1110 (gpointer) GST_SUB_PARSE_REGEX_MDVDSUB); |
918 "[0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9]{3}" |
1111 g_once (&subrip_rx_once, |
919 " --> [0-9][0-9]:[0-9][0-9]:[0-9][0-9],[0-9]{3}", |
1112 (GThreadFunc) gst_sub_parse_data_format_autodetect_regex_once, |
920 REG_EXTENDED | REG_NEWLINE | REG_NOSUB)) != 0) { |
1113 (gpointer) GST_SUB_PARSE_REGEX_SUBRIP); |
921 regerror (err, &subrip_rx, errstr, 127); |
1114 |
922 GST_WARNING ("Compilation of subrip regex failed: %s", errstr); |
1115 mdvd_grx = (GRegex *) mdvd_rx_once.retval; |
923 } |
1116 subrip_grx = (GRegex *) subrip_rx_once.retval; |
924 } |
1117 |
925 |
1118 if (g_regex_match (mdvd_grx, match_str, 0, NULL) == TRUE) { |
926 if (regexec (&mdvd_rx, match_str, 0, NULL, 0) == 0) { |
|
927 GST_LOG ("MicroDVD (frame based) format detected"); |
1119 GST_LOG ("MicroDVD (frame based) format detected"); |
928 return GST_SUB_PARSE_FORMAT_MDVDSUB; |
1120 return GST_SUB_PARSE_FORMAT_MDVDSUB; |
929 } |
1121 } |
930 if (regexec (&subrip_rx, match_str, 0, NULL, 0) == 0) { |
1122 if (g_regex_match (subrip_grx, match_str, 0, NULL) == TRUE) { |
931 GST_LOG ("SubRip (time based) format detected"); |
1123 GST_LOG ("SubRip (time based) format detected"); |
932 return GST_SUB_PARSE_FORMAT_SUBRIP; |
1124 return GST_SUB_PARSE_FORMAT_SUBRIP; |
933 } |
1125 } |
934 |
1126 |
935 if (!strncmp (match_str, "FORMAT=TIME", 11)) { |
1127 if (!strncmp (match_str, "FORMAT=TIME", 11)) { |
936 GST_LOG ("MPSub (time based) format detected"); |
1128 GST_LOG ("MPSub (time based) format detected"); |
937 return GST_SUB_PARSE_FORMAT_MPSUB; |
1129 return GST_SUB_PARSE_FORMAT_MPSUB; |
938 } |
1130 } |
|
1131 #ifndef GST_DISABLE_XML |
939 if (strstr (match_str, "<SAMI>") != NULL || |
1132 if (strstr (match_str, "<SAMI>") != NULL || |
940 strstr (match_str, "<sami>") != NULL) { |
1133 strstr (match_str, "<sami>") != NULL) { |
941 GST_LOG ("SAMI (time based) format detected"); |
1134 GST_LOG ("SAMI (time based) format detected"); |
942 return GST_SUB_PARSE_FORMAT_SAMI; |
1135 return GST_SUB_PARSE_FORMAT_SAMI; |
943 } |
1136 } |
|
1137 #endif |
944 /* we're boldly assuming the first subtitle appears within the first hour */ |
1138 /* we're boldly assuming the first subtitle appears within the first hour */ |
945 if (sscanf (match_str, "0:%02u:%02u:", &n1, &n2) == 2 || |
1139 if (sscanf (match_str, "0:%02u:%02u:", &n1, &n2) == 2 || |
946 sscanf (match_str, "0:%02u:%02u=", &n1, &n2) == 2 || |
1140 sscanf (match_str, "0:%02u:%02u=", &n1, &n2) == 2 || |
947 sscanf (match_str, "00:%02u:%02u:", &n1, &n2) == 2 || |
1141 sscanf (match_str, "00:%02u:%02u:", &n1, &n2) == 2 || |
948 sscanf (match_str, "00:%02u:%02u=", &n1, &n2) == 2 || |
1142 sscanf (match_str, "00:%02u:%02u=", &n1, &n2) == 2 || |
967 gst_sub_parse_format_autodetect (GstSubParse * self) |
1161 gst_sub_parse_format_autodetect (GstSubParse * self) |
968 { |
1162 { |
969 gchar *data; |
1163 gchar *data; |
970 GstSubParseFormat format; |
1164 GstSubParseFormat format; |
971 |
1165 |
972 if (strlen (self->textbuf->str) < 35) { |
1166 if (strlen (self->textbuf->str) < 30) { |
973 GST_DEBUG ("File too small to be a subtitles file"); |
1167 GST_DEBUG ("File too small to be a subtitles file"); |
974 return NULL; |
1168 return NULL; |
975 } |
1169 } |
976 |
1170 |
977 data = g_strndup (self->textbuf->str, 35); |
1171 data = g_strndup (self->textbuf->str, 35); |
978 format = gst_sub_parse_data_format_autodetect (data); |
1172 format = gst_sub_parse_data_format_autodetect (data); |
979 g_free (data); |
1173 g_free (data); |
980 |
1174 |
981 self->parser_type = format; |
1175 self->parser_type = format; |
|
1176 self->subtitle_codec = gst_sub_parse_get_format_description (format); |
982 parser_state_init (&self->state); |
1177 parser_state_init (&self->state); |
983 |
1178 |
984 switch (format) { |
1179 switch (format) { |
985 case GST_SUB_PARSE_FORMAT_MDVDSUB: |
1180 case GST_SUB_PARSE_FORMAT_MDVDSUB: |
986 self->parse_line = parse_mdvdsub; |
1181 self->parse_line = parse_mdvdsub; |
989 self->parse_line = parse_subrip; |
1184 self->parse_line = parse_subrip; |
990 return gst_caps_new_simple ("text/x-pango-markup", NULL); |
1185 return gst_caps_new_simple ("text/x-pango-markup", NULL); |
991 case GST_SUB_PARSE_FORMAT_MPSUB: |
1186 case GST_SUB_PARSE_FORMAT_MPSUB: |
992 self->parse_line = parse_mpsub; |
1187 self->parse_line = parse_mpsub; |
993 return gst_caps_new_simple ("text/plain", NULL); |
1188 return gst_caps_new_simple ("text/plain", NULL); |
|
1189 #ifndef GST_DISABLE_XML |
994 case GST_SUB_PARSE_FORMAT_SAMI: |
1190 case GST_SUB_PARSE_FORMAT_SAMI: |
995 self->parse_line = parse_sami; |
1191 self->parse_line = parse_sami; |
996 sami_context_init (&self->state); |
1192 sami_context_init (&self->state); |
997 return gst_caps_new_simple ("text/x-pango-markup", NULL); |
1193 return gst_caps_new_simple ("text/x-pango-markup", NULL); |
|
1194 #endif |
998 case GST_SUB_PARSE_FORMAT_TMPLAYER: |
1195 case GST_SUB_PARSE_FORMAT_TMPLAYER: |
999 self->parse_line = parse_tmplayer; |
1196 self->parse_line = parse_tmplayer; |
|
1197 self->state.max_duration = 5 * GST_SECOND; |
1000 return gst_caps_new_simple ("text/plain", NULL); |
1198 return gst_caps_new_simple ("text/plain", NULL); |
1001 case GST_SUB_PARSE_FORMAT_MPL2: |
1199 case GST_SUB_PARSE_FORMAT_MPL2: |
1002 self->parse_line = parse_mpl2; |
1200 self->parse_line = parse_mpl2; |
1003 return gst_caps_new_simple ("text/x-pango-markup", NULL); |
1201 return gst_caps_new_simple ("text/x-pango-markup", NULL); |
1004 case GST_SUB_PARSE_FORMAT_SUBVIEWER: |
1202 case GST_SUB_PARSE_FORMAT_SUBVIEWER: |
1014 } |
1212 } |
1015 |
1213 |
1016 static void |
1214 static void |
1017 feed_textbuf (GstSubParse * self, GstBuffer * buf) |
1215 feed_textbuf (GstSubParse * self, GstBuffer * buf) |
1018 { |
1216 { |
1019 if (GST_BUFFER_OFFSET (buf) != self->offset) { |
1217 gboolean discont; |
|
1218 gsize consumed; |
|
1219 gchar *input = NULL; |
|
1220 |
|
1221 discont = GST_BUFFER_IS_DISCONT (buf); |
|
1222 |
|
1223 if (GST_BUFFER_OFFSET_IS_VALID (buf) && |
|
1224 GST_BUFFER_OFFSET (buf) != self->offset) { |
|
1225 self->offset = GST_BUFFER_OFFSET (buf); |
|
1226 discont = TRUE; |
|
1227 } |
|
1228 |
|
1229 if (discont) { |
|
1230 GST_INFO ("discontinuity"); |
1020 /* flush the parser state */ |
1231 /* flush the parser state */ |
1021 parser_state_init (&self->state); |
1232 parser_state_init (&self->state); |
1022 g_string_truncate (self->textbuf, 0); |
1233 g_string_truncate (self->textbuf, 0); |
|
1234 gst_adapter_clear (self->adapter); |
|
1235 #ifndef GST_DISABLE_XML |
1023 sami_context_reset (&self->state); |
1236 sami_context_reset (&self->state); |
1024 } |
1237 #endif |
1025 |
1238 /* we could set a flag to make sure that the next buffer we push out also |
1026 self->textbuf = g_string_append_len (self->textbuf, |
1239 * has the DISCONT flag set, but there's no point really given that it's |
1027 (gchar *) GST_BUFFER_DATA (buf), GST_BUFFER_SIZE (buf)); |
1240 * subtitles which are discontinuous by nature. */ |
|
1241 } |
|
1242 |
1028 self->offset = GST_BUFFER_OFFSET (buf) + GST_BUFFER_SIZE (buf); |
1243 self->offset = GST_BUFFER_OFFSET (buf) + GST_BUFFER_SIZE (buf); |
1029 self->next_offset = self->offset; |
1244 self->next_offset = self->offset; |
1030 |
1245 |
1031 gst_buffer_unref (buf); |
1246 gst_adapter_push (self->adapter, buf); |
|
1247 |
|
1248 input = |
|
1249 convert_encoding (self, (const gchar *) gst_adapter_peek (self->adapter, |
|
1250 gst_adapter_available (self->adapter)), |
|
1251 (gsize) gst_adapter_available (self->adapter), &consumed); |
|
1252 |
|
1253 if (input && consumed > 0) { |
|
1254 self->textbuf = g_string_append (self->textbuf, input); |
|
1255 gst_adapter_flush (self->adapter, consumed); |
|
1256 } |
|
1257 |
|
1258 g_free (input); |
1032 } |
1259 } |
1033 |
1260 |
1034 static GstFlowReturn |
1261 static GstFlowReturn |
1035 handle_buffer (GstSubParse * self, GstBuffer * buf) |
1262 handle_buffer (GstSubParse * self, GstBuffer * buf) |
1036 { |
1263 { |
1037 GstFlowReturn ret = GST_FLOW_OK; |
1264 GstFlowReturn ret = GST_FLOW_OK; |
1038 GstCaps *caps = NULL; |
1265 GstCaps *caps = NULL; |
1039 gchar *line, *subtitle; |
1266 gchar *line, *subtitle; |
|
1267 |
|
1268 if (self->first_buffer) { |
|
1269 self->detected_encoding = |
|
1270 detect_encoding ((gchar *) GST_BUFFER_DATA (buf), |
|
1271 GST_BUFFER_SIZE (buf)); |
|
1272 self->first_buffer = FALSE; |
|
1273 } |
1040 |
1274 |
1041 feed_textbuf (self, buf); |
1275 feed_textbuf (self, buf); |
1042 |
1276 |
1043 /* make sure we know the format */ |
1277 /* make sure we know the format */ |
1044 if (G_UNLIKELY (self->parser_type == GST_SUB_PARSE_FORMAT_UNKNOWN)) { |
1278 if (G_UNLIKELY (self->parser_type == GST_SUB_PARSE_FORMAT_UNKNOWN)) { |
1048 if (!gst_pad_set_caps (self->srcpad, caps)) { |
1282 if (!gst_pad_set_caps (self->srcpad, caps)) { |
1049 gst_caps_unref (caps); |
1283 gst_caps_unref (caps); |
1050 return GST_FLOW_UNEXPECTED; |
1284 return GST_FLOW_UNEXPECTED; |
1051 } |
1285 } |
1052 gst_caps_unref (caps); |
1286 gst_caps_unref (caps); |
1053 } |
1287 |
1054 |
1288 /* push tags */ |
1055 while ((line = get_next_line (self)) && !self->flushing) { |
1289 if (self->subtitle_codec != NULL) { |
|
1290 GstTagList *tags; |
|
1291 |
|
1292 tags = gst_tag_list_new (); |
|
1293 gst_tag_list_add (tags, GST_TAG_MERGE_APPEND, GST_TAG_SUBTITLE_CODEC, |
|
1294 self->subtitle_codec, NULL); |
|
1295 gst_element_found_tags_for_pad (GST_ELEMENT (self), self->srcpad, tags); |
|
1296 } |
|
1297 } |
|
1298 |
|
1299 while (!self->flushing && (line = get_next_line (self))) { |
|
1300 guint offset = 0; |
|
1301 |
1056 /* Set segment on our parser state machine */ |
1302 /* Set segment on our parser state machine */ |
1057 self->state.segment = &self->segment; |
1303 self->state.segment = &self->segment; |
1058 /* Now parse the line, out of segment lines will just return NULL */ |
1304 /* Now parse the line, out of segment lines will just return NULL */ |
1059 GST_LOG_OBJECT (self, "Parsing line '%s'", line); |
1305 GST_LOG_OBJECT (self, "Parsing line '%s'", line + offset); |
1060 subtitle = self->parse_line (&self->state, line); |
1306 subtitle = self->parse_line (&self->state, line + offset); |
1061 g_free (line); |
1307 g_free (line); |
1062 |
1308 |
1063 if (subtitle) { |
1309 if (subtitle) { |
1064 guint subtitle_len = strlen (subtitle); |
1310 guint subtitle_len = strlen (subtitle); |
1065 |
1311 |
1073 memcpy (GST_BUFFER_DATA (buf), subtitle, subtitle_len + 1); |
1319 memcpy (GST_BUFFER_DATA (buf), subtitle, subtitle_len + 1); |
1074 GST_BUFFER_SIZE (buf) = subtitle_len; |
1320 GST_BUFFER_SIZE (buf) = subtitle_len; |
1075 GST_BUFFER_TIMESTAMP (buf) = self->state.start_time; |
1321 GST_BUFFER_TIMESTAMP (buf) = self->state.start_time; |
1076 GST_BUFFER_DURATION (buf) = self->state.duration; |
1322 GST_BUFFER_DURATION (buf) = self->state.duration; |
1077 |
1323 |
|
1324 /* in some cases (e.g. tmplayer) we can only determine the duration |
|
1325 * of a text chunk from the timestamp of the next text chunk; in those |
|
1326 * cases, we probably want to limit the duration to something |
|
1327 * reasonable, so we don't end up showing some text for e.g. 40 seconds |
|
1328 * just because nothing else is being said during that time */ |
|
1329 if (self->state.max_duration > 0 && GST_BUFFER_DURATION_IS_VALID (buf)) { |
|
1330 if (GST_BUFFER_DURATION (buf) > self->state.max_duration) |
|
1331 GST_BUFFER_DURATION (buf) = self->state.max_duration; |
|
1332 } |
|
1333 |
1078 gst_segment_set_last_stop (&self->segment, GST_FORMAT_TIME, |
1334 gst_segment_set_last_stop (&self->segment, GST_FORMAT_TIME, |
1079 self->state.start_time); |
1335 self->state.start_time); |
1080 |
1336 |
1081 GST_DEBUG_OBJECT (self, "Sending text '%s', %" GST_TIME_FORMAT " + %" |
1337 GST_DEBUG_OBJECT (self, "Sending text '%s', %" GST_TIME_FORMAT " + %" |
1082 GST_TIME_FORMAT, subtitle, GST_TIME_ARGS (self->state.start_time), |
1338 GST_TIME_FORMAT, subtitle, GST_TIME_ARGS (self->state.start_time), |
1083 GST_TIME_ARGS (self->state.duration)); |
1339 GST_TIME_ARGS (self->state.duration)); |
1084 |
1340 |
1085 ret = gst_pad_push (self->srcpad, buf); |
1341 ret = gst_pad_push (self->srcpad, buf); |
1086 } |
1342 } |
1087 |
1343 |
|
1344 /* move this forward (the tmplayer parser needs this) */ |
|
1345 if (self->state.duration != GST_CLOCK_TIME_NONE) |
|
1346 self->state.start_time += self->state.duration; |
|
1347 |
1088 g_free (subtitle); |
1348 g_free (subtitle); |
1089 subtitle = NULL; |
1349 subtitle = NULL; |
1090 |
1350 |
1091 if (ret != GST_FLOW_OK) { |
1351 if (ret != GST_FLOW_OK) { |
1092 GST_DEBUG_OBJECT (self, "flow: %s", gst_flow_get_name (ret)); |
1352 GST_DEBUG_OBJECT (self, "flow: %s", gst_flow_get_name (ret)); |
1106 |
1366 |
1107 self = GST_SUBPARSE (GST_PAD_PARENT (sinkpad)); |
1367 self = GST_SUBPARSE (GST_PAD_PARENT (sinkpad)); |
1108 |
1368 |
1109 /* Push newsegment if needed */ |
1369 /* Push newsegment if needed */ |
1110 if (self->need_segment) { |
1370 if (self->need_segment) { |
1111 #ifndef __SYMBIAN32__ |
|
1112 GST_LOG_OBJECT (self, "pushing newsegment event with %" GST_SEGMENT_FORMAT, |
1371 GST_LOG_OBJECT (self, "pushing newsegment event with %" GST_SEGMENT_FORMAT, |
1113 &self->segment); |
1372 &self->segment); |
1114 #endif |
1373 |
1115 gst_pad_push_event (self->srcpad, gst_event_new_new_segment (FALSE, |
1374 gst_pad_push_event (self->srcpad, gst_event_new_new_segment (FALSE, |
1116 self->segment.rate, self->segment.format, |
1375 self->segment.rate, self->segment.format, |
1117 self->segment.last_stop, self->segment.stop, self->segment.time)); |
1376 self->segment.last_stop, self->segment.stop, self->segment.time)); |
1118 self->need_segment = FALSE; |
1377 self->need_segment = FALSE; |
1119 } |
1378 } |
1134 switch (GST_EVENT_TYPE (event)) { |
1393 switch (GST_EVENT_TYPE (event)) { |
1135 case GST_EVENT_EOS:{ |
1394 case GST_EVENT_EOS:{ |
1136 /* Make sure the last subrip chunk is pushed out even |
1395 /* Make sure the last subrip chunk is pushed out even |
1137 * if the file does not have an empty line at the end */ |
1396 * if the file does not have an empty line at the end */ |
1138 if (self->parser_type == GST_SUB_PARSE_FORMAT_SUBRIP || |
1397 if (self->parser_type == GST_SUB_PARSE_FORMAT_SUBRIP || |
|
1398 self->parser_type == GST_SUB_PARSE_FORMAT_TMPLAYER || |
1139 self->parser_type == GST_SUB_PARSE_FORMAT_MPL2) { |
1399 self->parser_type == GST_SUB_PARSE_FORMAT_MPL2) { |
1140 GstBuffer *buf = gst_buffer_new_and_alloc (1 + 1); |
1400 GstBuffer *buf = gst_buffer_new_and_alloc (2 + 1); |
1141 |
1401 |
1142 GST_DEBUG ("EOS. Pushing remaining text (if any)"); |
1402 GST_DEBUG ("EOS. Pushing remaining text (if any)"); |
1143 GST_BUFFER_DATA (buf)[0] = '\n'; |
1403 GST_BUFFER_DATA (buf)[0] = '\n'; |
1144 GST_BUFFER_DATA (buf)[1] = '\0'; /* play it safe */ |
1404 GST_BUFFER_DATA (buf)[1] = '\n'; |
1145 GST_BUFFER_SIZE (buf) = 1; |
1405 GST_BUFFER_DATA (buf)[2] = '\0'; /* play it safe */ |
|
1406 GST_BUFFER_SIZE (buf) = 2; |
1146 GST_BUFFER_OFFSET (buf) = self->offset; |
1407 GST_BUFFER_OFFSET (buf) = self->offset; |
1147 gst_sub_parse_chain (pad, buf); |
1408 gst_sub_parse_chain (pad, buf); |
1148 } |
1409 } |
1149 ret = gst_pad_event_default (pad, event); |
1410 ret = gst_pad_event_default (pad, event); |
1150 break; |
1411 break; |
1245 |
1510 |
1246 /* FIXME 0.11: these caps are ugly, use app/x-subtitle + type field or so; |
1511 /* FIXME 0.11: these caps are ugly, use app/x-subtitle + type field or so; |
1247 * also, give different subtitle formats really different types */ |
1512 * also, give different subtitle formats really different types */ |
1248 static GstStaticCaps mpl2_caps = |
1513 static GstStaticCaps mpl2_caps = |
1249 GST_STATIC_CAPS ("application/x-subtitle-mpl2"); |
1514 GST_STATIC_CAPS ("application/x-subtitle-mpl2"); |
|
1515 #define SUB_CAPS (gst_static_caps_get (&sub_caps)) |
|
1516 |
1250 static GstStaticCaps tmp_caps = |
1517 static GstStaticCaps tmp_caps = |
1251 GST_STATIC_CAPS ("application/x-subtitle-tmplayer"); |
1518 GST_STATIC_CAPS ("application/x-subtitle-tmplayer"); |
|
1519 #define TMP_CAPS (gst_static_caps_get (&tmp_caps)) |
|
1520 |
|
1521 static GstStaticCaps sub_caps = GST_STATIC_CAPS ("application/x-subtitle"); |
|
1522 #define MPL2_CAPS (gst_static_caps_get (&mpl2_caps)) |
|
1523 |
|
1524 #ifndef GST_DISABLE_XML |
1252 static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami"); |
1525 static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami"); |
1253 static GstStaticCaps sub_caps = GST_STATIC_CAPS ("application/x-subtitle"); |
|
1254 |
|
1255 #define SUB_CAPS (gst_static_caps_get (&sub_caps)) |
|
1256 #define SAMI_CAPS (gst_static_caps_get (&smi_caps)) |
1526 #define SAMI_CAPS (gst_static_caps_get (&smi_caps)) |
1257 #define TMP_CAPS (gst_static_caps_get (&tmp_caps)) |
1527 #endif |
1258 #define MPL2_CAPS (gst_static_caps_get (&mpl2_caps)) |
|
1259 |
1528 |
1260 static void |
1529 static void |
1261 gst_subparse_type_find (GstTypeFind * tf, gpointer private) |
1530 gst_subparse_type_find (GstTypeFind * tf, gpointer private) |
1262 { |
1531 { |
1263 GstSubParseFormat format; |
1532 GstSubParseFormat format; |
1264 const guint8 *data; |
1533 const guint8 *data; |
1265 GstCaps *caps; |
1534 GstCaps *caps; |
1266 gchar *str; |
1535 gchar *str; |
1267 |
1536 gchar *encoding = NULL; |
1268 if (!(data = gst_type_find_peek (tf, 0, 36))) |
1537 const gchar *end; |
|
1538 |
|
1539 if (!(data = gst_type_find_peek (tf, 0, 129))) |
1269 return; |
1540 return; |
1270 |
1541 |
1271 /* make sure string passed to _autodetect() is NUL-terminated */ |
1542 /* make sure string passed to _autodetect() is NUL-terminated */ |
1272 str = g_strndup ((gchar *) data, 35); |
1543 str = g_malloc0 (129); |
|
1544 memcpy (str, data, 128); |
|
1545 |
|
1546 if ((encoding = detect_encoding (str, 128)) != NULL) { |
|
1547 gchar *converted_str; |
|
1548 GError *err = NULL; |
|
1549 gsize tmp; |
|
1550 |
|
1551 converted_str = gst_convert_to_utf8 (str, 128, encoding, &tmp, &err); |
|
1552 if (converted_str == NULL) { |
|
1553 GST_DEBUG ("Encoding '%s' detected but conversion failed: %s", encoding, |
|
1554 err->message); |
|
1555 g_error_free (err); |
|
1556 g_free (encoding); |
|
1557 } else { |
|
1558 g_free (str); |
|
1559 str = converted_str; |
|
1560 g_free (encoding); |
|
1561 } |
|
1562 } |
|
1563 |
|
1564 /* Check if at least the first 120 chars are valid UTF8, |
|
1565 * otherwise convert as always */ |
|
1566 if (!g_utf8_validate (str, 128, &end) && (end - str) < 120) { |
|
1567 gchar *converted_str; |
|
1568 GError *err = NULL; |
|
1569 gsize tmp; |
|
1570 const gchar *enc; |
|
1571 |
|
1572 enc = g_getenv ("GST_SUBTITLE_ENCODING"); |
|
1573 if (enc == NULL || *enc == '\0') { |
|
1574 /* if local encoding is UTF-8 and no encoding specified |
|
1575 * via the environment variable, assume ISO-8859-15 */ |
|
1576 if (g_get_charset (&enc)) { |
|
1577 enc = "ISO-8859-15"; |
|
1578 } |
|
1579 } |
|
1580 converted_str = gst_convert_to_utf8 (str, 128, enc, &tmp, &err); |
|
1581 if (converted_str == NULL) { |
|
1582 GST_DEBUG ("Charset conversion failed: %s", err->message); |
|
1583 g_error_free (err); |
|
1584 g_free (str); |
|
1585 return; |
|
1586 } else { |
|
1587 g_free (str); |
|
1588 str = converted_str; |
|
1589 } |
|
1590 } |
|
1591 |
1273 format = gst_sub_parse_data_format_autodetect (str); |
1592 format = gst_sub_parse_data_format_autodetect (str); |
1274 g_free (str); |
1593 g_free (str); |
1275 |
1594 |
1276 switch (format) { |
1595 switch (format) { |
1277 case GST_SUB_PARSE_FORMAT_MDVDSUB: |
1596 case GST_SUB_PARSE_FORMAT_MDVDSUB: |