net/aws/transcriber: translate: optional experimental translation tokenization

This commit adds an optional experimental translation tokenization feature. It can be activated using the `translation_src_%u` pads property `tokenization-method`. For the moment, the feature is deactivated by default. The Translate ws accepts '<span></span>' tags in the input and adds matching tags in the output. When an 'id' is also provided as an attribute of the 'span', the matching output tag also uses this 'id'. In the context of closed captions, the 'id's are of little use. However, we can take advantage of the spans in the output to identify translation chunks, which more or less reflect the rhythm of the input transcript. This commit adds simple spans (no 'id') to the input Transcript Items and parses the resulting spans in the translated output, assigning the timestamps and durations sequentially from the input Transcript Items. Edge cases such as the absence of spans or nested spans were observed and are handled here. Similarly, mismatches between the number of input and output items are taken care of by some sort of reconciliation. Note that this is still experimental and requires further testing. Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-rs/-/merge_requests/1109>
author: François Laignel <francois@centricular.com> 2023-03-11 18:27:51 +0300
committer: GStreamer Marge Bot <gitlab-merge-bot@gstreamer-foundation.org> 2023-03-14 16:48:32 +0300
commit: 299e25ab3c94a3b7bcdc4efa30d34e23dcb349f5 (patch)
tree: 39bbd1b484282a1f524037226abf5ceb425706aa /docs/plugins
parent: 743e97738fe44bec8843f467828a1ec2aa710d91 (diff)
1 files changed, 34 insertions, 1 deletions
diff --git a/docs/plugins/gst_plugins_cache.json b/docs/plugins/gst_plugins_cache.json
index c6e6e16bb..982e0d02b 100644
--- a/docs/plugins/gst_plugins_cache.json
+++ b/docs/plugins/gst_plugins_cache.json
@@ -650,6 +650,12 @@
                         "direction": "src",
                         "presence": "request",
                         "type": "GstTranslationSrcPad"
+                    },
+                    "translation_src_%%u": {
+                        "caps": "text/x-raw:\n         format: utf8\n",
+                        "direction": "src",
+                        "presence": "request",
+                        "type": "GstTranslationSrcPad"
                     }
                 },
                 "properties": {
@@ -773,7 +779,7 @@
                         "construct": false,
                         "construct-only": false,
                         "controllable": false,
-                        "default": "3000",
+                        "default": "5000",
                         "max": "-1",
                         "min": "0",
                         "mutable": "ready",
@@ -858,6 +864,21 @@
                     }
                 ]
             },
+            "GstAwsTranscriberTranslationTokenizationMethod": {
+                "kind": "enum",
+                "values": [
+                    {
+                        "desc": "None: don't tokenize translations",
+                        "name": "none",
+                        "value": "0"
+                    },
+                    {
+                        "desc": "Span based: insert spans in the transcript text and use the resulting spans in the translations to reproduce speech pacing.",
+                        "name": "span-based",
+                        "value": "1"
+                    }
+                ]
+            },
             "GstAwsTranscriberVocabularyFilterMethod": {
                 "kind": "enum",
                 "values": [
@@ -919,6 +940,18 @@
                         "readable": true,
                         "type": "gchararray",
                         "writable": true
+                    },
+                    "tokenization-method": {
+                        "blurb": "The tokenization method to apply to translations",
+                        "conditionally-available": false,
+                        "construct": false,
+                        "construct-only": false,
+                        "controllable": false,
+                        "default": "none (0)",
+                        "mutable": "ready",
+                        "readable": true,
+                        "type": "GstAwsTranscriberTranslationTokenizationMethod",
+                        "writable": true
                     }
                 }
             }
author	François Laignel <francois@centricular.com>	2023-03-11 18:27:51 +0300
committer	GStreamer Marge Bot <gitlab-merge-bot@gstreamer-foundation.org>	2023-03-14 16:48:32 +0300
commit	299e25ab3c94a3b7bcdc4efa30d34e23dcb349f5 (patch)
tree	39bbd1b484282a1f524037226abf5ceb425706aa /docs/plugins
parent	743e97738fe44bec8843f467828a1ec2aa710d91 (diff)