-
Notifications
You must be signed in to change notification settings - Fork 1
/
auto-caption.py
90 lines (78 loc) · 3.2 KB
/
auto-caption.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#!/usr/bin/env python
#
# Any copyright is dedicated to the Public Domain.
# http://creativecommons.org/publicdomain/zero/1.0/
from __future__ import print_function
import argparse
import codecs
import gi
gi.require_version('Gst', '1.0')
from gi.repository import GObject, Gst
from pycaption import SRTWriter, WebVTTWriter, CaptionSet, Caption, CaptionNode
# Enable GObject threading support before GStreamer is used.
# NOTE(review): threads_init() is believed to be a deprecated no-op on
# PyGObject >= 3.11 -- confirm before dropping it.
GObject.threads_init()
# Initialize GStreamer once at import time; None means no argv is handed in.
Gst.init(None)
def run_pipeline(url=None, hmm=None, lm=None, dict=None,
                 caption_format='webvtt', out_file=None):
    """Transcribe the audio at *url* with pocketsphinx and emit captions.

    Builds the GStreamer pipeline ``uridecodebin ! audioconvert !
    audioresample ! pocketsphinx ! fakesink``, turns every final hypothesis
    posted by the ``pocketsphinx`` element into a caption, and serializes
    the result as WebVTT or SRT.

    Args:
        url: URI of the media file to decode. Required.
        hmm: Optional value for the pocketsphinx ``hmm`` property.
        lm: Optional value for the pocketsphinx ``lm`` property.
        dict: Optional value for the pocketsphinx ``dict`` property. The
            name shadows the builtin but is part of the keyword interface
            used by the command-line entry point, so it is kept.
        caption_format: ``'srt'`` selects ``SRTWriter``; any other value
            selects ``WebVTTWriter``.
        out_file: Path that receives the UTF-8 encoded captions. When
            ``None`` the captions are printed to stdout.

    Raises:
        ValueError: If *url* is ``None``.
        RuntimeError: If the pipeline posts an error message on its bus.
    """
    if url is None:
        raise ValueError('No URL specified!')

    pipeline = Gst.parse_launch('uridecodebin name=source ! audioconvert !'
                                ' audioresample ! pocketsphinx name=asr !'
                                ' fakesink')
    pipeline.get_by_name('source').set_property('uri', url)

    # Only override the recognizer's models that the caller supplied.
    pocketsphinx = pipeline.get_by_name('asr')
    for prop, value in (('hmm', hmm), ('lm', lm), ('dict', dict)):
        if value:
            pocketsphinx.set_property(prop, value)

    bus = pipeline.get_bus()
    captions = []

    try:
        # Start playing
        pipeline.set_state(Gst.State.PLAYING)

        # Wait until error or EOS
        while True:
            try:
                msg = bus.timed_pop(Gst.CLOCK_TIME_NONE)
                if not msg:
                    continue
                if msg.type == Gst.MessageType.EOS:
                    break
                if msg.type == Gst.MessageType.ERROR:
                    # Without this the loop would block forever: a failed
                    # pipeline never delivers EOS.
                    err, debug = msg.parse_error()
                    raise RuntimeError('GStreamer pipeline error: %s (%s)'
                                       % (err.message, debug))

                struct = msg.get_structure()
                if (struct and struct.get_name() == 'pocketsphinx'
                        and struct['final']):
                    caption = Caption()
                    # Floor division keeps the microsecond values integral
                    # on both Python 2 and Python 3.
                    caption.start = struct['start_time'] // Gst.USECOND
                    caption.end = struct['end_time'] // Gst.USECOND
                    caption.nodes.append(
                        CaptionNode.create_text(struct['hypothesis']))
                    captions.append(caption)
            except KeyboardInterrupt:
                # Ask the pipeline to finish; the loop then exits on the
                # resulting EOS so captions gathered so far are still
                # written out.
                pipeline.send_event(Gst.Event.new_eos())
    finally:
        # Free resources, including when an error is being propagated.
        pipeline.set_state(Gst.State.NULL)

    cap_set = CaptionSet()
    cap_set.set_captions('en-US', captions)
    writer = SRTWriter() if caption_format == 'srt' else WebVTTWriter()
    caption_data = writer.write(cap_set)

    if out_file is not None:
        # Context manager guarantees the file is flushed and closed.
        with codecs.open(out_file, 'w', 'utf-8') as out:
            out.write(caption_data)
    else:
        print(caption_data)
if __name__ == '__main__':
    # Command-line entry point: parse the options and hand them straight
    # to run_pipeline().
    cli = argparse.ArgumentParser(description='Recognize speech from audio')
    cli.add_argument('url', help='URL to a media file')

    # The three pocketsphinx model options are plain optional strings.
    model_options = (
        ('--hmm', 'Path to a pocketsphinx HMM data directory'),
        ('--lm', 'Path to a pocketsphinx language model file'),
        ('--dict', 'Path to a pocketsphinx CMU dictionary file'),
    )
    for flag, description in model_options:
        cli.add_argument(flag, help=description)

    cli.add_argument('--caption-format', default='webvtt',
                     choices=['srt', 'webvtt'],
                     help='Format of output captions')
    cli.add_argument('--out-file', metavar='FILE',
                     help='Write captions to FILE (default is stdout)')

    # The argparse destinations (url, hmm, lm, dict, caption_format,
    # out_file) match run_pipeline's keyword parameters one-to-one.
    options = vars(cli.parse_args())
    run_pipeline(**options)