diff options
Diffstat (limited to '')
| -rw-r--r-- | roles/caption/tasks/main.yml | 25 | ||||
| -rwxr-xr-x | roles/caption/templates/caption.sh | 18 | ||||
| -rwxr-xr-x | roles/caption/templates/process-captions.py | 231 | 
3 files changed, 253 insertions, 21 deletions
diff --git a/roles/caption/tasks/main.yml b/roles/caption/tasks/main.yml index 7bf45d0..6396339 100644 --- a/roles/caption/tasks/main.yml +++ b/roles/caption/tasks/main.yml @@ -3,12 +3,31 @@      name:        - python3-pip        - ffmpeg +      - cmake +      - jq  - name: Install whisper    ansible.builtin.pip:      name: git+https://github.com/openai/whisper.git +- name: Install lhotse +  ansible.builtin.pip: +    name: +      - lhotse +      - webvtt-py +      - tqdm +      - torchaudio +      - num2words  - name: Copy the shell script -  tags: caption-sh +  tags: process-captions    template: -    src: caption.sh -    dest: current +    src: process-captions.py +    dest: /data/emacsconf/{{ emacsconf_year }}/process-captions.py      mode: 0755 +    owner: sachac +    group: org +- name: Copy talks.json +  tags: talks-json +  template: +    src: talks.json +    dest: /data/emacsconf/{{ emacsconf_year }}/talks.json +    owner: sachac +    group: org diff --git a/roles/caption/templates/caption.sh b/roles/caption/templates/caption.sh deleted file mode 100755 index 9600a3c..0000000 --- a/roles/caption/templates/caption.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# {{ ansible_managed }} -FILE="$1" -MODEL="${2:small}" -AUDIO=$(basename "$FILE" | sed s/\\.[a-z][a-z][a-z][a-z]?$//).ogg -if [[ ! -f $AUDIO ]]; then -  if  [[ "$FILE" == *webm ]]; then -      ffmpeg -y -i "$FILE" -acodec copy -vn $AUDIO -  else -      ffmpeg -y -i "$FILE" -acodec libvorbis -vn $AUDIO -  fi -fi -date > $AUDIO-$MODEL.log -time whisper $AUDIO --model $MODEL --threads 12 >> $AUDIO-$MODEL.log -for EXT in vtt txt srt; do -    mv $AUDIO.$EXT $(basename -s .webm.$EXT $AUDIO.$EXT) -done -date >> $AUDIO-$MODEL.log diff --git a/roles/caption/templates/process-captions.py b/roles/caption/templates/process-captions.py new file mode 100755 index 0000000..6ad890a --- /dev/null +++ b/roles/caption/templates/process-captions.py @@ -0,0 +1,231 @@ +#!/usr/bin/python3 +"""Use OpenAI Whisper to automatically generate captions for the video files in the specified directory.""" + +# {{ ansible_managed }} + +# The MIT License (MIT) +# Copyright © 2022 Sacha Chua <sacha@sachachua.com> + +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation files +# (the “Software”), to deal in the Software without restriction, +# including without limitation the rights to use, copy, modify, merge, +# publish, distribute, sublicense, and/or sell copies of the Software, +# and to permit persons to whom the Software is furnished to do so, +# subject to the following conditions: + +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +from collections import defaultdict +import subprocess +import datetime +import sys +import webvtt +import xml.etree.ElementTree as ET +from lhotse import RecordingSet, Recording, AudioSource, SupervisionSegment, SupervisionSet, create_cut_set_eager, align_with_torchaudio, CutSet, annotate_with_whisper +from tqdm import tqdm +import whisper +import re +import os +import json +import torch + +THREADS = 12 +VIDEO_REGEXP = '\.(webm|mov)$' +AUDIO_REGEXP = '\.(ogg|opus)$' +ALWAYS = False +TRIM_AUDIO = False +MODEL = os.environ.get('MODEL', 'large')  # Set to tiny for testing +JSON_FILE = '/data/emacsconf/2022/talks.json' + +def get_slug_from_filename(filename): +    m = re.search('emacsconf-[0-9]+-([a-z]+)--', filename) +    if m: +        return m.group(1) +    else: +        return os.path.basename(os.path.dirname(filename)) +         +def get_files_to_work_on(directory): +    """Return the list of audio files to work on. +    The specified directory is checked recursively. +    Skip any videos that already have caption files. +     +    Convert any videos that don't already have audio files, and return the audio files instead. +    When there are multiple videos and audio files for a talk, pick one. +    """ +    info = defaultdict(lambda: {}, {}) +    directory = os.path.expanduser(directory) +    for folder, subs, files in os.walk(directory): +        for filename in files: +            f = os.path.join(folder, filename) +            slug = get_slug_from_filename(f) +            info[slug]['slug'] = slug +            if re.search(AUDIO_REGEXP, filename): +                info[slug]['audio'] = f +            elif re.search(VIDEO_REGEXP, filename):  +                info[slug]['video'] = f +            elif re.search('vtt$', filename):  +                info[slug]['vtt'] = f +            elif re.search('srv2$', filename):  +                info[slug]['srv2'] = f +    needs_work = [] +    if JSON_FILE: +        with open(JSON_FILE) as f: +            talks = json.load(f)['talks'] +    for key, val in info.items(): +        if not 'video' in val and not 'audio' in val: continue +        if talks: +            talk = next(filter(lambda talk: talk['slug'] == val['slug'], talks), None) +        if talk: +            val['base'] = os.path.join(os.path.dirname(val['video'] or val['audio']), +                                       base_name(talk['video-slug'])) +        else: +            val['base'] = os.path.join(os.path.dirname(val['video'] or val['audio']), +                                       base_name(val['video'] or val['audio'])) +        if ALWAYS or (not 'vtt' in val or not 'srv2' in val): +            if not 'audio' in val and 'video' in val: +                # No audio, need to convert it +                val = extract_audio(val) +            needs_work.append(val) +    return needs_work + +def extract_audio(work): +    output = subprocess.check_output(['ffprobe', video_file], stderr=subprocess.STDOUT) +    extension = 'opus' +    if 'Audio: vorbis' in output.decode(): +        extension = 'ogg' +    new_file = os.path.join(os.path.dirname(video_file), base_name(video_file) + '.' + extension) +    acodec = 'copy' if re.search('webm$', video_file) else 'libopus' +    log("Extracting audio from %s acodec %s" % (video_file, acodec)) +    output = subprocess.check_output(['ffmpeg', '-y', '-i', video_file, '-acodec', acodec, '-vn', new_file], stderr=subprocess.STDOUT) +    work['audio'] = new_file +    return work + +def to_sec(time_str): +    "Convert a WebVTT time into seconds." +    h, m, s, ms = re.split('[\\.:]', time_str) +    return int(h) * 3600 + int(m) * 60 + int(s) + (int(ms) / 1000) + +def log(s): +    print(datetime.datetime.now(), s) + +def clean_up_timestamps(result): +    segs = list(result['segments']) +    seg_len = len(segs) +    for i, seg in enumerate(segs[:-1]): +        seg['end'] = min(segs[i + 1]['start'] - 0.001, seg['end']) +    result['segments'] = segs +    return result +     +def generate_captions(work): +    """Generate a VTT file based on the audio file.""" +    log("Generating captions") +    new_file = work['base'] + '.vtt' +    model = whisper.load_model(MODEL, device="cuda" if torch.cuda.is_available() else "cpu") +    audio = whisper.load_audio(work['audio']) +    if TRIM_AUDIO: +        audio = whisper.pad_or_trim(audio) +    result = model.transcribe(audio, verbose=True) +    result = clean_up_timestamps(result) +    with open(new_file, 'w') as vtt: +        whisper.utils.write_vtt(result['segments'], file=vtt) +    work['vtt'] = new_file +    if 'srv2' in work: del work['srv2'] +    return work +     +def generate_srv2(work): +    """Generate a SRV2 file.""" +    log("Generating SRV2") +    recs = RecordingSet.from_recordings([Recording.from_file(work['audio'])]) +    rec_id = recs[0].id +    captions = [] +    for i, caption in enumerate(webvtt.read(work['vtt'])): +        if TRIM_AUDIO and i > 2: break +        captions.append(SupervisionSegment(id=rec_id + '-sup' + '%05d' % i, channel=recs[0].channel_ids[0], recording_id=rec_id, start=to_sec(caption.start), duration=to_sec(caption.end) - to_sec(caption.start), text=caption.text, language='English')) +    sups = SupervisionSet.from_segments(captions) +    main = CutSet.from_manifests(recordings=recs, supervisions=sups) +    work['cuts'] = main.trim_to_supervisions(keep_overlapping=False,keep_all_channels=True) +    cuts_aligned = align_with_torchaudio(work['cuts']) +    root = ET.Element("timedtext") +    doc = ET.SubElement(root, "window") +    for line, aligned in enumerate(cuts_aligned): +        # Numbers are weird  +        words = re.split(' ', captions[line].text) +        tokenized_words = [re.sub('[^\'A-Z0-9]', '', w.upper()) for w in words] +        if len(aligned.supervisions) == 0: +            print(captions[line], aligned) +            continue +        aligned_words = list(aligned.supervisions[0].alignment['word']) +        aligned_index = 0 +        aligned_len = len(aligned_words) +        word_index = 0 +        word_len = len(words) +        while word_index < word_len and aligned_index < aligned_len: +            # log("Choosing %s %s" % (words[word_index], aligned_words[aligned_index].symbol)) +            ET.SubElement(doc, 'text', +                              t=str(float(aligned_words[aligned_index].start)*1000), +                              d=str(float(aligned_words[aligned_index].duration)*1000), +                              w="1", +                              append="1").text = words[word_index] +            if tokenized_words[word_index] != aligned_words[aligned_index].symbol and word_index < word_len - 1: +                # Scan ahead for a word that maches the next word, but don't go too far +                cur_aligned = aligned_index +                while aligned_index < aligned_len and aligned_index < cur_aligned + 5 and aligned_words[aligned_index].symbol != tokenized_words[word_index + 1]: +                    log("Sliding to match %s %d %s" % (tokenized_words[word_index + 1], aligned_index, aligned_words[aligned_index].symbol)) +                    aligned_index = aligned_index + 1 +                if not aligned_words[aligned_index].symbol == tokenized_words[word_index + 1]: +                    log("Resetting, couldn't find") +                    aligned_index = cur_aligned + 1 +            else: +                aligned_index = aligned_index + 1 +            word_index = word_index + 1 +    tree = ET.ElementTree(root) +    work['srv2'] = work['base'] + '.srv2' +    with open(work['srv2'], "w") as f: +        tree.write(f.buffer) +    return work + +def base_name(s): +    """ +    Return the base name of file so that we can add extensions to it. +    Remove tokens like --normalized, --recoded, etc. +    Make sure the filename has either --main or --questions. +    """ +    s = os.path.basename(s) +    type = 'questions' if '--questions.' in s else 'main' +    if TRIM_AUDIO: +        type = 'test' +    match = re.match('^(emacsconf-[0-9]+-[a-z]+--.*?--.*?)(--|\.)', s) +    if (match): +        return match.group(1) + '--' + type +    else: +        return os.path.splitext(s)[0] + '--' + type +# assert(base_name('/home/sachac/current/sqlite/emacsconf-2022-sqlite--using-sqlite-as-a-data-source-a-framework-and-an-example--andrew-hyatt--normalized.webm.vtt') == 'emacsconf-2022-sqlite--using-sqlite-as-a-data-source-a-framework-and-an-example--andrew-hyatt--main') + +log(f"MODEL {MODEL} ALWAYS {ALWAYS} TRIM_AUDIO {TRIM_AUDIO}") +directory = sys.argv[1] if len(sys.argv) > 1 else "~/current" +needs_work = get_files_to_work_on(directory) +if THREADS > 0: +    torch.set_num_threads(THREADS) +for work in needs_work: +    log("Started processing %s" % work['base']) +    if work['audio']: +        if ALWAYS or not 'vtt' in work: +            work = generate_captions(work) +        if ALWAYS or not 'srv2' in work: +            work = generate_srv2(work) +#     print("Aligning words", audio_file, datetime.datetime.now()) +#     word_cuts = align_words(cuts) +#     convert_cuts_to_word_timing(audio_file, word_cuts) +    log("Done %s" % str(work['base'])) +      | 
