diff options
| author | Sacha Chua <sacha@sachachua.com> | 2022-10-29 07:33:39 -0400 | 
|---|---|---|
| committer | Sacha Chua <sacha@sachachua.com> | 2022-10-29 07:33:39 -0400 | 
| commit | a6a373410bde6918f27992ff967660c0541f810a (patch) | |
| tree | ab8fbd6a5e12f065a7af82027b98df818dfb9603 /roles/caption | |
| parent | bff9ceae7f42723f939f620f9ef9faebca966742 (diff) | |
| download | emacsconf-ansible-a6a373410bde6918f27992ff967660c0541f810a.tar.xz emacsconf-ansible-a6a373410bde6918f27992ff967660c0541f810a.zip | |
media, wiki-publish, caption updates
* common-playbook.yml: Start moving publishing setup to res
* group_vars/all.yml: New common variables.
* inventory.yml (prod): Number of threads, new hosts
* roles/caption/templates/process-captions.py: Add mp4, try to figure
out why script was failing, simplify
* roles/media/tasks/main.yml: New role for setting up
media.emacsconf.org for this year
* roles/prerec/templates/reencode.sh: Keep a copy of zaeph's script
* roles/stream/defaults/main.yml: Add more variables
* roles/stream: Restreaming lowres
* roles/wiki-publish/tasks/emacs.yml: Build Emacs from source
Diffstat (limited to 'roles/caption')
| -rw-r--r-- | roles/caption/tasks/main.yml | 4 | ||||
| -rwxr-xr-x | roles/caption/templates/process-captions.py | 84 | 
2 files changed, 34 insertions, 54 deletions
| diff --git a/roles/caption/tasks/main.yml b/roles/caption/tasks/main.yml index 7fe1570..353b83b 100644 --- a/roles/caption/tasks/main.yml +++ b/roles/caption/tasks/main.yml @@ -31,7 +31,7 @@      path: "{{ emacsconf_caption_dir }}"      state: directory  - name: Copy the script for processing the files -  tags: process-captions, wip +  tags: process-captions    template:      src: process-captions.py      dest: "{{ emacsconf_caption_dir }}/process-captions.py" @@ -58,7 +58,6 @@      group: root      mode: 0755  - name: Change the group for all the files -  tags: wip    file:      dest: "{{ emacsconf_caption_dir }}"      group: "{{ emacsconf_group }}" @@ -66,7 +65,6 @@      recurse: true  - name: Restart caption monitoring service    become: true -  tags: wip    service:      name: captions      enabled: true diff --git a/roles/caption/templates/process-captions.py b/roles/caption/templates/process-captions.py index 72e9ad2..66f39dd 100755 --- a/roles/caption/templates/process-captions.py +++ b/roles/caption/templates/process-captions.py @@ -32,16 +32,12 @@ import datetime  import sys  import webvtt  import xml.etree.ElementTree as ET -from lhotse import RecordingSet, Recording, AudioSource, SupervisionSegment, SupervisionSet, create_cut_set_eager, align_with_torchaudio, CutSet, annotate_with_whisper -from tqdm import tqdm -import whisper -import re -import os +from lhotse import RecordinRecording, AudioSource, SupervisionSegment, SupervisionSet, create_cut_set_e  import json  import torch -THREADS = 12 -VIDEO_REGEXP = '\.(webm|mov)$' +THREADS = {{ cpus }} +VIDEO_REGEXP = '\.(webm|mov|mp4)$'  AUDIO_REGEXP = '\.(ogg|opus)$'  ALWAYS = False  TRIM_AUDIO = False @@ -49,6 +45,8 @@ MODEL = os.environ.get('MODEL', 'large')  # Set to tiny for testing  WORK_DIR = "{{ emacsconf_caption_dir }}"  JSON_FILE = os.path.join(WORK_DIR, 'talks.json') +# ---------------------------------------------------------------- +  def get_slug_from_filename(filename):      m = re.search('emacsconf-[0-9]+-([a-z]+)--', filename)      if m: @@ -155,41 +153,23 @@ def generate_srv2(work):          captions.append(SupervisionSegment(id=rec_id + '-sup' + '%05d' % i, channel=recs[0].channel_ids[0], recording_id=rec_id, start=to_sec(caption.start), duration=to_sec(caption.end) - to_sec(caption.start), text=caption.text, language='English'))      sups = SupervisionSet.from_segments(captions)      main = CutSet.from_manifests(recordings=recs, supervisions=sups) -    work['cuts'] = main.trim_to_supervisions(keep_overlapping=False,keep_all_channels=True) +    work['cuts'] = main.trim_to_supervisions(keep_all_channels=True)      cuts_aligned = align_with_torchaudio(work['cuts'])      root = ET.Element("timedtext")      doc = ET.SubElement(root, "window")      for line, aligned in enumerate(cuts_aligned): -        # Numbers are weird  -        words = re.split(' ', captions[line].text) -        tokenized_words = [re.sub('[^\'A-Z0-9]', '', w.upper()) for w in words] -        if len(aligned.supervisions) == 0: -            print(captions[line], aligned) -            continue -        aligned_words = list(aligned.supervisions[0].alignment['word']) -        aligned_index = 0 -        aligned_len = len(aligned_words) -        word_index = 0 -        word_len = len(words) -        while word_index < word_len and aligned_index < aligned_len: -            # log("Choosing %s %s" % (words[word_index], aligned_words[aligned_index].symbol)) -            ET.SubElement(doc, 'text', -                              t=str(float(aligned_words[aligned_index].start)*1000), -                              d=str(float(aligned_words[aligned_index].duration)*1000), -                              w="1", -                              append="1").text = words[word_index] -            if tokenized_words[word_index] != aligned_words[aligned_index].symbol and word_index < word_len - 1: -                # Scan ahead for a word that maches the next word, but don't go too far -                cur_aligned = aligned_index -                while aligned_index < aligned_len and aligned_index < cur_aligned + 5 and aligned_words[aligned_index].symbol != tokenized_words[word_index + 1]: -                    log("Sliding to match %s %d %s" % (tokenized_words[word_index + 1], aligned_index, aligned_words[aligned_index].symbol)) -                    aligned_index = aligned_index + 1 -                if not aligned_words[aligned_index].symbol == tokenized_words[word_index + 1]: -                    log("Resetting, couldn't find") -                    aligned_index = cur_aligned + 1 -            else: -                aligned_index = aligned_index + 1 -            word_index = word_index + 1 +        if len(aligned.supervisions) > 0: +            aligned_words = aligned.supervisions[0].alignment['word'] +            for w, word in enumerate(aligned_words): +                el = ET.SubElement(doc, 'text', +                                  t=str(float(word.start)*1000), +                                  d=str(float(word.duration)*1000), +                                  w="1", +                                  append="1") +                el.text = word.symbol +                el.tail = "\n" +        else: +            print("No supervisions", aligned)      tree = ET.ElementTree(root)      work['srv2'] = work['base'] + '.srv2'      with open(work['srv2'], "w") as f: @@ -218,18 +198,20 @@ directory = sys.argv[1] if len(sys.argv) > 1 else WORK_DIR  needs_work = get_files_to_work_on(directory)  if len(needs_work) > 0: -    if THREADS > 0: -        torch.set_num_threads(THREADS) -    for work in needs_work: -        log("Started processing %s" % work['base']) -        if work['audio']: -            if ALWAYS or not 'vtt' in work: -                work = generate_captions(work) -            if ALWAYS or not 'srv2' in work: -                work = generate_srv2(work) -    #     print("Aligning words", audio_file, datetime.datetime.now()) -    #     word_cuts = align_words(cuts) -    #     convert_cuts_to_word_timing(audio_file, word_cuts) -        log("Done %s" % str(work['base'])) +    while len(needs_work) > 0: +        if THREADS > 0: +            torch.set_num_threads(THREADS) +            for work in needs_work: +                log("Started processing %s" % work['base']) +                if work['audio']: +                    if ALWAYS or not 'vtt' in work: +                        work = generate_captions(work) +                    if ALWAYS or not 'srv2' in work: +                        work = generate_srv2(work) +                        #     print("Aligning words", audio_file, datetime.datetime.now()) +                        #     word_cuts = align_words(cuts) +                        #     convert_cuts_to_word_timing(audio_file, word_cuts) +                    log("Done %s" % str(work['base'])) +            needs_work = get_files_to_work_on(directory)  else:      log("No work needed.") | 
