From 0b07963c81155b621dd45b878b869a78b8c9de49 Mon Sep 17 00:00:00 2001 From: Sacha Chua Date: Sat, 5 Nov 2022 07:55:05 -0400 Subject: Caption updates --- roles/caption/tasks/main.yml | 39 ++++++++++++++++++------ roles/caption/templates/process-captions.py | 18 +++++++---- roles/caption/templates/process-prerec.sh | 18 +++++++++++ roles/caption/templates/reencode.sh | 43 +++++++++++++++++++++++++++ roles/caption/templates/run-aeneas.sh | 14 +++++++++ roles/caption/templates/update-task-status.sh | 10 +++++++ roles/caption/templates/upload.sh | 6 ++++ 7 files changed, 133 insertions(+), 15 deletions(-) create mode 100755 roles/caption/templates/process-prerec.sh create mode 100755 roles/caption/templates/reencode.sh create mode 100755 roles/caption/templates/run-aeneas.sh create mode 100755 roles/caption/templates/update-task-status.sh create mode 100755 roles/caption/templates/upload.sh diff --git a/roles/caption/tasks/main.yml b/roles/caption/tasks/main.yml index c1511cf..a69d848 100644 --- a/roles/caption/tasks/main.yml +++ b/roles/caption/tasks/main.yml @@ -30,20 +30,41 @@ state: present - name: Ensure the directory exists file: - path: "{{ emacsconf_caption_dir }}" + path: "{{ emacsconf_caption_dir }}/scripts" state: directory -- name: Copy the script for processing the files - tags: process-captions + owner: "{{ emacsconf_user }}" + group: "{{ emacsconf_group }}" +- name: Recreate encoding script + tags: process-prerec template: - src: process-captions.py - dest: "{{ emacsconf_caption_dir }}/process-captions.py" + src: "{{ item }}" + dest: "{{ emacsconf_caption_dir }}/scripts/{{ item }}" + owner: "{{ emacsconf_user }}" + group: "{{ emacsconf_group }}" + force: no mode: 0775 -- name: Copy the inotify script - tags: process-captions + loop: + - reencode.sh +- name: Copy scripts for processing + tags: process-prerec template: - src: inotify-process-captions.sh - dest: "{{ emacsconf_caption_dir }}/inotify-process-captions.sh" + src: "{{ item }}" + dest: "{{ emacsconf_caption_dir }}/scripts/{{ item }}" + owner: "{{ emacsconf_user }}" + group: "{{ emacsconf_group }}" mode: 0775 + loop: + - process-captions.py + - process-prerec.sh + - update-task-status.sh + - upload.sh + - run-aeneas.sh +# - name: Copy the inotify script +# tags: process-captions +# template: +# src: inotify-process-captions.sh +# dest: "{{ emacsconf_caption_dir }}/inotify-process-captions.sh" +# mode: 0775 - name: Copy talks.json tags: talks-json template: diff --git a/roles/caption/templates/process-captions.py b/roles/caption/templates/process-captions.py index 223531b..1b6515c 100755 --- a/roles/caption/templates/process-captions.py +++ b/roles/caption/templates/process-captions.py @@ -81,6 +81,8 @@ def get_files_to_work_on(directory): info[slug]['vtt'] = f elif re.search('srv2$', filename): info[slug]['srv2'] = f + elif re.search('txt$', filename): + info[slug]['txt'] = f needs_work = [] if JSON_FILE: with open(JSON_FILE) as f: @@ -108,10 +110,11 @@ def extract_audio(work): if 'Audio: vorbis' in output.decode(): extension = 'ogg' new_file = work['base'] + '.' + extension - acodec = 'copy' if re.search('\\.(webm|mp4|mkv)$', work['video']) else 'libopus' + acodec = 'copy' if re.search('\\.webm$', work['video']) else 'libopus' log("Extracting audio from %s acodec %s" % (work['video'], acodec)) output = subprocess.check_output(['ffmpeg', '-y', '-i', work['video'], '-acodec', acodec, '-vn', new_file], stderr=subprocess.STDOUT) work['audio'] = new_file + subprocess.call(["/data/emacsconf/2022/scripts/upload.sh", work['audio']]) return work def to_sec(time_str): @@ -142,18 +145,21 @@ def generate_captions(work): result = clean_up_timestamps(result) with open(new_file, 'w') as vtt: whisper.utils.write_vtt(result['segments'], file=vtt) - with open(work['base'] + '.txt') as txt: + with open(work['base'] + '.txt', 'w') as txt: whisper.utils.write_txt(result['segments'], file=txt) work['vtt'] = new_file + work['txt'] = work['base'] + '.txt' + subprocess.call(["/data/emacsconf/2022/scripts/upload.sh", work['vtt'], work['txt']]) if 'srv2' in work: del work['srv2'] return work def generate_text(work): - with open(work['base'] + '.txt') as txt: + with open(work['base'] + '.txt', 'w') as txt: for i, caption in enumerate(webvtt.read(work['vtt'])): - txt.write(caption.text) - work['text'] = work['base'] + '.txt' - + txt.write(caption.text + "\n") + work['txt'] = work['base'] + '.txt' + return work + def generate_srv2(work): """Generate a SRV2 file.""" log("Generating SRV2") diff --git a/roles/caption/templates/process-prerec.sh b/roles/caption/templates/process-prerec.sh new file mode 100755 index 0000000..e49aa72 --- /dev/null +++ b/roles/caption/templates/process-prerec.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# {{ ansible_managed }} + +ORIGINAL=$1 +REENCODED=$(echo "$ORIGINAL" | perl -pe 's/^(emacsconf-[0-9]*-.*?--.*?--.*?--).*/$1reencoded.webm/') +SLUG=$(echo "$ORIGINAL" | perl -ne '/^emacsconf-[0-9]*-(.*?)--/ && print $1') +MAIN=$(echo "$ORIGINAL" | perl -pe 's/^(emacsconf-[0-9]*-.*?--.*?--.*?--).*/$1main.webm/') +SCREEN=reencode-$SLUG +if ! ( screen -ls | grep -q $SLUG ); then + screen -dmS $SCREEN +fi +( cd /data/emacsconf/cache; ./update-cache ) +/data/emacsconf/2022/scripts/update-task-status.sh $SLUG "WAITING_FOR_PREREC" "PROCESSING" +#if [[ ! -f "$REENCODED" ]]; then +screen -S $SCREEN -X screen -t reencode-$SLUG /bin/bash -c "/data/emacsconf/2022/scripts/reencode.sh \"$ORIGINAL\" \"$REENCODED\" && /data/emacsconf/2022/scripts/upload.sh $REENCODED && exec /bin/bash" & +#fi +screen -S $SCREEN -X screen -t captions-$SLUG /bin/bash -c "/data/emacsconf/2022/scripts/process-captions.py $(dirname $ORIGINAL); /data/emacsconf/2022/scripts/update-task-status.sh $SLUG PROCESSING TO_ASSIGN; exec /bin/bash" +screen -x $SCREEN diff --git a/roles/caption/templates/reencode.sh b/roles/caption/templates/reencode.sh new file mode 100755 index 0000000..e3a82eb --- /dev/null +++ b/roles/caption/templates/reencode.sh @@ -0,0 +1,43 @@ +#!/usr/bin/env bash + +set -euo pipefail + +# Defaults +q=32 +cpu=4 +time_limit="" +print_only=false + +while getopts :q:c:t:s OPT; do + case $OPT in + q|+q) + q="$OPTARG" + ;; + c|+c) + cpu="$OPTARG" + ;; + t|+t) + time_limit="-to $OPTARG" + ;; + s) + print_only=true + ;; + *) + echo "usage: `basename $0` [+-q ARG] [+-c ARG} [--] ARGS..." + exit 2 + esac +done +shift `expr $OPTIND - 1` +OPTIND=1 + +command="$(cat<