1
Bash script to download and search YouTube subtitles and output clickable timestamped URLs
(lemm.ee)
cross-posted from: https://lemm.ee/post/23155648
Here is the script.
#!/usr/bin/env bash
#
# Download and search YouTube subtitles, printing clickable timestamped URLs.
#
# Dependencies: yt-dlp, awk, perl, sed, and at least one of ugrep (ug),
#               ripgrep (rg) or a grep that supports -P.
# Usage:        script youtube_url
#
# The functions below communicate through script-level globals:
#   url, video_id, transcript_file, description_file, download,
#   formated_transcript_file, formated_transcript_file_CRLF
#
# errexit is deliberately not enabled: the search tools return non-zero when
# nothing matches, which must not abort the script.

#######################################
# Entry point: validate the URL, fetch subtitles/description if they are not
# already in the current directory, then prompt for a regex and search.
# Arguments: $1 - video URL (exactly one argument is accepted)
# Returns:   0 on success, 1 on bad usage or when no transcript is available
#######################################
main() {
  if (($# != 1)); then
    printf 'Usage: %s youtube_url\n' "${0##*/}" >&2
    return 1
  fi
  url=$1

  check_if_url || return 1
  get_video_id || return 1
  search_for_downloaded_matching_files
  set_download_boolean_flag
  download_subs
  read_and_format_transcript_file || return 1
  echo_description_file
  user_search
}

#######################################
# Verify that $url is a single https:// URL without whitespace.
# Globals: url (read)
# Returns: 0 if valid, 1 (with a message on stderr) otherwise
#######################################
check_if_url() {
  local regex='^https://[^[:space:]]+$'
  if ! [[ $url =~ $regex ]]; then
    printf 'Invalid input. Valid input is a url matching regex %s\n' \
      "$regex" >&2
    return 1
  fi
}

#######################################
# Extract the video id from $url.
# Recognises "...?v=ID", "https://youtu.be/ID" and "/shorts|embed|live/ID".
# Globals: url (read), video_id (written)
# Returns: 0 if an id was found, 1 otherwise. An empty id must not be used:
#          the file globs below would then match every subtitle file.
#######################################
get_video_id() {
  local watch_re='[?&]v=([^&#]+)'
  local short_re='^https://youtu\.be/([^/?&#]+)'
  local path_re='^https://[^/]+/(shorts|embed|live)/([^/?&#]+)'

  video_id=""
  if [[ $url =~ $watch_re ]]; then
    video_id=${BASH_REMATCH[1]}
  elif [[ $url =~ $short_re ]]; then
    video_id=${BASH_REMATCH[1]}
  elif [[ $url =~ $path_re ]]; then
    video_id=${BASH_REMATCH[2]}
  fi

  if [[ -z $video_id ]]; then
    printf 'Could not extract a video id from %s\n' "$url" >&2
    return 1
  fi
}

#######################################
# Print the most recently modified file in $PWD whose name contains the
# video id and ends in the given extension; print nothing if none exists.
# Uses a glob instead of parsing ls output.
# Globals:   video_id (read)
# Arguments: $1 - file extension without the leading dot
#######################################
_newest_file_for_video() {
  local extension=$1
  local candidate newest=""

  for candidate in "$PWD"/*"${video_id}"*."${extension}"; do
    # An unmatched glob stays literal; skip anything that is not a file.
    [[ -f $candidate ]] || continue
    if [[ -z $newest || $candidate -nt $newest ]]; then
      newest=$candidate
    fi
  done
  printf '%s' "$newest"
}

#######################################
# Locate already-downloaded subtitle and description files for the video.
# Globals: video_id (read), transcript_file, description_file (written;
#          empty when no matching file exists)
#######################################
search_for_downloaded_matching_files() {
  transcript_file=$(_newest_file_for_video vtt)
  description_file=$(_newest_file_for_video description)
}

#######################################
# Decide whether a download is needed.
# Globals: transcript_file, description_file (read)
#          download (written: 1 = download, 0 = both files already present)
#######################################
set_download_boolean_flag() {
  if [[ -n $transcript_file && -n $description_file ]]; then
    download=0 # FALSE
  else
    download=1 # TRUE
  fi
}

#######################################
# Download auto-generated subs, uploaded English subs and the description
# with yt-dlp when $download is 1, then look the files up again.
# Each download is best effort: a failure is reported but does not stop the
# others, since a video may only offer one kind of subtitle.
# Globals: download, url (read); transcript_file, description_file (written)
#######################################
download_subs() {
  if [[ $download -eq 1 ]]; then
    yt-dlp --restrict-filenames --write-auto-sub --skip-download "${url}" ||
      printf 'warning: downloading auto-generated subtitles failed\n' >&2
    # 'en.*' is the form shown in the yt-dlp documentation; it covers
    # en, en-US, ... and still matches the previously requested "eng".
    yt-dlp --restrict-filenames --sub-langs='en.*' --write-subs \
      --skip-download "${url}" ||
      printf 'warning: downloading uploaded subtitles failed\n' >&2
    yt-dlp --restrict-filenames --write-description --skip-download "${url}" ||
      printf 'warning: downloading the description failed\n' >&2

    # Search files again since they were just downloaded
    search_for_downloaded_matching_files
  fi
}

#######################################
# Read the .vtt transcript and turn every cue timestamp line into a
# watch URL with a &t=<seconds>s offset.
# Globals: transcript_file, video_id (read)
#          formated_transcript_file, formated_transcript_file_CRLF (written)
# Returns: 0 on success, 1 if there is no readable transcript or a tool fails
#######################################
read_and_format_transcript_file() {
  if [[ -z $transcript_file || ! -r $transcript_file ]]; then
    printf 'No subtitle (.vtt) file found for video id %s\n' "$video_id" >&2
    return 1
  fi

  # Slurp the file and delete every three-line group that starts with a line
  # beginning with two digits and whose third line contains a </c> tag
  # (presumably the duplicated rolling cues of auto-generated captions).
  local perl_removed_dupes
  perl_removed_dupes="$(
    perl -0777 -pe 's/^\d\d.*\n.*\n.*<\/c>//gm' <"${transcript_file}"
  )" || return 1

  local prefix="https://www.youtube.com/watch?v=${video_id}&t="
  local suffix="s"
  # Links start this many seconds before the cue (presumably to give some
  # lead-in context); the result is clamped so it never goes below zero.
  local lead_in_seconds=3

  formated_transcript_file="$(
    awk -v pre="$prefix" -v suf="$suffix" -v lead="$lead_in_seconds" '
      /^([0-9]{2}:){2}[0-9]{2}\.[0-9]{3}/ {
        split($1, a, /[:.]/)
        start = int(a[1] * 3600 + a[2] * 60 + a[3]) - lead
        if (start < 0) start = 0
        $1 = pre start suf
        sub(/ --> [0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{3}/, "")
        sub(/ align:start position:0%$/, "")
        print
        next
      }
      {
        sub(/ align:start position:0%$/, "")
        print
      }
    ' <<<"${perl_removed_dupes}"
  )" || return 1

  #CRLF for ugrep to avoid ?bug? where before lines are not all outputted
  # %s (not %b) so backslashes in the caption text are kept verbatim.
  formated_transcript_file_CRLF=$(
    printf '%s\n' "$formated_transcript_file" | sed $'s/$/\r/'
  )
}

#######################################
# Print the video description, or a note on stderr if none was downloaded.
# Globals: description_file, video_id (read)
#######################################
echo_description_file() {
  if [[ -n $description_file && -r $description_file ]]; then
    cat -- "${description_file}"
  else
    printf 'No description file found for video id %s\n' "$video_id" >&2
  fi
}

#######################################
# Prompt for a regex and search the formatted transcript with every
# available tool: ugrep and/or ripgrep, falling back to grep -P when
# neither is installed.
# Globals: formated_transcript_file, formated_transcript_file_CRLF (read)
# Returns: 0 after searching (whether or not anything matched),
#          1 if no search term could be read
#######################################
user_search() {
  local search_term
  local searches_run=0

  printf '\n\n\n'
  read -rp "Enter regex (read as raw input): " search_term || return 1

  if command -v ug >/dev/null 2>&1; then
    printf '\n\n\n\n\n'
    echo "Ugrep output"
    ug --pretty=never -B2 -A1 -i -Z+-~1 -e "${search_term}" \
      --andnot "^https?:\/\/" <<<"$formated_transcript_file_CRLF"
    searches_run=$((searches_run + 1))
  fi

  if command -v rg >/dev/null 2>&1; then
    printf '\n\n\n\n\n'
    echo "Ripgrep output"
    rg -iP -B2 -A7 "^(?!https?:\/\/).*\K${search_term}" \
      <<<"$formated_transcript_file"
    searches_run=$((searches_run + 1))
  fi

  if ((searches_run == 0)); then
    printf '\n\n\n\n\n'
    echo "Grep output"
    # -e keeps a term that starts with '-' from being parsed as an option.
    grep -iP -B2 -A1 -e "${search_term}" <<<"$formated_transcript_file"
    printf '\n\n\n'
    echo "Consider installing ripgrep and ugrep for better search"
  fi

  # A search without matches is not a script failure.
  return 0
}

main "$@"
If needed, I would pay $5 per month in perpetuity for access to Firefox. Fuck Google.