diff --git a/files/models/media.py b/files/models/media.py
index 01303875..0e101a21 100644
--- a/files/models/media.py
+++ b/files/models/media.py
@@ -357,6 +357,10 @@ class Media(models.Model):
             a_tags,
             b_tags,
         ]
+
+        for subtitle in self.subtitles.all():
+            items.append(subtitle.subtitle_text)
+
         items = [item for item in items if item]
         text = " ".join(items)
         text = " ".join([token for token in text.lower().split(" ") if token not in STOP_WORDS])
diff --git a/files/models/subtitle.py b/files/models/subtitle.py
index b671b286..998c7dbd 100644
--- a/files/models/subtitle.py
+++ b/files/models/subtitle.py
@@ -1,6 +1,7 @@
 import os
 import tempfile
 
+import pysubs2
 from django.conf import settings
 from django.db import models
 from django.urls import reverse
@@ -73,6 +74,25 @@ class Subtitle(models.Model):
                 raise Exception("Could not convert to srt")
         return True
 
+    @property
+    def subtitle_text(self):
+        """Return the subtitle file's text flattened into one string.
+
+        Loads ``subtitle_file`` with pysubs2 (decoded as UTF-8), joins the
+        text of every line with spaces, replaces line-break markers,
+        hyphens and periods with spaces, and normalizes whitespace. Errors
+        raised while reading or parsing the file propagate to the caller.
+        """
+        sub = pysubs2.load(self.subtitle_file.path, encoding="utf-8")
+        text = " ".join(line.text for line in sub)
+        # "\\N" is the two-character backslash-N sequence, not a newline.
+        for separator in ("\\N", "-", "."):
+            text = text.replace(separator, " ")
+
+        # split()/join collapses whitespace runs of any length; a single
+        # str.replace pass would leave runs of three or more spaces behind.
+        return " ".join(text.split())
+
 
 class TranscriptionRequest(models.Model):
     # Whisper transcription request