recap_gen/subtitles/parser.py

19 lines
457 B
Python

import pysrt
import pandas as pd
def parse_srt_to_df(filepath: str, encoding: str = "cp1251") -> pd.DataFrame:
    """Load an SRT subtitle file into a DataFrame with one row per cue.

    Line breaks inside a cue are replaced with single spaces and the result
    is stripped; cues whose text is empty after that are dropped.

    Args:
        filepath: Path to the ``.srt`` file, passed to ``pysrt.open``.
        encoding: Text encoding passed to ``pysrt.open``.

    Returns:
        A DataFrame with the columns ``start``, ``end`` and ``text``, in that
        order. ``start`` and ``end`` hold the values returned by the cue
        timestamps' ``to_time()`` (presumably ``datetime.time`` objects --
        confirm against the pysrt version in use). The three columns are
        present even when no cue survives filtering.
    """
    subs = pysrt.open(filepath, encoding=encoding)

    rows = []
    for sub in subs:
        text = sub.text.replace("\n", " ").strip()
        if not text:
            # Skip cues that carry no visible text.
            continue
        rows.append(
            {
                "start": sub.start.to_time(),
                "end": sub.end.to_time(),
                "text": text,
            }
        )

    # Explicit columns keep the schema stable for empty input: without them
    # pd.DataFrame([]) has no columns and df["text"] raises KeyError.
    return pd.DataFrame(rows, columns=["start", "end", "text"])