recap_gen/subtitles/parser.py

24 lines
595 B
Python

import pysrt
import pandas as pd
import chardet
from pathlib import Path
# Number of leading bytes handed to chardet for the first (cheap) guess.
_ENCODING_SAMPLE_BYTES = 4096


def _detect_encoding(raw: bytes):
    """Guess the text encoding of *raw*, returning a codec name or None.

    The first guess uses only a short prefix of the data. A prefix made of
    plain ASCII (timestamps, English lines) says nothing about the rest of
    the file, so in that case detection is repeated on the full payload.
    If the answer is still ASCII it is widened to UTF-8, which decodes every
    ASCII file identically. None (chardet could not decide) is passed
    through unchanged so the caller keeps its previous behaviour.
    """

    def looks_ascii(name) -> bool:
        return name is not None and name.lower() == "ascii"

    sample = raw[:_ENCODING_SAMPLE_BYTES]
    guess = chardet.detect(sample)["encoding"]
    if looks_ascii(guess) and len(raw) > len(sample):
        guess = chardet.detect(raw)["encoding"]
    if looks_ascii(guess):
        guess = "utf-8"
    return guess


def parse_srt_to_df(filepath: str) -> pd.DataFrame:
    """Load an SRT subtitle file into a DataFrame with one row per cue.

    Args:
        filepath: Path to the ``.srt`` file.

    Returns:
        A DataFrame with the columns ``start``, ``end`` and ``text``.
        ``start`` and ``end`` hold the values returned by the cue's
        ``to_time()``; ``text`` is the cue text with newlines replaced by
        spaces and surrounding whitespace stripped. Cues whose text is empty
        after that clean-up are skipped. The three columns are present even
        when no cue survives, so callers can always select them.
    """
    print(f"Загружаю {filepath}")

    raw = Path(filepath).read_bytes()
    enc = _detect_encoding(raw)
    subs = pysrt.open(filepath, encoding=enc)

    rows = []
    for sub in subs:
        text = sub.text.replace('\n', ' ').strip()
        if not text:
            continue
        rows.append({
            "start": sub.start.to_time(),
            "end": sub.end.to_time(),
            "text": text,
        })

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(rows, columns=["start", "end", "text"])