File size: 594 Bytes
a53dc0a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
# detector/utils.py

from PyPDF2 import PdfReader
import docx

async def extract_text_from_file(file) -> str:
    """Extract plain text from an uploaded PDF, DOCX, or TXT file.

    The format is chosen from the (case-insensitive) filename extension.

    Args:
        file: Upload-like object exposing ``filename``, a binary
            file object as ``file``, and an awaitable ``read()``
            returning bytes.

    Returns:
        The extracted text. For PDFs, the text of each page joined by
        newlines (a page yielding no text contributes an empty string).
        For DOCX, the text of each paragraph joined by newlines. For
        TXT, the UTF-8 decoded content with any leading byte-order
        mark removed.

    Raises:
        ValueError: If the filename is missing or its extension is not
            ``.pdf``, ``.docx``, or ``.txt``.
        UnicodeDecodeError: If a ``.txt`` upload is not valid UTF-8.
    """
    # A missing filename is treated as an unsupported type rather than
    # crashing with AttributeError on ``None.lower()``.
    filename = (file.filename or "").lower()

    if filename.endswith(".pdf"):
        reader = PdfReader(file.file)
        # extract_text() may give None/empty for a page; normalise to "".
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    if filename.endswith(".docx"):
        document = docx.Document(file.file)
        return "\n".join(para.text for para in document.paragraphs)

    if filename.endswith(".txt"):
        raw = await file.read()
        # "utf-8-sig" drops a leading BOM if present, so the returned text
        # does not start with a stray "\ufeff"; BOM-less input decodes
        # exactly as with plain "utf-8".
        return raw.decode("utf-8-sig")

    raise ValueError("Unsupported file type.")