Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
|
@@ -3,9 +3,9 @@ from fastapi.responses import HTMLResponse
|
|
| 3 |
from fastapi.staticfiles import StaticFiles
|
| 4 |
from transformers import pipeline
|
| 5 |
import textwrap
|
| 6 |
-
import fitz # PyMuPDF for
|
| 7 |
from docx import Document
|
| 8 |
-
import openpyxl # For Excel
|
| 9 |
from pptx import Presentation
|
| 10 |
from fastapi.middleware.cors import CORSMiddleware
|
| 11 |
from functools import lru_cache
|
|
@@ -15,23 +15,23 @@ from io import BytesIO
|
|
| 15 |
# Initialize FastAPI app
|
| 16 |
app = FastAPI()
|
| 17 |
|
| 18 |
-
# Enable CORS
|
| 19 |
app.add_middleware(
|
| 20 |
CORSMiddleware,
|
| 21 |
-
allow_origins=["*"],
|
| 22 |
allow_credentials=True,
|
| 23 |
-
allow_methods=["*"],
|
| 24 |
-
allow_headers=["*"],
|
| 25 |
)
|
| 26 |
|
| 27 |
-
#
|
| 28 |
STATIC_DIR = "static"
|
| 29 |
|
| 30 |
-
# Ensure the
|
| 31 |
if not os.path.exists(STATIC_DIR):
|
| 32 |
os.makedirs(STATIC_DIR)
|
| 33 |
|
| 34 |
-
#
|
| 35 |
app.mount("/static", StaticFiles(directory=STATIC_DIR, html=True), name="static")
|
| 36 |
|
| 37 |
@app.get("/", response_class=HTMLResponse)
|
|
@@ -43,7 +43,7 @@ async def read_root():
|
|
| 43 |
except FileNotFoundError:
|
| 44 |
raise HTTPException(status_code=404, detail="index.html not found in static folder.")
|
| 45 |
|
| 46 |
-
# Supported
|
| 47 |
LANGUAGE_CODES = {
|
| 48 |
"Anglais": "en",
|
| 49 |
"Francais": "fr",
|
|
@@ -61,7 +61,7 @@ AVAILABLE_MODELS = {
|
|
| 61 |
"en-es": "Helsinki-NLP/opus-mt-en-es",
|
| 62 |
}
|
| 63 |
|
| 64 |
-
# Cache models
|
| 65 |
@lru_cache(maxsize=10)
|
| 66 |
def load_translator(src_code: str, tgt_code: str):
|
| 67 |
model_key = f"{src_code}-{tgt_code}"
|
|
@@ -78,31 +78,29 @@ def load_translator(src_code: str, tgt_code: str):
|
|
| 78 |
else:
|
| 79 |
raise ValueError(f"No model available for {src_code} -> {tgt_code}")
|
| 80 |
|
| 81 |
-
#
|
| 82 |
def chunk_text(text, max_length=400):
|
| 83 |
return textwrap.wrap(text, max_length)
|
| 84 |
|
| 85 |
-
#
|
| 86 |
def extract_text(file: UploadFile):
|
| 87 |
try:
|
| 88 |
-
file_bytes = file.file.read()
|
| 89 |
-
file_stream = BytesIO(file_bytes)
|
| 90 |
|
| 91 |
if file.filename.endswith(".txt"):
|
| 92 |
return file_bytes.decode("utf-8")
|
| 93 |
|
| 94 |
elif file.filename.endswith(".pdf"):
|
| 95 |
-
doc = fitz.open(stream=
|
| 96 |
return "\n".join([page.get_text() for page in doc])
|
| 97 |
|
| 98 |
elif file.filename.endswith(".docx"):
|
| 99 |
-
file_stream.seek(0) # Reset cursor position
|
| 100 |
doc = Document(file_stream)
|
| 101 |
return "\n".join([para.text for para in doc.paragraphs])
|
| 102 |
|
| 103 |
elif file.filename.endswith(".xlsx"):
|
| 104 |
-
|
| 105 |
-
wb = openpyxl.load_workbook(file_stream, data_only=True)
|
| 106 |
text = ""
|
| 107 |
for sheet in wb.sheetnames:
|
| 108 |
ws = wb[sheet]
|
|
@@ -111,7 +109,6 @@ def extract_text(file: UploadFile):
|
|
| 111 |
return text
|
| 112 |
|
| 113 |
elif file.filename.endswith(".pptx"):
|
| 114 |
-
file_stream.seek(0)
|
| 115 |
prs = Presentation(file_stream)
|
| 116 |
text = ""
|
| 117 |
for slide in prs.slides:
|
|
@@ -126,7 +123,7 @@ def extract_text(file: UploadFile):
|
|
| 126 |
except Exception as e:
|
| 127 |
raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")
|
| 128 |
|
| 129 |
-
#
|
| 130 |
@app.post("/upload/")
|
| 131 |
async def upload_file(
|
| 132 |
file: UploadFile = File(...),
|
|
@@ -136,7 +133,7 @@ async def upload_file(
|
|
| 136 |
text = extract_text(file)
|
| 137 |
|
| 138 |
if not text.strip():
|
| 139 |
-
raise HTTPException(status_code=400, detail="No text extracted from file.")
|
| 140 |
|
| 141 |
src_code = LANGUAGE_CODES.get(src_lang)
|
| 142 |
tgt_code = LANGUAGE_CODES.get(tgt_lang)
|
|
@@ -145,15 +142,12 @@ async def upload_file(
|
|
| 145 |
raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")
|
| 146 |
|
| 147 |
try:
|
| 148 |
-
# Load translation model
|
| 149 |
translator = load_translator(src_code, tgt_code)
|
| 150 |
|
| 151 |
-
# If translation goes through English as an intermediate step
|
| 152 |
if isinstance(translator, tuple):
|
| 153 |
translator1, translator2 = translator
|
| 154 |
intermediate_text = "\n".join([translator1(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
|
| 155 |
translated_text = "\n".join([translator2(chunk)[0]['translation_text'] for chunk in chunk_text(intermediate_text)])
|
| 156 |
-
|
| 157 |
else:
|
| 158 |
translated_text = "\n".join([translator(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
|
| 159 |
|
|
|
|
| 3 |
from fastapi.staticfiles import StaticFiles
|
| 4 |
from transformers import pipeline
|
| 5 |
import textwrap
|
| 6 |
+
import fitz # PyMuPDF for PDF handling
|
| 7 |
from docx import Document
|
| 8 |
+
import openpyxl # For Excel
|
| 9 |
from pptx import Presentation
|
| 10 |
from fastapi.middleware.cors import CORSMiddleware
|
| 11 |
from functools import lru_cache
|
|
|
|
| 15 |
# Initialize FastAPI app
|
| 16 |
app = FastAPI()
|
| 17 |
|
| 18 |
+
# Enable CORS to allow frontend communication
|
| 19 |
app.add_middleware(
|
| 20 |
CORSMiddleware,
|
| 21 |
+
allow_origins=["*"],
|
| 22 |
allow_credentials=True,
|
| 23 |
+
allow_methods=["*"],
|
| 24 |
+
allow_headers=["*"],
|
| 25 |
)
|
| 26 |
|
| 27 |
+
# Directory for static files
|
| 28 |
STATIC_DIR = "static"
|
| 29 |
|
| 30 |
+
# Ensure the directory exists
|
| 31 |
if not os.path.exists(STATIC_DIR):
|
| 32 |
os.makedirs(STATIC_DIR)
|
| 33 |
|
| 34 |
+
# Serve static files correctly
|
| 35 |
app.mount("/static", StaticFiles(directory=STATIC_DIR, html=True), name="static")
|
| 36 |
|
| 37 |
@app.get("/", response_class=HTMLResponse)
|
|
|
|
| 43 |
except FileNotFoundError:
|
| 44 |
raise HTTPException(status_code=404, detail="index.html not found in static folder.")
|
| 45 |
|
| 46 |
+
# Supported languages
|
| 47 |
LANGUAGE_CODES = {
|
| 48 |
"Anglais": "en",
|
| 49 |
"Francais": "fr",
|
|
|
|
| 61 |
"en-es": "Helsinki-NLP/opus-mt-en-es",
|
| 62 |
}
|
| 63 |
|
| 64 |
+
# Cache models for better performance
|
| 65 |
@lru_cache(maxsize=10)
|
| 66 |
def load_translator(src_code: str, tgt_code: str):
|
| 67 |
model_key = f"{src_code}-{tgt_code}"
|
|
|
|
| 78 |
else:
|
| 79 |
raise ValueError(f"No model available for {src_code} -> {tgt_code}")
|
| 80 |
|
| 81 |
+
# Function to split text into chunks
|
| 82 |
def chunk_text(text, max_length=400):
|
| 83 |
return textwrap.wrap(text, max_length)
|
| 84 |
|
| 85 |
+
# Function to extract text from files
|
| 86 |
def extract_text(file: UploadFile):
|
| 87 |
try:
|
| 88 |
+
file_bytes = file.file.read()
|
| 89 |
+
file_stream = BytesIO(file_bytes)
|
| 90 |
|
| 91 |
if file.filename.endswith(".txt"):
|
| 92 |
return file_bytes.decode("utf-8")
|
| 93 |
|
| 94 |
elif file.filename.endswith(".pdf"):
|
| 95 |
+
doc = fitz.open(stream=file_bytes, filetype="pdf")
|
| 96 |
return "\n".join([page.get_text() for page in doc])
|
| 97 |
|
| 98 |
elif file.filename.endswith(".docx"):
|
|
|
|
| 99 |
doc = Document(file_stream)
|
| 100 |
return "\n".join([para.text for para in doc.paragraphs])
|
| 101 |
|
| 102 |
elif file.filename.endswith(".xlsx"):
|
| 103 |
+
wb = openpyxl.load_workbook(file_stream)
|
|
|
|
| 104 |
text = ""
|
| 105 |
for sheet in wb.sheetnames:
|
| 106 |
ws = wb[sheet]
|
|
|
|
| 109 |
return text
|
| 110 |
|
| 111 |
elif file.filename.endswith(".pptx"):
|
|
|
|
| 112 |
prs = Presentation(file_stream)
|
| 113 |
text = ""
|
| 114 |
for slide in prs.slides:
|
|
|
|
| 123 |
except Exception as e:
|
| 124 |
raise HTTPException(status_code=500, detail=f"Error extracting text: {str(e)}")
|
| 125 |
|
| 126 |
+
# Correctly defined POST route for file upload
|
| 127 |
@app.post("/upload/")
|
| 128 |
async def upload_file(
|
| 129 |
file: UploadFile = File(...),
|
|
|
|
| 133 |
text = extract_text(file)
|
| 134 |
|
| 135 |
if not text.strip():
|
| 136 |
+
raise HTTPException(status_code=400, detail="No text extracted from the file.")
|
| 137 |
|
| 138 |
src_code = LANGUAGE_CODES.get(src_lang)
|
| 139 |
tgt_code = LANGUAGE_CODES.get(tgt_lang)
|
|
|
|
| 142 |
raise HTTPException(status_code=400, detail=f"Unsupported language: {src_lang} -> {tgt_lang}")
|
| 143 |
|
| 144 |
try:
|
|
|
|
| 145 |
translator = load_translator(src_code, tgt_code)
|
| 146 |
|
|
|
|
| 147 |
if isinstance(translator, tuple):
|
| 148 |
translator1, translator2 = translator
|
| 149 |
intermediate_text = "\n".join([translator1(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
|
| 150 |
translated_text = "\n".join([translator2(chunk)[0]['translation_text'] for chunk in chunk_text(intermediate_text)])
|
|
|
|
| 151 |
else:
|
| 152 |
translated_text = "\n".join([translator(chunk)[0]['translation_text'] for chunk in chunk_text(text)])
|
| 153 |
|