from fastapi import FastAPI
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

app = FastAPI()

@app.get("/")
def greet_json():
    # Load model and processor from Hugging Face
    print("Loading model and processor...")
    processor = TrOCRProcessor.from_pretrained('tjoab/latex_finetuned')
    model = VisionEncoderDecoderModel.from_pretrained('tjoab/latex_finetuned')

    # Load all images as a batch
    sample_image = open_PIL_image("sample.png")

    # Preprocess the images 
    preproc_image = processor.image_processor(images=[sample_image], return_tensors="pt").pixel_values

    # Generate and decode the tokens
    # NOTE: max_length default value is very small, which often results in truncated inference if not set 
    pred_ids = model.generate(preproc_image, max_length=128)
    latex_preds = processor.batch_decode(pred_ids, skip_special_tokens=True)
    return {"message": "Success", "latex_preds": latex_preds}


# Helper funtion (path to either JPEG or PNG)
def open_PIL_image(image_path: str) -> Image.Image:
  image = Image.open(image_path)
  if image_path.split('.')[-1].lower() == 'png':
      image = Image.composite(image, PIL.Image.new('RGB', image.size, 'white'), image)
  return image