# nougat-small-onnx example




In [9]:
# Record the runtime this notebook was executed on: memory usage, CPU count, and current date/time.
!free -h
!nproc
!date

               total        used        free      shared  buff/cache   available
Mem:            12Gi       5.2Gi       4.2Gi       8.0Mi       3.3Gi       7.2Gi
Swap:             0B          0B          0B
2
Wed Nov 22 11:57:53 PM UTC 2023


In [10]:
# Hugging Face stack plus the ONNX Runtime extra of optimum.
# The extras spec is quoted so the shell cannot interpret the brackets as a glob pattern.
!pip install transformers accelerate "optimum[onnxruntime]" -U -q
# ONNX tooling (onnxruntime pinned to 1.15.1) and nougat-ocr, whose `nougat` module is imported later.
# Keep this list exact: pip treats every bare word as a package name and will install it.
!pip install onnx onnxruntime==1.15.1 nougat-ocr python-Levenshtein -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.1/247.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for l (setup.py) ... [?25l[?25hdone
  Building wheel for betterpath (setup.py) ... [?25l[?25hdone


In [2]:
# Install/upgrade optimum directly from its GitHub repository instead of a PyPI release.
# NOTE(review): no tag or commit is pinned, so the installed code depends on when this cell runs — consider pinning.
!pip install -U -q git+https://github.com/huggingface/optimum.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for optimum (pyproject.toml) ... [?25l[?25hdone


In [3]:
from optimum.onnxruntime import ORTModelForVision2Seq
from transformers import NougatProcessor

# Hub repository id; the processor and the ONNX model are both loaded from it.
model_name = 'pszemraj/nougat-small-onnx'

# Processor used later for image preprocessing, decoding and post-processing.
processor = NougatProcessor.from_pretrained(model_name)

# ONNX Runtime loading options, grouped so they are easy to find and tweak.
ort_kwargs = dict(
    provider="CPUExecutionProvider",  # 'CUDAExecutionProvider' for gpu
    use_merged=False,
    use_io_binding=True,
)
model = ORTModelForVision2Seq.from_pretrained(model_name, **ort_kwargs)

In [4]:
# Show installed packages whose `pip list` line contains "cu" — presumably to spot CUDA-specific builds
# (the filter is a plain substring match, so unrelated names can show up too).
!pip list | grep cu

cufflinks                        0.17.3
cupy-cuda11x                     11.0.0
docutils                         0.18.1
jaxlib                           0.4.20+cuda11.cudnn86
torch                            2.1.0+cu118
torchaudio                       2.1.0+cu118
torchvision                      0.16.0+cu118


In [6]:
# Remove the preinstalled cupy-cuda11x package without prompting (-y).
# NOTE(review): the reason for removing it is not recorded in this notebook — confirm it is still required.
!pip uninstall cupy-cuda11x -y

Found existing installation: cupy-cuda11x 11.0.0
Uninstalling cupy-cuda11x-11.0.0:
  Successfully uninstalled cupy-cuda11x-11.0.0


# PDF-to-Markdown inference

In [4]:
from huggingface_hub import hf_hub_download
import re
from PIL import Image
import requests
from nougat.dataset.rasterize import rasterize_paper
from transformers import NougatProcessor, VisionEncoderDecoderModel
import torch
import uuid
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [5]:
# %%
from transformers import StoppingCriteria, StoppingCriteriaList
from collections import defaultdict

class RunningVarTorch:
    """Fixed-length sliding window of 1-D tensors with a per-row variance.

    Every pushed vector becomes a new column of ``values``; once ``L`` columns
    are stored, the oldest column is dropped for each new one. ``variance``
    reduces over the column axis and, when ``norm`` is true, divides the
    result by the number of columns currently held.
    """

    def __init__(self, L=15, norm=False):
        self.values = None  # (len(x), <=L) tensor once something has been pushed
        self.L = L
        self.norm = norm

    def push(self, x: torch.Tensor):
        """Append the 1-D tensor ``x`` as the newest column of the window."""
        assert x.dim() == 1
        newest = x[:, None]
        if self.values is None:
            self.values = newest
            return
        # While the window is still filling keep everything; afterwards drop the oldest column.
        window_full = not (self.values.shape[1] < self.L)
        retained = self.values[:, 1:] if window_full else self.values
        self.values = torch.cat((retained, newest), 1)

    def variance(self):
        """Return the per-row variance of the window, or ``None`` if nothing was pushed."""
        if self.values is None:
            return None
        per_row = torch.var(self.values, 1)
        if not self.norm:
            return per_row
        return per_row / self.values.shape[1]

#@title nougat things
class StoppingCriteriaScores(StoppingCriteria):
    """Stopping criterion based on how much the per-step top scores fluctuate.

    Each call records the highest score of every batch row for the latest
    step, keeps a short running variance of those maxima (``vars``), and then
    tracks the variance of that variance over ``window_size`` steps
    (``varvars``). Rows whose second-order variance drops below ``threshold``
    are scheduled to stop; the criterion fires once every tracked row is
    marked as stopped.
    """

    def __init__(self, threshold: float = 0.015, window_size: int = 200):
        """Set up empty running statistics.

        Args:
            threshold: A row is treated as "flat" when its variance-of-variance
                is strictly below this value.
            window_size: Length of the ``varvars`` window and the minimum number
                of calls before the criterion can return True.
        """
        super().__init__()
        self.threshold = threshold
        # Window over the per-step max scores (RunningVarTorch default length, normalized variance).
        self.vars = RunningVarTorch(norm=True)
        # Window over the variances produced by ``self.vars``.
        self.varvars = RunningVarTorch(L=window_size)
        # batch row -> scheduled stop step (0 means nothing scheduled).
        self.stop_inds = defaultdict(int)
        # batch row -> whether the row is currently considered finished.
        self.stopped = defaultdict(bool)
        # Number of times __call__ has been invoked.
        self.size = 0
        self.window_size = window_size

    @torch.no_grad()
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        """Update the statistics with the latest step and report whether to stop.

        ``input_ids`` is not used. Only ``scores[-1]`` is read; it is assumed to
        be a (batch, vocab) tensor for the most recent step — TODO confirm
        against the caller. Returns True only when at least one row is tracked
        and every tracked row is marked stopped.
        """
        last_scores = scores[-1]
        # Highest score per row for this step, as a float tensor on the CPU.
        self.vars.push(last_scores.max(1)[0].float().cpu())
        self.varvars.push(self.vars.variance())
        self.size += 1
        # Never stop before ``window_size`` calls have been observed.
        if self.size < self.window_size:
            return False

        varvar = self.varvars.variance()
        for b in range(len(last_scores)):
            if varvar[b] < self.threshold:
                if self.stop_inds[b] > 0 and not self.stopped[b]:
                    # NOTE(review): stop_inds[b] was set to a value above the step count at
                    # scheduling time, so this comparison is already true on the next flat
                    # step (until size exceeds the 4095 cap) — confirm ``>=`` is intended.
                    self.stopped[b] = self.stop_inds[b] >= self.size
                else:
                    # Schedule a stop step derived from the current step count, capped at 4095.
                    self.stop_inds[b] = int(
                        min(max(self.size, 1) * 1.15 + 150 + self.window_size, 4095)
                    )
            else:
                # Scores are fluctuating again: cancel any scheduled stop for this row.
                self.stop_inds[b] = 0
                self.stopped[b] = False
        # Reading ``self.stopped[b]`` above inserts missing rows (defaultdict), so the
        # length check guards the case where no row has been touched yet.
        return all(self.stopped.values()) and len(self.stopped) > 0

In [6]:

from pathlib import Path

from tqdm.auto import tqdm


import os
import requests
import uuid
from urllib.parse import urlparse, unquote

#@title inference fns
def get_pdf(pdf_link, timeout=30):
    """Download the PDF at ``pdf_link`` into the current working directory.

    The local filename is the percent-decoded last segment of the URL path.
    If that segment is empty or does not end in ``.pdf``, a unique
    ``downloaded_paper_<hex>.pdf`` name is generated instead.

    Args:
        pdf_link: URL of the PDF to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        The absolute path (``str``) the PDF is written to. Download errors are
        reported with ``print`` instead of being raised, so after a failure the
        returned path may refer to a file that was never created.
    """
    # Parse the URL to extract the filename
    parsed_url = urlparse(pdf_link)
    filename = os.path.basename(unquote(parsed_url.path))

    # Use a unique filename if the URL doesn't contain a filename or if it's not a PDF
    if not filename or not filename.lower().endswith('.pdf'):
        filename = f"downloaded_paper_{uuid.uuid4().hex}.pdf"

    full_path = os.path.join(os.getcwd(), filename)

    try:
        # A timeout keeps a stalled server from hanging the notebook kernel indefinitely.
        response = requests.get(pdf_link, timeout=timeout)
        response.raise_for_status()  # Raise an HTTPError if the HTTP request returned an unsuccessful status code

        with open(full_path, "wb") as pdf_file:
            pdf_file.write(response.content)
        print("PDF downloaded successfully.")
    except requests.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except Exception as err:
        # Best-effort by design: timeouts, connection errors and I/O errors are reported, not raised.
        print(f"An error occurred: {err}")

    return full_path

def predict(image, fix_markdown=False):
    """Transcribe one rasterized PDF page with the module-level ``processor`` and ``model``.

    Args:
        image: Anything ``PIL.Image.open`` accepts (file path or file-like object).
        fix_markdown: Forwarded to ``processor.post_process_generation``.

    Returns:
        The post-processed text generated for the page.
    """
    # prepare PDF image for the model
    page = Image.open(image)
    pixel_values = processor(page, return_tensors="pt").pixel_values

    sdp_flags = dict(enable_flash=True, enable_math=False, enable_mem_efficient=False)
    with torch.backends.cuda.sdp_kernel(**sdp_flags):
        generation_kwargs = dict(
            min_length=1,
            max_new_tokens=3584,
            bad_words_ids=[[processor.tokenizer.unk_token_id]],
            return_dict_in_generate=True,
            output_scores=True,
            # A fresh criterion per page: it carries running statistics between steps.
            stopping_criteria=StoppingCriteriaList([StoppingCriteriaScores()]),
        )
        outputs = model.generate(pixel_values.to(model.device), **generation_kwargs)

    # First element of the generation output holds the token ids; decode the single page.
    decoded_pages = processor.batch_decode(outputs[0], skip_special_tokens=True)
    return processor.post_process_generation(decoded_pages[0], fix_markdown=fix_markdown)


def inference(pdf_file, pdf_link=None, fix_markdown=False):
    """Convert a PDF (local file or URL) to Markdown, page by page.

    Args:
        pdf_file: Path to a local PDF, or ``None`` to download ``pdf_link`` instead.
        pdf_link: URL of a PDF; only used when ``pdf_file`` is ``None``.
        fix_markdown: Forwarded to ``predict`` for every page.

    Returns:
        ``(content, out_path)``: the Markdown text and the resolved path of the
        ``output-<pdf stem>.md`` file written to the current working directory.
        When neither a file nor a link is supplied, a single error-message
        string is returned instead (kept as-is for backward compatibility).
    """
    if pdf_file is None:
        if pdf_link == "" or pdf_link is None:
            print("No file is uploaded and No link is provided")
            return "No data provided. Upload a pdf file or provide a pdf link and try again!"
        file_name = get_pdf(pdf_link)
    else:
        # Keep the full path: Path(...).name would drop the directory, and the
        # rasterizer would then look for the PDF in the working directory only.
        file_name = str(Path(pdf_file))

    images = rasterize_paper(file_name, return_pil=True)
    # infer for every page and concat
    pages = [
        predict(image, fix_markdown=fix_markdown)
        for image in tqdm(images, desc="inference")
    ]
    sequence = "".join(page + "\n\n" for page in pages)

    # Rewrite LaTeX \( \) and \[ \] delimiters as $ / $$ math delimiters.
    content = (
        sequence.replace(r"\(", "$")
        .replace(r"\)", "$")
        .replace(r"\[", "$$")
        .replace(r"\]", "$$")
    )
    out_path = Path.cwd() / f"output-{Path(file_name).stem}.md"
    # Explicit UTF-8 so non-ASCII text does not depend on the platform's default encoding.
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(content)

    return content, str(out_path.resolve())

In [8]:
# Run the whole pipeline on the arXiv PDF of "Attention Is All You Need" (downloaded via the link).
# The call returns (markdown_text, output_path); as the cell's last expression the tuple is displayed as output.
inference(None, 'https://arxiv.org/pdf/1706.03762.pdf', fix_markdown=True)

PDF downloaded successfully.


inference:   0%|          | 0/15 [00:00<?, ?it/s]

('\n\n# Attention Is All You Need\n\n Ashish Vaswani\n\nGoogle Brain\n\navaswani@google.com\n\n&Noam Shazeer\n\nGoogle Brain\n\nnoam@google.com\n\n&Niki Parmar\n\nGoogle Research\n\nnikip@google.com\n\n&Jakob Uszkoreit\n\nGoogle Research\n\nusz@google.com\n\n&Lilion Jones\n\nGoogle Research\n\nlilion@google.com\n\n&Aidan N. Gomez\n\nUniversity of Toronto\n\naidan@cs.toronto.edu\n\n&Lukasz Kaiser\n\nGoogle Brain\n\nlukaszkaiser@google.com\n\n&Illia Polosukhin\n\nillia.polosukhin@gmail.com\n\nWork performed while at Google Brain.Work performed while at Google Research.\n\n###### Abstract\n\nThe dominant sequence transduction models are based on complex recurrent or convolutional neural networks that include an encoder and a decoder. The best performing models also connect the encoder and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. Experime