import nltk
nltk.download("punkt")
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Load DeepSeek-R1
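# Note: the full DeepSeek-R1 checkpoint is an extremely large mixture-of-experts model
# (hundreds of billions of parameters); on limited hardware a distilled variant such as
# deepseek-ai/DeepSeek-R1-Distill-Qwen-7B may be a more practical choice.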
deepseek_tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1", trust_remote_code=True)
deepseek_model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1", trust_remote_code=True)

# Load IndicBART
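# IndicBART (AI4Bharat) is a multilingual seq2seq model covering 11 Indic languages, including Tamil, plus English.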
indicbart_tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicBART")
indicbart_model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/IndicBART")

# Initialize LlamaIndex components
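# all-MiniLM-L6-v2 is primarily an English embedding model; a multilingual alternative
# (e.g. sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) may retrieve Tamil passages better.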
embed_model = HuggingFaceEmbedding(model_name="sentence-transformers/all-MiniLM-L6-v2")

llm = HuggingFaceLLM(
    model_name="deepseek-ai/DeepSeek-R1",
    tokenizer_name="deepseek-ai/DeepSeek-R1",
    max_new_tokens=512,
    context_window=4096
)

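# ServiceContext is deprecated in recent llama-index releases (Settings is the replacement);
# it is kept here for compatibility with llama-index 0.10.x.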
service_context = ServiceContext.from_defaults(llm=llm, embed_model=embed_model)

# Build index from documents in the 'data' directory (the directory must exist and contain at least one readable file)
documents = SimpleDirectoryReader("data").load_data()
index = VectorStoreIndex.from_documents(documents, service_context=service_context)

# Define functions for each task
def restore_text(input_text, task_type):
    """Run the selected IndicBART task (restoration, summarization, or translation) on the input text."""
    # These instruction-style prefixes are only meaningful if the checkpoint has been
    # fine-tuned to recognise them; the base IndicBART model is a denoising pretrained model.
    prefix_map = {
        "Restore & Correct Tamil Text": "restore: ",
        "Summarize in Tamil": "summarize: ",
        "Translate to English": "translate Tamil to English: "
    }
    prefix = prefix_map.get(task_type, "restore: ")
    input_text = prefix + input_text
    inputs = indicbart_tokenizer([input_text], return_tensors="pt", padding=True)
    outputs = indicbart_model.generate(**inputs, max_length=256, num_beams=4, early_stopping=True)
    decoded_output = indicbart_tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return decoded_output[0]

def deepseek_chat(message):
    """Generate a single-turn DeepSeek-R1 response for the given message."""
    inputs = deepseek_tokenizer.encode(message + deepseek_tokenizer.eos_token, return_tensors="pt")
    # max_new_tokens bounds only the generated continuation, so long prompts are not cut off prematurely
    outputs = deepseek_model.generate(inputs, max_new_tokens=1024, pad_token_id=deepseek_tokenizer.eos_token_id)
    return deepseek_tokenizer.decode(outputs[0], skip_special_tokens=True)

def query_documents(query):
    """Answer a query with retrieval-augmented generation over the indexed documents."""
    query_engine = index.as_query_engine()
    response = query_engine.query(query)
    return str(response)

# Gradio Interface
with gr.Blocks() as demo:
    gr.Markdown("## 🕉️ Ancient Tamil Literature Expert AI")

    with gr.Tab("IndicBART Tasks"):
        input_text = gr.Textbox(label="Input Tamil Text", lines=8, placeholder="Enter ancient Tamil text here...")
        task_type = gr.Radio(choices=["Restore & Correct Tamil Text", "Summarize in Tamil", "Translate to English"], label="Select Task")
        output_text = gr.Textbox(label="Output")
        submit_button = gr.Button("Submit")
        submit_button.click(fn=restore_text, inputs=[input_text, task_type], outputs=output_text)

    with gr.Tab("DeepSeek-R1 Chat"):
        chat_input = gr.Textbox(label="Enter your message")
        chat_output = gr.Textbox(label="DeepSeek-R1 Response")
        chat_button = gr.Button("Send")
        chat_button.click(fn=deepseek_chat, inputs=chat_input, outputs=chat_output)

    with gr.Tab("Document Query"):
        query_input = gr.Textbox(label="Enter your query")
        query_output = gr.Textbox(label="Query Response")
        query_button = gr.Button("Search")
        query_button.click(fn=query_documents, inputs=query_input, outputs=query_output)

if __name__ == "__main__":
    demo.launch()