Spaces:
Runtime error
Runtime error
File size: 8,685 Bytes
f083ad2 b67f906 f083ad2 b67f906 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
# Install dependencies before running (shell command, not Python): pip install "gradio[mcp]"
import base64
import json
import os
import tempfile

import anthropic
import fitz # PyMuPDF
import gradio as gr
# It's recommended to load the API key from secrets when deploying
# For Hugging Face Spaces, you would set this as a secret in your Space settings
try:
    # NOTE(review): assumes `userdata` was meant to be Google Colab's secrets
    # helper (this code came from a notebook) - confirm. It is imported here so
    # the lookup can actually succeed in Colab instead of always raising
    # NameError and silently falling through to the environment variable.
    from google.colab import userdata
    ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')
except Exception:
    # Not running in Colab, or the secret is missing/inaccessible there:
    # read the key from the process environment (e.g. a Space secret).
    ANTHROPIC_API_KEY = os.environ.get('ANTHROPIC_API_KEY')

# Shared API client used by get_anthropic_response().
client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
# Helper Functions from the notebook
def visualize_raw_response(response):
    """Serialize the text blocks of an API response to pretty-printed JSON.

    Only entries of ``response.content`` whose ``type`` is ``"text"`` are kept.
    Each becomes a dict with ``type`` and ``text`` keys, plus a ``citations``
    list (the ``vars()`` of every citation object) when the block has a
    non-empty ``citations`` attribute.

    Returns:
        The JSON document as a string, indented by two spaces.
    """
    serialized_blocks = []
    for item in response.content:
        if item.type != "text":
            continue
        entry = {"type": "text", "text": item.text}
        cited = getattr(item, 'citations', None)
        if cited:
            entry["citations"] = [vars(citation) for citation in cited]
        serialized_blocks.append(entry)
    return json.dumps({"content": serialized_blocks}, indent=2)
def format_citations(response):
    """Render a response as text with numbered citation markers and a reference list.

    Every ``"text"`` block in ``response.content`` is emitted in order. For each
    citation attached to a block, a `` [n]`` marker is appended to that block's
    text. A citation is identified by its document title plus its
    whitespace-normalized cited text, so repeated citations reuse one number.

    Args:
        response: Object exposing a ``content`` iterable of blocks, or a falsy
            value.

    Returns:
        ``""`` for a falsy response. Otherwise the concatenated block text,
        followed - only when at least one citation exists - by a blank line and
        one ``[n] "cited text" found in "title"`` line per distinct citation.
    """
    if not response:
        return ""

    def _position(citation):
        # Use whichever start offset the citation carries (char, page or block).
        # The trailing `or 0` keeps the key an int even when the last attribute
        # exists but is None; a None key would make sorted() raise TypeError.
        return (
            getattr(citation, 'start_char_index', 0)
            or getattr(citation, 'start_page_number', 0)
            or getattr(citation, 'start_block_index', 0)
            or 0
        )

    numbers_by_key = {}
    references = []
    rendered_blocks = []
    for content in response.content:
        if content.type != "text":
            continue
        pieces = [content.text]
        for citation in sorted(getattr(content, 'citations', None) or [], key=_position):
            doc_title = citation.document_title
            # Collapse newlines and runs of whitespace so equal quotes compare equal.
            cited_text = ' '.join(citation.cited_text.replace('\n', ' ').replace('\r', ' ').split())
            citation_key = f"{doc_title}:{cited_text}"
            if citation_key not in numbers_by_key:
                number = len(numbers_by_key) + 1
                numbers_by_key[citation_key] = number
                references.append(f"[{number}] \"{cited_text}\" found in \"{doc_title}\"")
            pieces.append(f" [{numbers_by_key[citation_key]}]")
        rendered_blocks.append("".join(pieces))

    body = "".join(rendered_blocks)
    if not references:
        # No citations: return the text alone rather than a dangling "\n\n".
        return body
    return body + "\n\n" + "\n".join(references)
def process_documents(doc_type, file_paths):
    """Build citation-enabled document blocks from files on disk.

    Every file is read in binary mode. Its payload is encoded according to
    ``doc_type``:

    * ``'Plain Text'``     - UTF-8 decoded text with media type ``text/plain``.
    * ``'PDF'``            - base64 text with media type ``application/pdf``.
    * ``'Custom Content'`` - UTF-8 decoded text wrapped in a one-element
      ``content`` list.

    Any other ``doc_type`` contributes no entry for the file (it is still read).

    Args:
        doc_type: One of the labels above.
        file_paths: Iterable of file paths; a falsy value yields ``[]``.

    Returns:
        A list of dicts, one per accepted file, titled with the file's basename.
    """
    if not file_paths:
        return []

    prepared = []
    for path in file_paths:
        with open(path, 'rb') as handle:
            raw_bytes = handle.read()

        if doc_type == 'Plain Text':
            source = {"type": "text", "media_type": "text/plain", "data": raw_bytes.decode('utf-8')}
        elif doc_type == 'PDF':
            encoded = base64.b64encode(raw_bytes).decode('utf-8')
            source = {"type": "base64", "media_type": "application/pdf", "data": encoded}
        elif doc_type == 'Custom Content':
            source = {"type": "content", "content": [{"type": "text", "text": raw_bytes.decode('utf-8')}]}
        else:
            continue

        prepared.append({
            "type": "document",
            "source": source,
            "title": os.path.basename(path),
            "citations": {"enabled": True},
        })
    return prepared
def get_anthropic_response(documents, question):
    """Send the documents plus a question to the model as one user message.

    Args:
        documents: List of document content blocks placed before the question.
        question: The question text, appended as a final text block.

    Returns:
        Whatever ``client.messages.create`` returns, or ``None`` when either
        argument is falsy or when the call raises (the error is printed).
    """
    if not (documents and question):
        return None

    try:
        user_turn = {
            "role": "user",
            "content": documents + [{"type": "text", "text": question}],
        }
        return client.messages.create(
            model="claude-3-5-sonnet-latest",
            temperature=0.0,
            max_tokens=1024,
            messages=[user_turn],
        )
    except Exception as e:
        # Best-effort: report the failure and let the caller handle None.
        print(f"An error occurred: {e}")
        return None
def highlight_pdf(response, pdf_path):
    """Highlight every ``page_location`` citation of a response inside a PDF.

    The cited text of each such citation is searched on the pages it covers and
    every hit is marked with a yellow highlight annotation. The result is saved
    as ``highlighted_output.pdf`` in the current working directory.

    Args:
        response: Object exposing a ``content`` iterable of blocks, or a falsy
            value.
        pdf_path: Path of the PDF the citations refer to.

    Returns:
        The output path, or ``None`` when the response is falsy or carries no
        ``page_location`` citation (the PDF is not opened in that case).
    """
    if not response:
        return None

    pdf_citations = [
        citation
        for block in response.content
        for citation in (getattr(block, 'citations', None) or [])
        if citation.type == "page_location"
    ]
    if not pdf_citations:
        return None

    output_pdf_path = "highlighted_output.pdf"
    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        for citation in pdf_citations:
            # Drop the \u0002 control characters found in cited PDF text, then
            # trim; an empty needle has nothing to search for.
            needle = citation.cited_text.replace('\u0002', '').strip()
            if not needle:
                continue
            # API page numbers are 1-indexed with an exclusive end page, so the
            # 0-based pages are [start - 1, end - 1). Always look at the start
            # page at least, and clamp the range to the document.
            first = citation.start_page_number - 1
            stop = max(citation.end_page_number - 1, first + 1)
            for page_num in range(max(first, 0), min(stop, page_count)):
                page = doc[page_num]
                for rect in page.search_for(needle):
                    highlight = page.add_highlight_annot(rect)
                    highlight.set_colors({"stroke": (1, 1, 0)})
                    highlight.update()
        doc.save(output_pdf_path)
    finally:
        # Release the file handle even if searching or saving fails.
        doc.close()
    return output_pdf_path
def annotate_pdf(pdf_path, annotation_text, page_number):
    """Write a red text note near the top-left of one page and save a copy.

    Args:
        pdf_path: Path of the PDF to annotate.
        annotation_text: Text inserted into a fixed box, Rect(50, 50, 400, 100).
        page_number: 1-based number of the page to annotate.

    Returns:
        Path of the saved copy (``<name>_annotated<ext>`` next to the input),
        or ``None`` when the path is empty or missing, or the page number is
        outside the document.
    """
    if not pdf_path or not os.path.exists(pdf_path):
        return None

    # Derive the output name from the final extension only. A plain
    # str.replace(".pdf", ...) would rewrite every ".pdf" in the path and leave
    # ".PDF" or extension-less names unchanged, making the save target the
    # file that is currently open.
    root, ext = os.path.splitext(pdf_path)
    output_pdf_path = f"{root}_annotated{ext}"

    doc = fitz.open(pdf_path)
    try:
        page_index = page_number - 1
        if not 0 <= page_index < len(doc):
            return None
        page = doc[page_index]
        rect = fitz.Rect(50, 50, 400, 100)
        page.insert_textbox(rect, annotation_text, fontsize=12, color=(1, 0, 0))
        doc.save(output_pdf_path)
    finally:
        # Close the document on every path, including errors and early return.
        doc.close()
    return output_pdf_path
def _write_download_file(text, suffix):
    """Write *text* to a named temp file that outlives the call; return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, mode="w", encoding='utf-8') as f:
        f.write(text)
    return f.name


def _failure_outputs(message):
    """Build the 8-item output tuple for a run that produced no result."""
    return message, {}, None, None, None, None, None, None


def process_and_display(doc_type, question, files, load_samples, annotation_text, annotation_page):
    """Run one request end to end and return the values for the 8 output widgets.

    Args:
        doc_type: ``'Plain Text'``, ``'PDF'`` or ``'Custom Content'``.
        question: Question to ask about the documents.
        files: Uploaded files, either path strings or objects exposing the
            path as ``.name``; may be ``None``.
        load_samples: When truthy, uploads are ignored and the (currently
            empty) sample set is used.
        annotation_text: Optional note to write onto the PDF.
        annotation_page: 1-based page number for the note.

    Returns:
        ``(formatted_response, raw_response_json, highlighted_pdf_path,
        original_pdf_path, formatted_response_path, raw_response_path,
        final_pdf_path, final_pdf_path)``. If there are no input files or the
        API call fails, a message followed by ``{}`` and six ``None`` values.
    """
    original_pdf_path = None
    file_names = []
    if load_samples:
        # This part needs to be adapted for a deployed environment
        # as it relies on a local 'data' directory structure.
        # For deployment, you'd package these files with your app.
        question = "Sample question"
        file_names = [] # Add paths to sample files here
    elif files:
        # Accept both plain path strings and file wrappers that carry the
        # path in `.name`, so either upload representation works.
        file_names = [getattr(f, "name", f) for f in files]

    if not file_names:
        return _failure_outputs("Please upload documents or load sample data.")

    if doc_type == 'PDF':
        # Only the first uploaded PDF is highlighted and annotated.
        original_pdf_path = file_names[0]

    documents = process_documents(doc_type, file_names)
    response = get_anthropic_response(documents, question)
    if not response:
        return _failure_outputs("Failed to get response from API.")

    formatted_response = format_citations(response)
    raw_response_json_str = visualize_raw_response(response)
    raw_response_json = json.loads(raw_response_json_str)

    highlighted_pdf_path = None
    annotated_pdf_path = None
    if doc_type == 'PDF':
        highlighted_pdf_path = highlight_pdf(response, original_pdf_path)
        if annotation_text and annotation_page:
            # Annotate the highlighted copy when there is one, else the upload.
            pdf_to_annotate = highlighted_pdf_path if highlighted_pdf_path else original_pdf_path
            if pdf_to_annotate:
                annotated_pdf_path = annotate_pdf(pdf_to_annotate, annotation_text, int(annotation_page))

    # Persist both renderings so the File outputs can offer them for download.
    formatted_response_path = _write_download_file(formatted_response, ".txt")
    raw_response_path = _write_download_file(raw_response_json_str, ".json")

    final_pdf_path = annotated_pdf_path if annotated_pdf_path else highlighted_pdf_path
    return formatted_response, raw_response_json, highlighted_pdf_path, original_pdf_path, formatted_response_path, raw_response_path, final_pdf_path, final_pdf_path
# Gradio Interface
# Input widgets, in the positional order of process_and_display's parameters.
input_components = [
    gr.Radio(['Plain Text', 'PDF', 'Custom Content'], label="Document Type"),
    gr.Textbox(lines=2, placeholder="Enter your question here...", label="Question"),
    gr.File(file_count="multiple", label="Upload Documents"),
    gr.Checkbox(label="Load Sample Data (requires data folder)"),
    gr.Textbox(lines=2, placeholder="Enter annotation text...", label="Annotation Text"),
    gr.Number(label="Annotation Page Number", precision=0),
]

# Output widgets, matching the 8-item tuple returned by process_and_display.
output_components = [
    gr.Textbox(label="Formatted Response"),
    gr.JSON(label="Raw API Response"),
    gr.File(label="Highlighted PDF"),
    gr.File(label="Original PDF"),
    gr.File(label="Download Formatted Response"),
    gr.File(label="Download Raw Response"),
    gr.File(label="Download Highlighted PDF"),
    gr.File(label="Final Annotated PDF"),
]

iface = gr.Interface(
    fn=process_and_display,
    inputs=input_components,
    outputs=output_components,
    title="Anthropic Citations API Explorer",
    description="Explore Anthropic's citation capabilities. Upload documents, ask questions, see cited responses, and add your own annotations.",
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()
|