File size: 8,685 Bytes
f083ad2
b67f906
 
 
 
 
 
f083ad2
b67f906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# Setup (run in a shell, not inside Python): pip install "gradio[mcp]"
import base64
import json
import os
import tempfile

import anthropic
import fitz  # PyMuPDF
import gradio as gr


# It's recommended to load the API key from secrets when deploying.
# For Hugging Face Spaces, you would set this as a secret in your Space settings.
#
# `userdata` is not defined or imported anywhere in this file; it is only
# available if the hosting environment injected it into the module namespace
# (presumably a notebook secrets helper -- TODO confirm). Look it up explicitly
# instead of relying on a swallowed NameError, and fall back to the environment
# variable whenever the helper is absent, fails, or yields no key.
_notebook_userdata = globals().get("userdata")
ANTHROPIC_API_KEY = None
if _notebook_userdata is not None:
    try:
        ANTHROPIC_API_KEY = _notebook_userdata.get('ANTHROPIC_API_KEY')
    except Exception as exc:
        # Best effort: a failing secrets helper must not prevent start-up.
        print(f"Could not read ANTHROPIC_API_KEY from notebook secrets: {exc}")
if not ANTHROPIC_API_KEY:
    ANTHROPIC_API_KEY = os.environ.get('ANTHROPIC_API_KEY')

# Shared client used by get_anthropic_response().
client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)

# Helper Functions from the notebook
def visualize_raw_response(response):
    """Serialize the text blocks of an API response as indented JSON.

    Only items of ``response.content`` whose ``type`` is ``"text"`` are kept.
    Each becomes ``{"type": "text", "text": ...}``; when the item carries a
    non-empty ``citations`` attribute, the attribute dict of every citation
    (``vars(citation)``) is added under ``"citations"``.

    Args:
        response: Object exposing an iterable ``content`` attribute.

    Returns:
        A JSON string (2-space indent) of the form ``{"content": [...]}``.
    """
    def _as_block(item):
        entry = {"type": "text", "text": item.text}
        cited = getattr(item, 'citations', None)
        if cited:
            entry["citations"] = [vars(citation) for citation in cited]
        return entry

    payload = {
        "content": [_as_block(item) for item in response.content if item.type == "text"]
    }
    return json.dumps(payload, indent=2)

def format_citations(response):
    """Render a response as text with numbered citation markers and a reference list.

    For every ``"text"`` item in ``response.content`` the item's text is
    emitted, followed by one `` [n]`` marker per citation (citations are
    ordered by their first truthy start position). Citations are numbered in
    order of first appearance; two citations with the same document title and
    whitespace-normalized cited text share a number.

    Args:
        response: Object exposing an iterable ``content`` attribute, or a
            falsy value.

    Returns:
        ``""`` when ``response`` is falsy; otherwise the annotated text, a
        blank line, and one ``[n] "quote" found in "title"`` line per distinct
        citation.
    """
    if not response:
        return ""

    def _position(cite):
        # First truthy start offset among char / page / block locations.
        return (
            getattr(cite, 'start_char_index', 0)
            or getattr(cite, 'start_page_number', 0)
            or getattr(cite, 'start_block_index', 0)
        )

    numbering = {}   # "title:quote" -> citation number
    references = []  # reference lines, in numbering order
    pieces = []      # annotated text of each text block

    for block in response.content:
        if block.type != "text":
            continue
        segment = [block.text]
        for cite in sorted(getattr(block, 'citations', None) or [], key=_position):
            title = cite.document_title
            # split() already treats newlines and carriage returns as
            # whitespace, so this collapses every whitespace run to one space.
            quote = ' '.join(cite.cited_text.split())
            key = f"{title}:{quote}"
            if key not in numbering:
                number = len(numbering) + 1
                numbering[key] = number
                references.append(f"[{number}] \"{quote}\" found in \"{title}\"")
            segment.append(f" [{numbering[key]}]")
        pieces.append("".join(segment))

    return "".join(pieces) + "\n\n" + "\n".join(references)

def process_documents(doc_type, file_paths):
    """Read files from disk and wrap each one as a citation-enabled document block.

    Args:
        doc_type: ``'Plain Text'``, ``'PDF'`` or ``'Custom Content'``. Any
            other value yields no documents (the files are still read).
        file_paths: Iterable of paths to read, or a falsy value.

    Returns:
        A list of dicts with keys ``type``, ``source``, ``title`` (the file's
        base name) and ``citations``. Text variants decode the file as UTF-8;
        the PDF variant base64-encodes the raw bytes.
    """
    if not file_paths:
        return []

    def _source_for(raw):
        # Build the "source" payload for the selected document type.
        if doc_type == 'Plain Text':
            return {"type": "text", "media_type": "text/plain", "data": raw.decode('utf-8')}
        if doc_type == 'PDF':
            return {
                "type": "base64",
                "media_type": "application/pdf",
                "data": base64.b64encode(raw).decode('utf-8'),
            }
        if doc_type == 'Custom Content':
            return {"type": "content", "content": [{"type": "text", "text": raw.decode('utf-8')}]}
        return None

    prepared = []
    for path in file_paths:
        with open(path, 'rb') as handle:
            raw = handle.read()
        source = _source_for(raw)
        if source is None:
            continue
        prepared.append({
            "type": "document",
            "source": source,
            "title": os.path.basename(path),
            "citations": {"enabled": True},
        })
    return prepared

def get_anthropic_response(documents, question):
    """Send the documents plus a question to the Messages API in one user turn.

    Args:
        documents: List of document content blocks.
        question: Question text, appended after the documents as a text block.

    Returns:
        The object returned by ``client.messages.create``, or ``None`` when
        either argument is empty or the request raises (the error is printed).
    """
    if not (documents and question):
        return None

    try:
        prompt_blocks = documents + [{"type": "text", "text": question}]
        return client.messages.create(
            model="claude-3-5-sonnet-latest",
            temperature=0.0,
            max_tokens=1024,
            messages=[{"role": "user", "content": prompt_blocks}],
        )
    except Exception as e:
        # Best effort: report the failure and let the caller handle None.
        print(f"An error occurred: {e}")
        return None

def highlight_pdf(response, pdf_path):
    """Highlight every PDF passage cited in the response and save a copy.

    Collects the ``page_location`` citations of ``response.content``, searches
    each cited text on the pages the citation covers, and adds a yellow
    highlight annotation on every hit.

    Args:
        response: API response whose content items may carry ``citations``.
        pdf_path: Path of the PDF the citations refer to.

    Returns:
        ``"highlighted_output.pdf"`` (written to the working directory), or
        ``None`` when ``response`` or ``pdf_path`` is empty or the response
        has no ``page_location`` citations.
    """
    if not response or not pdf_path:
        return None

    pdf_citations = [
        citation
        for block in response.content
        for citation in (getattr(block, 'citations', None) or [])
        if citation.type == "page_location"
    ]
    if not pdf_citations:
        return None

    output_pdf_path = "highlighted_output.pdf"
    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        for citation in pdf_citations:
            # Drop U+0002 control characters and surrounding whitespace so the
            # text can be matched on the page; skip citations with nothing left.
            needle = citation.cited_text.replace('\u0002', '').strip()
            if not needle:
                continue
            # Citation pages are 1-indexed and end_page_number is exclusive,
            # so the 0-based page range is [start - 1, end - 1). Always search
            # at least the start page in case start == end is ever reported.
            first_page = citation.start_page_number - 1
            stop_page = max(citation.end_page_number - 1, first_page + 1)
            # Clamp to the pages that actually exist in the document.
            for page_num in range(max(first_page, 0), min(stop_page, page_count)):
                page = doc[page_num]
                for inst in page.search_for(needle):
                    highlight = page.add_highlight_annot(inst)
                    highlight.set_colors({"stroke": (1, 1, 0)})
                    highlight.update()
        doc.save(output_pdf_path)
    finally:
        # Release the file handle even if searching or saving fails.
        doc.close()
    return output_pdf_path

def annotate_pdf(pdf_path, annotation_text, page_number):
    """Write red annotation text near the top-left of one page and save a copy.

    Args:
        pdf_path: Path of the PDF to annotate.
        annotation_text: Text inserted into the box ``Rect(50, 50, 400, 100)``.
        page_number: 1-based number of the page to annotate.

    Returns:
        Path of the saved copy (``<name>_annotated<ext>`` next to the input),
        or ``None`` when the input file does not exist or ``page_number`` is
        outside the document.
    """
    if not pdf_path or not os.path.exists(pdf_path):
        return None

    # Insert the suffix before the real extension only. A plain
    # str.replace(".pdf", ...) would also rewrite ".pdf" inside directory
    # names and would leave ".PDF" or extension-less paths unchanged, making
    # the output path collide with the input file.
    stem, extension = os.path.splitext(pdf_path)
    output_pdf_path = f"{stem}_annotated{extension}"

    doc = fitz.open(pdf_path)
    try:
        page_index = page_number - 1
        if not 0 <= page_index < len(doc):
            return None
        page = doc[page_index]
        rect = fitz.Rect(50, 50, 400, 100)
        page.insert_textbox(rect, annotation_text, fontsize=12, color=(1, 0, 0))
        doc.save(output_pdf_path)
    finally:
        # Close the document on every path, including errors and early return.
        doc.close()
    return output_pdf_path

def _write_temp_text(text, suffix):
    """Write *text* to a new UTF-8 temporary file that is kept on disk; return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix, mode="w", encoding='utf-8') as handle:
        handle.write(text)
        return handle.name


def process_and_display(doc_type, question, files, load_samples, annotation_text, annotation_page):
    """Run the whole pipeline for one Gradio submission.

    Builds document blocks from the uploads, asks the model, formats the cited
    answer, and (for PDFs) produces highlighted and optionally annotated copies.

    Args:
        doc_type: ``'Plain Text'``, ``'PDF'`` or ``'Custom Content'``.
        question: The user's question.
        files: Uploaded files (objects with a ``name`` path attribute, or
            plain path strings), or ``None``.
        load_samples: When true, uploads are ignored; no sample files are
            configured yet, so this currently ends in the "please upload"
            message.
        annotation_text: Optional text to stamp onto the PDF.
        annotation_page: Optional 1-based page number for the annotation.

    Returns:
        An 8-tuple matching the interface outputs: formatted response text,
        raw response as a dict, highlighted PDF path, original PDF path, path
        of the formatted-response text file, path of the raw-response JSON
        file, and the final PDF path twice (annotated if available, otherwise
        highlighted). On failure the first element is an error message, the
        second is ``{}`` and the rest are ``None``.
    """
    original_pdf_path = None
    file_names = []
    if load_samples:
        # This part needs to be adapted for a deployed environment
        # as it relies on a local 'data' directory structure.
        # For deployment, you'd package these files with your app.
        question = "Sample question"
        file_names = []  # Add paths to sample files here
    elif files:
        # Accept both upload objects exposing `.name` and plain path strings.
        file_names = [getattr(f, "name", f) for f in files]

    if not file_names:
        return "Please upload documents or load sample data.", {}, None, None, None, None, None, None

    if doc_type == 'PDF':
        # Highlighting and annotation operate on the first uploaded PDF only.
        original_pdf_path = file_names[0]

    documents = process_documents(doc_type, file_names)
    response = get_anthropic_response(documents, question)

    if not response:
        return "Failed to get response from API.", {}, None, None, None, None, None, None

    formatted_response = format_citations(response)
    raw_response_json_str = visualize_raw_response(response)
    raw_response_json = json.loads(raw_response_json_str)

    highlighted_pdf_path = None
    annotated_pdf_path = None

    if doc_type == 'PDF':
        highlighted_pdf_path = highlight_pdf(response, original_pdf_path)
        if annotation_text and annotation_page:
            # Annotate on top of the highlights when there are any.
            pdf_to_annotate = highlighted_pdf_path if highlighted_pdf_path else original_pdf_path
            if pdf_to_annotate:
                annotated_pdf_path = annotate_pdf(pdf_to_annotate, annotation_text, int(annotation_page))

    # Downloadable copies of the text outputs.
    formatted_response_path = _write_temp_text(formatted_response, ".txt")
    raw_response_path = _write_temp_text(raw_response_json_str, ".json")

    final_pdf_path = annotated_pdf_path if annotated_pdf_path else highlighted_pdf_path

    return formatted_response, raw_response_json, highlighted_pdf_path, original_pdf_path, formatted_response_path, raw_response_path, final_pdf_path, final_pdf_path


# Gradio Interface
# Input widgets, in the positional order of process_and_display's parameters.
_input_components = [
    gr.Radio(['Plain Text', 'PDF', 'Custom Content'], label="Document Type"),
    gr.Textbox(lines=2, placeholder="Enter your question here...", label="Question"),
    gr.File(file_count="multiple", label="Upload Documents"),
    gr.Checkbox(label="Load Sample Data (requires data folder)"),
    gr.Textbox(lines=2, placeholder="Enter annotation text...", label="Annotation Text"),
    gr.Number(label="Annotation Page Number", precision=0),
]

# Output widgets, in the positional order of process_and_display's return tuple.
_output_components = [
    gr.Textbox(label="Formatted Response"),
    gr.JSON(label="Raw API Response"),
    gr.File(label="Highlighted PDF"),
    gr.File(label="Original PDF"),
    gr.File(label="Download Formatted Response"),
    gr.File(label="Download Raw Response"),
    gr.File(label="Download Highlighted PDF"),
    gr.File(label="Final Annotated PDF"),
]

iface = gr.Interface(
    fn=process_and_display,
    inputs=_input_components,
    outputs=_output_components,
    title="Anthropic Citations API Explorer",
    description="Explore Anthropic's citation capabilities. Upload documents, ask questions, see cited responses, and add your own annotations.",
)

# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()