# paragraph_annotation_tool.py  (new UI with per-model comments)
"""
Paragraph-level annotation tool with per-model comments
────────────────────────────────────────────────────────────────────────────
• Upload a CSV containing at least:
      Content_Paragraph,
      <model>_prompt1, <model>_prompt2             … for each model (example below)
• Enter your annotator name and click "Start / Resume".
• Rate each prompt A / B / C, optionally leave a comment for every model,
  navigate Back / Next, download the annotated CSV.
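• Example input header (model names "gpt4" and "llama" are placeholders):
      Content_Paragraph,gpt4_prompt1,gpt4_prompt2,llama_prompt1,llama_prompt2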

New in this version
───────────────────
• One **comment textbox per model** (shown next to the model's two ratings).
• Comments are stored in the CSV under columns named  ``comment_<model>``.
• Blank comments are saved as the literal string ``"no comment"``.
"""
from __future__ import annotations
import gradio as gr, pandas as pd
import random, time, os, shutil, uuid
from typing import List

# ─── CONFIG ───────────────────────────────────────────────────────────────
MAX_MODELS      = 10                      # pre-allocate up to this many models
CONTENT_COL     = "Content_Paragraph"
PROMPT1_SUFFIX  = "_prompt1"
PROMPT2_SUFFIX  = "_prompt2"
COMMENT_PREFIX  = "comment_"            # <COMMENT_PREFIX><model>
PERM_COL        = "perm_models"
RATING_OPTS     = ["A", "B", "C"]
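# Derived column names, using a purely illustrative model name "gpt4":
#   rating_gpt4__prompt1, rating_gpt4__prompt2 and comment_gpt4
# are created on load if they are missing (see load_csv below).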

# ─── GLOBALS (filled after CSV load) ───────────────────────────────────────
df: pd.DataFrame | None = None
csv_path: str | None    = None
models: List[str]       = []
TOTAL = 0
annotator = ""
current_start: float | None = None

# ─── CSV HELPERS ───────────────────────────────────────────────────────────

def load_csv(path: str):
    """Read CSV, discover model columns, add helper columns if needed."""
    global df, models, TOTAL, csv_path
    csv_path = path
    df = pd.read_csv(csv_path, keep_default_na=False)
    TOTAL = len(df)

    models.clear()
    for col in df.columns:
        if col.endswith(PROMPT1_SUFFIX) and not col.startswith("rating_"):
            m = col[: -len(PROMPT1_SUFFIX)]
            if f"{m}{PROMPT2_SUFFIX}" not in df.columns:
                raise ValueError(f"Found '{col}' but no '{m}{PROMPT2_SUFFIX}'")
            models.append(m)

    if not models:
        raise ValueError(f"No '*{PROMPT1_SUFFIX}' columns found")
    if len(models) > MAX_MODELS:
        raise ValueError(
            f"CSV has {len(models)} models but MAX_MODELS is {MAX_MODELS}")

    # helper columns
    if PERM_COL not in df.columns:
        df[PERM_COL] = ""
    for m in models:
        # rating columns per prompt
        for p in ("prompt1", "prompt2"):
            rc = f"rating_{m}__{p}"
            if rc not in df.columns:
                df[rc] = ""
        # NEW → comment column per model
        cc = f"{COMMENT_PREFIX}{m}"
        if cc not in df.columns:
            df[cc] = "no comment"  # default value

    for col in ("annotator", "annotation_time"):
        if col not in df.columns:
            df[col] = "" if col == "annotator" else 0.0


# ─── BOOK-KEEPING ──────────────────────────────────────────────────────────

def first_incomplete() -> int:
    for i, row in df.iterrows():
        for m in models:
            if (
                row[f"rating_{m}__prompt1"] == "" or
                row[f"rating_{m}__prompt2"] == ""
            ):
                return i
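    # every row is already fully rated: fall back to the first example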
    return 0


def get_perm(idx: int) -> List[str]:
    cell = str(df.at[idx, PERM_COL])
    if not cell:
        seq = models.copy()
        random.shuffle(seq)
        df.at[idx, PERM_COL] = "|".join(seq)
        df.to_csv(csv_path, index=False)
    return df.at[idx, PERM_COL].split("|")


def build_row(idx: int):
    """Return fixed-length tuple of widget values for example *idx*."""
    global current_start
    row   = df.loc[idx]
    order = get_perm(idx)

    outs, rates, comms = [], [], []
    for m in order:
        outs.append(row[f"{m}{PROMPT1_SUFFIX}"])
        outs.append(row[f"{m}{PROMPT2_SUFFIX}"])
        rates.append(row[f"rating_{m}__prompt1"] or None)
        rates.append(row[f"rating_{m}__prompt2"] or None)
        val = row[f"{COMMENT_PREFIX}{m}"]
        comms.append("" if val == "no comment" else val)
        

    # pad up to MAX_MODELS
    outs  += [""]  * (MAX_MODELS*2 - len(outs))
    rates += [None]* (MAX_MODELS*2 - len(rates))
    comms += ["" ] * (MAX_MODELS   - len(comms))

    ready = all(r in RATING_OPTS for r in rates[: 2*len(models)])
    current_start = time.time()

    header = f"Example {idx+1}/{TOTAL}"
    return (
        idx, header, row[CONTENT_COL],
        *outs, *rates, *comms,
        gr.update(visible=True),                   # back_btn update
        gr.update(visible=True, interactive=ready) # next_btn update
    )


def save_row(idx: int, ratings: List[str], comments: List[str]):
    """Persist ratings & comments for example *idx* β†’ CSV."""
    if not all(r in RATING_OPTS for r in ratings[: 2*len(models)]):
        return
    elapsed = time.time() - current_start if current_start else 0.0

    order = get_perm(idx)
    p = 0  # rating pointer
    for m in order:
        df.at[idx, f"rating_{m}__prompt1"] = ratings[p]; p += 1
        df.at[idx, f"rating_{m}__prompt2"] = ratings[p]; p += 1
    # comments
    for m, c in zip(order, comments):
        clean = (c or "").strip()
        df.at[idx, f"{COMMENT_PREFIX}{m}"] = clean or "no comment"

    df.at[idx, "annotator"]       = annotator
    df.at[idx, "annotation_time"] = float(elapsed)
    df.to_csv(csv_path, index=False)


def _writable_dir() -> str:
    """Return /data on Spaces, /tmp elsewhere – whichever is writeable."""
    for d in ("/data", "/tmp"):
        try:
            os.makedirs(d, exist_ok=True)
            with open(os.path.join(d, ".touch"), "w"):
                pass
            return d
        except PermissionError:
            continue
    raise PermissionError("No writable directory found.")


# ─── GRADIO UI ────────────────────────────────────────────────────────────
with gr.Blocks(title="Paragraph Annotation Tool") as demo:
    # shared state
    idx_state      = gr.State(0)  # current example index
    nmodels_state  = gr.State(0)  # how many model slots are active

    gr.Markdown("## Paragraph Annotation Tool")

    with gr.Row():
        upload_box = gr.File(label="Upload / Resume CSV", file_types=[".csv"])
        annot_box  = gr.Textbox(label="Annotator name")
        start_btn  = gr.Button("Start / Resume")

    annotator_label = gr.Markdown(visible=False)

    annotation_area = gr.Column(visible=False)
    with annotation_area:
        idx_box  = gr.Number(label="Index", interactive=False)
        hdr_box  = gr.Markdown()
        para_box = gr.Textbox(lines=6, interactive=False,
                              label="Content Paragraph")

        # Pre-allocate up to MAX_MODELS slots
        out_boxes, radio_widgets, comment_boxes = [], [], []
        for _ in range(MAX_MODELS):
            with gr.Row():
                # prompts + ratings
                with gr.Column(scale=2):
                    out1 = gr.Textbox(lines=6, interactive=False)
                    rad1 = gr.Radio(RATING_OPTS, label="Rating (P1)", value=None)
                with gr.Column(scale=2):
                    out2 = gr.Textbox(lines=6, interactive=False)
                    rad2 = gr.Radio(RATING_OPTS, label="Rating (P2)", value=None)
                # NEW → comment textbox
                with gr.Column(scale=1):
                    com = gr.Textbox(lines=2, label="Comment", placeholder="Optional…")
                out_boxes.extend((out1, out2))
                radio_widgets.extend((rad1, rad2))
                comment_boxes.append(com)

        back_btn     = gr.Button("⟵ Back", visible=False)
        next_btn     = gr.Button("Save & Next ⟶", visible=False)
        download_btn = gr.Button("💾 Download CSV", visible=False)

    # Enable NEXT when visible radios are filled (comments are optional)
    def toggle_next(model_cnt: int, *vals):
        needed = vals[: model_cnt*2]  # only rating radios
        return gr.update(interactive=all(v in RATING_OPTS for v in needed))

    for r in radio_widgets:
        r.change(toggle_next,
                 inputs=[nmodels_state]+radio_widgets,
                 outputs=next_btn)

    # ── navigation callbacks ──────────────────────────────────────────────
    def goto(step: int):
        def _fn(idx: int, model_cnt: int, *vals):
            """Handle Back / Next logic."""
            # *vals* layout: ALL radio values (MAX_MODELS*2), then ALL comment
            # values (MAX_MODELS), then the next_btn value
            RADIO_COUNT = MAX_MODELS * 2
            ratings  = list(vals[: model_cnt * 2])
            comments = list(vals[RADIO_COUNT : RADIO_COUNT + model_cnt])
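            # e.g. with MAX_MODELS == 10 and model_cnt == 3:
            #   ratings = vals[:6], comments = vals[20:23]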

            # save current row unless we attempted to go back without finishing ratings
            if step != -1 or all(r in RATING_OPTS for r in ratings):
                save_row(idx, ratings, comments)
            new_idx = max(0, min(idx+step, TOTAL-1))
            return build_row(new_idx)
        return _fn

    back_btn.click(
        goto(-1),
        inputs=[idx_state, nmodels_state]+radio_widgets+comment_boxes+[next_btn],
        outputs=[idx_state, hdr_box, para_box,
                 *out_boxes, *radio_widgets, *comment_boxes,
                 back_btn, next_btn],
    )

    next_btn.click(
        goto(1),
        inputs=[idx_state, nmodels_state]+radio_widgets+comment_boxes+[next_btn],
        outputs=[idx_state, hdr_box, para_box,
                 *out_boxes, *radio_widgets, *comment_boxes,
                 back_btn, next_btn],
    )

    # CSV download
    def make_download():
        if df is None:
            raise gr.Error("No CSV loaded yet.")
        tmp = os.path.join(_writable_dir(),
                           f"annotations_{uuid.uuid4().hex}.csv")
        df.to_csv(tmp, index=False)
        return tmp
    download_btn.click(make_download, outputs=gr.File())
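    # the gr.File() created inline above is the download target: it receives
    # the path returned by make_download and lets the annotator save the CSV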

    # ── Start / Resume ────────────────────────────────────────────────────
    def start_app(csv_file, name):
        global annotator
        if csv_file is None or not name.strip():
            raise gr.Error("Please upload a CSV and enter your name.")

        new_path = os.path.join(_writable_dir(), f"{uuid.uuid4().hex}.csv")
        shutil.copy(csv_file.name, new_path)
        load_csv(new_path)
        annotator = name.strip()

        # visibility flags: one boolean per model slot
        vis_flags = [i < len(models) for i in range(MAX_MODELS)]

        # build first row values (compute the starting index once and reuse it)
        start_idx = first_incomplete()
        row_vals = build_row(start_idx)
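        # slice the flat tuple from build_row back into its widget groups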
        idx_val, hdr_val, para_val = row_vals[:3]
        outs   = row_vals[3              : 3 + MAX_MODELS*2]
        rates  = row_vals[3 + MAX_MODELS*2             : 3 + MAX_MODELS*4]
        comms  = row_vals[3 + MAX_MODELS*4             : 3 + MAX_MODELS*5]
        back_update, next_update = row_vals[-2:]

        # updates for textboxes, radios, comments
        out_updates = [
            gr.update(value=outs[i],  visible=vis_flags[i//2])
            for i in range(MAX_MODELS*2)
        ]
        radio_updates = [
            gr.update(value=rates[i], visible=vis_flags[i//2])
            for i in range(MAX_MODELS*2)
        ]
        comment_updates = [
            gr.update(value=comms[i], visible=vis_flags[i])
            for i in range(MAX_MODELS)
        ]

        return (
            start_idx,                    # idx_state
            len(models),                  # nmodels_state
            gr.update(value=idx_val),     # idx_box
            gr.update(value=hdr_val),     # hdr_box
            gr.update(value=para_val),    # para_box
            *out_updates,
            *radio_updates,
            *comment_updates,
            back_update, next_update,     # nav buttons
            gr.update(visible=True,
                      value=f"**Annotator:** {annotator}"),
            gr.update(visible=True),      # download_btn
            gr.update(visible=True)       # annotation_area
        )

    start_btn.click(
        start_app,
        inputs=[upload_box, annot_box],
        outputs=[
            idx_state, nmodels_state,
            idx_box, hdr_box, para_box,
            *out_boxes, *radio_widgets, *comment_boxes,
            back_btn, next_btn,
            annotator_label,
            download_btn,
            annotation_area
        ],
    )

# ─── RUN ───────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    demo.queue()
    demo.launch()          # keep share=False on HF Spaces