#!/usr/bin/env python
"""
reassemble_bbox_dataset_resume.py
---------------------------------
Incrementally rebuilds the `bbox_filled / annotated / bbox_json` columns
from QA artefacts and pushes the final dataset **privately** to HF Hub.

• Safe to ^C / rerun (uses an on-disk Arrow cache)
• When NOTHING is left to process it *just* loads the cache and pushes.
• Uses path-only image columns (HFImage(decode=False)) to keep RAM tiny.
"""
import os, json
from pathlib import Path

from tqdm.auto import tqdm
from datasets import (
    load_dataset, load_from_disk, Dataset, disable_progress_bar,
    Features, Value, Image as HFImage
)
from PIL import Image
from huggingface_hub.utils import HfHubHTTPError

disable_progress_bar()

# ══════ CONFIG ══════════════════════════════════════════════════════
DATASET_NAME = "fotographerai/furniture_captioned_segment_prompt"
SPLIT        = "train"

QA_DIR    = Path("bbox_review_recaptioned")   # QA artefacts
CACHE_DIR = Path("rebuild_cache")             # incremental Arrow cache
CACHE_DIR.mkdir(exist_ok=True)

TARGET_SIDE = 1500
GREEN_RGB   = (0, 255, 0)
BATCH_SAVE  = 500

HUB_REPO = "fotographerai/furniture_bboxfilled_rebuild"
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()   # needs write + private access

# ══════ HELPERS ═════════════════════════════════════════════════════
def img_ref(p: Path) -> dict:                 # path-only image dict
    return {"path": str(p), "bytes": None}

def make_green_png(p: Path):
    if not p.exists():
        Image.new("RGB", (TARGET_SIDE, TARGET_SIDE), GREEN_RGB).save(p)

def ensure_full_bbox(p: Path):
    if not p.exists():
        p.write_text(json.dumps({"xyxy": [[0, 0, TARGET_SIDE, TARGET_SIDE]]}))

# ══════ LOAD SOURCE DATASET ═════════════════════════════════════════
base_ds = load_dataset(DATASET_NAME, split=SPLIT, streaming=False)
N_TOTAL = len(base_ds)
print("Original rows:", N_TOTAL)

# ══════ LOAD OR INIT CACHE ══════════════════════════════════════════
if (CACHE_DIR / "dataset_info.json").exists():
    cache_ds = load_from_disk(CACHE_DIR)
    done     = set(cache_ds["__row_idx__"])
    print(f"Cache found → {len(done)} rows already processed.")
    records = {k: list(v) for k, v in cache_ds.to_dict().items()}
else:
    done, records = set(), {"__row_idx__": [], "bbox_filled": [],
                            "annotated": [], "bbox_json": []}

missing = [i for i in range(N_TOTAL) if i not in done]
print("Rows still to process:", len(missing))
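# NOTE: resumability hinges on "__row_idx__" – every processed row stores its
# original index, so the `done` set rebuilt from the cache tells the script
# exactly which rows it may skip on the next run.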
# ══════ NO WORK LEFT? → PUSH & EXIT ═════════════════════════════════
if not missing:
    print("💤 nothing new to process – pushing cached dataset…")
    try:
        url = cache_ds.push_to_hub(
            HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
        )
        print("🚀 dataset pushed to:", url)
    except HfHubHTTPError as e:
        print("❌ push failed:", e)
    exit(0)

# ══════ PROCESS MISSING ROWS ═══════════════════════════════════════
for n, i in enumerate(tqdm(missing, desc="Re-assembling")):
    g_png  = QA_DIR / f"{i:06d}_green.png"
    a_png  = QA_DIR / f"{i:06d}_anno.png"
    bbox_j = QA_DIR / f"{i:06d}_bbox.json"

    # Fall back to a solid-green placeholder and a full-frame bbox when any
    # QA artefact is missing for this row.
    if not (g_png.exists() and a_png.exists() and bbox_j.exists()):
        mask_png = QA_DIR / f"{i:06d}_mask.png"
        make_green_png(mask_png)
        g_png = a_png = mask_png
        ensure_full_bbox(bbox_j)

    row = base_ds[i]
    records["__row_idx__"].append(i)
    for k, v in row.items():                  # copy original columns
        if k in ("__row_idx__", "bbox_filled", "annotated", "bbox_json"):
            continue                           # rebuilt below – avoid double-appends
        records.setdefault(k, []).append(v)

    records["bbox_filled"].append(img_ref(g_png))
    records["annotated"].append(img_ref(a_png))
    records["bbox_json"].append(bbox_j.read_text())

    if (n + 1) % BATCH_SAVE == 0:
        Dataset.from_dict(records).save_to_disk(CACHE_DIR)
        print(f"⏫ cached at {n + 1}/{len(missing)}")

# ══════ FINAL DATASET FEATURES & SAVE ═══════════════════════════════
features = Features({
    "__row_idx__": Value("int32"),
    "bbox_filled": HFImage(decode=False),
    "annotated":   HFImage(decode=False),
    "bbox_json":   Value("string"),
    # original columns are carried over from base_ds below
})
for k in base_ds.features:
    if k not in features:
        features[k] = base_ds.features[k]

final_ds = Dataset.from_dict(records, features=features)
final_ds.save_to_disk(CACHE_DIR)
print("✅ cached dataset saved to", CACHE_DIR.resolve())

# ══════ PUSH PRIVATE ═══════════════════════════════════════════════
if not HF_TOKEN:
    print("⚠️ HF_TOKEN env-var not set – skipping push.")
else:
    try:
        url = final_ds.push_to_hub(
            HUB_REPO, private=True, token=HF_TOKEN, max_shard_size="500MB"
        )
        print("🚀 dataset pushed to:", url)
    except HfHubHTTPError as e:
        print("❌ push failed:", e)
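
# ══════ OPTIONAL: QUICK RELOAD CHECK ════════════════════════════════
# A minimal sketch (an addition, not part of the original pipeline): re-open the
# repo that was just pushed and decode one of the path-only image columns on
# demand. Assumes a recent `datasets` release (token= kwarg) and reuses the
# HUB_REPO / SPLIT / HF_TOKEN values defined above. Flip the flag to run it.
VERIFY_RELOAD = False
if VERIFY_RELOAD and HF_TOKEN:
    check_ds = load_dataset(HUB_REPO, split=SPLIT, token=HF_TOKEN)
    # cast the stored Image(decode=False) column so indexing returns PIL images
    check_ds = check_ds.cast_column("bbox_filled", HFImage(decode=True))
    print("🔎 reload OK –", len(check_ds), "rows;",
          "first bbox_filled size:", check_ds[0]["bbox_filled"].size)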