Update app.py
app.py
CHANGED
@@ -1,11 +1,11 @@
 # --------------------------------------------------------------------------
-# UNIFIED AI SERVICE
+# UNIFIED AI SERVICE V3 (DINOv2 Integration)
 # --------------------------------------------------------------------------
-# This service
-#
-#
-#
-#
+# This service uses DINOv2 for image embeddings and BGE for text embeddings.
+# It performs intelligent filtering before scoring.
+# 1. Filters by object name, date, and location hierarchy.
+# 2. Extracts features using BGE (text) and DINOv2 (image).
+# 3. Scores items based on a hybrid of text and image similarity.
 # --------------------------------------------------------------------------

 import sys
@@ -16,15 +16,16 @@ import requests
 import cv2
 import traceback
 from io import BytesIO
-from skimage import feature
 from flask import Flask, request, jsonify
 from PIL import Image
 from datetime import datetime, timedelta

 # --- Import Deep Learning Libraries ---
 import torch
-from transformers import
+from transformers import AutoImageProcessor, AutoModel, AutoTokenizer
 from segment_anything import SamPredictor, sam_model_registry
+# Grounding DINO is still needed for segmentation
+from transformers import AutoProcessor as AutoGndProcessor, AutoModelForZeroShotObjectDetection

 # ==========================================================================
 # --- CONFIGURATION & INITIALIZATION ---
@@ -33,39 +34,60 @@ from segment_anything import SamPredictor, sam_model_registry
 app = Flask(__name__)

 # --- Scoring and Weighting Configuration ---
-
-
-
-FINAL_SCORE_THRESHOLD = 0.55
+TEXT_FIELDS_TO_EMBED = ["brand", "material", "size", "colors"]
+SCORE_WEIGHTS = { "text_score": 0.4, "image_score": 0.6 } # Give image score more weight
+FINAL_SCORE_THRESHOLD = 0.5

 # --- Model Loading ---
 print("="*50)
-print("🚀 Initializing
+print("🚀 Initializing AI Service with DINOv2...")
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(f"🧠 Using device: {device}")
+
+# 1. Load BGE Text Model
+print("...Loading BGE text model (BAAI/bge-small-en-v1.5)...")
 bge_model_id = "BAAI/bge-small-en-v1.5"
 tokenizer_text = AutoTokenizer.from_pretrained(bge_model_id)
 model_text = AutoModel.from_pretrained(bge_model_id).to(device)
+print("✅ BGE model loaded.")
+
+# 2. Load DINOv2 Image Model
+print("...Loading DINOv2 model (facebook/dinov2-base)...")
+dinov2_model_id = "facebook/dinov2-base"
+processor_dinov2 = AutoImageProcessor.from_pretrained(dinov2_model_id)
+model_dinov2 = AutoModel.from_pretrained(dinov2_model_id).to(device)
+print("✅ DINOv2 model loaded.")
+
+# 3. Load Grounding DINO Model (for segmentation)
+print("...Loading Grounding DINO model for segmentation...")
 gnd_model_id = "IDEA-Research/grounding-dino-base"
-processor_gnd =
+processor_gnd = AutoGndProcessor.from_pretrained(gnd_model_id)
 model_gnd = AutoModelForZeroShotObjectDetection.from_pretrained(gnd_model_id).to(device)
+print("✅ Grounding DINO model loaded.")
+
+# 4. Load Segment Anything (SAM) Model
+print("...Loading SAM model...")
 sam_checkpoint = "sam_vit_b_01ec64.pth"
 sam_model = sam_model_registry["vit_b"](checkpoint=sam_checkpoint).to(device)
 sam_predictor = SamPredictor(sam_model)
-print("✅
+print("✅ SAM model loaded.")
 print("="*50)

-
 # ==========================================================================
 # --- HELPER FUNCTIONS ---
 # ==========================================================================
+
 def get_text_embedding(text: str) -> list:
-    # ---
-    #
+    # --- THIS IS THE FIX ---
+    # First, handle the case where text is a list (like the 'colors' field).
     if isinstance(text, list):
+        if not text: # Handle empty list case
+            return None
         text = ", ".join(text)

-
+    # Now, perform the check on the (potentially converted) string.
+    if not text or not text.strip():
+        return None

     instruction = "Represent this sentence for searching relevant passages: "
     inputs = tokenizer_text(instruction + text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
@@ -75,81 +97,56 @@ def get_text_embedding(text: str) -> list:
     embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
     return embedding.cpu().numpy()[0].tolist()

+def get_image_embedding(image: Image.Image) -> list:
+    """Generates a DINOv2 embedding for a given image."""
+    inputs = processor_dinov2(images=image, return_tensors="pt").to(device)
+    with torch.no_grad():
+        outputs = model_dinov2(**inputs)
+    # Use the CLS token embedding
+    embedding = outputs.last_hidden_state[:, 0, :]
+    embedding = torch.nn.functional.normalize(embedding, p=2, dim=1)
+    return embedding.cpu().numpy()[0].tolist()
+
 def cosine_similarity(vec1, vec2):
     if vec1 is None or vec2 is None: return 0.0
     vec1, vec2 = np.array(vec1), np.array(vec2)
     return float(np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2)))

-def
-
-
-
-
-
-
-def segment_guided_object(image: Image.Image, object_label: str) -> Image.Image:
-    prompt = f"a {object_label}."
+def segment_guided_object(image: Image.Image, object_label: str, text_data: dict) -> Image.Image:
+    """Segments an object using a more descriptive prompt."""
+    desc_parts = [object_label]
+    if text_data.get('brand'): desc_parts.append(f"brand {text_data['brand']}")
+    if text_data.get('colors'): desc_parts.append(", ".join(text_data['colors']))
+    prompt = " ".join(desc_parts)
+
+    print(f" [Segment] Using prompt: '{prompt}'")
     image_rgb = image.convert("RGB")
     image_np = np.array(image_rgb)
     h, w = image_np.shape[:2]
+
     inputs = processor_gnd(images=image_rgb, text=prompt, return_tensors="pt").to(device)
     with torch.no_grad():
         outputs = model_gnd(**inputs)
+
     results = processor_gnd.post_process_grounded_object_detection(
         outputs, inputs.input_ids, threshold=0.4, text_threshold=0.4, target_sizes=[(h, w)]
     )
+
     if not results or len(results[0]['boxes']) == 0:
-
+        print(f" [Segment] ⚠️ Warning: Could not detect object. Using full image.")
+        return image_rgb
+
     sam_predictor.set_image(image_np)
     box = results[0]['boxes'][0].cpu().numpy().astype(int)
     masks, _, _ = sam_predictor.predict(box=box, multimask_output=False)
+
     mask = masks[0]
-
-
-
-
-
-
-    image_np = np.array(segmented_image_rgba)
-    bgr_image = cv2.cvtColor(image_np[:, :, :3], cv2.COLOR_RGB2BGR)
-    mask = image_np[:, :, 3]
-    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    shape_features = np.zeros(7)
-    if contours:
-        largest_contour = max(contours, key=cv2.contourArea)
-        moments = cv2.moments(largest_contour)
-        if moments['m00'] != 0:
-            hu_moments = cv2.HuMoments(moments).flatten()
-            shape_features = -np.sign(hu_moments) * np.log10(np.abs(hu_moments) + 1e-7)
-    color_hist = cv2.calcHist([bgr_image], [0, 1, 2], mask, [8, 8, 8], [0, 256, 0, 256, 0, 256])
-    cv2.normalize(color_hist, color_hist)
-    gray_image = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2GRAY)
-    lbp = feature.local_binary_pattern(gray_image, P=24, R=3, method="uniform")
-    (texture_hist, _) = np.histogram(lbp[mask > 0], bins=np.arange(0, 27), range=(0, 26))
-    texture_hist = texture_hist.astype("float")
-    texture_hist /= (texture_hist.sum() + 1e-6)
-    return {
-        "shape_features": shape_features.tolist(),
-        "color_features": color_hist.flatten().tolist(),
-        "texture_features": texture_hist.tolist()
-    }
-
-def calculate_dynamic_weights(all_shape_scores, all_color_scores, stability_factor=0.4):
-    shape_scores, color_scores = np.array(all_shape_scores), np.array(all_color_scores)
-    def get_iqr(scores):
-        if len(scores) < 2: return 0
-        q3, q1 = np.percentile(scores, [75, 25])
-        return q3 - q1
-    shape_dispersion = get_iqr(shape_scores)
-    color_dispersion = get_iqr(color_scores)
-    inv_shape_disp = 1 / (shape_dispersion + stability_factor)
-    inv_color_disp = 1 / (color_dispersion + stability_factor)
-    total_inv_disp = inv_shape_disp + inv_color_disp
-    remaining_weight = 0.8
-    shape_weight = remaining_weight * (inv_shape_disp / total_inv_disp) if total_inv_disp > 0 else remaining_weight / 2
-    color_weight = remaining_weight * (inv_color_disp / total_inv_disp) if total_inv_disp > 0 else remaining_weight / 2
-    return {"shape": shape_weight, "color": color_weight, "texture": 0.2}
+    background = np.ones_like(image_np, dtype=np.uint8) * 255
+    foreground = cv2.bitwise_and(image_np, image_np, mask=mask.astype(np.uint8))
+    background = cv2.bitwise_and(background, background, mask=~mask.astype(np.uint8))
+    segmented_np = cv2.add(foreground, background)
+
+    return Image.fromarray(segmented_np, 'RGB')

 # ==========================================================================
 # --- FLASK ENDPOINTS ---
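A note on the white-background compositing added above: ~mask.astype(np.uint8) bit-flips a 0/1 array into 254/255, and OpenCV treats any non-zero mask value as "on", so the background mask may not end up being the true inverse of the foreground mask. A minimal sketch of an explicit inversion, assuming mask is the boolean array returned by SamPredictor.predict (the helper name is hypothetical and not part of this commit):

    import cv2
    import numpy as np
    from PIL import Image

    def composite_on_white(image_np: np.ndarray, mask: np.ndarray) -> Image.Image:
        # 255 where the object is, 0 elsewhere
        fg_mask = mask.astype(np.uint8) * 255
        # cv2.bitwise_not gives the complementary mask: 255 where the background is
        bg_mask = cv2.bitwise_not(fg_mask)
        white = np.full_like(image_np, 255)
        foreground = cv2.bitwise_and(image_np, image_np, mask=fg_mask)
        background = cv2.bitwise_and(white, white, mask=bg_mask)
        return Image.fromarray(cv2.add(foreground, background), 'RGB')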
@@ -157,34 +154,45 @@ def calculate_dynamic_weights(all_shape_scores, all_color_scores, stability_factor=0.4):

 @app.route('/', methods=['GET'])
 def health_check():
-    return jsonify({"status": "Unified AI Service is running"}), 200
+    return jsonify({"status": "Unified AI Service (DINOv2) is running"}), 200

 @app.route('/process', methods=['POST'])
 def process_item():
     try:
         data = request.json
+        print(f"\n[PROCESS] Received request for: {data.get('objectName')}")
+
         response = {
             "canonicalLabel": data.get('objectName', '').lower().strip(),
             "brand_embedding": get_text_embedding(data.get('brand')),
             "material_embedding": get_text_embedding(data.get('material')),
-            "
+            "size_embedding": get_text_embedding(data.get('size')),
+            "colors_embedding": get_text_embedding(data.get('colors')),
         }
-
+
+        image_embeddings = []
         if data.get('images'):
+            print(f" [PROCESS] Processing {len(data['images'])} image(s)...")
             for image_url in data['images']:
                 try:
                     img_response = requests.get(image_url, timeout=20)
                     img_response.raise_for_status()
                     image = Image.open(BytesIO(img_response.content))
-
-
-
+
+                    segmented_image = segment_guided_object(image, data['objectName'], data)
+                    embedding = get_image_embedding(segmented_image)
+                    image_embeddings.append(embedding)
                 except Exception as e:
                     print(f" - ⚠️ Could not process image {image_url}: {e}")
                     continue
-
+
+        response["image_embeddings"] = image_embeddings
+        print(f" [PROCESS] ✅ Successfully processed all features.")
         return jsonify(response), 200
+
     except Exception as e:
+        print(f"❌ Error in /process: {e}")
+        traceback.print_exc()
         return jsonify({"error": str(e)}), 500

 @app.route('/compare', methods=['POST'])
@@ -195,75 +203,48 @@ def compare_items():
         search_list = payload['searchList']
         print(f"\n[COMPARE] Received {len(search_list)} candidates for '{query_item.get('objectName')}'.")

-        # --- HIERARCHICAL FILTERING
-
-        # 1. Object Name Filtering
+        # --- HIERARCHICAL FILTERING ---
         query_label = query_item.get('canonicalLabel')
         if query_label:
             search_list = [item for item in search_list if item.get('canonicalLabel') == query_label]
-        print(f" [FILTER] After object name
+        print(f" [FILTER] After object name: {len(search_list)} candidates remain.")

-        # 2. Date Filtering (within 1 week)
         query_date_str = query_item.get('dateLost') or query_item.get('dateFound')
         query_date = datetime.fromisoformat(query_date_str.replace('Z', '+00:00'))
         one_week = timedelta(days=7)
-
-
-            item_date_str = item.get('dateFound') or item.get('dateLost')
-            if not item_date_str: return False
-            item_date = datetime.fromisoformat(item_date_str.replace('Z', '+00:00'))
-            return abs(query_date - item_date) <= one_week
+        search_list = [item for item in search_list if abs(query_date - datetime.fromisoformat((item.get('dateFound') or item.get('dateLost')).replace('Z', '+00:00'))) <= one_week]
+        print(f" [FILTER] After date: {len(search_list)} candidates remain.")

-        search_list = [item for item in search_list if is_within_week(item)]
-        print(f" [FILTER] After date filter (1 week): {len(search_list)} candidates remain.")
-
-        # 3. Location Filtering
         query_location = query_item.get('locationLost') or query_item.get('locationFound')
-
         if query_location and query_location != "Campus":
-
-
-            item_location = item.get('locationFound') or item.get('locationLost')
-            if item_location == query_location or item_location == "Campus":
-                filtered_by_location.append(item)
-            search_list = filtered_by_location
-
-        print(f" [FILTER] After location hierarchy: {len(search_list)} candidates remain for scoring.")
+            search_list = [item for item in search_list if (item.get('locationFound') or item.get('locationLost')) in [query_location, "Campus"]]
+        print(f" [FILTER] After location: {len(search_list)} candidates for scoring.")

-        # --- SCORING
+        # --- SCORING ---
         results = []
         for item in search_list:
             item_id = item.get('_id')
             try:
-                total_text_score
+                total_text_score = 0
                 for field in TEXT_FIELDS_TO_EMBED:
-                    q_emb
+                    q_emb = query_item.get(f"{field}_embedding")
+                    i_emb = item.get(f"{field}_embedding")
                     if q_emb and i_emb:
-
-
-                if query_item.get('colors'):
-                    score, weight = calculate_color_similarity(query_item['colors'], item.get('colors', [])), TEXT_FIELD_WEIGHTS.get('colors', 0)
-                    total_text_score += score * weight; total_text_weight += weight
-                if query_item.get('size'):
-                    score, weight = (1.0 if query_item['size'] == item.get('size') else 0.0), TEXT_FIELD_WEIGHTS.get('size', 0)
-                    total_text_score += score * weight; total_text_weight += weight
-                text_score = (total_text_score / total_text_weight) if total_text_weight > 0 else 0.0
+                        total_text_score += cosine_similarity(q_emb, i_emb)
+                text_score = total_text_score / len(TEXT_FIELDS_TO_EMBED) if TEXT_FIELDS_TO_EMBED else 0

                 image_score = 0.0
-
-
-
-
-
-
-
-
-
-                if all_shape_scores:
-                    weights = calculate_dynamic_weights(all_shape_scores, all_color_scores)
-                    image_score = (weights["shape"] * max(all_shape_scores) + weights["color"] * max(all_color_scores) + weights["texture"] * max(all_texture_scores))
+                query_img_embs = query_item.get('image_embeddings', [])
+                item_img_embs = item.get('image_embeddings', [])
+                if query_img_embs and item_img_embs:
+                    all_img_scores = []
+                    for q_emb in query_img_embs:
+                        for i_emb in item_img_embs:
+                            all_img_scores.append(cosine_similarity(q_emb, i_emb))
+                    if all_img_scores:
+                        image_score = max(all_img_scores)

-                final_score = (SCORE_WEIGHTS['text_score'] * text_score + SCORE_WEIGHTS['image_score'] * image_score)
+                final_score = (SCORE_WEIGHTS['text_score'] * text_score + SCORE_WEIGHTS['image_score'] * image_score)

                 if final_score >= FINAL_SCORE_THRESHOLD:
                     results.append({ "_id": str(item_id), "score": round(final_score, 4) })
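One scoring detail worth noting in the new code: text_score divides by len(TEXT_FIELDS_TO_EMBED), so a field whose embedding is missing on either item contributes 0 and pulls the average down rather than being skipped. A minimal sketch of an alternative that averages only over the fields both items actually carry (hypothetical helper, not part of this commit; it reuses the cosine_similarity defined above):

    def average_text_similarity(query_item: dict, item: dict, fields: list) -> float:
        scores = []
        for field in fields:
            q_emb = query_item.get(f"{field}_embedding")
            i_emb = item.get(f"{field}_embedding")
            if q_emb and i_emb:
                scores.append(cosine_similarity(q_emb, i_emb))
        # No comparable fields means no text evidence, so fall back to 0.0.
        return sum(scores) / len(scores) if scores else 0.0

Whether missing fields should be penalized is a judgment call; the committed behaviour treats an absent brand, material, size, or colors value as a mismatch.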
@@ -272,7 +253,7 @@ def compare_items():
                 continue

         results.sort(key=lambda x: x["score"], reverse=True)
-        print(f"\n[COMPARE] ✅ Search complete. Found {len(results)} potential matches
+        print(f"\n[COMPARE] ✅ Search complete. Found {len(results)} potential matches.")
         return jsonify({"matches": results}), 200

     except Exception as e:
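For reference, a minimal sketch of how a client might exercise the two endpoints, assuming the service runs on Flask's default local port; the host, field values, and URLs below are illustrative, and only the field names come from the code in this diff:

    import requests

    BASE = "http://localhost:5000"  # assumed host/port, not specified in the commit

    # /process extracts text embeddings (BGE) and image embeddings (DINOv2) for one item.
    lost_item = {
        "objectName": "Water Bottle",   # illustrative values
        "brand": "Hydro Flask",
        "material": "stainless steel",
        "size": "Medium",
        "colors": ["blue", "white"],
        "images": ["https://example.com/bottle.jpg"],  # placeholder URL
    }
    processed = requests.post(f"{BASE}/process", json=lost_item, timeout=120).json()
    # 'processed' holds canonicalLabel, the *_embedding fields and image_embeddings.

    # /compare filters stored candidates by name, date, and location, then scores them.
    payload = {
        "queryItem": {**lost_item, **processed,
                      "dateLost": "2024-05-01T10:00:00Z", "locationLost": "Library"},
        "searchList": [],  # previously processed items, each carrying an _id
    }
    matches = requests.post(f"{BASE}/compare", json=payload, timeout=120).json()
    print(matches)  # {"matches": [{"_id": ..., "score": ...}, ...]}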
|