Spaces:

Sengil
/

aspect_term_extraction

Sleeping

Mert Şengil committed on Jun 13

Commit

ae19be7

1 Parent(s): 19995b3

Add filtering to show only aspect terms present in original text

Files changed (1) hide show

app.py CHANGED Viewed

@@ -25,6 +25,16 @@ def is_valid_aspect(word):
         word.isalpha()
     )
 def extract_and_rank_aspects(text, max_tokens=64, beams=5):
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
@@ -46,7 +56,10 @@ def extract_and_rank_aspects(text, max_tokens=64, beams=5):
     all_terms = []
     for pred in all_predictions:
         candidates = re.split(r"[;,–—\-]|(?:\s*,\s*)", pred)
-        all_terms.extend([w.strip().lower() for w in candidates if is_valid_aspect(w)])
     ranked = Counter(all_terms).most_common()
     return ranked
@@ -93,7 +106,7 @@ with gr.Blocks(title="🇹🇷 Türkçe Aspect Term Extraction", theme=gr.themes
         with gr.Column():
             output = gr.Markdown(
                 label="📊 Sonuçlar",
-                value="Sonuçlar burada görünecek..."
             )
     # Example texts

         word.isalpha()
     )
+def is_aspect_in_text(aspect_term, original_text):
+    """Aspect term'in orijinal metinde geçip geçmediğini kontrol eder"""
+    # Case-insensitive karşılaştırma
+    text_lower = original_text.lower()
+    aspect_lower = aspect_term.lower()
+    # Word boundary ile tam kelime araması
+    pattern = r'\b' + re.escape(aspect_lower) + r'\b'
+    return bool(re.search(pattern, text_lower, re.IGNORECASE))
 def extract_and_rank_aspects(text, max_tokens=64, beams=5):
     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
     all_terms = []
     for pred in all_predictions:
         candidates = re.split(r"[;,–—\-]|(?:\s*,\s*)", pred)
+        # Sadece orijinal metinde geçen aspect term'leri ekle
+        for candidate in candidates:
+            if is_valid_aspect(candidate) and is_aspect_in_text(candidate.strip(), text):
+                all_terms.append(candidate.strip().lower())
     ranked = Counter(all_terms).most_common()
     return ranked
         with gr.Column():
             output = gr.Markdown(
                 label="📊 Sonuçlar",
+                value="📊 Sonuçlar"
             )
     # Example texts