Mert Şengil committed on
Commit
ae19be7
·
1 Parent(s): 19995b3

Add filtering to show only aspect terms present in original text

Browse files
Files changed (1) hide show
  1. app.py +15 -2
app.py CHANGED
@@ -25,6 +25,16 @@ def is_valid_aspect(word):
25
  word.isalpha()
26
  )
27
 
 
 
 
 
 
 
 
 
 
 
28
  def extract_and_rank_aspects(text, max_tokens=64, beams=5):
29
  inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
30
 
@@ -46,7 +56,10 @@ def extract_and_rank_aspects(text, max_tokens=64, beams=5):
46
  all_terms = []
47
  for pred in all_predictions:
48
  candidates = re.split(r"[;,–—\-]|(?:\s*,\s*)", pred)
49
- all_terms.extend([w.strip().lower() for w in candidates if is_valid_aspect(w)])
 
 
 
50
 
51
  ranked = Counter(all_terms).most_common()
52
  return ranked
@@ -93,7 +106,7 @@ with gr.Blocks(title="🇹🇷 Türkçe Aspect Term Extraction", theme=gr.themes
93
  with gr.Column():
94
  output = gr.Markdown(
95
  label="📊 Sonuçlar",
96
- value="Sonuçlar burada görünecek..."
97
  )
98
 
99
  # Example texts
 
25
  word.isalpha()
26
  )
27
 
28
+ def is_aspect_in_text(aspect_term, original_text):
29
+ """Aspect term'in orijinal metinde geçip geçmediğini kontrol eder"""
30
+ # Case-insensitive karşılaştırma
31
+ text_lower = original_text.lower()
32
+ aspect_lower = aspect_term.lower()
33
+
34
+ # Word boundary ile tam kelime araması
35
+ pattern = r'\b' + re.escape(aspect_lower) + r'\b'
36
+ return bool(re.search(pattern, text_lower, re.IGNORECASE))
37
+
38
  def extract_and_rank_aspects(text, max_tokens=64, beams=5):
39
  inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(DEVICE)
40
 
 
56
  all_terms = []
57
  for pred in all_predictions:
58
  candidates = re.split(r"[;,–—\-]|(?:\s*,\s*)", pred)
59
+ # Sadece orijinal metinde geçen aspect term'leri ekle
60
+ for candidate in candidates:
61
+ if is_valid_aspect(candidate) and is_aspect_in_text(candidate.strip(), text):
62
+ all_terms.append(candidate.strip().lower())
63
 
64
  ranked = Counter(all_terms).most_common()
65
  return ranked
 
106
  with gr.Column():
107
  output = gr.Markdown(
108
  label="📊 Sonuçlar",
109
+ value="📊 Sonuçlar"
110
  )
111
 
112
  # Example texts