| |
|
| |
|
| |
|
| |
|
| | import spacy
|
| | from spacy.tokens import Doc, Span, Token
|
| | from datetime import datetime
|
| | from typing import List, Optional, Tuple
|
| |
|
| |
|
# Import the core data classes.  On failure we fall back to None sentinels so
# the module can still be imported; consumers guard on these before use.
try:
    from aee_core_classes_era import Proposition, EpistemicData
except ImportError:
    print("Extractor Error: Could not import Proposition/EpistemicData class from aee_core_classes_era.py.")
    Proposition = None
    EpistemicData = None
|
| |
|
| |
|
# Load the spaCy English pipeline once at module import time.  NLP_MODEL stays
# None when the model package is not installed; consumers check for that.
NLP_MODEL = None
MODEL_NAME = "en_core_web_sm"
try:
    NLP_MODEL = spacy.load(MODEL_NAME)
except OSError:
    print(f"Extractor Error: spaCy English model '{MODEL_NAME}' not found. Please run: python -m spacy download {MODEL_NAME}")
|
| |
|
| |
|
def process_with_spacy(text: str) -> Optional[Doc]:
    """Process *text* with the module-level spaCy pipeline and return the Doc.

    Returns None when the model failed to load at import time, or when spaCy
    raises while processing the text (best-effort: errors are printed, not
    propagated).
    """
    if NLP_MODEL is None:
        # Module-level load failed (model package missing); nothing to do.
        # Fixed: original used an f-string with no placeholders here.
        print("Error: spaCy model not loaded. Cannot process text.")
        return None

    try:
        return NLP_MODEL(text)
    except Exception as e:
        print(f"Error processing text with spaCy: {e}")
        return None
|
| |
|
| |
|
def get_token_lemma(token: Optional[Token]) -> Optional[str]:
    """Return the lemma (root form) of *token* lower-cased, or None if no token."""
    if not token:
        return None
    return token.lemma_.lower()
|
| |
|
def find_negation(token: Optional[Token], sentence: Span) -> bool:
    """Return True when *token* (typically a verb or value token) is negated.

    Checks:
      1. a direct child with the "neg" dependency,
      2. an adverbial child ("advmod") whose lemma is "not",
      3. a "neg" child attached to an AUX head (e.g. "does not seem").

    Note: the original also re-scanned children for "neg" when the lemma was
    "be"; that scan was byte-identical to check 1 and has been removed.
    *sentence* is kept for interface compatibility but is not consulted.
    """
    if not token:
        return False

    # Checks 1 and 2: negation attached directly to the token.
    for child in token.children:
        if child.dep_ == "neg":
            return True
        if child.dep_ == "advmod" and child.lemma_ == "not":
            return True

    # Check 3: negation attached to an auxiliary head ("is not", "does not").
    if token.head != token and token.head.pos_ == "AUX":
        for child in token.head.children:
            if child.dep_ == "neg":
                return True

    return False
|
| |
|
| |
|
| |
|
def get_source_based_confidence(source_id: str) -> float:
    """Return a baseline confidence score derived from the source identifier.

    Rules are evaluated in order; the first matching tier wins, and sources
    that match no rule fall back to 0.55.
    """
    sid = source_id.lower()

    def mentions(*needles: str) -> bool:
        # True when any of the needles occurs in the lower-cased source id.
        return any(needle in sid for needle in needles)

    if mentions("user", "comment", "diary"):
        return 0.45
    if mentions("blog", "opinion", "forum"):
        return 0.50
    if mentions("news", ".com", ".org", ".net"):
        return 0.65
    if mentions("wiki"):
        return 0.70
    if mentions("report", "fact_sheet"):
        return 0.75
    if mentions("textbook", ".edu"):
        return 0.80
    if mentions("science", "research", "expert", "paper"):
        return 0.85
    if mentions("common_knowledge"):
        return 0.90
    return 0.55
|
| |
|
def calculate_linguistic_confidence_modifier(sent: Span) -> float:
    """Return a confidence multiplier based on hedging/certainty cues in *sent*.

    An uncertainty marker (modal/hedging verb or adverb) dampens confidence
    (x0.80); a certainty marker boosts it (x1.15); uncertainty takes
    precedence when both appear.  Returns 1.0 when neither cue is found.

    Fixed: removed locals that were never read (`negation_markers`,
    `has_negation_cue`, and the per-token `dep`).
    """
    uncertainty_markers = {"may", "might", "could", "perhaps", "possibly", "suggest", "appear", "seem", "likely", "probably", "believe", "think", "assume", "sometimes"}
    certainty_markers = {"will", "must", "definitely", "certainly", "undoubtedly", "always", "never", "prove", "confirm", "show", "demonstrate", "fact"}

    # Uncertainty: modal/hedging AUX or VERB, or a hedging adverb.
    has_uncertainty = any(
        token.pos_ in ("AUX", "VERB", "ADV") and token.lemma_.lower() in uncertainty_markers
        for token in sent
    )
    if has_uncertainty:
        return 0.80

    # Certainty is only consulted when no uncertainty cue was found.
    has_certainty = any(
        token.pos_ in ("ADV", "VERB") and token.lemma_.lower() in certainty_markers
        for token in sent
    )
    return 1.15 if has_certainty else 1.0
|
| |
|
| |
|
# Bounds used to clamp each proposition's computed initial confidence
# (see extract_propositions_era), keeping scores strictly inside (0, 1).
MIN_CONFIDENCE = 0.01
MAX_CONFIDENCE = 0.99
|
| |
|
| |
|
| | def extract_propositions_era(doc: Doc, source_id: str) -> List[Proposition]:
|
| | """
|
| | Era Sürümü: Önermeleri çıkarır ve başlangıç güvenini hem kaynağa
|
| | hem de dilbilimsel ifadelere göre ayarlar.
|
| | """
|
| | propositions: List[Proposition] = []
|
| | if not doc or not Proposition or not EpistemicData: return propositions
|
| |
|
| | for sent in doc.sents:
|
| |
|
| | root: Token = sent.root
|
| | subject: Optional[Token] = None; prop_object: Optional[Token] = None; attribute: Optional[Token] = None
|
| |
|
| |
|
| |
|
| | for token in sent:
|
| | if token.dep_ in ["nsubj", "nsubjpass"] and token.head == root:
|
| | subject = token
|
| | break
|
| | if not subject:
|
| |
|
| | continue
|
| |
|
| |
|
| | for token in sent:
|
| | if token.head == root:
|
| | if token.dep_ in ["dobj", "pobj"]:
|
| | prop_object = token
|
| | elif token.dep_ in ["attr", "acomp", "xcomp"]:
|
| | attribute = token
|
| |
|
| |
|
| |
|
| | relation_lemma = get_token_lemma(root)
|
| | subject_lemma = get_token_lemma(subject)
|
| | value_lemma = None
|
| | value_token = attribute if attribute else prop_object
|
| |
|
| | if value_token:
|
| | value_lemma = get_token_lemma(value_token)
|
| |
|
| | is_negated = find_negation(value_token, sent)
|
| | else:
|
| |
|
| |
|
| | continue
|
| |
|
| |
|
| | if not is_negated:
|
| | is_negated = find_negation(root, sent)
|
| |
|
| |
|
| |
|
| | if subject_lemma and relation_lemma and value_lemma:
|
| |
|
| | source_based_conf = get_source_based_confidence(source_id)
|
| | linguistic_modifier = calculate_linguistic_confidence_modifier(sent)
|
| | initial_confidence = max(MIN_CONFIDENCE, min(MAX_CONFIDENCE, source_based_conf * linguistic_modifier))
|
| |
|
| |
|
| | source_type = None; sid_lower = source_id.lower()
|
| |
|
| | if "user" in sid_lower: source_type = "user"
|
| | elif "news" in sid_lower: source_type = "news"
|
| |
|
| | elif "common" in sid_lower: source_type = "common"
|
| | elif "textbook" in sid_lower: source_type = "textbook"
|
| |
|
| |
|
| |
|
| | ep_data = EpistemicData(
|
| | source_id=source_id,
|
| | initial_confidence=initial_confidence,
|
| | source_type=source_type
|
| | )
|
| |
|
| |
|
| | new_prop = Proposition(
|
| | text_span=sent.text,
|
| | sentence_text=sent.text,
|
| | epistemic_data=ep_data,
|
| | subject_lemma=subject_lemma,
|
| | relation_lemma=relation_lemma,
|
| | value_lemma=value_lemma,
|
| | is_negated=is_negated
|
| | )
|
| |
|
| | propositions.append(new_prop)
|
| |
|
| |
|
| |
|
| |
|
| | return propositions
|
| |
|
| |
|
if __name__ == "__main__":
    print("\nTesting AEE Extractor Module (Era Version - Linguistic Confidence)...")

    if not NLP_MODEL:
        print("Cannot run tests because spaCy model is not loaded.")
    else:
        print("Creating test sentences...")
        sample_texts = (
            "The sky is blue.",
            "The sky is not blue.",
            "The sky might be blue.",
            "The sky is definitely blue.",
            "System A is bigger than System B.",
        )

        for sample in sample_texts:
            print(f"\nProcessing: '{sample}'")
            parsed = process_with_spacy(sample)
            if not parsed:
                print(" Failed to process with spaCy.")
                continue
            for prop in extract_propositions_era(parsed, "test_source"):
                print(f" Extracted: {prop}")
                print(f" Subject: {prop.subject_lemma}, Relation: {prop.relation_lemma}, Value: {prop.value_lemma}")
                print(f" Negated: {prop.is_negated}, Confidence: {prop.epistemic_data.initial_confidence:.2f}")