import logging
import os

import nltk
import spacy
from flask import Flask, jsonify, request
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

# BUG FIX: nltk builds its data search path (nltk.data.path) from the
# NLTK_DATA env var when the package is first imported, so mutating
# os.environ *after* `import nltk` has no effect on corpus lookups.
# Append the directory to the search path directly; the env var is still
# set for the benefit of any child processes.
os.environ["NLTK_DATA"] = "/usr/local/nltk_data"
if "/usr/local/nltk_data" not in nltk.data.path:
    nltk.data.path.append("/usr/local/nltk_data")

# Fail fast at startup if the WordNet corpus is missing.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    raise RuntimeError("NLTK wordnet data not found. Make sure it's downloaded in Docker build.")

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize Flask app and NLP tools
app = Flask(__name__)
nlp = spacy.load("en_core_web_sm")
lemmatizer = WordNetLemmatizer()


# --- NLP Helper Functions ---

def extract_main_noun(text):
    """Return the last NOUN token in *text*, assuming it is the primary object.

    In English noun phrases the head usually comes last
    (e.g. 'leather wallet' -> 'wallet').  Falls back to the full input
    string when spaCy finds no noun at all.
    """
    doc = nlp(text)
    nouns = [token.text for token in doc if token.pos_ == "NOUN"]
    return nouns[-1] if nouns else text


def check_singular_form(word1, word2):
    """Return True if both words lemmatize to the same singular noun
    (e.g. 'glasses' vs 'glass')."""
    lemma1 = lemmatizer.lemmatize(word1.lower(), pos='n')
    lemma2 = lemmatizer.lemmatize(word2.lower(), pos='n')
    return lemma1 == lemma2


def check_synonyms(word1, word2):
    """Return True if the words share any WordNet noun lemma
    (e.g. 'wallet' vs 'billfold')."""
    lemmas1 = {lemma.name()
               for synset in wn.synsets(word1, pos=wn.NOUN)
               for lemma in synset.lemmas()}
    lemmas2 = {lemma.name()
               for synset in wn.synsets(word2, pos=wn.NOUN)
               for lemma in synset.lemmas()}
    # isdisjoint avoids materializing the intersection.
    return not lemmas1.isdisjoint(lemmas2)


def check_hyponym(word1, word2):
    """Return True if either word is a *direct* WordNet hyponym of the other.

    Only one level of hypernyms is inspected (e.g. 'bottle' -> 'vessel').
    Distant relations are deliberately NOT matched: walking the full
    hypernym closure would collapse nearly every noun into top-level
    concepts such as 'entity'.
    """
    syns1 = wn.synsets(word1, pos=wn.NOUN)
    syns2 = wn.synsets(word2, pos=wn.NOUN)
    if not syns1 or not syns2:
        return False
    # Set lookups instead of list membership: O(1) per hypernym test.
    targets2 = set(syns2)
    if any(hyper in targets2 for s in syns1 for hyper in s.hypernyms()):
        return True
    targets1 = set(syns1)
    return any(hyper in targets1 for s in syns2 for hyper in s.hypernyms())


# --- Core Logic for Label Assignment ---

def find_canonical_label(object_name, existing_labels):
    """
    Assigns a canonical label to a new object by checking against existing labels.
    If no match is found, it returns the lemmatized (singular) noun of the new object.
    """
    noun = extract_main_noun(object_name)
    for label in existing_labels:
        if (check_singular_form(noun, label)
                or check_synonyms(noun, label)
                or check_hyponym(noun, label)):
            logger.info("Match found: '%s' -> Existing label '%s'", noun, label)
            return label
    new_label = lemmatizer.lemmatize(noun.lower(), pos='n')
    logger.info("No match found for '%s'. Creating new canonical label: '%s'", noun, new_label)
    return new_label


# --- API Endpoint ---

@app.route('/get-canonical-label', methods=['POST'])
def assign_label_endpoint():
    """
    API endpoint to receive an object name and existing labels,
    and return a canonical label.

    Expects JSON: {"object_name": str, "existing_labels": [str, ...]}
    Returns 400 with a JSON error on malformed payloads.
    """
    # silent=True: malformed/absent JSON yields None instead of Flask's
    # default HTML error page, so every failure mode gets our JSON 400.
    data = request.get_json(silent=True)
    if not data or 'object_name' not in data or 'existing_labels' not in data:
        return jsonify({"error": "Request must include 'object_name' and 'existing_labels'"}), 400
    object_name = data['object_name']
    existing_labels = data['existing_labels']
    # Validate payload types so bad input yields a 400 rather than a 500.
    if not isinstance(object_name, str) or not isinstance(existing_labels, list):
        return jsonify({"error": "'object_name' must be a string and 'existing_labels' a list"}), 400
    canonical_label = find_canonical_label(object_name, existing_labels)
    return jsonify({"canonical_label": canonical_label})


if __name__ == '__main__':
    # Run the service on port 7860, the default for Hugging Face Spaces.
    # SECURITY FIX: debug=True exposes the Werkzeug interactive debugger
    # (arbitrary code execution) on 0.0.0.0 -- never enable it on a
    # deployed service.
    app.run(host='0.0.0.0', port=7860, debug=False)