File size: 6,414 Bytes
dbca390
 
c4f7a31
dbca390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4f7a31
dbca390
c4f7a31
 
 
 
 
 
dbca390
c4f7a31
dbca390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb9dd9f
 
 
 
dbca390
 
 
fb9dd9f
 
 
 
 
 
 
 
 
 
 
dbca390
 
 
 
 
 
 
 
 
 
 
 
 
 
fb9dd9f
dbca390
 
 
fb9dd9f
dbca390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fb9dd9f
dbca390
 
fb9dd9f
 
dbca390
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
import gensim
import gensim.downloader
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd
import os
from supabase import acreate_client, AsyncClient
from dotenv import load_dotenv

class Vectorizer:
    """
    A class to:
        - Generate embeddings of words
        - Query for words from Supabase database based on vector similarity
        - Return matching ASL videos for words
    """

    def load_kv(self, model_name='word2vec-google-news-300'):
        """
        Return a gensim KeyedVectors object.

        Tries a local binary copy first; falls back to downloading the
        model via gensim.downloader. Returns None if both fail.
        """
        model_path = os.path.join(os.getcwd(), 'gensim-data', 'GoogleNews-vectors-negative300.bin.gz')
        try:
            print(f"Loading model from {model_path}")
            kv = KeyedVectors.load_word2vec_format(model_path, binary=True)
            print("Word2Vec model loaded successfully as KeyedVectors object.")
            return kv
        except FileNotFoundError:
            print(f"Error: Model file not found at {model_path}. Trying to download...")
            kv = gensim.downloader.load(model_name)  # returns a KeyedVectors
            print("Word2Vec model loaded successfully as KeyedVectors object.")
            return kv
        except Exception as e:
            print(f"Unable to load embedding model from gensim: {e}")
            return None

    async def initialize_supabase(self):
        """
        Create and return an async Supabase client from environment credentials.

        Raises:
            RuntimeError: if SUPABASE_URL or SUPABASE_KEY is unset, instead of
                passing None into acreate_client and failing obscurely.
        """
        url = os.environ.get("SUPABASE_URL")
        key = os.environ.get("SUPABASE_KEY")
        if not url or not key:
            raise RuntimeError(
                "SUPABASE_URL and SUPABASE_KEY environment variables must be set"
            )
        supabase: AsyncClient = await acreate_client(url, key)
        return supabase

    def __init__(self):
        load_dotenv()
        self.kv = self.load_kv()
        self.supabase = None  # Will be initialized when needed

    async def ensure_supabase_initialized(self):
        """Ensure Supabase client is initialized"""
        if self.supabase is None:
            self.supabase = await self.initialize_supabase()

    def encode(self, word):
        """
        Return the embedding vector (numpy array) for *word*, or None.

        If the word is not in the vocabulary, simple case variants are tried
        (lower/capitalized/upper) before giving up.
        """
        print(f"encoding {word}")
        if self.kv is None:
            print("KeyedVectors not loaded")
            return None
        if word in self.kv.key_to_index:
            return self.kv[word]
        print(f"Error: {word} is not in the KeyedVector's vocabulary")
        # BUG FIX: most_similar() raises KeyError when its argument is itself
        # out-of-vocabulary, so the previous "closest match" fallback could
        # never succeed. Case variants are a fallback that actually works.
        for variant in (word.lower(), word.capitalize(), word.upper()):
            if variant != word and variant in self.kv.key_to_index:
                print(f"Using closest match '{variant}' for '{word}'")
                return self.kv[variant]
        print(f"No similar words found for '{word}'")
        return None

    def encode_and_format(self, word):
        """
        Apply encoding function to each word.
        Prettify the encoding to match expected format for Supabase vectors
        (a bracketed comma-separated list, e.g. "[0.1,0.2]").
        """
        enc = self.encode(word)
        return "[" + ",".join(map(str, enc.tolist())) + "]" if enc is not None else None

    async def vector_query_from_supabase(self, query):
        """
        Look up the closest stored word/video for *query* via the
        `match_vector` RPC. Returns a dict with a "match" flag plus either
        the match details or an "error" message.
        """
        try:
            await self.ensure_supabase_initialized()
            query_embedding = self.encode(query)

            if query_embedding is None:
                return {
                    "match": False,
                    "error": f"'{query}' not in vocabulary and no similar words found"
                }

            query_embedding = query_embedding.tolist()

            if self.supabase is not None:
                result = await self.supabase.rpc(
                    "match_vector",
                    {
                        "query_embedding": query_embedding,
                        "match_threshold": 0.0,
                        "match_count": 1
                    }
                ).execute()

                data = result.data
                if data:
                    match = data[0]
                    return {
                        "match": True,
                        "query": query,
                        "matching_word": match["word"],
                        "video_url": match["video_url"],
                        "similarity": match["similarity"]
                    }
                else:
                    return {"match": False}
            else:
                return {"match": False, "error": "Supabase not initialized"}

        except Exception as e:
            print(f"RPC call failed: {e}")
            return {"match": False, "error": str(e)}


def load_filtered_kv(model_name='word2vec-google-news-300', vocab=None):
    """
    Return a KeyedVectors object whose vocabulary consists of the words
    in *vocab* that exist in the downloaded model.

    If vocab is None or empty, the full model is returned unchanged.
    Returns None if the model cannot be loaded.
    """
    if vocab is None:
        vocab = []
    try:
        # gensim.downloader.load returns a KeyedVectors instance
        original_kv = gensim.downloader.load(model_name)
        if not vocab:
            return original_kv

        # Keep only the requested words that the model actually knows.
        filtered = {
            key: original_kv[key]
            for key in vocab
            if key in original_kv.key_to_index
        }
        new_kv = gensim.models.KeyedVectors(
            vector_size=original_kv.vector_size)
        new_kv.add_vectors(list(filtered.keys()),
                           np.array(list(filtered.values())))
        # BUG FIX: previously returned original_kv here, discarding the
        # filtered model that was just built.
        return new_kv
    except Exception as e:
        print(f"Unable to load embedding model from gensim: {e}")
        return None


async def main():
    """Smoke-test the Vectorizer: local encoding plus two Supabase lookups."""
    vec = Vectorizer()

    # Exact-vocabulary encode
    print(vec.encode("test"))

    # Database similarity queries (closest-match fallback for OOV words)
    for word in ("dog", "cat"):
        print(await vec.vector_query_from_supabase(word))

    # One-off pipeline for (re)building the vectors CSV — kept for reference:
    # df = pd.read_csv('videos_rows.csv')
    #
    # # Add embeddings column - apply encode to each word
    # df['embedding'] = df['word'].apply(vec.encode_and_format)
    #
    # # Drop any rows that don't have an embedding
    # df = df.dropna(subset=['embedding'])
    # print(df.head())
    #
    # df.to_csv("vectors.csv", index=False, columns=["word", "video_url", "embedding"], header=True)


# Script entry point: run the async demo on a fresh event loop.
if __name__ == "__main__":
    import asyncio

    asyncio.run(main())