Spaces:

ludekcizinsky
/

homepage2vec

Runtime error

App Files Files Community

ludekcizinsky committed on Dec 21, 2023

Commit

8e53f74

1 Parent(s): 0a71fa6

feat(init): init commit

Browse files

Files changed (16) hide show

.DS_Store +0 -0
.python-version +1 -0
app.py +57 -0
homepage2vec/__init__.py +3 -0
homepage2vec/__pycache__/__init__.cpython-310.pyc +0 -0
homepage2vec/__pycache__/data_collection.cpython-310.pyc +0 -0
homepage2vec/__pycache__/model.cpython-310.pyc +0 -0
homepage2vec/__pycache__/textual_extractor.cpython-310.pyc +0 -0
homepage2vec/data_collection.py +42 -0
homepage2vec/model.py +192 -0
homepage2vec/textual_extractor.py +341 -0
models/.DS_Store +0 -0
models/gpt3.5/features.txt +8 -0
models/gpt3.5/model.pt +3 -0
pyproject.toml +20 -0
requirements.txt +6 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.10

app.py ADDED Viewed

	@@ -0,0 +1,57 @@

+import gradio as gr
+from typing import Dict
+import os
+from homepage2vec.model import WebsiteClassifier as Homepage2Vec
# Example inputs shown below the demo form; each entry is [model choice, website url].
EXAMPLES = [
    ["gpt3.5", site]
    for site in (
        "tanjasenghaasdesigns.de",
        "epfl.ch",
        "cc.cz",
        "promaminky.cz",
    )
]
# Classifiers that were already loaded, keyed by model directory, so the
# weights are read from disk once per model instead of on every request.
_MODEL_CACHE = {}


def predict(model_choice: str, url: str) -> Dict[str, float]:
    """
    Predict the categories of a website using the Homepage2Vec model.
    Args:
        model_choice (str): The model to use for prediction (name of a
            sub-directory of ``models``).
        url (str): The url of the website to predict.
    Returns:
        Dict[str, float]: The categories whose score is greater than 0.5,
            mapped to their scores.
    Raises:
        gr.Error: If no model or url was given, the model directory does not
            exist, or no HTML page could be fetched from the url. Gradio shows
            the message to the user.
    """
    if not model_choice:
        raise gr.Error("Please select a model.")
    url = (url or "").strip()
    if not url:
        raise gr.Error("Please enter a website URL.")

    # Define the model directory path
    model_dir = os.path.join("models", model_choice)
    if not os.path.isdir(model_dir):
        raise gr.Error(f"Model '{model_choice}' is not available.")

    # Initialise the model on first use only
    model = _MODEL_CACHE.get(model_dir)
    if model is None:
        model = _MODEL_CACHE[model_dir] = Homepage2Vec(model_dir=model_dir)

    # Website to predict
    website = model.fetch_website(url)
    if not website.is_valid:
        # fetch_website leaves `html` as None for failed or non-HTML responses;
        # there is nothing to extract features from in that case.
        status = website.http_code or "no response"
        raise gr.Error(
            f"Could not load an HTML page from '{url}' (HTTP status: {status})."
        )

    # Obtain scores and embeddings
    scores, _ = model.predict(website)

    # Keep only scores that have a value greater than 0.5
    return {label: score for label, score in scores.items() if score > 0.5}
def _available_models(root="models", fallback=("gpt3.5", "gpt4")):
    """
    Return the names of the model directories under `root` that can be loaded.

    A directory counts as a model when it holds both ``model.pt`` and
    ``features.txt``. If `root` cannot be listed or holds no such directory,
    the names in `fallback` are returned so the dropdown is never empty.
    """
    try:
        found = sorted(
            name
            for name in os.listdir(root)
            if os.path.isfile(os.path.join(root, name, "model.pt"))
            and os.path.isfile(os.path.join(root, name, "features.txt"))
        )
    except OSError:
        found = []
    return found or list(fallback)


# Only offer models that are actually present, and preselect the first one so
# that `predict` never receives an empty model choice.
_MODEL_CHOICES = _available_models()

iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Dropdown(
            choices=_MODEL_CHOICES,
            value=_MODEL_CHOICES[0],
            label="Select Model",
        ),
        gr.Textbox(label="Enter Website URL", placeholder="www.mikasenghaas.de"),
    ],
    outputs=gr.Label(num_top_classes=14, label="Predicted Labels", show_label=True),
    title="Homepage2Vec",
    description="Use Homepage2Vec to predict the categories of any website you wish.",
    examples=EXAMPLES,
    live=False,
    allow_flagging="never",
)
iface.launch()

homepage2vec/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+"""
+Adapted version of the code from Homepage2Vec (https://github.com/epfl-dlab/homepage2vec).
+"""

homepage2vec/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (268 Bytes). View file

homepage2vec/__pycache__/data_collection.cpython-310.pyc ADDED Viewed

Binary file (1.33 kB). View file

homepage2vec/__pycache__/model.cpython-310.pyc ADDED Viewed

Binary file (5.42 kB). View file

homepage2vec/__pycache__/textual_extractor.cpython-310.pyc ADDED Viewed

Binary file (8.94 kB). View file

homepage2vec/data_collection.py ADDED Viewed

	@@ -0,0 +1,42 @@

+"""
+Module to access and load a webpage to be used by the homepage2vec model.
+Includes:
+    - access_website: Function to access a website and return its response.
+Note: TimeoutException and time_limit, which this docstring previously listed,
+are not defined in this module; the request timeout is passed to `requests` instead.
+"""
+import requests
def access_website(url, timeout=10):
    """
    Fetch a url and return its content, or None if anything went wrong.

    Args:
        url: Address of the page. "http://" is prepended when the url does not
            already start with an http(s) scheme.
        timeout: Timeout in seconds passed to `requests.get`.

    Returns:
        A tuple `(text, status_code, content_type)`, or None if the request
        (or decoding the response) raised an exception.
    """
    import logging  # imported locally so this function stays self-contained

    try:
        # change user-agent so that we don't look like a bot
        headers = requests.utils.default_headers()
        headers.update(
            {
                "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.16; rv:84.0) Gecko/20100101 Firefox/84.0",
            }
        )
        if not url.lower().startswith(("http://", "https://")):
            url = "http://" + url
        r_get = requests.get(url, timeout=timeout, headers=headers)

        # `encoding` is None when the response headers give no usable charset;
        # fall back to the detected encoding in that case as well instead of
        # failing on `None.lower()`.
        declared = r_get.encoding
        if declared is None or declared.lower() != "utf-8":
            r_get.encoding = r_get.apparent_encoding

        content_type = r_get.headers.get("content-type", "?").strip()
        return r_get.text, r_get.status_code, content_type
    except Exception:
        # Deliberately best-effort: callers treat None as "could not fetch".
        # Record the reason so failures remain diagnosable.
        logging.getLogger(__name__).debug(
            "Could not access %r", url, exc_info=True
        )
        return None

homepage2vec/model.py ADDED Viewed

	@@ -0,0 +1,192 @@

+"""
+Module that defines the Homepage2vec model (consisting of a textual extractor and a classifier).
+Includes:
+    - WebsiteClassifier: Class to load and use the Homepage2vec model.
+    - SimpleClassifier: Class to define the architecture of the Homepage2vec model.
+    - Webpage: Class to define a webpage query.
+"""
+import json
+import os
+import tempfile
+import uuid
+from typing import OrderedDict
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+from homepage2vec.data_collection import access_website
+from homepage2vec.textual_extractor import TextualExtractor
+class WebsiteClassifier:
+    """
+    Pretrained Homepage2vec model
+    """
+    def __init__(
+        self,
+        model_dir: str,
+        device=None,
+        cpu_threads_count=1,
+        dataloader_workers=1,
+        state_dict: OrderedDict | None = None,
+    ):
+        self.input_dim = 4665
+        self.output_dim = 14
+        self.classes = [
+            "Arts",
+            "Business",
+            "Computers",
+            "Games",
+            "Health",
+            "Home",
+            "Kids_and_Teens",
+            "News",
+            "Recreation",
+            "Reference",
+            "Science",
+            "Shopping",
+            "Society",
+            "Sports",
+        ]
+        self.temporary_dir = tempfile.gettempdir() + "/homepage2vec/"
+        self.device = device
+        self.dataloader_workers = dataloader_workers
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        if not device:
+            if torch.cuda.is_available():
+                self.device = "cuda:0"
+            else:
+                self.device = "cpu"
+                torch.set_num_threads(cpu_threads_count)
+        # Load state dict if not specified
+        if not state_dict:
+            weight_path = os.path.join(model_dir, "model.pt")
+            state_dict = torch.load(weight_path, map_location=torch.device(self.device))
+        # Load pretrained model
+        self.model = SimpleClassifier(self.input_dim, self.output_dim)
+        self.model.load_state_dict(state_dict)
+        # features used in training
+        self.features_order = []
+        self.features_dim = {}
+        feature_path = os.path.join(model_dir, "features.txt")
+        with open(feature_path, "r") as file:
+            for f in file:
+                name = f.split(" ")[0]
+                dim = int(f.split(" ")[1][:-1])
+                self.features_order.append(name)
+                self.features_dim[name] = dim
+    def get_scores(self, x):
+        with torch.no_grad():
+            self.model.eval()
+            return self.model.forward(x)
+    def fetch_website(self, url):
+        response = access_website(url)
+        w = Webpage(url)
+        if response is not None:
+            html, get_code, content_type = response
+            w.http_code = get_code
+            if self.is_valid(get_code, content_type):
+                w.is_valid = True
+                w.html = html
+        return w
+    def get_features(self, url, html, screenshot_path):
+        te = TextualExtractor(self.device)
+        features = te.get_features(url, html)
+        return features
+    def predict(self, website):
+        website.features = self.get_features(
+            website.url, website.html, website.screenshot_path
+        )
+        all_features = self.concatenate_features(website)
+        input_features = torch.FloatTensor(all_features)
+        scores, embeddings = self.get_scores(input_features)
+        return (
+            dict(zip(self.classes, torch.sigmoid(scores).tolist())),
+            embeddings.tolist(),
+        )
+    def concatenate_features(self, w):
+        """
+        Concatenate the features attributes of webpage instance, with respect to the features order in h2v
+        """
+        v = np.zeros(self.input_dim)
+        ix = 0
+        for f_name in self.features_order:
+            f_dim = self.features_dim[f_name]
+            f_value = w.features[f_name]
+            if f_value is None:
+                f_value = f_dim * [0]  # if no feature, replace with zeros
+            v[ix : ix + f_dim] = f_value
+            ix += f_dim
+        return v
+    def is_valid(self, get_code, content_type):
+        valid_get_code = get_code == 200
+        valid_content_type = content_type.startswith("text/html")
+        return valid_get_code and valid_content_type
class SimpleClassifier(nn.Module):
    """
    Model architecture of Homepage2vec

    Three linear layers (input_dim -> 1000 -> 100 -> output_dim) with dropout
    followed by ReLU after the first two.
    """

    def __init__(self, input_dim, output_dim, dropout=0.5):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 1000)
        self.layer2 = nn.Linear(1000, 100)
        self.fc = nn.Linear(100, output_dim)
        # one dropout module, applied to the output of layer1 and of layer2
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        """Return `(scores, emb)` where `emb` is the raw output of the second layer."""
        hidden = F.relu(self.drop(self.layer1(x)))
        emb = self.layer2(hidden)
        scores = self.fc(F.relu(self.drop(emb)))
        return scores, emb
class Webpage:
    """
    Shell for a webpage query

    Holds the url, a random hex id and placeholders that are filled in as the
    page is fetched and scored.
    """

    def __init__(self, url):
        # Attribute order matters: __repr__ serializes __dict__ in this order.
        self.url = url
        self.uid = uuid.uuid4().hex
        self.is_valid = self.http_code = False
        self.html = self.screenshot_path = None
        self.features = self.embedding = self.scores = None

    def __repr__(self):
        """JSON dump of all instance attributes."""
        return json.dumps(vars(self))

homepage2vec/textual_extractor.py ADDED Viewed

	@@ -0,0 +1,341 @@

+"""
+Module to extract textual features from the html content of a webpage.
+Includes:
+    - TextualExtractor: Class to extract textual features from the html content of a webpage.
+    - embed_text: Function to embed the text of a webpage.
+    - embed_description: Function to embed the description of a webpage.
+    - embed_keywords: Function to embed the keywords of a webpage.
+    - embed_title: Function to embed the title of a webpage.
+    - embed_links: Function to embed the links of a webpage.
+    - embed_url: Function to embed the url of a webpage.
+    - embed_tld: Function to embed the top-level domain of a webpage.
+    - embed_metatags: Function to embed the metatags of a webpage.
+    - split_in_sentences: Function to split the text of a webpage in sentences.
+    - clean_url: Function to clean the url of a webpage.
+    - clean_field: Function to clean a field of a webpage.
+    - clean_link: Function to clean a link of a webpage.
+    - trunc: Function to truncate the output of a tokenizer to a given length.
+"""
+import logging
+import re
+from collections import Counter
+from bs4 import BeautifulSoup
+from sentence_transformers import SentenceTransformer
+logging.getLogger("sentence_transformers").setLevel(logging.WARNING)
class TextualExtractor:
    """
    Extract textual features from the html content of a webpage
    """

    # Sentence encoder shared by all instances; created by the first instance.
    xlmr = None

    def __init__(self, device="cpu"):
        if not TextualExtractor.xlmr:
            TextualExtractor.xlmr = SentenceTransformer(
                "paraphrase-xlm-r-multilingual-v1",
                device=device,
            )

        # TLD used for one-hot encoding
        self.rep_tld = [
            "com", "org", "net", "info", "xyz",
            "club", "biz", "top", "edu", "online",
            "pro", "site", "vip", "icu", "buzz",
            "app", "asia", "su", "gov", "space",
        ]

        # Metatags used for one-hot encoding
        self.rep_metatags = [
            "viewport", "description", "generator", "keywords", "robots",
            "twitter:card", "msapplication-tileimage", "google-site-verification",
            "author", "twitter:title", "twitter:description", "theme-color",
            "twitter:image", "twitter:site", "format-detection",
            "msapplication-tilecolor", "copyright", "twitter:data1",
            "twitter:label1", "revisit-after", "apple-mobile-web-app-capable",
            "handheldfriendly", "language", "msvalidate.01", "twitter:url",
            "title", "mobileoptimized", "twitter:creator", "skype_toolbar",
            "rating",
        ]

        # number of sentences and links over which we compute the features
        self.k_sentences = 100
        self.k_links = 50

    def get_features(self, url, html):
        """
        Compute all features of a page.

        Returns a dict with the keys f_url, f_tld, f_metatags, f_title,
        f_description, f_keywords, f_links_<k_links> and f_text_<k_sentences>.
        """
        encoder = TextualExtractor.xlmr

        # features that only need the url
        features = {
            "f_url": embed_url(url, encoder),
            "f_tld": embed_tld(url, self.rep_tld),
        }

        # features that need the parsed html
        soup = BeautifulSoup(html, "lxml")
        features["f_metatags"] = embed_metatags(soup, self.rep_metatags)
        features["f_title"] = embed_title(soup, encoder)
        features["f_description"] = embed_description(soup, encoder)
        features["f_keywords"] = embed_keywords(soup, encoder)
        features[f"f_links_{self.k_links}"] = embed_links(soup, encoder, self.k_links)
        features[f"f_text_{self.k_sentences}"] = embed_text(
            soup, encoder, self.k_sentences
        )
        return features
def embed_text(soup, transformer, k_sentences):
    """Embed the visible text of a webpage as the mean embedding of its first `k_sentences` sentences (None if there are none)."""
    sentences = split_in_sentences(soup)[:k_sentences]
    if not sentences:
        return None

    # truncate the sentences up front; this avoids some warnings from the encoder
    truncated = [
        trunc(sentence, transformer.tokenizer, transformer.max_seq_length)
        for sentence in sentences
    ]
    embeddings = transformer.encode(truncated)
    if embeddings.size == 0:
        return None

    # mean of the sentences
    return embeddings.mean(axis=0).tolist()
def embed_description(soup, transformer):
    """Embed the content of the description meta tag of a webpage (None if missing or blank)."""
    tag = soup.find("meta", attrs={"name": ["description", "Description"]})
    if not tag:
        return None

    raw = tag.get("content", "")
    if not raw.strip():
        return None

    cleaned = clean_field(raw)
    # truncating first avoids some warnings from the encoder
    shortened = trunc(cleaned, transformer.tokenizer, transformer.max_seq_length)
    embedding = transformer.encode(shortened)
    return None if embedding.size == 0 else embedding.tolist()
def embed_keywords(soup, transformer):
    """Embed the content of the keywords meta tag of a webpage (None if missing or blank)."""
    tag = soup.find("meta", attrs={"name": "keywords"})
    if not tag:
        return None

    raw = tag.get("content", "")
    if not raw.strip():
        return None

    # truncating first avoids some warnings from the encoder
    shortened = trunc(raw, transformer.tokenizer, transformer.max_seq_length)
    embedding = transformer.encode(shortened)
    return None if embedding.size == 0 else embedding.tolist()
def embed_title(soup, transformer):
    """Embed the <title> of a webpage (None if there is no title or it is empty after cleaning)."""
    tag = soup.find("title")
    if tag is None:
        return None

    # NOTE(review): when `tag.string` is None this yields the literal text
    # "None", which is then embedded like a real title - confirm whether that
    # is intended before changing it.
    text = clean_field(str(tag.string))
    if not text:
        return None

    # truncating first avoids some warnings from the encoder
    shortened = trunc(text, transformer.tokenizer, transformer.max_seq_length)
    embedding = transformer.encode(shortened)
    return None if embedding.size == 0 else embedding.tolist()
def embed_links(soup, transformer, k_links):
    """Embed the links of a webpage as the mean embedding of the `k_links` most frequent words in their paths."""
    cleaned_links = (
        clean_link(anchor.get("href", ""))
        for anchor in soup.find_all("a", href=True)
    )
    joined = " ".join(link for link in cleaned_links if link)

    words = [word.lower() for word in joined.split(" ") if word]
    if not words:
        return None

    frequent = [word for word, _count in Counter(words).most_common(k_links)]

    # truncating first avoids some warnings from the encoder
    truncated = [
        trunc(word, transformer.tokenizer, transformer.max_seq_length)
        for word in frequent
    ]
    embeddings = transformer.encode(truncated)
    if embeddings.size == 0:
        return None
    return embeddings.mean(axis=0).tolist()
def embed_url(url, transformer):
    """Embed the url of a webpage as the mean embedding of its cleaned parts (None if nothing is left to embed)."""
    url_parts = clean_url(url)
    embeddings = transformer.encode(url_parts)
    if embeddings.size == 0:
        return None
    return embeddings.mean(axis=0).tolist()
def embed_tld(url, rep_tld):
    """
    One-hot encode the top-level domain of a webpage.

    The part of `url` after its last dot is compared against every entry of
    `rep_tld` with a prefix match; seven zeros (continent placeholder, TODO)
    are appended.
    """
    suffix = url.rsplit(".", 1)[-1]
    tld_flags = [1 if suffix.startswith(candidate) else 0 for candidate in rep_tld]
    continent_flags = [0] * 7  # TODO
    return tld_flags + continent_flags
def embed_metatags(soup, rep_metatags):
    """
    One-hot encode which of `rep_metatags` occur on the page.

    Args:
        soup: Parsed page; queried with `find_all("meta")`.
        rep_metatags: Meta tag names (lower-case) to look for.

    Returns:
        A list of 0/1 ints aligned with `rep_metatags`; an entry is 1 when a
        <meta> tag whose lower-cased `name` attribute equals it is present.
    """
    # find_all is the current spelling of the findAll alias and matches what
    # embed_links uses; a set makes each membership test O(1).
    names = (tag.get("name", None) for tag in soup.find_all("meta"))
    present = {name.lower() for name in names if name is not None}
    return [int(wanted in present) for wanted in rep_metatags]
def split_in_sentences(soup):
    """Return the stripped, non-empty text fragments of a page, one per text element."""
    marker = "[SEP]"  # special separator placed between text elements
    fragments = soup.get_text(marker).split(marker)
    stripped = (fragment.strip() for fragment in fragments)
    return [fragment for fragment in stripped if fragment]
def clean_url(url):
    """
    Strip scheme, "www", dashes and underscores from a url and return its
    dot-separated parts without the last one.
    """
    # NOTE(review): the dot in "www." is unescaped, so it matches any character
    # after "www" - confirm whether that is intended before changing it.
    noise = re.compile(r"www.|http://|https://|-|_")
    parts = noise.sub("", url).split(".")
    return parts[:-1]
def clean_field(field):
    """Remove asterisks, line breaks, tabs, pipes, colons and dashes from a text field and strip it."""
    unwanted = re.compile(r"\*|\n|\r|\t|\||:|-|–")
    return unwanted.sub("", field).strip()
def clean_link(link):
    """
    Reduce a link to the words of its path.

    Scheme, "www" and digits are removed, separators become spaces, and
    everything before the first "/" is dropped.
    """
    without_noise = re.sub(r"www.|http://|https://|[0-9]+", "", link)
    spaced = re.sub(r"-|_|=|\?|:", " ", without_noise)
    _, *path_segments = spaced.split("/")
    return " ".join(path_segments).strip()
def trunc(seq, tok, max_length):
    """
    Round-trip `seq` through the tokenizer, keeping at most `max_length - 2`
    tokens between the first and last encoded ids, and return the decoded text.
    """
    token_ids = tok.encode(seq, truncation=True)
    inner_ids = token_ids[1:-1]  # drop the first and last id
    budget = max_length - 2
    return tok.decode(inner_ids[:budget])

models/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

models/gpt3.5/features.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+f_tld 27
+f_url 768
+f_metatags 30
+f_title 768
+f_description 768
+f_keywords 768
+f_links_50 768
+f_text_100 768

models/gpt3.5/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3d40bb85c577a8c0951b585714c35fa10509267f0d52ec1c6952f650e9622887
+size 19072308

pyproject.toml ADDED Viewed

	@@ -0,0 +1,20 @@

+[tool.poetry]
+name = "homepage2vec"
+version = "0.1.0"
+description = "Website Classifier"
+authors = ["Your Name <[email protected]>"]
+readme = "README.md"
+[tool.poetry.dependencies]
+python = "3.10.13"
+requests = "*"
+torch = "*"
+beautifulsoup4 = "*"
+lxml = "*"
+sentence-transformers = "*"
+numpy = "*"
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+requests
+torch
+beautifulsoup4
+lxml
+sentence-transformers
+numpy