Spaces:

PatoFlamejanteTV
/

Safe-o-Bot

Running

App Files Files Community

PatoFlamejanteTV commited on 11 days ago

Commit

ce99828

verified ·

1 Parent(s): 1173745

Create classifier.py

Browse files

Files changed (1) hide show

classifier.py +217 -0

classifier.py ADDED Viewed

	@@ -0,0 +1,217 @@

+"""
+classifier.py
+Core pipeline: normalization, heuristics, multi-model inference, aggregation & explanations.
+Designed to be defensive: flags suspicious content and explains why.
+"""
+from typing import List, Dict, Any, Optional, Tuple
+import re
+import math
+import logging
+# Model imports
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+logger = logging.getLogger(__name__)
+logging.basicConfig(level=logging.INFO)
+###########################
+# Configuration / models
+###########################
+# Candidate model names (change to the exact models you prefer)
+HARM_MODELS = [
+    "unitary/toxic-bert",                     # English toxic classifier
+    "unitary/multilingual-toxic-xlm-roberta"  # multilingual toxic classifier
+]
+URL_MODEL = "r3ddkahili/final-complete-malicious-url-model"  # malicious URL detector
+# thresholds (tunable)
+THRESHOLDS = {
+    "harm": 0.5,    # generic threshold for harmful label(s) (individual mapping below)
+    "url": 0.7,     # suspicious/malicious probability threshold
+    "ascii_entropy": 3.0  # lower entropy -> suspicious
+}
+# model handles (populated by load_models())
+MODEL_HANDLES = {
+    "harm": [],  # list of tuples (name, tokenizer, model, label_map)
+    "url": None  # tuple (name, tokenizer, model, label_map)
+}
+###########################
+# Utilities: normalization
+###########################
+# Minimal homoglyph map (extend this in production)
+HOMOGLYPH_MAP = {
+    '\u0430': 'a',  # cyrillic a -> a
+    '\u0435': 'e',  # cyrillic e -> e
+    '\u03BF': 'o',  # greek omicron -> o
+    '0': 'o',
+    '1': 'l',
+    '@': 'a',
+}
+ZERO_WIDTH_PATTERN = re.compile('[\u200B-\u200F\uFEFF]')
+def normalize_obfuscation(text: str) -> str:
+    """Normalize text: collapse whitespace, remove zero-width, apply basic homoglyph map."""
+    t = ZERO_WIDTH_PATTERN.sub('', text)
+    t = re.sub(r'\s+', ' ', t)
+    out_chars = []
+    for ch in t:
+        out_chars.append(HOMOGLYPH_MAP.get(ch, ch))
+    return ''.join(out_chars).strip()
+def shannon_entropy(s: str) -> float:
+    """Return Shannon character entropy of string s."""
+    if not s:
+        return 0.0
+    s = s.replace(" ", "")
+    freq = {}
+    for c in s:
+        freq[c] = freq.get(c, 0) + 1
+    ent = 0.0
+    L = len(s)
+    for v in freq.values():
+        p = v / L
+        ent -= p * math.log2(p)
+    return ent
+###########################
+# Heuristic detectors
+###########################
+# suspicious URL-like tokens (shortlist of TLDs frequently used for obfuscation)
+URL_OBFUSCATION_RE = re.compile(
+    r'([a-z0-9\-]{1,20}\s*[\.\[\(]? ?(?:link|site|xyz|to|ly|pw|click)\b)|'  # e.g. site.link or site . link
+    r'(https?://)?[^\s]{1,64}\.(?:link|site|xyz|to|ly|pw|click)\b',
+    re.I
+)
+JAILBREAK_PATTERNS = [
+    re.compile(r"ignore (?:previous|all) instructions", re.I),
+    re.compile(r"(?:bypass|disable) (?:filters|moderation|safety)", re.I),
+    re.compile(r"rewire the (?:system|assistant) prompt", re.I),
+    re.compile(r"output (?:the|the full) system prompt", re.I),
+]
+ASCII_ART_RE = re.compile(r'[\u2500-\u259F]|[_\-\|]{6,}|(?:\bASCII\b)', re.I)
+# catches long runs of punctuation / separators (often used to hide tokens)
+OBFUSCATION_SEP_RE = re.compile(r'([^\w\s]{2,}\s*){2,}')
+def heuristic_scan(raw: str, normalized: str) -> List[Dict[str, Any]]:
+    flags = []
+    # URL heuristics
+    if URL_OBFUSCATION_RE.search(raw) or URL_OBFUSCATION_RE.search(normalized):
+        flags.append({"type": "hidden_link_heuristic", "explain": "Suspicious or obfuscated URL-like token detected by regex."})
+    # ascii-art / block text / low entropy
+    ent = shannon_entropy(re.sub(r'\s+', '', normalized))
+    if ASCII_ART_RE.search(raw) or ent < THRESHOLDS["ascii_entropy"]:
+        flags.append({"type": "ascii_art_heuristic", "explain": f"ASCII-art-like characters or low entropy text (entropy={ent:.2f})."})
+    # jailbreak heuristics
+    jail_matches = [p.pattern for p in JAILBREAK_PATTERNS if p.search(normalized)]
+    if jail_matches:
+        flags.append({"type": "ai_jailbreak_heuristic", "explain": "Patterns commonly used to override model safety detected.", "matches": jail_matches})
+    # obfuscation separators
+    if OBFUSCATION_SEP_RE.search(normalized):
+        flags.append({"type": "filter_obfuscation_heuristic", "explain": "Many non-alphanumeric separators or repeated punctuation — possible obfuscation."})
+    return flags
+###########################
+# Model loading & helpers
+###########################
+def safe_load_tokenizer_and_model(name: str) -> Optional[Tuple]:
+    """Try to load tokenizer and model; return None on failure gracefully."""
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(name, use_fast=True)
+        model = AutoModelForSequenceClassification.from_pretrained(name)
+        model.eval()
+        if torch.cuda.is_available():
+            try:
+                model.to("cuda")
+            except Exception:
+                logger.warning("Could not move model to cuda.")
+        logger.info(f"Loaded model {name}")
+        return tokenizer, model
+    except Exception as e:
+        logger.warning(f"Failed to load {name}: {e}")
+        return None
+def load_models():
+    """Populate MODEL_HANDLES with tokenizer+model pairs. Called once at import or app init."""
+    # load harm models list
+    for mname in HARM_MODELS:
+        res = safe_load_tokenizer_and_model(mname)
+        if res:
+            tokenizer, model = res
+            # attempt to extract label mapping (if model has config.id2label)
+            label_map = getattr(model.config, "id2label", None) or {}
+            MODEL_HANDLES["harm"].append((mname, tokenizer, model, label_map))
+    # load URL model
+    res = safe_load_tokenizer_and_model(URL_MODEL)
+    if res:
+        tokenizer, model = res
+        label_map = getattr(model.config, "id2label", None) or {}
+        MODEL_HANDLES["url"] = (URL_MODEL, tokenizer, model, label_map)
+# Call at import
+try:
+    load_models()
+except Exception as e:
+    # keep running: models may be loaded later or in Space with more resources
+    logger.warning(f"Model loading raised: {e}")
+###########################
+# Model runners
+###########################
+def run_sequence_model(tokenizer, model, text: str, max_length=512) -> Dict[str, float]:
+    """Run a sequence classification model and return label->prob mapping (softmax)."""
+    inputs = tokenizer(text, truncation=True, max_length=max_length, return_tensors="pt")
+    if torch.cuda.is_available() and next(model.parameters(), None) is not None:
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+    with torch.no_grad():
+        outputs = model(**inputs)
+        logits = outputs.logits
+        probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
+    # build mapping
+    id2label = getattr(model.config, "id2label", {})
+    if id2label:
+        return {id2label.get(i, str(i)): float(probs[i]) for i in range(len(probs))}
+    else:
+        # fallback: numeric labels
+        return {str(i): float(probs[i]) for i in range(len(probs))}
+def aggregate_harm_predictions(preds: List[Dict[str, float]]) -> Dict[str, Any]:
+    """
+    Combine multiple harm model outputs.
+    We compute per-label averages and maxes, and decide whether to flag.
+    """
+    if not preds:
+        return {"combined": {}, "note": "no harm models loaded"}
+    label_set = set()
+    for p in preds:
+        label_set.update(p.keys())
+    combined = {}
+    for lbl in label_set:
+        vals = [p.get(lbl, 0.0)