"""
Module: models.ml_feedback
Description: ML Feedback Models - Learning from Investigation Results
Author: Anderson Henrique da Silva
Date: 2025-10-07 18:11:37
License: Proprietary - All rights reserved

These models store feedback data that can be used to train
and improve machine learning models for anomaly detection.
"""

import uuid
from datetime import datetime
from enum import Enum

from sqlalchemy import Boolean, Column, DateTime, Enum as SQLEnum, Float, ForeignKey, Integer, JSON, String
from sqlalchemy.dialects.postgresql import UUID

from src.db.base import Base


class FeedbackType(str, Enum):
    """Type of feedback."""
    USER_CONFIRMED = "user_confirmed"  # User confirmed the anomaly
    USER_REJECTED = "user_rejected"    # User rejected as false positive
    AUTO_VALIDATED = "auto_validated"  # System validated through external data
    EXPERT_REVIEW = "expert_review"    # Expert reviewed and confirmed


class AnomalyLabel(str, Enum):
    """Ground truth labels for ML training."""
    TRUE_POSITIVE = "true_positive"    # Correctly identified anomaly
    FALSE_POSITIVE = "false_positive"  # Incorrectly flagged as anomaly
    FALSE_NEGATIVE = "false_negative"  # Missed anomaly
    UNCERTAIN = "uncertain"            # Unclear/needs more review


class InvestigationFeedback(Base):
    """
    Feedback on investigation results for ML training.

    This table stores ground truth data that can be used to:
    - Train supervised ML models
    - Evaluate model performance
    - Identify model weaknesses
    - Improve anomaly detection thresholds
    """

    __tablename__ = "investigation_feedback"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    investigation_id = Column(UUID(as_uuid=True), nullable=False, index=True)
    anomaly_id = Column(String(255), nullable=True, index=True)

    # Feedback details
    feedback_type = Column(SQLEnum(FeedbackType), nullable=False)
    anomaly_label = Column(SQLEnum(AnomalyLabel), nullable=False)

    # Contract and detection details
    contract_id = Column(String(255), nullable=True, index=True)
    anomaly_type = Column(String(100), nullable=False, index=True)
    detected_severity = Column(Float, nullable=False)
    detected_confidence = Column(Float, nullable=False)

    # Ground truth
    actual_severity = Column(Float, nullable=True)  # Corrected severity
    corrected_type = Column(String(100), nullable=True)  # Corrected anomaly type

    # Features used for detection (for retraining)
    features = Column(JSON, nullable=False)  # Feature vector used

    # Additional context
    feedback_notes = Column(String(1000), nullable=True)
    evidence_urls = Column(JSON, nullable=True)  # Supporting evidence

    # Attribution
    feedback_by = Column(String(255), nullable=True)  # User ID or system
    reviewed_by = Column(String(255), nullable=True)  # Expert reviewer

    # Timestamps
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow, index=True)
    updated_at = Column(DateTime, nullable=True, onupdate=datetime.utcnow)

    # Model version that made the prediction
    model_version = Column(String(50), nullable=True)
    detection_threshold = Column(Float, nullable=True)

    def __repr__(self):
        return f"<InvestigationFeedback {self.id} - {self.anomaly_label}>"


class MLTrainingDataset(Base):
    """
    Curated datasets for ML model training.

    Aggregates feedback data into training-ready datasets with
    proper train/val/test splits and balanced classes.
    """

    __tablename__ = "ml_training_datasets"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    name = Column(String(255), nullable=False)
    description = Column(String(1000), nullable=True)

    # Dataset composition
    anomaly_types = Column(JSON, nullable=False)  # Types included
    total_samples = Column(Integer, nullable=False)
    positive_samples = Column(Integer, nullable=False)
    negative_samples = Column(Integer, nullable=False)

    # Data splits
    train_size = Column(Integer, nullable=False)
    val_size = Column(Integer, nullable=False)
    test_size = Column(Integer, nullable=False)

    # Quality metrics
    label_confidence_avg = Column(Float, nullable=True)
    data_quality_score = Column(Float, nullable=True)

    # Metadata
    created_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    created_by = Column(String(255), nullable=True)

    # Storage
    storage_path = Column(String(500), nullable=True)  # Path to serialized dataset
    format = Column(String(50), nullable=False, default="pytorch")

    def __repr__(self):
        return f"<MLTrainingDataset {self.name} - {self.total_samples} samples>"


class MLModelVersion(Base):
    """
    Trained ML model versions with performance tracking.

    Tracks different versions of trained models with their
    performance metrics and deployment status.
    """

    __tablename__ = "ml_model_versions"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    model_name = Column(String(255), nullable=False, index=True)
    version = Column(String(50), nullable=False, index=True)

    # Model details
    model_type = Column(String(100), nullable=False)
    architecture = Column(String(255), nullable=True)
    hyperparameters = Column(JSON, nullable=True)

    # Training info
    training_dataset_id = Column(UUID(as_uuid=True), ForeignKey("ml_training_datasets.id"))
    trained_at = Column(DateTime, nullable=False, default=datetime.utcnow)
    training_duration_seconds = Column(Float, nullable=True)

    # Performance metrics
    train_accuracy = Column(Float, nullable=True)
    val_accuracy = Column(Float, nullable=True)
    test_accuracy = Column(Float, nullable=True)
    precision = Column(Float, nullable=True)
    recall = Column(Float, nullable=True)
    f1_score = Column(Float, nullable=True)
    auc_roc = Column(Float, nullable=True)

    # Additional metrics
    false_positive_rate = Column(Float, nullable=True)
    false_negative_rate = Column(Float, nullable=True)
    inference_time_ms = Column(Float, nullable=True)

    # Deployment
    is_deployed = Column(Boolean, nullable=False, default=False)
    deployed_at = Column(DateTime, nullable=True)
    deployment_environment = Column(String(50), nullable=True)

    # Storage
    model_path = Column(String(500), nullable=True)
    model_size_mb = Column(Float, nullable=True)

    # Metadata
    created_by = Column(String(255), nullable=True)
    notes = Column(String(1000), nullable=True)

    def __repr__(self):
        return f"<MLModelVersion {self.model_name} v{self.version}>"