Initial model upload
Browse files- README.md +51 -0
- accuracey.png +0 -0
- clickbait_model.pkl +3 -0
- requirements.txt +3 -0
- train_clickbait.py +64 -0
README.md
CHANGED
|
@@ -1,3 +1,54 @@
|
|
| 1 |
---
|
| 2 |
license: mit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
+
language: en
|
| 4 |
+
datasets:
|
| 5 |
+
- amananandrai/clickbait-dataset
|
| 6 |
+
metrics:
|
| 7 |
+
- accuracy
|
| 8 |
+
tags:
|
| 9 |
+
- sklearn
|
| 10 |
+
- text-classification
|
| 11 |
+
- clickbait
|
| 12 |
+
widget:
|
| 13 |
+
- text: "You Won't Believe What Happens Next!"
|
| 14 |
+
example_title: "Clickbait Example"
|
| 15 |
+
- text: "Scientists Discover New Planet in Solar System"
|
| 16 |
+
example_title: "Non-Clickbait Example"
|
| 17 |
---
|
| 18 |
+
|
| 19 |
+
# Clickbait Detection Model (Logistic Regression)
|
| 20 |
+
|
| 21 |
+
هذا نموذج تعلم آلة (Scikit-learn Pipeline) تم تدريبه لتصنيف عناوين الأخبار (Headlines) إلى "Clickbait" (عنوان مثير) أو "Not Clickbait" (عنوان عادي).
|
| 22 |
+
|
| 23 |
+
## 🚀 كيف تستخدم النموذج
|
| 24 |
+
|
| 25 |
+
تم حفظ النموذج كـ `Pipeline` كامل من `sklearn`، وهو يتضمن `TfidfVectorizer` و `LogisticRegression`. هذا يعني أنه يتعامل مع النص مباشرة.
|
| 26 |
+
|
| 27 |
+
```python
|
| 28 |
+
import joblib
|
| 29 |
+
|
| 30 |
+
# قم بتحميل النموذج من Hugging Face Hub
|
| 31 |
+
# (تأكد من تثبيت huggingface_hub: pip install huggingface_hub)
|
| 32 |
+
from huggingface_hub import hf_hub_download
|
| 33 |
+
|
| 34 |
+
model_path = hf_hub_download(repo_id="Ma120/clickbait-detector", filename="clickbait_model.pkl")
|
| 35 |
+
model = joblib.load(model_path)
|
| 36 |
+
|
| 37 |
+
# اختبر النموذج
|
| 38 |
+
headlines = [
|
| 39 |
+
"You Won't Believe What Happens Next!",
|
| 40 |
+
"Local Library Announces Summer Reading Program",
|
| 41 |
+
"10 Signs You're a Genius (Number 7 Will Shock You)",
|
| 42 |
+
"Government Passes New Budget Bill"
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
predictions = model.predict(headlines)
|
| 46 |
+
|
| 47 |
+
# 1 = Clickbait, 0 = Not Clickbait
|
| 48 |
+
for headline, pred in zip(headlines, predictions):
|
| 49 |
+
label = "Clickbait" if pred == 1 else "Not Clickbait"
|
| 50 |
+
print(f"[{label}] {headline}")
|
| 51 |
+
|
| 52 |
+
# يمكنك أيضاً الحصول على الاحتمالات
|
| 53 |
+
# probabilities = model.predict_proba(headlines)
|
| 54 |
+
# print(probabilities)
|
accuracey.png
ADDED
|
clickbait_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5a2096bff7ac569925320f09b0808ffec384ea47cec1e36f46d1fc366f0183bd
|
| 3 |
+
size 222500
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
scikit-learn
|
| 2 |
+
pandas
|
| 3 |
+
joblib
|
train_clickbait.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.model_selection import train_test_split
|
| 3 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 4 |
+
from sklearn.linear_model import LogisticRegression
|
| 5 |
+
from sklearn.pipeline import Pipeline
|
| 6 |
+
import joblib
|
| 7 |
+
import os # تم إضافة هذه المكتبة لفحص الملفات
|
| 8 |
+
|
| 9 |
+
# Name of the dataset file the script expects to find in the current directory.
DATA_FILE = "clickbait_data.csv"
|
| 11 |
+
|
| 12 |
+
def train_model(data_file=DATA_FILE, model_file="clickbait_model.pkl"):
    """Train the clickbait classifier and save it to disk.

    Loads headlines from *data_file* (a CSV expected to have ``headline``
    and ``clickbait`` columns), fits a TF-IDF + logistic-regression
    pipeline, reports hold-out accuracy, and dumps the fitted pipeline
    to *model_file* with joblib.

    Args:
        data_file: Path to the training CSV (default: ``DATA_FILE``).
        model_file: Path where the fitted pipeline is written.

    Raises:
        SystemExit: with a non-zero status if the dataset is missing
            or cannot be read.
    """
    print("Starting model training...")

    # 1. Load the data — fail fast with a helpful message if the file
    #    is absent.  NOTE: exit status is 1 here; the original called
    #    bare exit(), which reported success (status 0) on failure.
    if not os.path.exists(data_file):
        print(f"Error: '{data_file}' not found in the current directory.")
        print("Please make sure the dataset is present before running the training.")
        print("You can download it from Kaggle: https://www.kaggle.com/datasets/amananandrai/clickbait-dataset")
        raise SystemExit(1)

    try:
        df = pd.read_csv(data_file)
    except Exception as e:
        print(f"Error reading {data_file}: {e}")
        raise SystemExit(1) from e

    print(f"Dataset loaded: {len(df)} headlines.")

    # 2. Features (raw headline text) and labels (0/1 clickbait flag).
    X = df['headline']
    y = df['clickbait']

    # 3. Hold out 20% for evaluation; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 4. Build the pipeline: TF-IDF vectorization + logistic regression.
    #    Bundling both steps means the saved model accepts raw text directly.
    model_pipeline = Pipeline([
        ('vectorizer', TfidfVectorizer(max_features=5000)),  # keep the top 5000 terms
        ('classifier', LogisticRegression(max_iter=1000))
    ])

    # 5. Fit on the training split.
    print("Training the model... (This may take a minute)")
    model_pipeline.fit(X_train, y_train)

    # 6. Evaluate on the hold-out split.
    accuracy = model_pipeline.score(X_test, y_test)
    print(f"Training complete. Model accuracy: {accuracy * 100:.2f}%")

    # 7. Persist the entire fitted pipeline.
    joblib.dump(model_pipeline, model_file)
    print(f"Model saved successfully as '{model_file}'")
|
| 60 |
+
|
| 61 |
+
# --- Entry point ---
# Run training only when this file is executed directly (not on import).
if __name__ == "__main__":
    train_model()
|