Spaces:

gkdud00
/

project-tdm

Running

project-tdm / aggro_model.py

round 4

474acae 13 days ago

7.48 kB

	import torch
	import torch.nn as nn
	from transformers import BertTokenizer, BertModel
	import pickle
	import re
	import os
	import sys
	import numpy as np
	from collections import defaultdict
	# =============================================================================
	# 1. 모델 클래스 정의
	# =============================================================================

	# (1) 규칙 기반 스코어러 클래스
	class RuleBasedScorer:
	    """Rule-based clickbait scorer for article titles.

	    Four patterns (ids 11-14) are scored from two signals: exaggerated
	    punctuation found in the title, and per-pattern word weights stored in
	    ``self.patterns``. ``get_score`` reports the highest-scoring pattern.
	    """

	    # Pattern ids in evaluation order; on a tie the earliest id wins.
	    PATTERN_IDS = (11, 12, 13, 14)

	    # Compiled once at class level. get_score() reads these rather than
	    # self.symbol_patterns because an instance restored by pickle does not
	    # re-run __init__ and would keep whatever regex objects were pickled.
	    _REPEATED_RE = re.compile(r'([!?…~])\1+')  # repeated marks such as ?? or !!
	    _ELLIPSIS_RE = re.compile(r'\.\.\.|…')     # three dots OR the single … character

	    def __init__(self):
	        # Per-pattern word -> weight dictionaries (empty until populated).
	        self.patterns = {p: defaultdict(float) for p in self.PATTERN_IDS}

	        self.pattern_names = {
	            11: '의문 유발형(부호)', 12: '의문 유발형(은닉)',
	            13: '선정표현 사용형', 14: '속어/줄임말 사용형'
	        }
	        # Punctuation patterns (a single plain question mark is not counted,
	        # only exaggerated punctuation). Kept as an attribute for callers
	        # that inspect it.
	        self.symbol_patterns = {
	            'repeated': self._REPEATED_RE,
	            'ellipsis': self._ELLIPSIS_RE,
	        }

	    def get_score(self, title):
	        """Score *title* and return the best-matching pattern.

	        Args:
	            title: The title to score. Non-string values are converted with
	                ``str()`` before any matching.

	        Returns:
	            A dict with keys ``score`` (0-100), ``pattern`` (pattern id, or 0
	            when nothing matched) and ``pattern_name``.
	        """
	        # Normalise once so tokenisation and punctuation matching both see
	        # the same text and a non-str title cannot raise TypeError.
	        text = str(title)

	        # 1. Tokenise: runs of Hangul syllables, ASCII letters and digits.
	        words = re.findall(r'[가-힣A-Za-z0-9]+', text)

	        # 2. Punctuation score: 30 per repeated-mark run, 10 per ellipsis.
	        repeated_hits = len(self._REPEATED_RE.findall(text))
	        ellipsis_hits = len(self._ELLIPSIS_RE.findall(text))
	        symbol_score = (repeated_hits * 30) + (ellipsis_hits * 10)

	        # 3. Per-pattern score.
	        scores = {}
	        for p in self.PATTERN_IDS:
	            vocab = self.patterns.get(p, {})  # tolerate a missing pattern id
	            # Log-scaled weight for every title word present in the dictionary.
	            word_score = sum(np.log1p(vocab[w]) * 2 for w in words if w in vocab)

	            if p == 11:
	                # Punctuation-question type: punctuation only.
	                total = symbol_score
	            elif p == 12:
	                # Hidden-question type: words plus half the punctuation score.
	                total = word_score + (symbol_score * 0.5)
	            else:
	                # 13 (sensational) and 14 (slang/abbreviation): words only.
	                total = word_score
	            scores[p] = total

	        # 4. Pick the highest-scoring pattern.
	        best = max(scores, key=scores.get)
	        if scores[best] <= 0:
	            # Nothing matched: report "normal" rather than the first pattern id.
	            return {'score': 0, 'pattern': 0, 'pattern_name': '정상'}

	        return {
	            'score': float(min(100, scores[best])),  # capped at 100
	            'pattern': best,
	            'pattern_name': self.pattern_names.get(best, '알 수 없음')
	        }

	# Guard against pickle-loading errors: publish the class on the __main__
	# module as well. Presumably the .pkl was dumped from a script in which
	# RuleBasedScorer lived in __main__ -- TODO confirm against the training code.
	import __main__
	__main__.RuleBasedScorer = RuleBasedScorer


	# (2) KoBERT 모델 클래스
	class FishingClassifier(nn.Module):
	    """Classification head on top of a BERT-style encoder.

	    The encoder's pooled output goes through dropout (p=0.3) and a single
	    linear layer that produces ``num_classes`` logits.

	    Args:
	        bert: Encoder module. Called with ``input_ids``, ``attention_mask``
	            and ``return_dict=False``; it must return a 2-tuple whose second
	            item is the pooled representation (assumed to match BertModel's
	            tuple output -- confirm if another encoder is used).
	        num_classes: Number of output logits.
	    """

	    def __init__(self, bert, num_classes=2):
	        super().__init__()
	        self.bert = bert
	        self.dropout = nn.Dropout(0.3)
	        # Take the input width from the encoder config when it exposes one;
	        # otherwise fall back to 768, the width this head used before.
	        hidden_size = getattr(getattr(bert, "config", None), "hidden_size", 768)
	        self.fc = nn.Linear(hidden_size, num_classes)

	    def forward(self, input_ids, mask):
	        """Return the logits for a batch of token ids and attention mask."""
	        # The first tuple item is not needed by this head.
	        _, pooled = self.bert(input_ids=input_ids, attention_mask=mask, return_dict=False)
	        return self.fc(self.dropout(pooled))

	# =============================================================================
	# 2. 모델 로드
	# =============================================================================
	# Announce that loading has started.
	print("[AggroModel] 시스템 로딩 시작...")

	# Module-level handles. They stay None until the loading code that follows
	# assigns a successfully loaded object to them.
	aggro_model = tokenizer = rule_scorer = None
	device = torch.device("cpu")

	# Absolute path of the directory that contains this file.
	BASE_DIR = os.path.split(os.path.abspath(__file__))[0]

	# --- A. Load the rule-based model (.pkl) ---
	# Best effort: a missing file or any load error is reported and leaves
	# rule_scorer unchanged.
	try:
	    pkl_path = os.path.join(BASE_DIR, "rule_based_scorer.pkl")
	    if not os.path.exists(pkl_path):
	        print(f"⚠️ [Aggro] 규칙 파일 없음: {pkl_path}")
	    else:
	        # NOTE: pickle.load can execute arbitrary code from the file, so
	        # this .pkl must come from a trusted source.
	        with open(pkl_path, "rb") as fh:
	            rule_scorer = pickle.load(fh)
	        print("✅ [Aggro] 규칙 기반 모델 로드 성공!")
	except Exception as err:
	    print(f"🚨 [Aggro] 규칙 모델 로드 실패: {err}")

	# --- B. Load the KoBERT model (.pth) ---
	# Best effort: any failure is reported and leaves aggro_model as None.
	try:
	    MODEL_NAME = 'skt/kobert-base-v1'
	    try:
	        tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
	        bert_base = BertModel.from_pretrained(MODEL_NAME)
	    except Exception:
	        # Catch Exception rather than everything, so that Ctrl-C or
	        # SystemExit during the first load stops the process instead of
	        # starting the fallback load.
	        print("⚠️ skt 모델 로드 실패, monologg로 재시도...")
	        tokenizer = BertTokenizer.from_pretrained('monologg/kobert')
	        bert_base = BertModel.from_pretrained('monologg/kobert')

	    aggro_model = FishingClassifier(bert_base).to(device)

	    pth_path = os.path.join(BASE_DIR, "bert_fishing_model_best.pth")

	    if os.path.exists(pth_path):
	        state_dict = torch.load(pth_path, map_location=device)
	        aggro_model.load_state_dict(state_dict)
	        aggro_model.eval()  # inference mode: disables dropout
	        print(f"✅ [Aggro] KoBERT 모델 로드 성공!")
	    else:
	        # Without the fine-tuned weights the classifier head is untrained,
	        # so the model is discarded rather than used.
	        print(f"⚠️ [Aggro] KoBERT 파일 없음: {pth_path}")
	        aggro_model = None

	except Exception as e:
	    print(f"🚨 [Aggro] KoBERT 로드 실패: {e}")
	    aggro_model = None


	# =============================================================================
	# 3. 메인 함수
	# =============================================================================
	def get_aggro_score(title: str) -> dict:
	    """Estimate how clickbait-like *title* is.

	    Combines the rule-based scorer and the KoBERT classifier, using whichever
	    of the two module-level models is available.

	    Args:
	        title: The article title to evaluate.

	    Returns:
	        A dict with ``score`` (float in [0, 1], rounded to 4 decimals),
	        ``reason`` and ``recommendation`` (user-facing strings).
	    """
	    has_rules = rule_scorer is not None
	    has_model = aggro_model is not None

	    # 1. Rule-based score (0-100).
	    rule_score = 0.0
	    rule_pattern = "분석 불가"

	    if has_rules:
	        try:
	            res = rule_scorer.get_score(title)
	            rule_score = res['score']
	            rule_pattern = res.get('pattern_name', '알 수 없음')
	        except Exception as e:
	            print(f"규칙 계산 에러: {e}")
	            rule_score = 50.0  # neutral fallback when scoring fails

	    # 2. KoBERT score (0-100): softmax probability of class index 1.
	    bert_score = 0.0
	    if has_model and tokenizer is not None:
	        try:
	            inputs = tokenizer(
	                title, return_tensors='pt', padding="max_length", truncation=True, max_length=64
	            )
	            input_ids = inputs['input_ids'].to(device)
	            mask = inputs['attention_mask'].to(device)

	            with torch.no_grad():
	                outputs = aggro_model(input_ids, mask)
	                probs = torch.softmax(outputs, dim=1)
	                bert_score = probs[0][1].item() * 100
	        except Exception as e:
	            # Report the failure like the rule branch does instead of
	            # swallowing it; Ctrl-C / SystemExit are no longer intercepted.
	            print(f"KoBERT 계산 에러: {e}")
	            bert_score = 50.0  # neutral fallback when inference fails

	    # 3. Blend (rules 40% + BERT 60%), or use whichever model exists.
	    if has_rules and has_model:
	        final_score = (rule_score * 0.4) + (bert_score * 0.6)
	    elif has_model:
	        final_score = bert_score
	    elif has_rules:
	        final_score = rule_score
	    else:
	        final_score = 0.0

	    # 4. Build the result. Clamp on both sides so the reported score always
	    # stays inside [0, 1].
	    normalized_score = max(0.0, min(final_score / 100.0, 1.0))

	    reason = "제목이 평이합니다."
	    recommendation = "양호합니다."

	    if final_score >= 70:
	        reason = f"AI 예측({bert_score:.0f}점) 및 규칙({rule_pattern})에 의해 낚시성으로 판단되었습니다."
	        recommendation = "객관적인 사실 위주의 제목으로 수정해주세요."
	    elif final_score >= 40:
	        reason = f"일부 낚시성 요소({rule_pattern})가 감지되었습니다."
	        recommendation = "표현을 조금 더 다듬는 것을 권장합니다."

	    return {
	        "score": round(normalized_score, 4),
	        "reason": reason,
	        "recommendation": recommendation
	    }