"""
OpenAI-based resume data extraction.
Uses GPT models to extract structured information from resume text.
"""
import json
import re
import logging
from typing import Dict, Any, List, Optional

from openai import OpenAI

# Set up logging
logger = logging.getLogger(__name__)


class OpenAIResumeExtractor:
    """
    Resume data extractor using OpenAI's GPT models.
    """

    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o"):
        """Initialize with an OpenAI API key and model name."""
        self.client = OpenAI(api_key=api_key) if api_key else OpenAI()
        self.model = model
        logger.info(f"OpenAI extractor initialized with model: {model}")

    def extract_sections_openai(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using the OpenAI API.

        Args:
            text: Raw resume text

        Returns:
            Dict containing the extracted sections
        """
        logger.info("Starting OpenAI extraction...")
        try:
            # Create extraction prompt
            prompt = self._create_extraction_prompt(text)

            # Call OpenAI API
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an expert resume parser. Extract information and return ONLY valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=2000
            )

            # Parse response
            content = response.choices[0].message.content.strip()
            logger.debug(f"OpenAI response: {content[:200]}...")

            # Clean and parse JSON
            content = self._clean_json_response(content)
            result = json.loads(content)

            # Validate and enhance result
            result = self._validate_and_clean_result(result)

            # Add contact info extraction
            contact_info = self._extract_contact_info(text)
            result["ContactInfo"] = contact_info

            logger.info("✅ OpenAI extraction completed successfully")
            return result

        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing error: {e}")
            logger.debug(f"Response content: {content}")
            return self._fallback_extraction(text)
        except Exception as e:
            logger.error(f"OpenAI extraction failed: {e}")
            return self._fallback_extraction(text)

    def _clean_json_response(self, content: str) -> str:
        """Clean the JSON response returned by OpenAI."""
        # Remove markdown code fences
        content = re.sub(r'```json\s*', '', content)
        content = re.sub(r'```\s*$', '', content)

        # Remove any text before the first {
        start = content.find('{')
        if start > 0:
            content = content[start:]

        # Remove any text after the last }
        end = content.rfind('}')
        if end > 0 and end < len(content) - 1:
            content = content[:end + 1]

        return content.strip()

    def _create_extraction_prompt(self, text: str) -> str:
        """Create the prompt for OpenAI extraction."""
        prompt = f"""
Extract information from this resume and return ONLY valid JSON in this exact format:

{{
  "Name": "Full Name with credentials (PhD, MBA, etc.)",
  "Summary": "Professional summary or objective",
  "Skills": ["skill1", "skill2", "skill3"],
  "StructuredExperiences": [
    {{
      "title": "Job Title",
      "company": "Company Name",
      "date_range": "Start Date - End Date",
      "responsibilities": ["responsibility1", "responsibility2"]
    }}
  ],
  "Education": ["degree info", "school info"],
  "Training": ["certification1", "training1"],
  "Address": "Full address if available"
}}

Resume text:
{text}

CRITICAL INSTRUCTIONS:
- For NAME: Include ALL credentials (PhD, MBA, M.S., B.S., etc.) - example: "John Doe, PhD, MBA"
- Read the ENTIRE resume text carefully, don't miss content
- Extract ALL work experiences with full details
- Return ONLY valid JSON, no explanations
- If a section is not found, use empty string or empty array
- Extract actual technical skills, not company names
"""
        return prompt

    def _extract_contact_info(self, text: str) -> Dict[str, str]:
        """Extract contact information from resume text."""
        contact_info = {}

        # Extract email
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        email_match = re.search(email_pattern, text)
        if email_match:
            contact_info['email'] = email_match.group()

        # Extract phone number
        phone_patterns = [
            r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            r'\+1[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
        ]
        for pattern in phone_patterns:
            phone_match = re.search(pattern, text)
            if phone_match:
                contact_info['phone'] = phone_match.group().strip()
                break

        # Extract LinkedIn profile
        linkedin_pattern = r'linkedin\.com/in/[A-Za-z0-9-]+'
        linkedin_match = re.search(linkedin_pattern, text)
        if linkedin_match:
            contact_info['linkedin'] = linkedin_match.group()

        logger.info(f"OPENAI: Extracted ContactInfo as dict: {contact_info}")
        return contact_info

    def _validate_and_clean_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and clean the extraction result."""
        # Ensure all required keys exist
        required_keys = ["Name", "Summary", "Skills", "StructuredExperiences", "Education", "Training", "Address"]
        for key in required_keys:
            if key not in result:
                result[key] = [] if key in ["Skills", "StructuredExperiences", "Education", "Training"] else ""

        # Clean skills - remove company names and duplicates
        if result.get("Skills"):
            cleaned_skills = []
            for skill in result["Skills"]:
                skill = skill.strip()
                # Skip entries that look like company names or are too short
                if len(skill) > 1 and not self._is_company_name(skill):
                    cleaned_skills.append(skill)
            result["Skills"] = list(set(cleaned_skills))  # Remove duplicates

        # Validate experience structure
        if result.get("StructuredExperiences"):
            cleaned_experiences = []
            for exp in result["StructuredExperiences"]:
                if isinstance(exp, dict) and exp.get("title") and exp.get("company"):
                    # Ensure responsibilities is a list
                    if not isinstance(exp.get("responsibilities"), list):
                        exp["responsibilities"] = []
                    cleaned_experiences.append(exp)
            result["StructuredExperiences"] = cleaned_experiences

        return result

    def _is_company_name(self, text: str) -> bool:
        """Check whether text looks like a company name rather than a skill."""
        company_indicators = [
            "inc", "llc", "corp", "ltd", "company", "solutions", "services",
            "systems", "technologies", "financial", "insurance"
        ]
        text_lower = text.lower()
        return any(indicator in text_lower for indicator in company_indicators)

    def _fallback_extraction(self, text: str) -> Dict[str, Any]:
        """Fall back to regex-based extraction if OpenAI fails."""
        logger.info("Using regex fallback extraction...")
        return {
            "Name": self._extract_name_regex(text),
            "Summary": self._extract_summary_regex(text),
            "Skills": self._extract_skills_regex(text),
            "StructuredExperiences": self._extract_experiences_regex(text),
            "Education": self._extract_education_regex(text),
            "Training": [],
            "Address": self._extract_address_regex(text),
            "ContactInfo": self._extract_contact_info(text)
        }

    def _extract_name_regex(self, text: str) -> str:
        """Regex fallback for name extraction."""
        lines = text.split('\n')[:5]
        for line in lines:
            line = line.strip()
            # Skip lines that look like contact details rather than a name
            if re.search(r'@|phone|email|linkedin|github', line.lower()):
                continue
            # Match a name with optional credentials (PhD, MBA, etc.)
            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?(?:,\s*[A-Z][a-z.]+(?:,\s*[A-Z][a-z.]+)?)?)', line)
            if name_match:
                return name_match.group(1)
        return ""

    def _extract_summary_regex(self, text: str) -> str:
        """Regex fallback for summary extraction."""
        summary_pattern = r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
        match = re.search(summary_pattern, text, re.DOTALL)
        if match:
            summary = match.group(1).strip()
            # Collapse line breaks and repeated whitespace into single spaces
            summary = re.sub(r'\n+', ' ', summary)
            summary = re.sub(r'\s+', ' ', summary)
            return summary
        return ""

    def _extract_skills_regex(self, text: str) -> List[str]:
        """Regex fallback for skills extraction."""
        skills = set()

        # Look for a technical skills section
        skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:experience|education|projects?))'
        match = re.search(skills_pattern, text, re.DOTALL)
        if match:
            skills_text = match.group(1)
            # Split on common separators
            skill_items = re.split(r'[,;]\s*', skills_text.replace('\n', ' '))
            for item in skill_items:
                item = item.strip()
                if item and len(item) > 1 and len(item) < 30:
                    skills.add(item)

        return sorted(skills)

    def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
        """Regex fallback for experience extraction."""
        experiences = []

        # Look for the work experience section
        exp_pattern = r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|$))'
        section_match = re.search(exp_pattern, text, re.DOTALL)
        if section_match:
            exp_text = section_match.group(1)

            # Look for job entries formatted as "Title | Company | Dates"
            job_pattern = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
            job_matches = list(re.finditer(job_pattern, exp_text))
            for i, job_match in enumerate(job_matches):
                title, company, dates = job_match.groups()

                # Collect bullet points between this job header and the next one
                section_end = job_matches[i + 1].start() if i + 1 < len(job_matches) else len(exp_text)
                job_section = exp_text[job_match.end():section_end]
                bullets = re.findall(r'[-•]\s*([^-•\n]+)', job_section)
                responsibilities = [bullet.strip() for bullet in bullets if len(bullet.strip()) > 10]

                experiences.append({
                    "title": title.strip(),
                    "company": company.strip(),
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities
                })

        return experiences

    def _extract_education_regex(self, text: str) -> List[str]:
        """Regex fallback for education extraction."""
        education = []
        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
        match = re.search(edu_pattern, text, re.DOTALL)
        if match:
            edu_text = match.group(1)
            edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]
            for line in edu_lines:
                if len(line) > 10:  # Filter out short lines
                    education.append(line)
        return education

    def _extract_address_regex(self, text: str) -> str:
        """Regex fallback for address extraction."""
        # Look for address patterns like "6001 Tain Dr. Suite 203, Dublin, OH, 43016"
        address_patterns = [
            r'(\d+\s+[A-Za-z\s\.]+(?:Suite|Apt|Unit)\s+\d+,?\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})',
            r'(\d+\s+[A-Za-z\s\.]+,?\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})',
            r'([A-Za-z\s\d\.]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})'
        ]
        for pattern in address_patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1).strip()
        return ""


# Main extraction function for compatibility
def extract_sections_openai(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """Extract resume sections using the OpenAI API."""
    extractor = OpenAIResumeExtractor(api_key=api_key)
    return extractor.extract_sections_openai(text)
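

# A minimal usage sketch. It assumes OPENAI_API_KEY is set in the environment
# (so the OpenAI client can authenticate without an explicit key) and that a
# plain-text resume is available on disk; "sample_resume.txt" below is a
# hypothetical path, not a file shipped with this module.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    with open("sample_resume.txt", "r", encoding="utf-8") as f:
        resume_text = f.read()

    # Falls back to regex-based extraction automatically if the API call fails.
    sections = extract_sections_openai(resume_text)
    print(json.dumps(sections, indent=2))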