""" OpenAI-based resume data extraction. Uses GPT models to extract structured information from resume text. """ import json import re import logging from typing import Dict, Any, List, Optional import openai from openai import OpenAI # Set up logging logger = logging.getLogger(__name__) class OpenAIResumeExtractor: """ Resume data extractor using OpenAI's GPT models. """ def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o"): """Initialize with OpenAI API key and model.""" self.client = OpenAI(api_key=api_key) if api_key else OpenAI() self.model = model logger.info(f"OpenAI extractor initialized with model: {model}") def extract_sections_openai(self, text: str) -> Dict[str, Any]: """ Extract resume sections using OpenAI API. Args: text: Raw resume text Returns: Dict containing extracted sections """ logger.info("Starting OpenAI extraction...") try: # Create extraction prompt prompt = self._create_extraction_prompt(text) # Call OpenAI API response = self.client.chat.completions.create( model=self.model, messages=[ {"role": "system", "content": "You are an expert resume parser. Extract information and return ONLY valid JSON."}, {"role": "user", "content": prompt} ], temperature=0.1, max_tokens=2000 ) # Parse response content = response.choices[0].message.content.strip() logger.debug(f"OpenAI response: {content[:200]}...") # Clean and parse JSON content = self._clean_json_response(content) result = json.loads(content) # Validate and enhance result result = self._validate_and_clean_result(result) # Add contact info extraction contact_info = self._extract_contact_info(text) result["ContactInfo"] = contact_info logger.info("✅ OpenAI extraction completed successfully") return result except json.JSONDecodeError as e: logger.error(f"JSON parsing error: {e}") logger.debug(f"Response content: {content}") return self._fallback_extraction(text) except Exception as e: logger.error(f"OpenAI extraction failed: {e}") return self._fallback_extraction(text) def _clean_json_response(self, content: str) -> str: """Clean JSON response from OpenAI.""" # Remove markdown code blocks content = re.sub(r'```json\s*', '', content) content = re.sub(r'```\s*$', '', content) # Remove any text before first { start = content.find('{') if start > 0: content = content[start:] # Remove any text after last } end = content.rfind('}') if end > 0 and end < len(content) - 1: content = content[:end + 1] return content.strip() def _create_extraction_prompt(self, text: str) -> str: """Create prompt for OpenAI extraction.""" prompt = f""" Extract information from this resume and return ONLY valid JSON in this exact format: {{ "Name": "Full Name with credentials (PhD, MBA, etc.)", "Summary": "Professional summary or objective", "Skills": ["skill1", "skill2", "skill3"], "StructuredExperiences": [ {{ "title": "Job Title", "company": "Company Name", "date_range": "Start Date - End Date", "responsibilities": ["responsibility1", "responsibility2"] }} ], "Education": ["degree info", "school info"], "Training": ["certification1", "training1"], "Address": "Full address if available" }} Resume text: {text} CRITICAL INSTRUCTIONS: - For NAME: Include ALL credentials (PhD, MBA, M.S., B.S., etc.) 
- example: "John Doe, PhD, MBA" - Read the ENTIRE resume text carefully, don't miss content - Extract ALL work experiences with full details - Return ONLY valid JSON, no explanations - If a section is not found, use empty string or empty array - Extract actual technical skills, not company names """ return prompt def _extract_contact_info(self, text: str) -> Dict[str, str]: """Extract contact information from resume text.""" contact_info = {} # Extract email email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b' email_match = re.search(email_pattern, text) if email_match: contact_info['email'] = email_match.group() # Extract phone number phone_patterns = [ r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', r'\+1[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}', r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}' ] for pattern in phone_patterns: phone_match = re.search(pattern, text) if phone_match: contact_info['phone'] = phone_match.group().strip() break # Extract LinkedIn linkedin_pattern = r'linkedin\.com/in/[A-Za-z0-9-]+' linkedin_match = re.search(linkedin_pattern, text) if linkedin_match: contact_info['linkedin'] = linkedin_match.group() logger.info(f"OPENAI: Extracted ContactInfo as dict: {contact_info}") return contact_info def _validate_and_clean_result(self, result: Dict[str, Any]) -> Dict[str, Any]: """Validate and clean the extraction result.""" # Ensure all required keys exist required_keys = ["Name", "Summary", "Skills", "StructuredExperiences", "Education", "Training", "Address"] for key in required_keys: if key not in result: result[key] = [] if key in ["Skills", "StructuredExperiences", "Education", "Training"] else "" # Clean skills - remove company names and duplicates if result.get("Skills"): cleaned_skills = [] for skill in result["Skills"]: skill = skill.strip() # Skip if it looks like a company name or is too short if len(skill) > 1 and not self._is_company_name(skill): cleaned_skills.append(skill) result["Skills"] = list(set(cleaned_skills)) # Remove duplicates # Validate experience structure if result.get("StructuredExperiences"): cleaned_experiences = [] for exp in result["StructuredExperiences"]: if isinstance(exp, dict) and exp.get("title") and exp.get("company"): # Ensure responsibilities is a list if not isinstance(exp.get("responsibilities"), list): exp["responsibilities"] = [] cleaned_experiences.append(exp) result["StructuredExperiences"] = cleaned_experiences return result def _is_company_name(self, text: str) -> bool: """Check if text looks like a company name rather than a skill.""" company_indicators = [ "inc", "llc", "corp", "ltd", "company", "solutions", "services", "systems", "technologies", "financial", "insurance" ] text_lower = text.lower() return any(indicator in text_lower for indicator in company_indicators) def _fallback_extraction(self, text: str) -> Dict[str, Any]: """Fallback to regex-based extraction if OpenAI fails.""" logger.info("Using regex fallback extraction...") return { "Name": self._extract_name_regex(text), "Summary": self._extract_summary_regex(text), "Skills": self._extract_skills_regex(text), "StructuredExperiences": self._extract_experiences_regex(text), "Education": self._extract_education_regex(text), "Training": [], "Address": self._extract_address_regex(text), "ContactInfo": self._extract_contact_info(text) } def _extract_name_regex(self, text: str) -> str: """Regex fallback for name extraction.""" lines = text.split('\n')[:5] for line in lines: line = line.strip() if re.search(r'@|phone|email|linkedin|github', line.lower()): continue # 
            # Match name with potential credentials (PhD, MBA, etc.)
            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?(?:,\s*[A-Z][A-Za-z.]*(?:,\s*[A-Z][A-Za-z.]*)?)?)', line)
            if name_match:
                return name_match.group(1)
        return ""

    def _extract_summary_regex(self, text: str) -> str:
        """Regex fallback for summary extraction."""
        summary_pattern = r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
        match = re.search(summary_pattern, text, re.DOTALL)
        if match:
            summary = match.group(1).strip()
            summary = re.sub(r'\n+', ' ', summary)
            summary = re.sub(r'\s+', ' ', summary)
            return summary
        return ""

    def _extract_skills_regex(self, text: str) -> List[str]:
        """Regex fallback for skills extraction."""
        skills = set()

        # Look for a technical skills section
        skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:experience|education|projects?))'
        match = re.search(skills_pattern, text, re.DOTALL)
        if match:
            skills_text = match.group(1)
            # Split by common separators
            skill_items = re.split(r'[,;]\s*', skills_text.replace('\n', ' '))
            for item in skill_items:
                item = item.strip()
                if item and 1 < len(item) < 30:
                    skills.add(item)

        return sorted(skills)

    def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
        """Regex fallback for experience extraction."""
        experiences = []

        # Look for a work experience section
        exp_pattern = r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|$))'
        match = re.search(exp_pattern, text, re.DOTALL)
        if match:
            exp_text = match.group(1)

            # Look for job entries with | separators, e.g. "Title | Company | Dates"
            job_pattern = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
            for job_match in re.finditer(job_pattern, exp_text):
                title, company, dates = job_match.groups()

                # Look for bullet points after this job entry
                job_section = exp_text[job_match.end():]
                bullets = re.findall(r'[-•]\s*([^-•\n]+)', job_section)
                responsibilities = [bullet.strip() for bullet in bullets if len(bullet.strip()) > 10]

                experiences.append({
                    "title": title.strip(),
                    "company": company.strip(),
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities,
                })

        return experiences

    def _extract_education_regex(self, text: str) -> List[str]:
        """Regex fallback for education extraction."""
        education = []

        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
        match = re.search(edu_pattern, text, re.DOTALL)
        if match:
            edu_text = match.group(1)
            edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]
            for line in edu_lines:
                if len(line) > 10:  # Filter out short lines
                    education.append(line)

        return education

    def _extract_address_regex(self, text: str) -> str:
        """Regex fallback for address extraction."""
        # Look for address patterns like "6001 Tain Dr. Suite 203, Dublin, OH, 43016"
        address_patterns = [
            r'(\d+\s+[A-Za-z\s\.]+(?:Suite|Apt|Unit)\s+\d+,?\s*[A-Za-z\s]+,\s*[A-Z]{2},?\s*\d{5})',
            r'(\d+\s+[A-Za-z\s\.]+,?\s*[A-Za-z\s]+,\s*[A-Z]{2},?\s*\d{5})',
            r'([A-Za-z\s\d\.]+,\s*[A-Za-z\s]+,\s*[A-Z]{2},?\s*\d{5})',
        ]
        for pattern in address_patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1).strip()
        return ""


# Main extraction function for compatibility
def extract_sections_openai(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """Extract resume sections using the OpenAI API."""
    extractor = OpenAIResumeExtractor(api_key=api_key)
    return extractor.extract_sections_openai(text)
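

# Example usage: a minimal sketch for manual testing, not part of the module's
# public surface. It assumes the OPENAI_API_KEY environment variable is set
# (the OpenAI() client reads it when no api_key is passed) and uses a made-up
# resume snippet. If the API call fails (e.g., no network access), the
# extractor falls back to the regex-based extraction defined above.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    sample_resume = """Jane Doe, PhD
jane.doe@example.com | (555) 123-4567 | linkedin.com/in/janedoe

Professional Summary:
Data scientist with 8 years of experience building ML pipelines.

Technical Skills:
Python, SQL, TensorFlow, Docker

Experience:
Senior Data Scientist | Acme Analytics | 2019 - Present
- Built and deployed fraud detection models serving millions of daily requests

Education:
PhD in Computer Science, Example University
"""

    sections = extract_sections_openai(sample_resume)
    print(json.dumps(sections, indent=2))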