Spaces:

gauravbox
/

TalentLensAI

Running

File size: 13,320 Bytes

c2f9ec8
79b5c9c
 
c2f9ec8
 
 
 
 
 
79b5c9c
 
c2f9ec8
 
79b5c9c
c2f9ec8
 
79b5c9c
c2f9ec8
 
79b5c9c
c2f9ec8
 
 
79b5c9c
 
c2f9ec8
79b5c9c
c2f9ec8
 
 
79b5c9c
c2f9ec8
 
 
 
 
79b5c9c
c2f9ec8
79b5c9c
c2f9ec8
 
79b5c9c
c2f9ec8
 
79b5c9c
c2f9ec8
 
 
79b5c9c
 
c2f9ec8
79b5c9c
c2f9ec8
 
 
79b5c9c
 
 
c2f9ec8
79b5c9c
 
 
c2f9ec8
79b5c9c
c2f9ec8
 
79b5c9c
c2f9ec8
 
 
 
 
 
79b5c9c
 
 
 
 
c2f9ec8
 
79b5c9c
 
 
 
 
 
 
 
 
 
 
 
c2f9ec8
79b5c9c
 
 
 
c2f9ec8
79b5c9c
c2f9ec8
 
79b5c9c
c2f9ec8
79b5c9c
c2f9ec8
 
79b5c9c
 
 
 
 
 
 
 
 
 
 
 
 
 
c2f9ec8
 
79b5c9c
 
c2f9ec8
79b5c9c
 
 
 
c2f9ec8
 
79b5c9c
c2f9ec8
 
 
79b5c9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2f9ec8
79b5c9c
c2f9ec8
 
79b5c9c
c2f9ec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79b5c9c
c2f9ec8
 
79b5c9c
c2f9ec8
 
 
 
 
79b5c9c
c2f9ec8
79b5c9c
 
 
 
 
 
 
 
 
 
 
c2f9ec8
 
79b5c9c
c2f9ec8
 
 
 
 
79b5c9c
 
c2f9ec8
 
 
 
 
79b5c9c
c2f9ec8
 
 
 
 
 
 
 
 
 
79b5c9c
c2f9ec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79b5c9c
c2f9ec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79b5c9c
c2f9ec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79b5c9c
 
 
 
 
 
 
c2f9ec8
 
79b5c9c
 
 
 
c2f9ec8
79b5c9c
 
c2f9ec8
79b5c9c
c2f9ec8
79b5c9c
c2f9ec8
79b5c9c

"""
OpenAI-based resume data extraction.
Uses GPT models to extract structured information from resume text.
"""

import json
import re
import logging
from typing import Dict, Any, List, Optional

import openai
from openai import OpenAI

# Set up logging
logger = logging.getLogger(__name__)


class OpenAIResumeExtractor:
    """
    Resume data extractor using OpenAI's GPT models.
    """
    
    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o"):
        """
        Create the API client and remember which chat model to query.

        Args:
            api_key: Key passed to the client; when falsy the client is
                constructed with no arguments instead.
            model: Name of the model used for every extraction request.
        """
        if api_key:
            self.client = OpenAI(api_key=api_key)
        else:
            self.client = OpenAI()
        self.model = model
        logger.info(f"OpenAI extractor initialized with model: {model}")
    
    def extract_sections_openai(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using OpenAI API.
        
        Args:
            text: Raw resume text
            
        Returns:
            Dict containing extracted sections
        """
        logger.info("Starting OpenAI extraction...")
        
        try:
            # Create extraction prompt
            prompt = self._create_extraction_prompt(text)
            
            # Call OpenAI API
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an expert resume parser. Extract information and return ONLY valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=2000
            )
            
            # Parse response
            content = response.choices[0].message.content.strip()
            logger.debug(f"OpenAI response: {content[:200]}...")
            
            # Clean and parse JSON
            content = self._clean_json_response(content)
            result = json.loads(content)
            
            # Validate and enhance result
            result = self._validate_and_clean_result(result)
            
            # Add contact info extraction
            contact_info = self._extract_contact_info(text)
            result["ContactInfo"] = contact_info
            
            logger.info("✅ OpenAI extraction completed successfully")
            return result
            
        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing error: {e}")
            logger.debug(f"Response content: {content}")
            return self._fallback_extraction(text)
            
        except Exception as e:
            logger.error(f"OpenAI extraction failed: {e}")
            return self._fallback_extraction(text)
    
    def _clean_json_response(self, content: str) -> str:
        """Clean JSON response from OpenAI."""
        # Remove markdown code blocks
        content = re.sub(r'```json\s*', '', content)
        content = re.sub(r'```\s*$', '', content)
        
        # Remove any text before first {
        start = content.find('{')
        if start > 0:
            content = content[start:]
            
        # Remove any text after last }
        end = content.rfind('}')
        if end > 0 and end < len(content) - 1:
            content = content[:end + 1]
            
        return content.strip()
    
    def _create_extraction_prompt(self, text: str) -> str:
        """Create prompt for OpenAI extraction."""
        prompt = f"""
Extract information from this resume and return ONLY valid JSON in this exact format:

{{
  "Name": "Full Name with credentials (PhD, MBA, etc.)",
  "Summary": "Professional summary or objective",
  "Skills": ["skill1", "skill2", "skill3"],
  "StructuredExperiences": [
    {{
      "title": "Job Title",
      "company": "Company Name",
      "date_range": "Start Date - End Date",
      "responsibilities": ["responsibility1", "responsibility2"]
    }}
  ],
  "Education": ["degree info", "school info"],
  "Training": ["certification1", "training1"],
  "Address": "Full address if available"
}}

Resume text:
{text}

CRITICAL INSTRUCTIONS:
- For NAME: Include ALL credentials (PhD, MBA, M.S., B.S., etc.) - example: "John Doe, PhD, MBA"
- Read the ENTIRE resume text carefully, don't miss content
- Extract ALL work experiences with full details
- Return ONLY valid JSON, no explanations
- If a section is not found, use empty string or empty array
- Extract actual technical skills, not company names
"""
        return prompt
    
    def _extract_contact_info(self, text: str) -> Dict[str, str]:
        """Extract contact information from resume text."""
        contact_info = {}
        
        # Extract email
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b'
        email_match = re.search(email_pattern, text)
        if email_match:
            contact_info['email'] = email_match.group()
        
        # Extract phone number
        phone_patterns = [
            r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            r'\+1[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
        ]
        
        for pattern in phone_patterns:
            phone_match = re.search(pattern, text)
            if phone_match:
                contact_info['phone'] = phone_match.group().strip()
                break
        
        # Extract LinkedIn
        linkedin_pattern = r'linkedin\.com/in/[A-Za-z0-9-]+'
        linkedin_match = re.search(linkedin_pattern, text)
        if linkedin_match:
            contact_info['linkedin'] = linkedin_match.group()
            
        logger.info(f"OPENAI: Extracted ContactInfo as dict: {contact_info}")
        return contact_info
    
    def _validate_and_clean_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and clean the extraction result."""
        
        # Ensure all required keys exist
        required_keys = ["Name", "Summary", "Skills", "StructuredExperiences", "Education", "Training", "Address"]
        for key in required_keys:
            if key not in result:
                result[key] = [] if key in ["Skills", "StructuredExperiences", "Education", "Training"] else ""
        
        # Clean skills - remove company names and duplicates
        if result.get("Skills"):
            cleaned_skills = []
            for skill in result["Skills"]:
                skill = skill.strip()
                # Skip if it looks like a company name or is too short
                if len(skill) > 1 and not self._is_company_name(skill):
                    cleaned_skills.append(skill)
            result["Skills"] = list(set(cleaned_skills))  # Remove duplicates
        
        # Validate experience structure
        if result.get("StructuredExperiences"):
            cleaned_experiences = []
            for exp in result["StructuredExperiences"]:
                if isinstance(exp, dict) and exp.get("title") and exp.get("company"):
                    # Ensure responsibilities is a list
                    if not isinstance(exp.get("responsibilities"), list):
                        exp["responsibilities"] = []
                    cleaned_experiences.append(exp)
            result["StructuredExperiences"] = cleaned_experiences
        
        return result
    
    def _is_company_name(self, text: str) -> bool:
        """Check if text looks like a company name rather than a skill."""
        company_indicators = [
            "inc", "llc", "corp", "ltd", "company", "solutions", "services", 
            "systems", "technologies", "financial", "insurance"
        ]
        text_lower = text.lower()
        return any(indicator in text_lower for indicator in company_indicators)
    
    def _fallback_extraction(self, text: str) -> Dict[str, Any]:
        """Fallback to regex-based extraction if OpenAI fails."""
        logger.info("Using regex fallback extraction...")
        
        return {
            "Name": self._extract_name_regex(text),
            "Summary": self._extract_summary_regex(text),
            "Skills": self._extract_skills_regex(text),
            "StructuredExperiences": self._extract_experiences_regex(text),
            "Education": self._extract_education_regex(text),
            "Training": [],
            "Address": self._extract_address_regex(text),
            "ContactInfo": self._extract_contact_info(text)
        }
    
    def _extract_name_regex(self, text: str) -> str:
        """Regex fallback for name extraction."""
        lines = text.split('\n')[:5]
        for line in lines:
            line = line.strip()
            if re.search(r'@|phone|email|linkedin|github', line.lower()):
                continue
            # Match name with potential credentials (PhD, MBA, etc.)
            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?(?:,\s*[A-Z][a-z.]+(?:,\s*[A-Z][a-z.]+)?)?)', line)
            if name_match:
                return name_match.group(1)
        return ""
    
    def _extract_summary_regex(self, text: str) -> str:
        """Regex fallback for summary extraction."""
        summary_pattern = r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
        match = re.search(summary_pattern, text, re.DOTALL)
        if match:
            summary = match.group(1).strip()
            summary = re.sub(r'\n+', ' ', summary)
            summary = re.sub(r'\s+', ' ', summary)
            return summary
        return ""
    
    def _extract_skills_regex(self, text: str) -> List[str]:
        """Regex fallback for skills extraction."""
        skills = set()
        
        # Look for technical skills section
        skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:experience|education|projects?))'
        match = re.search(skills_pattern, text, re.DOTALL)
        
        if match:
            skills_text = match.group(1)
            # Split by common separators
            skill_items = re.split(r'[,;]\s*', skills_text.replace('\n', ' '))
            for item in skill_items:
                item = item.strip()
                if item and len(item) > 1 and len(item) < 30:
                    skills.add(item)
        
        return sorted(list(skills))
    
    def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
        """Regex fallback for experience extraction."""
        experiences = []
        
        # Look for work experience section
        exp_pattern = r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|$))'
        match = re.search(exp_pattern, text, re.DOTALL)
        
        if match:
            exp_text = match.group(1)
            
            # Look for job entries with | separators
            job_pattern = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
            matches = re.findall(job_pattern, exp_text)
            
            for match in matches:
                title, company, dates = match
                responsibilities = []
                
                # Look for bullet points after this job
                job_section = exp_text[exp_text.find(f"{title}|{company}|{dates}"):]
                bullets = re.findall(r'[-•]\s*([^-•\n]+)', job_section)
                responsibilities = [bullet.strip() for bullet in bullets if len(bullet.strip()) > 10]
                
                experience = {
                    "title": title.strip(),
                    "company": company.strip(),
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities
                }
                experiences.append(experience)
        
        return experiences
    
    def _extract_education_regex(self, text: str) -> List[str]:
        """Regex fallback for education extraction."""
        education = []
        
        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
        match = re.search(edu_pattern, text, re.DOTALL)
        
        if match:
            edu_text = match.group(1)
            edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]
            
            for line in edu_lines:
                if len(line) > 10:  # Filter out short lines
                    education.append(line)
        
        return education
    
    def _extract_address_regex(self, text: str) -> str:
        """Regex fallback for address extraction."""
        # Look for address patterns like "6001 Tain Dr. Suite 203, Dublin, OH, 43016"
        address_patterns = [
            r'(\d+\s+[A-Za-z\s\.]+(?:Suite|Apt|Unit)\s+\d+,?\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})',
            r'(\d+\s+[A-Za-z\s\.]+,?\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})',
            r'([A-Za-z\s\d\.]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})'
        ]
        
        for pattern in address_patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1).strip()
        
        return ""


# Main extraction function for compatibility
def extract_sections_openai(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """
    Module-level convenience wrapper around OpenAIResumeExtractor.

    Builds a new extractor (default model) with the given key and returns
    the sections it extracts from the resume text.
    """
    return OpenAIResumeExtractor(api_key=api_key).extract_sections_openai(text)