"""
OpenAI-based resume data extraction.
Uses GPT models to extract structured information from resume text.
"""

import json
import re
import logging
from typing import Dict, Any, List, Optional

from openai import OpenAI

# Set up logging
logger = logging.getLogger(__name__)


class OpenAIResumeExtractor:
    """
    Resume data extractor using OpenAI's GPT models.
    """
    
    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-4o"):
        """Initialize with OpenAI API key and model."""
        self.client = OpenAI(api_key=api_key) if api_key else OpenAI()
        self.model = model
        logger.info(f"OpenAI extractor initialized with model: {model}")
    
    def extract_sections_openai(self, text: str) -> Dict[str, Any]:
        """
        Extract resume sections using OpenAI API.
        
        Args:
            text: Raw resume text
            
        Returns:
            Dict containing extracted sections
        """
        logger.info("Starting OpenAI extraction...")
        
        try:
            # Create extraction prompt
            prompt = self._create_extraction_prompt(text)
            
            # Call OpenAI API
            response = self.client.chat.completions.create(
                model=self.model,
                messages=[
                    {"role": "system", "content": "You are an expert resume parser. Extract information and return ONLY valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.1,
                max_tokens=2000
            )
            
            # Parse response
            content = response.choices[0].message.content.strip()
            logger.debug(f"OpenAI response: {content[:200]}...")
            
            # Clean and parse JSON
            content = self._clean_json_response(content)
            result = json.loads(content)
            
            # Validate and enhance result
            result = self._validate_and_clean_result(result)
            
            # Add contact info extraction
            contact_info = self._extract_contact_info(text)
            result["ContactInfo"] = contact_info
            
            logger.info("✅ OpenAI extraction completed successfully")
            return result
            
        except json.JSONDecodeError as e:
            logger.error(f"JSON parsing error: {e}")
            logger.debug(f"Response content: {content}")
            return self._fallback_extraction(text)
            
        except Exception as e:
            logger.error(f"OpenAI extraction failed: {e}")
            return self._fallback_extraction(text)
    
    def _clean_json_response(self, content: str) -> str:
        """Clean JSON response from OpenAI."""
        # Remove markdown code blocks
        content = re.sub(r'```json\s*', '', content)
        content = re.sub(r'```\s*$', '', content)
        
        # Remove any text before first {
        start = content.find('{')
        if start > 0:
            content = content[start:]
            
        # Remove any text after last }
        end = content.rfind('}')
        if end != -1:
            content = content[:end + 1]
            
        return content.strip()
    
    def _create_extraction_prompt(self, text: str) -> str:
        """Create prompt for OpenAI extraction."""
        prompt = f"""
Extract information from this resume and return ONLY valid JSON in this exact format:

{{
  "Name": "Full Name with credentials (PhD, MBA, etc.)",
  "Summary": "Professional summary or objective",
  "Skills": ["skill1", "skill2", "skill3"],
  "StructuredExperiences": [
    {{
      "title": "Job Title",
      "company": "Company Name",
      "date_range": "Start Date - End Date",
      "responsibilities": ["responsibility1", "responsibility2"]
    }}
  ],
  "Education": ["degree info", "school info"],
  "Training": ["certification1", "training1"],
  "Address": "Full address if available"
}}

Resume text:
{text}

CRITICAL INSTRUCTIONS:
- For NAME: Include ALL credentials (PhD, MBA, M.S., B.S., etc.) - example: "John Doe, PhD, MBA"
- Read the ENTIRE resume text carefully, don't miss content
- Extract ALL work experiences with full details
- Return ONLY valid JSON, no explanations
- If a section is not found, use empty string or empty array
- Extract actual technical skills, not company names
"""
        return prompt
    
    def _extract_contact_info(self, text: str) -> Dict[str, str]:
        """Extract contact information from resume text."""
        contact_info = {}
        
        # Extract email
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        email_match = re.search(email_pattern, text)
        if email_match:
            contact_info['email'] = email_match.group()
        
        # Extract phone number
        phone_patterns = [
            r'\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            r'\+1[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}',
            r'\d{3}[-.\s]?\d{3}[-.\s]?\d{4}'
        ]
        
        for pattern in phone_patterns:
            phone_match = re.search(pattern, text)
            if phone_match:
                contact_info['phone'] = phone_match.group().strip()
                break
        
        # Extract LinkedIn
        linkedin_pattern = r'linkedin\.com/in/[A-Za-z0-9-]+'
        linkedin_match = re.search(linkedin_pattern, text)
        if linkedin_match:
            contact_info['linkedin'] = linkedin_match.group()
            
        logger.info(f"OPENAI: Extracted ContactInfo as dict: {contact_info}")
        return contact_info
    
    def _validate_and_clean_result(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """Validate and clean the extraction result."""
        
        # Ensure all required keys exist
        required_keys = ["Name", "Summary", "Skills", "StructuredExperiences", "Education", "Training", "Address"]
        for key in required_keys:
            if key not in result:
                result[key] = [] if key in ["Skills", "StructuredExperiences", "Education", "Training"] else ""
        
        # Clean skills - remove company names and duplicates
        if result.get("Skills"):
            cleaned_skills = []
            for skill in result["Skills"]:
                skill = skill.strip()
                # Skip if it looks like a company name or is too short
                if len(skill) > 1 and not self._is_company_name(skill):
                    cleaned_skills.append(skill)
            result["Skills"] = list(set(cleaned_skills))  # Remove duplicates
        
        # Validate experience structure
        if result.get("StructuredExperiences"):
            cleaned_experiences = []
            for exp in result["StructuredExperiences"]:
                if isinstance(exp, dict) and exp.get("title") and exp.get("company"):
                    # Ensure responsibilities is a list
                    if not isinstance(exp.get("responsibilities"), list):
                        exp["responsibilities"] = []
                    cleaned_experiences.append(exp)
            result["StructuredExperiences"] = cleaned_experiences
        
        return result
    
    def _is_company_name(self, text: str) -> bool:
        """Check if text looks like a company name rather than a skill."""
        company_indicators = [
            "inc", "llc", "corp", "ltd", "company", "solutions", "services", 
            "systems", "technologies", "financial", "insurance"
        ]
        text_lower = text.lower()
        return any(indicator in text_lower for indicator in company_indicators)
    
    def _fallback_extraction(self, text: str) -> Dict[str, Any]:
        """Fallback to regex-based extraction if OpenAI fails."""
        logger.info("Using regex fallback extraction...")
        
        return {
            "Name": self._extract_name_regex(text),
            "Summary": self._extract_summary_regex(text),
            "Skills": self._extract_skills_regex(text),
            "StructuredExperiences": self._extract_experiences_regex(text),
            "Education": self._extract_education_regex(text),
            "Training": [],
            "Address": self._extract_address_regex(text),
            "ContactInfo": self._extract_contact_info(text)
        }
    
    def _extract_name_regex(self, text: str) -> str:
        """Regex fallback for name extraction."""
        lines = text.split('\n')[:5]
        for line in lines:
            line = line.strip()
            if re.search(r'@|phone|email|linkedin|github', line.lower()):
                continue
            # Match name with potential credentials (PhD, MBA, M.S., etc.)
            name_match = re.match(r'^([A-Z][a-z]+ [A-Z][a-z]+(?:\s+[A-Z][a-z]+)?(?:,\s*[A-Za-z.]+)*)', line)
            if name_match:
                return name_match.group(1)
        return ""
    
    def _extract_summary_regex(self, text: str) -> str:
        """Regex fallback for summary extraction."""
        summary_pattern = r'(?i)(?:professional\s+)?summary[:\s]*\n(.*?)(?=\n\s*(?:technical\s+skills?|skills?|experience|education))'
        match = re.search(summary_pattern, text, re.DOTALL)
        if match:
            summary = match.group(1).strip()
            summary = re.sub(r'\n+', ' ', summary)
            summary = re.sub(r'\s+', ' ', summary)
            return summary
        return ""
    
    def _extract_skills_regex(self, text: str) -> List[str]:
        """Regex fallback for skills extraction."""
        skills = set()
        
        # Look for technical skills section
        skills_pattern = r'(?i)technical\s+skills?[:\s]*\n(.*?)(?=\n\s*(?:experience|education|projects?))'
        match = re.search(skills_pattern, text, re.DOTALL)
        
        if match:
            skills_text = match.group(1)
            # Split by common separators
            skill_items = re.split(r'[,;]\s*', skills_text.replace('\n', ' '))
            for item in skill_items:
                item = item.strip()
                if item and len(item) > 1 and len(item) < 30:
                    skills.add(item)
        
        return sorted(list(skills))
    
    def _extract_experiences_regex(self, text: str) -> List[Dict[str, Any]]:
        """Regex fallback for experience extraction."""
        experiences = []
        
        # Look for work experience section
        exp_pattern = r'(?i)(?:work\s+)?experience[:\s]*\n(.*?)(?=\n\s*(?:education|projects?|certifications?|$))'
        match = re.search(exp_pattern, text, re.DOTALL)
        
        if match:
            exp_text = match.group(1)
            
            # Look for job entries like "Title | Company | Dates"
            job_pattern = r'([^|\n]+)\s*\|\s*([^|\n]+)\s*\|\s*([^|\n]+)'
            job_matches = list(re.finditer(job_pattern, exp_text))
            
            for i, job_match in enumerate(job_matches):
                title, company, dates = job_match.groups()
                
                # Collect bullet points between this job heading and the next one
                section_end = job_matches[i + 1].start() if i + 1 < len(job_matches) else len(exp_text)
                job_section = exp_text[job_match.end():section_end]
                bullets = re.findall(r'[-•]\s*([^-•\n]+)', job_section)
                responsibilities = [bullet.strip() for bullet in bullets if len(bullet.strip()) > 10]
                
                experiences.append({
                    "title": title.strip(),
                    "company": company.strip(),
                    "date_range": dates.strip(),
                    "responsibilities": responsibilities
                })
        
        return experiences
    
    def _extract_education_regex(self, text: str) -> List[str]:
        """Regex fallback for education extraction."""
        education = []
        
        edu_pattern = r'(?i)education[:\s]*\n(.*?)(?=\n\s*(?:certifications?|projects?|$))'
        match = re.search(edu_pattern, text, re.DOTALL)
        
        if match:
            edu_text = match.group(1)
            edu_lines = [line.strip() for line in edu_text.split('\n') if line.strip()]
            
            for line in edu_lines:
                if len(line) > 10:  # Filter out short lines
                    education.append(line)
        
        return education
    
    def _extract_address_regex(self, text: str) -> str:
        """Regex fallback for address extraction."""
        # Look for address patterns like "6001 Tain Dr. Suite 203, Dublin, OH, 43016"
        address_patterns = [
            r'(\d+\s+[A-Za-z\s\.]+(?:Suite|Apt|Unit)\s+\d+,?\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})',
            r'(\d+\s+[A-Za-z\s\.]+,?\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})',
            r'([A-Za-z\s\d\.]+,\s*[A-Za-z\s]+,\s*[A-Z]{2}\s*\d{5})'
        ]
        
        for pattern in address_patterns:
            match = re.search(pattern, text)
            if match:
                return match.group(1).strip()
        
        return ""


# Main extraction function for compatibility
def extract_sections_openai(text: str, api_key: Optional[str] = None) -> Dict[str, Any]:
    """Extract resume sections using OpenAI API."""
    extractor = OpenAIResumeExtractor(api_key=api_key)
    return extractor.extract_sections_openai(text)
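

# Example usage (illustrative sketch, not part of the module's public API):
# run this file directly against a plain-text resume. It assumes OPENAI_API_KEY
# is set in the environment (or an API key is passed to OpenAIResumeExtractor)
# and that the resume path given on the command line exists; both are assumptions.
if __name__ == "__main__":
    import sys

    logging.basicConfig(level=logging.INFO)

    resume_path = sys.argv[1] if len(sys.argv) > 1 else "resume.txt"
    with open(resume_path, "r", encoding="utf-8") as f:
        resume_text = f.read()

    sections = extract_sections_openai(resume_text)
    print(json.dumps(sections, indent=2))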