amjad awad committed on
Commit
b8b3ced
·
1 Parent(s): f0072a8

Add application file

Browse files
.gitignore ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ .idea/
169
+
170
+ # Ruff stuff:
171
+ .ruff_cache/
172
+
173
+ # PyPI configuration file
174
+ .pypirc
app.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from parser import PDFExtractor, TextExtractor, DOCXExtractor
3
+ from processor import Preprocessor
4
+ from skill import SkillDynamicMatcher
5
+ from similarity import SentenceTransformerSimilarity
6
+ from recommendation import AiRecommendation
7
+
8
+ # Initialize components
9
+ pdf_extractor = PDFExtractor()
10
+ docx_extractor = DOCXExtractor()
11
+ text_extractor = TextExtractor()
12
+ preprocessor = Preprocessor()
13
+ skill_matcher = SkillDynamicMatcher()
14
+ sentence_transformer = SentenceTransformerSimilarity("mixedbread-ai/mxbai-embed-large-v1")
15
+ recommendation = AiRecommendation()
16
+
17
+
def extract(file):
    """
    Extract raw text from an uploaded file.

    :param file: a file path string or a Gradio file object exposing ``.name``;
        ``None`` when nothing was uploaded.
    :return: the extracted text, or a human-readable error message string.
    """
    if file is None:
        return "No file uploaded."
    # Gradio may hand us either a plain path or a tempfile wrapper.
    file_path = file if isinstance(file, str) else file.name
    # Compare case-insensitively so ".PDF" / ".Docx" / ".TXT" uploads
    # are routed correctly instead of falling through to "unsupported".
    lowered = file_path.lower()
    if lowered.endswith('.pdf'):
        return pdf_extractor.extract(file_path)
    elif lowered.endswith('.docx'):
        return docx_extractor.extract(file_path)
    elif lowered.endswith('.txt'):
        return text_extractor.extract(file_path)
    else:
        return "Unsupported file type."
def _render_skill_badges(jd_skills, resume_skills):
    """
    Build a scrollable HTML block of skill badges.

    Each job-description skill renders as a green "✓" badge when it also
    appears in the resume skills, otherwise as a red "✗" badge.

    :param jd_skills: skills extracted from the job description
    :param resume_skills: skills extracted from the resume
    :return: an HTML string
    """
    # Skill names originate from user-uploaded documents, so escape them
    # before embedding in HTML to avoid broken markup / injection.
    from html import escape

    badge_style = (
        "padding: 5px 10px; border-radius: 4px; "
        "margin: 5px 0; display: inline-block;"
    )
    matched_style = f"background-color: #d4edda; color: #155724; {badge_style}"
    missing_style = f"background-color: #f8d7da; color: #721c24; {badge_style}"

    parts = [
        "<div style='max-height: 300px; overflow-y: auto; padding: 10px; "
        "border: 1px solid #e0e0e0; border-radius: 5px; margin-bottom: 15px;'>"
    ]
    for skill in jd_skills:
        if skill in resume_skills:
            parts.append(f"<div style='{matched_style}'>✓ {escape(str(skill))}</div>")
        else:
            parts.append(f"<div style='{missing_style}'>✗ {escape(str(skill))}</div>")
    parts.append("</div>")
    return "".join(parts)


def analyze_files(resume_file, job_description_file):
    """
    Run the full resume-vs-job-description analysis.

    :param resume_file: uploaded resume (path or Gradio file object)
    :param job_description_file: uploaded job description (path or file object)
    :return: a 6-tuple matching the Gradio outputs:
        (resume text, JD text, skill-badge HTML, match-ratio text,
        match-detail text, similarity-score text). On failure the first
        element carries the error message and the rest are empty strings.
    """
    if not resume_file or not job_description_file:
        return "Please upload both files.", "", "", "", "", ""

    try:
        # Extract and normalize text from both documents.
        resume_text = extract(resume_file)
        jd_text = extract(job_description_file)
        preprocess_resume = preprocessor.preprocess(resume_text)
        preprocess_jd = preprocessor.preprocess(jd_text)

        # Skill matching
        matched_jd_skills = skill_matcher.extract(jd_text)
        matched_resume_skills = skill_matcher.extract(resume_text)
        matched_result = skill_matcher.match(matched_jd_skills, matched_resume_skills)

        # Scrollable skill comparison display (escaped HTML badges).
        skill_display = _render_skill_badges(matched_jd_skills, matched_resume_skills)

        # Prepare other outputs.
        # NOTE(review): assumes skill_matcher.match returns a (ratio, details)
        # pair — confirm against SkillDynamicMatcher.
        ratio_text = f"Match Ratio: {matched_result[0]}" if matched_result else "No matches"
        match_string = f"Match Details: {matched_result[1]}" if matched_result else ""
        score = sentence_transformer.similarity(preprocess_resume, preprocess_jd)
        similarity_text = f"Similarity Score: {score:.2f}"

        return resume_text, jd_text, gr.HTML(skill_display), ratio_text, match_string, similarity_text

    except Exception as e:
        # Surface the failure in the first output slot rather than crashing
        # the Gradio worker; remaining outputs are cleared.
        return f"Error: {str(e)}", "", "", "", "", ""
def get_ai_recommendation(resume_file, job_description_file):
    """
    Generate AI-written resume improvement suggestions.

    :param resume_file: uploaded resume (path or Gradio file object)
    :param job_description_file: uploaded job description
    :return: the recommendation text, or an error/instruction message string
    """
    # Both documents are required before the LLM can be asked for advice.
    if not resume_file or not job_description_file:
        return "Please upload both files first."
    try:
        resume_text = extract(resume_file)
        jd_text = extract(job_description_file)
        suggestions = recommendation.recommend(resume_text, jd_text)
    except Exception as e:
        # Report the failure as text so the Gradio UI stays responsive.
        return f"Error: {str(e)}"
    return suggestions
# Custom CSS for scrollable containers.
# ".scrollable-textbox" caps textbox height and adds a scrollbar so long
# extracted documents don't stretch the page layout.
custom_css = """
.scrollable-textbox {
    max-height: 300px;
    overflow-y: auto !important;
    border: 1px solid #e0e0e0;
    border-radius: 5px;
    padding: 10px;
}
.scrollable-textbox textarea {
    min-height: 300px !important;
}
"""

# Gradio UI: two file inputs feed both the analysis pipeline and the
# on-demand AI recommendation tab.
with gr.Blocks(title="Resume Analyzer", css=custom_css) as demo:
    gr.Markdown("# 🧠 Smart Resume Analyzer")

    # File upload
    with gr.Row():
        resume_file = gr.File(label="Your Resume", file_types=[".pdf", ".docx", ".txt"])
        job_description_file = gr.File(label="Job Description", file_types=[".pdf", ".docx", ".txt"])

    analyze_btn = gr.Button("Analyze Documents", variant="primary")

    # Results sections
    with gr.Tab("Extracted Text"):
        with gr.Accordion("Resume Content", open=False):
            resume_output = gr.Textbox(
                label="Resume Text",
                lines=20,
                interactive=False,
                elem_classes=["scrollable-textbox"]
            )
        with gr.Accordion("Job Description", open=False):
            jd_output = gr.Textbox(
                label="Job Description Text",
                lines=20,
                interactive=False,
                elem_classes=["scrollable-textbox"]
            )

    with gr.Tab("Analysis Results"):
        gr.Markdown("## Skill Matching")
        skills_output = gr.HTML(label="Skill Comparison")

        with gr.Row():
            ratio_output = gr.Textbox(label="Match Ratio", interactive=False)
            similarity_output = gr.Textbox(label="Similarity Score", interactive=False)

        match_string_output = gr.Textbox(
            label="Detailed Matching",
            interactive=False,
            elem_classes=["scrollable-textbox"]
        )

    with gr.Tab("AI Recommendations"):
        # Recommendations run from a separate button because the LLM call
        # is much slower than the similarity analysis.
        ai_btn = gr.Button("Generate Recommendations", variant="primary")
        ai_output = gr.Textbox(
            label="AI Suggestions",
            lines=20,
            interactive=False,
            elem_classes=["scrollable-textbox"]
        )

    # Event handlers.
    # NOTE: the outputs list order must match the 6-tuple returned by
    # analyze_files.
    analyze_btn.click(
        analyze_files,
        inputs=[resume_file, job_description_file],
        outputs=[resume_output, jd_output, skills_output, ratio_output, match_string_output, similarity_output],
        scroll_to_output=True
    )

    ai_btn.click(
        get_ai_recommendation,
        inputs=[resume_file, job_description_file],
        outputs=[ai_output],
        scroll_to_output=True
    )

demo.launch()
evaluation/evaluation.csv ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ ,BERT,TF-IDF,Transformer
2
+ accuracy,0.5127913587265491,0.4860716316088687,0.6031836270608301
3
+ precision,0.5127913587265491,0.0,0.5791925465838509
4
+ recall,1.0,0.0,0.8270509977827051
5
+ f1,0.6779406238256295,0.0,0.6812785388127854
evaluation/evaluation.ipynb ADDED
@@ -0,0 +1,437 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "metadata": {},
5
+ "cell_type": "raw",
6
+ "source": "",
7
+ "id": "f50fa4bf6edb488e"
8
+ },
9
+ {
10
+ "cell_type": "code",
11
+ "id": "initial_id",
12
+ "metadata": {
13
+ "collapsed": true
14
+ },
15
+ "source": [
16
+ "import pandas as pd\n",
17
+ "from datasets import load_dataset\n",
18
+ "from processor import Preprocessor\n",
19
+ "from similarity import SentenceTransformerSimilarity, BertSimilarity, TFIDFSimilarity\n",
20
+ "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score"
21
+ ],
22
+ "outputs": [],
23
+ "execution_count": null
24
+ },
25
+ {
26
+ "metadata": {
27
+ "ExecuteTime": {
28
+ "end_time": "2025-07-08T08:44:53.568166Z",
29
+ "start_time": "2025-07-08T08:44:53.563295Z"
30
+ }
31
+ },
32
+ "cell_type": "code",
33
+ "source": "data_path = \"cnamuangtoun/resume-job-description-fit\"",
34
+ "id": "c5adb8b5a5057b1b",
35
+ "outputs": [],
36
+ "execution_count": 2
37
+ },
38
+ {
39
+ "metadata": {
40
+ "ExecuteTime": {
41
+ "end_time": "2025-07-08T08:44:59.342259Z",
42
+ "start_time": "2025-07-08T08:44:55.506716Z"
43
+ }
44
+ },
45
+ "cell_type": "code",
46
+ "source": [
47
+ "datasets = load_dataset(data_path)\n",
48
+ "\n",
49
+ "train_dataset = datasets['train']\n",
50
+ "test_dataset = datasets['test']"
51
+ ],
52
+ "id": "ee31d8d2097183ae",
53
+ "outputs": [],
54
+ "execution_count": 3
55
+ },
56
+ {
57
+ "metadata": {
58
+ "ExecuteTime": {
59
+ "end_time": "2025-07-08T08:45:14.160133Z",
60
+ "start_time": "2025-07-08T08:45:09.061062Z"
61
+ }
62
+ },
63
+ "cell_type": "code",
64
+ "source": [
65
+ "preprocessor = Preprocessor()\n",
66
+ "sentence_transformer = SentenceTransformerSimilarity(\"mixedbread-ai/mxbai-embed-large-v1\")\n",
67
+ "bert_similarity = BertSimilarity()\n",
68
+ "tfidf_similarity = TFIDFSimilarity()"
69
+ ],
70
+ "id": "8a5ab5d98b0212c2",
71
+ "outputs": [],
72
+ "execution_count": 4
73
+ },
74
+ {
75
+ "metadata": {
76
+ "ExecuteTime": {
77
+ "end_time": "2025-07-08T08:45:15.854458Z",
78
+ "start_time": "2025-07-08T08:45:15.850088Z"
79
+ }
80
+ },
81
+ "cell_type": "code",
82
+ "source": [
83
+ "def compute_similarity(example):\n",
84
+ " resume = preprocessor.preprocess(example[\"resume_text\"])\n",
85
+ " job_description = preprocessor.preprocess(example[\"job_description_text\"])\n",
86
+ " label = 0 if example[\"label\"] == \"No Fit\" else 1\n",
87
+ "\n",
88
+ " tfidf = tfidf_similarity.similarity(resume, job_description)\n",
89
+ " bert = bert_similarity.similarity(resume, job_description)\n",
90
+ " transformer = sentence_transformer.similarity(resume, job_description)\n",
91
+ "\n",
92
+ " return {\n",
93
+ " \"resume\": resume, \"job_description\": job_description, \"label\": label,\n",
94
+ " \"tfidf_similarity\": tfidf, \"bert_similarity\": bert, \"sentence_transformer_similarity\": transformer\n",
95
+ " }"
96
+ ],
97
+ "id": "1edb7165aa0fa91a",
98
+ "outputs": [],
99
+ "execution_count": 5
100
+ },
101
+ {
102
+ "metadata": {
103
+ "ExecuteTime": {
104
+ "end_time": "2025-07-08T09:52:30.395538Z",
105
+ "start_time": "2025-07-08T08:45:18.599810Z"
106
+ }
107
+ },
108
+ "cell_type": "code",
109
+ "source": "test_dataset = test_dataset.map(compute_similarity)",
110
+ "id": "49885541fea7acf9",
111
+ "outputs": [
112
+ {
113
+ "name": "stderr",
114
+ "output_type": "stream",
115
+ "text": [
116
+ "Map: 100%|██████████| 1759/1759 [1:06:38<00:00, 2.27s/ examples]\n"
117
+ ]
118
+ }
119
+ ],
120
+ "execution_count": 6
121
+ },
122
+ {
123
+ "metadata": {
124
+ "ExecuteTime": {
125
+ "end_time": "2025-07-05T10:41:18.629642Z",
126
+ "start_time": "2025-07-05T10:41:18.622846Z"
127
+ }
128
+ },
129
+ "cell_type": "code",
130
+ "source": "test_dataset",
131
+ "id": "abf86ee1b574eee8",
132
+ "outputs": [
133
+ {
134
+ "data": {
135
+ "text/plain": [
136
+ "Dataset({\n",
137
+ " features: ['resume_text', 'job_description_text', 'label', 'resume', 'job_description', 'tfidf_similarity', 'bert_similarity', 'sentence_transformer_similarity'],\n",
138
+ " num_rows: 1759\n",
139
+ "})"
140
+ ]
141
+ },
142
+ "execution_count": 14,
143
+ "metadata": {},
144
+ "output_type": "execute_result"
145
+ }
146
+ ],
147
+ "execution_count": 14
148
+ },
149
+ {
150
+ "metadata": {
151
+ "ExecuteTime": {
152
+ "end_time": "2025-07-08T10:13:38.193054Z",
153
+ "start_time": "2025-07-08T10:13:38.189061Z"
154
+ }
155
+ },
156
+ "cell_type": "code",
157
+ "source": [
158
+ "def convert_to_label(example):\n",
159
+ " tfidf = 0 if example[\"tfidf_similarity\"] < 0.5 else 1\n",
160
+ " bert = 0 if example[\"bert_similarity\"] < 0.5 else 1\n",
161
+ " transformer = 0 if example[\"sentence_transformer_similarity\"] < 0.65 else 1\n",
162
+ "\n",
163
+ " return {\"tfidf\": tfidf, \"bert\": bert, \"transformer\": transformer}\n"
164
+ ],
165
+ "id": "9f6448edfb548f6c",
166
+ "outputs": [],
167
+ "execution_count": 73
168
+ },
169
+ {
170
+ "metadata": {
171
+ "ExecuteTime": {
172
+ "end_time": "2025-07-08T10:13:39.971426Z",
173
+ "start_time": "2025-07-08T10:13:39.830089Z"
174
+ }
175
+ },
176
+ "cell_type": "code",
177
+ "source": "test_dataset = test_dataset.map(convert_to_label)",
178
+ "id": "2d688ab6cef016f3",
179
+ "outputs": [
180
+ {
181
+ "name": "stderr",
182
+ "output_type": "stream",
183
+ "text": [
184
+ "Map: 100%|██████████| 1759/1759 [00:00<00:00, 12958.71 examples/s]\n"
185
+ ]
186
+ }
187
+ ],
188
+ "execution_count": 74
189
+ },
190
+ {
191
+ "metadata": {
192
+ "ExecuteTime": {
193
+ "end_time": "2025-07-08T10:13:41.559694Z",
194
+ "start_time": "2025-07-08T10:13:41.555362Z"
195
+ }
196
+ },
197
+ "cell_type": "code",
198
+ "source": "test_dataset",
199
+ "id": "e1ff007dcc6a9974",
200
+ "outputs": [
201
+ {
202
+ "data": {
203
+ "text/plain": [
204
+ "Dataset({\n",
205
+ " features: ['resume_text', 'job_description_text', 'label', 'resume', 'job_description', 'tfidf_similarity', 'bert_similarity', 'sentence_transformer_similarity', 'tfidf', 'bert', 'transformer'],\n",
206
+ " num_rows: 1759\n",
207
+ "})"
208
+ ]
209
+ },
210
+ "execution_count": 75,
211
+ "metadata": {},
212
+ "output_type": "execute_result"
213
+ }
214
+ ],
215
+ "execution_count": 75
216
+ },
217
+ {
218
+ "metadata": {
219
+ "ExecuteTime": {
220
+ "end_time": "2025-07-08T10:14:18.161742Z",
221
+ "start_time": "2025-07-08T10:14:18.155650Z"
222
+ }
223
+ },
224
+ "cell_type": "code",
225
+ "source": [
226
+ "actual = test_dataset[\"label\"]\n",
227
+ "bert_predict = test_dataset[\"bert\"]\n",
228
+ "tfidf_predict = test_dataset[\"tfidf\"]\n",
229
+ "transformer_predict = test_dataset[\"transformer\"]"
230
+ ],
231
+ "id": "a86f6993fe289ac6",
232
+ "outputs": [],
233
+ "execution_count": 76
234
+ },
235
+ {
236
+ "metadata": {
237
+ "ExecuteTime": {
238
+ "end_time": "2025-07-08T10:14:31.353217Z",
239
+ "start_time": "2025-07-08T10:14:31.349873Z"
240
+ }
241
+ },
242
+ "cell_type": "code",
243
+ "source": [
244
+ "def evaluate(actual, predict):\n",
245
+ " accuracy = accuracy_score(actual, predict)\n",
246
+ " precision = precision_score(actual, predict)\n",
247
+ " recall = recall_score(actual, predict)\n",
248
+ " f1 = f1_score(actual, predict)\n",
249
+ " \n",
250
+ " return {\"accuracy\": accuracy, \"precision\": precision, \"recall\": recall, \"f1\": f1}"
251
+ ],
252
+ "id": "b838d444f65bdc9b",
253
+ "outputs": [],
254
+ "execution_count": 79
255
+ },
256
+ {
257
+ "metadata": {
258
+ "ExecuteTime": {
259
+ "end_time": "2025-07-08T10:14:32.609153Z",
260
+ "start_time": "2025-07-08T10:14:32.583900Z"
261
+ }
262
+ },
263
+ "cell_type": "code",
264
+ "source": [
265
+ "bert_evaluate = evaluate(actual, bert_predict)\n",
266
+ "tfidf_evaluate = evaluate(actual, tfidf_predict)\n",
267
+ "transformer_evaluate = evaluate(actual, transformer_predict)"
268
+ ],
269
+ "id": "6aefb66598a946a8",
270
+ "outputs": [],
271
+ "execution_count": 80
272
+ },
273
+ {
274
+ "metadata": {
275
+ "ExecuteTime": {
276
+ "end_time": "2025-07-08T10:14:34.137687Z",
277
+ "start_time": "2025-07-08T10:14:34.133737Z"
278
+ }
279
+ },
280
+ "cell_type": "code",
281
+ "source": [
282
+ "print(\"bert evaluate : \", bert_evaluate)\n",
283
+ "print(\"tfidf evaluate : \", tfidf_evaluate)\n",
284
+ "print(\"transformer evaluate : \", transformer_evaluate)"
285
+ ],
286
+ "id": "f529a915f203f1f2",
287
+ "outputs": [
288
+ {
289
+ "name": "stdout",
290
+ "output_type": "stream",
291
+ "text": [
292
+ "bert evaluate : {'accuracy': 0.5127913587265491, 'precision': 0.5127913587265491, 'recall': 1.0, 'f1': 0.6779406238256295}\n",
293
+ "tfidf evaluate : {'accuracy': 0.4860716316088687, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}\n",
294
+ "transformer evaluate : {'accuracy': 0.6031836270608301, 'precision': 0.5791925465838509, 'recall': 0.8270509977827051, 'f1': 0.6812785388127854}\n"
295
+ ]
296
+ }
297
+ ],
298
+ "execution_count": 81
299
+ },
300
+ {
301
+ "metadata": {
302
+ "ExecuteTime": {
303
+ "end_time": "2025-07-08T10:14:35.938911Z",
304
+ "start_time": "2025-07-08T10:14:35.935413Z"
305
+ }
306
+ },
307
+ "cell_type": "code",
308
+ "source": [
309
+ "data = {\n",
310
+ " 'BERT': bert_evaluate,\n",
311
+ " 'TF-IDF': tfidf_evaluate,\n",
312
+ " 'Transformer': transformer_evaluate\n",
313
+ "}\n",
314
+ "\n",
315
+ "dataframe = pd.DataFrame(data)"
316
+ ],
317
+ "id": "b9260a6b2ba65916",
318
+ "outputs": [],
319
+ "execution_count": 82
320
+ },
321
+ {
322
+ "metadata": {
323
+ "ExecuteTime": {
324
+ "end_time": "2025-07-08T09:56:17.793633Z",
325
+ "start_time": "2025-07-08T09:56:17.786996Z"
326
+ }
327
+ },
328
+ "cell_type": "code",
329
+ "source": "dataframe",
330
+ "id": "e67956d4d96cfb3",
331
+ "outputs": [
332
+ {
333
+ "data": {
334
+ "text/plain": [
335
+ " BERT TF-IDF Transformer\n",
336
+ "accuracy 0.512791 0.486072 0.603184\n",
337
+ "precision 0.512791 0.000000 0.579193\n",
338
+ "recall 1.000000 0.000000 0.827051\n",
339
+ "f1 0.677941 0.000000 0.681279"
340
+ ],
341
+ "text/html": [
342
+ "<div>\n",
343
+ "<style scoped>\n",
344
+ " .dataframe tbody tr th:only-of-type {\n",
345
+ " vertical-align: middle;\n",
346
+ " }\n",
347
+ "\n",
348
+ " .dataframe tbody tr th {\n",
349
+ " vertical-align: top;\n",
350
+ " }\n",
351
+ "\n",
352
+ " .dataframe thead th {\n",
353
+ " text-align: right;\n",
354
+ " }\n",
355
+ "</style>\n",
356
+ "<table border=\"1\" class=\"dataframe\">\n",
357
+ " <thead>\n",
358
+ " <tr style=\"text-align: right;\">\n",
359
+ " <th></th>\n",
360
+ " <th>BERT</th>\n",
361
+ " <th>TF-IDF</th>\n",
362
+ " <th>Transformer</th>\n",
363
+ " </tr>\n",
364
+ " </thead>\n",
365
+ " <tbody>\n",
366
+ " <tr>\n",
367
+ " <th>accuracy</th>\n",
368
+ " <td>0.512791</td>\n",
369
+ " <td>0.486072</td>\n",
370
+ " <td>0.603184</td>\n",
371
+ " </tr>\n",
372
+ " <tr>\n",
373
+ " <th>precision</th>\n",
374
+ " <td>0.512791</td>\n",
375
+ " <td>0.000000</td>\n",
376
+ " <td>0.579193</td>\n",
377
+ " </tr>\n",
378
+ " <tr>\n",
379
+ " <th>recall</th>\n",
380
+ " <td>1.000000</td>\n",
381
+ " <td>0.000000</td>\n",
382
+ " <td>0.827051</td>\n",
383
+ " </tr>\n",
384
+ " <tr>\n",
385
+ " <th>f1</th>\n",
386
+ " <td>0.677941</td>\n",
387
+ " <td>0.000000</td>\n",
388
+ " <td>0.681279</td>\n",
389
+ " </tr>\n",
390
+ " </tbody>\n",
391
+ "</table>\n",
392
+ "</div>"
393
+ ]
394
+ },
395
+ "execution_count": 40,
396
+ "metadata": {},
397
+ "output_type": "execute_result"
398
+ }
399
+ ],
400
+ "execution_count": 40
401
+ },
402
+ {
403
+ "metadata": {
404
+ "ExecuteTime": {
405
+ "end_time": "2025-07-08T09:56:37.438085Z",
406
+ "start_time": "2025-07-08T09:56:37.425383Z"
407
+ }
408
+ },
409
+ "cell_type": "code",
410
+ "source": "dataframe.to_csv(\"evaluation.csv\")",
411
+ "id": "a14601b6c4afef68",
412
+ "outputs": [],
413
+ "execution_count": 41
414
+ }
415
+ ],
416
+ "metadata": {
417
+ "kernelspec": {
418
+ "display_name": "Python 3",
419
+ "language": "python",
420
+ "name": "python3"
421
+ },
422
+ "language_info": {
423
+ "codemirror_mode": {
424
+ "name": "ipython",
425
+ "version": 2
426
+ },
427
+ "file_extension": ".py",
428
+ "mimetype": "text/x-python",
429
+ "name": "python",
430
+ "nbconvert_exporter": "python",
431
+ "pygments_lexer": "ipython2",
432
+ "version": "2.7.6"
433
+ }
434
+ },
435
+ "nbformat": 4,
436
+ "nbformat_minor": 5
437
+ }
parser/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .extract import PDFExtractor, DOCXExtractor, TextExtractor
2
+
3
+ __all__ = ["PDFExtractor", "DOCXExtractor", "TextExtractor"]
parser/extract.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pdfplumber
2
+ from docx import Document
3
+ from abc import ABC, abstractmethod
4
+
5
class Extractor(ABC):
    """
    Abstract base class for extracting text from files.

    Concrete subclasses implement ``extract`` for one file format and
    return the whole file's contents as a single string.
    """
    @abstractmethod
    def extract(self, path):
        """
        Abstract method to extract text from files.
        :param path: the file path
        :return: extracted text
        """
        pass
18
class PDFExtractor(Extractor):
    """
    Extract text from PDF files using pdfplumber.
    """
    def extract(self, path):
        """
        Extract text from a PDF file.
        :param path: the file path
        :return: extracted text (empty string contributions for pages
            without a text layer)
        """
        text = ""
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                # extract_text() returns None for pages with no text layer
                # (e.g. scanned images); guard so the concatenation never
                # raises TypeError.
                text += page.extract_text() or ""

        return text
36
class DOCXExtractor(Extractor):
    """
    Extract text from DOCX files using python-docx.
    """
    def extract(self, path):
        """
        Extract text from a DOCX file.
        :param path: the file path
        :return: extracted text, one line per paragraph
        """
        document = Document(path)
        # Join with newlines: plain concatenation fuses the last word of one
        # paragraph with the first word of the next, corrupting downstream
        # tokenization and skill extraction.
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
54
class TextExtractor(Extractor):
    """
    Extract text from .txt files.
    """
    def extract(self, path):
        """
        Read and return the contents of a plain-text file.
        :param path: the file path
        :return: extracted text
        """
        # Pin UTF-8 instead of relying on the platform default encoding
        # (cp1252 on Windows would mangle or reject UTF-8 resumes), and
        # tolerate the odd undecodable byte rather than crashing.
        with open(path, "r", encoding="utf-8", errors="replace") as file:
            text = file.read()

        return text
processor/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .preprocessor import Preprocessor
2
+
3
+ __all__ = ["Preprocessor"]
processor/preprocessor.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
+ import spacy
3
+ import re
4
+ from nltk.tokenize import word_tokenize
5
+ from nltk.corpus import stopwords
6
+
7
class Preprocessor:
    """
    This class provides methods to perform text preprocessing including tokenization,
    stopword removal, lemmatization, and basic text cleaning.
    """
    def __init__(self, nltk_resource="all", spacy_model="en_core_web_sm", language="english"):
        """
        Download the required NLTK resource and spaCy model if they are missing.
        :param nltk_resource: NLTK resource name, or "all" for everything
        :param spacy_model: name of the spaCy model to load
        :param language: language used for the stopword list
        """
        try:
            if nltk_resource == "all":
                # NOTE(review): f"corpora" has no placeholder — this only checks
                # that a corpora directory exists, not any specific resource;
                # confirm this is the intended availability check.
                nltk.data.find(f"corpora")
            else:
                nltk.data.find(f"corpora/{nltk_resource}")

        except LookupError:
            # Downloading "all" pulls every NLTK dataset — large but simple.
            nltk.download(nltk_resource)

        if not spacy.util.is_package(spacy_model):
            spacy.cli.download(spacy_model)

        self.nlp = spacy.load(spacy_model)
        # Stopword lookup set for the configured language.
        self.stop_word = set(stopwords.words(language))


    def preprocess(self, text):
        """
        This method performs text cleaning, tokenization, and lemmatization.
        :param text: the text to be preprocessed
        :return: the preprocessed text
        """
        # Split "label:" headings and camelCase boundaries apart so words
        # glued together by PDF extraction can be tokenized separately.
        text = re.sub(r'(\w+):', r'\1:\n', text)
        text = re.sub(r'([a-z])([A-Z])', r'\1 \n\2', text)
        # Collapse all whitespace (including the newlines just inserted)
        # to single spaces, then strip punctuation and lowercase.
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = text.lower()

        tokens = word_tokenize(text)
        # Remove stopwords before lemmatization to shrink the spaCy doc.
        tokens = [token for token in tokens if token not in self.stop_word]

        doc = self.nlp(" ".join(tokens))
        tokens = [token.lemma_ for token in doc]

        return " ".join(tokens)
54
+
55
+
56
+
57
+
58
+
59
+
recommendation/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .ai_recommendation import AiRecommendation
2
+
3
+ __all__ = ["AiRecommendation"]
recommendation/ai_recommendation.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer
2
+
3
class AiRecommendation:
    """
    Generates resume improvement suggestions based on a job description using a causal language model.
    """
    def __init__(self, model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct"):
        """
        Initializes the tokenizer and model with the specified model name.
        :param model_name: The name of the model to use.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)


    def recommend(self, resume, job_description):
        """
        Generates targeted resume improvement advice based on a job description.
        :param resume: The user resume
        :param job_description: The job description
        :return: The recommendation text (prompt removed)
        """
        prompt = f"""
        Given the following resume and job description, provide specific, actionable recommendations to improve the resume so it better matches the job description.

        Resume:
        {resume}

        Job Description:
        {job_description}
        """

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=self.tokenizer.eos_token_id,
        )

        # Strip the prompt at the *token* level: decoding the full sequence
        # and slicing off len(prompt) characters is unreliable because a
        # tokenize/decode round-trip does not reproduce the prompt text
        # verbatim (whitespace normalization, special tokens), which can
        # truncate or leak part of the prompt into the answer.
        prompt_token_count = inputs["input_ids"].shape[1]
        answer_only = self.tokenizer.decode(
            outputs[0][prompt_token_count:], skip_special_tokens=True
        ).strip()

        return answer_only
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ datasets==3.6.0
2
+ transformers==4.53.0
3
+ nltk==3.9.1
4
+ pdfplumber==0.11.7
5
+ scikit-learn==1.7.0
6
+ sentence-transformers==5.0.0
7
+ spacy==3.8.7
8
+ pandas==2.3.0
9
+ matplotlib==3.10.3
10
+ RapidFuzz==3.13.0
11
+ python-docx==1.2.0
12
+ streamlit==1.46.1
13
+ torchvision==0.22.1
14
+ huggingface-hub==0.33.2
15
+ gradio==5.35.0
similarity/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .similarity import SentenceTransformerSimilarity, BertSimilarity, TFIDFSimilarity
2
+
3
+ __all__ = ["SentenceTransformerSimilarity", "BertSimilarity", "TFIDFSimilarity"]
similarity/similarity.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from sentence_transformers import SentenceTransformer
3
+ from transformers import BertTokenizerFast, BertModel
4
+ import torch.nn.functional as F
5
+ from sklearn.feature_extraction.text import TfidfVectorizer
6
+ from sklearn.metrics.pairwise import cosine_similarity
7
+ from sentence_transformers.util import cos_sim
8
+
9
class TFIDFSimilarity:
    """
    Computes sentence similarity scores with a TF-IDF bag-of-words model.
    """

    def __init__(self):
        """
        Creates the underlying unigram TfidfVectorizer.
        """
        self.model = TfidfVectorizer(ngram_range=(1, 1))

    def encode(self, sentence1, sentence2):
        """
        Vectorize both sentences with TF-IDF.

        The vectorizer is refit on just this pair, so vectors are only
        comparable to each other, not across calls.

        :param sentence1: The first sentence.
        :param sentence2: The second sentence.
        :return: Tuple of the two sparse TF-IDF row vectors.
        """
        matrix = self.model.fit_transform([sentence1, sentence2])
        return matrix[0], matrix[1]

    def similarity(self, sentence1, sentence2):
        """
        Calculates cosine similarity between two sentences.

        :param sentence1: First sentence text.
        :param sentence2: Second sentence text.
        :return: Cosine similarity score between the two sentences.
        """
        vec1, vec2 = self.encode(sentence1, sentence2)
        return cosine_similarity(vec1, vec2)[0][0]
40
+
41
+
42
+
43
class SentenceTransformerSimilarity:
    """
    Computes sentence similarity using a SentenceTransformer model.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """
        Loads the pretrained SentenceTransformer checkpoint.

        :param model_name: The name of the pretrained SentenceTransformer model.
        """
        self.model = SentenceTransformer(model_name)

    def encode(self, sentence):
        """
        Encodes a sentence into a dense vector representation.

        :param sentence: Input sentence to encode.
        :return: Embedding vector of the sentence.
        """
        return self.model.encode(sentence)

    def similarity(self, sentence1, sentence2):
        """
        Calculates cosine similarity between two sentences.

        :param sentence1: First sentence text.
        :param sentence2: Second sentence text.
        :return: Cosine similarity score between the two sentences.
        """
        first, second = (self.encode(s) for s in (sentence1, sentence2))
        return cos_sim(first, second).item()
75
+
76
+
77
+
78
class BertSimilarity:
    """
    Computes sentence similarity using a pretrained BERT model.
    """

    def __init__(self, model_name="google-bert/bert-base-uncased"):
        """
        Loads the pretrained BERT encoder and its fast tokenizer.

        :param model_name: The name of the pretrained BERT model.
        """
        self.model = BertModel.from_pretrained(model_name)
        self.tokenizer = BertTokenizerFast.from_pretrained(model_name)

    def encode(self, sentence):
        """
        Encodes a sentence as the average of the last 4 hidden layers,
        each mean-pooled over the token dimension.

        :param sentence: Input sentence to encode.
        :return: Sentence embedding tensor.
        """
        encoded = self.tokenizer(
            sentence, return_tensors="pt", padding=True, truncation=True
        )

        # No gradients needed for pure inference.
        with torch.no_grad():
            output = self.model(**encoded, output_hidden_states=True)

        # Mean over tokens within each of the last four layers, then mean
        # across those four pooled layer vectors.
        pooled_layers = [layer.mean(dim=1) for layer in output.hidden_states[-4:]]
        return torch.stack(pooled_layers).mean(dim=0)

    def similarity(self, sentence1, sentence2):
        """
        Calculates cosine similarity between two sentences.

        :param sentence1: First sentence text.
        :param sentence2: Second sentence text.
        :return: Cosine similarity score between the two sentences.
        """
        return F.cosine_similarity(
            self.encode(sentence1), self.encode(sentence2)
        ).item()
skill/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .matcher import SkillListMatcher, SkillDynamicMatcher
2
+
3
+ __all__ = ["SkillListMatcher", "SkillDynamicMatcher"]
skill/matcher.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ from rapidfuzz import process
3
+ from nltk import ngrams
4
+ from nltk.tokenize import word_tokenize
5
+ from huggingface_hub import snapshot_download
6
+
7
+
8
class SkillListMatcher:
    """
    Provides methods to extract and match skills from text against a
    reference skill list, using fuzzy n-gram matching.
    """

    def __init__(self, spacy_model="en_core_web_sm"):
        """
        Initializes the matcher and loads the spaCy model.

        :param spacy_model: Name of the spaCy model to load.
        """
        # Download the model on first use so a fresh environment still works.
        if not spacy.util.is_package(spacy_model):
            spacy.cli.download(spacy_model)

        self.nlp = spacy.load(spacy_model)

    def __lemmatization(self, skills):
        """
        Lemmatizes a list of skills.

        :param skills: List of skill strings.
        :return: List of lowercased, lemmatized skills.
        """
        lemmatized = []
        for skill in skills:
            doc = self.nlp(skill)
            lemmas = " ".join(token.lemma_ for token in doc)
            lemmatized.append(lemmas.lower().strip())

        return lemmatized

    def extract(self, text, skills, threshold=95):
        """
        Extracts relevant skills from the given text.

        :param text: The input text.
        :param skills: List of reference skill strings.
        :param threshold: Minimum fuzzy-match score (0-100) to accept a match.
        :return: List of matched (lemmatized) skills found in the text.
        """
        reference = self.__lemmatization(skills)
        # BUGFIX: extractOne returns None for an empty choices list, which the
        # original code crashed on when unpacking (match, score, _).
        if not reference:
            return []

        tokens = word_tokenize(text.lower())

        # Candidate phrases: every 1- to 4-gram in the text.
        candidates = {
            " ".join(gram) for n in range(1, 5) for gram in ngrams(tokens, n)
        }

        found_skills = set()
        for phrase in candidates:
            match, score, _ = process.extractOne(phrase, reference)
            if score >= threshold:
                found_skills.add(match)

        return list(found_skills)

    def match(self, main_skills, extract_skills):
        """
        Matches extracted skills against the required (main) skills.

        :param main_skills: List of target skill strings.
        :param extract_skills: List of extracted skill strings.
        :return: Tuple of match ratio and formatted "matched/total" string.
        """
        main_set = set(self.__lemmatization(main_skills))
        extracted_set = set(self.__lemmatization(extract_skills))

        # BUGFIX: avoid ZeroDivisionError when no target skills are given.
        if not main_set:
            return 0.0, "0/0"

        # BUGFIX: set intersection so duplicate extracted skills cannot be
        # counted twice and push the ratio above 1.0.
        count = len(main_set & extracted_set)

        return count / len(main_set), f"{count}/{len(main_set)}"
84
+
85
+
86
class SkillDynamicMatcher:
    """
    Extracts and matches skills using a trained spaCy NER model.
    """

    def __init__(self, model_path="amjad-awad/skill-extractor"):
        """
        Initializes the NER model from the specified Hub repo or path.

        :param model_path: Hub repo id (or local path) of the trained NER model.
        """
        local_path = snapshot_download(model_path, repo_type="model")
        self.ner_model = spacy.load(local_path)

    def extract(self, text):
        """
        Extracts skill entities from the input text.

        :param text: The input text.
        :return: Deduplicated list of lowercased skill entity strings.
        """
        doc = self.ner_model(text)

        # Keep only entities whose label marks them as skills.
        skills = {ent.text.lower() for ent in doc.ents if "SKILLS" in ent.label_}

        return list(skills)

    def match(self, main_skills, extract_skills):
        """
        Matches extracted skills against the required (main) skills.

        :param main_skills: List of target skill strings.
        :param extract_skills: List of extracted skill strings.
        :return: Tuple of match ratio and formatted "matched/total" string.
        """
        main_set = set(main_skills)

        # BUGFIX: avoid ZeroDivisionError when no target skills are given.
        if not main_set:
            return 0.0, "0/0"

        # BUGFIX: set intersection so duplicate extracted skills cannot be
        # counted twice and push the ratio above 1.0.
        count = len(main_set & set(extract_skills))

        return count / len(main_set), f"{count}/{len(main_set)}"