amjad awad committed
Commit b8b3ced
Parent(s): f0072a8
Add application file
Files changed:
- .gitignore +174 -0
- app.py +186 -0
- evaluation/evaluation.csv +5 -0
- evaluation/evaluation.ipynb +437 -0
- parser/__init__.py +3 -0
- parser/extract.py +67 -0
- processor/__init__.py +3 -0
- processor/preprocessor.py +59 -0
- recommendation/__init__.py +3 -0
- recommendation/ai_recommendation.py +47 -0
- requirements.txt +15 -0
- similarity/__init__.py +3 -0
- similarity/similarity.py +121 -0
- skill/__init__.py +3 -0
- skill/matcher.py +128 -0
.gitignore
ADDED
@@ -0,0 +1,174 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc
app.py
ADDED
@@ -0,0 +1,186 @@
import gradio as gr
from parser import PDFExtractor, TextExtractor, DOCXExtractor
from processor import Preprocessor
from skill import SkillDynamicMatcher
from similarity import SentenceTransformerSimilarity
from recommendation import AiRecommendation

# Initialize components
pdf_extractor = PDFExtractor()
docx_extractor = DOCXExtractor()
text_extractor = TextExtractor()
preprocessor = Preprocessor()
skill_matcher = SkillDynamicMatcher()
sentence_transformer = SentenceTransformerSimilarity("mixedbread-ai/mxbai-embed-large-v1")
recommendation = AiRecommendation()


def extract(file):
    if file is None:
        return "No file uploaded."
    file_path = file if isinstance(file, str) else file.name
    if file_path.endswith('.pdf'):
        return pdf_extractor.extract(file_path)
    elif file_path.endswith('.docx'):
        return docx_extractor.extract(file_path)
    elif file_path.endswith('.txt'):
        return text_extractor.extract(file_path)
    else:
        return "Unsupported file type."


def analyze_files(resume_file, job_description_file):
    if not resume_file or not job_description_file:
        return "Please upload both files.", "", "", "", "", ""

    try:
        # Extract and process text
        resume_text = extract(resume_file)
        jd_text = extract(job_description_file)
        preprocess_resume = preprocessor.preprocess(resume_text)
        preprocess_jd = preprocessor.preprocess(jd_text)

        # Skill matching
        matched_jd_skills = skill_matcher.extract(jd_text)
        matched_resume_skills = skill_matcher.extract(resume_text)
        matched_result = skill_matcher.match(matched_jd_skills, matched_resume_skills)

        # Create scrollable skill display
        skill_display = """
        <div style='
            max-height: 300px;
            overflow-y: auto;
            padding: 10px;
            border: 1px solid #e0e0e0;
            border-radius: 5px;
            margin-bottom: 15px;
        '>
        """
        for skill in matched_jd_skills:
            if skill in matched_resume_skills:
                skill_display += f"""
                <div style='
                    background-color: #d4edda;
                    color: #155724;
                    padding: 5px 10px;
                    border-radius: 4px;
                    margin: 5px 0;
                    display: inline-block;
                '>✓ {skill}</div>
                """
            else:
                skill_display += f"""
                <div style='
                    background-color: #f8d7da;
                    color: #721c24;
                    padding: 5px 10px;
                    border-radius: 4px;
                    margin: 5px 0;
                    display: inline-block;
                '>✗ {skill}</div>
                """
        skill_display += "</div>"

        # Prepare other outputs
        ratio_text = f"Match Ratio: {matched_result[0]}" if matched_result else "No matches"
        match_string = f"Match Details: {matched_result[1]}" if matched_result else ""
        score = sentence_transformer.similarity(preprocess_resume, preprocess_jd)
        similarity_text = f"Similarity Score: {score:.2f}"

        return resume_text, jd_text, gr.HTML(skill_display), ratio_text, match_string, similarity_text

    except Exception as e:
        return f"Error: {str(e)}", "", "", "", "", ""


def get_ai_recommendation(resume_file, job_description_file):
    if not resume_file or not job_description_file:
        return "Please upload both files first."
    try:
        resume_text = extract(resume_file)
        jd_text = extract(job_description_file)
        return recommendation.recommend(resume_text, jd_text)
    except Exception as e:
        return f"Error: {str(e)}"


# Custom CSS for scrollable containers
custom_css = """
.scrollable-textbox {
    max-height: 300px;
    overflow-y: auto !important;
    border: 1px solid #e0e0e0;
    border-radius: 5px;
    padding: 10px;
}
.scrollable-textbox textarea {
    min-height: 300px !important;
}
"""

with gr.Blocks(title="Resume Analyzer", css=custom_css) as demo:
    gr.Markdown("# 🧠 Smart Resume Analyzer")

    # File upload
    with gr.Row():
        resume_file = gr.File(label="Your Resume", file_types=[".pdf", ".docx", ".txt"])
        job_description_file = gr.File(label="Job Description", file_types=[".pdf", ".docx", ".txt"])

    analyze_btn = gr.Button("Analyze Documents", variant="primary")

    # Results sections
    with gr.Tab("Extracted Text"):
        with gr.Accordion("Resume Content", open=False):
            resume_output = gr.Textbox(
                label="Resume Text",
                lines=20,
                interactive=False,
                elem_classes=["scrollable-textbox"]
            )
        with gr.Accordion("Job Description", open=False):
            jd_output = gr.Textbox(
                label="Job Description Text",
                lines=20,
                interactive=False,
                elem_classes=["scrollable-textbox"]
            )

    with gr.Tab("Analysis Results"):
        gr.Markdown("## Skill Matching")
        skills_output = gr.HTML(label="Skill Comparison")

        with gr.Row():
            ratio_output = gr.Textbox(label="Match Ratio", interactive=False)
            similarity_output = gr.Textbox(label="Similarity Score", interactive=False)

        match_string_output = gr.Textbox(
            label="Detailed Matching",
            interactive=False,
            elem_classes=["scrollable-textbox"]
        )

    with gr.Tab("AI Recommendations"):
        ai_btn = gr.Button("Generate Recommendations", variant="primary")
        ai_output = gr.Textbox(
            label="AI Suggestions",
            lines=20,
            interactive=False,
            elem_classes=["scrollable-textbox"]
        )

    # Event handlers
    analyze_btn.click(
        analyze_files,
        inputs=[resume_file, job_description_file],
        outputs=[resume_output, jd_output, skills_output, ratio_output, match_string_output, similarity_output],
        scroll_to_output=True
    )

    ai_btn.click(
        get_ai_recommendation,
        inputs=[resume_file, job_description_file],
        outputs=[ai_output],
        scroll_to_output=True
    )

demo.launch()
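Usage note: the same pipeline can be exercised without the Gradio UI. A minimal sketch, assuming the modules in this commit are importable and that sample_resume.txt and sample_jd.txt are hypothetical local files:

# Hypothetical smoke test for the pipeline wired up in app.py.
from processor import Preprocessor
from skill import SkillDynamicMatcher
from similarity import SentenceTransformerSimilarity

preprocessor = Preprocessor()
matcher = SkillDynamicMatcher()
similarity = SentenceTransformerSimilarity("mixedbread-ai/mxbai-embed-large-v1")

resume = open("sample_resume.txt").read()   # placeholder path
jd = open("sample_jd.txt").read()           # placeholder path

jd_skills = matcher.extract(jd)
resume_skills = matcher.extract(resume)
ratio, detail = matcher.match(jd_skills, resume_skills)
score = similarity.similarity(preprocessor.preprocess(resume), preprocessor.preprocess(jd))
print(f"Skill match: {detail} ({ratio:.0%}), semantic similarity: {score:.2f}")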
evaluation/evaluation.csv
ADDED
@@ -0,0 +1,5 @@
,BERT,TF-IDF,Transformer
accuracy,0.5127913587265491,0.4860716316088687,0.6031836270608301
precision,0.5127913587265491,0.0,0.5791925465838509
recall,1.0,0.0,0.8270509977827051
f1,0.6779406238256295,0.0,0.6812785388127854
evaluation/evaluation.ipynb
ADDED
@@ -0,0 +1,437 @@
{
 "cells": [
  {
   "metadata": {},
   "cell_type": "raw",
   "source": "",
   "id": "f50fa4bf6edb488e"
  },
  {
   "cell_type": "code",
   "id": "initial_id",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "import pandas as pd\n",
    "from datasets import load_dataset\n",
    "from processor import Preprocessor\n",
    "from similarity import SentenceTransformerSimilarity, BertSimilarity, TFIDFSimilarity\n",
    "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T08:44:53.568166Z",
     "start_time": "2025-07-08T08:44:53.563295Z"
    }
   },
   "cell_type": "code",
   "source": "data_path = \"cnamuangtoun/resume-job-description-fit\"",
   "id": "c5adb8b5a5057b1b",
   "outputs": [],
   "execution_count": 2
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T08:44:59.342259Z",
     "start_time": "2025-07-08T08:44:55.506716Z"
    }
   },
   "cell_type": "code",
   "source": [
    "datasets = load_dataset(data_path)\n",
    "\n",
    "train_dataset = datasets['train']\n",
    "test_dataset = datasets['test']"
   ],
   "id": "ee31d8d2097183ae",
   "outputs": [],
   "execution_count": 3
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T08:45:14.160133Z",
     "start_time": "2025-07-08T08:45:09.061062Z"
    }
   },
   "cell_type": "code",
   "source": [
    "preprocessor = Preprocessor()\n",
    "sentence_transformer = SentenceTransformerSimilarity(\"mixedbread-ai/mxbai-embed-large-v1\")\n",
    "bert_similarity = BertSimilarity()\n",
    "tfidf_similarity = TFIDFSimilarity()"
   ],
   "id": "8a5ab5d98b0212c2",
   "outputs": [],
   "execution_count": 4
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T08:45:15.854458Z",
     "start_time": "2025-07-08T08:45:15.850088Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def compute_similarity(example):\n",
    "    resume = preprocessor.preprocess(example[\"resume_text\"])\n",
    "    job_description = preprocessor.preprocess(example[\"job_description_text\"])\n",
    "    label = 0 if example[\"label\"] == \"No Fit\" else 1\n",
    "\n",
    "    tfidf = tfidf_similarity.similarity(resume, job_description)\n",
    "    bert = bert_similarity.similarity(resume, job_description)\n",
    "    transformer = sentence_transformer.similarity(resume, job_description)\n",
    "\n",
    "    return {\n",
    "        \"resume\": resume, \"job_description\": job_description, \"label\": label,\n",
    "        \"tfidf_similarity\": tfidf, \"bert_similarity\": bert, \"sentence_transformer_similarity\": transformer\n",
    "    }"
   ],
   "id": "1edb7165aa0fa91a",
   "outputs": [],
   "execution_count": 5
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T09:52:30.395538Z",
     "start_time": "2025-07-08T08:45:18.599810Z"
    }
   },
   "cell_type": "code",
   "source": "test_dataset = test_dataset.map(compute_similarity)",
   "id": "49885541fea7acf9",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 1759/1759 [1:06:38<00:00, 2.27s/ examples]\n"
     ]
    }
   ],
   "execution_count": 6
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-05T10:41:18.629642Z",
     "start_time": "2025-07-05T10:41:18.622846Z"
    }
   },
   "cell_type": "code",
   "source": "test_dataset",
   "id": "abf86ee1b574eee8",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['resume_text', 'job_description_text', 'label', 'resume', 'job_description', 'tfidf_similarity', 'bert_similarity', 'sentence_transformer_similarity'],\n",
       "    num_rows: 1759\n",
       "})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 14
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T10:13:38.193054Z",
     "start_time": "2025-07-08T10:13:38.189061Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def convert_to_label(example):\n",
    "    tfidf = 0 if example[\"tfidf_similarity\"] < 0.5 else 1\n",
    "    bert = 0 if example[\"bert_similarity\"] < 0.5 else 1\n",
    "    transformer = 0 if example[\"sentence_transformer_similarity\"] < 0.65 else 1\n",
    "\n",
    "    return {\"tfidf\": tfidf, \"bert\": bert, \"transformer\": transformer}\n"
   ],
   "id": "9f6448edfb548f6c",
   "outputs": [],
   "execution_count": 73
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T10:13:39.971426Z",
     "start_time": "2025-07-08T10:13:39.830089Z"
    }
   },
   "cell_type": "code",
   "source": "test_dataset = test_dataset.map(convert_to_label)",
   "id": "2d688ab6cef016f3",
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Map: 100%|██████████| 1759/1759 [00:00<00:00, 12958.71 examples/s]\n"
     ]
    }
   ],
   "execution_count": 74
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T10:13:41.559694Z",
     "start_time": "2025-07-08T10:13:41.555362Z"
    }
   },
   "cell_type": "code",
   "source": "test_dataset",
   "id": "e1ff007dcc6a9974",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dataset({\n",
       "    features: ['resume_text', 'job_description_text', 'label', 'resume', 'job_description', 'tfidf_similarity', 'bert_similarity', 'sentence_transformer_similarity', 'tfidf', 'bert', 'transformer'],\n",
       "    num_rows: 1759\n",
       "})"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 75
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T10:14:18.161742Z",
     "start_time": "2025-07-08T10:14:18.155650Z"
    }
   },
   "cell_type": "code",
   "source": [
    "actual = test_dataset[\"label\"]\n",
    "bert_predict = test_dataset[\"bert\"]\n",
    "tfidf_predict = test_dataset[\"tfidf\"]\n",
    "transformer_predict = test_dataset[\"transformer\"]"
   ],
   "id": "a86f6993fe289ac6",
   "outputs": [],
   "execution_count": 76
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T10:14:31.353217Z",
     "start_time": "2025-07-08T10:14:31.349873Z"
    }
   },
   "cell_type": "code",
   "source": [
    "def evaluate(actual, predict):\n",
    "    accuracy = accuracy_score(actual, predict)\n",
    "    precision = precision_score(actual, predict)\n",
    "    recall = recall_score(actual, predict)\n",
    "    f1 = f1_score(actual, predict)\n",
    "    \n",
    "    return {\"accuracy\": accuracy, \"precision\": precision, \"recall\": recall, \"f1\": f1}"
   ],
   "id": "b838d444f65bdc9b",
   "outputs": [],
   "execution_count": 79
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T10:14:32.609153Z",
     "start_time": "2025-07-08T10:14:32.583900Z"
    }
   },
   "cell_type": "code",
   "source": [
    "bert_evaluate = evaluate(actual, bert_predict)\n",
    "tfidf_evaluate = evaluate(actual, tfidf_predict)\n",
    "transformer_evaluate = evaluate(actual, transformer_predict)"
   ],
   "id": "6aefb66598a946a8",
   "outputs": [],
   "execution_count": 80
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T10:14:34.137687Z",
     "start_time": "2025-07-08T10:14:34.133737Z"
    }
   },
   "cell_type": "code",
   "source": [
    "print(\"bert evaluate : \", bert_evaluate)\n",
    "print(\"tfidf evaluate : \", tfidf_evaluate)\n",
    "print(\"transformer evaluate : \", transformer_evaluate)"
   ],
   "id": "f529a915f203f1f2",
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bert evaluate :  {'accuracy': 0.5127913587265491, 'precision': 0.5127913587265491, 'recall': 1.0, 'f1': 0.6779406238256295}\n",
      "tfidf evaluate :  {'accuracy': 0.4860716316088687, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0}\n",
      "transformer evaluate :  {'accuracy': 0.6031836270608301, 'precision': 0.5791925465838509, 'recall': 0.8270509977827051, 'f1': 0.6812785388127854}\n"
     ]
    }
   ],
   "execution_count": 81
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T10:14:35.938911Z",
     "start_time": "2025-07-08T10:14:35.935413Z"
    }
   },
   "cell_type": "code",
   "source": [
    "data = {\n",
    "    'BERT': bert_evaluate,\n",
    "    'TF-IDF': tfidf_evaluate,\n",
    "    'Transformer': transformer_evaluate\n",
    "}\n",
    "\n",
    "dataframe = pd.DataFrame(data)"
   ],
   "id": "b9260a6b2ba65916",
   "outputs": [],
   "execution_count": 82
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T09:56:17.793633Z",
     "start_time": "2025-07-08T09:56:17.786996Z"
    }
   },
   "cell_type": "code",
   "source": "dataframe",
   "id": "e67956d4d96cfb3",
   "outputs": [
    {
     "data": {
      "text/plain": [
       "               BERT    TF-IDF  Transformer\n",
       "accuracy   0.512791  0.486072     0.603184\n",
       "precision  0.512791  0.000000     0.579193\n",
       "recall     1.000000  0.000000     0.827051\n",
       "f1         0.677941  0.000000     0.681279"
      ],
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>BERT</th>\n",
       "      <th>TF-IDF</th>\n",
       "      <th>Transformer</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>accuracy</th>\n",
       "      <td>0.512791</td>\n",
       "      <td>0.486072</td>\n",
       "      <td>0.603184</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>precision</th>\n",
       "      <td>0.512791</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.579193</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>recall</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.827051</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>f1</th>\n",
       "      <td>0.677941</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.681279</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "execution_count": 40
  },
  {
   "metadata": {
    "ExecuteTime": {
     "end_time": "2025-07-08T09:56:37.438085Z",
     "start_time": "2025-07-08T09:56:37.425383Z"
    }
   },
   "cell_type": "code",
   "source": "dataframe.to_csv(\"evaluation.csv\")",
   "id": "a14601b6c4afef68",
   "outputs": [],
   "execution_count": 41
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
parser/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .extract import PDFExtractor, DOCXExtractor, TextExtractor

__all__ = ["PDFExtractor", "DOCXExtractor", "TextExtractor"]
parser/extract.py
ADDED
@@ -0,0 +1,67 @@
import pdfplumber
from docx import Document
from abc import ABC, abstractmethod

class Extractor(ABC):
    """
    Abstract base class for extracting text from files.
    """
    @abstractmethod
    def extract(self, path):
        """
        Abstract method to extract text from files.
        :param path: the file path
        :return: extracted text
        """
        pass

class PDFExtractor(Extractor):
    """
    Extract text from PDF files.
    """
    def extract(self, path):
        """
        Extract text from a PDF file.
        :param path: the file path
        :return: extracted text
        """
        text = ""
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                # extract_text() returns None for pages without a text layer
                text += (page.extract_text() or "") + "\n"

        return text


class DOCXExtractor(Extractor):
    """
    Extract text from DOCX files.
    """
    def extract(self, path):
        """
        Extract text from a DOCX file.
        :param path: the file path
        :return: extracted text
        """
        text = ""
        document = Document(path)
        for paragraph in document.paragraphs:
            # separate paragraphs so words don't run together
            text += paragraph.text + "\n"

        return text


class TextExtractor(Extractor):
    """
    Extract text from a .txt file.
    """
    def extract(self, path):
        """
        Extract text from a .txt file.
        :param path: the file path
        :return: extracted text
        """
        with open(path, "r") as file:
            text = file.read()

        return text
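Usage note: all three extractors share the same one-method interface, so callers can dispatch on file extension as app.py does. A quick sketch with hypothetical file paths:

# Hypothetical usage of the extractors above; the paths are placeholders.
from parser import PDFExtractor, DOCXExtractor, TextExtractor

for extractor, path in [(PDFExtractor(), "resume.pdf"),
                        (DOCXExtractor(), "resume.docx"),
                        (TextExtractor(), "resume.txt")]:
    text = extractor.extract(path)
    print(type(extractor).__name__, len(text), "characters extracted")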
processor/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .preprocessor import Preprocessor

__all__ = ["Preprocessor"]
processor/preprocessor.py
ADDED
@@ -0,0 +1,59 @@
import nltk
import spacy
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

class Preprocessor:
    """
    This class provides methods to perform text preprocessing including tokenization,
    stopword removal, lemmatization, and basic text cleaning.
    """
    def __init__(self, nltk_resource="all", spacy_model="en_core_web_sm", language="english"):
        """
        Downloads the required NLTK resource and spaCy model if they are missing.
        :param nltk_resource: nltk resource
        :param spacy_model: spacy model
        :param language: the main language
        """
        try:
            if nltk_resource == "all":
                nltk.data.find("corpora")
            else:
                nltk.data.find(f"corpora/{nltk_resource}")

        except LookupError:
            nltk.download(nltk_resource)

        if not spacy.util.is_package(spacy_model):
            spacy.cli.download(spacy_model)

        self.nlp = spacy.load(spacy_model)
        self.stop_word = set(stopwords.words(language))


    def preprocess(self, text):
        """
        This method performs text cleaning, tokenization, and lemmatization.
        :param text: the text to be preprocessed
        :return: the preprocessed text
        """
        text = re.sub(r'(\w+):', r'\1:\n', text)
        text = re.sub(r'([a-z])([A-Z])', r'\1 \n\2', text)
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'[^\w\s]', '', text)
        text = text.lower()

        tokens = word_tokenize(text)
        tokens = [token for token in tokens if token not in self.stop_word]

        doc = self.nlp(" ".join(tokens))
        tokens = [token.lemma_ for token in doc]

        return " ".join(tokens)
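Usage note: the regex passes split headings glued to the following word (a common artifact of PDF extraction), then punctuation is stripped, stopwords removed, and tokens lemmatized. A rough before/after sketch; the exact output depends on the installed NLTK and spaCy models:

# Illustration only; tokens may vary by model version.
from processor import Preprocessor

p = Preprocessor()
print(p.preprocess("Skills: Python, Machine Learning and SQL databases."))
# e.g. -> "skill python machine learning sql database"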
recommendation/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .ai_recommendation import AiRecommendation

__all__ = ["AiRecommendation"]
recommendation/ai_recommendation.py
ADDED
@@ -0,0 +1,47 @@
from transformers import AutoModelForCausalLM, AutoTokenizer

class AiRecommendation:
    """
    Generates resume improvement suggestions based on a job description using a causal language model.
    """
    def __init__(self, model_name="HuggingFaceTB/SmolLM2-1.7B-Instruct"):
        """
        Initializes the tokenizer and model with the specified model name.
        :param model_name: The name of the model to use.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)


    def recommend(self, resume, job_description):
        """
        Generates targeted resume improvement advice based on a job description.
        :param resume: The user resume
        :param job_description: The job description
        :return: The recommendation
        """
        prompt = f"""
Given the following resume and job description, provide specific, actionable recommendations to improve the resume so it better matches the job description.

Resume:
{resume}

Job Description:
{job_description}
"""

        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        outputs = self.model.generate(
            **inputs,
            max_new_tokens=300,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            eos_token_id=self.tokenizer.eos_token_id,
        )

        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer_only = generated_text[len(prompt):].strip()

        return answer_only
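Usage note: a minimal sketch of calling the recommender directly. The model weights are downloaded on first use (several GB), and sampling is enabled, so outputs vary between runs; the inputs here are toy strings:

# Sketch only; toy inputs, nondeterministic output.
from recommendation import AiRecommendation

rec = AiRecommendation()
advice = rec.recommend(
    resume="Python developer, 3 years, Django and PostgreSQL.",
    job_description="Seeking a backend engineer with Python, AWS, and CI/CD experience.",
)
print(advice)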
requirements.txt
ADDED
@@ -0,0 +1,15 @@
datasets==3.6.0
transformers==4.53.0
nltk==3.9.1
pdfplumber==0.11.7
scikit-learn==1.7.0
sentence-transformers==5.0.0
spacy==3.8.7
pandas==2.3.0
matplotlib==3.10.3
RapidFuzz==3.13.0
python-docx==1.2.0
streamlit==1.46.1
torchvision==0.22.1
huggingface-hub==0.33.2
gradio==5.35.0
similarity/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .similarity import SentenceTransformerSimilarity, BertSimilarity, TFIDFSimilarity

__all__ = ["SentenceTransformerSimilarity", "BertSimilarity", "TFIDFSimilarity"]
similarity/similarity.py
ADDED
@@ -0,0 +1,121 @@
import torch
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizerFast, BertModel
import torch.nn.functional as F
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers.util import cos_sim

class TFIDFSimilarity:
    """
    A class for computing sentence similarity using a TF-IDF model.
    """
    def __init__(self):
        """
        Initializes the TfidfVectorizer.
        """
        self.model = TfidfVectorizer(ngram_range=(1, 1))

    def encode(self, sentence1, sentence2):
        """
        Encodes sentence1 and sentence2 using the TF-IDF model.
        :param sentence1: The first sentence.
        :param sentence2: The second sentence.
        :return: The encoded vectors of sentence1 and sentence2.
        """
        encodes = self.model.fit_transform([sentence1, sentence2])
        return encodes[0], encodes[1]


    def similarity(self, sentence1, sentence2):
        """
        Calculates cosine similarity between two sentences.
        :param sentence1: First sentence text.
        :param sentence2: Second sentence text.
        :return: Cosine similarity score between the two sentences.
        """
        embedding1, embedding2 = self.encode(sentence1, sentence2)

        return cosine_similarity(embedding1, embedding2)[0][0]


class SentenceTransformerSimilarity:
    """
    A class for computing sentence similarity using a SentenceTransformer model.
    """
    def __init__(self, model_name="all-MiniLM-L6-v2"):
        """
        Initializes the SentenceTransformerSimilarity with a given model.
        :param model_name: The name of the pretrained SentenceTransformer model
        """
        self.model = SentenceTransformer(model_name)


    def encode(self, sentence):
        """
        Encodes a sentence into a dense vector representation.
        :param sentence: Input sentence to encode.
        :return: Embedding vector of the sentence.
        """
        return self.model.encode(sentence)


    def similarity(self, sentence1, sentence2):
        """
        Calculates cosine similarity between two sentences.
        :param sentence1: First sentence text.
        :param sentence2: Second sentence text.
        :return: Cosine similarity score between the two sentences.
        """
        embedding1 = self.encode(sentence1)
        embedding2 = self.encode(sentence2)

        return cos_sim(embedding1, embedding2).item()


class BertSimilarity:
    """
    A class for computing sentence similarity using a pretrained BERT model.
    """
    def __init__(self, model_name="google-bert/bert-base-uncased"):
        """
        Initializes the BertSimilarity with a given BERT model.
        :param model_name: The name of the pretrained BERT model.
        """
        self.model = BertModel.from_pretrained(model_name)
        self.tokenizer = BertTokenizerFast.from_pretrained(model_name)


    def encode(self, sentence):
        """
        Encodes a sentence into a dense vector using the average of the last 4 hidden layers.
        :param sentence: Input sentence to encode.
        :return: Sentence embedding tensor.
        """
        inputs = self.tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)

        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        last_4_layers = outputs.hidden_states[-4:]

        layer_pooled = [torch.mean(layer, dim=1) for layer in last_4_layers]

        sentence_embedding = torch.mean(torch.stack(layer_pooled), dim=0)

        return sentence_embedding


    def similarity(self, sentence1, sentence2):
        """
        Calculates cosine similarity between two sentences.
        :param sentence1: First sentence text.
        :param sentence2: Second sentence text.
        :return: Cosine similarity score between the two sentences.
        """
        embedding1 = self.encode(sentence1)
        embedding2 = self.encode(sentence2)

        return F.cosine_similarity(embedding1, embedding2).item()
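Usage note: a small sketch contrasting the three backends on the same text pair. Each downloads its model on first call, and the default MiniLM SentenceTransformer is used here rather than the larger mxbai model app.py configures. The scores are not on a shared scale: TF-IDF reflects lexical overlap while the two neural scores reflect embedding geometry, which is why the notebook thresholds them differently.

# Comparing the three similarity backends; example strings only.
from similarity import TFIDFSimilarity, BertSimilarity, SentenceTransformerSimilarity

a = "experienced python developer with machine learning background"
b = "looking for a python engineer familiar with ml models"

print("tfidf:      ", TFIDFSimilarity().similarity(a, b))
print("bert:       ", BertSimilarity().similarity(a, b))
print("transformer:", SentenceTransformerSimilarity().similarity(a, b))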
skill/__init__.py
ADDED
@@ -0,0 +1,3 @@
from .matcher import SkillListMatcher, SkillDynamicMatcher

__all__ = ["SkillListMatcher", "SkillDynamicMatcher"]
skill/matcher.py
ADDED
@@ -0,0 +1,128 @@
import spacy
from rapidfuzz import process
from nltk import ngrams
from nltk.tokenize import word_tokenize
from huggingface_hub import snapshot_download


class SkillListMatcher:
    """
    Provides methods to extract and match skills from text.
    """
    def __init__(self, spacy_model="en_core_web_sm"):
        """
        Initializes the matcher and loads the spaCy model.
        :param spacy_model: Name of the spaCy model to load.
        """
        if not spacy.util.is_package(spacy_model):
            spacy.cli.download(spacy_model)

        self.nlp = spacy.load(spacy_model)


    def __lemmatization(self, skills):
        """
        Lemmatizes a list of skills.
        :param skills: List of skill strings.
        :return: List of lemmatized skills.
        """
        new_skills = []
        for skill in skills:
            doc = self.nlp(skill)
            tokens = [token.lemma_ for token in doc]
            new_skills.append(" ".join(tokens).lower().strip())

        return new_skills


    def extract(self, text, skills, threshold=95):
        """
        Extracts relevant skills from the given text.
        :param text: The input text.
        :param skills: List of reference skill strings.
        :param threshold: Threshold for matching skills.
        :return: List of matched skills found in the text.
        """
        text = text.lower()
        tokens = word_tokenize(text)

        candidates = set()
        for n in range(1, 5):
            for gram in ngrams(tokens, n):
                phrase = ' '.join(gram)
                candidates.add(phrase)

        new_skills = self.__lemmatization(skills)

        found_skills = set()
        for phrase in candidates:
            match, score, _ = process.extractOne(phrase, new_skills)
            if score >= threshold:
                found_skills.add(match)

        return list(found_skills)


    def match(self, main_skills, extract_skills):
        """
        Matches extracted skills with main skills.
        :param main_skills: List of target skill strings.
        :param extract_skills: List of extracted skill strings.
        :return: Tuple of match ratio and formatted match string.
        """
        main_skills = self.__lemmatization(main_skills)
        extract_skills = self.__lemmatization(extract_skills)

        # guard against division by zero when no target skills are given
        if not main_skills:
            return 0.0, "0/0"

        count = 0

        for skill in extract_skills:
            if skill in main_skills:
                count += 1

        return count / len(main_skills), f"{count}/{len(main_skills)}"


class SkillDynamicMatcher:
    """
    Extracts and matches skills using a trained spaCy NER model.
    """
    def __init__(self, model_path="amjad-awad/skill-extractor"):
        """
        Initializes the NER model from the specified path.
        :param model_path: Path to the trained NER model.
        """
        model_path = snapshot_download(model_path, repo_type="model")
        self.ner_model = spacy.load(model_path)


    def extract(self, text):
        """
        Extracts skill entities from the input text.
        :param text: The input text.
        :return: List of extracted skill entities.
        """
        skills = []
        doc = self.ner_model(text)

        for ent in doc.ents:
            if "SKILLS" in ent.label_:
                skills.append(ent.text.lower())

        return list(set(skills))


    def match(self, main_skills, extract_skills):
        """
        Matches extracted skills with main skills.
        :param main_skills: List of target skill strings.
        :param extract_skills: List of extracted skill strings.
        :return: Tuple of match ratio and formatted match string.
        """
        # guard against division by zero when no target skills are extracted
        if not main_skills:
            return 0.0, "0/0"

        count = 0

        for skill in extract_skills:
            if skill in main_skills:
                count += 1

        return count / len(main_skills), f"{count}/{len(main_skills)}"
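Usage note: SkillListMatcher fuzzy-matches 1- to 4-gram phrases from the text against a lemmatized reference list, while SkillDynamicMatcher relies on the NER model instead of a fixed list. A sketch of the list-based variant with a hypothetical skill list:

# Hypothetical skill list and text; assumes NLTK tokenizer data is available
# (the Preprocessor constructor downloads it).
from skill import SkillListMatcher

matcher = SkillListMatcher()
skills = ["python", "machine learning", "docker", "sql"]
found = matcher.extract("Built ML pipelines in Python with Docker.", skills)
ratio, detail = matcher.match(skills, found)
print(found, detail)  # prints the matched skills and a ratio such as '2/4'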