| import os |
| import fitz |
| import docx |
| import numpy as np |
| import gradio as gr |
| import re |
| from sentence_transformers import SentenceTransformer, CrossEncoder |
| from sklearn.metrics.pairwise import cosine_similarity |
|
|
| |
| |
| |
|
|
# Bi-encoder: encodes query and documents independently — fast, used for the
# cheap first-pass retrieval (cosine similarity pre-filter).
bi_encoder = SentenceTransformer("BAAI/bge-base-en")
# Cross-encoder: scores a (query, document) pair jointly — slower but more
# accurate, used to re-rank the shortlist produced by the bi-encoder.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
|
|
|
|
| |
| |
| |
|
|
def extract_text(file_path):
    """Extract raw text from a PDF or DOCX document.

    Parameters
    ----------
    file_path : str
        Path to the uploaded file; the (case-insensitive) extension
        decides which parser is used.

    Returns
    -------
    str
        The document's full text, or "" for unsupported extensions —
        callers treat "" as "unreadable, skip this file".
    """
    # Lowercase once so ".PDF" / ".Docx" uploads are not silently rejected.
    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        # PyMuPDF: collect every page's text, join once (avoids quadratic +=).
        parts = []
        with fitz.open(file_path) as doc:
            for page in doc:
                parts.append(page.get_text())
        return "".join(parts)

    if lowered.endswith(".docx"):
        document = docx.Document(file_path)
        return "\n".join(p.text for p in document.paragraphs)

    # Unsupported format (.doc, .txt, images, ...).
    return ""
|
|
|
|
| |
| |
| |
|
|
def clean_text(t):
    """Normalize text for matching: lowercase and collapse every run of
    whitespace (spaces, tabs, newlines) into a single space."""
    lowered = t.lower()
    normalized = re.sub(r"\s+", " ", lowered)
    return normalized
|
|
|
|
| |
| |
| |
|
|
def embed_chunks(text, size=400):
    """Return a single mean-pooled bi-encoder embedding for *text*.

    The text is split into fixed-size character chunks (the bi-encoder
    truncates long inputs, so chunking keeps more of the document visible),
    each chunk is embedded, and the chunk vectors are averaged.

    Parameters
    ----------
    text : str
        Document or query text (already cleaned by the caller).
    size : int, optional
        Chunk length in characters (default 400).

    Returns
    -------
    numpy.ndarray
        1-D mean of the chunk embeddings.
    """
    chunks = [text[i:i + size] for i in range(0, len(text), size)]
    if not chunks:
        # Empty input would yield zero chunks and np.mean over an empty
        # array (NaN vector) — embed the empty string instead so callers
        # always receive a valid vector.
        chunks = [""]
    embs = bi_encoder.encode(chunks)
    return np.mean(embs, axis=0)
|
|
|
|
| |
| |
| |
|
|
# Flat keyword vocabulary used for the skill-overlap score. Entries must be
# lowercase because both job text and CV text are lowercased by clean_text().
SKILLS = [
    "python", "java", "sql", "aws", "docker", "kubernetes",
    "machine learning", "pytorch", "tensorflow",
    "react", "node", "linux",
]


def _contains_skill(text, skill):
    # Whole-word match: plain substring matching wrongly counts "java"
    # inside "javascript" or "node" inside "nodejs".
    return re.search(r"\b" + re.escape(skill) + r"\b", text) is not None


def skill_score(job, cv):
    """Fraction of the job's required skills that also appear in the CV.

    Parameters
    ----------
    job : str
        Lowercased job description.
    cv : str
        Lowercased CV text.

    Returns
    -------
    float
        matched / required in [0, 1]; 0 when the job mentions no known skill.
    """
    job_skills = [s for s in SKILLS if _contains_skill(job, s)]
    if not job_skills:
        return 0
    matched = sum(_contains_skill(cv, s) for s in job_skills)
    return matched / len(job_skills)
|
|
|
|
| |
| |
| |
|
|
def extract_years(text):
    """Best-effort years-of-experience extraction from free text.

    Matches phrases like "5 years", "3+ years", "10-year", "7+years" and
    returns the largest number found. The original pattern required
    whitespace between the number and "years", missing hyphenated and
    tightly-spaced forms.

    Parameters
    ----------
    text : str
        Lowercased CV or job text.

    Returns
    -------
    int
        Maximum matched year count, or 0 when nothing matches.
    """
    nums = re.findall(r"(\d+)\+?[\s-]*years?", text)
    return max((int(n) for n in nums), default=0)
|
|
|
|
| |
| |
| |
|
|
def rank_cvs(job_description, files):
    """Rank uploaded CV files against a job description.

    Pipeline:
      1. Bi-encoder cosine similarity pre-filter (keep top 20 — cheap).
      2. Cross-encoder re-rank of the shortlist (accurate but slow).
      3. Final score = weighted blend of cross-encoder score, skill
         overlap, and years of experience (capped at 10).

    Parameters
    ----------
    job_description : str
        Raw job description text from the UI.
    files : list[str] | None
        File paths of the uploaded CVs (Gradio `type="filepath"`).

    Returns
    -------
    str
        Markdown leaderboard of the top 10 candidates, or a short
        message when there is nothing to rank.
    """
    if not files:
        return "Upload CVs."

    job_description = clean_text(job_description)
    job_emb = embed_chunks(job_description)

    candidates = []
    for path in files:
        text = clean_text(extract_text(path))
        if not text:
            # Unsupported or empty file: skip rather than crash.
            continue

        emb = embed_chunks(text)
        sim = cosine_similarity([job_emb], [emb])[0][0]
        candidates.append({
            "name": os.path.basename(path),
            "text": text,
            "sim": sim,
        })

    if not candidates:
        # Every upload was unreadable — without this guard the empty list
        # would flow into cross_encoder.predict([]) below.
        return "No readable CVs."

    # Cheap pre-filter: cap the shortlist so the cross-encoder sees at most 20.
    candidates = sorted(candidates, key=lambda c: c["sim"], reverse=True)[:20]

    # Cross-encoder scores (query, CV) pairs jointly; CV text is truncated
    # to 3000 characters to bound latency.
    pairs = [[job_description, c["text"][:3000]] for c in candidates]
    ce_scores = cross_encoder.predict(pairs)
    for c, ce in zip(candidates, ce_scores):
        c["ce"] = ce

    for c in candidates:
        # Cache the auxiliary scores so the report below does not recompute them.
        c["skills"] = skill_score(job_description, c["text"])
        c["years"] = extract_years(c["text"])

        # NOTE(review): cross-encoder outputs are unbounded logits while the
        # other two terms are scaled to [0, 1] — the 0.5 weight assumes
        # comparable ranges; confirm or sigmoid-normalize c["ce"].
        c["final"] = (
            0.5 * c["ce"]
            + 0.3 * c["skills"]
            + 0.2 * min(c["years"] / 10, 1)
        )

    candidates.sort(key=lambda c: c["final"], reverse=True)

    # Markdown leaderboard for the top 10.
    sections = []
    for rank, c in enumerate(candidates[:10], start=1):
        sections.append(
            f"### {rank}. {c['name']}\n"
            f"- Final Score: {c['final']:.3f}\n"
            f"- Semantic: {c['ce']:.3f}\n"
            f"- Skill Match: {c['skills']:.2f}\n"
            f"- Years: {c['years']}\n\n"
        )
    return "".join(sections)
|
|
|
|
| |
| |
| |
|
|
# Gradio UI wiring: one textbox for the job description plus a multi-file
# upload; rank_cvs returns Markdown that is rendered directly.
job_input = gr.Textbox(label="Job Description", lines=6)
cv_input = gr.File(file_count="multiple", type="filepath")

demo = gr.Interface(
    fn=rank_cvs,
    inputs=[job_input, cv_input],
    outputs=gr.Markdown(),
    title="Production CV Ranker",
)

# Launch only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|
|