| import os |
| import fitz |
| import docx |
| import numpy as np |
| import gradio as gr |
| import re |
| from sentence_transformers import SentenceTransformer, CrossEncoder |
| from sklearn.metrics.pairwise import cosine_similarity |
|
|
| |
| |
| |
|
|
# Bi-encoder: encodes query and documents independently — fast, used for the
# cheap first-pass retrieval (cosine similarity pre-filter).
bi_encoder = SentenceTransformer("BAAI/bge-base-en")
# Cross-encoder: scores a (query, document) pair jointly — slower but more
# accurate, used to re-rank the shortlist produced by the bi-encoder.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
|
|
|
|
| |
| |
| |
|
|
def extract_text(file_path):
    """Extract raw text from a PDF or DOCX document.

    Parameters
    ----------
    file_path : str
        Path to the uploaded file; the (case-insensitive) extension
        decides which parser is used.

    Returns
    -------
    str
        The document's full text, or "" for unsupported extensions —
        callers treat "" as "unreadable, skip this file".
    """
    # Lowercase once so ".PDF" / ".Docx" uploads are not silently rejected.
    lowered = file_path.lower()

    if lowered.endswith(".pdf"):
        # PyMuPDF: collect every page's text, join once (avoids quadratic +=).
        parts = []
        with fitz.open(file_path) as doc:
            for page in doc:
                parts.append(page.get_text())
        return "".join(parts)

    if lowered.endswith(".docx"):
        document = docx.Document(file_path)
        return "\n".join(p.text for p in document.paragraphs)

    # Unsupported format (.doc, .txt, images, ...).
    return ""
|
|
|
|
| |
| |
| |
|
|
def clean_text(t):
    """Normalize text for matching: lowercase and collapse every run of
    whitespace (spaces, tabs, newlines) into a single space."""
    lowered = t.lower()
    normalized = re.sub(r"\s+", " ", lowered)
    return normalized
|
|
|
|
| |
| |
| |
|
|
def embed_chunks(text, size=400):
    """Return a single mean-pooled bi-encoder embedding for *text*.

    The text is split into fixed-size character chunks (the bi-encoder
    truncates long inputs, so chunking keeps more of the document visible),
    each chunk is embedded, and the chunk vectors are averaged.

    Parameters
    ----------
    text : str
        Document or query text (already cleaned by the caller).
    size : int, optional
        Chunk length in characters (default 400).

    Returns
    -------
    numpy.ndarray
        1-D mean of the chunk embeddings.
    """
    chunks = [text[i:i + size] for i in range(0, len(text), size)]
    if not chunks:
        # Empty input would yield zero chunks and np.mean over an empty
        # array (NaN vector) — embed the empty string instead so callers
        # always receive a valid vector.
        chunks = [""]
    embs = bi_encoder.encode(chunks)
    return np.mean(embs, axis=0)
|
|
|
|
| |
| |
| |
|
|
# Flat keyword vocabulary used for the skill-overlap score. Entries must be
# lowercase because both job text and CV text are lowercased by clean_text().
SKILLS = [
    "python", "java", "sql", "aws", "docker", "kubernetes",
    "machine learning", "pytorch", "tensorflow",
    "react", "node", "linux",
]


def _contains_skill(text, skill):
    # Whole-word match: plain substring matching wrongly counts "java"
    # inside "javascript" or "node" inside "nodejs".
    return re.search(r"\b" + re.escape(skill) + r"\b", text) is not None


def skill_score(job, cv):
    """Fraction of the job's required skills that also appear in the CV.

    Parameters
    ----------
    job : str
        Lowercased job description.
    cv : str
        Lowercased CV text.

    Returns
    -------
    float
        matched / required in [0, 1]; 0 when the job mentions no known skill.
    """
    job_skills = [s for s in SKILLS if _contains_skill(job, s)]
    if not job_skills:
        return 0
    matched = sum(_contains_skill(cv, s) for s in job_skills)
    return matched / len(job_skills)
|
|
|
|
| |
| |
| |
|
|
def extract_years(text):
    """Best-effort years-of-experience extraction from free text.

    Matches phrases like "5 years", "3+ years", "10-year", "7+years" and
    returns the largest number found. The original pattern required
    whitespace between the number and "years", missing hyphenated and
    tightly-spaced forms.

    Parameters
    ----------
    text : str
        Lowercased CV or job text.

    Returns
    -------
    int
        Maximum matched year count, or 0 when nothing matches.
    """
    nums = re.findall(r"(\d+)\+?[\s-]*years?", text)
    return max((int(n) for n in nums), default=0)
|
|
|
|
| |
| |
| |
|
|
def rank_cvs(job_description, files):
    """Rank uploaded CV files against a job description.

    Pipeline:
      1. Bi-encoder cosine similarity pre-filter (keep top 20 — cheap).
      2. Cross-encoder re-rank of the shortlist (accurate but slow).
      3. Final score = weighted blend of cross-encoder score, skill
         overlap, and years of experience (capped at 10).

    Parameters
    ----------
    job_description : str
        Raw job description text from the UI.
    files : list[str] | None
        File paths of the uploaded CVs (Gradio `type="filepath"`).

    Returns
    -------
    str
        Markdown leaderboard of the top 10 candidates, or a short
        message when there is nothing to rank.
    """
    if not files:
        return "Upload CVs."

    job_description = clean_text(job_description)
    job_emb = embed_chunks(job_description)

    candidates = []
    for path in files:
        text = clean_text(extract_text(path))
        if not text:
            # Unsupported or empty file: skip rather than crash.
            continue

        emb = embed_chunks(text)
        sim = cosine_similarity([job_emb], [emb])[0][0]
        candidates.append({
            "name": os.path.basename(path),
            "text": text,
            "sim": sim,
        })

    if not candidates:
        # Every upload was unreadable — without this guard the empty list
        # would flow into cross_encoder.predict([]) below.
        return "No readable CVs."

    # Cheap pre-filter: cap the shortlist so the cross-encoder sees at most 20.
    candidates = sorted(candidates, key=lambda c: c["sim"], reverse=True)[:20]

    # Cross-encoder scores (query, CV) pairs jointly; CV text is truncated
    # to 3000 characters to bound latency.
    pairs = [[job_description, c["text"][:3000]] for c in candidates]
    ce_scores = cross_encoder.predict(pairs)
    for c, ce in zip(candidates, ce_scores):
        c["ce"] = ce

    for c in candidates:
        # Cache the auxiliary scores so the report below does not recompute them.
        c["skills"] = skill_score(job_description, c["text"])
        c["years"] = extract_years(c["text"])

        # NOTE(review): cross-encoder outputs are unbounded logits while the
        # other two terms are scaled to [0, 1] — the 0.5 weight assumes
        # comparable ranges; confirm or sigmoid-normalize c["ce"].
        c["final"] = (
            0.5 * c["ce"]
            + 0.3 * c["skills"]
            + 0.2 * min(c["years"] / 10, 1)
        )

    candidates.sort(key=lambda c: c["final"], reverse=True)

    # Markdown leaderboard for the top 10.
    sections = []
    for rank, c in enumerate(candidates[:10], start=1):
        sections.append(
            f"### {rank}. {c['name']}\n"
            f"- Final Score: {c['final']:.3f}\n"
            f"- Semantic: {c['ce']:.3f}\n"
            f"- Skill Match: {c['skills']:.2f}\n"
            f"- Years: {c['years']}\n\n"
        )
    return "".join(sections)
|
|
|
|
| |
| |
| |
|
|
# Gradio UI wiring: one textbox for the job description plus a multi-file
# upload; rank_cvs returns Markdown that is rendered directly.
job_input = gr.Textbox(label="Job Description", lines=6)
cv_input = gr.File(file_count="multiple", type="filepath")

demo = gr.Interface(
    fn=rank_cvs,
    inputs=[job_input, cv_input],
    outputs=gr.Markdown(),
    title="Production CV Ranker",
)

# Launch only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|
|