331 changes: 222 additions & 109 deletions main.py
@@ -1,128 +1,241 @@
import os
import re
import json
import fitz
import ast
import logging
from dotenv import load_dotenv
from langchain.agents import create_react_agent, AgentExecutor
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.tools import Tool
from langchain import hub
from typing import List, Dict
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from flask import Flask, request, jsonify

# Load environment variables
load_dotenv()

HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
PDF_PATH = "resume.pdf"
JOB_DESCRIPTION_PDF_PATH = "JD.pdf"

# Flask app setup
app = Flask(__name__)

# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


if not HUGGINGFACEHUB_API_TOKEN:
    raise ValueError("Hugging Face API token is missing. Please set HUGGINGFACEHUB_API_TOKEN in your .env file.")

if not YOUTUBE_API_KEY:
    raise ValueError("YouTube API key is missing. Please set YOUTUBE_API_KEY in your .env file.")


# Initialize the Llama 3 instruct model via Hugging Face inference endpoints
llama_llm = HuggingFaceEndpoint(
    repo_id="meta-llama/Meta-Llama-3-8B-Instruct",
    temperature=0.5,
    huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN,
)

# Initialize the YouTube Data API client
youtube = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

# Load the ReAct prompt template from the LangChain hub
prompt_template = hub.pull("hwchase17/react")

# Query for the ReAct agent
query = (
    "You are an AI assistant that evaluates resumes based on job descriptions. "
    "Use the analyze_resume tool to analyze the combined resume and job description text, "
    "and search YouTube for relevant resources using the search_youtube_videos tool. "
    "Provide short descriptions of the improvement areas as the final answer."
)

def extract_json_from_text(text: str) -> Dict:
    """Extract and parse JSON from a text string safely."""
    try:
        match = re.search(r"\{.*\}", text, re.DOTALL)  # Extract the JSON block
        if match:
            json_string = match.group(0)
            return json.loads(json_string)  # Parse the JSON
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse JSON: {e}")

    logger.error("No valid JSON found in the text.")
    return {}
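For reference, a minimal sketch of how extract_json_from_text behaves on a typical model reply (the sample string below is invented):

sample = 'Here is my analysis:\n{"Improvement Areas": [{"Title": "SQL", "Description": "Add SQL projects."}]}'
parsed = extract_json_from_text(sample)
print(parsed["Improvement Areas"][0]["Title"])  # -> SQL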


def extract_text_from_pdf(pdf_stream):
    """Extract text from a PDF file stream using PyMuPDF."""
    try:
        doc = fitz.open(stream=pdf_stream, filetype="pdf")
        text = "\n".join(page.get_text("text") for page in doc)
        return text
    except Exception as e:
        logger.error(f"Error extracting text from PDF stream: {e}")
        return ""


def combine_text(resume_text: str, job_description_text: str) -> str:
    """Combine resume and job description text into a single formatted string."""
    return f"Resume: {resume_text}\n\nJob Description: {job_description_text}"


def extract_list_from_text(text: str) -> list | None:
    """Extract a Python list from text using regex and safe parsing."""
    match = re.search(r"\[.*?\]", text)
    if match:
        try:
            extracted_list = ast.literal_eval(match.group(0))
            return extracted_list if isinstance(extracted_list, list) else None
        except (SyntaxError, ValueError):
            logger.warning("Failed to parse list from text.")
    return None
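A quick usage sketch: the ReAct agent usually passes tool input as plain text, so this helper pulls the first list literal out of it (the input below is made up):

queries = extract_list_from_text("Action Input: ['SQL optimization', 'Docker basics']")
# -> ['SQL optimization', 'Docker basics']; returns None when no list literal is found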


def analyze_resume(combined_text: str) -> List[Dict[str, str]]:
    """
    Analyze a resume against a job description and return 3 technical improvement areas.
    """
    prompt = f"""
    Analyze the following resume and job description to identify **EXACTLY 3 key technical improvement areas**.
    Focus strictly on **technical skills**, not managerial or soft skills.

    ### Input:
    {combined_text}

    ### Output Format (STRICTLY FOLLOW THIS):
    Return **only one valid JSON object** without any extra text, explanations, or multiple outputs.
    Ensure the JSON structure exactly matches the format below:

    {{
        "Improvement Areas": [
            {{
                "Title": "Improvement Area 1",
                "Description": "Brief description of the first improvement area."
            }},
            {{
                "Title": "Improvement Area 2",
                "Description": "Brief description of the second improvement area."
            }},
            {{
                "Title": "Improvement Area 3",
                "Description": "Brief description of the third improvement area."
            }}
        ]
    }}
    """

    try:
        analysis = llama_llm.invoke(prompt)
        improvement_areas = extract_json_from_text(analysis).get("Improvement Areas", [])

        if not improvement_areas:
            logger.warning("No improvement areas found in the analysis.")
            return []

        return improvement_areas

    except Exception as e:
        logger.error(f"Error in resume analysis: {e}")
        return [{"Title": "Error", "Description": "Failed to analyze resume."}]




def search_youtube_videos(queries: List[str]) -> Dict[str, List[str]]:
    """
    Search YouTube for videos related to the given queries and return links to relevant videos.
    """
    recommended_videos = {}

    for query in queries or []:  # Tolerate None from extract_list_from_text
        try:
            request = youtube.search().list(
                part="snippet", q=query, maxResults=10  # Request the top 10 results; items may be videos or playlists
            )
            response = request.execute()

            video_urls = [
                f"https://www.youtube.com/watch?v={item['id']['videoId']}"
                for item in response.get("items", [])
                if "videoId" in item["id"]
            ]

            if not video_urls:
                logger.warning(f"No video results for query: {query}")
                continue  # Skip retrying the exact same query

            # Keep the top 3 videos out of the 10 results
            recommended_videos[query] = video_urls[:3]

        except HttpError as e:
            logger.error(f"YouTube API error for query '{query}': {e}")
            recommended_videos[query] = ["YouTube API error occurred"]

        except Exception as e:
            logger.error(f"Unexpected error for query '{query}': {e}")
            recommended_videos[query] = ["An error occurred"]

    return recommended_videos
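The return value maps each query to at most three watch URLs; a small example (URLs illustrative):

links = search_youtube_videos(["SQL optimization"])
# -> {"SQL optimization": ["https://www.youtube.com/watch?v=...", ...]}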


def create_tools(combined_text: str) -> list:
    """Create LangChain tools for resume analysis and YouTube search."""
    return [
        Tool(
            name="analyze_resume",
            func=lambda _: analyze_resume(combined_text),  # Agent input is ignored; the text is bound at creation time
            description="Analyze a resume against a job description and return improvement areas.",
        ),
        Tool(
            name="search_youtube_videos",
            func=lambda queries: search_youtube_videos(extract_list_from_text(queries)),
            description="Search YouTube for videos related to the improvement areas.",
        ),
    ]

@app.route('/analyze', methods=['POST'])
def analyze():
    if 'resume' not in request.files or 'jd' not in request.files:
        return jsonify({"error": "Both resume and job description PDFs are required"}), 400

    resume_file = request.files['resume']
    jd_file = request.files['jd']

    if resume_file.filename == '' or jd_file.filename == '':
        return jsonify({"error": "Files must have valid names"}), 400

    try:
        resume_text = extract_text_from_pdf(resume_file.read())
        job_description_text = extract_text_from_pdf(jd_file.read())

        combined_text = combine_text(resume_text, job_description_text)
        tools = create_tools(combined_text)

        react_agent = create_react_agent(llm=llama_llm, tools=tools, prompt=prompt_template)
        agent_executor = AgentExecutor(agent=react_agent, tools=tools, verbose=True, return_intermediate_steps=True)

        response = agent_executor.invoke({"input": query})
        intermediate_steps = response["intermediate_steps"]

        # Assumes the agent calls analyze_resume first and search_youtube_videos second
        improvement_areas = intermediate_steps[0][1]
        youtube_links = intermediate_steps[1][1]

        return jsonify({
            "improvement_areas": improvement_areas,
            "youtube_links": youtube_links
        })

    except Exception as e:
        logger.error(f"Error during analysis: {e}")
        return jsonify({"error": str(e)}), 500

if __name__ == "__main__":
    app.run(debug=False, host='0.0.0.0', port=5000)
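One way to exercise the endpoint locally, assuming the server is running on port 5000 and the two sample PDFs exist under these names (requires the requests package):

import requests

with open("resume.pdf", "rb") as resume, open("JD.pdf", "rb") as jd:
    resp = requests.post(
        "http://localhost:5000/analyze",
        files={"resume": resume, "jd": jd},
    )
print(resp.json())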
1 change: 1 addition & 0 deletions requirements.txt
@@ -79,3 +79,4 @@ uritemplate==4.1.1
urllib3==2.3.0
yarl==1.18.3
zstandard==0.23.0
flask==3.1.0