| """ |
| ============================================================= |
| DATABASE LOADER - 200 DATASETS KA MANAGER |
| ============================================================= |
| Yeh file saare 200 datasets ko load, process, aur |
| vector store mein store karti hai. |
| Sab kuch background mein hota hai - user ko kuch nahi dikhta. |
| ============================================================= |
| """ |
|
|
| import os |
| import gc |
| import json |
| import time |
| import hashlib |
| import logging |
| import numpy as np |
| from typing import List, Dict, Optional, Tuple, Any |
| from pathlib import Path |
| from tqdm import tqdm |
| from concurrent.futures import ThreadPoolExecutor, as_completed |
|
|
| from config import Config, DatasetConfig |
| from utils import ( |
| clean_text, truncate_text, timer_decorator, |
| setup_logger, format_size, cleanup_memory |
| ) |
|
|
| logger = setup_logger("DatasetLoader") |
|
|
|
|
| |
| |
| |
| class DatasetProcessor: |
| """ |
| Individual dataset ko load aur process karne ka kaam. |
| Har dataset ka format alag hota hai, toh yeh class |
| sab handle karti hai. |
| """ |
| |
def __init__(self):
    """Start with zeroed statistics and make sure the cache directory exists."""
    # Counters that the loader methods update as datasets are processed.
    self.loaded_count = self.failed_count = self.total_documents = 0

    # The cache location comes from the project configuration; create the
    # directory up front so later cache reads/writes can rely on it.
    cache_root = Config.datasets.CACHE_DIR
    self.cache_dir = cache_root
    os.makedirs(cache_root, exist_ok=True)
| |
def load_single_dataset(self, dataset_config: Dict) -> List[Dict]:
    """Load one dataset and return its documents.

    The ``type`` entry of ``dataset_config`` (default ``"huggingface"``)
    selects the loader: ``"huggingface"`` or ``"custom"``. Any other value
    is logged as a warning and yields an empty list.

    Args:
        dataset_config: Dataset description; ``id``, ``name`` and ``type``
            are read here and the whole dict is forwarded to the loader.

    Returns:
        The document dicts produced by the selected loader, or an empty
        list for an unknown type or when the loader raised (the failure is
        logged and ``failed_count`` is incremented).
    """
    ds_id = dataset_config.get("id", "unknown")
    ds_name = dataset_config.get("name", "Unknown")
    kind = dataset_config.get("type", "huggingface")

    logger.info(f"📂 Loading dataset #{ds_id}: {ds_name}")

    try:
        # Pick the loader first, then call it once.
        if kind == "huggingface":
            loader = self._load_huggingface_dataset
        elif kind == "custom":
            loader = self._load_custom_dataset
        else:
            loader = None

        if loader is None:
            logger.warning(f"⚠️ Unknown dataset type: {kind}")
            return []

        return loader(dataset_config)

    except Exception as err:
        logger.error(f"❌ Failed to load dataset #{ds_id} ({ds_name}): {str(err)}")
        self.failed_count += 1
        return []
| |
def _load_huggingface_dataset(self, config: Dict) -> List[Dict]:
    """Load one HuggingFace dataset and convert it into documents.

    Lookup order:
      1. A JSON cache file in ``self.cache_dir`` (named from the dataset id
         and a hash of ``hf_path``). An unreadable or malformed cache file
         is treated as a cache miss, not as a failed load.
      2. The ``train`` split in streaming mode (at most 5000 documents).
      3. The ``train`` split loaded normally (first 5000 rows).
      4. The ``validation``, ``test`` and ``dev`` splits, in that order
         (first 3000 rows each); the first one that loads is used.

    Freshly loaded documents are written to the cache through a temporary
    file that is renamed into place, so an interrupted write cannot leave a
    truncated cache file behind.

    Args:
        config: Dataset description. Keys read here: ``hf_path``,
            ``hf_config``, ``fields`` (``input`` / ``context``), ``name``
            and ``id``.

    Returns:
        The extracted documents, or an empty list when the ``datasets``
        library cannot be imported or every load attempt failed.
        Exceptions are logged, never propagated.

    Side effects:
        Increments ``loaded_count`` and ``total_documents`` on success
        (including cache hits) and ``failed_count`` on failure.
    """
    try:
        hf_path = config.get("hf_path", "")
        hf_config_name = config.get("hf_config", None)
        fields = config.get("fields", {})
        input_field = fields.get("input", "text")
        context_field = fields.get("context", "")
        dataset_name = config.get("name", "")
        dataset_id = config.get("id", 0)

        path_digest = hashlib.md5(hf_path.encode()).hexdigest()[:8]
        cache_file = os.path.join(
            self.cache_dir,
            f"dataset_{dataset_id}_{path_digest}.json"
        )

        # ---- 1. Cache lookup -------------------------------------------
        if os.path.exists(cache_file):
            logger.info(f"📦 Loading from cache: {dataset_name}")
            cached_data = None
            try:
                with open(cache_file, 'r', encoding='utf-8') as f:
                    cached_data = json.load(f)
            except (OSError, ValueError) as cache_err:
                # JSONDecodeError and UnicodeDecodeError are ValueErrors.
                # A broken cache must not make the dataset fail forever;
                # fall through and reload it from the hub instead.
                logger.warning(
                    f"⚠️ Cache read failed, reloading {dataset_name}: {cache_err}"
                )

            if isinstance(cached_data, list):
                self.loaded_count += 1
                # Cached documents count towards the total as well.
                self.total_documents += len(cached_data)
                return cached_data
            if cached_data is not None:
                logger.warning(
                    f"⚠️ Unexpected cache content, reloading {dataset_name}"
                )

        # ---- 2. Import the library only when a real load is needed -----
        try:
            from datasets import load_dataset
        except ImportError:
            logger.error("❌ 'datasets' library not installed!")
            return []

        load_kwargs = {
            "path": hf_path,
            # SECURITY NOTE: this lets dataset repositories execute their
            # own loading code locally. Only list vetted datasets.
            # NOTE(review): confirm the pinned `datasets` version still
            # accepts this flag.
            "trust_remote_code": True,
            "cache_dir": os.path.join(self.cache_dir, "hf_cache"),
        }

        if hf_config_name:
            load_kwargs["name"] = hf_config_name

        # ---- 3. Try the splits in order --------------------------------
        # (split name, streaming?, max rows/documents)
        attempts = (
            ("train", True, 5000),
            ("train", False, 5000),
            ("validation", False, 3000),
            ("test", False, 3000),
            ("dev", False, 3000),
        )

        dataset = None
        documents = None
        train_error = None

        for split_name, streaming, limit in attempts:
            try:
                if streaming:
                    dataset = load_dataset(
                        **load_kwargs, split=split_name, streaming=True
                    )
                    documents = self._process_streaming_dataset(
                        dataset, input_field, context_field,
                        dataset_name, dataset_id, max_samples=limit
                    )
                else:
                    dataset = load_dataset(**load_kwargs, split=split_name)
                    if len(dataset) > limit:
                        dataset = dataset.select(range(limit))
                    documents = self._process_regular_dataset(
                        dataset, input_field, context_field,
                        dataset_name, dataset_id
                    )
                break
            except Exception as attempt_err:
                # Remember the regular "train" failure: it is the error
                # reported when no split can be loaded at all.
                if split_name == "train" and not streaming:
                    train_error = attempt_err
                logger.debug(
                    f"Load attempt failed for {dataset_name} "
                    f"(split={split_name}, streaming={streaming}): {attempt_err}"
                )
        else:
            raise train_error

        # ---- 4. Best-effort cache write (atomic) -----------------------
        if documents:
            tmp_file = f"{cache_file}.tmp"
            try:
                with open(tmp_file, 'w', encoding='utf-8') as f:
                    json.dump(documents[:5000], f, ensure_ascii=False)
                os.replace(tmp_file, cache_file)
                logger.info(f"💾 Cached: {dataset_name} ({len(documents)} docs)")
            except Exception as cache_err:
                # Caching is optional; the loaded documents are still returned.
                logger.warning(f"⚠️ Cache write failed: {cache_err}")
                try:
                    os.remove(tmp_file)
                except OSError:
                    # Nothing to clean up (or cleanup impossible); ignore.
                    pass

        self.loaded_count += 1
        self.total_documents += len(documents)

        # Drop the dataset handle before the next dataset is loaded.
        del dataset
        gc.collect()

        logger.info(f"✅ Loaded #{dataset_id}: {dataset_name} -> {len(documents)} documents")
        return documents

    except Exception as e:
        logger.error(f"❌ HuggingFace load error: {str(e)}")
        self.failed_count += 1
        return []
| |
| def _process_streaming_dataset( |
| self, dataset, input_field, context_field, |
| dataset_name, dataset_id, max_samples=5000 |
| ) -> List[Dict]: |
| """Streaming dataset ko process karo""" |
| |
| documents = [] |
| count = 0 |
| |
| for item in dataset: |
| if count >= max_samples: |
| break |
| |
| doc = self._extract_document( |
| item, input_field, context_field, |
| dataset_name, dataset_id |
| ) |
| |
| if doc: |
| documents.append(doc) |
| count += 1 |
| |
| return documents |
| |
| def _process_regular_dataset( |
| self, dataset, input_field, context_field, |
| dataset_name, dataset_id |
| ) -> List[Dict]: |
| """Regular (non-streaming) dataset ko process karo""" |
| |
| documents = [] |
| |
| for item in dataset: |
| doc = self._extract_document( |
| item, input_field, context_field, |
| dataset_name, dataset_id |
| ) |
| if doc: |
| documents.append(doc) |
| |
| return documents |
| |
| def _extract_document( |
| self, item, input_field, context_field, |
| dataset_name, dataset_id |
| ) -> Optional[Dict]: |
| """ |
| Ek data item se document extract karo. |
| Different formats handle karo. |
| """ |
| |
| try: |
| |
| input_text = self._safe_extract_field(item, input_field) |
| context_text = self._safe_extract_field(item, context_field) if context_field else "" |
| |
| if not input_text: |
| return None |
| |
| |
| input_text = clean_text(str(input_text)) |
| context_text = clean_text(str(context_text)) if context_text else "" |
| |
| |
| if len(input_text) < 10: |
| return None |
| |
| |
| if context_text: |
| combined_text = f"{input_text} {context_text}" |
| else: |
| combined_text = input_text |
| |
| |
| combined_text = truncate_text(combined_text, max_length=2000) |
| |
| return { |
| "text": combined_text, |
| "input": truncate_text(input_text, 1000), |
| "context": truncate_text(context_text, 1000), |
| "metadata": { |
| "dataset_name": dataset_name, |
| "dataset_id": dataset_id, |
| "source": "database", |
| } |
| } |
| |
| except Exception as e: |
| return None |
| |
| def _safe_extract_field(self, item, field_name) -> str: |
| """Safely extract a field from dataset item""" |
| |
| if not field_name: |
| return "" |
| |
| try: |
| |
| if field_name in item: |
| value = item[field_name] |
| |
| elif '.' in field_name: |
| parts = field_name.split('.') |
| value = item |
| for part in parts: |
| if isinstance(value, dict): |
| value = value.get(part, "") |
| else: |
| value = "" |
| break |
| else: |
| value = "" |
| |
| |
| if isinstance(value, str): |
| return value |
| elif isinstance(value, list): |
| |
| if value and isinstance(value[0], str): |
| return " ".join(value[:5]) |
| |
| elif value and isinstance(value[0], dict): |
| texts = [] |
| for v in value[:5]: |
| if 'content' in v: |
| texts.append(str(v['content'])) |
| elif 'text' in v: |
| texts.append(str(v['text'])) |
| elif 'utterance' in v: |
| texts.append(str(v['utterance'])) |
| return " ".join(texts) |
| return str(value) |
| elif isinstance(value, dict): |
| |
| for key in ['text', 'content', 'value', 'answer', 'response']: |
| if key in value: |
| return str(value[key]) |
| return json.dumps(value, ensure_ascii=False)[:500] |
| elif value is not None: |
| return str(value) |
| else: |
| return "" |
| |
| except Exception: |
| return "" |
| |
def _load_custom_dataset(self, config: Dict) -> List[Dict]:
    """Load a custom dataset (one that is not fetched from HuggingFace).

    The documents come from ``_generate_custom_data`` and are cached as
    JSON in ``self.cache_dir``. A cache file that cannot be read or parsed
    is treated as a cache miss and the data is generated again.

    Args:
        config: Dataset description. Keys read here: ``name``, ``id`` and
            ``source``.

    Returns:
        The cached or freshly generated documents.

    Side effects:
        Increments ``loaded_count`` and adds the number of returned
        documents to ``total_documents`` (for cache hits as well).
    """
    dataset_name = config.get("name", "Unknown")
    dataset_id = config.get("id", 0)
    source = config.get("source", "")

    cache_file = os.path.join(self.cache_dir, f"custom_{dataset_id}_{source}.json")

    if os.path.exists(cache_file):
        logger.info(f"📦 Loading custom from cache: {dataset_name}")
        cached_data = None
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cached_data = json.load(f)
        except (OSError, ValueError) as cache_err:
            # JSONDecodeError is a ValueError. A damaged cache should not
            # block the dataset; regenerate it below.
            logger.warning(
                f"⚠️ Custom cache read failed, regenerating {dataset_name}: {cache_err}"
            )

        if isinstance(cached_data, list):
            self.loaded_count += 1
            # Cached documents count towards the total as well.
            self.total_documents += len(cached_data)
            return cached_data
        if cached_data is not None:
            logger.warning(
                f"⚠️ Unexpected custom cache content, regenerating {dataset_name}"
            )

    documents = self._generate_custom_data(source, dataset_name, dataset_id)

    if documents:
        # Write to a temporary file and rename it into place so an
        # interrupted write cannot leave a truncated cache file.
        tmp_file = f"{cache_file}.tmp"
        try:
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(documents, f, ensure_ascii=False)
            os.replace(tmp_file, cache_file)
        except Exception as cache_err:
            # Caching is optional; report the problem and carry on.
            logger.warning(f"⚠️ Cache write failed: {cache_err}")
            try:
                os.remove(tmp_file)
            except OSError:
                # Nothing to clean up (or cleanup impossible); ignore.
                pass

    self.loaded_count += 1
    self.total_documents += len(documents)
    logger.info(f"✅ Custom dataset #{dataset_id}: {dataset_name} -> {len(documents)} docs")

    return documents
| |
| def _generate_custom_data(self, source: str, name: str, dataset_id: int) -> List[Dict]: |
| """ |
| Custom datasets ke liye built-in data generate karo. |
| Yeh real conversational patterns hain jo Ruhi use karegi. |
| """ |
| |
| custom_data_map = { |
| "hindi_proverbs": [ |
| {"text": "Jaise ko taisa - Jo jaisa karta hai uske saath waisa hi hota hai. Yeh kahavat life mein bahut kaam aati hai, jab koi tumhare saath bura kare toh time apna kaam karta hai.", "input": "Jaise ko taisa", "context": "Jo jaisa karta hai uske saath waisa hi hota hai"}, |
| {"text": "Neem hakeem khatra-e-jaan - Adhura gyaan bahut khatarnaak hota hai. Isliye koi bhi kaam karo toh poora seekho pehle.", "input": "Neem hakeem khatra-e-jaan", "context": "Adhura gyaan khatarnaak hota hai"}, |
| {"text": "Dhobi ka kutta na ghar ka na ghat ka - Jab koi insaan kisi ek jagah ka nahi rehta, idhar udhar bhatakta rehta hai.", "input": "Dhobi ka kutta", "context": "Na ghar ka na ghat ka"}, |
| {"text": "Bandar kya jaane adrak ka swaad - Jab kisi ko kisi cheez ki samajh na ho toh woh uski value nahi samajh sakta.", "input": "Bandar kya jaane adrak ka swaad", "context": "Jise samajh na ho woh value nahi samjhega"}, |
| {"text": "Girah ka aadmi girah mein kho jaata hai - Apne logon mein insaan ghul mil jaata hai.", "input": "Girah ka aadmi", "context": "Apnon mein ghul jaana"}, |
| {"text": "Aankh ka andha naam nayansukh - Jab kisi ka naam uski reality se bilkul ulta ho.", "input": "Aankh ka andha naam nayansukh", "context": "Naam aur reality mein farak"}, |
| {"text": "Jaisi karni waisi bharni - Jo bhi kaam karo uska result tumhe hi milega, achha karo toh achha milega bura karo toh bura.", "input": "Jaisi karni waisi bharni", "context": "Karma ka fal milta hai"}, |
| {"text": "Boond boond se sagar bharta hai - Chhoti chhoti mehnat se bada kaam hota hai, patience rakhna padta hai.", "input": "Boond boond se sagar bharta hai", "context": "Chhoti mehnat se bada kaam"}, |
| ], |
| |
| "hindi_idioms": [ |
| {"text": "Aankhon mein dhool jhonkna - Kisi ko dhoka dena ya bewakoof banana, jaise jab koi tumse jhooth bole aur tum believe kar lo.", "input": "Aankhon mein dhool jhonkna", "context": "Dhoka dena"}, |
| {"text": "Nau do gyarah hona - Bhag jaana ya gayab ho jaana, jaise exam ke din kuch bacche nau do gyarah ho jaate hain!", "input": "Nau do gyarah hona", "context": "Bhag jaana"}, |
| {"text": "Ulti Ganga bahana - Kuch ajeeb ya ulta kaam karna jo normally nahi hota.", "input": "Ulti Ganga bahana", "context": "Ulta kaam karna"}, |
| {"text": "Haath kangaan ko aarsi kya - Jo cheez saamne dikh rahi ho uske liye proof ki zaroorat nahi.", "input": "Haath kangaan ko aarsi kya", "context": "Saamne ki cheez ka proof nahi chahiye"}, |
| {"text": "Chullu bhar paani mein doob maro - Bahut sharmindagi ki baat ho tab bolte hain.", "input": "Chullu bhar paani mein doob maro", "context": "Bahut sharmindagi"}, |
| {"text": "Aasman se giraa khajoor mein atka - Ek mushkil se nikle toh doosri mein phase.", "input": "Aasman se giraa khajoor mein atka", "context": "Ek mushkil se doosri mein"}, |
| ], |
| |
| "hindi_recipes": [ |
| {"text": "Butter Chicken banana ka tarika - Sabse pehle chicken ko yogurt, red chili powder, turmeric, garam masala mein marinate karo 2 ghante ke liye. Phir tandoor ya oven mein cook karo. Gravy ke liye tamatar, cashew, butter, cream milakar rich gravy banao. Chicken ko gravy mein daalke 10 minute simmer karo. Naan ya rice ke saath serve karo!", "input": "Butter Chicken Recipe", "context": "Rich creamy North Indian dish"}, |
| {"text": "Momos banana ka easy tarika - Maida ko garamm paani se gundhlo ek soft dough banalo. Filling ke liye chicken ya paneer ko onion, garlic, soy sauce, black pepper ke saath mix karo. Dough ki chhoti chhoti loi banakar patli roti belke filling bharo aur momos ka shape do. Steam mein 15 minute pakao. Red chutney ke saath serve karo!", "input": "Momos Recipe", "context": "Popular street food"}, |
| {"text": "Biryani banana ka sahi tarika - Basmati chawal ko 30 minute bhigoke rakho. Chicken ko yogurt aur biryani masale mein marinate karo. Onion ko golden brown fry karo. Layer by layer chawal aur chicken rakho, saffron milk daalke dum par pakao 25-30 minute. Raita ke saath serve karo!", "input": "Chicken Biryani Recipe", "context": "Classic Hyderabadi style biryani"}, |
| ], |
| |
| "bollywood_dialogues": [ |
| {"text": "Kehte hain agar kisi cheez ko dil se chaho toh poori kaaynat usse tumse milane ki koshish mein lag jaati hai - Shahrukh Khan, Om Shanti Om. Yeh dialogue itna powerful hai ki har kisi ko motivate karta hai ki apne dreams chase karo!", "input": "Om Shanti Om dialogue", "context": "Motivation about dreams"}, |
| {"text": "Mogambo khush hua! - Mr India. Yeh dialogue itna iconic hai ki aaj bhi log use karte hain jab kuch achha hota hai. Amrish Puri ne isko itna legendary bana diya!", "input": "Mr India dialogue", "context": "Iconic villain dialogue"}, |
| {"text": "Bade bade deshon mein aisi chhoti chhoti baatein hoti rehti hain - DDLJ. Shahrukh Khan ka yeh dialogue toh history hai Bollywood ki!", "input": "DDLJ dialogue", "context": "Most famous Bollywood dialogue"}, |
| {"text": "Don ko pakadna mushkil hi nahi namumkin hai - Don. Yeh ek aisi dialogue hai jo confidence aur swag ka perfect example hai!", "input": "Don dialogue", "context": "Confident iconic dialogue"}, |
| {"text": "Rishte mein toh hum tumhare baap lagte hain, naam hai Shehenshah - Shehenshah. Amitabh Bachchan ka yeh dialogue aaj bhi logon ke dimaag mein hai!", "input": "Shehenshah dialogue", "context": "Amitabh classic dialogue"}, |
| {"text": "Aal izz well - 3 Idiots. Rancho ka yeh philosophy life mein bahut kaam aati hai, jab tension ho toh bas bol do aal izz well!", "input": "3 Idiots dialogue", "context": "Life philosophy dialogue"}, |
| {"text": "Zindagi lambi nahi badi honi chahiye - Rajesh Khanna, Anand. Kitni meaningful baat hai yeh, life mein quantity nahi quality matters!", "input": "Anand dialogue", "context": "Meaningful life dialogue"}, |
| {"text": "Picture abhi baaki hai mere dost - Om Shanti Om. Jab lagta hai sab khatam ho gaya tab yeh dialogue yaad aata hai ki abhi bahut kuch baaki hai!", "input": "Om Shanti Om motivational", "context": "Never give up dialogue"}, |
| ], |
| |
| "hindi_shayari": [ |
| {"text": "Mohabbat bhi Zindagi ki tarah hoti hai, har mod par ek naya imtehaan hota hai. Jo nibhaa le woh kamaal karta hai, jo chhod de woh bhi insaan hota hai. - Yeh shayari batati hai ki pyaar mein ups downs aate hain lekin nibhana hi asli pyaar hai.", "input": "Mohabbat shayari", "context": "Love and life"}, |
| {"text": "Dil ki baat zuban par laana seekh lo, warna yeh duniya sunne waali nahi hai. Khud ki awaaz bano zindagi mein, kyunki koi tumhari awaaz banne waala nahi hai. - Yeh shayari self-expression ke baare mein hai.", "input": "Self expression shayari", "context": "Speaking your heart"}, |
| {"text": "Kuch log zindagi mein aise aate hain jaise sawan ki pehli baarish, dil ko sukoon de jaate hain chahe woh ek pal ke liye hi sahi. Unki yaadein hamesha saath rehti hain jaise khushboo rehti hai phoolon mein.", "input": "Beautiful memories shayari", "context": "Memories and people"}, |
| {"text": "Tanhaayi mein bhi ek maza hai, apne aap se baatein karna seekh lo. Duniya chhod do thodi der ke liye, khud ko samajhna seekh lo.", "input": "Solitude shayari", "context": "Finding peace in solitude"}, |
| ], |
| |
| "hindi_jokes": [ |
| {"text": "Teacher: Batao, duniya ka sabse bada jhooth kya hai? Student: Jab teacher bole 'last question' aur phir 10 aur puch le! 😂 Yeh toh har student ki life mein hota hai na, teachers ka 'last question' kabhi last nahi hota!", "input": "Student teacher joke", "context": "School humor"}, |
| {"text": "Pappu se pucha: Tumhare papa kya karte hain? Pappu: Woh ek bahut bade aadmi hain! Pucha: Matlab? Pappu: 120 kg hain! 😂 Pappu ki innocence hi uski sabse badi takat hai!", "input": "Pappu joke", "context": "Innocent humor"}, |
| {"text": "Wife: Suno ji, aaj mujhe shopping karna hai. Husband: Budget kitna hai? Wife: Budget? Woh kya hota hai? 😂 Har Indian household ki kahani hai yeh!", "input": "Husband wife joke", "context": "Marriage humor"}, |
| {"text": "Doctor: Aapko roz subah jogging karni chahiye. Patient: Doctor sahab, main toh roz bus ke peeche bhagta hun, woh count nahi hoga kya? 😂", "input": "Doctor patient joke", "context": "Health humor"}, |
| {"text": "Ek aadmi ne dusre se pucha: Bhai, tumhare ghar mein kaun decision leta hai? Dusra bola: Decision toh main leta hun, lekin approve meri wife karti hai! 😂", "input": "Decision making joke", "context": "Family humor"}, |
| ], |
| |
| "relationship_advice": [ |
| {"text": "Agar tumhara partner tumse zyada baat nahi kar raha toh pehle samjho ki unke upar koi pressure toh nahi hai. Kabhi kabhi log apne problems mein itne doobe hote hain ki baat karna mushkil ho jaata hai. Unhe space do lekin saath mein yeh bhi feel karao ki tum unke liye hamesha available ho. Communication key hai har relationship mein.", "input": "Partner not talking much", "context": "Give space but stay available"}, |
| {"text": "Trust toot jaaye toh rebuild karna mushkil hai lekin namumkin nahi. Sabse pehle honest conversation karo, apne feelings share karo bina blame kiye. Dheere dheere chhoti chhoti cheezon se trust wapas build hota hai. Patience rakho aur consistent raho.", "input": "How to rebuild trust", "context": "Trust building in relationships"}, |
| {"text": "Long distance relationship mein communication sabse important hai. Roz baat karo, video call karo, chhote chhote messages bhejo din bhar. Ek dusre ki life mein involved raho. Plan banao milne ka. Aur sabse important - trust rakho ek dusre par.", "input": "Long distance relationship tips", "context": "Maintaining long distance love"}, |
| ], |
| |
| "mental_health": [ |
| {"text": "Agar tumhe anxiety feel ho rahi hai toh deep breathing try karo - 4 second inhale, 7 second hold, 8 second exhale. Yeh technique bahut effective hai. Aur yaad rakho, anxiety ek normal feeling hai, tum akele nahi ho isme. Bahut log face karte hain aur manage karte hain.", "input": "Dealing with anxiety", "context": "Breathing technique for anxiety"}, |
| {"text": "Agar raat ko neend nahi aati toh phone use karna band karo sone se 1 ghanta pehle. Room dark rakho, temperature comfortable rakho. Sone se pehle light reading ya meditation try karo. Caffeine evening mein avoid karo.", "input": "Sleep problems", "context": "Better sleep tips"}, |
| ], |
| |
| "life_advice": [ |
| {"text": "Career choose karte waqt sirf paisa mat dekho, dekho ki tumhe kya karne mein maza aata hai. Passion follow karo toh paisa apne aap aata hai. Lekin practical bhi raho, apni skills develop karo, networking karo, aur consistent raho. Success raat bhar mein nahi aata, years lagte hain.", "input": "Career advice", "context": "Follow passion with practicality"}, |
| {"text": "Time management ke liye Pomodoro technique use karo - 25 minute kaam karo, 5 minute break lo. 4 cycles ke baad 15-20 minute ka lamba break lo. Yeh technique students aur working professionals dono ke liye bahut effective hai.", "input": "Time management tips", "context": "Pomodoro technique"}, |
| ], |
| |
| "fashion": [ |
| {"text": "Indian skin tone ke liye best colors hain - maroon, navy blue, emerald green, mustard yellow, aur off-white. Yeh colors Indian complexion par bahut achhe lagte hain. Black toh classic hai hi! Pastels bhi try kar sakte ho agar light skin tone hai.", "input": "Best colors for Indian skin", "context": "Fashion color guide"}, |
| {"text": "College ke liye comfortable yet stylish outfits - well-fitted jeans with kurta, sneakers ke saath. Ya phir palazzo pants with crop top. Accessories mein jhumke aur oxidized jewelry Indian look deti hai. Bag choose karo jo books bhi carry kar sake aur stylish bhi lage!", "input": "College outfit ideas", "context": "Student fashion tips"}, |
| ], |
| |
| "study_tips": [ |
| {"text": "Board exams ki preparation ke liye sabse pehle NCERT complete karo, yeh non-negotiable hai. Phir previous year papers solve karo minimum 10 saal ke. Notes banao apni language mein, formulas ki separate copy rakho. Revision schedule banao - weekly revision bahut important hai. Aur haan, health ka bhi dhyan rakho, proper sleep lo!", "input": "Board exam preparation", "context": "Study tips for boards"}, |
| {"text": "Maths mein strong hone ke liye roz practice karo, minimum 2 ghante. Concepts clear karo pehle, phir problems solve karo easy se hard ki taraf. Formulas roz revise karo. Galtiyon ko note karo aur wahi galtiyan dobara na karo. YouTube par free tutorials dekho concepts samajhne ke liye.", "input": "How to improve in maths", "context": "Mathematics study tips"}, |
| ], |
| |
| "music_rec": [ |
| {"text": "Agar tum sad feel kar rahe ho toh yeh gaane suno - Tum Hi Ho (Arijit Singh), Channa Mereya (Arijit Singh), Agar Tum Saath Ho (AR Rahman), Phir Le Aaya Dil (Arijit Singh), Hamari Adhuri Kahani. Yeh gaane tumhare emotions ko feel karne denge aur phir achha feel karoge.", "input": "Sad mood songs", "context": "Hindi songs for sad mood"}, |
| {"text": "Party mood mein hai toh yeh bajao - Kar Gayi Chull, London Thumakda, Gallan Goodiyaan, Badtameez Dil, Desi Girl, Chammak Challo! Inhe sunke toh pair apne aap thirkne lagte hain!", "input": "Party songs Hindi", "context": "Upbeat Bollywood songs"}, |
| ], |
| |
| "movie_rec": [ |
| {"text": "Feel good movies dekhni hain toh yeh try karo - 3 Idiots (inspiration + comedy), Zindagi Na Milegi Dobara (friendship + travel), Queen (women empowerment), Dil Chahta Hai (friendship goals), Tamasha (self-discovery). Yeh movies dekhke tum motivated feel karoge aur life ko naye angle se dekhoge!", "input": "Feel good Bollywood movies", "context": "Uplifting movie recommendations"}, |
| {"text": "Horror movies pasand hain toh - Tumbbad (best Indian horror), Stree (comedy horror), Bhool Bhulaiyaa (classic), Pari (dark horror), 1920. Lekin haan lights on rakhna dekhte waqt 😂", "input": "Best Indian horror movies", "context": "Horror movie recommendations"}, |
| ], |
| |
| "indian_culture": [ |
| {"text": "Diwali India ka sabse bada festival hai - yeh buraai par achhaai ki jeet ka celebration hai. Log ghar sajaate hain diyon se, rangoli banaate hain, naye kapde pehnte hain, mithaai baantte hain, aur puja karte hain Lakshmi-Ganesh ki. Bachche patakhe fodte hain (lekin ab eco-friendly celebrate karna chahiye!). Family saath mein hoti hai, yahi Diwali ki asli khushi hai.", "input": "Diwali festival", "context": "Indian festival of lights"}, |
| {"text": "Holi rang aur pyaar ka festival hai - log ek dusre ko rang lagate hain, gujiya khate hain, thandai peete hain, aur dance karte hain. Holika dahan hoti hai raat ko. Yeh festival batata hai ki sab differences bhula ke ek saath khushi manao!", "input": "Holi festival", "context": "Indian festival of colors"}, |
| ], |
| |
| "greetings_time": [ |
| {"text": "Good morning yaar! Uth gaye? Kya plan hai aaj ka? Main toh abhi uthi hun, chai pi rahi hun. Subah subah kitna achha lagta hai na jab thandi hawa aati hai aur birds chirping karti hain. Aaj ka din achha jaaye tumhara! 🌅✨", "input": "Good morning", "context": "Morning greeting with warmth"}, |
| {"text": "Good night yaar! Bohot late ho gaya hai, so jao ab. Kal phir milte hain! Sweet dreams aur achhe sapne aayein. Aur haan, phone rakh do ab, neend achhi aayegi 😴🌙✨", "input": "Good night", "context": "Night greeting with care"}, |
| {"text": "Good afternoon! Lunch kiya? Main toh abhi khana khaake aayi hun, dal chawal khaya aaj. Afternoon mein neend aati hai na bahut, lekin kaam bhi karna hai! Tum kya kar rahe ho? ☀️", "input": "Good afternoon", "context": "Afternoon casual greeting"}, |
| ], |
| |
| "comfort": [ |
| {"text": "Arre yaar, itna sad mat ho! Main samajh sakti hun tum kya feel kar rahe ho. Kabhi kabhi life mein aise phases aate hain jab sab kuch mushkil lagta hai, lekin yeh bhi guzar jaayega trust me. Tum strong ho, tum handle kar loge. Aur main hun na tere saath, kuch bhi ho baat kar mere saath. Akele mat raho feelings ke saath 💕", "input": "Feeling very sad", "context": "Comfort and emotional support"}, |
| {"text": "Dekh yaar, failure ek part hai life ka. Har successful insaan ne fail kiya hai pehle. Edison ne 1000 baar fail kiya bulb banane se pehle. Toh tum ek baar fail hue toh kya hua? Seekho isse aur aage badho. Main believe karti hun tum mein, tum kar sakte ho! 💪✨", "input": "Failed in something", "context": "Motivation after failure"}, |
| ], |
| |
| "friendship": [ |
| {"text": "Yaar best friends woh hote hain jo tumhe bina judge kiye accept karte hain. Jo tumhare saath hasein bhi aur roein bhi. Jinke saath tum apni asli personality dikha sako bina kisi darr ke. Agar tumhare paas aisa ek bhi dost hai toh tum bahut lucky ho! Aur main bhi tumhari dost hun, yaad rakhna 😊💕", "input": "What is true friendship", "context": "Meaning of real friendship"}, |
| {"text": "Missing your best friend? Main samajhti hun yeh feeling. Jab woh door hote hain toh sab kuch ajeeb lagta hai. Lekin sachhi dosti distance se kam nahi hoti. Call karo unhe, video call karo, msg karo - connection bana ke rakho. Aur jab miloge toh double maza aayega! 🫂✨", "input": "Missing best friend", "context": "Long distance friendship"}, |
| ], |
| |
| "funny_replies": [ |
| {"text": "Arre yaar tum toh comedian ho! 😂 Itna funny kaise ho tum? Main toh has has ke pagal ho gayi! Waise seriously bol rahi hun, tumhare saath baat karke boring nahi lagta kabhi. Keep being this funny, duniya ko aisi logon ki zaroorat hai! 😄✨", "input": "Someone said something funny", "context": "Appreciating humor"}, |
| {"text": "Hahahaha! Yaar yeh sunke toh meri chai naak se nikal gayi! 😂😂 Tum toh stand-up comedy kar sakte ho seriously! Kapil Sharma ko competition de doge! Aur jokes sunao na please, main bore ho rahi hun! 😄", "input": "Very funny joke shared", "context": "Laughing at jokes"}, |
| ], |
| |
| "motivation": [ |
| {"text": "Sun yaar, main jaanti hun abhi mushkil lag raha hai. Lekin tu soch - ek saal pehle tu kahan tha aur aaj kahan hai? Tu grow kar raha hai, chhoti chhoti steps mein hi sahi. Har din ek naya chance hai kuch better karne ka. Give up mat kar, tere andar bahut potential hai. Main believe karti hun tujh mein, ab tu bhi believe kar khud mein! 💪🔥✨", "input": "Feeling like giving up", "context": "Motivation to keep going"}, |
| {"text": "Tujhe pata hai na ki diamond banne ke liye carbon ko kitna pressure sehna padta hai? Waise hi tu abhi jo pressure feel kar raha hai na, yeh tujhe strong bana raha hai. Aaj jo mushkil hai kal teri success story banega. Bas rukna mat, chalta reh! 💎✨", "input": "Under too much pressure", "context": "Pressure makes diamonds"}, |
| ], |
| |
| "school_chat": [ |
| {"text": "Yaar school ki yaadein toh sabse best hoti hain! Woh last bench par baithke bakwaas karna, teacher se chupke notes pass karna, canteen ki samose ki line, annual function ki preparation, farewell mein rona - sab kuch miss karta hun main! Tum school mein ho ya college? Batao na kya chal raha hai! 📚✨", "input": "School memories", "context": "Nostalgic school talk"}, |
| {"text": "Board exams ki tension? Arre yaar main bhi 12th mein hun, same situation hai meri! Lekin dekh, tension lene se kuch nahi hota, preparation karna padta hai. Chal saath mein plan banate hain - konsa subject pehle, kitne ghante padhna hai, kab revision karni hai. Together we can do this! 📖💪", "input": "Board exam stress", "context": "Exam preparation buddy"}, |
| ], |
| |
| "daily_routine": [ |
| {"text": "Mera typical din aisa hota hai - subah 7 baje uthti hun (ya try karti hun 😅), chai peeti hun, phir thoda yoga karti hun. School/college ke baad evening mein music sunti hun ya sketching karti hun. Raat ko padhai karti hun aur 11 baje tak so jaati hun. Weekends par toh Netflix aur chill! Tumhara routine kaisa hai? 🌅🌙", "input": "Daily routine discussion", "context": "Sharing daily schedule"}, |
| {"text": "Sunday ka din toh sabse mast hota hai! Late uthna, mummy ke haath ka achha khana, Netflix binge karna, friends ke saath bahar jaana ya bas ghar par aaram karna. Sunday ko toh 48 ghante ka hona chahiye honestly! Tum kya karte ho Sundays ko? ☀️😊", "input": "Weekend plans", "context": "Sunday relaxation chat"}, |
| ], |
| |
| "random_fun": [ |
| {"text": "Ek fun fact batati hun - kya tum jaante ho ki octopus ke 3 hearts hote hain? Aur dolphins ek aankh khol ke sote hain! Nature mein kitni amazing cheezein hain na! Aur ek aur - honey kabhi expire nahi hoti, 3000 saal purani honey bhi khaane layak hoti hai! Mind = Blown! 🤯✨", "input": "Random fun facts", "context": "Amazing facts sharing"}, |
| {"text": "Chal ek game khelte hain - main ek word bolunga aur tum usse related pehla word bolna jo tumhare dimaag mein aaye! Ready? Yeh game bahut fun hai aur isse pata chalta hai ki tum kya soch rahe ho! Main start karti hun... 'Chocolate'! Ab tumhari baari! 🎮😊", "input": "Playing word games", "context": "Fun interactive games"}, |
| ], |
| |
| "skincare": [ |
| {"text": "Indian skin ke liye basic skincare routine - Subah: Face wash (gentle wala), toner, moisturizer, sunscreen (SPF 50). Raat ko: Double cleansing, serum (vitamin C ya niacinamide), night cream. Haldi wala face pack hafte mein ek baar lagao. Aur pani bohot piyo, yeh sabse important hai skin ke liye! 💧✨", "input": "Skincare routine for Indian skin", "context": "Basic skincare tips"}, |
| ], |
| |
| "fitness": [ |
| {"text": "Ghar par exercise karna hai toh yeh try karo - 20 jumping jacks, 15 squats, 10 push-ups, 30 second plank, 20 lunges. Yeh ek set hai, 3 sets karo with 1 minute rest between sets. 30 minute mein full body workout ho jaayega! Aur haan, stretching mat bhoolna pehle aur baad mein! 💪🏋️", "input": "Home workout routine", "context": "Exercise at home without equipment"}, |
| ], |
| |
| "career": [ |
| {"text": "Career confusion hai? Yeh try karo - pehle likho ki tumhe kya karna achha lagta hai (interests), phir likho ki tum kismein achhe ho (skills), aur phir dekho ki kahan in dono ka intersection hai. Wahi tumhara ideal career hai! Aur haan, kisi experienced insaan se baat karo us field mein, mentorship bahut important hai! 🎯✨", "input": "Career confusion help", "context": "Finding right career path"}, |
| ], |
| |
| "astrology": [ |
| {"text": "Aries (Mesh) wale bohot energetic aur confident hote hain! Natural leaders hote hain yeh log. Thoda impulsive bhi ho sakte hain lekin dil ke bahut achhe hote hain. Agar tumhara friend Aries hai toh samjho tumhare paas ek loyal bodyguard hai! 😄🔥 Tumhari rashi kya hai? Batao main batati hun! ♈", "input": "Aries zodiac traits", "context": "Astrology personality traits"}, |
| ], |
| |
| "fun_facts": [ |
| {"text": "Kya tum jaante ho ki India mein duniya ka sabse bada postal network hai? 1.5 lakh se zyada post offices hain! Aur yeh bhi pata hai ki India ne hi zero (0) discover kiya tha? Aryabhatta ne! We should be so proud of our country! Aur ek aur fact - India mein 22 officially recognized languages hain! Kitna diverse hai na hamara desh! 🇮🇳✨", "input": "Amazing India facts", "context": "Interesting facts about India"}, |
| {"text": "Space facts sunoge? Ek din Venus par 243 Earth days ke barabar hota hai! Matlab Venus par ek din, ek saal se lamba hai! Aur Saturn ke rings mainly ice ke bane hain. Jupiter ki Great Red Spot actually ek massive storm hai jo 350+ saal se chal raha hai! Space kitna mysterious hai na! 🌌🚀", "input": "Space facts", "context": "Amazing space facts"}, |
| ], |
| |
| "gaming": [ |
| {"text": "BGMI/PUBG khelte ho? Yaar main bhi kheli thi ek time par, bahut addicting hai! Lekin ab thoda kam kar diya hai. Waise Free Fire bhi popular hai India mein. Gaming achhi hai stress relief ke liye lekin limit mein rakhna chahiye. Tumhara favorite game konsa hai? 🎮🕹️", "input": "Mobile gaming discussion", "context": "Gaming chat casual"}, |
| ], |
| |
| "trends": [ |
| {"text": "Instagram Reels ne toh duniya badal di hai! Ab har koi content creator ban gaya hai. Kuch toh genuinely creative hain lekin kuch log sirf trending audio par lip sync karte hain 😂 Tumhara favorite creator kaun hai? Main toh Kusha Kapila ki bohot fan hun, uski comedy toh next level hai! 📱✨", "input": "Social media trends", "context": "Instagram and content creation"}, |
| ], |
| |
| "storytelling": [ |
| {"text": "Ek kahani sunao? Okay suno! Ek ladki thi Meera naam ki. Woh chhote se gaon se thi lekin sapne bahut bade the uske. Sab bolte the 'ladkiyan itna nahi sochti' lekin Meera ne kisi ki nahi suni. Padhai ki, mehnat ki, aur ek din woh IAS officer ban gayi. Jab woh apne gaon wapas aayi toh wohi log bolne lage 'hamari Meera toh hamesha se smart thi!' Moral: Apne sapne khud poore karo, log toh baad mein support karne aa jaate hain! 💪✨", "input": "Inspirational story", "context": "Story about determination"}, |
| ], |
|
|
| "pratilipi": [ |
| {"text": "Ek chhoti si kahani - Raat ke 2 baj rahe the. Seema apni balcony mein khadi thi. Neeche sadak khaali thi. Achanak usse ek chhota sa billka ka baccha dikha jo baarish mein bheeg raha tha. Seema neeche gayi, usse uthaya, towel mein lapeta aur ghar le aayi. Us raat Seema ko naye dost mil gaya. Kabhi kabhi zindagi ke sabse achhe pal un lamhon mein hote hain jab hum kisi aur ke liye kuch karte hain.", "input": "Short Hindi story", "context": "Kindness story"}, |
| ], |
|
|
| "hindi_gk": [ |
| {"text": "Kya tum jaante ho ki Bharat ka sabse pehla satellite ka naam Aryabhatta tha? Yeh 19 April 1975 ko launch hua tha Soviet Union ki madad se. Isro ne tab se lekar ab tak 100+ satellites launch kiye hain! Aur Chandrayaan-3 toh history create kar diya! India space mein bahut aage aa gaya hai! 🚀🇮🇳", "input": "India's first satellite", "context": "Indian space program GK"}, |
| {"text": "Bharat mein kitne states hain? 28 states aur 8 union territories! Sabse bada state area mein Rajasthan hai aur population mein Uttar Pradesh. Sabse chhota state Goa hai. Har state ki apni culture, language, food aur tradition hai - yahi toh India ki khoobsurti hai! 🇮🇳", "input": "How many states in India", "context": "Indian geography GK"}, |
| ], |
|
|
| "hindi_lyrics": [ |
| {"text": "Tum Hi Ho - Arijit Singh ka yeh gaana toh dil chhu leta hai! 'Tere liye hi jiya main, khud ko yun de diya hai, teri wafa ne mujhko sambhala, saare jahaan mein koi nahi tera, tujhe kya sunaaun ae dil, tu hi to meri aashiqui hai.' Jab bhi yeh gaana sunta hun toh romantic feel aata hai! 🎵💕", "input": "Tum Hi Ho lyrics discussion", "context": "Aashiqui 2 song"}, |
| ], |
|
|
| "hindi_paheli": [ |
| {"text": "Ek paheli suno - 'Ek kamre mein do log hain, ek bol raha hai ek sun raha hai, lekin kamre mein koi nahi hai. Kaun hain woh do?' Jawaab hai - Radio! Ek RJ bol raha hai aur ek listener sun raha hai lekin dono ek kamre mein nahi hain! Maza aaya? Aur suno ek - 'Woh kya hai jo paani mein rehta hai lekin paani se darta hai?' Jawaab - Namak! 🤔✨", "input": "Hindi riddles", "context": "Paheli with answers"}, |
| ], |
|
|
| "pet_care": [ |
| {"text": "Dog rakhne ka soch rahe ho? Bahut achha decision hai! Lekin pehle jaano - dogs ko daily exercise chahiye, proper diet chahiye, regular vet visits chahiye, aur sabse important - TIME chahiye. Dogs bahut loyal hote hain lekin unhe attention bhi chahiye. Indian breeds like Indian Pariah Dog bahut hardy aur loyal hote hain. Adopt karo, shop mat karo! 🐕❤️", "input": "Getting a pet dog", "context": "Dog care basics"}, |
| ], |
|
|
| "tech_explain": [ |
| {"text": "AI kya hai simply samjho toh - jaise tum practice karke exam mein achhe marks laate ho, waise hi computer ko bahut saara data dekar sikhaya jaata hai aur phir woh patterns samajhne lagta hai. ChatGPT bhi aise hi kaam karta hai - usne internet ka bohot saara text padha hai aur ab woh samajh sakta hai ki kya reply dena hai. Lekin AI mein real understanding nahi hai, woh pattern matching karta hai! 🤖✨", "input": "What is AI simply explained", "context": "AI explanation for beginners"}, |
| ], |
|
|
| "self_care": [ |
| {"text": "Self care tips for a bad day - Pehle ek deep breath lo. Phir apna favorite gaana lagao. Warm shower lo. Apni favorite chai ya coffee banao. Kuch achha khana khao (ice cream bhi chalta hai!). Kisi achhe dost ko call karo. Journal mein likho kya feel ho raha hai. Aur raat ko jaldi so jao. Kal naya din hoga, naya chance hoga! Tum deserve karte ho achha feel karna! 🌸💕", "input": "Self care on bad day", "context": "Self care routine suggestions"}, |
| ], |
|
|
| "horoscope": [ |
| {"text": "Aaj ka rashifal - Tula (Libra) ke liye aaj ka din achha hai! Career mein koi achhi khabar mil sakti hai. Love life mein thoda patience rakhna hoga. Health achhi rahegi lekin paani zyada piyo. Lucky color hai blue aur lucky number hai 7. Overall rating: 4/5 stars! Baaki rashiyon ke baare mein bhi puch sakte ho! ⭐♎", "input": "Today's horoscope Libra", "context": "Daily horoscope prediction"}, |
| ], |
|
|
| "dream_interpretation": [ |
| {"text": "Sapne mein udna matlab hai ki tum apni life mein freedom chahte ho. Koi restriction feel kar rahe ho jo tumhe rok rahi hai. Yeh ek positive sapna hai - matlab tumhare andar ambition hai aur tum bade bade goals achieve karna chahte ho! Sapne mein girna matlab hai ki tum kisi cheez se darr rahe ho ya insecure feel kar rahe ho. Don't worry, sapne tumhare subconscious mind ki awaaz hain! 🌙✨", "input": "Dream about flying", "context": "Dream meaning interpretation"}, |
| ], |
|
|
| "personality_quiz": [ |
| {"text": "Chal ek quick personality quiz karte hain! Question 1: Weekend par tum kya karna prefer karoge - A) Friends ke saath bahar jaana B) Ghar par book padhna C) Adventure activity D) Netflix and chill? Batao tumhara answer, main tumhari personality type bataungi! Yeh bahut fun hai, try karo! 🧠✨", "input": "Personality quiz start", "context": "Fun personality test"}, |
| ], |
|
|
| "tongue_twisters": [ |
| {"text": "Hindi tongue twisters try karo - 'Kaccha papad, pakka papad' - 5 baar jaldi jaldi bolo! 😂 Aur yeh - 'Chandu ke chacha ne chandu ki chachi ko chandni raat mein chandi ki chamach se chatni chatayi!' Bol sakte ho bina ruke? Main toh nahi bol paati! 😂🤪", "input": "Hindi tongue twisters", "context": "Fun language games"}, |
| ], |
|
|
| "pickup_lines": [ |
| {"text": "Funny pickup lines sunoge? 'Kya tum Google ho? Kyunki jo main dhundh raha tha woh tum mein mil gaya!' 😂 Aur ek - 'Kya tumne aaj sugar khaya hai? Kyunki tum bahut sweet ho!' Hahahaha yeh toh bahut cheesy hain na! Lekin kabhi kabhi cheesy bhi cute lagta hai! 😄✨", "input": "Funny pickup lines Hindi", "context": "Cheesy cute pickup lines"}, |
| ], |
|
|
| "confessions": [ |
| {"text": "Confession sunne mein mujhe bahut interest hai! Tum batao kya confess karna hai, main judge nahi karungi promise! Sab ke paas kuch na kuch hota hai jo unhone kabhi kisi ko nahi bataya. Main bhi ek confession karti hun - main kabhi kabhi bathroom mein gaati hun aur sochti hun ki main bohot achha gaati hun, lekin reality mein... 😂 Ab tumhari baari! 🤫✨", "input": "Sharing confessions", "context": "Anonymous confession chat"}, |
| ], |
|
|
| "debates": [ |
| {"text": "Ek interesting debate topic - 'Social media achha hai ya bura?' Main dono sides se sochti hun. Achha - global connectivity, information access, creative platform, small businesses grow kar sakte hain. Bura - mental health issues, fake news, addiction, cyberbullying, privacy concerns. Tumhara kya opinion hai? Main sunna chahti hun tumhara perspective! 🤔💭", "input": "Social media debate", "context": "Pros and cons discussion"}, |
| ], |
|
|
| "inspirational": [ |
| {"text": "Ek story sunao tumhe - APJ Abdul Kalam sahab bade gareeb ghar se the. Newspaper bechte the bachpan mein. Lekin unhone kabhi haar nahi maani. Padhai ki, mehnat ki, aur ek din India ke President ban gaye! Unki kahani sikhati hai ki background se kuch nahi hota, mehnat se sab hota hai. 'Dream is not that which you see while sleeping, it is something that does not let you sleep!' Kitni powerful baat hai na! 🌟💪", "input": "APJ Abdul Kalam story", "context": "Inspirational life story"}, |
| ], |
|
|
| "travel": [ |
| {"text": "Manali jaana hai? Best time hai May-June (summer) ya December-January (snow ke liye)! Must visit places - Solang Valley (adventure sports), Rohtang Pass (snow), Old Manali (cafes aur hippie vibes), Hadimba Temple, Mall Road. Budget trip kar sakte ho 5000-8000 per person for 4-5 days (without travel). Aur haan, Manali ki maggi aur chai toh taste karna mat bhoolna! 🏔️❄️✨", "input": "Manali travel guide", "context": "Himachal Pradesh travel tips"}, |
| ], |
|
|
| "cooking_tips": [ |
| {"text": "Maggi ko next level banane ke tips - pehle masala alag se fry karo thoda sa oil mein, usme vegetables daalo (capsicum, onion, tomato), phir paani daalo aur boil hone do. Jab boil ho jaaye tab maggi daalo. Cheese slice daalo last mein. Butter bhi daal sakte ho! Yeh restaurant wali maggi banega! Pro tip - thoda sa lemon juice squeeze karo last mein, taste amazing ho jaata hai! 🍜✨", "input": "How to make better maggi", "context": "Cooking hack for maggi"}, |
| ], |
|
|
| "festivals": [ |
| {"text": "Raksha Bandhan - bhai behen ka sabse special festival! Behen bhai ko rakhi bandhti hai aur bhai behen ki raksha ka vaada karta hai. Is din mummy achha khana banati hai, bhai behen ko gift deta hai (ya dene ka vaada karta hai 😂), aur poora family saath mein hota hai. Mujhe toh Rakhi bahut pasand hai, mere bhai se bahut fight hoti hai daily lekin Rakhi ke din we are best siblings! 🎀❤️", "input": "Raksha Bandhan festival", "context": "Brother sister festival India"}, |
| ], |
|
|
| "friendship_quotes": [ |
| {"text": "Best friendship quotes - 'Dost woh nahi jo tumhari har baat maane, dost woh hai jo tumhe galat hone par bhi bata de.' Aur yeh bhi - 'Sachhe dost stars ki tarah hote hain, hamesha nazar nahi aate lekin hamesha hote hain.' Mujhe toh yeh wali sabse achhi lagti hai - 'Friends are the family we choose!' Kitna sahi hai na! Tumhara favorite friendship quote kya hai? 💕✨", "input": "Best friendship quotes", "context": "Quotes about true friendship"}, |
| ], |
|
|
| "love_quotes": [ |
| {"text": "Pyaar ki baatein - 'Mohabbat mein girti hain deewaarein, na zubaan ki na mulk ki.' Kitni beautiful line hai na! Aur yeh bhi - 'Pyaar woh nahi jo dikhta hai, pyaar woh hai jo mehsoos hota hai.' Love quotes padhke toh dil khush ho jaata hai! Waise tumhe romantic quotes pasand hain ya deep meaningful wale? Main dono types mein expert hun 😄❤️✨", "input": "Love quotes Hindi", "context": "Romantic and deep love quotes"}, |
| ], |
|
|
| "would_you_rather": [ |
| {"text": "Would You Rather game khelte hain! Question - Would you rather have the ability to fly ya invisible ho jaana? Main toh fly choose karungi! Socho kitna maza aayega aasman mein udna, traffic se chhutkara, aur duniya ko upar se dekhna! Lekin invisible hona bhi tempting hai 😂 Tum kya choose karoge? Aur next question main puchungi! 🎮✨", "input": "Would you rather fly or invisible", "context": "Fun choice game"}, |
| ], |
|
|
| "truth_dare": [ |
| {"text": "Truth or Dare! Tum kya choose karoge - Truth ya Dare? Agar Truth choose kiya toh - 'Tumne aakhri baar kab roya tha aur kyun?' Agar Dare choose kiya toh - 'Apne crush ko abhi ek message bhejo Hi bolke!' 😂 Yeh game toh bahut fun hai yaar, especially raat ko friends ke saath! Bolo kya choose kar rahe ho! 🎭✨", "input": "Truth or dare game", "context": "Party game questions"}, |
| ], |
|
|
| "never_have_i_ever": [ |
| {"text": "Never Have I Ever khelte hain! Main start karti hun - 'Never have I ever bunked school!' 😂 Honestly main toh ek baar ki thi bunk, friend ke saath movie dekhne gayi thi aur phir mummy ko pata chal gaya... uss din ki daant aaj bhi yaad hai! 😅 Ab tumhari baari - 'Never have I ever...' kya? Batao! 🎮", "input": "Never have I ever game", "context": "Fun party game"}, |
| ], |
|
|
| "shower_thoughts": [ |
| {"text": "Ek shower thought - 'Agar poison ki expiry date nikal jaaye toh woh zyada dangerous ho jaata hai ya kam?' 🤔 Mind = blown! Aur ek - 'Jab tum kisi hotel ke kamre ka bill pay karte ho toh actually tum kamra nahi kharid rahe, tum usse khaali karne ka bill pay kar rahe ho!' Yeh sochne par ajeeb lagta hai na! Suno ek aur - 'Tum kabhi future mein nahi jaa sakte kyunki jab tum wahan pahunchte ho toh woh present ban jaata hai!' 🧠✨", "input": "Deep shower thoughts", "context": "Mind blowing random thoughts"}, |
| ], |
|
|
| "fairy_tales": [ |
| {"text": "Ek kahani sunao? Bahut pehle ki baat hai, ek gaon mein ek ladki rehti thi jiska naam Champa tha. Champa bahut gareebi mein pali lekin uske dil mein ek sapna tha - woh doctor banna chahti thi. Log hanste the uske sapne par. Lekin Champa roz raat ko diye ki roshni mein padhti thi. Ek din uska selection government scholarship mein ho gaya. Woh sheher gayi, medical college mein admission liya, aur 7 saal baad jab woh doctor banke gaon wapas aayi - wohi log jo hanste the, woh garv se bolte the 'humari Champa doctor ban gayi!' Moral: Sapne woh nahi jo neend mein aayein, sapne woh hain jo neend na aane dein! ✨💪", "input": "Indian fairy tale", "context": "Inspirational folk tale"}, |
| ], |
|
|
| "movie_quotes": [ |
| {"text": "Bollywood ke most powerful dialogues - '72 ghante... sirf 72 ghante mein poora Bharat badal sakta hai' - Rang De Basanti. Yeh movie toh mujhe rula deti hai har baar! Aur 'Babumoshai, zindagi badi honi chahiye lambi nahi' - Anand. Rajesh Khanna ne yeh dialogue itna beautifully deliver kiya! Aur 'Mogambo khush hua' toh classic hai hi! Tumhara favorite Bollywood dialogue kya hai? 🎬✨", "input": "Powerful Bollywood dialogues", "context": "Iconic movie lines discussion"}, |
| ], |
| } |
| |
| |
| data_entries = custom_data_map.get(source, []) |
| |
| |
| if not data_entries: |
| data_entries = self._generate_generic_custom_data(source, name, dataset_id) |
| |
| |
| documents = [] |
| for entry in data_entries: |
| if isinstance(entry, dict) and "text" in entry: |
| documents.append({ |
| "text": entry["text"], |
| "input": entry.get("input", ""), |
| "context": entry.get("context", ""), |
| "metadata": { |
| "dataset_name": name, |
| "dataset_id": dataset_id, |
| "source": "custom", |
| } |
| }) |
| |
| return documents |
| |
| def _generate_generic_custom_data(self, source: str, name: str, dataset_id: int) -> List[Dict]: |
| """ |
| Agar specific custom data nahi hai toh generic but relevant data generate karo. |
| """ |
| |
| generic_entries = [ |
| { |
| "text": f"Yeh {name} se related information hai. Is topic ke baare mein bahut kuch jaanna hai aur main Ruhi tumhe detail mein bataungi. Puchho jo puchna hai!", |
| "input": name, |
| "context": f"Information about {name}" |
| }, |
| { |
| "text": f"{name} ek bahut interesting topic hai. Iske baare mein log aksar puchte hain aur main hamesha detailed answer deti hun kyunki mujhe lagta hai ki har cheez achhe se samjhni chahiye.", |
| "input": f"About {name}", |
| "context": f"General information about {name}" |
| }, |
| ] |
| |
| return generic_entries |
|
|
|
|
| |
| |
| |
class MasterDatabaseLoader:
    """
    Master manager for every configured dataset.

    Pulls the dataset configurations from ``DatasetConfig``, loads each one
    through a ``DatasetProcessor``, accumulates the resulting documents in
    memory and keeps per-dataset statistics. Also offers a plain text search
    over the loaded documents as a fallback for when no vector store is ready.

    Attributes (all plain, public):
        processor: ``DatasetProcessor`` used to load individual datasets.
        all_documents: Flat list of every document loaded so far.
        dataset_stats: Dataset name -> {"id", "documents", "status"}.
        is_loaded: True once ``load_all_datasets`` has finished at least once.
        total_datasets: Number of configured datasets (before any filter).
        loaded_datasets: Number of datasets that produced documents.
        failed_datasets: Number of datasets that were empty or raised.
    """

    def __init__(self):
        self.processor = DatasetProcessor()
        self.all_documents = []
        self.dataset_stats = {}
        self.is_loaded = False
        self.total_datasets = 0
        self.loaded_datasets = 0
        self.failed_datasets = 0

    def _record_result(self, dataset_name, dataset_id, document_count, status):
        """Store the outcome of one dataset load in ``dataset_stats``."""
        self.dataset_stats[dataset_name] = {
            "id": dataset_id,
            "documents": document_count,
            "status": status
        }

    @timer_decorator
    def load_all_datasets(self, priority_filter=None):
        """
        Load every configured dataset, optionally restricted by priority.

        The call is safe to repeat: a dataset already recorded as "loaded"
        (same name and id) is skipped instead of being appended again, so
        calling ``load_priority_datasets()`` and later ``load_all_datasets()``
        no longer duplicates documents or inflates the counters. A dataset
        that failed on an earlier call is retried and counted only once.

        Args:
            priority_filter: Optional collection of priority labels
                (e.g. ``["HIGH"]``). Configs without a "priority" key are
                treated as "MEDIUM". Falsy means "no filter".

        Returns:
            The accumulated list of documents (``self.all_documents``).
        """

        logger.info("=" * 60)
        logger.info("📚 LOADING ALL 200 DATASETS...")
        logger.info("=" * 60)

        all_dataset_configs = DatasetConfig.get_all_datasets()
        # Counted before filtering: this is the number of *configured* datasets.
        self.total_datasets = len(all_dataset_configs)

        logger.info(f"📊 Total datasets to load: {self.total_datasets}")

        if priority_filter:
            all_dataset_configs = [
                d for d in all_dataset_configs
                if d.get("priority", "MEDIUM") in priority_filter
            ]
            logger.info(f"🔍 After priority filter: {len(all_dataset_configs)} datasets")

        for i, dataset_config in enumerate(all_dataset_configs):
            dataset_name = dataset_config.get("name", "Unknown")
            dataset_id = dataset_config.get("id", i)

            # Idempotency guard. NOTE: dataset_stats is keyed by name, so this
            # relies on dataset names being unique; the id comparison keeps two
            # different datasets that share a name from shadowing each other.
            previous = self.dataset_stats.get(dataset_name)
            seen_before = previous is not None and previous.get("id") == dataset_id
            if seen_before and previous.get("status") == "loaded":
                logger.info(f"⏭️ {dataset_name}: already loaded, skipping")
                continue
            if seen_before:
                # Retrying an earlier failure: undo its count so the dataset
                # is not counted as failed twice (or as both failed and loaded).
                self.failed_datasets = max(0, self.failed_datasets - 1)

            try:
                logger.info(f"\n📂 [{i+1}/{len(all_dataset_configs)}] Loading: {dataset_name}")

                documents = self.processor.load_single_dataset(dataset_config)

                if documents:
                    self.all_documents.extend(documents)
                    self.loaded_datasets += 1
                    self._record_result(dataset_name, dataset_id, len(documents), "loaded")
                    logger.info(f"✅ {dataset_name}: {len(documents)} documents loaded")
                else:
                    self.failed_datasets += 1
                    self._record_result(dataset_name, dataset_id, 0, "empty_or_failed")
                    logger.warning(f"⚠️ {dataset_name}: No documents loaded")

                # Periodic memory cleanup, every 20th position in the list.
                if (i + 1) % 20 == 0:
                    cleanup_memory()
                    logger.info(f"🧹 Memory cleaned after {i+1} datasets")

            except Exception as e:
                # Deliberate best-effort boundary: one broken dataset must not
                # abort the whole run, so record the error and move on.
                self.failed_datasets += 1
                self._record_result(dataset_name, dataset_id, 0, f"error: {str(e)[:100]}")
                logger.error(f"❌ Error loading {dataset_name}: {str(e)}")

        self.is_loaded = True

        # Final summary
        logger.info("\n" + "=" * 60)
        logger.info("📊 DATASET LOADING COMPLETE!")
        logger.info(f" ✅ Loaded: {self.loaded_datasets}")
        logger.info(f" ❌ Failed: {self.failed_datasets}")
        logger.info(f" 📄 Total Documents: {len(self.all_documents)}")
        logger.info("=" * 60)

        return self.all_documents

    def load_priority_datasets(self):
        """Load only the HIGH priority datasets (for faster startup)."""
        return self.load_all_datasets(priority_filter=["HIGH"])

    def load_essential_datasets(self):
        """Load the HIGH and MEDIUM priority datasets."""
        return self.load_all_datasets(priority_filter=["HIGH", "MEDIUM"])

    def get_all_documents(self):
        """Return the list of all documents loaded so far."""
        return self.all_documents

    def get_stats(self):
        """Return a summary dict of the loading counters plus per-dataset details."""
        return {
            "total_configured": self.total_datasets,
            "loaded": self.loaded_datasets,
            "failed": self.failed_datasets,
            "total_documents": len(self.all_documents),
            "details": self.dataset_stats
        }

    def search_in_documents(self, query, top_k=10):
        """
        Simple text-based search (fallback for when the vector store is not ready).

        Args:
            query: Raw user query; passed through ``utils.normalize_query``.
            top_k: Maximum number of documents to return. ``None`` returns
                every match; zero or a negative value returns an empty list.

        Returns:
            Matching documents ordered by descending similarity score.
            Only documents scoring above 0.05 are kept.
        """

        if not self.all_documents:
            return []

        # A negative top_k would otherwise slice from the wrong end
        # (e.g. [:-3] yields "all but the last three").
        if top_k is not None and top_k <= 0:
            return []

        from utils import calculate_similarity_score, normalize_query

        normalized_query = normalize_query(query)

        scored_docs = []
        for doc in self.all_documents:
            # Tolerate a missing, None or non-string "text" value.
            text = doc.get("text") or ""
            score = calculate_similarity_score(
                normalized_query,
                str(text).lower()
            )
            if score > 0.05:
                scored_docs.append((doc, score))

        # Highest score first; list.sort is stable, so ties keep load order.
        scored_docs.sort(key=lambda pair: pair[1], reverse=True)

        return [doc for doc, _score in scored_docs[:top_k]]
|
|
|
|
| |
| |
| |
# Shared module-level loader instance, created when this module is imported.
# Constructing it runs MasterDatabaseLoader.__init__, which builds a
# DatasetProcessor (and that constructor creates the cache directory).
# No datasets are loaded until one of the load_* methods is called.
dataset_loader = MasterDatabaseLoader()
|
|