| |
|
|
| |
| |
| |
|
|
import asyncio
import json
import subprocess
import sys
import traceback
import warnings
from datetime import datetime
from pathlib import Path

import gradio as gr
import pandas as pd
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async
|
|
| |
def install_playwright_browsers():
    """Download the Chromium build that Playwright drives.

    The installer is run through the current interpreter
    (``sys.executable -m playwright``) so it resolves to the environment this
    script is running in, even when the ``playwright`` console script is not
    on ``PATH``.

    Raises:
        subprocess.CalledProcessError: The installer exited with a non-zero
            status.
        OSError: The installer process could not be started.
    """
    print("Vérification et installation des navigateurs Playwright...")
    command = [sys.executable, "-m", "playwright", "install", "chromium"]
    try:
        subprocess.run(command, capture_output=True, text=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Erreur lors de l'installation de Chromium : {e}")
        # The child's output is captured, so it has to be printed explicitly;
        # otherwise the actual reason for the failure is lost.
        details = (e.stderr or e.stdout or "").strip()
        if details:
            print(details)
        raise
    except Exception as e:
        print(f"Erreur lors de l'installation de Chromium : {e}")
        raise
    else:
        print("Installation de Chromium terminée avec succès.")
|
|
# Make sure a Chromium build is available before any scraping is attempted.
install_playwright_browsers()

# Silence every Python warning for the lifetime of the process.
warnings.filterwarnings(action='ignore')

# Directory that receives the exported Excel/JSON files; created at import
# time when it does not exist yet.
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(parents=False, exist_ok=True)
|
|
class WAHISScraper:
    """Fetch the most recently updated WAHIS (WOAH) event reports.

    A headless Chromium page first loads the public WAHIS site, then the
    JSON API is queried with ``fetch`` from inside that page so the request
    reuses the browser session. Results are exported to Excel and JSON.

    Attributes are documented inline in ``__init__``.
    """

    _MAIN_PAGE_URL = "https://wahis.woah.org/#/event-management"
    _API_URL = "https://wahis.woah.org/api/v1/pi/event/filtered-list?language=fr"
    _API_HEADERS = {
        'Content-Type': 'application/json', 'Accept': 'application/json',
        'clientid': 'OIEwebsite', 'env': 'PRD',
        'security-token': 'token', 'type': 'REQUEST',
    }
    # Keys probed, in order, for the list of reports inside a dict response.
    _REPORT_LIST_KEYS = ('list', 'data', 'results', 'items', 'events')
    # Runs inside the page. The raw body and HTTP status are returned instead
    # of calling r.json() in the browser, so that an error page (HTML, 4xx/5xx)
    # can be reported clearly on the Python side.
    _FETCH_SCRIPT = (
        "async (args) => { "
        "const r = await fetch(args.url, { method: 'POST', headers: args.headers, "
        "body: JSON.stringify(args.payload) }); "
        "return { status: r.status, ok: r.ok, text: await r.text() }; }"
    )

    def __init__(self, output_dir=None):
        """Create a scraper.

        Args:
            output_dir: Directory for exported files. Defaults to the
                module-level ``OUTPUT_DIR``.
        """
        # Timestamped log lines accumulated during a run.
        self.logs = []
        # Destination directory for the Excel/JSON exports.
        self.output_dir = OUTPUT_DIR if output_dir is None else Path(output_dir)
        self.log("✅ Scraper Définitif initialisé.")

    def log(self, message):
        """Record ``message`` with an ``[HH:MM:SS]`` prefix and echo it to stdout."""
        timestamp = datetime.now().strftime("%H:%M:%S")
        self.logs.append(f"[{timestamp}] {message}")
        print(message)

    @staticmethod
    def _build_payload(page_size):
        """Return the request body asking for the ``page_size`` latest reports."""
        return {
            "pageNumber": 1, "pageSize": page_size,
            "sortColName": "REP_LAST_UPDATE", "sortColOrder": "DESC",
            "reportFilters": {
                "countries": [], "diseases": [], "reasons": [],
                "startDate": None, "endDate": None, "reportType": [],
                "serotypes": [], "species": [], "reportStatus": [],
            },
            "languageChanged": False,
        }

    def _extract_report_list(self, api_response_json):
        """Locate the list of reports inside a decoded API response.

        Args:
            api_response_json: Decoded JSON value returned by the API.

        Returns:
            The response itself when it is a list, otherwise the first list
            found under one of ``_REPORT_LIST_KEYS``; ``[]`` when nothing
            matches.
        """
        self.log("🔍 Analyse de la structure de la réponse JSON...")
        if isinstance(api_response_json, list):
            self.log(" -> La réponse est directement une liste.")
            return api_response_json
        if isinstance(api_response_json, dict):
            self.log(f" -> La réponse est un dictionnaire. Clés: {list(api_response_json.keys())}")
            for key in self._REPORT_LIST_KEYS:
                candidate = api_response_json.get(key)
                if isinstance(candidate, list):
                    self.log(f" -> Liste trouvée dans la clé '{key}'.")
                    return candidate
        return []

    async def run_extraction_async(self, page_size=50):
        """Run one full extraction and export the results.

        Args:
            page_size: Number of reports requested from the API.

        Returns:
            A 4-tuple ``(logs, dataframe, excel_path, json_path)``. On failure
            or when no report is found, the dataframe is empty and both paths
            are ``None``. Errors are logged, never raised.
        """
        self.log("🚀 Lancement de l'extraction finale...")

        async with async_playwright() as p:
            browser = None
            try:
                self.log("🔧 Lancement d'un navigateur Chromium...")
                browser = await p.chromium.launch(headless=True)
                page = await browser.new_page()

                self.log("🕵️ Application du camouflage 'stealth'...")
                await stealth_async(page)

                self.log("🌍 Visite de la page principale pour passer le challenge Cloudflare...")
                await page.goto(self._MAIN_PAGE_URL, wait_until="networkidle", timeout=90000)
                self.log("🍪 Challenge Cloudflare réussi, cookies obtenus.")

                self.log("🎯 Exécution de la requête POST vers la bonne API...")
                response = await page.evaluate(
                    self._FETCH_SCRIPT,
                    {
                        'url': self._API_URL,
                        'headers': self._API_HEADERS,
                        'payload': self._build_payload(page_size),
                    },
                )
                body_text = response.get("text") or ""
                if not response.get("ok"):
                    # Fail loudly with the status and the start of the body
                    # rather than trying to interpret an error page as data.
                    raise RuntimeError(
                        f"L'API a répondu avec le statut HTTP {response.get('status')} : "
                        f"{body_text[:200]}"
                    )
                api_response_json = json.loads(body_text)
                self.log("📊 Réponse JSON reçue avec succès !")

                report_list = self._extract_report_list(api_response_json)
                if report_list:
                    self.log(f" 🎉 SUCCÈS ! {len(report_list)} rapports trouvés !")
                    df = pd.DataFrame(report_list)
                    excel_path, json_path = self.save_results(report_list)
                    self.log("✅ Extraction terminée avec succès !")
                    return "\n".join(self.logs), df, excel_path, json_path
                self.log("❌ La structure JSON ne contient pas de liste de rapports identifiable.")
            except Exception:
                # Top-level boundary for the UI: report the failure in the log
                # shown to the user instead of propagating it.
                self.log("❌ Une erreur critique est survenue.")
                self.log(traceback.format_exc())
            finally:
                if browser and browser.is_connected():
                    await browser.close()

        self.log("❌ L'extraction s'est terminée sans trouver de données.")
        return "\n".join(self.logs), pd.DataFrame(), None, None

    def save_results(self, data):
        """Write ``data`` to timestamped ``.xlsx`` and ``.json`` files.

        Args:
            data: List of report records as returned by the API.

        Returns:
            ``(excel_path, json_path)`` as strings.
        """
        df = pd.DataFrame(data)
        self.log(f"💾 Sauvegarde de {len(df)} lignes de données...")
        # The directory may have been removed since start-up; recreate it.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        excel_file = self.output_dir / f"wahis_reports_{timestamp}.xlsx"
        json_file = self.output_dir / f"wahis_reports_{timestamp}.json"
        df.to_excel(excel_file, index=False)
        with open(json_file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4, ensure_ascii=False)
        self.log(f" - Fichiers créés : {excel_file.name}, {json_file.name}")
        return str(excel_file), str(json_file)
|
|
def run_scraping_task():
    """Click handler: run one extraction and map its result to the UI outputs.

    Returns the execution log, the extracted dataframe, and two ``gr.File``
    components (Excel, JSON) that are only visible when a file was produced.
    """
    extraction = WAHISScraper().run_extraction_async()
    logs, df, excel_file, json_file = asyncio.run(extraction)
    excel_component = gr.File(value=excel_file, visible=bool(excel_file))
    json_component = gr.File(value=json_file, visible=bool(json_file))
    return logs, df, excel_component, json_component
|
|
| |
# Gradio interface: one button triggers the extraction; the log, the data
# table and the two downloadable files are filled in from its result.
with gr.Blocks(theme=gr.themes.Soft(), title="WAHIS Scraper") as demo:
    gr.Markdown("# 🤖 Scraper pour WAHIS (WOAH) - Version Définitive")
    # gr.Info() is an alert meant to be raised from inside an event handler.
    # Called while the layout is being built it is not tied to any user
    # event, so the notice never appeared in the page. Render it as static
    # text instead.
    gr.Markdown("ℹ️ Cette version reproduit une requête navigateur valide pour une fiabilité maximale.")
    with gr.Row():
        run_button = gr.Button("🚀 Lancer l'extraction des données", variant="primary")
    gr.Markdown("---")
    status_textbox = gr.Textbox(lines=15, label="📜 Journal d'exécution", interactive=False)
    results_dataframe = gr.DataFrame(label="📊 Données extraites")
    with gr.Row():
        # Hidden until a run produces the corresponding file.
        excel_output = gr.File(label="💾 Fichier Excel (.xlsx)", visible=False)
        json_output = gr.File(label="💾 Fichier JSON (.json)", visible=False)
    run_button.click(
        fn=run_scraping_task,
        inputs=[],
        outputs=[status_textbox, results_dataframe, excel_output, json_output],
    )


if __name__ == "__main__":
    demo.launch()