Spaces:

MMOON
/

WAHIS

Sleeping

App Files Files Community

WAHIS / appV0.py

MMOON

Rename app.py to appV0.py

8362e18 verified 9 months ago

raw

history blame contribute delete

7.32 kB

	# Fichier: app.py

	# ===================================================================================
	# WAHIS SCRAPER - VERSION FINALE AVEC LA CLÉ DE DONNÉES CORRECTE
	# ===================================================================================

	import gradio as gr
	import pandas as pd
	import json
	from datetime import datetime
	from pathlib import Path
	import warnings
	import traceback
	import asyncio
	import subprocess
	from playwright.async_api import async_playwright
	from playwright_stealth import stealth_async

	# Installation du navigateur (ne change pas)
	def install_playwright_browsers():
	    """Install the Chromium build used by Playwright.

	    Runs ``playwright install chromium`` as a subprocess with its output
	    captured, printing progress messages before and after.

	    Raises:
	        Exception: whatever ``subprocess.run`` raised (for instance
	            ``subprocess.CalledProcessError`` on a non-zero exit status,
	            because ``check=True`` is used). The error is printed first and
	            then re-raised unchanged.
	    """
	    print("Vérification et installation des navigateurs Playwright...")
	    try:
	        subprocess.run(["playwright", "install", "chromium"], capture_output=True, text=True, check=True)
	    except Exception as e:
	        print(f"Erreur lors de l'installation de Chromium : {e}")
	        # The installer's output is captured, so without this the actual
	        # failure reason never reaches the console: the exception text of a
	        # CalledProcessError only mentions the exit status.
	        captured_stderr = getattr(e, "stderr", None)
	        if captured_stderr:
	            print(captured_stderr)
	        raise
	    else:
	        print("Installation de Chromium terminée avec succès.")

	# Runs at import time: the Chromium install is attempted before anything else
	# in the module is set up, and any failure propagates (the function re-raises).
	install_playwright_browsers()

	# Silence every warning category for the whole process.
	warnings.filterwarnings('ignore')
	# Directory (relative to the current working directory) that receives the
	# exported Excel/JSON files; created on import if it does not exist yet.
	OUTPUT_DIR = Path("outputs")
	OUTPUT_DIR.mkdir(exist_ok=True)

	class WAHISScraper:
	    """Fetch the latest WAHIS event reports through a headless Chromium page.

	    The scraper opens the WAHIS web application in Playwright, then issues
	    the API request from inside that page via ``fetch``. Every step is
	    recorded in ``self.logs`` (and echoed to stdout) so the caller can show
	    a full execution journal.
	    """

	    # Keys under which a dict response may carry the list of reports,
	    # tried in this order; the first one holding a list wins.
	    _REPORT_LIST_KEYS = ('list', 'data', 'results', 'items', 'events')

	    def __init__(self):
	        """Start with an empty log and the module-level output directory."""
	        self.logs = []
	        self.output_dir = OUTPUT_DIR
	        self.log("✅ Scraper Définitif initialisé.")

	    def log(self, message):
	        """Append ``message`` to ``self.logs`` with an HH:MM:SS prefix and print it.

	        The stored entry is timestamped; the printed line is the bare message.
	        """
	        timestamp = datetime.now().strftime("%H:%M:%S")
	        self.logs.append(f"[{timestamp}] {message}")
	        print(message)

	    def _extract_report_list(self, api_response_json):
	        """Return the list of reports contained in the API response, or ``[]``.

	        Accepts either a JSON array (returned as is) or a JSON object holding
	        the array under one of ``_REPORT_LIST_KEYS``. Any other shape yields
	        an empty list. The probing steps are written to the log.
	        """
	        self.log("🔍 Analyse de la structure de la réponse JSON...")
	        if isinstance(api_response_json, list):
	            # A bare array is already the report list; previously this shape
	            # was ignored and reported as "no data".
	            self.log(" -> La réponse est directement une liste.")
	            return api_response_json
	        if isinstance(api_response_json, dict):
	            self.log(f" -> La réponse est un dictionnaire. Clés: {list(api_response_json.keys())}")
	            for key in self._REPORT_LIST_KEYS:
	                candidate = api_response_json.get(key)
	                if isinstance(candidate, list):
	                    self.log(f" -> Liste trouvée dans la clé '{key}'.")
	                    return candidate
	        return []

	    async def run_extraction_async(self):
	        """Run one extraction and return ``(logs, dataframe, excel_path, json_path)``.

	        Steps: launch headless Chromium, apply the stealth patches, load the
	        WAHIS event-management page (waiting for network idle, 90 s timeout),
	        POST the filtered-list query from inside the page, then save the
	        reports found in the response.

	        Returns:
	            On success: the newline-joined log, a DataFrame of the reports,
	            and the paths of the Excel and JSON files as strings.
	            Otherwise (no report list found, or any exception, which is
	            logged with its traceback rather than raised): the newline-joined
	            log, an empty DataFrame, ``None`` and ``None``.
	        """
	        self.log("🚀 Lancement de l'extraction finale...")

	        async with async_playwright() as p:
	            browser = None
	            try:
	                self.log("🔧 Lancement d'un navigateur Chromium...")
	                browser = await p.chromium.launch(headless=True)
	                page = await browser.new_page()

	                self.log("🕵️ Application du camouflage 'stealth'...")
	                await stealth_async(page)

	                # Load the site first so the API call below is made from a
	                # page that already holds the site's cookies.
	                self.log("🌍 Visite de la page principale pour passer le challenge Cloudflare...")
	                await page.goto("https://wahis.woah.org/#/event-management", wait_until="networkidle", timeout=90000)
	                self.log("🍪 Challenge Cloudflare réussi, cookies obtenus.")

	                api_url = "https://wahis.woah.org/api/v1/pi/event/filtered-list?language=fr"
	                headers = {
	                    'Content-Type': 'application/json', 'Accept': 'application/json',
	                    'clientid': 'OIEwebsite', 'env': 'PRD',
	                    'security-token': 'token', 'type': 'REQUEST'
	                }
	                # First page of 50 reports, most recently updated first, no filters.
	                payload = {
	                    "pageNumber": 1, "pageSize": 50, "sortColName": "REP_LAST_UPDATE", "sortColOrder": "DESC",
	                    "reportFilters": {"countries": [], "diseases": [], "reasons": [], "startDate": None, "endDate": None, "reportType": [], "serotypes": [], "species": [], "reportStatus": []},
	                    "languageChanged": False
	                }

	                self.log("🎯 Exécution de la requête POST vers la bonne API...")
	                # The request is executed by the browser page itself (fetch),
	                # not by Python; the parsed JSON body is handed back.
	                api_response_json = await page.evaluate(
	                    "async (args) => { const r = await fetch(args.url, { method: 'POST', headers: args.headers, body: JSON.stringify(args.payload) }); return r.json(); }",
	                    {'url': api_url, 'headers': headers, 'payload': payload}
	                )
	                self.log("📊 Réponse JSON reçue avec succès !")

	                report_list = self._extract_report_list(api_response_json)

	                if report_list:
	                    self.log(f" 🎉 SUCCÈS ! {len(report_list)} rapports trouvés !")
	                    df = pd.DataFrame(report_list)
	                    excel_path, json_path = self.save_results(report_list)
	                    self.log("✅ Extraction terminée avec succès !")
	                    return "\n".join(self.logs), df, excel_path, json_path

	                self.log("❌ La structure JSON ne contient pas de liste de rapports identifiable.")

	            except Exception:
	                # Top-level boundary: keep the UI alive and report the failure
	                # through the log instead of raising.
	                self.log("❌ Une erreur critique est survenue.")
	                self.log(traceback.format_exc())
	            finally:
	                if browser and browser.is_connected():
	                    await browser.close()

	        self.log("❌ L'extraction s'est terminée sans trouver de données.")
	        return "\n".join(self.logs), pd.DataFrame(), None, None

	    def save_results(self, data):
	        """Write ``data`` to timestamped ``.xlsx`` and ``.json`` files.

	        Args:
	            data: the list of report records; it is converted to a DataFrame
	                for the Excel export and dumped unchanged for the JSON export.

	        Returns:
	            ``(excel_path, json_path)`` as strings, both inside
	            ``self.output_dir``.
	        """
	        df = pd.DataFrame(data)
	        self.log(f"💾 Sauvegarde de {len(df)} lignes de données...")
	        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	        excel_file = self.output_dir / f"wahis_reports_{timestamp}.xlsx"
	        json_file = self.output_dir / f"wahis_reports_{timestamp}.json"
	        df.to_excel(excel_file, index=False)
	        with open(json_file, 'w', encoding='utf-8') as f:
	            # ensure_ascii=False keeps accented characters readable in the file.
	            json.dump(data, f, indent=4, ensure_ascii=False)
	        self.log(f" - Fichiers créés : {excel_file.name}, {json_file.name}")
	        return str(excel_file), str(json_file)

	def run_scraping_task():
	    """Gradio click handler: run one extraction and shape its outputs.

	    Returns the execution log, the extracted DataFrame, and two ``gr.File``
	    components (Excel, JSON) that are visible only when a file path exists.
	    """
	    logs, df, excel_file, json_file = asyncio.run(WAHISScraper().run_extraction_async())
	    excel_component = gr.File(value=excel_file, visible=bool(excel_file))
	    json_component = gr.File(value=json_file, visible=bool(json_file))
	    return logs, df, excel_component, json_component

	# Interface Gradio
	with gr.Blocks(theme=gr.themes.Soft(), title="WAHIS Scraper") as demo:
	    gr.Markdown("# 🤖 Scraper pour WAHIS (WOAH) - Version Définitive")
	    # gr.Info is an alert meant to be raised from inside an event handler;
	    # called while the layout is being built it never reaches the page, so
	    # the notice is rendered as regular Markdown instead.
	    gr.Markdown("Cette version reproduit une requête navigateur valide pour une fiabilité maximale.")
	    with gr.Row():
	        run_button = gr.Button("🚀 Lancer l'extraction des données", variant="primary")
	    gr.Markdown("---")
	    status_textbox = gr.Textbox(lines=15, label="📜 Journal d'exécution", interactive=False)
	    results_dataframe = gr.DataFrame(label="📊 Données extraites")
	    with gr.Row():
	        # Hidden until a run produces files; the click handler returns
	        # replacement components carrying the visibility to apply.
	        excel_output = gr.File(label="💾 Fichier Excel (.xlsx)", visible=False)
	        json_output = gr.File(label="💾 Fichier JSON (.json)", visible=False)
	    # The handler takes no inputs; its four return values map, in order,
	    # onto the four output components listed here.
	    run_button.click(fn=run_scraping_task, inputs=[], outputs=[status_textbox, results_dataframe, excel_output, json_output])

	# Start the Gradio server only when the file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()