#!/usr/bin/env python3
"""Post-processing script to analyse and sort the scraped FFA data.

Reads the CSV files produced by the scraper under ``data_dir`` and exports
club lists, per-year/type/location course statistics, distance categories,
and a global plain-text summary.
"""
import os
import re
import sys
import logging
from collections import defaultdict
from datetime import datetime

import pandas as pd


def analyze_clubs(data_dir):
    """Extract the list of unique clubs from the results file.

    Parameters
    ----------
    data_dir : str
        Root directory of the scraped data.

    Returns
    -------
    pandas.DataFrame or None
        Clubs sorted by descending result count, or ``None`` when the
        results file is missing.
    """
    logging.info("=== Analyse des clubs ===")

    results_path = os.path.join(data_dir, 'resultats', 'results.csv')
    if not os.path.exists(results_path):
        logging.warning("⚠️ Fichier de résultats introuvable")
        return None

    df = pd.read_csv(results_path, encoding='utf-8-sig')

    # One row per club, counting distinct last and first names.
    clubs_info = df.groupby('club').agg({
        'nom': lambda x: x.nunique(),
        'prenom': lambda x: x.nunique()
    }).reset_index()
    clubs_info.columns = ['club', 'athletes_count', 'unique_athletes']
    clubs_info = clubs_info.sort_values('athletes_count', ascending=False)

    # Persist the club list.
    clubs_dir = os.path.join(data_dir, 'clubs')
    os.makedirs(clubs_dir, exist_ok=True)
    clubs_file = os.path.join(clubs_dir, 'clubs_list.csv')
    clubs_info.to_csv(clubs_file, index=False, encoding='utf-8-sig')

    logging.info(f"✅ {len(clubs_info)} clubs exportés dans {clubs_file}")
    logging.info("  Top 5 clubs:")
    # BUG FIX: the original used the DataFrame index (`i+1`) as the rank,
    # but after sort_values the index is the pre-sort position, not 1..5.
    for rank, (_, club) in enumerate(clubs_info.head(5).iterrows(), start=1):
        logging.info(f"    {rank}. {club['club']}: {club['athletes_count']} résultats")
    return clubs_info


def analyze_courses(data_dir):
    """Compute per-year, per-type and per-location course statistics.

    Parameters
    ----------
    data_dir : str
        Root directory of the scraped data.

    Returns
    -------
    dict or None
        Summary counts (``total``, ``years``, ``types``, ``locations``),
        or ``None`` when the courses file is missing.
    """
    logging.info("=== Analyse des courses ===")

    courses_path = os.path.join(data_dir, 'courses', 'courses_list.csv')
    if not os.path.exists(courses_path):
        logging.warning("⚠️ Fichier de courses introuvable")
        return None

    df = pd.read_csv(courses_path, encoding='utf-8-sig')

    # Parse dates; unparseable values become NaT rather than raising.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df['année'] = df['date'].dt.year
    df['mois'] = df['date'].dt.month

    # Per-year counts, chronologically ordered.
    courses_by_year = df.groupby('année').size().reset_index(name='count')
    courses_by_year = courses_by_year.sort_values('année')

    # Per-type counts.
    courses_by_type = df['type'].value_counts().reset_index()
    courses_by_type.columns = ['type', 'count']

    # Per-location counts (top 50 only).
    courses_by_location = df['lieu'].value_counts().head(50).reset_index()
    courses_by_location.columns = ['lieu', 'count']

    # Persist the statistics.
    stats_dir = os.path.join(data_dir, 'statistics')
    os.makedirs(stats_dir, exist_ok=True)

    year_file = os.path.join(stats_dir, 'courses_by_year.csv')
    courses_by_year.to_csv(year_file, index=False, encoding='utf-8-sig')

    type_file = os.path.join(stats_dir, 'courses_by_type.csv')
    courses_by_type.to_csv(type_file, index=False, encoding='utf-8-sig')

    location_file = os.path.join(stats_dir, 'courses_by_location.csv')
    courses_by_location.to_csv(location_file, index=False, encoding='utf-8-sig')

    logging.info(f"✅ Statistiques exportées dans {stats_dir}")
    logging.info(f"  Années: {len(courses_by_year)}")
    logging.info(f"  Types: {len(courses_by_type)}")
    logging.info(f"  Lieux: {len(courses_by_location)}")

    logging.info(f"\n📊 RÉCAPITULATIF DES COURSES:")
    logging.info(f"  Total: {len(df)} courses")
    logging.info(f"  Plage de dates: {df['date'].min()} au {df['date'].max()}")
    logging.info(f"  Années: {len(courses_by_year)}")
    logging.info(f"  Types: {len(courses_by_type)}")

    return {
        'total': len(df),
        'years': len(courses_by_year),
        'types': len(courses_by_type),
        'locations': len(courses_by_location)
    }


def extract_distances_from_courses(data_dir):
    """Extract race distances from course names and categorise them.

    Parameters
    ----------
    data_dir : str
        Root directory of the scraped data.

    Returns
    -------
    pandas.DataFrame or None
        Per-category course counts, or ``None`` when the courses file is
        missing.
    """
    logging.info("=== Extraction des distances ===")

    courses_path = os.path.join(data_dir, 'courses', 'courses_list.csv')
    if not os.path.exists(courses_path):
        logging.warning("⚠️ Fichier de courses introuvable")
        return None

    df = pd.read_csv(courses_path, encoding='utf-8-sig')

    def extract_distance(course_name):
        """Return the distance in meters parsed from a course name, or None."""
        # Guard against NaN / non-string course names read from the CSV.
        if not isinstance(course_name, str):
            return None
        # Order matters: 'km' before bare 'm', and 'semi-marathon' before
        # 'marathon'. BUG FIX: the original tested 'marathon' first, so
        # every semi-marathon matched it and was assigned 42195 m.
        patterns = [
            (r'(\d+)\s*km', lambda m: int(m.group(1)) * 1000),
            (r'(\d+)\s*m', lambda m: int(m.group(1))),
            (r'semi[-\s]?marathon', lambda m: 21097),
            (r'marathon', lambda m: 42195),
        ]
        for pattern, extractor in patterns:
            match = re.search(pattern, course_name, re.IGNORECASE)
            if match:
                try:
                    return extractor(match)
                except ValueError:
                    # Digits that do not form a valid int; try next pattern.
                    pass
        return None

    df['distance_meters'] = df['nom'].apply(extract_distance)

    def categorize_distance(distance):
        """Map a distance in meters to a human-readable category label."""
        if pd.isna(distance):
            return 'Autre'
        elif distance < 400:
            return 'Sprint'
        elif distance < 2000:
            return 'Demi-fond'
        elif distance < 5000:
            return 'Fond'
        elif distance < 10000:
            return 'Intermédiaire'
        elif distance < 21000:
            return '10km'
        elif distance < 22000:
            return 'Semi-marathon'
        # BUG FIX: the original boundary was < 43000, which put the marathon
        # distance (42195 m) into 'Longue distance' — 'Marathon' was never
        # reachable for an actual marathon.
        elif distance < 42000:
            return 'Longue distance'
        elif distance < 50000:
            return 'Marathon'
        else:
            return 'Ultra'

    df['category'] = df['distance_meters'].apply(categorize_distance)

    # Per-category counts.
    categories = df['category'].value_counts().reset_index()
    categories.columns = ['category', 'count']

    # Persist the enriched course list.
    courses_with_distance = os.path.join(data_dir, 'courses', 'courses_with_distances.csv')
    df.to_csv(courses_with_distance, index=False, encoding='utf-8-sig')

    # Persist the category statistics.
    # BUG FIX: the directory was not created here, so running this step
    # standalone (without analyze_courses first) crashed on to_csv.
    stats_dir = os.path.join(data_dir, 'statistics')
    os.makedirs(stats_dir, exist_ok=True)
    categories_file = os.path.join(stats_dir, 'courses_by_category.csv')
    categories.to_csv(categories_file, index=False, encoding='utf-8-sig')

    logging.info(f"✅ Distances extraites et exportées")
    logging.info(f"  Catégories: {len(categories)}")
    logging.info(f"\nRépartition par catégorie:")
    for _, row in categories.head(10).iterrows():
        logging.info(f"  {row['category']}: {row['count']} courses")
    return categories


def create_summary(data_dir):
    """Write a plain-text global summary of all exported data.

    Parameters
    ----------
    data_dir : str
        Root directory of the scraped data.

    Returns
    -------
    str
        Path of the generated summary file.
    """
    logging.info("=== Création du récapitulatif ===")

    summary_dir = os.path.join(data_dir, 'summary')
    os.makedirs(summary_dir, exist_ok=True)

    summary_file = os.path.join(summary_dir, 'global_summary.txt')
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write("RÉCAPITULATIF GLOBAL DES DONNÉES FFA\n")
        f.write("=" * 80 + "\n")
        f.write(f"Date de génération: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        # Courses section.
        courses_path = os.path.join(data_dir, 'courses', 'courses_list.csv')
        if os.path.exists(courses_path):
            df_courses = pd.read_csv(courses_path, encoding='utf-8-sig')
            f.write(f"COURSES\n")
            f.write("-" * 40 + "\n")
            f.write(f"Total des courses: {len(df_courses)}\n")
            df_courses['date'] = pd.to_datetime(df_courses['date'], errors='coerce')
            f.write(f"Première course: {df_courses['date'].min()}\n")
            f.write(f"Dernière course: {df_courses['date'].max()}\n")
            years = df_courses['date'].dt.year.dropna().unique()
            # Guard against an all-NaT date column (min()/max() on empty raises).
            if len(years):
                f.write(f"Années couvertes: {len(years)} ({min(years)} à {max(years)})\n\n")

        # Results section.
        results_path = os.path.join(data_dir, 'resultats', 'results.csv')
        if os.path.exists(results_path):
            df_results = pd.read_csv(results_path, encoding='utf-8-sig')
            f.write(f"RÉSULTATS\n")
            f.write("-" * 40 + "\n")
            f.write(f"Total des résultats: {len(df_results)}\n")
            clubs = df_results['club'].nunique()
            f.write(f"Clubs uniques: {clubs}\n")
            f.write(f"Athlètes uniques: {df_results['nom'].nunique()}\n\n")

        # Clubs section.
        clubs_path = os.path.join(data_dir, 'clubs', 'clubs_list.csv')
        if os.path.exists(clubs_path):
            df_clubs = pd.read_csv(clubs_path, encoding='utf-8-sig')
            f.write(f"CLUBS\n")
            f.write("-" * 40 + "\n")
            f.write(f"Total des clubs: {len(df_clubs)}\n\n")
            f.write(f"Top 10 clubs:\n")
            # enumerate keeps the rank correct even if the CSV index is not 0..n-1.
            for rank, (_, club) in enumerate(df_clubs.head(10).iterrows(), start=1):
                f.write(f"  {rank}. {club['club']}: {club['athletes_count']} résultats\n")
            f.write("\n")

    logging.info(f"✅ Récapitulatif global créé dans {summary_file}")
    return summary_file


def main():
    """Run all post-processing steps on the directory given as argv[1]."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # Default data directory when no CLI argument is supplied.
    data_dir = sys.argv[1] if len(sys.argv) > 1 else 'data_2010_2026'

    logging.info(f"{'='*80}")
    logging.info(f"POST-TRAITEMENT DES DONNÉES FFA")
    logging.info(f"{'='*80}")
    logging.info(f"Répertoire: {data_dir}\n")

    clubs = analyze_clubs(data_dir)
    courses_stats = analyze_courses(data_dir)
    categories = extract_distances_from_courses(data_dir)
    summary = create_summary(data_dir)

    logging.info(f"\n{'='*80}")
    logging.info(f"POST-TRAITEMENT TERMINÉ")
    logging.info(f"{'='*80}")

    # Final statistics on stdout.
    if courses_stats:
        print(f"\n📊 STATISTIQUES FINALES:")
        print(f"  Courses: {courses_stats['total']}")
        print(f"  Années: {courses_stats['years']}")
        print(f"  Types: {courses_stats['types']}")
    if clubs is not None:
        print(f"  Clubs: {len(clubs)}")
    if categories is not None:
        print(f"  Catégories: {len(categories)}")

    print(f"\n✅ Toutes les données ont été analysées et exportées!")
    print(f"📁 Répertoire principal: {data_dir}")


if __name__ == "__main__":
    main()