#!/usr/bin/env python3
"""
Extract race types, distances and statistics from the scraped course list.

Analyses the data to identify race patterns (100m, marathon, etc.).
"""

import pandas as pd
import os
import sys
import argparse
import logging
import re
from collections import defaultdict, Counter

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Patterns used to pull a distance out of a race name.
# Each entry is (regex, extractor(match) -> meters, unit label).
# Order matters: the first pattern that matches wins, so
#   * explicit numeric distances come before keywords,
#   * "semi-marathon" comes before "marathon" (the latter is a substring of
#     the former and would otherwise always win),
#   * "demi-fond" comes before "fond" for the same reason.
DISTANCE_PATTERNS = [
    # 100m, 5000 m, 400mh, 100 mètres -- but NOT "10 marathon" or "3 mars":
    # the "m" must end the word (optionally followed by a hurdles "h").
    (r'(\d+)\s*m(?:[eè]tres?)?(?=h?\b)', lambda x: int(x.group(1)), 'm'),
    # 10km, 10 km, 10,5 km, 10.5km
    (
        r'(\d+(?:[.,]\d+)?)\s*km',
        lambda x: round(float(x.group(1).replace(',', '.')) * 1000),
        'km',
    ),
    (r'semi[-\s]?marathon', lambda x: 21097, 'semi-marathon'),
    (r'marathon', lambda x: 42195, 'marathon'),
    (r'demi[-\s]?fond', lambda x: 0, 'demi-fond'),
    (r'fond', lambda x: 0, 'fond'),
    (r'sprint', lambda x: 0, 'sprint'),
    (r'haies', lambda x: 0, 'haies'),
    (r'cross', lambda x: 0, 'cross country'),
    (r'route', lambda x: 0, 'route'),
    (r'trail', lambda x: 0, 'trail'),
    (r'ultra', lambda x: 0, 'ultra'),
]


def extract_distance_from_name(course_name):
    """Extract a distance from a race name.

    Args:
        course_name: The race name. Missing values (``None`` / NaN) are
            accepted; other non-string values are converted with ``str()``.

    Returns:
        A ``(distance_in_meters, unit_label)`` tuple for the first entry of
        ``DISTANCE_PATTERNS`` that matches, or ``(None, None)`` when the name
        is missing or nothing matches. Keyword-only patterns (e.g. "sprint")
        yield a distance of ``0``.
    """
    if course_name is None:
        return None, None
    if not isinstance(course_name, str):
        if pd.isna(course_name):
            return None, None
        course_name = str(course_name)

    # Patterns are written in lowercase, so lowercasing replaces IGNORECASE.
    lowered = course_name.lower()

    for pattern, extractor, unit in DISTANCE_PATTERNS:
        match = re.search(pattern, lowered)
        if match is None:
            continue
        try:
            return extractor(match), unit
        except (ValueError, TypeError, IndexError):
            # A malformed number should not abort the scan; try the
            # remaining patterns instead.
            continue

    return None, None


def categorize_course(course_type, course_name):
    """Assign a race to a broad category.

    Args:
        course_type: The race type; missing values are treated as ''.
        course_name: The race name; missing values are treated as ''.

    Returns:
        One of 'Sprint', 'Demi-fond', 'Fond', 'Semi-marathon', 'Marathon',
        'Trail', 'Cross country', 'Route', 'Haies' or 'Autre'.
    """
    course_type = '' if pd.isna(course_type) else str(course_type)
    course_name = '' if pd.isna(course_name) else str(course_name)

    combined = (course_type + ' ' + course_name).lower()

    def mentions(*keywords):
        return any(keyword in combined for keyword in keywords)

    if mentions('100m', '200m', '400m', 'sprint'):
        return 'Sprint'
    if mentions('800m', '1500m', 'demi-fond', 'demi fond'):
        return 'Demi-fond'
    if mentions('5000m', '10000m', 'fond'):
        return 'Fond'
    # Must be tested before 'marathon': "semi-marathon" contains "marathon".
    if mentions('semi', '21km'):
        return 'Semi-marathon'
    if mentions('marathon'):
        return 'Marathon'
    if mentions('trail'):
        return 'Trail'
    if mentions('cross'):
        return 'Cross country'
    if 'route' in combined and 'km' in combined:
        return 'Route'
    if mentions('haies'):
        return 'Haies'
    return 'Autre'


def analyze_courses(data_dir="data"):
    """Load the course list and compute statistics on it.

    Reads ``<data_dir>/courses/courses_list.csv`` and uses its ``nom``,
    ``type``, ``lieu`` and ``date`` columns.

    Args:
        data_dir: Root directory of the CSV data.

    Returns:
        ``(df, stats)`` where ``df`` is the DataFrame enriched with
        ``distance_meters``, ``distance_unit``, ``category`` and
        ``month_year`` columns (``date`` is converted to datetime), and
        ``stats`` is a dict with the keys ``total_courses``, ``types``,
        ``categories``, ``distances``, ``by_type``, ``by_location`` and
        ``by_date``. Returns ``(None, None)`` when the file is missing or the
        analysis fails, so callers can always unpack two values.
    """
    courses_path = os.path.join(data_dir, 'courses', 'courses_list.csv')

    if not os.path.exists(courses_path):
        logging.error(f"Fichier de courses introuvable: {courses_path}")
        return None, None

    try:
        df = pd.read_csv(courses_path, encoding='utf-8-sig')
        logging.info(f"Chargé {len(df)} courses")

        # Extract distances. Building the columns explicitly (rather than
        # zip(*...)) also works for an empty frame, and forcing float64 keeps
        # the column comparable even when no distance was found at all.
        extracted = [extract_distance_from_name(name) for name in df['nom']]
        df['distance_meters'] = pd.Series(
            [distance for distance, _ in extracted],
            index=df.index,
            dtype='float64',
        )
        df['distance_unit'] = pd.Series(
            [unit for _, unit in extracted],
            index=df.index,
            dtype='object',
        )

        # Categorise the races.
        df['category'] = pd.Series(
            [
                categorize_course(course_type, name)
                for course_type, name in zip(df['type'], df['nom'])
            ],
            index=df.index,
            dtype='object',
        )

        stats = {
            'total_courses': len(df),
            'types': {},
            'categories': {},
            'distances': {},
            'by_type': {},
            'by_location': {},
            'by_date': {}
        }

        # Counts per type and per category.
        for course_type, count in df['type'].value_counts().items():
            stats['types'][course_type] = int(count)

        for category, count in df['category'].value_counts().items():
            stats['categories'][category] = int(count)

        # Counts per distance (only races with a real, positive distance).
        # Keys are plain ints so they display as "100 m", not "100.0 m".
        positive = df.loc[df['distance_meters'] > 0, 'distance_meters']
        for distance, count in positive.value_counts().items():
            stats['distances'][int(distance)] = int(count)

        # Details per type.
        for course_type in df['type'].dropna().unique():
            type_df = df[df['type'] == course_type]
            stats['by_type'][course_type] = {
                'count': len(type_df),
                'categories': type_df['category'].value_counts().to_dict(),
                'locations': (
                    type_df['lieu'].value_counts().head(10).to_dict()
                ),
            }

        # Top 20 locations.
        for location, count in df['lieu'].value_counts().head(20).items():
            stats['by_location'][location] = int(count)

        # Counts per month; unparseable dates become NaT and are dropped by
        # value_counts().
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df['month_year'] = df['date'].dt.to_period('M')
        date_counts = df['month_year'].value_counts().sort_index()
        for period, count in date_counts.items():
            stats['by_date'][str(period)] = int(count)

        return df, stats

    except Exception as e:
        # Top-level boundary for the analysis: report and signal failure.
        logging.error(f"Erreur lors de l'analyse des courses: {e}")
        return None, None


def _format_distance(distance):
    """Return a human-readable label for a distance given in meters."""
    if distance == 42195:
        return "Marathon (42.195 km)"
    if distance == 21097:
        return "Semi-marathon (21.097 km)"
    if distance >= 1000:
        return f"{distance/1000:.1f} km"
    return f"{distance} m"


def _by_count_desc(items):
    """Sort ``(key, count)`` pairs by decreasing count."""
    return sorted(items, key=lambda item: item[1], reverse=True)


def display_analysis(stats, df=None, show_details=False):
    """Print the analysis results to stdout.

    Args:
        stats: The statistics dict produced by ``analyze_courses``. A falsy
            value prints an error message and returns.
        df: Accepted for call compatibility; not used by this function.
        show_details: When true, also print the per-type breakdown.
    """
    if not stats:
        print("\n❌ Impossible d'analyser les courses")
        return

    print(f"\n{'='*80}")
    print("📊 ANALYSE DES COURSES")
    print(f"{'='*80}\n")

    # Overview
    print("📋 VUE D'ENSEMBLE")
    print(f"{'─'*40}")
    print(f"Total des courses: {stats['total_courses']}")
    print()

    # Race types
    print("🏷️ TYPES DE COURSES")
    print(f"{'─'*40}")
    for course_type, count in _by_count_desc(stats['types'].items()):
        print(f" {course_type}: {count} courses")
    print()

    # Categories
    print("📊 CATÉGORIES")
    print(f"{'─'*40}")
    for category, count in _by_count_desc(stats['categories'].items()):
        print(f" {category}: {count} courses")
    print()

    # Distances, in increasing order
    if stats['distances']:
        print("📏 DISTANCES EXTRACTÉES")
        print(f"{'─'*40}")
        for distance in sorted(stats['distances']):
            count = stats['distances'][distance]
            print(f" {_format_distance(distance)}: {count} courses")
        print()

    # Most popular locations
    print("📍 LIEUX LES PLUS POPULAIRES (Top 20)")
    print(f"{'─'*40}")
    ranked_locations = _by_count_desc(stats['by_location'].items())
    for i, (location, count) in enumerate(ranked_locations, 1):
        print(f" {i:2d}. {location}: {count} courses")
    print()

    # Breakdown by date: the last 12 periods present in the data
    if stats['by_date']:
        print("📅 RÉPARTITION PAR DATE")
        print(f"{'─'*40}")
        for period, count in list(stats['by_date'].items())[-12:]:
            print(f" {period}: {count} courses")
        print()

    print(f"{'='*80}\n")

    # Per-type details
    if show_details and stats['by_type']:
        print("📋 DÉTAILS PAR TYPE DE COURSE")
        print(f"{'='*80}\n")

        ranked_types = sorted(
            stats['by_type'].items(),
            key=lambda item: item[1]['count'],
            reverse=True,
        )
        for course_type, details in ranked_types:
            print(f"🔹 {course_type}")
            print(f" Nombre de courses: {details['count']}")
            print(" Répartition par catégorie:")
            top_categories = _by_count_desc(details['categories'].items())[:5]
            for category, count in top_categories:
                print(f" - {category}: {count}")
            print(" Top lieux:")
            top_locations = _by_count_desc(details['locations'].items())[:5]
            for i, (location, count) in enumerate(top_locations, 1):
                print(f" {i}. {location}: {count}")
            print()


def _write_counts_csv(counts, path, columns, sort_by=None):
    """Write a ``{key: count}`` mapping as a two-column CSV; skip if empty."""
    if not counts:
        return
    table = pd.DataFrame(list(counts.items()), columns=columns)
    if sort_by is not None:
        table = table.sort_values(sort_by)
    table.to_csv(path, index=False, encoding='utf-8-sig')


def export_analysis_csv(stats, df, output_dir="data"):
    """Export the analysis as CSV files under ``<output_dir>/exports``.

    Args:
        stats: The statistics dict produced by ``analyze_courses``.
        df: The enriched DataFrame, or ``None`` to skip the full export.
        output_dir: Root directory in which ``exports/`` is created.

    Returns:
        A dict mapping ``courses_analysis``, ``by_type``, ``by_category`` and
        ``by_distance`` to their target paths. A path is listed even when the
        corresponding file was skipped because there was nothing to write.
    """
    exports_dir = os.path.join(output_dir, 'exports')
    os.makedirs(exports_dir, exist_ok=True)

    # Enriched DataFrame with distances and categories.
    courses_with_analysis = os.path.join(exports_dir, 'courses_analysis.csv')
    if df is not None:
        df.to_csv(courses_with_analysis, index=False, encoding='utf-8-sig')
        logging.info(
            f"Exporté {len(df)} courses analysées dans {courses_with_analysis}"
        )

    types_csv = os.path.join(exports_dir, 'courses_by_type.csv')
    _write_counts_csv(stats['types'], types_csv, ['Type', 'Count'])

    categories_csv = os.path.join(exports_dir, 'courses_by_category.csv')
    _write_counts_csv(
        stats['categories'], categories_csv, ['Category', 'Count']
    )

    distances_csv = os.path.join(exports_dir, 'courses_by_distance.csv')
    _write_counts_csv(
        stats['distances'],
        distances_csv,
        ['Distance (m)', 'Count'],
        sort_by='Distance (m)',
    )

    return {
        'courses_analysis': courses_with_analysis,
        'by_type': types_csv,
        'by_category': categories_csv,
        'by_distance': distances_csv
    }


def search_courses_by_distance(df, min_distance=None, max_distance=None):
    """Find races whose extracted distance lies in a range.

    Args:
        df: DataFrame with a ``distance_meters`` column, or ``None``.
        min_distance: Inclusive lower bound in meters, or ``None``.
        max_distance: Inclusive upper bound in meters, or ``None``.

    Returns:
        A list of row dicts for races with a positive distance inside the
        requested bounds; an empty list when ``df`` is ``None``.
    """
    if df is None:
        return []

    # Coerce so that an object column holding only None compares cleanly
    # (missing distances become NaN and never satisfy the comparisons).
    distances = pd.to_numeric(df['distance_meters'], errors='coerce')

    mask = distances > 0
    if min_distance is not None:
        mask &= distances >= min_distance
    if max_distance is not None:
        mask &= distances <= max_distance

    return df[mask].to_dict('records')


def main():
    """Command-line entry point: analyse, display, search and export."""
    parser = argparse.ArgumentParser(
        description='Extraire et analyser les types de courses et distances'
    )
    parser.add_argument('--data-dir', default='data',
                        help='Répertoire des données CSV')
    parser.add_argument('--details', action='store_true',
                        help='Afficher les détails par type de course')
    parser.add_argument('--export', action='store_true',
                        help='Exporter l\'analyse en CSV')
    parser.add_argument('--search-distance', action='store_true',
                        help='Rechercher des courses par distance')
    parser.add_argument('--min-distance', type=int,
                        help='Distance minimum en mètres')
    parser.add_argument('--max-distance', type=int,
                        help='Distance maximum en mètres')

    args = parser.parse_args()

    print(f"\n📊 Analyse des courses depuis {args.data_dir}/...")
    df, stats = analyze_courses(args.data_dir)

    if df is None or stats is None:
        print("\n❌ Impossible d'analyser les courses")
        print("💡 Vérifiez que les données ont été scrapées avec:")
        print(" python ffa_cli.py scrape")
        return

    display_analysis(stats, df, show_details=args.details)

    # Search by distance
    if args.search_distance:
        print("\n🔍 Recherche de courses par distance:")
        print(f" Min: {args.min_distance}m, Max: {args.max_distance}m")

        courses = search_courses_by_distance(
            df, args.min_distance, args.max_distance
        )

        if courses:
            print(f"\n Trouvé {len(courses)} courses:")
            for i, course in enumerate(courses[:20], 1):
                print(f" {i}. {course['nom']} - {course['distance_meters']}m")
            if len(courses) > 20:
                print(f" ... et {len(courses) - 20} autres")
        else:
            print(" Aucune course trouvée avec ces critères")

    # Export
    if args.export:
        files = export_analysis_csv(stats, df, args.data_dir)
        print("\n💾 Exporté dans:")
        for key, filepath in files.items():
            print(f" {key}: {filepath}")


if __name__ == "__main__":
    main()