Initial commit: Reorganiser le projet FFA Calendar Scraper
- Créer une arborescence propre (src/, scripts/, config/, data/, docs/, tests/) - Déplacer les modules Python dans src/ - Déplacer les scripts autonomes dans scripts/ - Nettoyer les fichiers temporaires et __pycache__ - Mettre à jour le README.md avec documentation complète - Mettre à jour les imports dans les scripts pour la nouvelle structure - Configurer le .gitignore pour ignorer les données et logs - Organiser les données dans data/ (courses, resultats, clubs, exports) Structure du projet: - src/: Modules principaux (ffa_scraper, ffa_analyzer) - scripts/: Scripts CLI et utilitaires - config/: Configuration (config.env) - data/: Données générées - docs/: Documentation - tests/: Tests unitaires 💘 Generated with Crush Assisted-by: GLM-4.7 via Crush <crush@charm.land>
This commit is contained in:
349
scripts/extract_races.py
Executable file
349
scripts/extract_races.py
Executable file
@@ -0,0 +1,349 @@
|
||||
#!/usr/bin/env python3
"""
Extract race types, distances and statistics.

Analyzes the scraped data to identify race patterns (100m, marathon, etc.)
"""
|
||||
|
||||
import pandas as pd
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
import logging
|
||||
import re
|
||||
from collections import defaultdict, Counter
|
||||
|
||||
# Configure the root logger: timestamped INFO-level messages.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
|
||||
|
||||
# Regex patterns used to extract a distance (in metres) from race names.
# Each entry is (pattern, extractor(match) -> metres, unit label); a
# distance of 0 means "discipline recognized but no fixed distance".
# Order matters: specific patterns must precede generic ones
# ('semi-marathon' before 'marathon', 'demi-fond' before 'fond'),
# otherwise the generic pattern shadows the specific one.
DISTANCE_PATTERNS = [
    (r'(\d+)\s*m', lambda x: int(x.group(1)), 'm'),            # 100m, 5000m
    (r'(\d+)\s*km', lambda x: int(x.group(1)) * 1000, 'km'),   # 10km
    (r'semi[-\s]?marathon', lambda x: 21097, 'semi-marathon'), # before 'marathon'
    (r'marathon', lambda x: 42195, 'marathon'),
    (r'demi[-\s]?fond', lambda x: 0, 'demi-fond'),             # before 'fond'
    (r'fond', lambda x: 0, 'fond'),
    (r'sprint', lambda x: 0, 'sprint'),
    (r'haies', lambda x: 0, 'haies'),
    (r'cross', lambda x: 0, 'cross country'),
    (r'route', lambda x: 0, 'route'),
    (r'trail', lambda x: 0, 'trail'),
    (r'ultra', lambda x: 0, 'ultra'),
]
|
||||
|
||||
def extract_distance_from_name(course_name):
    """Extract a distance from a race name.

    Tries each entry of DISTANCE_PATTERNS, in order, against the
    lower-cased name.

    Args:
        course_name: race name (may be NaN/None or a non-string value).

    Returns:
        Tuple ``(distance_in_metres, unit_label)`` from the first
        matching pattern, or ``(None, None)`` when nothing matches or
        the name is missing.
    """
    if pd.isna(course_name):
        return None, None

    # str() guards against non-string cells read from the CSV.
    course_name_lower = str(course_name).lower()

    for pattern, extractor, unit in DISTANCE_PATTERNS:
        match = re.search(pattern, course_name_lower, re.IGNORECASE)
        if match:
            try:
                return extractor(match), unit
            except (ValueError, IndexError):
                # Malformed capture (e.g. non-numeric group): try the next
                # pattern. Was a bare ``except:``, which also swallowed
                # KeyboardInterrupt/SystemExit.
                continue

    return None, None
|
||||
|
||||
def categorize_course(course_type, course_name):
    """Categorize a race from its type and name.

    Both fields are concatenated, lower-cased and matched against keyword
    lists. Checks run from specific to generic so that e.g. a
    'semi-marathon' name is not captured by the 'marathon' keyword.

    Args:
        course_type: race type (may be NaN).
        course_name: race name (may be NaN).

    Returns:
        One of 'Sprint', 'Demi-fond', 'Fond', 'Semi-marathon', 'Marathon',
        'Trail', 'Cross country', 'Route', 'Haies' or 'Autre'.
    """
    if pd.isna(course_type):
        course_type = ''

    if pd.isna(course_name):
        course_name = ''

    combined = (course_type + ' ' + course_name).lower()

    # Main categories
    if any(x in combined for x in ['100m', '200m', '400m', 'sprint']):
        return 'Sprint'
    elif any(x in combined for x in ['800m', '1500m', 'demi-fond']):
        return 'Demi-fond'
    elif any(x in combined for x in ['5000m', '10000m', 'fond']):
        return 'Fond'
    # 'semi'/'21km' must be tested before 'marathon': a semi-marathon
    # name also contains the substring 'marathon'.
    elif any(x in combined for x in ['semi', '21km']):
        return 'Semi-marathon'
    elif 'marathon' in combined:
        return 'Marathon'
    elif 'trail' in combined:
        return 'Trail'
    elif 'cross' in combined:
        return 'Cross country'
    elif 'route' in combined and 'km' in combined:
        return 'Route'
    elif 'haies' in combined:
        return 'Haies'
    else:
        return 'Autre'
|
||||
|
||||
def analyze_courses(data_dir="data"):
    """Analyze all races from the scraped CSV and build statistics.

    Reads ``<data_dir>/courses/courses_list.csv``, enriches each row with
    an extracted distance and a category, then aggregates counts.

    Args:
        data_dir: root data directory containing ``courses/courses_list.csv``.

    Returns:
        Tuple ``(df, stats)`` where ``df`` is the enriched DataFrame and
        ``stats`` a dict of aggregated counters, or ``(None, None)`` when
        the file is missing or unreadable.
    """
    courses_path = os.path.join(data_dir, 'courses', 'courses_list.csv')

    if not os.path.exists(courses_path):
        logging.error(f"Fichier de courses introuvable: {courses_path}")
        # Always return a 2-tuple: callers unpack ``df, stats = ...``
        # (returning a single None here made that unpacking raise).
        return None, None

    try:
        df = pd.read_csv(courses_path, encoding='utf-8-sig')
        logging.info(f"Chargé {len(df)} courses")

        # Extract distances from race names
        df['distance_meters'], df['distance_unit'] = zip(
            *df['nom'].apply(extract_distance_from_name)
        )

        # Categorize the races
        df['category'] = df.apply(
            lambda row: categorize_course(row['type'], row['nom']),
            axis=1
        )

        # Aggregated statistics
        stats = {
            'total_courses': len(df),
            'types': {},
            'categories': {},
            'distances': {},
            'by_type': {},
            'by_location': {},
            'by_date': {}
        }

        # Per type
        type_counts = df['type'].value_counts()
        for course_type, count in type_counts.items():
            stats['types'][course_type] = count

        # Per category
        category_counts = df['category'].value_counts()
        for category, count in category_counts.items():
            stats['categories'][category] = count

        # Per distance (only races with a concrete distance).
        # ``distance_meters`` may hold None for unrecognized names;
        # ``None > 0`` on an object column raises TypeError, so compare
        # on a 0-filled copy instead.
        df_with_distance = df[df['distance_meters'].fillna(0) > 0]
        distance_counts = df_with_distance['distance_meters'].value_counts()
        for distance, count in distance_counts.items():
            stats['distances'][distance] = count

        # Details per type
        for course_type in df['type'].unique():
            if pd.notna(course_type):
                type_df = df[df['type'] == course_type]
                stats['by_type'][course_type] = {
                    'count': len(type_df),
                    'categories': type_df['category'].value_counts().to_dict(),
                    'locations': type_df['lieu'].value_counts().head(10).to_dict()
                }

        # Details per location (top 20)
        location_counts = df['lieu'].value_counts().head(20)
        for location, count in location_counts.items():
            stats['by_location'][location] = count

        # Details per month/year
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df['month_year'] = df['date'].dt.to_period('M')
        date_counts = df['month_year'].value_counts().sort_index()
        for period, count in date_counts.items():
            stats['by_date'][str(period)] = count

        return df, stats

    except Exception as e:
        logging.error(f"Erreur lors de l'analyse des courses: {e}")
        return None, None
|
||||
|
||||
def display_analysis(stats, df=None, show_details=False):
    """Print a formatted report of the analysis results.

    Args:
        stats: aggregated statistics dict produced by analyze_courses();
            a falsy value prints an error message and returns.
        df: enriched DataFrame; currently unused, kept for interface
            compatibility.
        show_details: when True, also print a per-type breakdown.
    """
    if not stats:
        print("\n❌ Impossible d'analyser les courses")
        return

    print(f"\n{'='*80}")
    print(f"📊 ANALYSE DES COURSES")
    print(f"{'='*80}\n")

    # Overview
    print(f"📋 VUE D'ENSEMBLE")
    print(f"{'─'*40}")
    print(f"Total des courses: {stats['total_courses']}")
    print()

    # Race types, most frequent first
    print(f"🏷️ TYPES DE COURSES")
    print(f"{'─'*40}")
    for course_type, count in sorted(stats['types'].items(), key=lambda x: x[1], reverse=True):
        print(f" {course_type}: {count} courses")
    print()

    # Categories, most frequent first
    print(f"📊 CATÉGORIES")
    print(f"{'─'*40}")
    for category, count in sorted(stats['categories'].items(), key=lambda x: x[1], reverse=True):
        print(f" {category}: {count} courses")
    print()

    # Extracted distances
    if stats['distances']:
        print(f"📏 DISTANCES EXTRACTÉES")
        print(f"{'─'*40}")
        # Sort by distance, with human-readable labels for the classics
        for distance in sorted(stats['distances'].keys()):
            count = stats['distances'][distance]
            if distance == 42195:
                distance_str = "Marathon (42.195 km)"
            elif distance == 21097:
                distance_str = "Semi-marathon (21.097 km)"
            elif distance >= 1000:
                distance_str = f"{distance/1000:.1f} km"
            else:
                distance_str = f"{distance} m"
            print(f" {distance_str}: {count} courses")
        print()

    # Most popular locations
    print(f"📍 LIEUX LES PLUS POPULAIRES (Top 20)")
    print(f"{'─'*40}")
    for i, (location, count) in enumerate(sorted(stats['by_location'].items(), key=lambda x: x[1], reverse=True), 1):
        print(f" {i:2d}. {location}: {count} courses")
    print()

    # Breakdown per month/year
    if stats['by_date']:
        print(f"📅 RÉPARTITION PAR DATE")
        print(f"{'─'*40}")
        for period, count in list(stats['by_date'].items())[-12:]:  # last 12 months
            print(f" {period}: {count} courses")
        print()

    print(f"{'='*80}\n")

    # Optional per-type details
    if show_details and stats['by_type']:
        print(f"📋 DÉTAILS PAR TYPE DE COURSE")
        print(f"{'='*80}\n")

        for course_type, details in sorted(stats['by_type'].items(), key=lambda x: x[1]['count'], reverse=True):
            print(f"🔹 {course_type}")
            print(f" Nombre de courses: {details['count']}")
            print(f" Répartition par catégorie:")
            for category, count in sorted(details['categories'].items(), key=lambda x: x[1], reverse=True)[:5]:
                print(f" - {category}: {count}")
            print(f" Top lieux:")
            for i, (location, count) in enumerate(sorted(details['locations'].items(), key=lambda x: x[1], reverse=True)[:5], 1):
                print(f" {i}. {location}: {count}")
            print()
|
||||
|
||||
def export_analysis_csv(stats, df, output_dir="data"):
    """Write the analysis results to CSV files under ``<output_dir>/exports``.

    Args:
        stats: aggregated statistics dict (keys 'types', 'categories',
            'distances'); empty sections are skipped.
        df: enriched DataFrame, or None to skip the per-course export.
        output_dir: root data directory.

    Returns:
        Dict mapping export names to their file paths (paths are returned
        even when the corresponding file was skipped).
    """
    exports_dir = os.path.join(output_dir, 'exports')
    os.makedirs(exports_dir, exist_ok=True)

    # All target paths up front
    courses_with_analysis = os.path.join(exports_dir, 'courses_analysis.csv')
    types_csv = os.path.join(exports_dir, 'courses_by_type.csv')
    categories_csv = os.path.join(exports_dir, 'courses_by_category.csv')
    distances_csv = os.path.join(exports_dir, 'courses_by_distance.csv')

    # Enriched per-course table (distances + categories)
    if df is not None:
        df.to_csv(courses_with_analysis, index=False, encoding='utf-8-sig')
        logging.info(f"Exporté {len(df)} courses analysées dans {courses_with_analysis}")

    # Counts per race type
    if stats['types']:
        pd.DataFrame(list(stats['types'].items()), columns=['Type', 'Count']).to_csv(
            types_csv, index=False, encoding='utf-8-sig')

    # Counts per category
    if stats['categories']:
        pd.DataFrame(list(stats['categories'].items()), columns=['Category', 'Count']).to_csv(
            categories_csv, index=False, encoding='utf-8-sig')

    # Counts per distance, sorted ascending
    if stats['distances']:
        distances_df = pd.DataFrame(list(stats['distances'].items()),
                                    columns=['Distance (m)', 'Count'])
        distances_df.sort_values('Distance (m)').to_csv(
            distances_csv, index=False, encoding='utf-8-sig')

    return {
        'courses_analysis': courses_with_analysis,
        'by_type': types_csv,
        'by_category': categories_csv,
        'by_distance': distances_csv,
    }
|
||||
|
||||
def search_courses_by_distance(df, min_distance=None, max_distance=None):
    """Find races whose extracted distance lies in a given range.

    Args:
        df: enriched DataFrame with a 'distance_meters' column (which may
            contain None for races without a recognizable distance), or None.
        min_distance: inclusive lower bound in metres, or None for no bound.
        max_distance: inclusive upper bound in metres, or None for no bound.

    Returns:
        List of matching rows as dicts (empty when df is None).
    """
    if df is None:
        return []

    # Coerce to numeric so rows holding None become NaN: comparing
    # ``None > 0`` on an object-dtype column raises TypeError, while NaN
    # simply fails every comparison and is filtered out.
    distances = pd.to_numeric(df['distance_meters'], errors='coerce')
    mask = distances > 0

    if min_distance is not None:
        mask &= distances >= min_distance

    if max_distance is not None:
        mask &= distances <= max_distance

    return df[mask].to_dict('records')
|
||||
|
||||
def main():
    """CLI entry point: analyze races, then optionally search and export."""
    parser = argparse.ArgumentParser(description='Extraire et analyser les types de courses et distances')
    parser.add_argument('--data-dir', default='data',
                        help='Répertoire des données CSV')
    parser.add_argument('--details', action='store_true',
                        help='Afficher les détails par type de course')
    parser.add_argument('--export', action='store_true',
                        help='Exporter l\'analyse en CSV')
    parser.add_argument('--search-distance', action='store_true',
                        help='Rechercher des courses par distance')
    parser.add_argument('--min-distance', type=int,
                        help='Distance minimum en mètres')
    parser.add_argument('--max-distance', type=int,
                        help='Distance maximum en mètres')
    args = parser.parse_args()

    # Run the analysis
    print(f"\n📊 Analyse des courses depuis {args.data_dir}/...")
    df, stats = analyze_courses(args.data_dir)

    if df is None or stats is None:
        # Nothing to report: the scraped CSV is missing or unreadable.
        print("\n❌ Impossible d'analyser les courses")
        print("💡 Vérifiez que les données ont été scrapées avec:")
        print(" python ffa_cli.py scrape")
        return

    display_analysis(stats, df, show_details=args.details)

    # Optional distance search
    if args.search_distance:
        print(f"\n🔍 Recherche de courses par distance:")
        print(f" Min: {args.min_distance}m, Max: {args.max_distance}m")
        matches = search_courses_by_distance(df, args.min_distance, args.max_distance)

        if not matches:
            print(" Aucune course trouvée avec ces critères")
        else:
            print(f"\n Trouvé {len(matches)} courses:")
            for rank, course in enumerate(matches[:20], 1):
                print(f" {rank}. {course['nom']} - {course['distance_meters']}m")
            if len(matches) > 20:
                print(f" ... et {len(matches) - 20} autres")

    # Optional CSV export
    if args.export:
        exported = export_analysis_csv(stats, df, args.data_dir)
        print(f"\n💾 Exporté dans:")
        for label, path in exported.items():
            print(f" {label}: {path}")
|
||||
|
||||
# Run the CLI only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user