- Create a clean directory tree (src/, scripts/, config/, data/, docs/, tests/)
- Move the Python modules into src/
- Move the standalone scripts into scripts/
- Clean up temporary files and __pycache__
- Update README.md with complete documentation
- Update the imports in the scripts for the new layout (see the sketch below)
- Configure .gitignore to ignore data and logs
- Organize the data under data/ (courses, resultats, clubs, exports)

Project layout:
- src/: Main modules (ffa_scraper, ffa_analyzer)
- scripts/: CLI scripts and utilities
- config/: Configuration (config.env)
- data/: Generated data
- docs/: Documentation
- tests/: Unit tests

💘 Generated with Crush
Assisted-by: GLM-4.7 via Crush <crush@charm.land>
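Since the modules now live under src/ while the scripts live under scripts/, each script needs an import path that reaches src/. A minimal sketch of one common convention, assuming src/ is not installed as a package (the two module names come from the commit message; the filename and everything else here are illustrative assumptions):

#!/usr/bin/env python3
# scripts/example_script.py — hypothetical; shows imports after the move to src/
import sys
from pathlib import Path

# Prepend src/ (sibling of scripts/) to the import path so the modules resolve
# when the script is run directly, e.g. `python scripts/example_script.py`.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent / "src"))

import ffa_scraper   # module listed in the commit message
import ffa_analyzer  # module listed in the commit message

An installable package (pyproject.toml with a src/ layout) would make the sys.path line unnecessary.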
299 lines · 11 KiB · Python · Executable File
#!/usr/bin/env python3
"""
Post-processing script that analyzes and sorts the scraped data
"""

import os
import re
import sys
import logging
from datetime import datetime

import pandas as pd

def analyze_clubs(data_dir):
    """Analyze and extract all clubs"""
    logging.info("=== Club analysis ===")

    results_path = os.path.join(data_dir, 'resultats', 'results.csv')

    if os.path.exists(results_path):
        df = pd.read_csv(results_path, encoding='utf-8-sig')

        # Aggregate per club: total result rows and distinct athlete names
        clubs_info = df.groupby('club').agg(
            athletes_count=('nom', 'size'),
            unique_athletes=('nom', 'nunique'),
        ).reset_index()
        clubs_info = clubs_info.sort_values('athletes_count', ascending=False)

        # Save
        clubs_dir = os.path.join(data_dir, 'clubs')
        os.makedirs(clubs_dir, exist_ok=True)

        clubs_file = os.path.join(clubs_dir, 'clubs_list.csv')
        clubs_info.to_csv(clubs_file, index=False, encoding='utf-8-sig')
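        # clubs_list.csv then holds three columns, e.g. (values illustrative):
        #   club,athletes_count,unique_athletes
        #   "AC Example",1523,240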

        logging.info(f"✅ {len(clubs_info)} clubs exported to {clubs_file}")
        logging.info(" Top 5 clubs:")
        # enumerate() for the rank: after sort_values the DataFrame index is
        # no longer sequential, so it cannot be used as a ranking
        for rank, (_, club) in enumerate(clubs_info.head(5).iterrows(), start=1):
            logging.info(f" {rank}. {club['club']}: {club['athletes_count']} results")

        return clubs_info
    else:
        logging.warning("⚠️ Results file not found")
        return None

def analyze_courses(data_dir):
    """Analyze and extract course statistics"""
    logging.info("=== Course analysis ===")

    courses_path = os.path.join(data_dir, 'courses', 'courses_list.csv')

    if os.path.exists(courses_path):
        df = pd.read_csv(courses_path, encoding='utf-8-sig')

        # Parse the dates
        df['date'] = pd.to_datetime(df['date'], errors='coerce')
        df['année'] = df['date'].dt.year
        df['mois'] = df['date'].dt.month

        # Statistics per year
        courses_by_year = df.groupby('année').size().reset_index(name='count')
        courses_by_year = courses_by_year.sort_values('année')

        # Statistics per type
        courses_by_type = df['type'].value_counts().reset_index()
        courses_by_type.columns = ['type', 'count']

        # Statistics per location (top 50)
        courses_by_location = df['lieu'].value_counts().head(50).reset_index()
        courses_by_location.columns = ['lieu', 'count']

        # Save the statistics
        stats_dir = os.path.join(data_dir, 'statistics')
        os.makedirs(stats_dir, exist_ok=True)

        # Export per year
        year_file = os.path.join(stats_dir, 'courses_by_year.csv')
        courses_by_year.to_csv(year_file, index=False, encoding='utf-8-sig')

        # Export per type
        type_file = os.path.join(stats_dir, 'courses_by_type.csv')
        courses_by_type.to_csv(type_file, index=False, encoding='utf-8-sig')

        # Export per location
        location_file = os.path.join(stats_dir, 'courses_by_location.csv')
        courses_by_location.to_csv(location_file, index=False, encoding='utf-8-sig')
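        # Each export is a two-column CSV, e.g. courses_by_year.csv
        # (values illustrative):
        #   année,count
        #   2010,412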

        logging.info(f"✅ Statistics exported to {stats_dir}")
        logging.info(f" Years: {len(courses_by_year)}")
        logging.info(f" Types: {len(courses_by_type)}")
        logging.info(f" Locations: {len(courses_by_location)}")

        # Summary
        logging.info("\n📊 COURSE SUMMARY:")
        logging.info(f" Total: {len(df)} courses")
        logging.info(f" Date range: {df['date'].min()} to {df['date'].max()}")
        logging.info(f" Years: {len(courses_by_year)}")
        logging.info(f" Types: {len(courses_by_type)}")

        return {
            'total': len(df),
            'years': len(courses_by_year),
            'types': len(courses_by_type),
            'locations': len(courses_by_location),
        }
    else:
        logging.warning("⚠️ Courses file not found")
        return None

def extract_distances_from_courses(data_dir):
    """Extract and categorize course distances"""
    logging.info("=== Distance extraction ===")

    courses_path = os.path.join(data_dir, 'courses', 'courses_list.csv')

    if os.path.exists(courses_path):
        df = pd.read_csv(courses_path, encoding='utf-8-sig')

        # Helper that extracts a distance in meters from a course name.
        # 'semi-marathon' is tried before 'marathon': the shorter pattern is a
        # substring of the longer name and would otherwise match first.
        def extract_distance(course_name):
            if not isinstance(course_name, str):  # guard against NaN names
                return None

            patterns = [
                (r'(\d+)\s*km', lambda m: int(m.group(1)) * 1000),
                (r'(\d+)\s*m', lambda m: int(m.group(1))),
                (r'semi[-\s]?marathon', lambda m: 21097),
                (r'marathon', lambda m: 42195),
            ]

            for pattern, extractor in patterns:
                match = re.search(pattern, course_name, re.IGNORECASE)
                if match:
                    try:
                        return extractor(match)
                    except (ValueError, TypeError):
                        pass
            return None

        # Extract the distances
        df['distance_meters'] = df['nom'].apply(extract_distance)

        # Categorize
        def categorize_distance(distance):
            if pd.isna(distance):
                return 'Autre'
            elif distance < 400:
                return 'Sprint'
            elif distance < 2000:
                return 'Demi-fond'
            elif distance < 5000:
                return 'Fond'
            elif distance < 10000:
                return 'Intermédiaire'
            elif distance < 21000:
                return '10km'
            elif distance < 22000:
                return 'Semi-marathon'
            elif distance < 42000:
                # upper bound 42000 (not 43000) so that the marathon distance
                # of 42195 m lands in 'Marathon' below rather than here
                return 'Longue distance'
            elif distance < 50000:
                return 'Marathon'
            else:
                return 'Ultra'
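
        # Worked examples of the two steps above, extract_distance followed by
        # categorize_distance (hypothetical course names, not from the data):
        #   "Course 800 m"          -> 800 m    -> 'Demi-fond'
        #   "10 km de Paris"        -> 10000 m  -> '10km'
        #   "Semi-marathon de Lyon" -> 21097 m  -> 'Semi-marathon'
        #   "Marathon de Paris"     -> 42195 m  -> 'Marathon'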

        df['category'] = df['distance_meters'].apply(categorize_distance)

        # Statistics per category
        categories = df['category'].value_counts().reset_index()
        categories.columns = ['category', 'count']

        # Save the courses with their distances
        courses_with_distance = os.path.join(data_dir, 'courses', 'courses_with_distances.csv')
        df.to_csv(courses_with_distance, index=False, encoding='utf-8-sig')

        # Save the statistics (create the directory in case analyze_courses
        # did not run first)
        stats_dir = os.path.join(data_dir, 'statistics')
        os.makedirs(stats_dir, exist_ok=True)
        categories_file = os.path.join(stats_dir, 'courses_by_category.csv')
        categories.to_csv(categories_file, index=False, encoding='utf-8-sig')

        logging.info("✅ Distances extracted and exported")
        logging.info(f" Categories: {len(categories)}")
        logging.info("\nBreakdown by category:")
        for _, row in categories.head(10).iterrows():
            logging.info(f" {row['category']}: {row['count']} courses")

        return categories
    else:
        logging.warning("⚠️ Courses file not found")
        return None

def create_summary(data_dir):
    """Create a global summary"""
    logging.info("=== Creating the summary ===")

    summary_dir = os.path.join(data_dir, 'summary')
    os.makedirs(summary_dir, exist_ok=True)

    # Create the summary file
    summary_file = os.path.join(summary_dir, 'global_summary.txt')

    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("=" * 80 + "\n")
        f.write("GLOBAL SUMMARY OF THE FFA DATA\n")
        f.write("=" * 80 + "\n")
        f.write(f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        # Courses
        courses_path = os.path.join(data_dir, 'courses', 'courses_list.csv')
        if os.path.exists(courses_path):
            df_courses = pd.read_csv(courses_path, encoding='utf-8-sig')
            f.write("COURSES\n")
            f.write("-" * 40 + "\n")
            f.write(f"Total courses: {len(df_courses)}\n")

            df_courses['date'] = pd.to_datetime(df_courses['date'], errors='coerce')
            f.write(f"First course: {df_courses['date'].min()}\n")
            f.write(f"Last course: {df_courses['date'].max()}\n")

            years = df_courses['date'].dt.year.dropna().unique()
            f.write(f"Years covered: {len(years)} ({min(years)} to {max(years)})\n\n")

        # Results
        results_path = os.path.join(data_dir, 'resultats', 'results.csv')
        if os.path.exists(results_path):
            df_results = pd.read_csv(results_path, encoding='utf-8-sig')
            f.write("RESULTS\n")
            f.write("-" * 40 + "\n")
            f.write(f"Total results: {len(df_results)}\n")

            clubs = df_results['club'].nunique()
            f.write(f"Unique clubs: {clubs}\n")
            f.write(f"Unique athletes: {df_results['nom'].nunique()}\n\n")

        # Clubs
        clubs_path = os.path.join(data_dir, 'clubs', 'clubs_list.csv')
        if os.path.exists(clubs_path):
            df_clubs = pd.read_csv(clubs_path, encoding='utf-8-sig')
            f.write("CLUBS\n")
            f.write("-" * 40 + "\n")
            f.write(f"Total clubs: {len(df_clubs)}\n\n")
            f.write("Top 10 clubs:\n")
            # enumerate() for the rank, as in analyze_clubs
            for rank, (_, club) in enumerate(df_clubs.head(10).iterrows(), start=1):
                f.write(f" {rank}. {club['club']}: {club['athletes_count']} results\n")
            f.write("\n")

    logging.info(f"✅ Global summary written to {summary_file}")
    return summary_file

def main():
    """Entry point"""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    data_dir = sys.argv[1] if len(sys.argv) > 1 else 'data_2010_2026'

    logging.info("=" * 80)
    logging.info("FFA DATA POST-PROCESSING")
    logging.info("=" * 80)
    logging.info(f"Directory: {data_dir}\n")

    # Analyze the clubs
    clubs = analyze_clubs(data_dir)

    # Analyze the courses
    courses_stats = analyze_courses(data_dir)

    # Extract the distances
    categories = extract_distances_from_courses(data_dir)

    # Create the summary
    create_summary(data_dir)

    logging.info("\n" + "=" * 80)
    logging.info("POST-PROCESSING FINISHED")
    logging.info("=" * 80)

    # Print the final statistics
    if courses_stats:
        print("\n📊 FINAL STATISTICS:")
        print(f" Courses: {courses_stats['total']}")
        print(f" Years: {courses_stats['years']}")
        print(f" Types: {courses_stats['types']}")

    if clubs is not None:
        print(f" Clubs: {len(clubs)}")

    if categories is not None:
        print(f" Categories: {len(categories)}")

    print("\n✅ All data has been analyzed and exported!")
    print(f"📁 Main directory: {data_dir}")


if __name__ == "__main__":
    main()
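
# Example invocation (the filename scripts/post_process.py is an assumption;
# the default directory data_2010_2026 comes from main() above):
#
#   python scripts/post_process.py data_2010_2026
#
# With no argument the script falls back to data_2010_2026 and writes its
# outputs under <data_dir>/clubs, <data_dir>/statistics and <data_dir>/summary.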