Initial commit: Réorganiser le projet FFA Calendar Scraper

- Créer une arborescence propre (src/, scripts/, config/, data/, docs/, tests/) - Déplacer les modules Python dans src/ - Déplacer les scripts autonomes dans scripts/ - Nettoyer les fichiers temporaires et __pycache__ - Mettre à jour le README.md avec documentation complète - Mettre à jour les imports dans les scripts pour la nouvelle structure - Configurer le .gitignore pour ignorer les données et logs - Organiser les données dans data/ (courses, resultats, clubs, exports) Structure du projet: - src/: Modules principaux (ffa_scraper, ffa_analyzer) - scripts/: Scripts CLI et utilitaires - config/: Configuration (config.env) - data/: Données générées - docs/: Documentation - tests/: Tests unitaires 💘 Generated with Crush Assisted-by: GLM-4.7 via Crush <crush@charm.land>
2026-01-01 18:05:14 +01:00
commit a5406a4e89
16 changed files with 3920 additions and 0 deletions
--- a/scripts/ffa_cli.py
+++ b/scripts/ffa_cli.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+"""
+Interface en ligne de commande pour le scraper FFA
+"""
+
+import argparse
+import sys
+import os
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../src'))
+from ffa_scraper import FFAScraper
+from ffa_analyzer import FFADataAnalyzer
+import logging
+
+def setup_logging(verbose=False):
+    """Configure root logging: DEBUG level when ``verbose`` is true, INFO otherwise."""
+    log_format = '%(asctime)s - %(levelname)s - %(message)s'
+    if verbose:
+        logging.basicConfig(level=logging.DEBUG, format=log_format)
+    else:
+        logging.basicConfig(level=logging.INFO, format=log_format)
+
+def scrape_command(args):
+    """Run a scrape with the parsed CLI options and print the club/course/result counts."""
+    scraper = FFAScraper(output_dir=args.output)
+
+    # Threads are used only when --no-multithreading was not passed.
+    threaded = args.multithreading and not args.no_multithreading
+    options = {
+        'limit_courses': args.limit_courses,
+        'limit_results': args.limit_results,
+        'fetch_details': args.fetch_details,
+        'max_pages': args.max_pages,
+        'use_multithreading': threaded,
+    }
+    stats = scraper.scrap_all_data(**options)
+
+    print(f"\nScraping terminé:")
+    for label, key in (('Clubs', 'clubs_count'), ('Courses', 'courses_count'), ('Résultats', 'results_count')):
+        print(f"- {label}: {stats[key]}")
+
+def check_command(args):
+    """Print pagination info and a scraping-time estimate; with --auto, then scrape all detected pages."""
+    scraper = FFAScraper(output_dir=args.output)
+
+    total_pages, total_courses, courses_per_page = scraper._detect_pagination_info()
+
+    print("\n" + "="*60)
+    print("📊 Informations de pagination")
+    print("="*60)
+
+    if total_pages:
+        print(f"Nombre total de pages: {total_pages}")
+        print(f"Estimation du nombre total de courses: ~{total_courses}")
+        print(f"Courses par page: ~{courses_per_page}")
+
+        print(f"\n⏱️  Estimation du temps de scraping:")
+        use_multithreading = args.multithreading and not args.no_multithreading
+        if use_multithreading:
+            print(f"   - Multithreading (4 workers): ~{total_pages / 4 * 2:.0f} secondes")
+        else:
+            print(f"   - Séquentiel: ~{total_pages * 2:.0f} secondes")
+
+        if total_pages > 10:
+            print(f"\n⚠️  Attention: {total_pages} pages à scraper!")
+        # --auto starts the scrape whatever the page count, not only above the warning threshold.
+        if args.auto:
+            print(f"\nUtilisation de {total_pages} pages pour le scraping.")
+            stats = scraper.scrap_all_data(
+                limit_courses=args.limit_courses,
+                limit_results=args.limit_results,
+                fetch_details=args.fetch_details,
+                max_pages=total_pages,
+                use_multithreading=use_multithreading
+            )
+            print(f"\nScraping terminé:")
+            print(f"- Courses: {stats['courses_count']}")
+    else:
+        print("⚠️  Impossible de détecter la pagination. Utilisez --max-pages pour spécifier le nombre de pages.")
+
+    print("="*60)
+
+def list_command(args):
+    """Print an overview of the courses, results and clubs loaded by the analyzer."""
+    analyzer = FFADataAnalyzer(data_dir=args.data_dir)
+
+    print("\n=== Données disponibles ===")
+
+    courses = analyzer.courses_df
+    if courses is not None:
+        print(f"\n📅 Courses: {len(courses)} compétitions")
+        if len(courses) > 0:
+            print("   Types de courses:")
+            for course_type, count in courses['type'].value_counts().head(5).items():  # first 5 counts
+                print(f"   - {course_type}: {count}")
+
+    results = analyzer.results_df
+    if results is not None:
+        print(f"\n🏃 Résultats: {len(results)} entrées")
+        if len(results) > 0:
+            print("   Clubs les plus représentés:")
+            for club, count in results['club'].value_counts().head(5).items():  # first 5 counts
+                print(f"   - {club}: {count} résultats")
+
+            print("\n   Premiers résultats:")
+            for rank, row in enumerate(results.head(3).to_dict('records'), start=1):
+                print(f"   {rank}. {row.get('prenom', '')} {row.get('nom', '')} - {row.get('club', '')} - Place: {row.get('place', '')}")
+
+    if analyzer.clubs_df is not None and len(analyzer.clubs_df) > 0:
+        print(f"\n🏟️  Clubs: {len(analyzer.clubs_df)} clubs")
+
+    print("\n=== === ===\n")
+
+def search_command(args):
+    """Search athletes, courses (by date range) or a club and print the first matches."""
+    analyzer = FFADataAnalyzer(data_dir=args.data_dir)
+
+    if args.type == 'athlete':
+        matches = analyzer.search_athlete(args.nom, args.prenom)
+        print(f"\nTrouvé {len(matches)} résultats pour {args.nom} {args.prenom or ''}")
+        # Display is capped at the first 20 matches.
+        for rank, row in enumerate(matches[:20], start=1):
+            print(f"{rank}. {row['prenom']} {row['nom']} - {row['club']} - Place: {row['place']} - {row.get('course_url', '')}")
+
+    elif args.type == 'course':
+        found = analyzer.get_course_by_date(args.start_date, args.end_date)
+        print(f"\nTrouvé {len(found)} courses entre {args.start_date} et {args.end_date}")
+        # Display is capped at the first 20 courses.
+        for rank, course in enumerate(found[:20], start=1):
+            print(f"{rank}. {course.get('nom', 'Inconnu')} - {course.get('date', 'Date inconnue')} - {course.get('lieu', 'Lieu inconnu')}")
+
+    elif args.type == 'club':
+        club_info = analyzer.search_club_in_results(args.nom)
+        athletes = club_info.get('athletes') if club_info else None
+        if not athletes:
+            print(f"\nAucun résultat trouvé pour le club: {args.nom}")
+            return
+        print(f"\nClub: {args.nom}")
+        print(f"Athlètes: {len(athletes)}")
+        for rank, athlete in enumerate(athletes[:10], start=1):
+            print(f"{rank}. {athlete['prenom']} {athlete['nom']} - {len(athlete['results'])} résultats")
+
+def stats_command(args):
+    """Print an athlete's statistics, or the per-club ranking of one course (--course-url)."""
+    analyzer = FFADataAnalyzer(data_dir=args.data_dir)
+    if args.type == 'athlete':
+        stats = analyzer.get_athlete_stats(args.nom, args.prenom)
+        if stats:
+            print(f"\nStatistiques pour {stats['prenom']} {stats['nom']}:")
+            print(f"- Club: {stats.get('club', 'Inconnu')}")
+            print(f"- Courses total: {stats.get('total_courses', 0)}")
+            print(f"- Victoires: {stats.get('victoires', 0)}")
+            print(f"- Podiums: {stats.get('podiums', 0)}")
+            print(f"- Catégories: {', '.join(stats.get('categories', []))}")
+            print(f"- Courses par année: {stats.get('courses_par_annee', {})}")
+        else:  # report the miss instead of finishing with no output at all
+            print("Aucun résultat trouvé pour cet athlète")
+
+    elif args.type == 'club':
+        rankings = analyzer.get_club_rankings(args.course_url)
+        print(f"\nClassement par club pour la course {args.course_url}:")
+        for i, club in enumerate(rankings[:10], 1):  # only the first 10 clubs are shown
+            print(f"{i}. {club['club']} - Score: {club['score']} - Participants: {club['participants']}")
+
+def top_command(args):
+    """Print the top athletes returned by the analyzer for --limit and --min-results."""
+    analyzer = FFADataAnalyzer(data_dir=args.data_dir)
+    ranking = analyzer.get_top_athletes(limit=args.limit, min_results=args.min_results)
+
+    print(f"\n=== Top {len(ranking)} athlètes ===")
+    print(f"(Minimum {args.min_results} résultats)\n")
+
+    for rank, entry in enumerate(ranking, start=1):
+        print(f"{rank}. {entry['prenom']} {entry['nom']}")
+        print(f"   Club: {entry.get('club', 'Inconnu')}")
+        print(f"   Victoires: {entry['victoires']} | Podiums: {entry['podiums']} | Courses: {entry['results_count']}")
+        average_place = entry.get('place_moyenne')
+        if average_place:  # line is skipped when the value is missing or falsy
+            print(f"   Place moyenne: {average_place:.2f}")
+        print()
+
+def export_command(args):
+    """Export an athlete's results to CSV and print the path, or a not-found message."""
+    analyzer = FFADataAnalyzer(data_dir=args.data_dir)
+
+    if args.type != 'athlete':
+        return
+    filepath = analyzer.export_athlete_csv(args.nom, args.prenom, args.filename)
+    # A falsy return value is reported as "no results for this athlete".
+    message = f"Exporté dans: {filepath}" if filepath else "Aucun résultat trouvé pour cet athlète"
+    print(message)
+
+def main():
+    """Parse the command line, configure logging and dispatch to the selected subcommand."""
+    parser = argparse.ArgumentParser(description="FFA Calendar Scraper - Outil de scraping et d'analyse des données de la FFA")
+    parser.add_argument('--verbose', '-v', action='store_true', help='Mode verbeux')
+
+    commands = parser.add_subparsers(dest='command', help='Commande à exécuter')
+
+    # scrape command
+    scrape_cmd = commands.add_parser('scrape', help='Lancer le scraping des données')
+    scrape_cmd.add_argument('--output', '-o', default='data', help='Répertoire de sortie des données')
+    scrape_cmd.add_argument('--limit-courses', type=int, help='Limiter le nombre de courses à scraper')
+    scrape_cmd.add_argument('--limit-results', type=int, help='Limiter le nombre de résultats à scraper')
+    scrape_cmd.add_argument('--fetch-details', action='store_true', help='Récupérer les détails et résultats de chaque course (plus lent)')
+    scrape_cmd.add_argument('--max-pages', type=int, default=10, help='Nombre maximum de pages à scraper (défaut: 10)')
+    scrape_cmd.add_argument('--multithreading', action='store_true', default=True, help='Activer le multithreading pour accélérer le scraping (défaut: True)')
+    scrape_cmd.add_argument('--no-multithreading', action='store_true', help='Désactiver le multithreading (scraping séquentiel)')
+
+    # list command
+    list_cmd = commands.add_parser('list', help='Lister les données disponibles')
+    list_cmd.add_argument('--data-dir', default='data', help='Répertoire des données')
+
+    # search command
+    search_cmd = commands.add_parser('search', help='Rechercher des données')
+    search_cmd.add_argument('type', choices=['athlete', 'club', 'course'], help='Type de recherche')
+    search_cmd.add_argument('--data-dir', default='data', help='Répertoire des données')
+
+    # Search filters: name/first name for athlete and club lookups, date range for courses
+    search_cmd.add_argument('--nom', help="Nom de l'athlète ou du club")
+    search_cmd.add_argument('--prenom', help="Prénom de l'athlète")
+    search_cmd.add_argument('--start-date', help='Date de début (format: YYYY-MM-DD)')
+    search_cmd.add_argument('--end-date', help='Date de fin (format: YYYY-MM-DD)')
+
+    # stats command
+    stats_cmd = commands.add_parser('stats', help='Afficher des statistiques')
+    stats_cmd.add_argument('type', choices=['athlete', 'club'], help='Type de statistiques')
+    stats_cmd.add_argument('--nom', help="Nom de l'athlète ou du club")
+    stats_cmd.add_argument('--prenom', help="Prénom de l'athlète")
+    stats_cmd.add_argument('--course-url', help='URL de la course pour le classement par club')
+    stats_cmd.add_argument('--data-dir', default='data', help='Répertoire des données')
+
+    # top command
+    top_cmd = commands.add_parser('top', help='Afficher le top des athlètes')
+    top_cmd.add_argument('--limit', type=int, default=10, help="Nombre d'athlètes à afficher (défaut: 10)")
+    top_cmd.add_argument('--min-results', type=int, default=3, help='Nombre minimum de résultats (défaut: 3)')
+    top_cmd.add_argument('--data-dir', default='data', help='Répertoire des données')
+
+    # export command
+    export_cmd = commands.add_parser('export', help='Exporter des données en CSV')
+    export_cmd.add_argument('type', choices=['athlete'], help="Type d'export")
+    export_cmd.add_argument('--nom', help="Nom de l'athlète ou du club")
+    export_cmd.add_argument('--prenom', help="Prénom de l'athlète")
+    export_cmd.add_argument('--filename', help='Nom du fichier de sortie')
+    export_cmd.add_argument('--data-dir', default='data', help='Répertoire des données')
+
+    # check command
+    check_cmd = commands.add_parser('check', help='Vérifier le nombre total de courses disponibles')
+    check_cmd.add_argument('--output', '-o', default='data', help='Répertoire de sortie des données')
+    check_cmd.add_argument('--limit-courses', type=int, help='Limiter le nombre de courses à scraper')
+    check_cmd.add_argument('--limit-results', type=int, help='Limiter le nombre de résultats à scraper')
+    check_cmd.add_argument('--fetch-details', action='store_true', help='Récupérer les détails et résultats de chaque course (plus lent)')
+    check_cmd.add_argument('--auto', action='store_true', help='Lancer automatiquement le scraping après la vérification')
+    check_cmd.add_argument('--multithreading', action='store_true', default=True, help='Activer le multithreading pour accélérer le scraping (défaut: True)')
+    check_cmd.add_argument('--no-multithreading', action='store_true', help='Désactiver le multithreading (scraping séquentiel)')
+
+    args = parser.parse_args()
+
+    if not args.command:
+        parser.print_help()
+        return
+
+    setup_logging(args.verbose)
+
+    try:
+        handlers = {
+            'scrape': scrape_command,
+            'list': list_command,
+            'search': search_command,
+            'top': top_command,
+            'stats': stats_command,
+            'export': export_command,
+            'check': check_command,
+        }
+        # A command name without a handler is silently ignored.
+        handler = handlers.get(args.command)
+        if handler is not None:
+            handler(args)
+    except Exception as exc:
+        logging.error(f"Erreur lors de l'exécution de la commande {args.command}: {exc}")
+        sys.exit(1)
+
+if __name__ == "__main__":  # run the CLI only when executed as a script, not on import
+    main()