Initial commit: reorganize the FFA Calendar Scraper project
- Create a clean directory tree (src/, scripts/, config/, data/, docs/, tests/)
- Move the Python modules into src/
- Move the standalone scripts into scripts/
- Clean up temporary files and __pycache__ directories
- Update README.md with complete documentation
- Update the imports in the scripts for the new layout (see the sketch below)
- Configure .gitignore to ignore generated data and logs
- Organize the data under data/ (courses, resultats, clubs, exports)

Project structure:
- src/: core modules (ffa_scraper, ffa_analyzer)
- scripts/: CLI scripts and utilities
- config/: configuration (config.env)
- data/: generated data
- docs/: documentation
- tests/: unit tests

💘 Generated with Crush
Assisted-by: GLM-4.7 via Crush <crush@charm.land>
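As a quick illustration of the import update mentioned above, this is the path-setup pattern the relocated scripts use (a minimal sketch; it assumes the script lives in scripts/, one level below the repository root and next to src/):

import os
import sys

# Make the modules in src/ importable from a script located in scripts/
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../src'))

from ffa_scraper import FFAScraper  # now resolved from src/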
scripts/scrape_all_periods.py (new executable file, 312 lines)
@@ -0,0 +1,312 @@
#!/usr/bin/env python3
"""
FFA scraping script with maximal multithreading.
Scrapes the calendar in 15-day periods, then runs the post-processing scripts.
"""

import os
import sys
import time
import logging
import subprocess
from datetime import datetime, timedelta
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import pandas as pd

# Make the scraper module in src/ importable
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../src'))
from ffa_scraper import FFAScraper

def get_15_day_periods(start_year=2010, end_year=2026):
    """Generate the 15-day periods between start_year and end_year."""
    periods = []

    start_date = datetime(start_year, 1, 1)
    end_date = datetime(end_year, 12, 31)

    current_date = start_date

    while current_date <= end_date:
        period_end = current_date + timedelta(days=14)
        if period_end > end_date:
            period_end = end_date

        period_name = f"{current_date.strftime('%Y-%m-%d')}_to_{period_end.strftime('%Y-%m-%d')}"

        periods.append({
            'name': period_name,
            'start': current_date,
            'end': period_end
        })

        current_date = period_end + timedelta(days=1)

    logging.info(f"Total number of 15-day periods: {len(periods)}")
    return periods

def scrape_period(period, period_index, total_periods):
    """Scrape a single period."""
    scraper = FFAScraper()

    start_str = period['start'].strftime('%Y-%m-%d')
    end_str = period['end'].strftime('%Y-%m-%d')
    year = period['start'].year

    # Build the calendar URL for this period
    url = (
        f"https://www.athle.fr/bases/liste.aspx?frmpostback=true"
        f"&frmbase=calendrier&frmmode=1&frmespace=0"
        f"&frmsaisonffa={year}"
        f"&frmdate1={start_str}&frmdate2={end_str}"
        f"&frmtype1=&frmniveau=&frmligue=&frmdepartement=&frmniveaulab="
        f"&frmepreuve=&frmtype2=&frmtype3=&frmtype4=&frmposition=4"
    )

    try:
        # Scrape this period via the custom URL; internal multithreading stays off
        # because parallelism is handled at the period level by the thread pool
        courses = scraper.get_courses_list(max_pages=1, use_multithreading=False, calendar_url=url)

        if courses:
            logging.info(f"[{period_index + 1}/{total_periods}] {len(courses)} courses from {start_str} to {end_str}")

            # Save immediately to a period-specific file
            output_dir = os.getenv('OUTPUT_DIR', 'data_2010_2026')
            period_dir = os.path.join(output_dir, 'courses', 'periods')
            os.makedirs(period_dir, exist_ok=True)

            period_file = os.path.join(period_dir, f"courses_{period['name']}.csv")
            df = pd.DataFrame(courses)
            df.to_csv(period_file, index=False, encoding='utf-8-sig')

            return {
                'period': period,
                'courses': courses,
                'success': True
            }
        else:
            return {
                'period': period,
                'courses': [],
                'success': True
            }

    except Exception as e:
        logging.error(f"Error for {start_str} to {end_str}: {e}")
        return {
            'period': period,
            'courses': [],
            'success': False,
            'error': str(e)
        }
    finally:
        scraper._close_all_selenium()

def scrape_all_periods_multithreaded(periods, max_workers=8):
    """Scrape every period with maximal multithreading."""
    all_courses = []

    total_periods = len(periods)
    logging.info(f"=== Scraping with {max_workers} workers ===")
    logging.info(f"Periods to scrape: {total_periods}")

    with ThreadPoolExecutor(max_workers=max_workers, thread_name_prefix='scraper') as executor:
        # Submit every task
        future_to_period = {
            executor.submit(scrape_period, period, i, total_periods): i
            for i, period in enumerate(periods)
        }

        # Progress bar
        with tqdm(total=total_periods, desc="Periods scraped", unit="period") as pbar:
            for future in as_completed(future_to_period):
                period_index = future_to_period[future]
                try:
                    result = future.result()
                    all_courses.extend(result['courses'])
                    pbar.update(1)
                    pbar.set_postfix({
                        'total': len(all_courses),
                        'success': result['success']
                    })
                except Exception as e:
                    logging.error(f"Error on period {period_index}: {e}")
                    pbar.update(1)

    return all_courses

def merge_all_period_courses(output_dir):
    """Merge all the per-period CSV files."""
    logging.info("\n=== Merging all period CSV files ===")

    periods_dir = os.path.join(output_dir, 'courses', 'periods')
    all_courses = []

    # Read every CSV file
    if os.path.exists(periods_dir):
        period_files = [f for f in os.listdir(periods_dir) if f.endswith('.csv')]

        for period_file in tqdm(period_files, desc="Merging files"):
            file_path = os.path.join(periods_dir, period_file)
            try:
                df = pd.read_csv(file_path, encoding='utf-8-sig')
                all_courses.append(df)
            except Exception as e:
                logging.warning(f"Error while reading {period_file}: {e}")

    if all_courses:
        # Concatenate all the DataFrames
        merged_df = pd.concat(all_courses, ignore_index=True)

        # Save the consolidated file
        courses_list_path = os.path.join(output_dir, 'courses', 'courses_list.csv')
        os.makedirs(os.path.dirname(courses_list_path), exist_ok=True)
        merged_df.to_csv(courses_list_path, index=False, encoding='utf-8-sig')

        logging.info(f"✅ Merged {len(all_courses)} files into {courses_list_path}")
        logging.info(f"   Total: {len(merged_df)} courses")

        return merged_df
    else:
        logging.error("❌ No CSV files to merge")
        return None

def run_post_processing(output_dir):
    """Run the post-processing scripts."""
    logging.info("\n=== Running the post-processing scripts ===")

    # The post-processing scripts are expected to live alongside this one in scripts/,
    # so resolve them relative to this file rather than the current working directory
    scripts_dir = os.path.dirname(os.path.abspath(__file__))

    # Run the main post-processing script
    post_process_script = os.path.join(scripts_dir, 'post_process.py')

    if os.path.exists(post_process_script):
        logging.info("\n📝 Running post_process.py...")
        try:
            result = subprocess.run(
                [sys.executable, post_process_script, output_dir],
                capture_output=True,
                text=True,
                timeout=600
            )

            if result.returncode == 0:
                logging.info("✅ post_process.py finished successfully")

                # Show the last lines of its output
                output_lines = result.stdout.split('\n')
                for line in output_lines[-30:]:  # last 30 lines
                    if line.strip():
                        logging.info(f"   {line}")
            else:
                logging.error("❌ post_process.py failed")
                logging.error(f"   Error: {result.stderr[:500]}")

        except subprocess.TimeoutExpired:
            logging.warning("⏰ post_process.py timed out after 10 minutes")
        except Exception as e:
            logging.error(f"❌ Error while running post_process.py: {e}")
    else:
        logging.warning("⚠️ Script post_process.py not found")

    # Run the additional utility scripts
    additional_scripts = [
        ('list_clubs.py', ['--output', output_dir, '--details']),
        ('extract_races.py', ['--data-dir', output_dir, '--details']),
    ]

    for script_name, args in additional_scripts:
        script_path = os.path.join(scripts_dir, script_name)

        if os.path.exists(script_path):
            logging.info(f"\n📝 Running {script_name}...")
            try:
                result = subprocess.run(
                    [sys.executable, script_path] + args,
                    capture_output=True,
                    text=True,
                    timeout=300
                )

                if result.returncode == 0:
                    logging.info(f"✅ {script_name} finished successfully")
                else:
                    logging.warning(f"⚠️ {script_name} reported errors")

            except subprocess.TimeoutExpired:
                logging.warning(f"⏰ {script_name} timed out after 5 minutes")
            except Exception as e:
                logging.warning(f"⚠️ Error while running {script_name}: {e}")
        else:
            logging.warning(f"⚠️ Script {script_name} not found")

def main():
    """Main entry point."""
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler('ffa_scraper.log'),
            logging.StreamHandler()
        ]
    )

    # Configuration
    start_year = 2010
    end_year = 2026
    max_workers = 8  # number of worker threads

    logging.info(f"{'='*80}")
    logging.info(f"FULL FFA SCRAPE ({start_year}-{end_year})")
    logging.info(f"{'='*80}")
    logging.info(f"Mode: multithreading with {max_workers} workers")
    logging.info("Periods: 15 days each")

    # Generate the periods
    periods = get_15_day_periods(start_year, end_year)

    # Scrape every period
    start_time = time.time()
    all_courses = scrape_all_periods_multithreaded(periods, max_workers)
    end_time = time.time()

    # Statistics
    logging.info(f"\n{'='*80}")
    logging.info("SCRAPING SUMMARY")
    logging.info(f"{'='*80}")
    logging.info(f"Total time: {(end_time - start_time)/60:.1f} minutes")
    logging.info(f"Courses retrieved: {len(all_courses)}")
    logging.info(f"Average time per period: {(end_time - start_time)/len(periods):.1f} seconds")

    # Merge all the period CSV files
    output_dir = os.getenv('OUTPUT_DIR', 'data_2010_2026')
    merged_df = merge_all_period_courses(output_dir)

    if merged_df is not None:
        # Additional statistics
        print(f"\n{'='*80}")
        print("COURSE STATISTICS")
        print(f"{'='*80}")
        print(f"Total: {len(merged_df)} courses")

        # Courses per year
        merged_df['date'] = pd.to_datetime(merged_df['date'], errors='coerce')
        merged_df['année'] = merged_df['date'].dt.year

        print("\nCourses per year:")
        for year in sorted(merged_df['année'].dropna().unique()):
            count = len(merged_df[merged_df['année'] == year])
            print(f"   {int(year)}: {count} courses")

        print("\n✅ Scraping finished successfully!")

        # Run the post-processing scripts
        run_post_processing(output_dir)

        print(f"\n{'='*80}")
        print(f"ALL DATA IS AVAILABLE IN: {output_dir}")
        print(f"{'='*80}")
    else:
        logging.error("❌ Error while merging the files")
        sys.exit(1)

if __name__ == "__main__":
    main()