Files
pokemon-battle-engine/tools/data/pokemon_downloader.py
2025-08-14 18:50:14 +09:00

778 lines
30 KiB
Python

#!/usr/bin/env python3
"""
Pokemon Data Downloader
This module provides functionality to download Pokemon data from the PokeAPI
using the pokebase library. It supports segmented downloading to allow for
efficient data management and testing with smaller datasets.
Features:
- Download Pokemon species data (stats, types, abilities)
- Download move data (power, accuracy, effects, type)
- Download type effectiveness data
- Segmented downloading by ID ranges
- Data validation and caching
- Progress tracking with rich progress bars
- Export to JSON format for C++ integration
"""
import json
import logging
import time
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import Dict, List, Optional, Set, Tuple, Any
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
import pokebase as pb
from rich.console import Console
from rich.progress import Progress, TaskID, SpinnerColumn, TextColumn, BarColumn, TimeRemainingColumn
from rich.table import Table
from rich.panel import Panel
import click
from .schemas import DataValidator
# Configure logging
# Module-level logger and a single shared rich console: every method in this
# file routes diagnostics through `logger` and user-facing output through
# `console`, so both are created once at import time.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
console = Console()
@dataclass
class PokemonStats:
    """Pokemon base stats structure.

    Holds the six base stats as reported by PokeAPI; populated in
    ``PokemonDownloader.download_pokemon`` from the API's stat map,
    defaulting each value to 0 when a stat is missing.
    """
    hp: int               # base Hit Points
    attack: int           # base physical Attack
    defense: int          # base physical Defense
    special_attack: int   # from the API's "special-attack" stat name
    special_defense: int  # from the API's "special-defense" stat name
    speed: int            # base Speed
@dataclass
class PokemonData:
    """Complete Pokemon data structure for battle simulation.

    Serialized to JSON (via ``dataclasses.asdict``) for C++ integration;
    see ``PokemonDownloader.save_pokemon_data``.
    """
    id: int                   # Pokemon ID as returned by the API
    name: str                 # API name (lowercase, e.g. "bulbasaur")
    types: List[str]          # type names extracted from pokemon.types
    base_stats: PokemonStats  # six base stats
    abilities: List[str]      # ability names extracted from pokemon.abilities
    moves: List[int]  # Move IDs that this Pokemon can learn
    weight: int               # raw API weight — presumably hectograms; TODO confirm units
    height: int               # raw API height — presumably decimetres; TODO confirm units
    base_experience: int      # base XP yield; 0 when the API reports null
@dataclass
class MoveData:
    """Move data structure for battle simulation.

    Populated in ``PokemonDownloader.download_move`` with fallback defaults
    for fields the API may report as null.
    """
    id: int                       # move ID as returned by the API
    name: str                     # API move name (lowercase)
    type: str                     # move's type name; "normal" when API omits it
    power: Optional[int]          # None when the API reports no power value
    accuracy: Optional[int]       # None when the API reports no accuracy value
    pp: int                       # Power Points; 0 when the API reports null
    priority: int                 # turn priority; 0 when the API reports null
    damage_class: str  # physical, special, or status
    effect_id: Optional[int]      # always None — not available from this API version
    effect_chance: Optional[int]  # secondary-effect chance, straight from the API
    target: str                   # API target name; "selected-pokemon" when omitted
    description: str              # English effect text; empty when none found
@dataclass
class TypeEffectiveness:
    """Type effectiveness data structure.

    One entry per (attacking, defending) type pair produced by
    ``PokemonDownloader.download_type_effectiveness``.
    """
    attacking_type: str
    defending_type: str
    damage_factor: float  # 0.0, 0.5, 1.0, 2.0
    generation: str  # generation name (e.g., "generation-i", "generation-ii")
class PokemonDownloader:
    """Main class for downloading Pokemon data from PokeAPI.

    Responsibilities:
    - Rate-limited, retrying single-item downloads (Pokemon, moves).
    - Concurrent batch downloads with rich progress bars.
    - Generation-aware type-effectiveness extraction.
    - JSON export with optional schema validation.
    """

    def __init__(self, output_dir: Path = Path("data"), cache_dir: Path = Path(".cache"), validate_data: bool = True):
        """
        Initialize the Pokemon downloader.

        Args:
            output_dir: Directory to save the downloaded data
            cache_dir: Directory for caching API responses
            validate_data: Whether to validate data before saving
        """
        self.output_dir = Path(output_dir)
        self.cache_dir = Path(cache_dir)
        # parents=True so a nested path (e.g. "build/data") is created on a
        # fresh checkout instead of raising FileNotFoundError.
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        # Data validation (validator stays None when disabled).
        self.validate_data = validate_data
        self.validator = DataValidator() if validate_data else None
        # Thread safety for concurrent downloads.
        self._lock = threading.Lock()
        self._downloaded_pokemon: Set[int] = set()
        self._downloaded_moves: Set[int] = set()
        # Rate-limiting state, guarded by self._lock.
        self._last_request_time = 0.0
        self._min_request_interval = 0.1  # 100ms between requests

    def _rate_limit(self):
        """Enforce a minimum interval between API requests.

        The sleep happens while holding the lock, which intentionally
        serializes worker threads so the *global* request rate stays bounded.
        """
        with self._lock:
            current_time = time.time()
            time_since_last = current_time - self._last_request_time
            if time_since_last < self._min_request_interval:
                time.sleep(self._min_request_interval - time_since_last)
            self._last_request_time = time.time()

    def _safe_api_call(self, func, *args, **kwargs):
        """Make a rate-limited API call with up to 3 attempts.

        Args:
            func: API callable to invoke
            *args, **kwargs: forwarded to ``func``

        Returns:
            Whatever ``func`` returns.

        Raises:
            Exception: re-raises the final error when every attempt fails.
        """
        self._rate_limit()
        max_retries = 3
        for attempt in range(max_retries):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                if attempt == max_retries - 1:
                    logger.error(f"API call failed after {max_retries} attempts: {e}")
                    raise
                logger.warning(f"API call attempt {attempt + 1} failed, retrying: {e}")
                # Linear backoff: 1s after the first failure, 2s after the
                # second (the original comment said "exponential", which the
                # arithmetic never was).
                time.sleep(1.0 * (attempt + 1))

    def download_pokemon(self, pokemon_id: int) -> Optional[PokemonData]:
        """
        Download data for a single Pokemon.

        Args:
            pokemon_id: The Pokemon ID to download

        Returns:
            PokemonData object or None if download failed
        """
        try:
            pokemon = self._safe_api_call(pb.pokemon, pokemon_id)
            # NOTE(review): the species endpoint is fetched but its payload is
            # never consumed; kept to preserve existing network behavior.
            species = self._safe_api_call(pb.pokemon_species, pokemon_id)
            # Map API stat names (e.g. "special-attack") to base values,
            # defaulting any missing stat to 0.
            stats_map = {stat.stat.name: stat.base_stat for stat in pokemon.stats}
            base_stats = PokemonStats(
                hp=stats_map.get('hp', 0),
                attack=stats_map.get('attack', 0),
                defense=stats_map.get('defense', 0),
                special_attack=stats_map.get('special-attack', 0),
                special_defense=stats_map.get('special-defense', 0),
                speed=stats_map.get('speed', 0)
            )
            types = [t.type.name for t in pokemon.types]
            abilities = [ability.ability.name for ability in pokemon.abilities]
            # Move IDs are parsed out of each move URL (".../move/<id>/");
            # non-numeric segments are skipped defensively.
            raw_move_ids = (move.move.url.split('/')[-2] for move in pokemon.moves)
            moves = [int(move_id) for move_id in raw_move_ids if move_id.isdigit()]
            return PokemonData(
                id=pokemon.id,
                name=pokemon.name,
                types=types,
                base_stats=base_stats,
                abilities=abilities,
                moves=moves,
                weight=pokemon.weight,
                height=pokemon.height,
                base_experience=pokemon.base_experience or 0
            )
        except Exception as e:
            logger.error(f"Failed to download Pokemon {pokemon_id}: {e}")
            return None

    def download_move(self, move_id: int) -> Optional[MoveData]:
        """
        Download data for a single move.

        Args:
            move_id: The move ID to download

        Returns:
            MoveData object or None if download failed
        """
        try:
            move = self._safe_api_call(pb.move, move_id)
            # Prefer the short English effect text, falling back to the long
            # form, then to an empty string.
            description = ""
            for entry in move.effect_entries or []:
                if entry.language.name == 'en':
                    description = entry.short_effect or entry.effect or ""
                    break
            return MoveData(
                id=move.id,
                name=move.name,
                type=move.type.name if move.type else "normal",
                power=move.power,
                accuracy=move.accuracy,
                pp=move.pp or 0,
                priority=move.priority or 0,
                damage_class=move.damage_class.name if move.damage_class else "status",
                effect_id=None,  # Effect ID not directly available in this API version
                effect_chance=move.effect_chance,
                target=move.target.name if move.target else "selected-pokemon",
                description=description
            )
        except Exception as e:
            logger.error(f"Failed to download move {move_id}: {e}")
            return None

    def download_type_effectiveness(self, target_generation: str = "generation-i") -> List[TypeEffectiveness]:
        """
        Download type effectiveness data for a specific generation.

        Args:
            target_generation: Generation to get effectiveness for (default: "generation-i")

        Returns:
            List of TypeEffectiveness objects for the specified generation
        """
        effectiveness_data: List[TypeEffectiveness] = []
        try:
            # Cache each type's introduction generation once up front to
            # avoid repeated API calls.
            console.print("🔍 Building type generation cache...")
            type_generations = self._build_type_generation_cache()
            # Keep only types introduced on or before the target generation.
            target_gen_index = self._generation_order.index(target_generation)
            valid_types = [
                type_name for type_name, gen_name in type_generations.items()
                if self._generation_order.index(gen_name) <= target_gen_index
            ]
            console.print(f"📊 Processing {len(valid_types)} types for {target_generation}")
            for type_name in valid_types:
                try:
                    type_data = self._safe_api_call(pb.type_, type_name)
                    current_relations = self._get_damage_relations_for_generation(
                        type_data, target_generation, type_generations
                    )
                    for defending_type, damage_factor in current_relations.items():
                        effectiveness_data.append(TypeEffectiveness(
                            attacking_type=type_name,
                            defending_type=defending_type,
                            damage_factor=damage_factor,
                            generation=target_generation
                        ))
                except Exception as e:
                    # One bad type should not abort the whole chart.
                    logger.warning(f"Failed to process type {type_name}: {e}")
                    continue
            console.print(f"✅ Processed {len(effectiveness_data)} type effectiveness entries")
            return effectiveness_data
        except Exception as e:
            logger.error(f"Failed to download type effectiveness: {e}")
            return []

    def _build_type_generation_cache(self) -> Dict[str, str]:
        """
        Build a cache of type names to their generation.

        Returns:
            Dictionary mapping type name to generation name
        """
        type_generations: Dict[str, str] = {}
        # List of all known Pokemon types across all generations.
        all_types = [
            'normal', 'fire', 'water', 'electric', 'grass', 'ice',
            'fighting', 'poison', 'ground', 'flying', 'psychic',
            'bug', 'rock', 'ghost', 'dragon', 'dark', 'steel',
            'fairy'
        ]
        for type_name in all_types:
            try:
                type_data = self._safe_api_call(pb.type_, type_name)
                type_generations[type_name] = type_data.generation.name
            except Exception as e:
                # A missing entry simply excludes that type downstream.
                logger.warning(f"Failed to get generation for type {type_name}: {e}")
                continue
        return type_generations

    @property
    def _generation_order(self) -> List[str]:
        """Canonical ordering of generation names, used for index comparisons."""
        return [
            'generation-i', 'generation-ii', 'generation-iii', 'generation-iv',
            'generation-v', 'generation-vi', 'generation-vii', 'generation-viii',
            'generation-ix'
        ]

    def _get_damage_relations_for_generation(self, type_data, target_generation: str, type_generations: Dict[str, str]) -> Dict[str, float]:
        """
        Extract damage relations for a specific generation from type data.

        Args:
            type_data: Type data from PokeAPI
            target_generation: Target generation name
            type_generations: Cache of type names to their generation

        Returns:
            Dictionary mapping defending type to damage factor
        """
        relations: Dict[str, float] = {}
        if type_data.generation.name == target_generation:
            # Current generation — use current damage relations.
            damage_relations = type_data.damage_relations
        else:
            # Prefer historical relations recorded for the target generation;
            # fall back to the current relations when none exist.
            damage_relations = None
            for past_relation in type_data.past_damage_relations:
                if past_relation.generation.name == target_generation:
                    damage_relations = past_relation.damage_relations
                    break
            if damage_relations is None:
                damage_relations = type_data.damage_relations
        # Translate the three API relation lists into numeric multipliers,
        # skipping defending types that did not exist yet in the target
        # generation. (Previously three copy-pasted loops.)
        target_gen_index = self._generation_order.index(target_generation)
        for attr_name, factor in (
            ('double_damage_to', 2.0),
            ('half_damage_to', 0.5),
            ('no_damage_to', 0.0),
        ):
            for relation in getattr(damage_relations, attr_name, None) or []:
                defending_gen = type_generations.get(relation.name)
                if defending_gen is None:
                    continue
                if self._generation_order.index(defending_gen) <= target_gen_index:
                    relations[relation.name] = factor
        return relations

    def _progress(self) -> Progress:
        """Build the standard progress-bar layout shared by batch downloads."""
        return Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
            TimeRemainingColumn(),
            console=console
        )

    def _download_batch(self, item_ids: List[int], fetch, registry: Set[int], description: str, label: str, max_workers: int) -> Dict[int, Any]:
        """Download many items concurrently with a shared progress bar.

        Common engine behind ``download_pokemon_batch`` and
        ``download_moves_batch`` (previously duplicated).

        Args:
            item_ids: IDs to download
            fetch: Callable taking one ID and returning data or None
            registry: Set recording successfully downloaded IDs (lock-guarded)
            description: Progress-bar task description
            label: Item label used in error log messages ("Pokemon"/"Move")
            max_workers: Maximum number of concurrent downloads

        Returns:
            Dictionary mapping ID to fetched data; failed IDs are omitted.
        """
        results: Dict[int, Any] = {}
        with self._progress() as progress:
            task = progress.add_task(description, total=len(item_ids))
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                future_to_id = {
                    executor.submit(fetch, item_id): item_id
                    for item_id in item_ids
                }
                for future in as_completed(future_to_id):
                    item_id = future_to_id[future]
                    try:
                        result = future.result()
                        if result:
                            results[item_id] = result
                            with self._lock:
                                registry.add(item_id)
                    except Exception as e:
                        logger.error(f"{label} {item_id} download failed: {e}")
                    progress.update(task, advance=1)
        return results

    def download_pokemon_batch(self, start_id: int, end_id: int, max_workers: int = 5) -> Dict[int, PokemonData]:
        """
        Download a batch of Pokemon data concurrently.

        Args:
            start_id: Starting Pokemon ID (inclusive)
            end_id: Ending Pokemon ID (inclusive)
            max_workers: Maximum number of concurrent downloads

        Returns:
            Dictionary mapping Pokemon ID to PokemonData
        """
        return self._download_batch(
            list(range(start_id, end_id + 1)),
            self.download_pokemon,
            self._downloaded_pokemon,
            f"Downloading Pokemon {start_id}-{end_id}",
            "Pokemon",
            max_workers
        )

    def download_moves_batch(self, move_ids: List[int], max_workers: int = 5) -> Dict[int, MoveData]:
        """
        Download a batch of move data concurrently.

        Args:
            move_ids: List of move IDs to download
            max_workers: Maximum number of concurrent downloads

        Returns:
            Dictionary mapping move ID to MoveData
        """
        return self._download_batch(
            move_ids,
            self.download_move,
            self._downloaded_moves,
            f"Downloading {len(move_ids)} moves",
            "Move",
            max_workers
        )

    def _report_validation_errors(self, errors: List[str], filename: str):
        """Print up to 10 validation warnings for the file about to be written.

        Bug fix: the warning header now names the offending file — it
        previously printed a literal "(unknown)" placeholder.
        """
        console.print(f"⚠️ Validation warnings for {filename}:")
        for error in errors[:10]:  # Show first 10 errors
            console.print(f" - {error}")
        if len(errors) > 10:
            console.print(f" ... and {len(errors) - 10} more errors")

    def _write_json(self, payload, output_file: Path):
        """Write payload as pretty-printed UTF-8 JSON to output_file."""
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2, ensure_ascii=False)

    def save_pokemon_data(self, pokemon_data: Dict[int, PokemonData], filename: str = "pokemon.json"):
        """Save Pokemon data to JSON file with optional validation."""
        output_file = self.output_dir / filename
        # JSON object keys must be strings.
        data_dict = {str(k): asdict(v) for k, v in pokemon_data.items()}
        if self.validate_data and self.validator:
            errors = self.validator.validate_pokemon_collection(data_dict)
            if errors:
                self._report_validation_errors(errors, filename)
        self._write_json(data_dict, output_file)
        console.print(f"✅ Saved {len(pokemon_data)} Pokemon to {output_file}")

    def save_moves_data(self, moves_data: Dict[int, MoveData], filename: str = "moves.json"):
        """Save moves data to JSON file with optional validation."""
        output_file = self.output_dir / filename
        data_dict = {str(k): asdict(v) for k, v in moves_data.items()}
        if self.validate_data and self.validator:
            errors = self.validator.validate_move_collection(data_dict)
            if errors:
                self._report_validation_errors(errors, filename)
        self._write_json(data_dict, output_file)
        console.print(f"✅ Saved {len(moves_data)} moves to {output_file}")

    def save_type_effectiveness(self, effectiveness_data: List[TypeEffectiveness], filename: str = "type_effectiveness.json"):
        """Save type effectiveness data to JSON file with optional validation."""
        output_file = self.output_dir / filename
        data_list = [asdict(item) for item in effectiveness_data]
        if self.validate_data and self.validator:
            errors = self.validator.validate_type_effectiveness(data_list)
            if errors:
                self._report_validation_errors(errors, filename)
        self._write_json(data_list, output_file)
        console.print(f"✅ Saved {len(effectiveness_data)} type effectiveness entries to {output_file}")

    def get_stats_summary(self, pokemon_data: Dict[int, PokemonData]) -> Table:
        """Generate a summary table of downloaded Pokemon data.

        Args:
            pokemon_data: Mapping of Pokemon ID to PokemonData

        Returns:
            A rich Table with totals, move counts, most common type, ID range.
        """
        table = Table(title="Downloaded Pokemon Summary")
        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="magenta")
        if not pokemon_data:
            table.add_row("Total Pokemon", "0")
            return table
        # Aggregate type frequencies and the union of referenced move IDs.
        types_count: Dict[str, int] = {}
        total_moves: Set[int] = set()
        for pokemon in pokemon_data.values():
            for ptype in pokemon.types:
                types_count[ptype] = types_count.get(ptype, 0) + 1
            total_moves.update(pokemon.moves)
        table.add_row("Total Pokemon", str(len(pokemon_data)))
        table.add_row("Unique Moves Referenced", str(len(total_moves)))
        table.add_row("Most Common Type", max(types_count, key=types_count.get) if types_count else "N/A")
        table.add_row("ID Range", f"{min(pokemon_data.keys())}-{max(pokemon_data.keys())}")
        return table
# CLI Interface
@click.group()
@click.option('--output-dir', default='data', help='Output directory for downloaded data')
@click.option('--cache-dir', default='.cache', help='Cache directory for API responses')
@click.option('--no-validation', is_flag=True, help='Disable data validation')
@click.pass_context
def cli(ctx, output_dir, cache_dir, no_validation):
    """Pokemon Data Downloader CLI."""
    # Build one shared downloader and stash it on the click context so every
    # sub-command reuses the same configuration.
    ctx.ensure_object(dict)
    shared_downloader = PokemonDownloader(
        Path(output_dir),
        Path(cache_dir),
        validate_data=not no_validation,
    )
    ctx.obj['downloader'] = shared_downloader
@cli.command()
@click.option('--start', default=1, help='Starting Pokemon ID')
@click.option('--end', default=10, help='Ending Pokemon ID')
@click.option('--workers', default=5, help='Number of concurrent workers')
@click.option('--include-moves', is_flag=True, help='Also download moves for these Pokemon')
@click.pass_context
def download_pokemon(ctx, start, end, workers, include_moves):
    """Download Pokemon data for a specific ID range."""
    downloader = ctx.obj['downloader']
    console.print(Panel.fit(
        f"🔽 Downloading Pokemon {start} to {end}",
        style="bold blue"
    ))
    pokemon_data = downloader.download_pokemon_batch(start, end, workers)
    # Guard clause: nothing downloaded means nothing else to do.
    if not pokemon_data:
        console.print("❌ No Pokemon data was successfully downloaded")
        return
    # Persist the Pokemon data and show a summary table.
    downloader.save_pokemon_data(pokemon_data, f"pokemon_{start}_{end}.json")
    console.print(downloader.get_stats_summary(pokemon_data))
    if not include_moves:
        return
    # Collect the union of learnable move IDs across the downloaded Pokemon.
    unique_move_ids = set()
    for entry in pokemon_data.values():
        unique_move_ids.update(entry.moves)
    if unique_move_ids:
        console.print(f"\n🔽 Downloading {len(unique_move_ids)} unique moves...")
        moves_data = downloader.download_moves_batch(list(unique_move_ids), workers)
        if moves_data:
            downloader.save_moves_data(moves_data, f"moves_{start}_{end}.json")
@cli.command()
@click.option('--move-ids', help='Comma-separated list of move IDs to download')
@click.option('--workers', default=5, help='Number of concurrent workers')
@click.pass_context
def download_moves(ctx, move_ids, workers):
    """Download specific moves by ID."""
    downloader = ctx.obj['downloader']
    # Guard clauses: require a non-empty, parseable ID list before any work.
    if not move_ids:
        console.print("❌ Please provide move IDs with --move-ids")
        return
    try:
        parsed_ids = [int(token.strip()) for token in move_ids.split(',')]
    except ValueError:
        console.print("❌ Invalid move IDs format. Use comma-separated integers.")
        return
    console.print(Panel.fit(
        f"🔽 Downloading {len(parsed_ids)} moves",
        style="bold blue"
    ))
    moves_data = downloader.download_moves_batch(parsed_ids, workers)
    if not moves_data:
        console.print("❌ No move data was successfully downloaded")
        return
    downloader.save_moves_data(moves_data, "custom_moves.json")
    console.print(f"✅ Successfully downloaded {len(moves_data)} moves")
@cli.command()
@click.option('--generation', default='generation-i', help='Target generation for type effectiveness')
@click.pass_context
def download_types(ctx, generation):
    """Download type effectiveness data."""
    downloader = ctx.obj['downloader']
    console.print(Panel.fit(
        f"🔽 Downloading type effectiveness data for {generation}",
        style="bold blue"
    ))
    effectiveness_data = downloader.download_type_effectiveness(generation)
    # Guard clause: report failure and bail out early.
    if not effectiveness_data:
        console.print("❌ Failed to download type effectiveness data")
        return
    downloader.save_type_effectiveness(
        effectiveness_data, f"type_effectiveness_{generation}.json"
    )
    console.print(f"✅ Successfully downloaded {len(effectiveness_data)} type effectiveness entries")
@cli.command()
@click.option('--generations', default='generation-i,generation-ii,generation-iii,generation-iv,generation-v,generation-vi,generation-vii,generation-viii,generation-ix',
              help='Comma-separated list of generations to download')
@click.option('--workers', default=3, help='Number of concurrent workers')
@click.pass_context
def download_types_multi(ctx, generations, workers):
    """Download type effectiveness data for multiple generations."""
    # NOTE(review): --workers is accepted but not used here; generations are
    # processed sequentially.
    downloader = ctx.obj['downloader']
    requested = [gen.strip() for gen in generations.split(',')]
    console.print(Panel.fit(
        f"🔽 Downloading type effectiveness for {len(requested)} generations",
        style="bold blue"
    ))
    total_entries = 0
    for generation in requested:
        console.print(f"\n📊 Processing {generation}...")
        entries = downloader.download_type_effectiveness(generation)
        if not entries:
            console.print(f"❌ Failed to download data for {generation}")
            continue
        downloader.save_type_effectiveness(entries, f"type_effectiveness_{generation}.json")
        total_entries += len(entries)
        console.print(f"✅ Saved {len(entries)} entries for {generation}")
    console.print(f"\n🎉 Downloaded type effectiveness for {len(requested)} generations ({total_entries} total entries)")
@cli.command()
@click.option('--start', default=1, help='Starting Pokemon ID')
@click.option('--end', default=151, help='Ending Pokemon ID (151 for Gen 1)')
@click.option('--workers', default=5, help='Number of concurrent workers')
@click.option('--generation', default='generation-i', help='Target generation for type effectiveness')
@click.pass_context
def download_complete(ctx, start, end, workers, generation):
    """Download complete dataset (Pokemon, moves, and type effectiveness)."""
    downloader = ctx.obj['downloader']
    console.print(Panel.fit(
        f"🔽 Downloading complete Pokemon dataset ({start}-{end}) for {generation}",
        style="bold blue"
    ))
    # Step 1: Pokemon species data. Abort early when nothing came back.
    pokemon_data = downloader.download_pokemon_batch(start, end, workers)
    if not pokemon_data:
        console.print("❌ Failed to download Pokemon data")
        return
    downloader.save_pokemon_data(pokemon_data, f"pokemon_complete_{start}_{end}.json")
    # Step 2: every move referenced by any downloaded Pokemon.
    referenced_moves = set()
    for entry in pokemon_data.values():
        referenced_moves.update(entry.moves)
    if referenced_moves:
        moves_data = downloader.download_moves_batch(list(referenced_moves), workers)
        if moves_data:
            downloader.save_moves_data(moves_data, f"moves_complete_{start}_{end}.json")
    # Step 3: the type-effectiveness chart for the requested generation.
    effectiveness_data = downloader.download_type_effectiveness(generation)
    if effectiveness_data:
        downloader.save_type_effectiveness(
            effectiveness_data, f"type_effectiveness_complete_{generation}.json"
        )
    # Final summary.
    console.print(downloader.get_stats_summary(pokemon_data))
    console.print("🎉 Complete dataset download finished!")
# Script entry point: dispatch to the click command group.
if __name__ == "__main__":
    cli()