Files
pokemon-battle-engine/tools/data/pokemon_downloader.py
2025-08-20 08:30:07 +09:00

484 lines
18 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Pokemon Data Downloader
This tool downloads generation-specific Pokemon data from the PokeAPI (https://pokeapi.co/)
and organizes it into the proper directory structure for the Pokemon Battle Engine.
Usage:
python pokemon_downloader.py --generations 1,2,3 --data-types types,pokemon,moves
python pokemon_downloader.py --all-generations --all-data-types
python pokemon_downloader.py --help
Data Structure:
data/
├── types/
│ ├── generation-i.json
│ ├── generation-ii.json
│ └── ...
├── pokemon/
│ ├── generation-i.json
│ ├── generation-ii.json
│ └── ...
└── moves/
├── generation-i.json
├── generation-ii.json
└── ...
"""
import argparse
import json
import os
import sys
import time
from pathlib import Path
from typing import Dict, List, Optional, Set
import requests
from tqdm import tqdm
class PokemonDataDownloader:
    """Downloads and processes Pokemon data from PokeAPI."""

    BASE_URL = "https://pokeapi.co/api/v2"

    # Generation mappings (approximate - some Pokemon/moves span generations).
    # Values are inclusive (first, last) National Dex ID ranges; dict order
    # is the canonical generation order used by _compare_generations().
    GENERATION_RANGES = {
        "generation-i": (1, 151),       # Kanto
        "generation-ii": (152, 251),    # Johto
        "generation-iii": (252, 386),   # Hoenn
        "generation-iv": (387, 493),    # Sinnoh
        "generation-v": (494, 649),     # Unova
        "generation-vi": (650, 721),    # Kalos
        "generation-vii": (722, 809),   # Alola
        "generation-viii": (810, 905),  # Galar
        "generation-ix": (906, 1025),   # Paldea
    }

    def __init__(self, base_dir: str = "data"):
        """Initialize the downloader with base data directory.

        Args:
            base_dir: Root directory for the downloaded data tree.
        """
        self.base_dir = Path(base_dir)
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Pokemon-Battle-Engine-Data-Downloader/1.0'
        })
        # Create base directories up front so later file writes never fail
        # on a missing parent.
        for subdir in ('types', 'pokemon', 'moves'):
            (self.base_dir / subdir).mkdir(parents=True, exist_ok=True)

    def _file_exists_and_valid(self, file_path: Path, min_items: int = 1) -> bool:
        """Check if a file exists and contains at least min_items JSON items.

        Acts as a cheap cache probe so already-downloaded data files are
        not re-fetched. Non-container JSON counts as valid regardless of
        min_items.
        """
        if not file_path.exists():
            return False
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, KeyError, TypeError):
            return False
        if isinstance(data, (dict, list)):
            return len(data) >= min_items
        return True  # File exists and is valid JSON

    def get_json(self, url: str, max_retries: int = 3) -> Optional[Dict]:
        """Fetch JSON data from URL with retry logic.

        Returns the decoded JSON, or None (after printing a message) once
        all attempts have failed.
        """
        for attempt in range(max_retries):
            try:
                response = self.session.get(url, timeout=30)
                response.raise_for_status()
                return response.json()
            except requests.RequestException as e:
                if attempt == max_retries - 1:
                    print(f"Failed to fetch {url}: {e}")
                    return None
                time.sleep(2 ** attempt)  # Exponential backoff: 1s, 2s, ...
        return None

    def get_all_generations(self) -> List[str]:
        """Get the list of all known Pokemon generations in canonical order.

        Falls back to the static GENERATION_RANGES keys when the API is
        unreachable.
        """
        data = self.get_json(f"{self.BASE_URL}/generation/")
        if not data:
            return list(self.GENERATION_RANGES.keys())
        known = [gen['name'] for gen in data['results']
                 if gen['name'] in self.GENERATION_RANGES]
        # BUGFIX: order by canonical generation sequence, not alphabetically.
        # A plain sorted() placed "generation-ix" before "generation-v".
        order = list(self.GENERATION_RANGES.keys())
        return sorted(known, key=order.index)

    def download_type_effectiveness(self, generation: str) -> bool:
        """Download type effectiveness chart for a specific generation.

        NOTE(review): the /type/ endpoint returns the *current* damage
        relations; a truly generation-accurate chart would need the API's
        past_damage_relations. Here the generation only validates the name
        and selects the output file — confirm this is intended.
        """
        output_file = self.base_dir / 'types' / f'{generation}.json'
        # Reuse an existing chart; a complete one has at least 15-18 types.
        if self._file_exists_and_valid(output_file, min_items=15):
            print(f"Using existing type effectiveness data for {generation}")
            return True
        print(f"Downloading type effectiveness for {generation}...")
        # Get generation data to find the version groups
        gen_data = self.get_json(f"{self.BASE_URL}/generation/{generation}/")
        if not gen_data:
            print(f"Failed to get generation data for {generation}")
            return False
        # Sanity check only: a real generation has at least one version group.
        version_groups = gen_data.get('version_groups', [])
        if not version_groups:
            print(f"No version groups found for {generation}")
            return False
        types_data = self.get_json(f"{self.BASE_URL}/type/")
        if not types_data:
            print("Failed to get types list")
            return False
        type_effectiveness = {}
        for type_info in tqdm(types_data['results'], desc="Processing types"):
            type_name = type_info['name']
            # Skip special types that don't have damage relations
            if type_name in ('unknown', 'shadow'):
                continue
            type_data = self.get_json(f"{self.BASE_URL}/type/{type_name}/")
            if not type_data:
                print(f"Failed to get data for type {type_name}")
                continue
            damage_relations = type_data.get('damage_relations', {})
            # Flatten each relation list down to bare type names.
            type_effectiveness[type_name] = {
                relation: [t['name'] for t in damage_relations.get(relation, [])]
                for relation in (
                    'double_damage_from', 'double_damage_to',
                    'half_damage_from', 'half_damage_to',
                    'no_damage_from', 'no_damage_to',
                )
            }
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(type_effectiveness, f, indent=2, ensure_ascii=False)
        print(f"Saved type effectiveness data to {output_file}")
        return True

    def download_pokemon_data(self, generation: str) -> bool:
        """Download Pokemon data for a specific generation.

        Fetches every Pokemon in the generation's National Dex range plus
        its species flags, and writes the list to pokemon/<generation>.json.
        """
        output_file = self.base_dir / 'pokemon' / f'{generation}.json'
        # Validate the generation before announcing or checking anything.
        if generation not in self.GENERATION_RANGES:
            print(f"Unknown generation: {generation}")
            return False
        min_id, max_id = self.GENERATION_RANGES[generation]
        expected_pokemon = max_id - min_id + 1
        # Reuse an existing file holding at least ~80% of the expected
        # Pokemon (allows for a few individual download failures).
        if self._file_exists_and_valid(output_file, min_items=int(expected_pokemon * 0.8)):
            print(f"Using existing Pokemon data for {generation}")
            return True
        print(f"Downloading Pokemon data for {generation}...")
        pokemon_list = []
        for pokemon_id in tqdm(range(min_id, max_id + 1),
                               desc=f"Downloading {generation} Pokemon"):
            pokemon_data = self.get_json(f"{self.BASE_URL}/pokemon/{pokemon_id}/")
            if not pokemon_data:
                print(f"Failed to get Pokemon {pokemon_id}")
                continue
            # Keep only the fields the battle engine needs.
            pokemon_info = {
                'id': pokemon_data['id'],
                'name': pokemon_data['name'],
                'height': pokemon_data['height'],
                'weight': pokemon_data['weight'],
                'base_experience': pokemon_data.get('base_experience', 0),
                'types': [t['type']['name'] for t in pokemon_data['types']],
                'stats': {
                    stat['stat']['name']: stat['base_stat']
                    for stat in pokemon_data['stats']
                },
                'abilities': [
                    {
                        'name': ability['ability']['name'],
                        'is_hidden': ability['is_hidden'],
                        'slot': ability['slot'],
                    }
                    for ability in pokemon_data['abilities']
                ],
            }
            # Species data adds legendary/mythical/baby flags; omitted
            # silently when unavailable (get_json already printed the error).
            species_data = self.get_json(pokemon_data['species']['url'])
            if species_data:
                pokemon_info['species'] = {
                    'name': species_data['name'],
                    'generation': species_data['generation']['name'],
                    'is_legendary': species_data['is_legendary'],
                    'is_mythical': species_data['is_mythical'],
                    'is_baby': species_data.get('is_baby', False),
                }
            pokemon_list.append(pokemon_info)
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(pokemon_list, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(pokemon_list)} Pokemon to {output_file}")
        return True

    def download_all_moves_data(self) -> bool:
        """Download all moves data and save to all_moves.json.

        The per-generation move files are filtered from this cache, so the
        full move list is only downloaded once.
        """
        all_moves_file = self.base_dir / 'all_moves.json'
        # Reuse an existing cache when it looks complete.
        if all_moves_file.exists():
            try:
                with open(all_moves_file, 'r', encoding='utf-8') as f:
                    existing_data = json.load(f)
                if len(existing_data) > 800:  # Should have at least 800+ moves
                    print(f"Using existing all_moves.json with {len(existing_data)} moves")
                    return True
                print(f"Existing all_moves.json only has {len(existing_data)} moves, re-downloading...")
            except (json.JSONDecodeError, KeyError):
                print("Existing all_moves.json is corrupted, re-downloading...")
        print("Downloading all moves data...")
        # One request with a generous limit covers the full move list.
        moves_data = self.get_json(f"{self.BASE_URL}/move/?limit=2000")
        if not moves_data:
            print("Failed to get moves list")
            return False
        all_moves = {}
        # BUGFIX: the original reused the loop variable name for the built
        # record; use distinct names for the API reference and the result.
        for move_ref in tqdm(moves_data['results'], desc="Downloading all moves"):
            move_data = self.get_json(move_ref['url'])
            if not move_data:
                continue
            # Some moves have no effect entries at all; fall back to None.
            effect_entries = move_data['effect_entries']
            all_moves[move_data['id']] = {
                'id': move_data['id'],
                'name': move_data['name'],
                'generation': move_data['generation']['name'],
                'power': move_data['power'],
                'pp': move_data['pp'],
                'accuracy': move_data['accuracy'],
                'priority': move_data['priority'],
                'damage_class': move_data['damage_class']['name'],
                'type': move_data['type']['name'],
                'target': move_data['target']['name'],
                'effect_chance': move_data.get('effect_chance'),
                'effect': effect_entries[0]['effect'] if effect_entries else None,
                'short_effect': effect_entries[0]['short_effect'] if effect_entries else None,
                'meta': move_data.get('meta', {}),
            }
        with open(all_moves_file, 'w', encoding='utf-8') as f:
            json.dump(all_moves, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(all_moves)} moves to {all_moves_file}")
        return True

    def download_moves_data(self, generation: str) -> bool:
        """Download moves data for a specific generation.

        A move is kept when it was introduced in the target generation or
        any earlier one.
        """
        output_file = self.base_dir / 'moves' / f'{generation}.json'
        # Generation I should have ~165 moves; later generations have more.
        min_moves = 100 if generation == "generation-i" else 200
        if self._file_exists_and_valid(output_file, min_items=min_moves):
            print(f"Using existing moves data for {generation}")
            return True
        print(f"Downloading moves data for {generation}...")
        # First ensure we have all moves data
        if not self.download_all_moves_data():
            return False
        all_moves_file = self.base_dir / 'all_moves.json'
        try:
            with open(all_moves_file, 'r', encoding='utf-8') as f:
                all_moves = json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            print("Failed to load all_moves.json")
            return False
        # Keep moves introduced in this generation or earlier (the key is
        # the move id, which we don't need here).
        moves_list = [
            move_data for move_data in all_moves.values()
            if self._compare_generations(move_data['generation'], generation) <= 0
        ]
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(moves_list, f, indent=2, ensure_ascii=False)
        print(f"Saved {len(moves_list)} moves to {output_file}")
        return True

    def _compare_generations(self, gen1: str, gen2: str) -> int:
        """Compare two generations by canonical order.

        Returns -1 if gen1 < gen2, 0 if equal, 1 if gen1 > gen2. Unknown
        names compare equal (0) so unexpected API values are not filtered
        out by callers.
        """
        generations = list(self.GENERATION_RANGES.keys())
        try:
            idx1 = generations.index(gen1)
            idx2 = generations.index(gen2)
        except ValueError:
            return 0
        return (idx1 > idx2) - (idx1 < idx2)

    def download_all_data(self, generations: List[str], data_types: List[str]) -> bool:
        """Download all specified data types for all specified generations.

        Returns True only when every download succeeded; individual
        failures are reported but do not stop the remaining downloads.
        """
        # Dispatch table keeps the inner loop flat and makes adding a new
        # data type a one-line change.
        downloaders = {
            'types': self.download_type_effectiveness,
            'pokemon': self.download_pokemon_data,
            'moves': self.download_moves_data,
        }
        success = True
        for generation in generations:
            print(f"\n{'='*50}")
            print(f"Processing {generation}")
            print(f"{'='*50}")
            for data_type in data_types:
                downloader = downloaders.get(data_type)
                if downloader is None:
                    print(f"Unknown data type: {data_type}")
                    success = False
                    continue
                try:
                    if not downloader(generation):
                        success = False
                except Exception as e:
                    # Keep going so one bad generation/type doesn't abort
                    # the whole run; the final status reflects the failure.
                    print(f"Error downloading {data_type} for {generation}: {e}")
                    success = False
        return success
def main():
    """Command-line entry point: parse arguments, then run the downloads."""

    def csv_list(raw):
        # Turn a comma-separated argument into a list of trimmed items.
        return [item.strip() for item in raw.split(',')]

    parser = argparse.ArgumentParser(
        description="Download Pokemon data from PokeAPI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python pokemon_downloader.py --generations generation-i --data-types types
python pokemon_downloader.py --all-generations --all-data-types
python pokemon_downloader.py --generations generation-i,generation-ii --data-types pokemon,moves
"""
    )
    parser.add_argument(
        '--generations',
        type=csv_list,
        help='Comma-separated list of generations to download (e.g., generation-i,generation-ii)'
    )
    parser.add_argument(
        '--all-generations',
        action='store_true',
        help='Download data for all generations'
    )
    parser.add_argument(
        '--data-types',
        type=csv_list,
        help='Comma-separated list of data types to download (types,pokemon,moves)'
    )
    parser.add_argument(
        '--all-data-types',
        action='store_true',
        help='Download all data types'
    )
    parser.add_argument(
        '--output-dir',
        default='data',
        help='Output directory for downloaded data (default: data)'
    )
    args = parser.parse_args()

    # Guard clauses: one generation selector and one data-type selector
    # are required.
    if not (args.all_generations or args.generations):
        print("Error: Must specify either --generations or --all-generations")
        sys.exit(1)
    if not (args.all_data_types or args.data_types):
        print("Error: Must specify either --data-types or --all-data-types")
        sys.exit(1)

    downloader = PokemonDataDownloader(args.output_dir)

    # Resolve the explicit lists from the --all-* shortcuts.
    generations = downloader.get_all_generations() if args.all_generations else args.generations
    data_types = ['types', 'pokemon', 'moves'] if args.all_data_types else args.data_types

    print(f"Downloading {data_types} data for generations: {generations}")
    print(f"Output directory: {args.output_dir}")

    banner = '=' * 50
    if downloader.download_all_data(generations, data_types):
        print(f"\n{banner}")
        print("Download completed successfully!")
        print(f"Data saved to: {args.output_dir}")
        print(f"{banner}")
    else:
        print(f"\n{banner}")
        print("Download completed with errors. Check the output above for details.")
        print(f"{banner}")
        sys.exit(1)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()