"""
Command-line interface for the Cloud Agents system.
"""
|
| | import click
|
| | import asyncio
|
| | import logging
|
| | from .coordinator import Coordinator
|
| | from .scaling import ScalingManager
|
| | from .config import settings
|
| |
|
# Configure the root logger at import time with an INFO threshold.
logging.basicConfig(level=logging.INFO)

# Module-scoped logger; the command handlers in this file report failures
# through it before re-raising.
logger = logging.getLogger(__name__)
|
| |
|
# Root command group. Subcommands in this file register themselves on it via
# ``@cli.command()``. The docstring is what click shows as the group's help
# text, so it is kept as a single user-facing sentence.
@click.group()
def cli():
    """Cloud Agents CLI for distributed model training."""
|
| |
|
# ``train`` subcommand.
#
# Parameters (supplied by click from the options declared below):
#   num_epochs       -- value of ``--num-epochs`` (default 1)
#   steps_per_epoch  -- value of ``--steps-per-epoch`` (default 100)
#
# Any exception raised while setting up or running training is logged and
# re-raised unchanged. The docstring is click's ``--help`` text for the
# command, so developer notes are kept in comments rather than in it.
@cli.command()
@click.option('--num-epochs', default=1, help='Number of training epochs')
@click.option('--steps-per-epoch', default=100, help='Steps per epoch')
def train(num_epochs, steps_per_epoch):
    """Start distributed training."""
    try:
        coordinator = Coordinator()
        scaling_manager = ScalingManager()

        async def run_training():
            # Keep a strong reference to the background task: the event loop
            # only holds weak references to tasks, so a task whose handle is
            # discarded may be garbage-collected before it finishes.
            monitor_task = asyncio.create_task(
                scaling_manager.monitor_and_scale()
            )
            try:
                await coordinator.coordinate_training({
                    'num_epochs': num_epochs,
                    'steps_per_epoch': steps_per_epoch,
                })
            finally:
                # Stop monitoring once training ends, whether it succeeded or
                # failed. cancel() is a no-op if the task already finished;
                # asyncio.run() awaits the cancelled task during shutdown.
                monitor_task.cancel()

        asyncio.run(run_training())

    except Exception as e:
        logger.error("Training failed: %s", e)
        raise
|
| |
|
# ``status`` subcommand: queries a fresh ScalingManager and prints one line per
# reported metric. The docstring is click's ``--help`` text for the command.
@cli.command()
def status():
    """Get cluster status."""

    def report_lines(cluster):
        # Lazily yield each output line so every value is looked up only
        # right before it is printed, in the order shown to the user.
        yield "Cluster Status:"
        yield f"Total Agents: {cluster['total_agents']}"
        yield f"Busy Agents: {cluster['busy_agents']}"
        yield f"Idle Agents: {cluster['idle_agents']}"
        yield f"Utilization: {cluster['utilization']:.2%}"
        yield f"Can Scale Up: {cluster['can_scale_up']}"
        yield f"Can Scale Down: {cluster['can_scale_down']}"

    try:
        manager = ScalingManager()
        snapshot = manager.get_cluster_status()

        for line in report_lines(snapshot):
            click.echo(line)

    except Exception as exc:
        # Log for operators, then let the original exception propagate.
        logger.error(f"Failed to get status: {exc}")
        raise
|
| |
|
# Script entry point: hand control to the click group only when this module
# is executed directly, not when it is imported.
if __name__ == "__main__":
    cli()