Source code for esm_analysis.DistUtils

"""Collection of classes for creating batch clusters."""

from datetime import datetime
import json
from pathlib import Path
from subprocess import run, PIPE
from tempfile import TemporaryDirectory


__all__ = ('MPICluster',)

slurm_directive = """#!/bin/bash
#=============================================================================
# mistral batch job parameters
#-----------------------------------------------------------------------------
#SBATCH --account={account}
#SBATCH --job-name={name}
#SBATCH --partition={queue}
#SBATCH -D {workdir}
#SBATCH --output={workdir}/LOG_mpicluster.%j.o
#SBATCH --error={workdir}/LOG_mpicluster.%j.e
#SBATCH --exclusive
#SBATCH --time={walltime}
#SBATCH --cpus-per-task={cpus_per_task}
#SBATCH --mem={memory}
#SBATCH -n {nworkers}
{extra}
"""
_script = """
rm -fr worker-*
{rm_scheduler}
rm -rf *.lock
rm -f $(ls LOG*.[oe]|grep -v $SLURM_JOB_ID)

ulimit -c 0

{job_extra}

# Settings for OpenMPI and MXM (Mellanox Messaging) library
export OMPI_MCA_pml=cm
export OMPI_MCA_mtl=mxm
export OMPI_MCA_mtl_mxm_np=0
export MXM_RDMA_PORTS=mlx5_0:1
export MXM_LOG_LEVEL=ERROR
# Disable GHC algorithm for collective communication
export OMPI_MCA_coll=^ghc

{run_cmd} dask-mpi --no-nanny --{scheduler} --scheduler-file {scheduler_file}

"""


class _Slurm:
    """Definitions to work with the slurm workload manager."""

    @property
    def _type(self):
        return 'slurm'

    @property
    def submit_cmd(self):
        """Slurm submit command."""
        return 'sbatch'

    @property
    def check_cmd(self):
        """Slurm check command."""
        return 'squeue'

    @property
    def cancel_cmd(self):
        """Slurm cancel comand."""
        return 'scancel'

    @property
    def run_cmd(self):
        """Slurm run comman."""
        return ('srun -l --cpu_bind=threads '
                '--distribution=block:cyclic --propagate=STACK')

    def cancel(self, job_id):
        """Close down a cluster with a given job_id."""
        if job_id is None:
            return
        run([self.cancel_cmd, job_id], stdout=PIPE, check=True, shell=False)

    def check(self, job_id):
        """Check the status of a running cluster."""
        if job_id is None:
            return None, None, None
        res = run([self.check_cmd, '-j', str(job_id)], check=True,
                  shell=False, stdout=PIPE).stdout.decode('utf-8').split('\n')
        if len(res) < 2:
            return None, None, None
        status = [line.split() for line in res]
        table = dict(zip(status[0], status[1][:len(status[0])]))

        status_l = dict(PD='Queueing', R='Running', F='Failed')
        return status_l[table['ST']], table['TIME'], table['NODES']
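
# Illustrative example (not executed): for a job that is in the queue, squeue
# prints a header line followed by the job line, e.g.
#
#   JOBID PARTITION     NAME     USER ST       TIME  NODES NODELIST(REASON)
#   12345   compute dask_job    a1234  R       5:02      2 m[10010-10011]
#
# from which _Slurm.check() above would return ('Running', '5:02', '2').
# The job id, user and node names here are made up for this sketch.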


class MPICluster:
    """Create a cluster of distributed workers."""

    def close(self):
        """Close down the running cluster."""
        self._batch_system.cancel(self.job_id)
        self.job_id = None
        self._write_json()

    @property
    def status(self):
        """Check the status of the running cluster."""
        status, _, _ = self._batch_system.check(self.job_id)
        try:
            return status[0].upper()
        except TypeError:
            return None

    def __repr__(self):
        """Print the status of the submitted jobs."""
        status, time, nodes = self._batch_system.check(self.job_id)
        if status is None:
            return 'No cluster running'
        return '{}: time: {} nodes: {}'.format(status, time, nodes)

    def _repr_html_(self):
        """Print the status of the submitted jobs in html format."""
        status, time, nodes = self._batch_system.check(self.job_id)
        colors = dict(Queueing='DodgerBlue', Failed='Tomato',
                      Running='MediumSeaGreen')
        if status is None:
            return '<p>No cluster running</p>'
        color = colors[status]
        return """<p>
        <span style="color:{color};">{status}</span>:
        time: {time} nodes: {nodes}</p>""".format(color=color, status=status,
                                                  time=time, nodes=nodes)

    @property
    def scheduler_file(self):
        """Return the scheduler file."""
        return Path(self.workdir) / 'scheduler.json'

    @property
    def script_path(self):
        """Return the path of the script that is/was submitted."""
        return Path(self.workdir) / 'scheduler.sh'

    def _write_script(self):
        with open(str(self.script_path), 'w') as f:
            f.write(self.job_script)
        self.script_path.chmod(0o755)

    def _write_json(self):
        _json_data = dict(job_id=self.job_id,
                          workdir=str(self.workdir),
                          job_script=self.job_script,
                          batch_system=self._batch_system._type,
                          datetime=self.submit_time.isoformat())
        with (self.workdir / 'cluster.json').open('w') as f:
            json.dump(_json_data, f, indent=3, sort_keys=True)

    @staticmethod
    def _load(workdir):
        try:
            with (workdir / 'cluster.json').open('r') as f:
                json_data = json.load(f)
        except FileNotFoundError:
            raise ValueError('Cluster has not been created.')
        json_data['datetime'] = datetime.strptime(json_data['datetime'],
                                                  '%Y-%m-%dT%H:%M:%S.%f')
        json_data['workdir'] = Path(json_data['workdir'])
        return json_data

    @classmethod
    def load(cls, workdir):
        """
        Load the information of a running cluster.

        This method can be used to connect to an already running cluster.

        ::

            from esm_analysis import MPICluster
            cluster = MPICluster.load('/tmp/old_cluster')

        Parameters
        ----------
        workdir : str
            Directory name where the information of the previously created
            cluster is stored. The work directory can be retrieved via the
            workdir property.

        Returns
        -------
        Instance of the MPICluster object: esm_analysis.MPICluster
        """
        workdir = Path(workdir)
        _json_data = cls._load(workdir)
        lookup = dict(slurm=_Slurm)
        batch_system = lookup[_json_data['batch_system']]()
        script = _json_data['job_script']
        job_id = _json_data['job_id']
        if job_id is None:
            raise ValueError('Cluster was closed, submit a new one')
        submit_time = _json_data['datetime']
        return cls(script, workdir, submit_time=submit_time, job_id=job_id,
                   batch_system=batch_system)

    def _submit(self):
        # Submit the script; sbatch typically answers with a line such as
        # "Submitted batch job 12345", so the job id is the last field.
        res = run([self._batch_system.submit_cmd, str(self.script_path)],
                  cwd=str(self.workdir), stdout=PIPE, check=True, shell=False)
        job_id, _, _cluster = res.stdout.decode('utf-8').strip().partition(';')
        return job_id.split(" ")[-1]

    def __init__(self, script, workdir, submit_time=None, batch_system=None,
                 job_id=None):
        """Create a cluster using a given submit script."""
        self.job_script = script
        self.submit_time = submit_time
        self.job_id = job_id
        self._batch_system = batch_system
        self.workdir = Path(workdir)
        self.workdir.mkdir(parents=True, exist_ok=True)
        if self.submit_time is None:
            self._write_script()
            self.job_id = self._submit()
            self.submit_time = datetime.now()
        self._write_json()

    @classmethod
    def slurm(cls, account, queue, *, slurm_extra=[''], memory='140G',
              workdir=None, walltime='01:00:00', cpus_per_task=48,
              name='dask_job', nworkers=1, job_extra=None):
        """
        Create an MPI cluster using slurm.

        This method sets up a cluster with the help of the workload manager
        slurm.

        ::

            from esm_analysis import MPICluster
            cluster = MPICluster.slurm('account', 'express', nworkers=10)

        The jobs will be submitted to the workload manager immediately upon
        creation of the instance.

        Parameters
        ----------
        account: str
            Account name
        queue: str
            Partition the job should be submitted to
        walltime: str, optional (default: '01:00:00')
            Length of the job
        name: str, optional (default: dask_job)
            Name of the job
        workdir: str, optional (default: None)
            Name of the work directory; if None is given, a temporary
            directory is used.
        cpus_per_task: int, optional (default: 48)
            Number of cpus per node
        memory: str, optional (default: 140G)
            Allocated memory per node
        nworkers: int, optional (default: 1)
            Number of nodes used in the job
        job_extra: str, optional (default: None)
            Additional commands that should be executed in the run script
        slurm_extra: list, optional (default: None)
            Additional slurm directives

        Returns
        -------
        Instance of the MPICluster object: esm_analysis.MPICluster
        """
        job_extra = job_extra or ''
        workdir = workdir or TemporaryDirectory().name
        workdir = Path(workdir)
        batch_system = _Slurm()
        slurm_extra = ['#SBATCH {}'.format(extr)
                       for extr in slurm_extra if extr]
        scheduler_file = workdir / 'scheduler.json'
        # One extra rank is requested because dask-mpi runs the scheduler on
        # rank 0, leaving nworkers ranks for the workers.
        script = slurm_directive.format(
            account=account, workdir=workdir, name=name,
            cpus_per_task=cpus_per_task, nworkers=nworkers + 1,
            walltime=walltime, memory=memory,
            extra='\n'.join(slurm_extra), queue=queue) +\
            _script.format(run_cmd=batch_system.run_cmd,
                           job_extra=job_extra,
                           scheduler='scheduler',
                           scheduler_file=scheduler_file,
                           rm_scheduler='rm -f {}'.format(scheduler_file))
        return cls(script, workdir, batch_system=batch_system)
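
# A minimal usage sketch (not part of the module): the account name,
# partition and worker count below are placeholders. It follows the docstring
# examples above: submit a cluster, attach a dask client through the
# scheduler file, and shut everything down when done.
#
#   from dask.distributed import Client
#   from esm_analysis import MPICluster
#
#   cluster = MPICluster.slurm('ab0123', 'compute', nworkers=4,
#                              walltime='00:30:00')
#   client = Client(scheduler_file=str(cluster.scheduler_file))
#   ...  # distributed computations go here
#   client.close()
#   cluster.close()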