#!/usr/bin/env python
# Copyright (c) 2011, 2013 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.
'''
Cli scheduler is responsible for executing scheduled Cli jobs which are used to
periodically execute CLI commands (e.g. show tech-support).
'''

import Toggles.ProcMgrToggleLib
import CliSchedulerLib, Tac, Url, subprocess
import os, time, Logging, QuickTrace, Tracing, SuperServer
import signal
from socket import gethostname

# Short aliases for the Tracing entry points used in this module.
t0 = Tracing.trace0
t1 = Tracing.trace1

# Short aliases for QuickTrace: qv wraps each value handed to qt0/qt1 at the
# call sites in this module.
qv = QuickTrace.Var
qt0 = QuickTrace.trace0
qt1 = QuickTrace.trace1

def removeFromJobsInProgressList( jobName, agent ):
   '''Drop every entry named jobName from agent.cliJobsInProgress.

   Entries of agent.cliJobsInProgress are job tuples whose first element is the
   job name. Nothing happens when no entry matches.
   '''
   # Walk a snapshot: removing from the list that is being iterated makes the
   # iterator skip the entry that follows each removal.
   for job in list( agent.cliJobsInProgress ):
      if job[0] == jobName:
         agent.cliJobsInProgress.remove( job )
         qt1( 'Removed from JobsInProgress list: ', qv( jobName ))

def _rescheduleJob( agent, jobName, subproc ):
   '''Retire a finished job, record its result and dispatch further jobs.

   The job is taken off the in-progress list. If it is still configured, its
   status entry is rewritten with the current time as last execution time,
   subproc.returncode as last execution status and the previously recorded
   lastExecutionStartTime as start time. The dispatcher then runs so a pending
   job can use the freed slot.
   '''
   removeFromJobsInProgressList( jobName, agent )
   stillConfigured = jobName in agent.config.scheduledCli
   if stillConfigured:
      previous = agent.status.scheduledCliJobStatus[ jobName ]
      updateStatus( agent, jobName,
                    lastExecutionTime=time.time(),
                    lastExecutionStatus=subproc.returncode,
                    startTime=previous.lastExecutionStartTime )
   cliSchedDispatcher( agent )

def _isSubprocDone( agent, jobName ):
   # check if cli job is still running
   result = agent.cliJobSubprocess[ jobName ].poll() != None

   if jobName in agent.gzipJobSubprocess:
      result = ( result and agent.gzipJobSubprocess[ jobName ].poll() != None )

   if jobName in agent.ddJobSubprocess:
      result = ( result and agent.ddJobSubprocess[ jobName ].poll() != None )

   return result

def cliJobCompletionHandler( jobArgs, timedOut=False ):
   '''Finish off a dispatched Cli job once it has completed or timed out.

   jobArgs is the tuple ( agent, jobName, logPath, maxLogFiles, cliTimeout,
   verbose ) assembled by cliSchedDispatcher. timedOut is True when the job did
   not finish in time; its process group is then killed and the job is recorded
   with return code -1.

   Outcomes are checked in this order: timeout, failure of the dd stage that
   writes the log file, failure of the Cli command, success. Every path ends in
   _rescheduleJob, which records the result and dispatches further jobs.
   '''
   agent, jobName, logPath, _maxLogFiles, cliTimeout, verbose = jobArgs

   def killSubprocs( agent ):
      # Only the Cli process is guaranteed to have an entry: the gzip and dd
      # stages are started only for jobs that keep log files. Look each one up
      # with get() so a missing stage is skipped instead of raising KeyError.
      for procsByJob in ( agent.cliJobSubprocess, agent.gzipJobSubprocess,
                          agent.ddJobSubprocess ):
         proc = procsByJob.get( jobName )
         if proc is None:
            continue
         try:
            proc.kill()
         except OSError:
            # best effort: the process may already be gone
            pass

   def _getMount( path ):
      # Walk up from path until a mount point (or the root) is reached.
      path = os.path.realpath( os.path.abspath( path ) )
      while path != os.path.sep:
         if os.path.ismount( path ):
            return path
         path = os.path.abspath( os.path.join( path, os.pardir ) )
      return path

   subproc = agent.cliJobSubprocess[ jobName ]
   # The dd stage only exists for runs that keep log files. When this run kept
   # none, an entry under this name can only be left over from an earlier run,
   # so it must not decide the outcome of this one.
   ddProc = agent.ddJobSubprocess.get( jobName ) if _maxLogFiles else None

   qt1( 'cliJobCompletionHandler job:', qv( jobName), 'ml:', qv( _maxLogFiles ),
        'timedOut:', qv( timedOut ), 'rc:', qv( subproc.returncode if not timedOut
           else 0 ) )
   if timedOut:
      # pylint: disable-msg=E1101
      Logging.log( CliSchedulerLib.SYS_CLI_SCHEDULER_ABORT, jobName,
            "Timed out after %d seconds" % ( cliTimeout ) )

      # The Cli process was started in its own session (os.setsid), so its pid
      # doubles as the process group id.
      try:
         os.killpg( subproc.pid, signal.SIGKILL )
      except OSError:
         pass

      # give some time for the processes to exit
      try:
         Tac.waitFor( lambda: _isSubprocDone( agent, jobName ),
                      timeout=10,
                      description='cli job process(%s) to die' % jobName,
                      sleep=True )
      except ( Tac.SystemCommandError, Tac.Timeout ):
         killSubprocs( agent )

      subproc.returncode = -1 #using returncode = -1 for timeout
      _rescheduleJob( agent, jobName, subproc )

   elif ddProc and ddProc.returncode != 0:
      # Error trying to save output to file
      # If logfile destination is almost full and Cli encounters ENOSPC
      # error. We use statvfs to detect this and generate filesystem full syslog
      # message instead of abort syslog message
      if os.statvfs( _getMount( logPath ) ).f_bfree == 0:
         # pylint: disable-msg=E1101
         Logging.log( CliSchedulerLib.SYS_CLI_SCHEDULER_FILESYSTEM_FULL,
                      jobName )
      else:
         # pylint: disable-msg=E1101
         Logging.log( CliSchedulerLib.SYS_CLI_SCHEDULER_ABORT, jobName,
                      "dd rc=%d" % ddProc.returncode )
      _rescheduleJob( agent, jobName, ddProc )

   elif subproc.returncode != 0:
      # Error running the cli command
      # pylint: disable-msg=E1101
      Logging.log( CliSchedulerLib.SYS_CLI_SCHEDULER_ABORT, jobName,
                   "rc=%d" % subproc.returncode )

      _rescheduleJob( agent, jobName, subproc )

   # ignoring gzipProc status because it should not fail unless ddProc fails
   else:
      if _maxLogFiles:
         t1( 'Initiating log rotation' )
         displayLog = Url.filenameToUrl( logPath )
         if verbose:
            # pylint: disable-msg=E1101
            Logging.log( CliSchedulerLib.SYS_CLI_SCHEDULER_JOB_COMPLETED,
                         jobName, "Logfile is stored in %s" % displayLog )
      else:
         # pylint: disable-msg=E1101
         Logging.log( CliSchedulerLib.SYS_CLI_SCHEDULER_JOB_COMPLETED,
                      jobName, None )
      _rescheduleJob( agent, jobName, subproc )

def updateStatus( agent, jobName, lastExecutionTime=0, lastExecutionStatus=0,
                  startTime=0, jobInProgress=False ):
   '''Publish a fresh status entry for jobName in agent.status.

   A new ScheduledCliJobStatus value is built from the arguments and added to
   agent.status.scheduledCliJobStatus with addMember. The constructor receives,
   in order: jobName, jobInProgress, startTime, lastExecutionTime and
   lastExecutionStatus.
   '''
   qt1( "updateStatus job:", qv( jobName ), 'st:', qv( startTime ), "let:",
        qv( lastExecutionTime ), "les:", qv( lastExecutionStatus ), 'jip:',
        qv( jobInProgress ) )

   statusType = Tac.Type( "System::CliScheduler::ScheduledCliJobStatus" )
   fields = ( jobName, jobInProgress, startTime, lastExecutionTime,
              lastExecutionStatus )
   agent.status.scheduledCliJobStatus.addMember( statusType( *fields ) )

def _startJobPoller( agent, jobArgs ):
   '''Create the poller that watches one dispatched job.

   Living in its own function gives every job its own closure variables. When
   the callbacks were lambdas created directly inside the dispatch loop they
   all saw the loop's latest jobName/jobArgs, so several jobs dispatched by one
   call were all reported against the last of them.
   '''
   jobName = jobArgs[ 1 ]
   cliTimeout = jobArgs[ 4 ]
   return Tac.Poller(
         lambda: _isSubprocDone( agent, jobName ),
         handler=lambda ignored: cliJobCompletionHandler( jobArgs ),
         timeoutHandler=lambda: cliJobCompletionHandler( jobArgs, timedOut=True ),
         warnAfter=cliTimeout * 2,
         timeout=cliTimeout,
         description="scheduled Cli Job %s to complete" % jobName )

def cliSchedDispatcher( agent ):
   '''Start pending Cli jobs while the number of running jobs is below
   agent.config.jobsInProgress.

   Each pending job is a tuple ( jobName, logDir, maxLogFiles, cliCommand,
   cliTimeout, verbose ). For every job taken off agent.cliJobsPending:
    - if the scheduler is disabled, the job is logged as skipped and dropped;
    - otherwise the Cli command is launched in its own session and, when
      maxLogFiles is non-zero, its output is piped through gzip and dd into a
      time-stamped log file under <logDir>/<jobName>/ after rotating old logs;
    - a poller is armed that calls cliJobCompletionHandler on exit or timeout.

   Returns early, without looking at the remaining pending jobs, when the log
   directory cannot be created because the filesystem is full or read-only.
   '''
   import errno

   while ( len( agent.cliJobsInProgress ) < agent.config.jobsInProgress ) and \
           len( agent.cliJobsPending ) != 0:
      # pop() takes the most recently queued job first.
      job = agent.cliJobsPending.pop()
      jobName, logDir, maxLogFiles, cliCommand, cliTimeout, verbose = job
      if not agent.status.enabled:
         Logging.log( CliSchedulerLib.SYS_CLI_SCHEDULER_DISABLED_SKIP, jobName )
         qt1( 'CliScheduler is disabled, skip dispatching job:', ( qv( jobName ) ) )
         # In the normal case where the scheduler is enabled, once the job is
         # completed, we'll run cliJobCompletionHandler, which calls
         # _rescheduleJob. In there, the completed job is removed from the
         # jobsInProgress list and the job status is updated before it calls
         # cliSchedDispatcher again. There is no need for that if the scheduler
         # is disabled. Here, we only update the jobsInProgress list and move on.
         removeFromJobsInProgressList( jobName, agent )
         continue
      qt1( 'cliSchedDispatcher dispatching job:', ( qv( jobName ) ) )

      os.umask( 0 )
      # Frequency of scheduled CLI command execution shouldn't be more than one
      # every minute. Hence we can store logfiles under /mnt/flash/schedule with
      # following file name [<hostname>_]<jobName>_YYYY-MM-DD.HHMM.log.gz
      # Date and time used above will always be in local time
      if not logDir or logDir == CliSchedulerLib.logPrefixDefault:
         logDir = agent.rootFs
         if not logDir.endswith( '/' ):
            logDir += '/'
         logDir = "%sschedule/" % logDir

      logSuffix = "%s/" % jobName
      logDir += logSuffix
      t1( 'logDir: %s' % logDir )
      logPath = None

      if maxLogFiles:
         timeExtension = time.strftime( "_%Y-%m-%d.%H%M", time.localtime() )
         hostPrefix = gethostname() + '_' if agent.config.prependHostname else ''
         logPath = '%s/%s%s%s.log.gz' % ( logDir, hostPrefix, jobName,
                                         timeExtension )

         try:
            os.makedirs( logDir, 0o777 )
         except OSError as e:
            if e.errno == errno.EEXIST:
               pass
            elif e.errno == errno.ENOSPC:
               # pylint: disable-msg=E1101
               Logging.log( CliSchedulerLib.SYS_CLI_SCHEDULER_FILESYSTEM_FULL,
                            jobName )
               # The job has already been taken off the pending list; this
               # dispatch round ends here.
               return
            elif e.errno == errno.EROFS:
               # pylint: disable-msg=E1101
               Logging.log( CliSchedulerLib.SYS_CLI_SCHEDULER_ABORT, jobName,
                            "read-only filesystem" )
               return
            else:
               raise

         logMgr = CliSchedulerLib.CliSchedLogMgr( logDir, maxLogFiles - 1 )
         logMgr.rotateSnapshots()

      newCliCommand = CliSchedulerLib.replaceToken( cliCommand )
      cmdArgs = [ CliSchedulerLib.eosCliShell, "-s", agent.cliSchedSysname,
                  "--disable-aaa", "--disable-automore", "-p", "15", "-c",
                  newCliCommand ]

      agent.cliJobsInProgress.append( job )
      jobStatus = agent.status.scheduledCliJobStatus.get( jobName )
      lastExecutionStartTime = jobStatus.lastExecutionStartTime if jobStatus else 0
      updateStatus( agent, jobName, startTime=time.time(), jobInProgress=True,
                    lastExecutionTime=lastExecutionStartTime )
      qt1( 'cliSchedDispatcher inprogress:',
           qv( agent.cliJobsInProgress is not None ),
           'njobs:', qv( len( agent.cliJobsInProgress ) ) )

      # Forget gzip/dd processes left over from an earlier run of this job so
      # that a run without log files is not judged by their exit status.
      agent.gzipJobSubprocess.pop( jobName, None )
      agent.ddJobSubprocess.pop( jobName, None )

      # os.setsid puts the Cli process in its own session so that the whole
      # process group can be killed on timeout.
      agent.cliJobSubprocess[ jobName ] = subprocess.Popen(
         cmdArgs, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
         preexec_fn=os.setsid )

      if maxLogFiles:
         # Used to discard the helpers' diagnostics. The children get their
         # own copy of the descriptor, so ours is closed once they are started.
         nullFileObj = open( "/dev/null", "w" )
         try:
            # pipe output to gzip
            gzipArgs = [ "/bin/gzip", "-f", "-9" ]
            agent.gzipJobSubprocess[ jobName ] = subprocess.Popen(
               gzipArgs, stdin=agent.cliJobSubprocess[ jobName ].stdout,
               stdout=subprocess.PIPE, stderr=nullFileObj )

            # Drop our end of the pipe; gzip holds its own copy.
            agent.cliJobSubprocess[ jobName ].stdout.close()

            # pipe output of gzip to file and sync afterward
            ddArgs = [ "/bin/dd", "of=" + logPath, "conv=fdatasync" ]
            agent.ddJobSubprocess[ jobName ] = subprocess.Popen(
               ddArgs, stdin=agent.gzipJobSubprocess[ jobName ].stdout,
               stdout=nullFileObj,
               stderr=nullFileObj )

            agent.gzipJobSubprocess[ jobName ].stdout.close()
         finally:
            nullFileObj.close()

      jobArgs = ( agent, jobName, logPath, maxLogFiles, cliTimeout, verbose )

      # NOTE(review): only the most recent poller stays referenced from the
      # agent; whether earlier pollers need an explicit reference to keep
      # running depends on Tac.Poller and should be confirmed.
      agent.cliJobPoller = _startJobPoller( agent, jobArgs )


class CliSchedExec( object ):
   '''Timer that queues one scheduled Cli job whenever it is due.

   Each instance owns a Tac.ClockNotifiee (self.act) whose handler is run().
   A job with a start time in the future is additionally tracked in
   activeExecsWithAt_ so that its timer can be re-armed when the difference
   between Tac.utcNow() and Tac.now() changes, i.e. when the clock is changed.
   '''
   # Instances still waiting for their start time ("at").
   activeExecsWithAt_ = set()
   # Shared ClockNotifiee driving adjustSchedule, or None when not needed.
   timePoller_ = None
   # Last observed int( Tac.utcNow() - Tac.now() ).
   lastTimeDiff_ = None
   # Delay between two clock-change checks, added to Tac.now().
   chkDelay_ = 10

   @staticmethod
   def adjustSchedule():
      '''Re-arm all waiting jobs if the clock offset changed, then schedule
      the next check chkDelay_ from now.'''
      # NOTE(review): this blocks the caller for one second on every check;
      # the reason is not evident from this file.
      time.sleep ( 1 )
      curDateTime = Tac.utcNow()
      curMonoDateTime = Tac.now()
      timeDiff = int( curDateTime - curMonoDateTime )
      if CliSchedExec.lastTimeDiff_ != timeDiff:
         qt0( "Clock change detected diff=", qv( timeDiff ) )
         # Clock was updated, time to change timeMin of all activeExecsWithAt_
         for execs in CliSchedExec.activeExecsWithAt_:
            if curDateTime >= execs.at:
               execs.act.timeMin = Tac.now()
            else:
               execs.act.timeMin = Tac.now() + (execs.at - curDateTime)
            qt1( "CliSchedExec job:", qv( execs.jobName ) ,
                 "is rescheduled to run in", qv( execs.at - curDateTime ),
                 "seconds" )
      CliSchedExec.lastTimeDiff_ = timeDiff
      CliSchedExec.timePoller_.timeMin = Tac.now() + CliSchedExec.chkDelay_

   def __init__( self, agent, jobName, logDir, at, interval, maxLogFiles,
         cliCommand, timeout, verbose ):
      '''Arm the timer for one job.

      at is either CliSchedulerLib.scheduleNow or a time compared against
      Tac.utcNow(); interval is multiplied by 60 to obtain the delay between
      runs, or is CliSchedulerLib.scheduleOnce. The CLI_SCHED_TIMEOUT
      environment variable, when set, overrides timeout.
      '''
      self.agent = agent
      self.jobName = jobName
      self.logDir = logDir
      self.interval = interval
      self.at = at
      self.maxLogFiles = maxLogFiles
      self.cliCommand = cliCommand
      if 'CLI_SCHED_TIMEOUT' in os.environ:
         self.timeout = int( os.environ[ 'CLI_SCHED_TIMEOUT' ] )
      else:
         self.timeout = timeout

      self.verbose = verbose

      self.act = Tac.ClockNotifiee( handler=self.run )

      if at == CliSchedulerLib.scheduleNow:
         self.act.timeMin = Tac.now()
         qt1( "CliSchedExec job:", qv( jobName ), "is scheduled to run now" )
      else:
         curDateTime = Tac.utcNow()
         if curDateTime >= at:
            # start time already passed: run right away
            self.act.timeMin = Tac.now()
            qt1( "CliSchedExec job:", qv( jobName ), "is scheduled to run now" )
         else:
            CliSchedExec.activeExecsWithAt_.add( self )
            if CliSchedExec.timePoller_ is None:
               qt0( "Starting time poller to detect clock changes by job:",
                    qv ( jobName ) )
               CliSchedExec.lastTimeDiff_ = int( curDateTime - Tac.now() )
               CliSchedExec.timePoller_ = Tac.ClockNotifiee(
                  handler=CliSchedExec.adjustSchedule )
               CliSchedExec.timePoller_.timeMin = Tac.now() + CliSchedExec.chkDelay_
            self.act.timeMin = Tac.now() + (at - curDateTime)
            qt1( "CliSchedExec job:", qv( jobName ), "is scheduled to run in",
                  qv( at - curDateTime ), "seconds" )

   def run( self ):
      '''Timer handler: queue the job unless it is already running or
      pending, then re-arm the timer for the next interval.'''
      self.removeFromactiveExecs()
      cliJob = ( self.jobName, self.logDir, self.maxLogFiles, self.cliCommand,
                 self.timeout, self.verbose )
      # The loop variable must not be called cliJob: in Python 2 a list
      # comprehension's variable leaks into the enclosing scope, which used to
      # replace this job's tuple with the last in-progress entry before the
      # pending-list check below.
      namesInProgress = [ job[ 0 ] for job in self.agent.cliJobsInProgress ]
      if self.jobName in namesInProgress or cliJob in self.agent.cliJobsPending:
         # pylint: disable-msg=E1101
         qt1( "Jobname skipped ", qv( self.jobName ) )
         Logging.log( CliSchedulerLib.SYS_CLI_SCHEDULER_SKIP, self.jobName )
      else:
         self.agent.cliJobsPending.append( cliJob )
         cliSchedDispatcher( self.agent )
      self.act.timeMin = Tac.endOfTime \
          if self.interval == CliSchedulerLib.scheduleOnce \
          else Tac.now() + self.interval * 60

   def removeFromactiveExecs( self ):
      '''Stop tracking this job for clock changes; the shared time poller is
      parked and released once no job is tracked any more.'''
      if self in CliSchedExec.activeExecsWithAt_:
         CliSchedExec.activeExecsWithAt_.remove( self )
         if len(CliSchedExec.activeExecsWithAt_) == 0:
            qt0( "Removing time poller" )
            CliSchedExec.timePoller_.timeMin = Tac.endOfTime
            CliSchedExec.timePoller_ = None

   def close( self ):
      '''Cancel the timer and withdraw the job from the pending list.

      Safe to call more than once, and on an instance whose __init__ did not
      get as far as creating the timer (close() is also reached via __del__).
      '''
      if not getattr( self, 'act', None ):
         return
      qt1( "CliSchedExec job:", qv( self.jobName ), "is being removed" )
      self.removeFromactiveExecs()
      self.act.timeMin = Tac.endOfTime
      cliJob = ( self.jobName, self.logDir, self.maxLogFiles, self.cliCommand,
                 self.timeout, self.verbose )
      try:
         self.agent.cliJobsPending.remove( cliJob )
         qt1( "CliSchedExec job:", qv( self.jobName ), "is removed from list of "\
              "pending jobs" )
      except ValueError:
         qt1( "CliSchedExec failed to find pending job:", qv( self.jobName ) )
      self.act = None

   def __del__( self ):
      self.close()

class ScheduledCliReactor( object ):
   """A reactor for each scheduled CLI command execution job.

   Creates the CliSchedExec that times the job described by cliConfig. No
   executor is created for a run-once job whose status entry already carries
   a last execution time; schedExec is None in that case.
   """
   notifierTypeName = 'System::CliScheduler::ScheduledCli'
   def __init__( self, cliConfig, schedConfig, agent ):
      self.schedConfig = schedConfig
      self.cliConfig = cliConfig
      self.agent = agent
      # Defined up front so that close() works even when the early return
      # below is taken.
      self.schedExec = None
      qt0( 'ScheduledCliReactor: scheduledCli collection has been updated key:',
           qv( cliConfig.name ) )

      if cliConfig.interval == CliSchedulerLib.scheduleOnce:
         if agent.status.scheduledCliJobStatus[ cliConfig.name ].lastExecutionTime:
            # Trace the name as a separate qv argument, like every other trace
            # call in this module, instead of formatting it into the message.
            qt0( 'job', qv( cliConfig.name ), 'has already executed once' )
            return

      self.schedExec = CliSchedExec( self.agent, cliConfig.name,
                        cliConfig.logDir,  cliConfig.startAt, cliConfig.interval,
                        cliConfig.maxLogFiles, cliConfig.cliCommand,
                        int( cliConfig.timeout ), cliConfig.verbose )
      qt0( 'ScheduledCliReactor job:', qv( cliConfig.name ), 'has been scheduled' )

   def close( self ):
      """Stop the job's executor, if one was created."""
      qt0( 'ScheduledCliReactor job:', qv( self.cliConfig.name ), 'close' )
      if self.schedExec is not None:
         self.schedExec.close()

class ConfigReactor( Tac.Notifiee ):
   '''Keeps one ScheduledCliReactor and one status entry per configured job.

   On construction every configured job gets a reactor (and a status entry if
   it has none yet), and status entries of jobs that are no longer configured
   are deleted. Afterwards handleConfig reacts to changes of the scheduledCli
   collection.
   '''
   notifierTypeName = 'System::CliScheduler::Config'
   def __init__( self, agent, notifier, status ):
      self.notifier = notifier
      self.status = status
      self.reactors = {}
      self.agent = agent
      Tac.Notifiee.__init__( self, notifier )
      for key in self.notifier.scheduledCli:
         if key not in self.status.scheduledCliJobStatus:
            self.handleConfig( key )
         else:
            # Keep the existing status so that a run-once job that already
            # executed is not started again.
            self.reactors[ key ] = ScheduledCliReactor(
               self.notifier.scheduledCli[ key ], self.notifier, self.agent )

      # Delete stale status entries. handleConfig deletes from the collection,
      # so iterate over a snapshot of its keys.
      for key in list( self.status.scheduledCliJobStatus ):
         if key not in self.notifier.scheduledCli:
            self.handleConfig( key )

   def _retireReactor( self, name ):
      '''Forget the reactor registered for name and stop its executor.'''
      reactor = self.reactors.pop( name, None )
      if reactor is None:
         return
      # A reactor built for a run-once job that had already executed may not
      # carry an executor; there is nothing to stop then.
      if getattr( reactor, 'schedExec', None ) is not None:
         reactor.close()

   @Tac.handler( 'scheduledCli' )
   def handleConfig( self, name ):
      '''(Re)create or remove the reactor and status entry for job name.'''
      if name in self.notifier.scheduledCli:
         # Stop a reactor left from a previous definition of this job first,
         # so that its timer and pending entry do not outlive it.
         self._retireReactor( name )
         schedStatusType = Tac.Type( "System::CliScheduler::"\
                                     "ScheduledCliJobStatus" )
         schedStatus = schedStatusType( name, False, 0, 0, 0 )
         self.status.scheduledCliJobStatus.addMember( schedStatus )
         self.reactors[ name ] = ScheduledCliReactor(
            self.notifier.scheduledCli[ name ], self.notifier, self.agent )
      else:
         if name in self.reactors:
            qt1( 'handleConfig removing reactor for job:', qv( name ) )
            self._retireReactor( name )
         if name in self.status.scheduledCliJobStatus:
            qt1( 'handleConfig removing status for job:', qv( name ) )
            del self.status.scheduledCliJobStatus[ name ]

   def close( self ):
      qt0( 'ConfigReactor close' )
      Tac.Notifiee.close( self )

class LowMemReactor( Tac.Notifiee ):
   '''Mirrors the low-memory-mode status into the scheduler's enabled flag.'''
   notifierTypeName = 'ProcMgr::LowMemoryModeStatus'

   def __init__( self, notifier, status ):
      self.notifier = notifier
      self.status = status
      Tac.Notifiee.__init__( self, notifier )
      # Apply the current state right away instead of waiting for a change.
      self.handleStatus()

   @Tac.handler( 'status' )
   def handleStatus( self ):
      '''Disable the scheduler while low-memory mode is on, enable it
      otherwise, and log which of the two happened.'''
      lowMemory = bool( self.notifier.status )
      if lowMemory:
         logId = CliSchedulerLib.SYS_MEMORY_EXHAUSTION_CLI_SCHEDULER_DISABLED
      else:
         logId = CliSchedulerLib.SYS_CLI_SCHEDULER_ENABLED
      Logging.log( logId )
      self.status.enabled = not lowMemory

class CliSchedulerMgr( SuperServer.SuperServerAgent ):
   '''SuperServer agent that owns the Cli scheduler's state.

   Holds the pending and in-progress job lists and the per-job subprocess
   tables used by the dispatcher, mounts the scheduler's config and status,
   and creates the reactors once the agent is active and the system has been
   up for at least five minutes.
   '''

   def __init__( self, entityManager, rootFs ):
      '''rootFs is the directory under which the default "schedule/" log
      directory is created by the dispatcher.'''
      SuperServer.SuperServerAgent.__init__( self, entityManager )
      mg = entityManager.mountGroup()
      self.rootFs = rootFs
      self.cliJobsInProgress = []
      self.cliJobSubprocess = {}
      self.ddJobSubprocess = {}
      self.cliJobsPending = []
      self.gzipJobSubprocess = {}
      self.cliSchedAct = None
      self.cliSchedSysname = None
      self.act = None
      self.reactor = None
      self.lowMemReactor = None
      self.lowMemToggleEnabled = Toggles.ProcMgrToggleLib.toggleLowMemModeEnabled()
      self.config = mg.mount( 'sys/clischeduler/config',
                              'System::CliScheduler::Config', 'r' )
      self.status = mg.mount(  'sys/clischeduler/status',
                              'System::CliScheduler::Status', 'w' )
      self.lowMemStatus = mg.mount( 'sys/status/lowMemStatus',
                              'ProcMgr::LowMemoryModeStatus', 'r' )

      def _finish():
         self.cliSchedSysname = entityManager.sysname()
         # don't run until active (rpr or sso)
         if not self.active():
            return
         # First field of /proc/uptime is the uptime in seconds; keep whole
         # minutes. The file is closed as soon as it has been read.
         with open( '/proc/uptime' ) as uptimeFile:
            uptime = int( float( uptimeFile.read().split()[ 0 ] ) ) // 60
         # Delay execution of CLI command until 5 mins after system bringup
         if uptime >= 5:
            if self.lowMemToggleEnabled:
               self.lowMemReactor = LowMemReactor( self.lowMemStatus, self.status )
            self.reactor = ConfigReactor( self, self.config, self.status )
         else:
            qt0( 'Deferring Cli scheduler for now ...' )
            self.defer( ( 5 - uptime ) * 60 )
      mg.close( _finish )

   def defer( self, delay ):
      '''Arrange for run() to be called after delay (added to Tac.now()).'''
      self.act = Tac.ClockNotifiee( handler=self.run )
      self.act.timeMin = Tac.now() + delay

   def run( self ):
      '''Deferred start: create the reactors, then park the timer.'''
      if self.lowMemToggleEnabled and not self.lowMemReactor:
         self.lowMemReactor = LowMemReactor( self.lowMemStatus, self.status )
      self.reactor = ConfigReactor( self, self.config, self.status )
      # one-shot: do not fire again unless defer() re-arms the timer
      self.act.timeMin = Tac.endOfTime

   def onSwitchover( self, protocol ):
      '''Reset every job status entry and start the scheduler in 5 minutes.'''
      schedStatusType = Tac.Type( "System::CliScheduler::"\
                                  "ScheduledCliJobStatus" )
      # Entries are replaced while walking the collection, so iterate over a
      # snapshot of its keys.
      for key in list( self.status.scheduledCliJobStatus ):
         schedStatus = schedStatusType( key, False, 0, 0, 0 )
         self.status.scheduledCliJobStatus.addMember( schedStatus )
      # start Cli scheduler in 5 minutes
      qt0( 'Starting Cli scheduler in 5 minutes ...' )
      self.defer( 5 * 60 )

def Plugin( ctx ):
   '''Plugin hook: build the CliSchedulerMgr and register it with ctx.

   The manager's root filesystem is the local filename behind the "flash:" URL.
   '''
   # pylint: disable-msg=E1103
   flashUrl = Url.parseUrl( "flash:", ctx )
   flashRoot = flashUrl.localFilename()
   scheduler = CliSchedulerMgr( ctx.entityManager, flashRoot )
   ctx.registerService( scheduler )
