# Copyright (c) 2007, 2010 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.

import Tac, Agent, Plugins, Logging, Tracing
import Cell
import os, errno
import sys, shutil
import QuickTrace

# Force a dependency for the local entity creation for the interface
# agent plugins. 
# pkgdeps: import DefaultConfigPlugin.Intf

# Short aliases for the QuickTrace helpers used throughout this file: qv wraps
# a run-time value handed to a trace call, qt0/qt1 are the trace0/trace1
# functions (SuperServer and its plugins use level0 and level1 traces).
qv = QuickTrace.Var
qt0 = QuickTrace.trace0
qt1 = QuickTrace.trace1

# Short aliases for the Tracing module's trace0 and trace3 functions.
t0 = Tracing.trace0
t3 = Tracing.trace3

# Log message declarations.
# NOTE(review): the ids declared here are later referenced as bare names
# (SYS_RESTART_SERVICE, SYS_SERVICE_FILESYSTEM_FULL) with pylint's E0602
# suppressed, so Logging.logD presumably makes them available as module-level
# names - confirm against the Logging module.

# Emitted by LinuxService._checkServiceHealth when the status command fails
# and the service is started again.
Logging.logD( id='SYS_RESTART_SERVICE',
              severity=Logging.logWarning,
              format="Service %s is not running. Attempting to restart it.",
              explanation="Periodic health monitoring found that the said "
                          "service was not running, and attempted to "
                          "restart it.",
              recommendedAction="Check if there is a resource conflict "
                                "which prevents the said service from "
                                "running. For example, the same TCP "
                                "listener port may have been allocated "
                                "to more than one service." )

# Emitted by GenericService.writeConfigFile when writing a config file fails
# with ENOSPC. The format arguments are the file name and the service name.
Logging.logD( id="SYS_SERVICE_FILESYSTEM_FULL",
              severity=Logging.logWarning,
              format="Write to %s for service \'%s\' failed because the filesystem "
                     "is full",
              explanation="A service attempted to write a file and failed because "
                          "the filesystem is full",
              recommendedAction="Please delete unused files to free up space." )


# Default timeout handed to Tac.run when running a service command; a service
# can override it per command through serviceCmdTimeout().
# NOTE(review): presumably in seconds - confirm against Tac.run.
defaultTimeout = 20
# Directory holding the "<conf file basename>.lock" files that mark a
# reconfiguration in progress. It is created by SuperServer.__init__.
lockDir = os.path.realpath( "/var/lock" ) + "/SuperServer"

# NOTE: please resist the temptation to put a timestamp or other
#       "helpful" information like a generation ID that changes each
#       time the file is generated into the generated config file,
#       because the code that decides whether the underlying service
#       needs to be restarted does so by generating a new config file
#       based on the current TAC state, comparing that new file to the
#       existing file, and concluding that the service needs to be
#       restarted if the files differ AT ALL. Obviously, a timestamp
#       defeats that check.

configFileHeader = \
"""# NOTE NOTE NOTE NOTE NOTE
#
# This file is AUTO-GENERATED based on the system's configuration.
# Any modifications you make to this file will be lost when the
# system's configuration is changed, e.g. from the CLI.
#

"""

configFileFooter = \
"""

# End of auto-generated file
"""

# Setting environment variable to skip systemd and
# make SuperServer start/stop services.
os.environ[ "SYSTEMCTL_SKIP_REDIRECT" ] = "1"

class GenericService( Tac.Notifiee ):
   """ Base class for a SuperServer service driven by a config entity.

   The instance registers itself as an unfiltered notifiee of 'config'.
   Every onAttribute() notification ends with a call to sync(), which the
   deriving class implements together with warm(). """
   notifierTypeName = "*"

   def __init__( self, serviceName, config, sync=True,
                 configFileHeaderEnabled=True ):
      """ serviceName: name of the service, used in traces and log messages.
      config: the entity whose attribute notifications trigger sync().
      sync: when true, sync() is called once at the end of construction.
      configFileHeaderEnabled: default for writeConfigFile()'s writeHeader. """
      self.serviceName_ = serviceName
      self.configFileHeaderEnabled_ = configFileHeaderEnabled
      Tac.Notifiee.__init__( self, config, filtered=False )
      if sync:
         self.sync()

   def serviceName( self ):
      """ Returns the name this service was created with. """
      return self.serviceName_

   # ------------------------------------------
   # Methods implemented by the deriving class

   def sync( self ):
      """ Called whenever the service's config has changed. """
      raise NotImplementedError

   def warm( self ):
      """ Returns whether the service is now running based on the latest
      version of the service's config. """
      raise NotImplementedError

   def writeConfigFile( self, configFilename, config, saveBackupConfig=True,
                        writeHeader=None, updateInPlace=False ):
      """ Writes 'config' to configFilename.

      configFilename: path of the config file to (re)generate.
      config: the text to write.
      saveBackupConfig: when true, an existing config file is moved to
         configFilename + ".save" before the new file is installed.
      writeHeader: whether to surround 'config' with configFileHeader and
         configFileFooter. None means use the value given to __init__.
      updateInPlace: when true, configFilename is opened and rewritten
         directly; no temporary file is used and no backup is made.

      Returns True when the file was written. Returns False when the write
      failed with ENOSPC; in that case SYS_SERVICE_FILESYSTEM_FULL is logged
      and the temporary file, if one was created, is removed. Any other
      IOError is re-raised to the caller. """
      qt0( "Writing config file '", qv( configFilename ), "' for service",
           qv( self.serviceName_ ) )
      t0( "writeConfigFile", configFilename )
      if writeHeader is None:
         writeHeader = self.configFileHeaderEnabled_

      def writeContents( f ):
         # Emit the optional header, the config itself and the optional footer
         if writeHeader:
            f.write( configFileHeader )
         f.write( config )
         if writeHeader:
            f.write( configFileFooter )

      tmpFileToCleanup = None
      try:
         if updateInPlace:
            # The 'with' block closes the file even when a write fails
            with open( configFilename, "w" ) as f:
               writeContents( f )
            qt0( "Finished writing config file in place" )
            return True

         # ------------------------
         # Step 1: Write the new config file to a temporary location
         # NOTE - we have to create the temporary file in the same directory
         # as the config file, otherwise the rename will fail if the two
         # files are on different devices. As an example, /tmp/ and /etc/
         # are 2 different tmpfs', so you cannot do a rename between them.
         tmpConfigFilename = configFilename + ".new"
         with open( tmpConfigFilename, "w" ) as f:
            # Only remember the temporary file once it has really been created
            tmpFileToCleanup = tmpConfigFilename
            writeContents( f )
         # -----------------------
         # Step 2: Back up any existing config file that might exist,
         # if configured to do so. This can be nice for debugging.
         if saveBackupConfig and os.path.exists( configFilename ):
            saveFilename = configFilename + ".save"
            qt0( "Backing up", qv( configFilename ), "to", qv( saveFilename ) )
            if os.path.exists( saveFilename ):
               qt0( "Overwriting existing backup file", qv( saveFilename ) )
               os.unlink( saveFilename )
            # XXX_APECH Should we be copying the file to the saved location,
            # instead of moving it? The way things are, the config file
            # will not exist for some short amount of time. Is this a
            # problem ?
            try:
               os.rename( configFilename, saveFilename )
            except OSError:
               # Temporary fix for cEOS. Please refer to BUG193504 and BUG193506.
               shutil.copy( configFilename, saveFilename )
               os.unlink( configFilename )
         # ------------------------
         # Step 3: Install the newly generated config file
         os.rename( tmpConfigFilename, configFilename )
         tmpFileToCleanup = None
         qt0( "Finished writing config file" )
      except IOError as e:
         if e.errno == errno.ENOSPC:
            Logging.log( SYS_SERVICE_FILESYSTEM_FULL, # pylint: disable-msg=E0602
                         configFilename, self.serviceName_ )
            qt0( "Error writing config file '", qv( configFilename ), "'",
                 "for service", qv( self.serviceName_ ),
                 "because there is no space left on device" )
            if tmpFileToCleanup:
               os.unlink( tmpFileToCleanup )
            # Indicate failure to write config file
            return False

         else:
            # If there's some other IOError, we probably want to know about it
            raise
      else:
         # Succeeds in writing config file
         return True

   def onAttribute( self, attr, key ):
      """ Notification entry point: forwards the notification to any
      registered handlers, then re-syncs the service. """
      # Notify any handlers that have been registered
      Tac.Notifiee.onAttribute( self, attr, key )

      # Notify the service that its config has changed
      self.sync()


class LinuxService( GenericService ):
   """ Class for a SuperService plugin managing a linux service through
   /sbin/service.

   A config change (sync()) schedules _maybeRestartService() to run after
   serviceRestartDelay_. That handler regenerates the conf file and then
   starts, stops or restarts the service. When healthCheckNeeded is set, the
   service status is additionally polled every healthMonitorInterval. """
   notifierTypeName = "*"

   def __init__( self, serviceName, linuxServiceName, config, confFilename,
                 sync=True, configFileHeaderEnabled=True,
                 healthCheckNeeded=True ):
      """ serviceName: name of the SuperServer service.
      linuxServiceName: name handed to the "service" command.
      config: the entity whose changes trigger a sync().
      confFilename: path of the conf file generated from conf().
      sync: when true, a first sync() is scheduled during construction.
      configFileHeaderEnabled: whether the generated conf file is wrapped in
         configFileHeader / configFileFooter.
      healthCheckNeeded: whether to periodically check the service status. """
      self.linuxServiceName_ = linuxServiceName
      # If serviceName contains linuxServiceName (e.g., sometimes it contains
      # linuxServiceName + vrf), log the more specific name, else use the
      # linuxServiceName
      self.loggingName_ = ( serviceName if linuxServiceName in serviceName
                            else linuxServiceName )
      self.confFilename_ = confFilename
      self.syncs_ = 0
      self.serviceRestartDelay_ = 1
      # Clock activity running _maybeRestartService; armed by sync()
      self.activity_ = Tac.ClockNotifiee()
      self.activity_.handler = self._maybeRestartService
      self.activity_.timeMin = Tac.endOfTime
      self.serviceRestartPending_ = False
      # Can be set to force a service restart on the next
      # sync, even if the config has not changed.
      self.forceRestart_ = False
      # Counters of the start/stop/restart commands issued so far
      self.starts = 0
      self.stops = 0
      self.restarts = 0
      self.healthCheckNeeded_ = healthCheckNeeded
      GenericService.__init__( self, serviceName, config, sync=sync,
                               configFileHeaderEnabled=configFileHeaderEnabled )

      # The monitor activity is used to periodically check on the health of
      # linux services that were started by SuperServer. This ensures that
      # we will eventually recover from the death of a linux service, much
      # as ProcMgr ensures that we recover from an agent death. We expect
      # the linux services to be generally more robust than our agents, but
      # we still need to be able to handle this case
      if self.healthCheckNeeded_:
         self.healthMonitorActivity_ = Tac.ClockNotifiee()
         self.healthMonitorActivity_.handler = self._checkServiceHealth
         self.healthMonitorActivity_.timeMin = Tac.endOfTime
         self.healthMonitorInterval = 60

   def cleanupService( self ):
      """ Called before destroying the service: disarms the restart activity
      and, if present, the health monitor activity. """
      self.activity_.timeMin = Tac.endOfTime
      if self.healthCheckNeeded_:
         self.healthMonitorActivity_.timeMin = Tac.endOfTime

   # ---------------------------
   # Implemented by the deriving class

   def serviceProcessWarm( self ):
      """ Returns whether or not the process associated with the service has
      fully restarted and is running based on the new configuration.
      NOTE - this can be very difficult to get right. Generally, one
      must override the startService, stopService, and restartService
      commands in order to save enough information that one can
      reliably determine whether or not the service has been completely
      restarted and is running based on its new configuration. """
      raise NotImplementedError

   def serviceEnabled( self ):
      """ Returns whether the service is enabled / whether there is
      enough configuration to properly start the service """
      raise NotImplementedError

   def conf( self ):
      """ Returns the contents to write to the service's conf file """
      raise NotImplementedError

   # ---------------------------
   # Service start/stop/restart methods. Can be overridden by the
   # deriving class if any information needs to be saved before the
   # service is restarted.

   def startService( self ):
      """ Runs the "start" service command and counts it in self.starts. """
      self._runServiceCmd( "start" )
      self.starts += 1

   def stopService( self ):
      """ Runs the "stop" service command and counts it in self.stops. """
      self._runServiceCmd( "stop" )
      self.stops += 1

   def restartService( self ):
      """ Runs the "restart" service command and counts it in
      self.restarts. """
      self._runServiceCmd( "restart" )
      self.restarts += 1

   def serviceCmd( self, cmd ):
      """ Returns the argument list used to run service command 'cmd'. """
      return [ "service", self.linuxServiceName_, cmd ]

   def serviceCmdTimeout( self, cmd ):
      """ Returns the timeout to use when running service command 'cmd'. """
      return defaultTimeout

   def checkServiceCmdOutput( self, cmd, output ):
      """Check if the output of the service status command is successful.
      output is splitlines.

      Only cmd == 'status' is supported. Empty output counts as success;
      otherwise some line must contain 'Active: active' or 'is running'. """
      # Added to debug issue in bug179776; Will remove after bug is fixed
      # the second check in if statement is to ensure that no unnecessary
      # loggin is done in SuperServer, but we still catch the exception. This
      # is needed because stdout from Tac.run above is being captured.
      if cmd == 'status':
         matchPttr = ( 'Active: active', 'is running' )
         return not output or any( s in line for s in matchPttr
                                   for line in output )
      else:
         assert False # support for other commands comes later
         return False

   # ----------------------------
   # Implementation of Service class methods. Should not be overridden
   # by deriving classes

   def warm( self ):
      """ Returns False while a restart is pending, otherwise whatever
      serviceProcessWarm() reports. """
      if self.serviceRestartPending_:
         qt0( "LinuxService", qv( self.serviceName_ ),
               "is not warm: restart is pending" )
         return False
      return self.serviceProcessWarm()

   def _punchWatchdog( self ):
      """ Punches the external watchdog through the ProcMgr watchdog reactor
      agent singleton, if it has a watchdogReview. """
      watchdogReactorAgent = Tac.singleton( 'ProcMgr::WatchdogReactorAgent' )
      if watchdogReactorAgent.watchdogReview:
         watchdogReactorAgent.watchdogReview.doPunchExternalWatchdog()

   def startServicePunchWatchdog( self ):
      """ punches watchdog timer before and after call to startService for services
      that can be slow to startup """
      self._punchWatchdog()
      self.startService()
      self._punchWatchdog()

   @staticmethod
   def runServiceCommand( serviceCmd, timeout=defaultTimeout ):
      """ Runs the command given as the argument list 'serviceCmd' with its
      stdout and stderr captured.

      Returns True when the command succeeded, False when it timed out or
      failed. A failure is also reported on stderr together with the
      command's output. """
      cmd = ' '.join( serviceCmd )
      qt0( "running", qv( cmd ) )
      t0( "runing", cmd )
      try:
         Tac.run( serviceCmd, stdout=Tac.CAPTURE, stderr=Tac.CAPTURE,
                  timeout=timeout )
         qt0( qv( cmd ), "- successful" )
      except Tac.Timeout:
         qt0( "Timed out while trying to run", qv( cmd ) )
         return False
      except Tac.SystemCommandError as e:
         qt0( "Unable to run", qv( cmd ) )
         # Message ends with a blank line, exactly as the former print did
         sys.stderr.write( "command '%s' failed: %s, output %s\n\n" %
                           ( cmd, e, e.output ) )
         return False
      return True

   def _runServiceCmd( self, cmd ):
      """ Runs service command 'cmd' (anything but 'status'). When the
      command fails, sync() is called so that the whole reconfiguration is
      retried after serviceRestartDelay_. """
      assert cmd != 'status'
      timeout = self.serviceCmdTimeout( cmd )
      if not LinuxService.runServiceCommand( self.serviceCmd( cmd ),
                                             timeout=timeout ):
         # Retry in a little bit
         self.sync()

   def _maybeRestartService( self ):
      """ Handle changes to our configuration by rewriting our conf file
      and restarting the service if necessary. """

      qt0( "_maybeRestartService", qv( self.loggingName_ ) )

      # Check that we didn't die during our last reconfiguration.  We
      # touch the file /var/lock/SuperServer/<self.confFilename_>.lock around
      # any reconfiguration and restarting of the service, and delete it
      # when we are done. If the file exists, then we did not complete
      # our last reconfig, and must unconditionally restart the
      # service, even if its config file is unchanged
      lockFilename = "%s/%s.lock" % \
            ( lockDir, os.path.basename( self.confFilename_ ) )
      restart = False
      lockFileExists = os.path.exists( lockFilename )
      if lockFileExists:
         qt0( "file", qv( lockFilename ),
               "exists - we died during our last reconfig" )
         # We died during our last reconfig
         restart = True

      # Read the current config in our conf file
      oldConfigFileContents = ""
      if os.path.exists( self.confFilename_ ):
         with open( self.confFilename_ ) as f:
            oldConfigFileContents = f.read()

      # Get the new config, and figure out what the new config file
      # contents would look like.
      newConfig = self.conf()
      if self.configFileHeaderEnabled_:
         newConfigFileContents = configFileHeader + newConfig + configFileFooter
      else:
         newConfigFileContents = newConfig

      resync = False
      if oldConfigFileContents != newConfigFileContents:
         qt0( "config for service", qv( self.loggingName_ ), "has changed" )
         if not lockFileExists:
            os.mknod( lockFilename, 0o666 )
            lockFileExists = True
         configFileWritten = self.writeConfigFile( self.confFilename_, newConfig )
         # A failed write means: do not restart now, retry the whole thing
         resync = not configFileWritten
         restart = configFileWritten
      else:
         qt0( "config for service", qv( self.loggingName_ ),
              "has not changed" )

      if not self.serviceEnabled():
         # Stop
         self.stopService()
      elif restart or self.forceRestart_:
         # Restart, if possible
         if resync:
            self.stopService()
         else:
            self.restartService()
      elif not resync:
         # The config has not changed - make sure that the service is
         # currently running, but do not restart it unnecessarily.
         # This can be accomplished by running "service <servicename>
         # start", which takes no action if the service is already
         # running
         self.startService()

      if self.healthCheckNeeded_:
         if self.serviceEnabled():
            self.healthMonitorActivity_.timeMin = Tac.now() + \
               self.healthMonitorInterval
         else:
            self.healthMonitorActivity_.timeMin = Tac.endOfTime

      # Declare that reconfiguration is done - delete the lock file
      if lockFileExists:
         os.unlink( lockFilename )

      # If writeConfigFile failed, then call sync() to reschedule another call to
      # _maybeRestartService(), allowing it to retry writing the config file
      if resync:
         self.sync()
      else:
         self.forceRestart_ = False
         self.serviceRestartPending_ = False

   def _checkServiceHealth( self ):
      """ Health monitor handler: runs the 'status' service command and
      starts the service again when that command times out or fails. A
      command that succeeds with unexpected output is only reported on
      stderr. The next check is scheduled healthMonitorInterval from now. """

      assert self.healthCheckNeeded_
      if not self.serviceEnabled():
         # This activity should only be scheduled when the service is enabled
         # and running, but it is possible that _maybeRestartService just
         # hasn't been called yet due to the serviceRestartDelay.
         return

      try:
         output = Tac.run( self.serviceCmd( 'status' ),
                           stdout=Tac.CAPTURE, stderr=Tac.CAPTURE,
                           timeout=self.serviceCmdTimeout( 'status' ) )
         output = [ x for x in output.splitlines() if not
                    x.startswith( "Redirecting to /bin/systemctl status" ) ]
         if not self.checkServiceCmdOutput( 'status', output ):
            sys.stderr.write( "ERROR: service %s status\n" % self.loggingName_ )
            for line in output:
               # Two-space prefix plus the separator the former print inserted
               sys.stderr.write( "   %s\n" % line )
      except Tac.Timeout:
         qt0( "Service status", qv( self.loggingName_ ),
             "timed out unexpectedly. Restarting it" )
         self.startServicePunchWatchdog()
      except Tac.SystemCommandError as e:
         qt0( "Service", qv( self.loggingName_ ),
             "died unexpectedly. Restarting it" )
         # Added to debug issue in bug179776; Will remove after bug is fixed
         # NOTE(review): 'e' is passed without qv(), unlike the other run-time
         # values traced in this file - confirm this is intended.
         qt0( "Service", qv( self.loggingName_ ), "output : ", e )
         Logging.log( SYS_RESTART_SERVICE, # pylint: disable-msg=E0602
                      self.loggingName_ )
         self.startServicePunchWatchdog()

      self.healthMonitorActivity_.timeMin = Tac.now() + \
                                            self.healthMonitorInterval

   def sync( self ):
      """ Marks a restart as pending and (re)arms the restart activity to
      fire serviceRestartDelay_ from now. """
      self.syncs_ += 1
      self.serviceRestartPending_ = True
      # Schedule an activity to run in a short while that restarts
      # the service. This delay gives us time to handle multiple-attr
      # notifications without restarting the service each time, e.g. in
      # the case where the system is parsing its startup-config
      # file.
      self.activity_.timeMin = Tac.now() + self.serviceRestartDelay_

   def linuxServiceName( self ):
      """ Returns the name handed to the "service" command. """
      return self.linuxServiceName_

class SystemdService( LinuxService ):
   """ A LinuxService whose commands are issued through systemctl, or through
   /usr/sbin/service when A4_CHROOT is set in the environment. Health
   checking is off unless the caller asks for it. """

   notifierTypeName = "*"

   def __init__( self, serviceName, linuxServiceName, config, confFilename,
                 sync=True, configFileHeaderEnabled=True,
                 healthCheckNeeded=False ):
      # Same arguments as LinuxService; only the healthCheckNeeded default
      # differs.
      LinuxService.__init__(
         self, serviceName, linuxServiceName, config, confFilename,
         sync=sync,
         configFileHeaderEnabled=configFileHeaderEnabled,
         healthCheckNeeded=healthCheckNeeded )

   @classmethod
   def _serviceCmd( cls, serviceName, cmd ):
      """ Builds the argument list that runs 'cmd' for 'serviceName'. """
      inAbuildChroot = os.getenv( "A4_CHROOT" )
      if not inAbuildChroot:
         return [ "/usr/bin/systemctl", cmd, serviceName ]
      # We don't have systemd running in Abuild workspace
      return [ "/usr/sbin/service", serviceName, cmd ]

   def serviceCmd( self, cmd ):
      """ Argument list for running 'cmd' against this instance's service. """
      argv = self._serviceCmd( self.linuxServiceName_, cmd )
      return argv

   def serviceProcessWarm( self ):
      """ To be provided by the deriving class: tells whether the service's
      process has completely restarted and now runs with the new
      configuration.
      NOTE - this is hard to get right. It usually takes overriding
      startService, stopService and restartService to record enough
      information to answer reliably. """
      raise NotImplementedError

   def serviceEnabled( self ):
      """ To be provided by the deriving class: tells whether the service is
      enabled, i.e. whether there is enough configuration to start it. """
      raise NotImplementedError

   def conf( self ):
      """ To be provided by the deriving class: the text to write to the
      service's conf file. """
      raise NotImplementedError

class SwitchoverNotifiee( Tac.Notifiee ):
   """ Reacts to RedundancyStatus changes and relays them to every service
   registered with the agent. """
   notifierTypeName = 'Redundancy::RedundancyStatus'

   def __init__( self, redundancyStatus, agent ):
      Tac.Notifiee.__init__( self, redundancyStatus )
      # Agent whose service_ dictionary is walked by the handlers below
      self.agent_ = agent

   @Tac.handler( 'mode' )
   def handleMode( self ):
      # Only the transition to 'active' is of interest
      if self.notifier_.mode != 'active':
         return
      for svcName, svc in self.agent_.service_.iteritems():
         qt0( "Calling onSwitchover for service", qv( svcName ) )
         svc.onSwitchover( self.notifier_.protocol )

   @Tac.handler( 'protocol' )
   def handleProtocol( self ):
      # Tell every registered service about the new redundancy protocol
      for svcName, svc in self.agent_.service_.iteritems():
         qt0( "Calling onProtocolChange for service", qv( svcName ) )
         svc.onProtocolChange( self.notifier_.protocol )

class SuperServerAgent( Agent.Agent ):
   """ Common base class of the SuperServer agents. """

   def __init__( self, entityManager, **kwargs ):
      # Always created with a warmupWarningInterval of 600
      Agent.Agent.__init__( self, entityManager,
                            warmupWarningInterval=600, **kwargs )

   def intfStatusAll( self ):
      """ The local entity at 'interface/status/all'. """
      em = self.entityManager
      return em.getLocalEntity( 'interface/status/all' )

   def intfStatusLocal( self ):
      """ The local entity at the cell path of 'interface/status/local'. """
      localPath = Cell.path( 'interface/status/local' )
      return self.entityManager.getLocalEntity( localPath )

   def onSwitchover( self, protocol ):
      """ Hook invoked when a switchover from standby to active happens.
      Does nothing by default. """

   def onProtocolChange( self, protocol ):
      """ Hook invoked when the redundancy protocol changes. Does nothing by
      default. """

   def redundancyProtocol( self ):
      """ The protocol reported by the redundancy status. """
      status = self.redundancyStatus()
      return status.protocol

   def active( self ):
      """ Whether the redundancy status mode is 'active'. """
      mode = self.redundancyStatus().mode
      return mode == 'active'

# Module-level handle on the SuperServer instance, kept so that live
# SuperServer state can be inspected after pyclient'ing into the namespace.
superServerAgentPtr = None

# For cohab breadth tests that instantiate SuperServer: call this before the
# test cleanup drops its own local reference to the SuperServer object. Once
# that last local reference goes away, the corresponding finalizer is then
# called immediately.
def clearSuperServerReferences():
   global superServerAgentPtr
   agent = superServerAgentPtr
   if agent:
      # Drop the switchover reactor held by the agent
      agent.switchover_ = None
   superServerAgentPtr = None

class SuperServer( SuperServerAgent ):
   """ The SuperServer agent provides a thin wrapper for starting
   various agents based on plugins. Plugins create services, which
   are really just agents, and register them with the SuperServer. """

   def __init__( self, entityManager, services, pluginPath=None ):
      """ entityManager: handed on to the Agent base class.
      services: passed as 'plugins' to Plugins.loadPlugins in doInit().
      pluginPath: passed as 'pluginPath' to Plugins.loadPlugins. """
      global superServerAgentPtr
      # Registered services, keyed by the service's class name
      self.service_ = {}
      self.services_ = services
      self.pluginPath_ = pluginPath
      self.switchover_ = None
      # Last warm() result seen per service, used to trace only the changes
      self.warmupState_ = {}
      superServerAgentPtr = self
      agentName = self.__class__.__name__
      if "QUICKTRACEDIR" in os.environ:
         qtfile = "%s.qt" % agentName
         # SuperServer and its plugins use level0 & level1 traces
         QuickTrace.initialize( qtfile, "32,32,0,0,0,0,0,0,0,0" )

      # Create directory for config file locks
      if not os.path.exists( lockDir ):
         os.makedirs( lockDir )
      # Keep this the last line since it will invoke doInit() in
      # breadth tests.
      SuperServerAgent.__init__( self, entityManager, agentName=agentName )

   def getDefaultTrace( self ):
      """ Returns the default trace setting for this agent. """
      # by default enable level 3 tracing for warmup events
      return 'SuperServer/3'

   def doInit( self , entityManager ):
      """ Creates the local interface entities, then loads the
      SuperServerPlugin plugins once the first mount group has closed, and
      finally creates the switchover reactor once the mount group used while
      loading the plugins has closed. """
      services = self.services_
      pluginPath = self.pluginPath_

      class Context( object ):
         # Object handed to the plugins: it exposes the entity manager and
         # the function used to register a service.
         def __init__( self, entityManager, registerServiceFn ):
            self.entityManager = entityManager
            self.registerService = registerServiceFn

      mg = entityManager.mountGroup()

      # Create local interface entities
      self.createLocalEntity( "AllIntfStatusDir",
                              "Interface::AllIntfStatusDir",
                              "interface/status/all" )
      self.createLocalEntity( "AllIntfStatusLocalDir",
                              "Interface::AllIntfStatusLocalDir",
                              Cell.path( "interface/status/local" ) )
      self.localEntitiesCreatedIs( True )

      def _doneMountingStage1():
         qt0( "SuperServer: Loading plugins", qv( services ) )
         mg = self.entityManager.mountGroup()
         ctx = Context( self.entityManager, self.registerService )
         Plugins.loadPlugins( 'SuperServerPlugin', context=ctx,
                              plugins=services, pluginPath=pluginPath )

         qt0( "SuperServer: loading plugins complete" )
         mg.close( _doneMountingStage2 )

      def _doneMountingStage2():
         qt0( "Creating switchover reactor" )
         self.switchover_ = SwitchoverNotifiee(
            self.redundancyStatus(), self )

      mg.close( _doneMountingStage1 )

   def registerService( self, service ):
      """ Registers 'service' under its class name. A service of the same
      class registered earlier is replaced. """
      name = service.__class__.__name__
      qt0( "Service", qv( name ), "registered" )
      self.service_[ name ] = service

   def service( self, name ):
      """ Returns the service registered under 'name', or None. """
      return self.service_.get( name )

   def warm( self ):
      """ Returns True once all mounts are complete and every registered
      service reports warm. Stops at the first service that is not warm. """
      if not self.entityManager.cEm_.allMountsComplete():
         return False
      for name, service in self.service_.iteritems():
         warm = service.warm()
         oldWarmState = self.warmupState_.get( name, None )
         if oldWarmState != warm:
            # Only trace when the warm state of a service changes
            self.warmupState_[ name ] = warm
            t3( "service %s is %swarm" % ( name, "" if warm else "not " ) )
         if not warm:
            # The name is a run-time value, so it is wrapped in qv() like at
            # every other trace call site
            qt0( 'Service', qv( name ), 'is not warm yet' )
            return False
      return True
