#!/usr/bin/env python
# Copyright (c) 2014 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.
#
# Monitor the agent processes once per second and flag any agent that stays in
# the D (disk I/O wait) state for three or more consecutive checks. Log and
# core files are also generated to put load on the archival process.
#
import argparse
import datetime
import logging
import math
import ntpath
import os
from os import walk
import shutil
import signal
import sys
import syslog
import time
from EosSsdSpaceMgmtTests import fileSystemInfo, calcSizeFromPct, \
     genTestCoreFile, cleanupFullSrcTestLoad
from multiprocessing import Process, Queue

# Source-side directories: the log directory is checked for fullness and the
# core directory receives the generated test core files (see FileGeneration).
defaultSrcLogDir = "/var/log"
defaultSrcCoreDir = "/var/core"

# 128 * 1024 bytes; used as the size of the generated test data and core files
oneTwentyEightKiB = 131072

# Global Variables
# Placeholder; ProcWatch.run() rebinds this to an AgentProcesses instance
agentsProcesses = []
# Archive root used when checking for archived copies and when cleaning up
archiveDirectory = "/archive"
# Source directory for generated files; set by main() from its srcDir argument
baseDirectory = ""
# ( process, queue, debug ) tuples for each child process started by main()
clientProcesses = []
# Number of one-second monitoring passes completed by ProcWatch.run()
monitorRunTime = 0
# Paths of the files written by FileGeneration.run()
pathsOfCreatedFiles = []

# Global Constant values
maxDiskSleeps = 3 # Max number of disk sleeps we tolerate, if GE potential issue
maxNumberFiles = 4 # How many copies of a file to keep before deleting
maxRunTimeInSecs = 3000 # Upper bound on the monitoring and file writing loops
minFileUnmodifiedTime = 40 # Must be unmodified at least this many seconds
writeFrequency = 10  # Do a write every 10 seconds to add a log write load

# Pids of the child processes started by main(); parentSigterm() signals them
pids = []

class AgentProcess:
   """
   Record of the process states observed for one active agent process.

   Every call to stateUpdate() counts one observation of the given state.
   consecDiskWaits is the length of the current unbroken run of 'D' (disk
   wait) observations; the other counters are totals per state.
   """
   # States other than 'D' and 'S' that are both counted and logged:
   # state -> ( counter attribute, debug message format )
   _loggedStates = {
      'R': ( 'running', "%s run %d secs" ),
      'W': ( 'pageWaits', "%s pageWaits %d secs" ),
      'T': ( 'sigWaits', "%s sigWaits %d secs" ),
      'Z': ( 'zombieWaits', "%s zombie %d secs" ),
   }

   def __init__( self, pid, ppid, state, name ):
      """ Store the process identity and count its first observed state """
      self.pid, self.ppid, self.name = pid, ppid, name
      self.state = state
      # All counters start at zero; the initial state is counted below
      self.consecDiskWaits = 0
      self.diskWaits = 0
      self.intrSleeps = 0
      self.running = 0
      self.pageWaits = 0
      self.sigWaits = 0
      self.zombieWaits = 0
      self.stateUpdate( state )

   def stateUpdate( self, state ):
      """ Record one observation of state, tracking consecutive disk waits """
      self.state = state
      if state != 'D':
         # Any non-disk state breaks the run of consecutive disk waits
         self.consecDiskWaits = 0
         if state == 'S':
            # Interruptible sleeps are counted but not logged
            self.intrSleeps += 1
         elif state in self._loggedStates:
            counterName, message = self._loggedStates[ state ]
            count = getattr( self, counterName ) + 1
            setattr( self, counterName, count )
            logging.debug( message, self.name, count )
         return

      self.consecDiskWaits += 1
      self.diskWaits += 1
      logging.debug( "Process: %s DiskWait count: %d consecutive: %d",
                     self.name, self.diskWaits, self.consecDiskWaits )

   def consecDiskSleepsLimit( self, limit ):
      """ True when the current run of disk waits has reached limit """
      return not self.consecDiskWaits < limit

   def dump( self ):
      """ Log (at debug level) the identity and all state totals tracked """
      totals = ( self.intrSleeps, self.running, self.diskWaits,
                 self.pageWaits, self.sigWaits, self.zombieWaits )
      logging.debug( "%s pid: %s ppid: %s state(%s) S:%d R:%d D:%d W:%d "
                     "T:%d Z:%d", self.name, self.pid, self.ppid, self.state,
                     *totals )

   # End of class AgentProcess

class AgentProcesses:
   """
   Holds information about active agent processes, keyed by agent name.

   When an agent is seen again under a different pid (it restarted), the old
   record is kept under the key "<name>-<old pid>" so that its counters are
   still part of the table, and a fresh record is started under the name.
   """
   def __init__( self ):
      """ AgentProcesses __init__ function """
      self.processes = {}

   def add( self, name, process ):
      """ Add a process by name; returns the process that was added """
      self.processes[ name ] = process
      return process

   def process( self, name, pid ):
      """
      Look up the tracked process for an agent name.

      Returns the tracked process when name is known and its pid equals pid.
      Returns None when name is unknown or when the pid differs. In the
      latter case the stale record is moved to the key "<name>-<old pid>",
      with its pid left untouched, so the caller can add a new record.
      """
      proc = self.processes.get( name )
      if proc is None:
         return None
      if proc.pid == pid:
         return proc
      # Agent restarted: retire the old record under its own (old) pid
      stale = self.processes.pop( name )
      self.processes[ name + '-' + str( stale.pid ) ] = stale
      return None

   def _parseProcStat( self, procStatus ):
      """
      Split the text of /proc/<pid>/stat into ( pid, name, state, ppid ).

      The command name is wrapped in parentheses and may itself contain
      spaces or parentheses, so it is taken from the first '(' to the last
      ')' rather than by whitespace. Returns None if the text is malformed.
      """
      start = procStatus.find( '(' )
      end = procStatus.rfind( ')' )
      if start < 0 or end < start:
         return None
      pid = procStatus[ :start ].strip()
      rest = procStatus[ end + 1: ].split()
      if not pid or len( rest ) < 2:
         return None
      return pid, procStatus[ start + 1:end ], rest[ 0 ], rest[ 1 ]

   def _netnsAgentName( self, pidStr ):
      """
      Derive the agent name of a "netns" process from /proc/<pid>/cmdline.

      Returns None when the cmdline cannot be read or holds no '/'.
      """
      procCmdPath = os.path.join( "/proc", pidStr, "cmdline" )
      try:
         with open( procCmdPath, "r" ) as procCmdFile:
            procCmdStatus = procCmdFile.read( 4096 )
      except ( IOError, OSError ) as e:
         # process may have exited
         logging.debug( 'proc %s stat cmdline failed: %s',
                        pidStr, os.strerror( e.errno ) )
         return None
      parts = procCmdStatus.split( "/", 1 )
      if len( parts ) < 2:
         logging.debug( 'proc %s cmdline has no path component', pidStr )
         return None
      # Keep the text after the first '/' and before the first '-'
      path = parts[ 1 ].split( "-", 1 )[ 0 ]
      # The last character of the leaf is dropped, as the code always did
      # NOTE(review): presumably a separator left over from cmdline - confirm
      return path_leaf( path )[ :-1 ]

   def processUpdate( self, pidStr ):
      """
      Adds process if missing, otherwise updates the process state.

      pidStr is the pid, as a string, whose /proc entry is read. Returns the
      tracked AgentProcess, or None when the process has exited or its /proc
      data cannot be interpreted.
      """
      procPath = os.path.join( "/proc", pidStr, "stat" )
      try:
         with open( procPath, "r" ) as procStatFile:
            procStatus = procStatFile.read( 4096 )
      except ( IOError, OSError ):
         # process may have exited
         return None

      fields = self._parseProcStat( procStatus )
      if fields is None:
         logging.debug( 'proc %s stat is malformed', pidStr )
         return None
      pid, name, state, ppid = fields

      if name == "netns":
         name = self._netnsAgentName( pidStr )
         if name is None:
            return None

      proc = self.process( name, pid )
      if proc is None:
         # The AgentProcess constructor counts the initial state itself
         return self.add( name, AgentProcess( pid, ppid, state, name ) )
      proc.stateUpdate( state )
      return proc

   # End of class AgentProcesses

def sigNumToName( signum ):
   """
   Convert the signal number to a name and return the name.

   Returns the "SIG..." name defined by the signal module when exactly one
   such name has the value signum, otherwise returns str( signum ). The
   "SIG_..." constants (handlers and mask operations) are not signal names
   and share small integer values with real signals, so they are ignored.
   """
   names = [ key for key in dir( signal )
             if key.startswith( "SIG" ) and not key.startswith( "SIG_" )
             and getattr( signal, key ) == signum ]
   # Decide only after every name has been examined
   if len( names ) == 1:
      return names[ 0 ]
   return str( signum )

def parentSigterm( recvSignal, _ ):
   """
   parentSigterm( recvSignal, frame ):
   ----------------------------------
   Process parent signal, expecting SIGTERM to end process.

   Sends SIGTERM to every child pid recorded in pids, then waits for each
   child in clientProcesses to finish and logs the name it queued. A child
   that can no longer be signalled (for example because it already exited)
   is logged and skipped so the remaining children are still terminated.
   """
   logging.debug( "Parent exiting received signal: %d (%s) " %
                  ( recvSignal, sigNumToName( recvSignal ) ) )

   for pid in pids:
      # print( x ) with one argument also works as a Python 2 print statement
      print( "Client being processed for termination is pid: %s" % pid )
      try:
         os.kill( pid, signal.SIGTERM )
      except OSError as e:
         logging.debug( "Unable to signal pid %s: %s", pid, e )
         continue
      print( "Sending signal to pid %s signal: %d" % ( pid, recvSignal ) )

   for process in clientProcesses:
      # Each entry is a ( process, queue, debug ) tuple
      process[ 0 ].join()
      logging.debug( 'Process result is for %s', process[ 1 ].get() )

   # Return from main program due to signal
   return

def clientSigterm( recvSignal, _ ):
   """
   clientSigterm( recvSignal, frame ):
   ----------------------------------
   Signal handler of the ProcWatch client, expecting SIGTERM: logs the signal
   and the monitor run time, dumps every tracked agent process and exits
   the process with status 0.
   """
   signalName = sigNumToName( recvSignal )
   logging.debug( "Client exiting received signal: %d (%s) " %
                  ( recvSignal, signalName ) )
   logging.debug( "ProcWatch Monitor Run Time: %d seconds", monitorRunTime )
   for tracked in agentsProcesses.processes.values():
      tracked.dump()
   sys.exit( 0 )

def path_leaf( path ):
   """
   Return the last component of path; a trailing separator is ignored, so
   "a/b/" yields "b". ntpath is used for the splitting.
   """
   parent, leaf = ntpath.split( path )
   if leaf:
      return leaf
   # path ended with a separator: the leaf is the last component of parent
   return ntpath.split( parent )[ 1 ]

def is_number( inputString ):
   """ Return True if int() accepts inputString, False on ValueError """
   try:
      int( inputString )
   except ValueError:
      return False
   else:
      return True

def _agentPidFromFileName( agentFile ):
   """ Return the first numeric '-' separated field after the first, or None """
   for field in agentFile.split( "-" )[ 1: ]:
      try:
         int( field )
      except ValueError:
         continue
      return field
   return None

def agentPidsInVarLogAgentDir( agentsDir="/var/log/agents" ):
   """
   Returns the list of pids (as strings) for each agent.

   The pids are taken from the names of the files directly inside agentsDir;
   sub-directories are not searched. File names containing a '.' are
   skipped. For the others the pid is the first '-' separated field, after
   the leading one, that is a number (e.g. "Foo-Bar-123" gives "123"). Names
   without such a field are skipped instead of raising an IndexError. A
   missing or unreadable directory yields an empty list.
   """
   # Only the first entry produced by walk (the top level) is of interest
   _, _, fileNames = next( walk( agentsDir ), ( None, [], [] ) )

   agentPidStrs = []
   for agentFile in fileNames:
      if "." in agentFile:
         continue
      pidStr = _agentPidFromFileName( agentFile )
      if pidStr is not None:
         agentPidStrs.append( pidStr )

   return agentPidStrs

class ProcWatch( Process ):
   """
   Process Watch: Check all the agents are running, making progress not blocked.
   Since the file archival process is not in-line, but is a separate process, we
   would not expect any Agent to block due to log processing or quicktrace writes.
   The agents write directly to tmpfs in memory and the quicktrace and log files
   as well as core files are periodically (like every minute) copied to the SSD
   archive. If the SSD I/O becomes delayed, the agents are not blocked, only the
   archival process, which is not a switching/routing process so if it is slow it
   is not critical to the function of the switch.
   """
   def __init__( self, queue, debug ):
      """
      ProcWatch __init__ function: queue receives this process' name once it
      runs, debug selects debug level logging in the child.
      """
      super( ProcWatch, self ).__init__()
      self.queue = queue
      self.debug = debug

   def run( self ):
      """
      This is the Process Watch child process, forked from the main parent process.

      Once per second, for at most maxRunTimeInSecs passes, the state of every
      agent found by agentPidsInVarLogAgentDir() is updated and an error is
      sent to syslog for each agent whose consecutive disk waits have reached
      maxDiskSleeps.
      """
      global agentsProcesses
      global monitorRunTime
      self.queue.put( self.name )

      # Start monitoring agent processes
      if self.debug:
         logging.basicConfig( stream=sys.stderr, level=logging.DEBUG )
      else:
         logging.basicConfig( stream=sys.stderr )
      logging.debug( 'ProcWatch process pid: %d', os.getpid() )
      signal.signal( signal.SIGTERM, clientSigterm )
      syslog.openlog( "ProcWatch" )

      agentsProcesses = AgentProcesses()

      # Set an upper limit on run time
      while monitorRunTime < maxRunTimeInSecs:
         time.sleep( 1 )
         monitorRunTime += 1
         for agentpidstr in agentPidsInVarLogAgentDir():
            proc = agentsProcesses.processUpdate( agentpidstr )
            if proc and proc.consecDiskSleepsLimit( maxDiskSleeps ):
               syslog.syslog( '%%ERR-LOG: Process %s exceeded %d disk waits:'
                              % ( proc.name, maxDiskSleeps ) )

      # Close the log only once monitoring is over: closing it inside the loop
      # discards the "ProcWatch" ident for every later message
      syslog.closelog()

      # End of ProcWatch run
      return

def compressedFileCreate( dataString, filePath, baseFileName ):
   """
   Write dataString to a new file in directory filePath whose name is
   baseFileName extended with a logrotate style suffix, and return the full
   path of that file. The data is written as given; no compression is done
   here, so the caller passes data that is already compressed.

   Normally we would append our base file name with a .YR-MO-DA_HR:MN:SC.gz
   as a .year-month-day-hour:minute:seconds.gz file extension but Vfat cannot
   handle ':' in the name. Logrotate only supports %Y, %m, %d and %s where %s
   is the number of seconds since the Epoch, 1970-01-01 00:00:00 so the
   suffix used is:
   .YR-MO-DA_epochsecs.tgz

   A failed write (for example a full file system) is tolerated: it is only
   logged at debug level and the path is returned regardless.
   """
   # Read the clock once so the date and the epoch seconds always agree. The
   # epoch seconds are formatted directly since strftime( "%s" ) is a platform
   # extension that not every C library provides.
   now = time.time()
   dateTimeName = time.strftime( ".%Y-%m-%d", time.localtime( now ) ) + \
                  "_%d.tgz" % int( now )
   path = os.path.join( filePath, baseFileName + dateTimeName )

   try:
      with open( path, 'wb' ) as fdesc:
         fdesc.write( dataString )
   except ( IOError, OSError ) as e:
      # It's all good for this test if we fill up the log
      logging.debug( 'Write of %s failed: %s', path, e )
   return path

def archiveDirPath( archiveDir, dirPath ):
   """
   Return dirPath re-rooted under archiveDir: leading and trailing '/' are
   stripped from dirPath and the remainder is joined onto archiveDir.
   """
   return os.path.join( archiveDir, dirPath.strip( '/' ) )

def filesCheckEqualModtimes( modtime1, modtime2 ):
   """ True when both modification times fall within the same whole second """
   wholeSecs1 = math.floor( modtime1 )
   wholeSecs2 = math.floor( modtime2 )
   return wholeSecs1 == wholeSecs2

def directoryCreate( filePath ):
   """ Create filePath, with any missing parents, unless it already exists """
   if os.path.exists( filePath ):
      return
   os.makedirs( filePath )

def directoryRemoveWithContents( dirPath ):
   """ Remove directory dirPath and everything below it; no-op otherwise """
   if not os.path.isdir( dirPath ):
      return
   shutil.rmtree( dirPath )

def directoryCleanup():
   """ Remove the archive directory and then the src (base) directory """
   for dirPath in ( archiveDirectory, baseDirectory ):
      directoryRemoveWithContents( dirPath )

def filesCleanup():
   """
   Verify the archived copies of the created files, then clean up the files
   and the src and archive directories.

   Each created file is looked up by its base name under the archive
   directory. A missing copy is reported to syslog, as is a copy whose size
   differs from the first non-zero size that was seen.
   """
   expectedSize = 0
   for createdPath in pathsOfCreatedFiles:
      archivePath = archiveDirPath( archiveDirectory,
                                    os.path.basename( createdPath ) )
      if os.path.isfile( archivePath ):
         actualSize = os.path.getsize( archivePath )
         if expectedSize == 0:
            # First sized copy seen becomes the reference size
            expectedSize = actualSize
         elif expectedSize != actualSize:
            syslog.syslog( '%%ERR-LOG: Wrong filesize, file: %s' % archivePath )
            syslog.syslog( '%%ERR-LOG: Expected size %d actual %d' %
                           ( expectedSize, actualSize ) )
      else:
         syslog.syslog( '%%ERR-LOG: Missing archive file: %s' % archivePath )

   # The archive test file cleanup takes care of the archive side
   cleanupFullSrcTestLoad()
   directoryCleanup()

def genfileSigterm( recvSignal, _ ):
   """
   genfileSigterm( recvSignal, frame ):
   -----------------------------------
   Signal handler of the file generation process, expecting SIGTERM: logs the
   signal, sleeps 90 seconds, cleans up the files that were created and exits
   the process with status 0.
   """
   signalName = sigNumToName( recvSignal )
   logging.debug( "Received signal: %d (%s) sleep 90 seconds and then exit" %
                  ( recvSignal, signalName ) )
   time.sleep( 90 )
   filesCleanup()
   sys.exit( 0 )

def manageFiles():
   """
   Age out created files: nothing is done until more than maxNumberFiles
   paths have been recorded. After that, every recorded src file is removed
   when it still exists, has a copy in the archive directory, has not been
   modified for minFileUnmodifiedTime seconds and its modification time
   matches that of the archived copy.
   """
   if len( pathsOfCreatedFiles ) <= maxNumberFiles:
      return

   for srcPath in pathsOfCreatedFiles:
      archivePath = archiveDirPath( archiveDirectory,
                                    os.path.basename( srcPath ) )
      # if not archived skip to the next file
      if not ( os.path.isfile( srcPath ) and os.path.isfile( archivePath ) ):
         continue
      srcStats = os.stat( srcPath )
      if time.time() - srcStats.st_mtime < minFileUnmodifiedTime:
         continue
      archiveStats = os.stat( archivePath )
      if filesCheckEqualModtimes( srcStats.st_mtime, archiveStats.st_mtime ):
         os.remove( srcPath )
         logging.debug( 'File management removed file: %s', srcPath )

class FileGeneration( Process ):
   """
   FileGeneration( Process ):
   -------------------------
   File Generation Process: drive file creation
   Generate files to be archived to add load for the archival process so we can
   test it under some modest stress.
   """
   def __init__( self, queue, debug ):
      """
      FileGeneration __init__ function: queue receives this process' name
      once it runs, debug selects debug level logging in the child.
      """
      super( FileGeneration, self ).__init__()
      self.queue = queue
      self.debug = debug

   def almostFullSrcLogDir( self ):
      """
      almostFullSrcLogDir():
      ---------------------
      Check if the src side log directory is almost full, which we define to
      be ~90% full so we have some space to handle any events that may need to
      log information we might want to reference.

      Returns True when the used space has reached 90% of the size reported
      by fileSystemInfo() for the log directory, False otherwise.
      """
      pct = 90
      _, sizeKB, usedKB, _, _, _ = fileSystemInfo( defaultSrcLogDir )
      maxKB = calcSizeFromPct( int( sizeKB ), pct )
      # Compare numbers, not raw fields
      # NOTE(review): assumes usedKB converts like sizeKB does - confirm
      return int( usedKB ) >= maxKB

   def run( self ):
      """
      This is the File Generation child process, forked from the main parent process.

      Phase one writes a compressed test file into the base directory every
      writeFrequency seconds until the src log directory is almost full or
      maxRunTimeInSecs is reached. Phase two uses whatever run time is left
      to generate test core files at the same rate.
      """
      # Imported here since the module does not import zlib at file level
      import zlib

      self.queue.put( self.name )
      stringSize = 1 * oneTwentyEightKiB # generate uncompressed random string size
      baseFileName = "tempfile"

      if self.debug:
         logging.basicConfig( stream=sys.stderr, level=logging.DEBUG )
      else:
         logging.basicConfig( stream=sys.stderr )
      logging.debug( 'FileGeneration process pid: %d', os.getpid() )
      signal.signal( signal.SIGTERM, genfileSigterm )
      syslog.openlog( 'FileGeneration' )
      # os.urandom leaves no file object open, unlike reading /dev/urandom
      randomString = os.urandom( stringSize )
      compressString = zlib.compress( randomString )

      time.sleep( 1 )
      writingTime = 1
      while writingTime < maxRunTimeInSecs:
         path = compressedFileCreate( compressString, baseDirectory, baseFileName )
         pathsOfCreatedFiles.append( path )
         manageFiles()
         time.sleep( writeFrequency )
         writingTime += writeFrequency
         # Stop adding log files once the src log directory is almost full
         if self.almostFullSrcLogDir():
            break

      # Generate core files
      agentName = "testingAgent"
      path = defaultSrcCoreDir
      netSize = None
      netPid = None
      fileSize = 1 * oneTwentyEightKiB
      pidNumStart = 900000
      num = 1

      # Core files should get archived so as long as we don't write them too
      # fast the src core directory cannot fill up, src side files get archived
      # and become symlinks to their archive home, so no need to check for space
      while writingTime < maxRunTimeInSecs:
         pidNum = pidNumStart + num
         genTestCoreFile( path, pidNum, agentName, fileSize, netPid,
                          netSize, srcDest=True )
         num += 1
         time.sleep( writeFrequency )
         writingTime += writeFrequency

      syslog.closelog()
      return

def main( srcDir, archiveDir, debug ):
   """
   This script executes on the DUT. It monitors processes and runs independently
   of other ptest activities, checking all process activity every second.
   Files are generated to add extra load for the archive process to stress it.

   srcDir is the directory the generated files are written to, archiveDir is
   the archive directory and debug enables debug level logging. Both
   directories are removed with their contents before the run starts, and
   srcDir is then created again. The call returns once both child processes
   (ProcWatch and FileGeneration) have ended.
   """
   global archiveDirectory
   global baseDirectory

   if debug:
      logging.basicConfig( stream=sys.stderr, level=logging.DEBUG )
   else:
      logging.basicConfig( stream=sys.stderr )

   signal.signal( signal.SIGTERM, parentSigterm )

   # Set both directories before cleaning, so that the cleanup acts on the
   # directories this run uses rather than on the module defaults
   baseDirectory = srcDir
   archiveDirectory = archiveDir

   # Ensure we have test directories to use that are clean
   directoryCleanup()
   directoryCreate( baseDirectory )

   logging.debug( 'Main parent process pid: %d', os.getpid() )

   # The children are forked after the globals above are set and so see them
   queue = Queue()
   procWatch = ProcWatch( queue, debug )
   clientProcesses.append( ( procWatch, queue, debug ) )
   procWatch.start()
   # print( x ) with one argument also works as a Python 2 print statement
   print( "append client procWatch pid %s" % procWatch.pid )
   pids.append( procWatch.pid )

   procFileGen = FileGeneration( queue, debug )
   clientProcesses.append( ( procFileGen, queue, debug ) )
   procFileGen.start()
   print( "append client procFileGen pid %s" % procFileGen.pid )
   pids.append( procFileGen.pid )

   for process in clientProcesses:
      process[ 0 ].join()

   return

if __name__ == '__main__':
   # Command line: the archive and src directories are mandatory, debug
   # output is optional. The parsed values are handed straight to main().
   argParser = argparse.ArgumentParser( prog=sys.argv[0] )
   argParser.add_argument( '-a', '--archive', required=True,
                           help='target archive directory (/archive)' )
   argParser.add_argument( '-d', '--debug', action='store_true',
                           help='activates program debug statements' )
   argParser.add_argument( '-s', '--src', required=True,
                           help='source dir to search (i.e., /var/core or /var/log)' )
   cliArgs = argParser.parse_args()

   main( cliArgs.src, cliArgs.archive, cliArgs.debug )
