#!/usr/bin/env python

# Copyright (c) 2018 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.

# archivecheck.py:
#
# Script run every minute as a cron job for archiving logs.
#
# Does nothing if there is no archive configured or if the archive is disabled.
# Otherwise, perform a routine check of the archive status and try to reduce the
# archive size (by deleting older logs) if there is not enough free space.
# Then it starts to copy logs and core files from EOS tmpfs directories /var/log
# and /var/core to the archive.
#
# Archive operations are done using the LogMgr/ArchiveLib library.
# The actual copy mechanism is driven by archivetree.py

import argparse
import os
import fcntl
import sys
import errno
import syslog

import ArchiveLib
import SpaceMgmtLib
import Tac

# Identifier handed to syslog.openlog() in main()
LOG_PREFIX = 'archivecheck'
# File on which an exclusive flock is taken so that only one instance runs
ARCHIVECHECK_LOCKFILE = '/var/lock/LCK..archivecheck.pid'
# Written by the instance that acquired the lock: "<pid> <argv[0]>"
ARCHIVECHECK_PIDFILE = '/var/run/archivecheck.pid'
# NOTE(review): the mount points and marker-file paths below are not referenced
# by the code visible in this file -- confirm whether they are still needed
FLASH_MNTPT = '/mnt/flash'
DRIVE_MNTPT = '/mnt/drive'
SSD_NO_USER_OVERRIDE_FILE = os.path.join( FLASH_MNTPT, 'no_ssd_var' )
LEGACY_ARCHIVE_DISABLED_FILE = os.path.join( FLASH_MNTPT,
                                             '.arista_ssd_archive_disabled' )

# Set from the --verbose command line option in main(); when False, logMsg()
# only emits messages flagged with ignoreVerbose
verbose = False

def logMsg( *args, **kwargs ):
   """Write an informational message to syslog.

   The positional arguments are converted with str() and joined with single
   spaces. The message is only emitted when the module-level 'verbose' flag is
   set, or when the caller passes a true 'ignoreVerbose' keyword argument. Any
   other keyword argument is ignored.
   """
   forced = kwargs.get( 'ignoreVerbose', False )
   if verbose or forced:
      text = ' '.join( [ str( arg ) for arg in args ] )
      syslog.syslog( text )

def logErr( *args ):
   """Write an error message to syslog, regardless of the verbose setting.

   The positional arguments are converted with str(), joined with single spaces
   and prefixed with 'error: '.
   """
   details = ' '.join( [ str( arg ) for arg in args ] )
   syslog.syslog( 'error: ' + details )

def acquireLock( lockPath=None ):
   """Try to take an exclusive, non-blocking flock on the lock file.

   Args:
      lockPath: path of the file to lock. Defaults to ARCHIVECHECK_LOCKFILE
         when None. The file is opened in 'w+' mode (created/truncated).

   Returns:
      The open file object holding the lock, or None when the lock could not
      be taken because flock failed with EACCES or EAGAIN (lock already held).

   Raises:
      IOError: if the file cannot be opened, or if flock fails with any other
         errno.
   """
   if lockPath is None:
      lockPath = ARCHIVECHECK_LOCKFILE

   lockFile = open( lockPath, 'w+' )

   try:
      fcntl.flock( lockFile, fcntl.LOCK_EX | fcntl.LOCK_NB )
   except IOError as e:
      # We are not going to hand the file back to the caller, so close it here
      # instead of leaking the descriptor.
      lockFile.close()
      if e.errno in ( errno.EACCES, errno.EAGAIN ):
         return None
      raise

   return lockFile

def lockScript():
   """Make sure only one instance of this script runs at a time.

   Creates the lock file (world read/writable) through Tac.run as root if it
   does not exist yet, then tries to lock it with acquireLock().

   On success, the pid file is rewritten with "<pid> <argv[0]>" of this
   process. On failure, the pid file is read to report (verbose only) which
   instance is currently running.

   Returns:
      The locked file object, or None if another instance holds the lock.
   """
   if not os.path.exists( ARCHIVECHECK_LOCKFILE ):
      cmds = [
         [ 'touch', ARCHIVECHECK_LOCKFILE ],
         [ 'chmod', '666', ARCHIVECHECK_LOCKFILE ]
      ]
      for cmd in cmds:
         # Need Tac.run for root privileges
         Tac.run( cmd, asRoot=True, stdout=Tac.DISCARD, stderr=Tac.DISCARD )

   lockFile = acquireLock()

   if lockFile is not None:
      with open( ARCHIVECHECK_PIDFILE, 'w+' ) as f:
         f.write( ' '.join( [ str( os.getpid() ), sys.argv[ 0 ] ] ) )
      return lockFile

   # Somebody else holds the lock. The pid file is informational only: it may
   # be missing, or still empty if the lock holder has truncated it but not
   # written its pid yet. Never let that crash this instance.
   try:
      with open( ARCHIVECHECK_PIDFILE, 'r' ) as f:
         content = f.read()
   except ( IOError, OSError ) as e:
      logMsg( 'another archivecheck holds the lock (pid file unreadable: %s)' % e )
      return None

   # Split on the first whitespace run only, the script path may contain spaces
   fields = content.split( None, 1 )
   if len( fields ) != 2:
      logMsg( 'another archivecheck holds the lock (pid file incomplete)' )
      return None

   lastpid = fields[ 0 ]
   script = fields[ 1 ].strip()

   if ( os.path.exists( '/proc/%s' % lastpid )
        and script == sys.argv[ 0 ] ):
      logMsg( 'another archivecheck (pid=%s) is running' % lastpid )

   return None

def unlockScript( lockFile ):
   """Release the flock held on lockFile.

   The file object is only unlocked here; it is not closed.
   """
   releaseOp = fcntl.LOCK_UN
   fcntl.flock( lockFile, releaseOp )

def lookupConfiguredArchive():
   """Return the result of ArchiveLib.Archive.currentArchive().

   Returns:
      The value returned by currentArchive(). Two values get a log entry:
      False is logged (verbose only) as "no archive configured", None is logged
      as an error reading the configuration. None is also returned, after
      logging an error, when currentArchive() raises an AssertionError.
   """
   try:
      configured = ArchiveLib.Archive.currentArchive()
   except AssertionError as err:
      logErr( 'failed to use configured archive:', err )
      return None

   if configured is False:
      logMsg( 'no archive configured' )
   elif configured is None:
      logErr( 'failed to read configuration file or content is invalid' )

   return configured

def checkArchiveStatus( archive ):
   """Run archive.statusCheck() and report whether it completed.

   Returns:
      True if statusCheck() returned normally. False, after logging an error,
      if it raised IOError, OSError, SpaceMgmtLib.Quota.QuotaCmdException or
      Tac.SystemCommandError. Any other exception propagates.
   """
   statusOk = True
   try:
      archive.statusCheck()
   except ( IOError,
            OSError,
            SpaceMgmtLib.Quota.QuotaCmdException,
            Tac.SystemCommandError ) as err:
      logErr( 'archive status check failed:', err )
      statusOk = False
   return statusOk

def checkArchiveSize( archive ):
   """Run archive.sizeCheck() and log what it reports.

   sizeCheck() is expected to return either None or a
   ( totalSize, fileCount, stats ) tuple, where stats maps 'archivedArchives',
   'logs', 'agents' and 'cores' to a sequence whose item 0 is a size and whose
   item 1 is an iterable of strings (logged comma-separated).

   Returns:
      False if sizeCheck() raised IOError/OSError or returned None, True
      otherwise (whether or not totalSize is 0).
   """
   try:
      checkResult = archive.sizeCheck()
   except ( IOError, OSError ) as err:
      logErr( 'failed to perfom archive size check:', err )
      return False

   if checkResult is None:
      logMsg( "archive size check: archive directories don't exist" )
      return False

   totalSize, fileCount, stats = checkResult

   if totalSize == 0:
      logMsg( 'archive size check: no space reclaim needed' )
      return True

   # Field labels are emitted verbatim in syslog, spelling included
   msgFmt = ''.join( (
      'TotalSize={totalSize}',
      ' FileCount={fileCount}',
      ' archivedArchivesSize={aaSize}',
      ' archivedArchviesFiles={aaFiles}',
      ' logSize={logSize}',
      ' logFiles={logFiles}',
      ' agentLogSize={agentSize}',
      ' agentLogFiles={agentFiles}',
      ' coreSize={coreSize}',
      ' coreFiles={coreFiles}',
   ) )

   # ( placeholder prefix in msgFmt, key in stats )
   categories = (
      ( 'aa', 'archivedArchives' ),
      ( 'log', 'logs' ),
      ( 'agent', 'agents' ),
      ( 'core', 'cores' ),
   )
   fmtArgs = { 'totalSize': totalSize, 'fileCount': fileCount }
   for prefix, statKey in categories:
      fmtArgs[ prefix + 'Size' ] = stats[ statKey ][ 0 ]
      fmtArgs[ prefix + 'Files' ] = ','.join( stats[ statKey ][ 1 ] )

   # Always logged, even when not running in verbose mode
   logMsg( 'archive size check reclaim stats:',
           msgFmt.format( **fmtArgs ),
           ignoreVerbose=True )
   return True

def archiveVarLogDir( archive ):
   """Run archivetree.py as root to copy /var/log into archive.path.

   A Tac.SystemCommandError raised by the command is not propagated: it is
   logged as an error together with the exit code.
   """
   archiveTreeCmd = [ 'python', '/usr/bin/archivetree.py' ]
   archiveTreeCmd += [ '--src', '/var/log' ]
   archiveTreeCmd += [ '--archive', archive.path ]

   try:
      Tac.run( archiveTreeCmd,
               asRoot=True,
               stdout=Tac.CAPTURE,
               stderr=Tac.CAPTURE )
   except Tac.SystemCommandError as err:
      exitInfo = 'exitcode=%s:' % err.error
      logErr( 'archiving /var/log failed:', exitInfo, err.args[ 0 ] )

def archiveVarCoreDir( archive ):
   """Run archivetree.py as root to copy /var/core into archive.path.

   Unlike the /var/log variant, the command is given the extra '--root' flag
   (its meaning is defined by archivetree.py). A Tac.SystemCommandError raised
   by the command is not propagated: it is logged as an error together with the
   exit code.
   """
   archiveTreeCmd = [ 'python', '/usr/bin/archivetree.py' ]
   archiveTreeCmd += [ '--src', '/var/core' ]
   archiveTreeCmd += [ '--archive', archive.path ]
   archiveTreeCmd.append( '--root' )

   try:
      Tac.run( archiveTreeCmd,
               asRoot=True,
               stdout=Tac.CAPTURE,
               stderr=Tac.CAPTURE )
   except Tac.SystemCommandError as err:
      exitInfo = 'exitcode=%s:' % err.error
      logErr( 'archiving /var/core failed:', exitInfo, err.args[ 0 ] )

def archiveLogs( archive ):
   """Copy /var/log, then /var/core, to the archive while holding its lock.

   Nothing is copied when archive.dirsExist is false. A Tac.Timeout raised
   while running under archive.runWithLockedAccess() is logged (verbose only)
   and not propagated.
   """
   # Sources are processed one after the other, in this order, to pace
   # ourselves and not use too many cycles:
   # ( source directory, archiving function, message logged afterwards )
   steps = (
      ( '/var/log', archiveVarLogDir, '/var/log directory archived' ),
      ( '/var/core', archiveVarCoreDir, '/var/core directory archived' ),
   )

   try:
      # Keep the archive locked for the whole duration of the copy
      with archive.runWithLockedAccess():
         if archive.dirsExist:
            for srcDir, archiveDir, doneMsg in steps:
               if os.path.isdir( srcDir ):
                  archiveDir( archive )
                  logMsg( doneMsg )
         else:
            # Without archive directories there is nothing to do
            logMsg( "archive directories don't exist" )
   except Tac.Timeout:
      logMsg( 'unable to lock access to the archive' )

def parseArgs():
   """Parse the command line (sys.argv) and return the argparse namespace.

   The only option is -v/--verbose, a boolean flag stored as 'verbose'.
   """
   verboseOpt = {
      'help': 'Log what is being done to syslog',
      'action': 'store_true',
   }
   argParser = argparse.ArgumentParser()
   argParser.add_argument( '-v', '--verbose', **verboseOpt )
   parsed = argParser.parse_args()
   return parsed

def main():
   """Entry point: run one archive check pass.

   Opens syslog, parses the command line, and returns immediately if the
   script lock cannot be taken. Otherwise the steps below run in order, each
   one only if the previous one succeeded, and the script lock is released at
   the end in every case:
     1. look up the configured archive
     2. check the archive status
     3. stop if the archive is disabled
     4. check the archive size
     5. copy /var/log and /var/core to the archive
   """
   global verbose

   syslog.openlog( LOG_PREFIX )
   verbose = parseArgs().verbose

   # Only one instance of this script may run at a time
   lockFile = lockScript()
   if lockFile is None:
      return

   try:
      archive = lookupConfiguredArchive()
      # Nothing to do without a configured archive in a sane state
      if archive and checkArchiveStatus( archive ):
         if archive.disabled:
            logMsg( 'archive disabled' )
         elif checkArchiveSize( archive ):
            # The size check passed: copy logs from the tmpfs directories
            # /var/log and /var/core to the archive
            archiveLogs( archive )
   finally:
      unlockScript( lockFile )

# Run the archive check only when this file is executed as a script
if __name__ == '__main__':
   main()
