#!/usr/bin/env python
# Copyright (c) 2017 Arista Networks, Inc.  All rights reserved.
# Arista Networks, Inc. Confidential and Proprietary.


# pkgdeps: rpm dpdk
# pkgdeps: rpm pciutils

"""This module contains a plugin that manages Sfe-managed Ethernet phys."""

from __future__ import absolute_import, division, print_function

import json
import os
import re
import platform

from EthIntf import MAX_SUPPORTED_MTU
import Fru
import Tac
import Tracing
from SfFruHelper import DEVICE, FIRST_MAC, MAC, PCI, DRIVER
from VeosHypervisor import (
getPlatform,
platformCloud,
platformCloudOnly,
platformBareMetal,
platformAzure,
)

# Trace handle for this module; presumably picked up implicitly by the
# Tracing.traceN helpers aliased below (confirm against the Tracing module).
__defaultTraceHandle__ = Tracing.Handle( "Fru.Sfe" )
# Extend whatever the TRACE environment variable already requests with
# 'Fru.Sfe/*'. When TRACE is unset the joined setting starts with an empty
# element, i.e. it is ",Fru.Sfe/*".
Tracing.traceSettingIs( ",".join( [ os.environ.get( "TRACE", "" ) ] +
   [ 'Fru.Sfe/*' ] ) )

# Short aliases for the trace levels used throughout this file.
traceDetail = Tracing.trace9
traceNormal = Tracing.trace8
traceAlert = Tracing.trace1
traceError = Tracing.trace0

# JSON file holding the device cache (device names plus per-device MAC, PCI
# address and driver); read by _readDeviceCache and created by
# _updateDeviceCache on bare-metal platforms.
deviceCacheFile = "/var/run/sfaFruPluginDevices.json"

# Driver name reported by _getDriver for the DPS interface.
DRIVER_DEFAULT = "BESS"
# Kernel device name of the DPS interface; it is special-cased wherever a PCI
# address or driver is looked up.
DPS_INTF = "et100"
# Placeholder PCI address reported by _getPci for the DPS interface.
DPS_PCI = "0000:ff:ff.f"

class SfePhyDriver( Fru.FruDriver ):
   """This Fru plugin manages any object of type Inventory::Phy::SfePhyDir.

   On construction it publishes a Hardware::Phy::SfePhyConfigDir under the
   managing cell's "phy/sfe" directory and, when the inventory contains phys,
   prepares the host for Sfe: kernel modules, DPDK device binding and huge
   pages."""

   # Fru drivers listed here are prerequisites of this one
   # (NOTE(review): exact ordering semantics are defined by Fru.FruDriver).
   requires = [ Fru.FruDriver.systemInit, Fru.FruDriver.interfaceInit ]

   # Inventory type this driver is registered for.
   managedTypeName = "Inventory::Phy::SfePhyDir"
   # NOTE(review): as a regular expression "$" only matches at end of input;
   # presumably this means "no specific API version" -- confirm against Fru.
   managedApiRe = "$"

   def __init__( self, phyDir, parentMib, parentDriver, driverCtx ):
      """Create the Sfe phy configuration for 'phyDir' and initialize Sfe.

      In 'sfe_failsafe' mode the constructor returns immediately, before the
      base class is initialized and before anything is published. Otherwise a
      Hardware::Phy::SfePhyConfigDir named "config" is created under the
      managing cell's "phy/sfe" directory, and, if the inventory holds any
      phys, the devices are handed to DPDK via _sfeInitialize."""
      traceAlert( "Creating a Fru driver for the PhyDir" )
      self.platform = getPlatform() or ''

      # When Sfe runs on a hardware platform with an ASIC (e.g. 7170) the
      # parent driver carries no veosConfig attribute, hence the hasattr test.
      inFailsafe = ( parentDriver and hasattr( parentDriver, 'veosConfig' ) and
                     parentDriver.veosConfig[ 'MODE' ] == 'sfe_failsafe' )
      if inFailsafe:
         traceAlert( "Skipping Sfe PhyDir driver in failsafe mode" )
         return

      Fru.FruDriver.__init__( self, phyDir, parentMib, parentDriver, driverCtx )
      cellId = Fru.fruBase( phyDir ).managingCellId
      hwCellDir = driverCtx.sysdbRoot[ "hardware" ][ "cell" ][ str( cellId ) ]
      launchConfig = driverCtx.sysdbRoot[ "hardware" ][ "sfe" ][ "launcherConfig" ]

      sfeDir = hwCellDir.mkdir( "phy/sfe" )
      phyConfig = sfeDir.newEntity( "Hardware::Phy::SfePhyConfigDir", "config" )
      self.phyConfig_ = phyConfig
      phyConfig.hwPlatformOpenFlow = False

      # Sub-interfaces are advertised unless an SR-IOV device is present.
      # Cloud-only platforms always advertise them: this is a temporary change
      # to enable the SubIntf CLI there, while the datapath still disables the
      # feature through the interface specific knob.
      phyConfig.hwCapability.l3SubintfSupported = (
         bool( platformCloudOnly() ) or not self.hasSriovDevice() )
      traceDetail( "hwCapability.l3SubintfSupported = %s" %
                   phyConfig.hwCapability.l3SubintfSupported )

      if not phyDir.phy:
         return

      self._setDeviceNames( phyDir )
      self._deviceCache = {}
      self._dpdkDevices = []
      self._initDeviceCache()
      self._updateDeviceCache( phyDir )
      if self._sfeInitialize( phyDir ):
         phyConfig.hwPlatform = True
         self._allocateHugePages()
         # For now only a single encapsulating phy is supported and it
         # cannot be mixed with non-encapsulating phys.
         assert len( phyDir.phy ) == 1
      else:
         launchConfig.newEntity( 'Tac::Dir', 'Sfe' )
      phyConfig.sfeFruPluginDone = True
      traceDetail( "hwPlatform:%s hwPlatformOpenFlow:%s sfeFruPluginDone:%s" %
                   ( phyConfig.hwPlatform, phyConfig.hwPlatformOpenFlow,
                     phyConfig.sfeFruPluginDone ) )

   def hasSriovDevice( self ):
      """Return True if lspci lists an Ethernet 'Virtual Function' device.

      A failing lspci invocation is traced and treated as "no SR-IOV
      device"."""
      try:
         listing = Tac.run( [ '/sbin/lspci' ], stdout=Tac.CAPTURE )
      except Tac.SystemCommandError as e:
         traceAlert( 'Ignoring Caught exception %s ' % ( e ) )
         return False
      return any( 'Ethernet' in entry and 'Virtual Function' in entry
                  for entry in listing.split( "\n" ) )

   def _allocateHugePages( self ):
      """Reserve huge pages for Sfe through /sys/kernel/mm/hugepages.

      Nothing is done when SIMULATION_VMID is set in the environment.
      Otherwise the reservation is best-effort: a pool whose sysfs directory
      is missing is skipped, and a failing write is traced, never raised.

      On 64-bit, one 1GB page (if that pool exists) and 128 2MB pages are
      requested; on 32-bit, only 640 2MB pages are requested."""
      if os.environ.get( 'SIMULATION_VMID' ):
         traceDetail( "Skipping hw platform hugepage allocation" )
         return

      def reserve( hugeDir, count, label ):
         # Write the requested page count into the pool's nr_hugepages file.
         # open()/write()/close() raise IOError on Python 2 (OSError on
         # Python 3), so both are caught; 'with' closes the file either way.
         try:
            with open( hugeDir + "/nr_hugepages", "w" ) as memFile:
               memFile.write( str( count ) )
         except ( IOError, OSError ):
            traceAlert( "Unable to allocate %s hugepages on hw platform" % label )
            return
         traceDetail( "Allocated %s hugepages on running hw platform" % label )

      huge1GBPath = "/sys/kernel/mm/hugepages/hugepages-1048576kB"
      huge2MPath = "/sys/kernel/mm/hugepages/hugepages-2048kB"
      thirtyTwoBit = ( platform.architecture()[ 0 ] == '32bit' )
      # We run Sfe in 32-bit mode on some mixed ASIC platforms such as the 7170.
      # In this mode having 1G huge pages causes the Sfe process to use up more
      # of its virtual memory address space because the huge pages are mapped in
      # chunks of 1G into the address space. Using 2M pages is not as good for
      # caching but increases the granularity in which memory can be mapped,
      # reducing the overall virtual memory usage of the Sfe process and thus
      # allowing for more route scale.
      if not thirtyTwoBit and os.path.isdir( huge1GBPath ):
         reserve( huge1GBPath, 1, "1x1GB" )
      if os.path.isdir( huge2MPath ):
         count2M = 640 if thirtyTwoBit else 128
         reserve( huge2MPath, count2M, "%dx2M" % count2M )

   def addDeviceInCache( self, devName, mac, pci, driver ):
      """Record 'devName' with its MAC, PCI address and driver in the cache.

      The device name is appended to the DEVICE list only once; the three
      per-device values are stored UTF-8 encoded and overwrite any previous
      entry for the same device."""
      cache = self._deviceCache
      if devName not in cache[ DEVICE ]:
         cache[ DEVICE ].append( devName )
      for section, value in ( ( MAC, mac ), ( PCI, pci ), ( DRIVER, driver ) ):
         cache[ section ][ devName ] = value.encode( 'utf-8' )

   def _updateDeviceCache( self, phyDir ):
      """Create the device cache file on bare-metal platforms if it is missing.

      On bare metal the cache file is not written by Sfa Fru, so it is built
      here from 'ethtool -i' output for every phy except the DPS interface.
      Nothing happens on other platforms or when the file already exists.

      The file is written only after all devices have been collected.
      _readDeviceCache later replaces the in-memory cache with the file
      content, so anything not persisted here would be lost."""
      if not platformBareMetal() or os.path.isfile( deviceCacheFile ):
         return
      for phy in phyDir.phy.values():
         devName = phy.port.intfId.replace( "Ethernet", "et" )
         if devName == DPS_INTF:
            # The DPS interface has no kernel device to query.
            continue
         output = Tac.run( [ "ethtool", "-i", devName ], stdout=Tac.CAPTURE )
         pci = re.search( 'bus-info: (.+)', output ).group( 1 )
         driver = re.search( 'driver: (.+)', output ).group( 1 )
         mac = phy.port.macAddr
         self.addDeviceInCache( devName, mac, pci, driver )
      # 'with' guarantees the file is closed even if serialization fails.
      with open( deviceCacheFile, 'w' ) as f:
         json.dump( self._deviceCache, f )

   def _setDeviceNames( self, phyDir ):
      """Set the kernel device name on each non-management EthIntfStatus.

      For all front-panel interfaces managed by PhyEthtool the deviceName on
      the EthIntfStatus has to be set here (just like Fru does for the
      management interfaces). The name is the interface id with "Ethernet"
      replaced by "et". Ports without an intfStatus are skipped."""
      for invPhy in phyDir.phy.values():
         port = invPhy.port
         status = port.intfStatus
         isFrontPanel = port.role != "Management"
         if not isFrontPanel or status is None:
            continue
         status.deviceName = port.intfId.replace( "Ethernet", "et" )
         traceDetail( "Set intfStatus kernel device name for %s to %s" %
                      ( port.intfId, status.deviceName ) )

   def _initDeviceCache( self ):
      """Reset every section of the device cache to its empty value."""
      self._deviceCache.update( {
         FIRST_MAC: "",
         DEVICE: [],
         MAC: {},
         PCI: {},
         DRIVER: {},
      } )

   def _readDeviceCache( self ):
      """Load the device cache file and collect the PCI ids reported by DPDK.

      The in-memory cache is replaced by the content of the cache file. Then
      dpdk-devbind.py is asked for the network devices of one status type and
      the first column of every listed device is added to self._dpdkDevices
      (duplicates and blank lines are skipped).

      Raises whatever open() / json.load() raise when the cache file is
      missing or malformed."""
      # 'with' closes the file even when the JSON cannot be parsed.
      with open( deviceCacheFile, 'r' ) as f:
         self._deviceCache = json.load( f )
      # On Azure, the Accelerated networking devices are always attached to
      # the kernel. On bare-metal platforms, devices are initially attached to
      # the kernel.
      if platformAzure() or platformBareMetal():
         statusType = "kernel"
      else:
         statusType = "dpdk"
      dpdkDevs = Tac.run( [ "/usr/share/dpdk/tools/dpdk-devbind.py", "--status",
                            "--status-dev", "net", "--status-type", statusType ],
                            stdout=Tac.CAPTURE )
      # Drop the three header lines and the trailing element after the last
      # newline.
      for dev in dpdkDevs.split( "\n" )[ 3 : -1 ]:
         pci = dev.split( " " )[ 0 ]
         if pci and pci not in self._dpdkDevices:
            self._dpdkDevices.append( pci )
      traceDetail( "Device Cache = %r" % self._deviceCache )
      traceDetail( "DPDK devices = %r" % self._dpdkDevices )

   def _getPci( self, devName ):
      """Return the cached PCI address of 'devName', or "" if it is unusable.

      The DPS interface always maps to DPS_PCI. For any other device the
      cached address is returned only when the address is listed in
      self._dpdkDevices, when SIMULATION_VMID is set in the environment, or
      when the device name contains "hv_et"; otherwise "" is returned."""
      if devName == DPS_INTF:
         return DPS_PCI

      cached = self._deviceCache.get( PCI, {} ).get( devName, "" )
      traceDetail( "PCI in Cache = %r" % cached )
      if cached == "":
         # Nothing known about this device.
         traceDetail( "PCI = %r Not Found in Cache" % cached )
         return ""

      if cached in self._dpdkDevices or 'SIMULATION_VMID' in os.environ:
         # DPDK has this device.
         traceDetail( "PCI = %r Found" % cached )
         return cached

      if "hv_et" in devName:
         # netvsc device is still with the kernel.
         return cached

      traceDetail( "PCI = %r Neither with kernel nor with DPDK" % cached )
      return ""

   def _isDriverSriov( self, driver ):
      return driver.lower().endswith( "vf" )

   def _getDriver( self, devName ):
      """Return the cached driver name of 'devName' ("" when unknown).

      The DPS interface always reports DRIVER_DEFAULT. The lookup result is
      traced together with whether the driver looks like an SR-IOV one."""
      if devName == DPS_INTF:
         return DRIVER_DEFAULT

      try:
         pciDriver = self._deviceCache[ DRIVER ].get( devName, "" )
      except KeyError:
         # The cache has no driver section at all.
         pciDriver = ""
      traceDetail( "Driver in Cache = %r" % pciDriver )

      if pciDriver == "":
         verdict = "Driver for %r Not Found in Cache" % devName
      elif self._isDriverSriov( pciDriver ):
         verdict = "Driver = %r is SR-IOV" % pciDriver
      else:
         verdict = "Driver = %r is non SR-IOV" % pciDriver
      traceDetail( verdict )
      return pciDriver

   def _sfeInitialize( self, phyDir ):
      """Prepare the host for Sfe and publish one SfePhy per inventory phy.

      Steps, in order:
        1. Write '1' into every smp_affinity file under /proc/irq/.
        2. Load the device cache (_readDeviceCache).
        3. Detect a soft restart (both /dev/hugepages1G and /dev/hugepages2M
           already exist); on a first start load the kernel modules instead.
        4. For every phy: determine its PCI address and driver, bind it to a
           DPDK-usable driver if needed, and create the SfePhy entry under
           self.phyConfig_.phy.
        5. On a first start create and mount the hugetlbfs directories.
        6. Unless a hot-plug situation was detected, kill bessd.

      Returns True if at least one phy is marked 'encapsulating'.

      Raises Tac.SystemCommandError from any Tac.run call that is not wrapped
      in a try block below, and AssertionError when a device other than the
      DPS interface has no usable cached PCI address."""
      # Internal func to set map all IRQs to core 0
      def changeSmpAffinity( unused, dir_name, files ):
         """os.path.walk visitor: write '1' to each smp_affinity file found.

         Files that cannot be opened or written are silently skipped."""
         for f in files:
            if f != 'smp_affinity':
               continue
            fname = dir_name + '/' + f
            try:
               with open( fname, 'w' ) as fd:
                  fd.write( '1' )
            except IOError:
               continue
      traceAlert( " _sfeInitialize: Mapping all IRQs to core 0" )
      # NOTE(review): os.path.walk only exists in Python 2.
      os.path.walk( '/proc/irq/', changeSmpAffinity, 0 )
      traceAlert( "_sfeInitialize: rebinding interfaces to DPDK drivers" )
      # softRestart: the hugepage mount points already exist (see below).
      # hotPlug: set in the per-phy loop; when True bessd is left running.
      # encapsulating: becomes True if any phy has fruPhy.encapsulating set.
      # vdevName: only assigned for Azure devices other than the DPS interface.
      softRestart = False
      hotPlug = False
      encapsulating = False
      vdevName = ''
      self._readDeviceCache()
      if os.path.isdir( "/dev/hugepages1G" ) and os.path.isdir( "/dev/hugepages2M" ):
         # System has already been initialized implying that Fru has been restarted.
         # Don't initialize drivers, mappings and huge pages. Kill bessd if
         # it is running
         traceAlert( "_sfeInitialize: softRestart is true" )
         traceDetail( "_sfeInitialize: softRestart is true" )
         softRestart = True

      if not softRestart:
         # Install bess and igb_uio kernel modules
         traceAlert( "_sfeInitialize: Loading kernel modules" )
         traceDetail( "_sfeInitialize: Loading kernel modules" )
         # No modules are loaded when running under simulation.
         if 'SIMULATION_VMID' not in os.environ:
            Tac.run( [ "modprobe", "-a", "uio" ] )
            Tac.run( [ "modprobe", "-a", "bess" ] )
            Tac.run( [ "modprobe", "-a", "igb_uio" ] )
            # Install ib_uverbs required only for Azure Sfe
            if self.platform == 'Azure':
               Tac.run( [ "modprobe", "-a", "ib_uverbs" ] )
               Tac.run( [ "modprobe", "-a", "uio_hv_generic" ] )

      # Bind all non management PCI devices to the igb_uio module.
      dpdkDir = "/usr/share/dpdk/"
      traceDetail( "Creating Hardware::Phy::SfePhy for %r" % phyDir.phy.keys() )
      for fruPhy in phyDir.phy.values():
         # The kernel device name is the phy name without its first "Phy".
         devName = fruPhy.name.replace( "Phy", "", 1 )
         traceDetail( "Device %s" % devName )
         # Extract the PCI address in the form NN:MM.O from the device path
         # and bind this device to the DPDK poll mode driver.
         deviceStr = "device"
         if self.platform == 'Azure':
            deviceStr = ""
         sysPath = os.path.join( "/sys/class/net", devName, deviceStr )
         path = os.path.realpath( sysPath )
         dpdkDev = False
         traceDetail( "platform = %r" % self.platform )
         traceDetail( "platformCloud = %r" % platformCloud() )
         traceDetail( "path = %r" % path )
         traceDetail( "devName = %r" % devName )
         traceDetail( "devName in path %r" % ( devName in path ) )
         # NOTE(review): presumably the resolved path no longer contains the
         # device name when the kernel owns the device (the sysfs link points
         # at the bus device), and is left untouched when no such netdev
         # exists -- confirm on each platform, Azure in particular.
         if devName in path:
            # Device might be already bound to DPDK
            pciAddress = self._getPci( devName )
            pciDriver = self._getDriver( devName )
            traceDetail( "Device %s PCI %s Driver %s" %
                         ( devName, pciAddress, pciDriver ) )
            if devName != DPS_INTF:
               assert pciAddress, "ERROR: Device %s, has no pciAddress" % devName
            if devName == DPS_INTF and not platformBareMetal():
               hotPlug = True
            traceDetail( "Device %s is with DPDK" % devName )
            dpdkDev = True
         else:
            # Device is owned by kernel driver
            output = Tac.run( [ "ethtool", "-i", devName ], stdout=Tac.CAPTURE )
            pciAddress = re.search( 'bus-info: (.+)', output ).group( 1 )
            pciDriver = re.search( 'driver: (.+)', output ).group( 1 )

            traceDetail( "Device %s PCI %s Driver %s" %
                         ( devName, pciAddress, pciDriver ) )
            # If this interface was not with DPDK and the scenario
            # is soft restart then it must be a hotplug interface
            if softRestart:
               traceDetail( "Device %s is hot plugged" % devName )
               hotPlug = True
         if self.platform == 'Azure' and devName != DPS_INTF:
            # On Azure the bus-info value is passed on as the vdev name and
            # the PCI address is zeroed; dpdkDev is forced to False so the
            # vmbus rebinding below always runs.
            # NOTE(review): vdevName is not reset per iteration, so a later
            # DPS interface entry inherits the last Azure device's value.
            domain = bus = slot = function = 0
            dpdkDev = False
            vdevName = pciAddress.encode( "ascii" )
         else:
            # Parse "DDDD:BB:SS.F" (all fields hexadecimal).
            domain = int( pciAddress.split( ":" )[ 0 ], 16 )
            bus = int( pciAddress.split( ":" )[ 1 ], 16 )
            slot = int( pciAddress.split( ":" )[ 2 ].split( "." )[ 0 ], 16 )
            function = int( pciAddress.split( ":" )[ 2 ].split( "." )[ 1 ], 16 )

         tacPciAddress = Tac.Value( "Inventory::PciAddress",
                                    domain=domain, bus=bus,
                                    slot=slot, function=function )
         pciDriver = str( pciDriver ).encode( 'utf-8' )
         # Per-interface sub-interface support is off on cloud-only platforms,
         # for SR-IOV drivers and for the DPS interface.
         if ( platformCloudOnly() or self._isDriverSriov( pciDriver ) or
              devName == DPS_INTF ):
            l3SubintfSupported = False
         else:
            l3SubintfSupported = True

         if not dpdkDev:
            traceDetail( "_sfeInitialize: adding device %s PCI %s to DPDK "
                        % ( devName, pciAddress ) )
            if self.platform == 'Azure':
               # Manually bind/unbind hv_netvsc device is required here.
               # Each step is best-effort: a failure is traced and the next
               # step is still attempted.
               netUUID = "f8615163-df3e-46c5-913f-f2d2f965ed0e"
               sysPath = '/sys/bus/vmbus/drivers/'
               cmd = """echo %s > %s/uio_hv_generic/new_id""" % ( netUUID, sysPath )
               try:
                  Tac.run( [ "bash", "-c", cmd ], asRoot=True )
               except Tac.SystemCommandError:
                  traceDetail( "_sfeInitialize: cannot add %s to uio_hv_generic"
                                % ( netUUID ) )

               cmd = """echo %s > %s/hv_netvsc/unbind""" % ( pciAddress, sysPath )
               try:
                  Tac.run( [ "bash", "-c", cmd ], asRoot=True )
               except Tac.SystemCommandError:
                  traceDetail( "_sfeInitialize: cannot unbind %s to hv_netvsc"
                                % ( pciAddress ) )
               cmd = """echo %s > %s/uio_hv_generic/bind""" % ( pciAddress, sysPath )
               try:
                  Tac.run( [ "bash", "-c", cmd ], asRoot=True )
               except Tac.SystemCommandError:
                  traceDetail( "_sfeInitialize: cannot bind %s to uio_hv_generic"
                                % ( pciAddress ) )
            else:
               # We need to use '--force', otherwise the ssh route to the et1 won't
               # let it unbind from kernel driver.
               Tac.run( [ dpdkDir + "tools/dpdk-devbind.py", "--force",
                          "-b", "igb_uio", pciAddress ] )

         port = fruPhy.port
         # 'Phyet' are ethernet interfaces used in vEOS
         traceDetail( "Phy %s --> %s" % ( fruPhy.name, port.intfId ) )

         capability = Tac.newInstance( "Hardware::Phy::SfePhyCapability" )
         capability.l3SubintfSupported = l3SubintfSupported
         traceDetail( "Intf %s capability.l3SubintfSupported = %s" %
                      ( port.intfId, capability.l3SubintfSupported ) )

         if fruPhy.encapsulating:
            encapsulating = True
            # Encapsulating phys are (re)bound to vfio-pci, but only on a
            # first start outside simulation.
            if not softRestart and 'SIMULATION_VMID' not in os.environ:
               # Install VFIO drivers -
               # options are specified in /etc/modprobe.d/vfio.conf
               Tac.run( [ "modprobe", "vfio" ] )
               Tac.run( [ "modprobe", "vfio-iommu-type1" ] )
               Tac.run( [ "modprobe", "vfio-pci" ] )
               Tac.run( [ dpdkDir + "tools/dpdk-devbind.py", "--force", "-b",
                          "vfio-pci", pciAddress ] )

         # Publish the phy. NOTE(review): the meaning of the positional
         # arguments (e.g. 'port.id - 1' and the literal False) is defined by
         # the SfePhy constructor, which is not visible in this file.
         Fru.Dep( self.phyConfig_.phy, fruPhy ).newMember(
            fruPhy.name, port.intfId, MAX_SUPPORTED_MTU, tacPciAddress, pciDriver,
            port.id - 1, False, vdevName, fruPhy.headerFormat, fruPhy.encapsulating,
            capability )

      if not softRestart:
         # Set up for huge tables.
         # Creating these two directories is also what marks the system as
         # initialized for the soft-restart check at the top of this method.
         traceAlert( "_sfeInitialize: - setting up huge pages " )
         Tac.run( [ "mkdir", "-p", "/dev/hugepages1G" ] )
         Tac.run( [ "mkdir", "-p", "/dev/hugepages2M" ] )

         if 'SIMULATION_VMID' not in os.environ:
            # Mount failures are traced but do not abort initialization.
            try:
               Tac.run( [ "mount", "-t", "hugetlbfs", "-o",
                          "pagesize=1G", "none", "/dev/hugepages1G" ] )
            except Tac.SystemCommandError:
               traceAlert( "Fru could not mount 1G hugepages..." )

            try:
               Tac.run( [ "mount", "-t", "hugetlbfs", "-o",
                          "pagesize=2M", "none", "/dev/hugepages2M" ] )
            except Tac.SystemCommandError:
               traceAlert( "Fru could not mount 2M hugepages..." )

      if not hotPlug:
         traceDetail( "Not a hotplug scenario. Restart bess" )
         # pkill's exit status is ignored, so a bessd that is not running is
         # not an error.
         Tac.run( [ "pkill", "bessd" ], ignoreReturnCode=True )

      return encapsulating

def Plugin( context ):
   """Fru plugin entry point: register SfePhyDriver and mount its Sysdb path.

   Besides registering the driver, 'hardware/sfe/launcherConfig' is mounted
   as a Tac::Dir with mode 'wi' through a mount group that is closed with no
   callback."""
   traceDetail( "Sfe plugin registering with Fru..." )
   context.registerDriver( SfePhyDriver )
   mountGroup = context.entityManager.mountGroup()
   mountGroup.mount( 'hardware/sfe/launcherConfig', 'Tac::Dir', 'wi' )
   mountGroup.close( None )
