From 479504bdc936a70e373ebb478a9655335b9be83b Mon Sep 17 00:00:00 2001 From: Mateusz Galazyn Date: Thu, 6 Jun 2024 00:19:17 +0200 Subject: [PATCH] Fix flaky test cleanup and port allocation --- cardano-testnet/cardano-testnet.cabal | 3 +- cardano-testnet/src/Cardano/Testnet.hs | 2 +- cardano-testnet/src/Testnet/Ping.hs | 27 ++- cardano-testnet/src/Testnet/Process/Run.hs | 29 +-- cardano-testnet/src/Testnet/Property/Util.hs | 52 ++++- cardano-testnet/src/Testnet/Runtime.hs | 182 ++++++++++++------ cardano-testnet/src/Testnet/Start/Cardano.hs | 95 +++++---- cardano-testnet/src/Testnet/Types.hs | 18 +- .../Test/Cli/Babbage/LeadershipSchedule.hs | 8 +- .../Cardano/Testnet/Test/Cli/KesPeriodInfo.hs | 8 +- 10 files changed, 292 insertions(+), 132 deletions(-) diff --git a/cardano-testnet/cardano-testnet.cabal b/cardano-testnet/cardano-testnet.cabal index 842f68b3e56..e0cec0cdff0 100644 --- a/cardano-testnet/cardano-testnet.cabal +++ b/cardano-testnet/cardano-testnet.cabal @@ -73,8 +73,8 @@ library , ouroboros-network-api , prettyprinter , process - , random , resourcet + , retry , safe-exceptions , scientific , si-timers @@ -82,6 +82,7 @@ library , tasty ^>= 1.5 , tasty-expected-failure , tasty-hedgehog + , temporary , text , time , transformers diff --git a/cardano-testnet/src/Cardano/Testnet.hs b/cardano-testnet/src/Cardano/Testnet.hs index cfc727f607f..c506c054bf3 100644 --- a/cardano-testnet/src/Cardano/Testnet.hs +++ b/cardano-testnet/src/Cardano/Testnet.hs @@ -6,7 +6,7 @@ module Cardano.Testnet ( -- ** Start a testnet cardanoTestnet, cardanoTestnetDefault, - requestAvailablePortNumbers, + retryOnAddressInUseError, -- ** Testnet options CardanoTestnetOptions(..), diff --git a/cardano-testnet/src/Testnet/Ping.hs b/cardano-testnet/src/Testnet/Ping.hs index 8ac6fbc7296..d26a6e7e23d 100644 --- a/cardano-testnet/src/Testnet/Ping.hs +++ b/cardano-testnet/src/Testnet/Ping.hs @@ -9,6 +9,7 @@ module Testnet.Ping ( pingNode , checkSprocket , waitForSprocket + , waitForPortClosed , TestnetMagic , PingClientError(..) ) where @@ -20,13 +21,13 @@ import Cardano.Network.Ping (HandshakeFailure, NodeVersion (..), hands import qualified Codec.CBOR.Read as CBOR import Control.Exception.Safe -import Control.Monad (when) +import Control.Monad import Control.Monad.Class.MonadTime.SI (Time) import qualified Control.Monad.Class.MonadTimer.SI as MT import Control.Monad.IO.Class import Control.Tracer (nullTracer) import qualified Data.ByteString.Lazy as LBS -import Data.Either (isLeft) +import Data.Either import Data.IORef import qualified Data.List as L import Data.Word (Word32) @@ -35,10 +36,11 @@ import Network.Mux.Timeout (TimeoutFn, withTimeoutSerial) import Network.Mux.Types (MiniProtocolDir (InitiatorDir), MiniProtocolNum (..), MuxBearer (read, write), MuxSDU (..), MuxSDUHeader (..), RemoteClockModel (RemoteClockModel)) -import Network.Socket (AddrInfo (..), StructLinger (..)) +import Network.Socket (AddrInfo (..), PortNumber, StructLinger (..)) import qualified Network.Socket as Socket import Prettyprinter +import qualified Hedgehog.Extras.Stock.IO.Network.Socket as IO import qualified Hedgehog.Extras.Stock.IO.Network.Sprocket as IO type TestnetMagic = Word32 @@ -167,6 +169,25 @@ sprocketToAddrInfo sprocket = do [] Socket.AF_UNIX Socket.Stream Socket.defaultProtocol (Socket.SockAddrUnix socketAbsPath) Nothing +-- | Wait until port gets closed. +waitForPortClosed + :: MonadIO m + => MT.DiffTime -- ^ timeout + -> MT.DiffTime -- ^ check interval + -> PortNumber + -> m Bool -- ^ 'True' if port is closed, 'False' if timeout was reached before that +waitForPortClosed timeout interval portNumber = liftIO $ do + lastResult <- newIORef False + _ <- MT.timeout timeout $ loop lastResult + not <$> readIORef lastResult + where + loop lastResult = do + isOpen <- liftIO $ IO.isPortOpen (fromIntegral portNumber) + writeIORef lastResult isOpen + when isOpen $ do + -- repeat when port open + MT.threadDelay interval + loop lastResult data PingClientError = PceDecodingError diff --git a/cardano-testnet/src/Testnet/Process/Run.hs b/cardano-testnet/src/Testnet/Process/Run.hs index 90b81c35b9e..d760d6b899b 100644 --- a/cardano-testnet/src/Testnet/Process/Run.hs +++ b/cardano-testnet/src/Testnet/Process/Run.hs @@ -144,9 +144,11 @@ procCli = GHC.withFrozenCallStack $ H.procFlex "cardano-cli" "CARDANO_CLI" -- | Create a 'CreateProcess' describing how to start the cardano-node process -- and an argument list. procNode - :: [String] + :: HasCallStack + => MonadIO m + => [String] -- ^ Arguments to the CLI command - -> ExceptT ExecutableError IO CreateProcess + -> ExceptT ExecutableError m CreateProcess -- ^ Captured stdout procNode = GHC.withFrozenCallStack $ procFlexNew "cardano-node" "CARDANO_NODE" @@ -234,31 +236,32 @@ resourceAndIOExceptionHandlers = [ Handler $ pure . ProcessIOException , Handler $ pure . ResourceException ] - procFlexNew - :: String + :: MonadIO m + => String -- ^ Cabal package name corresponding to the executable -> String -- ^ Environment variable pointing to the binary to run -> [String] -- ^ Arguments to the CLI command - -> ExceptT ExecutableError IO CreateProcess + -> ExceptT ExecutableError m CreateProcess -- ^ Captured stdout procFlexNew = procFlexNew' H.defaultExecConfig procFlexNew' - :: H.ExecConfig + :: MonadIO m + => H.ExecConfig -> String -- ^ Cabal package name corresponding to the executable -> String -- ^ Environment variable pointing to the binary to run -> [String] -- ^ Arguments to the CLI command - -> ExceptT ExecutableError IO CreateProcess + -> ExceptT ExecutableError m CreateProcess -- ^ Captured stdout procFlexNew' execConfig pkg binaryEnv arguments = GHC.withFrozenCallStack $ do bin <- binFlexNew pkg binaryEnv - return (IO.proc bin arguments) + pure (IO.proc bin arguments) { IO.env = getLast $ H.execConfigEnv execConfig , IO.cwd = getLast $ H.execConfigCwd execConfig -- this allows sending signals to the created processes, without killing the test-suite process @@ -267,11 +270,12 @@ procFlexNew' execConfig pkg binaryEnv arguments = GHC.withFrozenCallStack $ do -- | Compute the path to the binary given a package name or an environment variable override. binFlexNew - :: String + :: MonadIO m + => String -- ^ Package name -> String -- ^ Environment variable pointing to the binary to run - -> ExceptT ExecutableError IO FilePath + -> ExceptT ExecutableError m FilePath -- ^ Path to executable binFlexNew pkg binaryEnv = do maybeEnvBin <- liftIO $ IO.lookupEnv binaryEnv @@ -316,9 +320,10 @@ data ExecutableError -- to a haskell package. It is assumed that the project has already been configured and the -- executable has been built. binDist - :: String + :: MonadIO m + => String -- ^ Package name - -> ExceptT ExecutableError IO FilePath + -> ExceptT ExecutableError m FilePath -- ^ Path to executable binDist pkg = do pJsonFp <- handleIOExceptT RetrievePlanJsonFailure planJsonFile diff --git a/cardano-testnet/src/Testnet/Property/Util.hs b/cardano-testnet/src/Testnet/Property/Util.hs index 9db518d81c7..4453ed9dbcd 100644 --- a/cardano-testnet/src/Testnet/Property/Util.hs +++ b/cardano-testnet/src/Testnet/Property/Util.hs @@ -1,6 +1,7 @@ {-# LANGUAGE DataKinds #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE MonoLocalBinds #-} +{-# LANGUAGE NumericUnderscores #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} @@ -15,14 +16,23 @@ module Testnet.Property.Util import Cardano.Api +import Control.Exception.Safe +import Control.Monad +import Control.Monad.Trans.Resource +import qualified Control.Retry as R import qualified Data.Aeson as Aeson import GHC.Stack +import qualified System.Directory as IO import qualified System.Environment as IO +import System.FilePath (()) import System.Info (os) +import qualified System.IO as IO +import qualified System.IO.Temp as IO import qualified System.IO.Unsafe as IO import qualified Hedgehog as H import qualified Hedgehog.Extras as H +import qualified Hedgehog.Extras.Stock.CallStack as H import Hedgehog.Internal.Property (MonadTest) @@ -43,16 +53,52 @@ integrationRetryWorkspace n workspaceName f = withFrozenCallStack $ if disableRetries then integration $ - H.runFinallies $ H.workspace (workspaceName <> "-no-retries") f + H.runFinallies $ workspace (workspaceName <> "-no-retries") f else integration $ H.retry n $ \i -> - H.runFinallies $ H.workspace (workspaceName <> "-" <> show i) f + H.runFinallies $ workspace (workspaceName <> "-" <> show i) f + +-- | Create a workspace directory which will exist for at least the duration of +-- the supplied block. +-- +-- The directory will have the supplied prefix but contain a generated random +-- suffix to prevent interference between tests +-- +-- The directory will be deleted if the block succeeds, but left behind if +-- the block fails. +-- TODO: this is a version which retries deleting of a workspace on exception - upstream to hedgehog-extras +workspace + :: MonadTest m + => HasCallStack + => MonadResource m + => FilePath + -> (FilePath -> m ()) + -> m () +workspace prefixPath f = withFrozenCallStack $ do + systemTemp <- H.evalIO IO.getCanonicalTemporaryDirectory + maybeKeepWorkspace <- H.evalIO $ IO.lookupEnv "KEEP_WORKSPACE" + ws <- H.evalIO $ IO.createTempDirectory systemTemp $ prefixPath <> "-test" + H.annotate $ "Workspace: " <> ws + H.evalIO $ IO.writeFile (ws "module") H.callerModuleName + f ws + when (os /= "mingw32" && maybeKeepWorkspace /= Just "1") $ do + -- try to delete the directory 5 times, 100ms apart + let retryPolicy = R.constantDelay 100_000 <> R.limitRetries 10 + -- retry only on IOExceptions + ioExH _ = Handler $ \(_ :: IOException) -> pure True + -- For some reason, the temporary directory removal sometimes fails. + -- Lets wrap this in MonadResource try multiple times before we fail. + void + . register + . R.recovering retryPolicy [ioExH] + . const + $ IO.removePathForcibly ws -- | The 'FilePath' in '(FilePath -> H.Integration ())' is the work space directory. -- This is created (and returned) via 'H.workspace'. integrationWorkspace :: HasCallStack => FilePath -> (FilePath -> H.Integration ()) -> H.Property integrationWorkspace workspaceName f = withFrozenCallStack $ - integration $ H.runFinallies $ H.workspace workspaceName f + integration $ H.runFinallies $ workspace workspaceName f isLinux :: Bool isLinux = os == "linux" diff --git a/cardano-testnet/src/Testnet/Runtime.hs b/cardano-testnet/src/Testnet/Runtime.hs index f8c1239b52e..e05ed778961 100644 --- a/cardano-testnet/src/Testnet/Runtime.hs +++ b/cardano-testnet/src/Testnet/Runtime.hs @@ -13,6 +13,7 @@ module Testnet.Runtime ( startNode , startLedgerNewEpochStateLogging + , NodeStartFailure (..) ) where import Cardano.Api @@ -31,12 +32,13 @@ import Data.Aeson import Data.Aeson.Encode.Pretty (encodePretty) import Data.Algorithm.Diff import Data.Algorithm.DiffOutput +import Data.Bifunctor (first) import qualified Data.ByteString.Lazy.Char8 as BSC +import Data.List (isInfixOf) import qualified Data.List as List -import Data.Text (Text, unpack) import GHC.Stack import qualified GHC.Stack as GHC -import Network.Socket (PortNumber) +import Network.Socket (HostAddress, PortNumber) import Prettyprinter (unAnnotate) import qualified System.Directory as IO import System.FilePath @@ -47,7 +49,7 @@ import Testnet.Filepath import qualified Testnet.Ping as Ping import Testnet.Process.Run import Testnet.Types (NodeRuntime (NodeRuntime), TestnetRuntime (configurationFile), - poolSprockets) + poolSprockets, showIpv4Address) import Hedgehog (MonadTest) import qualified Hedgehog as H @@ -61,31 +63,51 @@ data NodeStartFailure | ExecutableRelatedFailure ExecutableError | FileRelatedFailure IOException | NodeExecutableError (Doc Ann) + | NodeAddressAlreadyInUseError (Doc Ann) -- | NodePortNotOpenError IOException | MaxSprocketLengthExceededError deriving Show +-- | Analyze @stderr@ contents and return the appropriate error. If the node didn't start because the address +-- was already in use, 'NodeAddressAlreadyInUse' is returned. +mkNodeNonEmptyStderrError + :: String -- ^ @stderr@ contents + -> NodeStartFailure +mkNodeNonEmptyStderrError stderr' = do + if "Address already in use" `isInfixOf` stderr' + then NodeAddressAlreadyInUseError $ pretty stderr' + else NodeExecutableError $ pretty stderr' + instance Error NodeStartFailure where prettyError = \case ProcessRelatedFailure e -> "Cannot initiate process:" <+> pshow e ExecutableRelatedFailure e -> "Cannot run cardano-node executable" <+> pshow e FileRelatedFailure e -> "File error:" <+> prettyException e NodeExecutableError e -> "Cardano node process did not start:" <+> unAnnotate e + NodeAddressAlreadyInUseError e -> "Cardano node process did not start - address already in use:" <+> unAnnotate e MaxSprocketLengthExceededError -> "Max sprocket length exceeded" -- TODO: We probably want a check that this node has the necessary config files to run and -- if it doesn't we fail hard. -- | Start a node, creating file handles, sockets and temp-dirs. +-- +-- If the port in the function argument was obtained using 'H.randomPort' which binds to the port first and then +-- closes it, on some operating systems, like MacOS, the port can get stuck in TIME_WAIT state for a +-- significant period. Unfortunately there is no Haskell API giving the ability to check that - this +-- means that the user of this function needs to retry on 'NodeAddressAlreadyInUseError' until this +-- function succeeds. +-- (see state diagram in https://www.rfc-editor.org/rfc/rfc793#section-3.2 p. 23.) startNode :: HasCallStack => MonadResource m => MonadCatch m => MonadFail m + => MonadTest m => TmpAbsolutePath -- ^ The temporary absolute path -> String -- ^ The name of the node - -> Text + -> HostAddress -- ^ Node IPv4 address -> PortNumber -- ^ Node port @@ -107,62 +129,110 @@ startNode tp node ipv4 port testnetMagic nodeCmd = GHC.withFrozenCallStack $ do socketRelPath = socketDir node "sock" sprocket = Sprocket tempBaseAbsPath socketRelPath - hNodeStdout <- handleIOExceptionsWith FileRelatedFailure . liftIO $ IO.openFile nodeStdoutFile IO.WriteMode - hNodeStderr <- handleIOExceptionsWith FileRelatedFailure . liftIO $ IO.openFile nodeStderrFile IO.ReadWriteMode + hNodeStdout <- retryOpenFile nodeStdoutFile IO.WriteMode + hNodeStderr <- retryOpenFile nodeStderrFile IO.ReadWriteMode - unless (List.length (H.sprocketArgumentName sprocket) <= H.maxSprocketArgumentNameLength) $ - left MaxSprocketLengthExceededError + -- Sometimes the handles are not getting properly closed when node fails to start. This results in + -- operating system holding the file lock for longer than it's necessary. This in the end prevents retrying + -- node start and acquiring a lock for the same stderr/stdout files again. + closeHandlesOnError [hNodeStdout, hNodeStderr] $ do - let socketAbsPath = H.sprocketSystemName sprocket + unless (List.length (H.sprocketArgumentName sprocket) <= H.maxSprocketArgumentNameLength) $ + left MaxSprocketLengthExceededError - nodeProcess - <- firstExceptT ExecutableRelatedFailure - $ hoistExceptT liftIO $ procNode $ mconcat - [ nodeCmd - , [ "--socket-path", H.sprocketArgumentName sprocket - , "--port", show port - , "--host-addr", unpack ipv4 + let socketAbsPath = H.sprocketSystemName sprocket + + nodeProcess + <- firstExceptT ExecutableRelatedFailure + $ hoistExceptT liftIO $ procNode $ mconcat + [ nodeCmd + , [ "--socket-path", H.sprocketArgumentName sprocket + , "--port", show port + , "--host-addr", showIpv4Address ipv4 + ] ] - ] - - (Just stdIn, _, _, hProcess, _) - <- firstExceptT ProcessRelatedFailure $ initiateProcess - $ nodeProcess - { IO.std_in = IO.CreatePipe, IO.std_out = IO.UseHandle hNodeStdout - , IO.std_err = IO.UseHandle hNodeStderr - , IO.cwd = Just tempBaseAbsPath - } - - -- We force the evaluation of initiateProcess so we can be sure that - -- the process has started. This allows us to read stderr in order - -- to fail early on errors generated from the cardano-node binary. - _ <- liftIO (IO.getPid hProcess) - >>= hoistMaybe (NodeExecutableError $ "startNode:" <+> pretty node <+> "'s process did not start.") - - -- Wait for socket to be created - eSprocketError <- - Ping.waitForSprocket - 30 -- timeout - 0.2 -- check interval - sprocket - - -- If we do have anything on stderr, fail. - stdErrContents <- liftIO $ IO.readFile nodeStderrFile - unless (null stdErrContents) - $ left . NodeExecutableError $ pretty stdErrContents - - -- No stderr and no socket? Fail. - firstExceptT - (\ioex -> - NodeExecutableError . hsep $ - ["Socket", pretty socketAbsPath, "was not created after 30 seconds. There was no output on stderr. Exception:", prettyException ioex]) - $ hoistEither eSprocketError - - -- Ping node and fail on error - Ping.pingNode (fromIntegral testnetMagic) sprocket - >>= (firstExceptT (NodeExecutableError . ("Ping error:" <+>) . prettyError) . hoistEither) - - pure $ NodeRuntime node ipv4 port sprocket stdIn nodeStdoutFile nodeStderrFile hProcess + + -- The port number if it is obtained using 'H.randomPort', it is firstly bound to and then closed. The closing + -- and release in the operating system is done asynchronously and can be slow. Here we wait until the port + -- is out of CLOSING state. + H.note_ $ "Waiting for port " <> show port <> " to be available before starting node" + H.assertM $ Ping.waitForPortClosed 30 0.1 port + + (Just stdIn, _, _, hProcess, _) + <- firstExceptT ProcessRelatedFailure $ initiateProcess + $ nodeProcess + { IO.std_in = IO.CreatePipe, IO.std_out = IO.UseHandle hNodeStdout + , IO.std_err = IO.UseHandle hNodeStderr + , IO.cwd = Just tempBaseAbsPath + } + + -- We force the evaluation of initiateProcess so we can be sure that + -- the process has started. This allows us to read stderr in order + -- to fail early on errors generated from the cardano-node binary. + _ <- liftIO (IO.getPid hProcess) + >>= hoistMaybe (NodeExecutableError $ "startNode:" <+> pretty node <+> "'s process did not start.") + + -- Wait for socket to be created + eSprocketError <- + Ping.waitForSprocket + 30 -- timeout + 0.2 -- check interval + sprocket + + -- If we do have anything on stderr, fail. + stdErrContents <- liftIO $ IO.readFile nodeStderrFile + unless (null stdErrContents) $ + throwError $ mkNodeNonEmptyStderrError stdErrContents + + -- No stderr and no socket? Fail. + firstExceptT + (\ioex -> + NodeExecutableError . hsep $ + ["Socket", pretty socketAbsPath, "was not created after 30 seconds. There was no output on stderr. Exception:", prettyException ioex]) + $ hoistEither eSprocketError + + -- Ping node and fail on error + Ping.pingNode (fromIntegral testnetMagic) sprocket + >>= (firstExceptT (NodeExecutableError . ("Ping error:" <+>) . prettyError) . hoistEither) + + pure $ NodeRuntime node ipv4 port sprocket stdIn nodeStdoutFile nodeStderrFile hProcess + where + -- close provided list of handles when 'ExceptT' throws an error + closeHandlesOnError :: MonadIO m => [IO.Handle] -> ExceptT e m a -> ExceptT e m a + closeHandlesOnError handles action = + catchE action $ \e -> do + liftIO $ mapM_ IO.hClose handles + throwE e + + -- Sometimes even when we close the files manually, the operating system still holds the lock for some + -- reason. This is most prominent on MacOS. Therefore, as a last resort, instead of + -- failing the node startup procedure, we simply try to use a different file name for the logs, with + -- the suffix @-n.log@ where @n@ is an attempt number. + retryOpenFile :: MonadIO m + => MonadCatch m + => FilePath -- ^ path we're trying to open + -> IO.IOMode + -> ExceptT NodeStartFailure m IO.Handle + retryOpenFile fullPath mode = go 0 + where + go :: MonadIO m + => MonadCatch m + => Int + -> ExceptT NodeStartFailure m IO.Handle + go n = do + let (path, extension) = splitExtension fullPath + path' = if n > 0 + then path <> "-" <> show n <> extension + else fullPath + r <- fmap (first FileRelatedFailure) . try . liftIO $ IO.openFile path' mode + case r of + Right h -> pure h + Left e + -- give up after 1000 attempts + | n >= 1000 -> throwE e + | otherwise -> go (n + 1) + + createDirectoryIfMissingNew :: HasCallStack => FilePath -> IO FilePath createDirectoryIfMissingNew directory = GHC.withFrozenCallStack $ do diff --git a/cardano-testnet/src/Testnet/Start/Cardano.hs b/cardano-testnet/src/Testnet/Start/Cardano.hs index bbd67e52c6f..30b88313547 100644 --- a/cardano-testnet/src/Testnet/Start/Cardano.hs +++ b/cardano-testnet/src/Testnet/Start/Cardano.hs @@ -1,5 +1,8 @@ +{-# LANGUAGE BangPatterns #-} {-# LANGUAGE DataKinds #-} +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE NamedFieldPuns #-} +{-# LANGUAGE NumericUnderscores #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ScopedTypeVariables #-} @@ -17,7 +20,7 @@ module Testnet.Start.Cardano , cardanoTestnetDefault , getDefaultAlonzoGenesis , getDefaultShelleyGenesis - , requestAvailablePortNumbers + , retryOnAddressInUseError ) where @@ -30,27 +33,24 @@ import Cardano.Node.Configuration.Topology import Prelude hiding (lines) +import Control.Concurrent (threadDelay) import Control.Monad import Data.Aeson import qualified Data.Aeson as Aeson import Data.Bifunctor (first) import qualified Data.ByteString.Lazy as LBS import Data.Either -import Data.IORef import qualified Data.List as L import Data.Maybe -import Data.Text (Text) import qualified Data.Text as Text -import Data.Time (UTCTime) +import Data.Time (UTCTime, diffUTCTime) +import Data.Time.Clock (NominalDiffTime) import qualified Data.Time.Clock as DTC import Data.Word (Word32) -import GHC.IO.Unsafe (unsafePerformIO) import GHC.Stack import qualified GHC.Stack as GHC -import Network.Socket (PortNumber) import System.FilePath (()) import qualified System.Info as OS -import qualified System.Random.Stateful as R import Text.Printf (printf) import Testnet.Components.Configuration @@ -66,6 +66,7 @@ import Testnet.Types as TR hiding (shelleyGenesis) import Hedgehog (MonadTest) import qualified Hedgehog as H import qualified Hedgehog.Extras as H +import qualified Hedgehog.Extras.Stock.IO.Network.Port as H import qualified Hedgehog.Extras.Stock.OS as OS -- | There are certain conditions that need to be met in order to run @@ -123,41 +124,6 @@ getDefaultShelleyGenesis opts = do startTime <- H.noteShow $ DTC.addUTCTime startTimeOffsetSeconds currentTime return (startTime, Defaults.defaultShelleyGenesis startTime opts) --- | Hardcoded testnet IP address -testnetIpv4Address :: Text -testnetIpv4Address = "127.0.0.1" - --- | Starting port number, from which testnet nodes will get new ports. -defaultTestnetNodeStartingPortNumber :: PortNumber -defaultTestnetNodeStartingPortNumber = 20000 - --- | Global counter used to track which testnet node's ports were already allocated -availablePortNumber :: IORef PortNumber -availablePortNumber = unsafePerformIO $ do - let startingPort = toInteger defaultTestnetNodeStartingPortNumber - -- add a random offset to the starting port number to avoid clashes when starting multiple testnets - randomPart <- R.uniformRM (1,9) R.globalStdGen - newIORef . fromInteger $ startingPort + randomPart * 1000 -{-# NOINLINE availablePortNumber #-} - --- | Request a list of unused port numbers for testnet nodes. This shifts 'availablePortNumber' by --- 'maxPortsPerRequest' in order to make sure that each node gets an unique port. -requestAvailablePortNumbers - :: HasCallStack - => MonadIO m - => MonadTest m - => Int -- ^ Number of ports to request - -> m [PortNumber] -requestAvailablePortNumbers numberOfPorts - | numberOfPorts > fromIntegral maxPortsPerRequest = withFrozenCallStack $ do - H.note_ $ "Tried to allocate " <> show numberOfPorts <> " port numbers in one request. " - <> "It's allowed to allocate no more than " <> show maxPortsPerRequest <> " per request." - H.failure - | otherwise = liftIO $ atomicModifyIORef' availablePortNumber $ \n -> - (n + maxPortsPerRequest, [n..n + fromIntegral numberOfPorts - 1]) - where - maxPortsPerRequest = 50 - -- | Setup a number of credentials and pools, like this: -- -- > ├── byron @@ -228,8 +194,6 @@ cardanoTestnet nDReps = numDReps testnetOptions era = cardanoNodeEra testnetOptions - portNumbers <- requestAvailablePortNumbers numPoolNodes - -- Sanity checks testnetMinimumConfigurationRequirements testnetOptions when (shelleyStartTime /= startTime) $ do @@ -340,12 +304,12 @@ cardanoTestnet } } - -- Add Byron, Shelley and Alonzo genesis hashes to node configuration config <- createConfigJson (TmpAbsolutePath tmpAbsPath) era H.evalIO $ LBS.writeFile (unFile configurationFile) config + portNumbers <- replicateM numPoolNodes $ H.randomPort testnetDefaultIpv4Address -- Byron related forM_ (zip [1..] portNumbers) $ \(i, portNumber) -> do let iStr = printf "%03d" (i - 1) @@ -357,7 +321,7 @@ cardanoTestnet forM_ (zip [1..] portNumbers) $ \(i, myPortNumber) -> do let producers = flip map (filter (/= myPortNumber) portNumbers) $ \otherProducerPort -> RemoteAddress - { raAddress = testnetIpv4Address + { raAddress = showIpv4Address testnetDefaultIpv4Address , raPort = otherProducerPort , raValency = 1 } @@ -370,8 +334,8 @@ cardanoTestnet let nodeName = mkNodeName i keyDir = tmpAbsPath poolKeyDir i H.note_ $ "Node name: " <> nodeName - eRuntime <- runExceptT $ - startNode (TmpAbsolutePath tmpAbsPath) nodeName testnetIpv4Address port testnetMagic + eRuntime <- runExceptT . retryOnAddressInUseError $ + startNode (TmpAbsolutePath tmpAbsPath) nodeName testnetDefaultIpv4Address port testnetMagic [ "run" , "--config", unFile configurationFile , "--topology", keyDir "topology.json" @@ -440,3 +404,38 @@ cardanoTestnet writeGenesisSpecFile eraName toWrite = GHC.withFrozenCallStack $ do genesisJsonFile <- H.noteShow $ tmpAbsPath "genesis." <> eraName <> ".spec.json" H.evalIO $ LBS.writeFile genesisJsonFile $ Aeson.encode toWrite + +-- | Retry an action when `NodeAddressAlreadyInUseError` gets thrown from an action +retryOnAddressInUseError + :: forall m a. HasCallStack + => MonadTest m + => MonadIO m + => ExceptT NodeStartFailure m a -- ^ action being retried + -> ExceptT NodeStartFailure m a +retryOnAddressInUseError act = withFrozenCallStack $ go maximumTimeout retryTimeout + where + go :: HasCallStack => NominalDiffTime -> NominalDiffTime -> ExceptT NodeStartFailure m a + go timeout interval + | timeout <= 0 = withFrozenCallStack $ do + H.note_ "Exceeded timeout when retrying node start" + act + | otherwise = withFrozenCallStack $ do + !time <- liftIO DTC.getCurrentTime + catchError act $ \case + NodeAddressAlreadyInUseError _ -> do + liftIO $ threadDelay (round $ interval * 1_000_000) + !time' <- liftIO DTC.getCurrentTime + let elapsedTime = time' `diffUTCTime` time + newTimeout = timeout - elapsedTime + H.note_ $ "Retrying on 'address in use' error, timeout: " <> show newTimeout + go newTimeout interval + e -> throwError e + + -- Retry timeout in seconds. This should be > 2 * net.inet.tcp.msl on darwin, + -- net.inet.tcp.msl in RFC 793 determines TIME_WAIT socket timeout. + -- Usually it's 30 or 60 seconds. We take two times that plus some extra time. + maximumTimeout = 150 + -- Wait for that many seconds before retrying. + retryTimeout = 5 + + diff --git a/cardano-testnet/src/Testnet/Types.hs b/cardano-testnet/src/Testnet/Types.hs index db0c15a6182..508a6614b1e 100644 --- a/cardano-testnet/src/Testnet/Types.hs +++ b/cardano-testnet/src/Testnet/Types.hs @@ -38,6 +38,8 @@ module Testnet.Types , ShelleyGenesis(..) , shelleyGenesis , getStartTime + , testnetDefaultIpv4Address + , showIpv4Address ) where import Cardano.Api @@ -55,12 +57,14 @@ import Prelude import Control.Monad import qualified Data.Aeson as A +import Data.List (intercalate) import Data.Text (Text) import Data.Time.Clock (UTCTime) +import GHC.Exts (IsString (..)) import GHC.Generics (Generic) import qualified GHC.IO.Handle as IO import GHC.Stack -import Network.Socket (PortNumber) +import Network.Socket (HostAddress, PortNumber, hostAddressToTuple, tupleToHostAddress) import System.FilePath import qualified System.Process as IO @@ -115,7 +119,7 @@ poolNodeStdout = nodeStdout . poolRuntime data NodeRuntime = NodeRuntime { nodeName :: !String - , nodeIpv4 :: !Text + , nodeIpv4 :: !HostAddress , nodePort :: !PortNumber , nodeSprocket :: !Sprocket , nodeStdinHandle :: !IO.Handle @@ -191,3 +195,13 @@ readNodeLoggingFormat = \case allNodes :: TestnetRuntime -> [NodeRuntime] allNodes tr = fmap poolRuntime (poolNodes tr) + +-- | Hardcoded testnet IPv4 address pointing to the local host +testnetDefaultIpv4Address :: HostAddress +testnetDefaultIpv4Address = tupleToHostAddress (127, 0, 0, 1) + +-- | Format IPv4 address as a string-like e.g. @127.0.0.1@ +showIpv4Address :: IsString s => HostAddress -> s +showIpv4Address address = fromString . intercalate "." $ show <$> [a,b,c,d] + where (a,b,c,d) = hostAddressToTuple address + diff --git a/cardano-testnet/test/cardano-testnet-test/Cardano/Testnet/Test/Cli/Babbage/LeadershipSchedule.hs b/cardano-testnet/test/cardano-testnet-test/Cardano/Testnet/Test/Cli/Babbage/LeadershipSchedule.hs index 9c7aeacb834..91d9102378f 100644 --- a/cardano-testnet/test/cardano-testnet-test/Cardano/Testnet/Test/Cli/Babbage/LeadershipSchedule.hs +++ b/cardano-testnet/test/cardano-testnet-test/Cardano/Testnet/Test/Cli/Babbage/LeadershipSchedule.hs @@ -46,6 +46,7 @@ import Testnet.Types import Hedgehog (Property, (===)) import qualified Hedgehog as H +import qualified Hedgehog.Extras.Stock.IO.Network.Port as H import qualified Hedgehog.Extras.Stock.IO.Network.Sprocket as IO import qualified Hedgehog.Extras.Test.Base as H import qualified Hedgehog.Extras.Test.File as H @@ -220,7 +221,7 @@ hprop_leadershipSchedule = integrationRetryWorkspace 2 "babbage-leadership-sched let valency = 1 topology = RealNodeTopology $ flip map poolNodes $ \PoolNode{poolRuntime=NodeRuntime{nodeIpv4,nodePort}} -> - RemoteAddress nodeIpv4 nodePort valency + RemoteAddress (showIpv4Address nodeIpv4) nodePort valency H.lbsWriteFile topologyFile $ Aeson.encode topology let testSpoKesVKey = work "kes.vkey" testSpoKesSKey = work "kes.skey" @@ -248,8 +249,9 @@ hprop_leadershipSchedule = integrationRetryWorkspace 2 "babbage-leadership-sched jsonBS <- createConfigJson tempAbsPath (cardanoNodeEra cTestnetOptions) H.lbsWriteFile (unFile configurationFile) jsonBS - [newNodePort] <- requestAvailablePortNumbers 1 - eRuntime <- runExceptT $ startNode (TmpAbsolutePath work) "test-spo" "127.0.0.1" newNodePort testnetMagic + newNodePort <- H.randomPort testnetDefaultIpv4Address + eRuntime <- runExceptT . retryOnAddressInUseError $ + startNode (TmpAbsolutePath work) "test-spo" testnetDefaultIpv4Address newNodePort testnetMagic [ "run" , "--config", unFile configurationFile , "--topology", topologyFile diff --git a/cardano-testnet/test/cardano-testnet-test/Cardano/Testnet/Test/Cli/KesPeriodInfo.hs b/cardano-testnet/test/cardano-testnet-test/Cardano/Testnet/Test/Cli/KesPeriodInfo.hs index ed6eecf7474..d08d471d720 100644 --- a/cardano-testnet/test/cardano-testnet-test/Cardano/Testnet/Test/Cli/KesPeriodInfo.hs +++ b/cardano-testnet/test/cardano-testnet-test/Cardano/Testnet/Test/Cli/KesPeriodInfo.hs @@ -42,6 +42,7 @@ import Hedgehog (Property) import qualified Hedgehog as H import Hedgehog.Extras (threadDelay) import Hedgehog.Extras.Stock (sprocketSystemName) +import qualified Hedgehog.Extras.Stock.IO.Network.Port as H import qualified Hedgehog.Extras.Stock.IO.Network.Sprocket as IO import qualified Hedgehog.Extras.Test.Base as H import qualified Hedgehog.Extras.Test.File as H @@ -214,7 +215,7 @@ hprop_kes_period_info = integrationRetryWorkspace 2 "kes-period-info" $ \tempAbs let valency = 1 topology = RealNodeTopology $ flip map poolNodes $ \PoolNode{poolRuntime=NodeRuntime{nodeIpv4,nodePort}} -> - RemoteAddress nodeIpv4 nodePort valency + RemoteAddress (showIpv4Address nodeIpv4) nodePort valency H.lbsWriteFile topologyFile $ Aeson.encode topology let testSpoVrfVKey = work "vrf.vkey" @@ -247,8 +248,9 @@ hprop_kes_period_info = integrationRetryWorkspace 2 "kes-period-info" $ \tempAbs jsonBS <- createConfigJson tempAbsPath (cardanoNodeEra cTestnetOptions) H.lbsWriteFile (unFile configurationFile) jsonBS - [newNodePortNumber] <- requestAvailablePortNumbers 1 - eRuntime <- runExceptT $ startNode tempAbsPath "test-spo" "127.0.0.1" newNodePortNumber testnetMagic + newNodePortNumber <- H.randomPort testnetDefaultIpv4Address + eRuntime <- runExceptT . retryOnAddressInUseError $ + startNode tempAbsPath "test-spo" testnetDefaultIpv4Address newNodePortNumber testnetMagic [ "run" , "--config", unFile configurationFile , "--topology", topologyFile