diff --git a/test/functional/feature_framework_startup_failures.py b/test/functional/feature_framework_startup_failures.py new file mode 100755 index 0000000000..d6da7f713b --- /dev/null +++ b/test/functional/feature_framework_startup_failures.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025-present The Bitcoin Core developers +# Distributed under the MIT software license, see the accompanying +# file COPYING or http://www.opensource.org/licenses/mit-license.php. +""" +Verify framework startup failures only raise one exception since +multiple exceptions being raised muddies the waters of what actually +went wrong. We should maintain this bar of only raising one exception as +long as additional maintenance and complexity is low. + +Test relaunches itself into child processes in order to trigger failures +without the parent process' BitcoinTestFramework also failing. +""" + +from test_framework.util import ( + assert_raises_message, + rpc_port, +) +from test_framework.test_framework import BitcoinTestFramework + +from hashlib import md5 +from os import linesep +import re +import subprocess +import sys +import time + +class FeatureFrameworkStartupFailures(BitcoinTestFramework): + def set_test_params(self): + self.num_nodes = 1 + + def setup_network(self): + # Don't start the node yet, as we want to measure during run_test. + self.add_nodes(self.num_nodes, self.extra_args) + + # Launches a child test process that runs this same file, but instantiates + # a child test. Verifies that it raises only the expected exception, once. + def _verify_startup_failure(self, test, internal_args, expected_exception): + # Inherit args from parent, only modifying tmpdir so children don't fail + # as a cause of colliding with the parent dir. + parent_args = sys.argv.copy() + assert self.options.tmpdir, "Framework should always set tmpdir." + i, path = next(((i, m[1]) for i, arg in enumerate(parent_args) if (m := re.match(r'--tm(?:p(?:d(?:ir?)?)?)?=(.+)', arg))), + (len(parent_args), self.options.tmpdir)) + subdir = md5(expected_exception.encode('utf-8')).hexdigest()[:8] + parent_args[i:i + 1] = [f"--tmpdir={path}/{subdir}"] + args = [sys.executable] + parent_args + [f"--internal_test={test.__name__}"] + internal_args + + try: + # The subprocess encoding argument gives different results for e.output + # on Linux/Windows, so handle decoding by ourselves for consistency. + output = subprocess.run(args, timeout=60 * self.options.timeout_factor, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout.decode("utf-8") + except subprocess.TimeoutExpired as e: + print("Unexpected child process timeout!\n" + "WARNING: Timeouts like this halt execution of TestNode logic, " + "meaning dangling bitcoind processes are to be expected.\n" + f"\n{e.output.decode('utf-8')}\n", + file=sys.stderr) + raise + + errors = [] + if (n := output.count("Traceback")) != 1: + errors.append(f"Found {n}/1 tracebacks - expecting exactly one with no knock-on exceptions.") + if (n := len(re.findall(expected_exception, output))) != 1: + errors.append(f"Found {n}/1 occurrences of the specific exception: {expected_exception}") + if (n := output.count("Test failed. Test logging available at")) != 1: + errors.append(f"Found {n}/1 test failure output messages.") + + assert not errors, f"Child test didn't contain (only) expected errors:\n{linesep.join(errors)}\n\n{output}\n\n" + + def run_test(self): + self.log.info("Verifying _verify_startup_failure() functionality (self-check).") + assert_raises_message( + AssertionError, + ("Child test didn't contain (only) expected errors:\n" + + linesep.join(["Found 0/1 tracebacks - expecting exactly one with no knock-on exceptions.", + "Found 0/1 occurrences of the specific exception: NonExistentError", + "Found 0/1 test failure output messages."])).encode("unicode_escape").decode("utf-8"), + self._verify_startup_failure, + TestSuccess, [], + "NonExistentError", + ) + + self.log.info("Parent process is measuring node startup duration in order to obtain a reasonable timeout value for later test...") + node_start_time = time.time() + self.nodes[0].start() + self.nodes[0].wait_for_rpc_connection() + node_start_duration = time.time() - node_start_time + self.nodes[0].stop_node() + self.log.info(f"...measured {node_start_duration:.1f}s.") + + self.log.info("Verifying inability to connect to bitcoind's RPC interface due to wrong port results in one exception containing at least one OSError.") + self._verify_startup_failure( + TestWrongRpcPortStartupFailure, [f"--internal_node_start_duration={node_start_duration}"], + r"AssertionError: \[node 0\] Unable to connect to bitcoind after \d+s \(ignored errors: {[^}]*'OSError \w+'?: \d+[^}]*}, latest error: \w+\([^)]+\)\)" + ) + + self.log.info("Verifying startup failure due to invalid arg results in only one exception.") + self._verify_startup_failure( + TestInitErrorStartupFailure, [], + r"FailedToStartError: \[node 0\] bitcoind exited with status 1 during initialization\. Error: Error parsing command line arguments: Invalid parameter -nonexistentarg" + ) + + self.log.info("Verifying start() then stop_node() on a node without wait_for_rpc_connection() in between raises an assert.") + self._verify_startup_failure( + TestStartStopStartupFailure, [], + r"AssertionError: \[node 0\] Should only call stop_node\(\) on a running node after wait_for_rpc_connection\(\) succeeded\. Did you forget to call the latter after start\(\)\? Not connected to process: \d+" + ) + +class InternalTestMixin: + def add_options(self, parser): + # Just here to silence unrecognized argument error, we actually read the value in the if-main at the bottom. + parser.add_argument("--internal_test", dest="internal_never_read", help="ONLY TO BE USED WHEN TEST RELAUNCHES ITSELF") + +class TestWrongRpcPortStartupFailure(InternalTestMixin, BitcoinTestFramework): + def add_options(self, parser): + parser.add_argument("--internal_node_start_duration", dest="node_start_duration", help="ONLY TO BE USED WHEN TEST RELAUNCHES ITSELF", type=float) + InternalTestMixin.add_options(self, parser) + + def set_test_params(self): + self.num_nodes = 1 + # Override RPC listen port to something TestNode isn't expecting so that + # we are unable to establish an RPC connection. + self.extra_args = [[f"-rpcport={rpc_port(2)}"]] + # Override the timeout to avoid waiting unnecessarily long to realize + # nothing is on that port. Divide by timeout_factor to counter + # multiplication in base, 2 * node_start_duration should be enough. + self.rpc_timeout = max(3, 2 * self.options.node_start_duration) / self.options.timeout_factor + + def run_test(self): + assert False, "Should have failed earlier during startup." + +class TestInitErrorStartupFailure(InternalTestMixin, BitcoinTestFramework): + def set_test_params(self): + self.num_nodes = 1 + self.extra_args = [["-nonexistentarg"]] + + def run_test(self): + assert False, "Should have failed earlier during startup." + +class TestStartStopStartupFailure(InternalTestMixin, BitcoinTestFramework): + def set_test_params(self): + self.num_nodes = 1 + + def setup_network(self): + self.add_nodes(self.num_nodes, self.extra_args) + self.nodes[0].start() + self.nodes[0].stop_node() + assert False, "stop_node() should raise an exception when we haven't called wait_for_rpc_connection()" + + def run_test(self): + assert False, "Should have failed earlier during startup." + +class TestSuccess(InternalTestMixin, BitcoinTestFramework): + def set_test_params(self): + self.num_nodes = 1 + + def setup_network(self): + pass # Don't need to start our node. + + def run_test(self): + pass # Just succeed. + + +if __name__ == '__main__': + if class_name := next((m[1] for arg in sys.argv[1:] if (m := re.match(r'--internal_test=(.+)', arg))), None): + internal_test = globals().get(class_name) + assert internal_test, f"Unrecognized test: {class_name}" + internal_test(__file__).main() + else: + FeatureFrameworkStartupFailures(__file__).main() diff --git a/test/functional/test_framework/test_node.py b/test/functional/test_framework/test_node.py index 9f0baaccca..449c062875 100755 --- a/test/functional/test_framework/test_node.py +++ b/test/functional/test_framework/test_node.py @@ -8,7 +8,6 @@ import contextlib import decimal import errno from enum import Enum -import http.client import json import logging import os @@ -264,6 +263,8 @@ class TestNode(): """Sets up an RPC connection to the bitcoind process. Returns False if unable to connect.""" # Poll at a rate of four times per second poll_per_s = 4 + suppressed_errors = collections.defaultdict(int) + latest_error = "" for _ in range(poll_per_s * self.rpc_timeout): if self.process.poll() is not None: # Attach abrupt shutdown error/s to the exception message @@ -304,33 +305,46 @@ class TestNode(): # overhead is trivial, and the added guarantees are worth # the minimal performance cost. self.log.debug("RPC successfully started") + # Set rpc_connected even if we are in use_cli mode so that we know we can call self.stop() if needed. + self.rpc_connected = True if self.use_cli: return self.rpc = rpc - self.rpc_connected = True self.url = self.rpc.rpc_url return - except JSONRPCException as e: # Initialization phase + except JSONRPCException as e: + # Suppress these as they are expected during initialization. # -28 RPC in warmup - # -342 Service unavailable, RPC server started but is shutting down due to error - if e.error['code'] != -28 and e.error['code'] != -342: + # -342 Service unavailable, could be starting up or shutting down + if e.error['code'] not in [-28, -342]: raise # unknown JSON RPC exception - except ConnectionResetError: - # This might happen when the RPC server is in warmup, but shut down before the call to getblockcount - # succeeds. Try again to properly raise the FailedToStartError - pass + suppressed_errors[f"JSONRPCException {e.error['code']}"] += 1 + latest_error = repr(e) except OSError as e: - if e.errno == errno.ETIMEDOUT: - pass # Treat identical to ConnectionResetError - elif e.errno == errno.ECONNREFUSED: - pass # Port not yet open? - else: + error_num = e.errno + # Work around issue where socket timeouts don't have errno set. + # https://github.com/python/cpython/issues/109601 + if error_num is None and isinstance(e, TimeoutError): + error_num = errno.ETIMEDOUT + + # Suppress similarly to the above JSONRPCException errors. + if error_num not in [ + errno.ECONNRESET, # This might happen when the RPC server is in warmup, + # but shut down before the call to getblockcount succeeds. + errno.ETIMEDOUT, # Treat identical to ECONNRESET + errno.ECONNREFUSED # Port not yet open? + ]: raise # unknown OS error - except ValueError as e: # cookie file not found and no rpcuser or rpcpassword; bitcoind is still starting + suppressed_errors[f"OSError {errno.errorcode[error_num]}"] += 1 + latest_error = repr(e) + except ValueError as e: + # Suppress if cookie file isn't generated yet and no rpcuser or rpcpassword; bitcoind may be starting. if "No RPC credentials" not in str(e): raise + suppressed_errors["missing_credentials"] += 1 + latest_error = repr(e) time.sleep(1.0 / poll_per_s) - self._raise_assertion_error("Unable to connect to bitcoind after {}s".format(self.rpc_timeout)) + self._raise_assertion_error(f"Unable to connect to bitcoind after {self.rpc_timeout}s (ignored errors: {str(dict(suppressed_errors))}, latest error: {latest_error})") def wait_for_cookie_credentials(self): """Ensures auth cookie credentials can be read, e.g. for testing CLI with -rpcwait before RPC connection is up.""" @@ -387,15 +401,15 @@ class TestNode(): """Stop the node.""" if not self.running: return + assert self.rpc_connected, self._node_msg( + "Should only call stop_node() on a running node after wait_for_rpc_connection() succeeded. " + f"Did you forget to call the latter after start()? Not connected to process: {self.process.pid}") self.log.debug("Stopping node") - try: - # Do not use wait argument when testing older nodes, e.g. in wallet_backwards_compatibility.py - if self.version_is_at_least(180000): - self.stop(wait=wait) - else: - self.stop() - except http.client.CannotSendRequest: - self.log.exception("Unable to stop node.") + # Do not use wait argument when testing older nodes, e.g. in wallet_backwards_compatibility.py + if self.version_is_at_least(180000): + self.stop(wait=wait) + else: + self.stop() # If there are any running perf processes, stop them. for profile_name in tuple(self.perf_subprocesses.keys()): diff --git a/test/functional/test_framework/util.py b/test/functional/test_framework/util.py index c7b5cc5d59..b2f765e4d1 100644 --- a/test/functional/test_framework/util.py +++ b/test/functional/test_framework/util.py @@ -101,10 +101,9 @@ def assert_raises_message(exc, message, fun, *args, **kwds): except JSONRPCException: raise AssertionError("Use assert_raises_rpc_error() to test RPC failures") except exc as e: - if message is not None and message not in e.error['message']: - raise AssertionError( - "Expected substring not found in error message:\nsubstring: '{}'\nerror message: '{}'.".format( - message, e.error['message'])) + if message is not None and message not in repr(e): + raise AssertionError("Expected substring not found in exception:\n" + f"substring: '{message}'\nexception: {repr(e)}.") except Exception as e: raise AssertionError("Unexpected exception raised: " + type(e).__name__) else: diff --git a/test/functional/test_runner.py b/test/functional/test_runner.py index eb7c246e8a..340a418e10 100755 --- a/test/functional/test_runner.py +++ b/test/functional/test_runner.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright (c) 2014-2022 The Bitcoin Core developers +# Copyright (c) 2014-present The Bitcoin Core developers # Distributed under the MIT software license, see the accompanying # file COPYING or http://www.opensource.org/licenses/mit-license.php. """Run regression test suite. @@ -355,6 +355,7 @@ BASE_SCRIPTS = [ 'p2p_handshake.py --v2transport', 'feature_dirsymlinks.py', 'feature_help.py', + 'feature_framework_startup_failures.py', 'feature_shutdown.py', 'wallet_migration.py', 'p2p_ibd_txrelay.py',