Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/mono.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndi McClure <andi.mcclure@xamarin.com>2015-12-18 21:15:18 +0300
committerAndi McClure <andi.mcclure@xamarin.com>2015-12-18 21:15:18 +0300
commit1dc0598badaed161efa92fd261de7e555d1a2b13 (patch)
treeba1662e526821226b59852cd7f9c6937fecd4822 /scripts
parent5b9b780ba74ebd9258cb0862a571f9c1f2149885 (diff)
Adds babysitter script for PR tests
I see two problems with our current automated PR tests: - Many false negatives— PR tests often fail due to recurring failures unrelated to the PR - Information about what failed is awkward to retrieve, scattered in multiple places This commit adds a Python script, `scripts/babysitter`. Currently our Jenkins test suites are each run wrapped with the `timeout` command. The babysitter acts as a drop-in replacement for GNU timeout but adds the following features: - Logs machine readable output about each test suite (as line-delimited json) - If the test suite uses NUnit, can detect if a test case failed or crashed (terminated mono in mid-test), and retry unsuccessful tests (up to a limit). The reasoning here is that tests which fail inconsistently are currently most likely due to one of our outstanding recurring failures rather than the change made in the PR. Therefore, if a failing test succeeds on retry, the PR itself is probably valid (although the failure should be logged and looked at). In addition to the script itself, changes to NUnit were required in order to support the retry feature and allow fine-grained logging for NUnit suites. Major TODOs: - Add retry support for our non-NUnit-based test suites - Save the XML files NUnit produces (since reruns stomp XMLs from previous runs) - Add some kind of sensible feature for dealing with timeouts
Diffstat (limited to 'scripts')
-rwxr-xr-xscripts/babysitter364
1 files changed, 364 insertions, 0 deletions
diff --git a/scripts/babysitter b/scripts/babysitter
new file mode 100755
index 00000000000..f48e07957e8
--- /dev/null
+++ b/scripts/babysitter
@@ -0,0 +1,364 @@
+#!/usr/bin/python
+
+# Mimics GNU timeout, but does some fancy tracking based on custom features in mono nunit24.
+
+import argparse
+import subprocess
+import re
+import signal
+import time
+import sys
+import os.path
+import copy
+import tempfile
+import calendar
+import json
+
### Constants

# Here is how the communication with nunit works. It has to work with two constraints:
# - We don't invoke nunit directly. We invoke some Makefile which invokes some other Makefile
#   and at some point down the line someone calls nunit.
# - nunit has to be able to report back to us even if (especially if) it terminates improperly.
# To deal with all this, communication babysitter->nunit is done by environment variables,
# and communication nunit->babysitter is done by leaving behind files in known locations.

# Filenames (created by nunit inside a temp dir; the *_KEY env vars below carry the full paths)

CURRENT_TEST_FILE = "babysitter_report_current_test_file.txt" # Single line: the test running when nunit halted
RAN_TEST_FILE = "babysitter_report_ran_test_file.txt"         # One line per test that completed
FAILED_TEST_FILE = "babysitter_report_failed_test_file.txt"   # One line per test that failed
LOGGING_FILE = "babysitter_report.json_lines"                 # Babysitter's own log: one JSON object per line

# Environment keys

CURRENT_TEST_KEY = 'NUNIT_BABYSITTER_CURRENT_TEST_FILE' # Tell nunit where to leave files
RAN_TEST_KEY = 'NUNIT_BABYSITTER_RAN_TEST_FILE'
FAILED_TEST_KEY = 'NUNIT_BABYSITTER_FAILED_TEST_FILE'
RUN_KEY = 'NUNIT_BABYSITTER_RUN_TEST' # Semicolon-separated list of test names
RUN_MODE_KEY = 'NUNIT_BABYSITTER_RUN_MODE' # Equal to either RUN (whitelist) or EXCLUDE (blacklist); see run()
RETRY_KEY = 'BABYSITTER_RETRY' # Equal to an integer (per-test failure budget; 0/unset disables retries)
CRASH_RESUME_KEY = 'BABYSITTER_CRASH_RESUME' # If set, rerun even after a mid-test crash
VERBOSE_KEY = 'BABYSITTER_VERBOSE' # If set, print extra progress messages
LOGGING_DIR_KEY = 'WORKSPACE' # Directory path to save logs into

# JSON keys (used in the LOGGING_FILE records)

DATE_JSON = 'date' # POSIX timestamp of test run
INVOKE_JSON = 'invocation' # The wrapped command line, joined into one string
COUNT_JSON = 'passes' # How many times was command executed?
LIMIT_JSON = 'failure_max'
SUPPORT_JSON = 'retry_support' # Was the test running with a babysitter-aware nunit?
FINAL_CODE_JSON = 'final_code' # Exit code propagated to the caller, or "EXCEPTION"
TESTS_JSON = 'tests' # Holds dictionary of (test name)->(dict with TEST_ keys below)
TEST_FAILURES = 'normal_failures'
TEST_CRASH_FAILURES = 'crash_failures'
TEST_TIMEOUT_FAILURES = 'timeout_failures'
+
+### Interpret arguments
+
scriptname = sys.argv[0]

# This is very silly: this program must stop argparse-style parsing at the second
# positional argument and hand the rest to the wrapped command, but argparse offers
# no such cutoff. The workaround is a parser subclass whose error handling can be
# selectively suppressed while we probe argv prefixes.
class Hesitate(Exception):
    """Raised instead of argparse's usual error handling while probing."""
    pass

class HesitantParser(argparse.ArgumentParser):
    """ArgumentParser whose error() raises Hesitate while `hesitating` is True."""
    def __init__(self, *args, **kwargs):
        self.hesitating = True  # While True, parse errors are swallowed
        argparse.ArgumentParser.__init__(self, *args, **kwargs)

    def error(self, *args, **kwargs):
        if self.hesitating:
            raise Hesitate()  # Bail out before argparse prints anything or exits
        argparse.ArgumentParser.error(self, *args, **kwargs)
+
# Define args
argparser = HesitantParser(description="""\
Run a test case with a timeout.\n
Durations are floating point numbers followed by an optional unit:\n
's' for seconds (the default)
'm' for minutes
'h' for hours
'd' for days\n
supported environment variables:
 %s: Directory path to save logs into
 %s: If set to a number, failed tests will be rerun this many times
 %s: If set, rerun even for tests which terminated early
 %s: If set, print extra logging during run""" %
    (LOGGING_DIR_KEY, RETRY_KEY, CRASH_RESUME_KEY, VERBOSE_KEY),
    formatter_class=argparse.RawTextHelpFormatter)
argparser.add_argument('-s', '--signal', dest='signal', metavar='signal', default='TERM',
    help="Send this signal to the command on timeout, instead of TERM.")
argparser.add_argument('-k', '--kill-after-duration', dest='kill_after', metavar='duration',
    help="If process continues running after signal, send KILL after this duration.")
argparser.add_argument('duration',
    help="Time to run before sending signal.")
argparser.add_argument('command', nargs="+", help="Command+args to run.")

# Repeatedly parse the given args until we find the shortest prefix for which parsing succeeds.
# Everything past that prefix belongs to the wrapped command and must not be
# interpreted by this script (e.g. the wrapped command's own -k flag).
argc = len(sys.argv)
extra_args = []
for limit in range(1,argc+1):
    try:
        if limit == argc: # On the final pass, parse for real
            argparser.hesitating = False
        args = argparser.parse_args(sys.argv[1:limit])
        # If we're still here, parse_args succeeded.
        # The final parsed arg is the command; remaining argv items are its args.
        extra_args = sys.argv[limit:]
        break
    except Hesitate: # Before the final pass, if parse_args fails, skip
        pass

argparser.hesitating = False # Just act like a normal argparser from here
+
# Anchored with '$' so trailing garbage ("10x") or an unconsumed fraction is an
# error instead of being silently ignored; the old unanchored pattern read
# "1.5" as 1 second, despite the help text promising floating point durations.
durationre = re.compile(r'(\d+(?:\.\d+)?)([smhd]?)$')
def parse_duration(duration): # Accept units
    """Parse a duration string such as '30', '1.5m' or '2h' into seconds.

    The optional unit suffix is 's' (default), 'm', 'h' or 'd'.
    Calls argparser.error() (which exits) on malformed input.
    """
    match = durationre.match(duration)
    if not match:
        argparser.error("Could not understand duration %s" % duration)
    value, units = match.group(1), match.group(2)
    value = float(value) # Help text promises floating point durations
    if units == 'm':
        value *= 60
    elif units == 'h':
        value *= 60*60
    elif units == 'd':
        value *= 60*60*24
    return value
+
def parse_signal(sig): # Accept names
    """Convert a signal spec -- a number like '9' or a name like 'TERM' -- to its value.

    Calls argparser.error() (which exits) if the name is not a known SIG* name.
    """
    if sig.isdigit():
        return int(sig)
    # items() instead of iteritems(): identical here, but iteritems() is
    # Python-2-only and would crash the script under Python 3.
    for name, value in signal.__dict__.items():
        if name == ("SIG%s" % sig):
            return value
    argparser.error("Could not understand signal name %s" % sig)
+
# Final interpretation of arguments
duration = parse_duration(args.duration)  # Seconds to allow before sending timeout_signal
kill_after = parse_duration(args.kill_after) if args.kill_after is not None else None
timeout_signal = parse_signal(args.signal)
command = args.command + extra_args  # argparse-parsed command word plus the manually split trailing args

# Process environment
global_env = copy.deepcopy( os.environ )

verbose = VERBOSE_KEY in global_env        # Print extra progress messages?
logging = LOGGING_DIR_KEY in global_env    # Write JSON log lines?
logging_dir = global_env[LOGGING_DIR_KEY] if logging else None
logfile = os.path.join(logging_dir, LOGGING_FILE) if logging else None
crash_resuming = CRASH_RESUME_KEY in global_env  # Rerun even after a mid-test crash?
failmax = int(global_env[RETRY_KEY]) if RETRY_KEY in global_env else 0  # Per-test failure budget (0 = no retries)
babysitting = logging or failmax  # Do we need the nunit report files at all?
if babysitting:
    # Reports land in a fresh temp dir; these env vars tell nunit where to write them.
    babysitter_dir = tempfile.mkdtemp()
    global_env[CURRENT_TEST_KEY] = os.path.join(babysitter_dir, CURRENT_TEST_FILE)
    global_env[RAN_TEST_KEY] = os.path.join(babysitter_dir, RAN_TEST_FILE)
    global_env[FAILED_TEST_KEY] = os.path.join(babysitter_dir, FAILED_TEST_FILE)
+
+### Utility functions
+
def wait(proc, duration):
    """Poll proc until it exits or `duration` seconds elapse.

    Returns the process exit code, or None on timeout.
    FIXME: If we can guarantee Python 3.3, Popen has wait(timeout) built in.
    """
    deadline = time.time() + duration
    while True:
        status = proc.poll()
        if status is not None:
            return status
        if time.time() > deadline:
            return None
        time.sleep(0.05)
+
def attemptDelete(path):
    """Best-effort removal of `path`; a missing or undeletable file is not an error."""
    try:
        os.remove(path)
    except OSError:
        # Already gone (or not removable) -- either way, nothing to do.
        return
+
def attemptLines(path):
    """Return the lines of the file at `path`, stripped of line endings.

    Returns [] if the file is missing or unreadable. Always returns a real
    list: the previous map(...) form is lazy on Python 3 -- always truthy and
    not indexable -- which would break attemptFirstLine there.
    """
    try:
        with open(path) as f:
            return [line.strip('\r\n') for line in f]
    except (OSError, IOError):
        return []
+
def attemptFirstLine(path):
    """Return the first line of the file at `path`, or None if it is missing/empty."""
    lines = attemptLines(path)
    return lines[0] if lines else None
+
def posixtime():
    """Current POSIX timestamp.

    Amazingly, time.time() does not guarantee an epoch in the docs;
    timegm() over UTC struct_time does.
    """
    utc_now = time.gmtime()
    return calendar.timegm(utc_now)
+
failcount = {}  # test name -> number of failures observed so far
def failure_may_retry(test):
    """Record one more failure of `test`; True while it is still under the retry budget."""
    failcount[test] = failcount.get(test, 0) + 1
    return failcount[test] < failmax
+
def verbose_print(arg):
    """Print `arg` only when the BABYSITTER_VERBOSE env var was set."""
    if verbose:
        print(arg)
+
+### Run command
+
def run(): # Returns exit code
    """Run `command` under the timeout, rerunning it to retry/resume failed tests.

    Returns the exit code to hand to sys.exit: the child's final exit code,
    or 124 on timeout (matching the GNU timeout manpage). Exits with 127 if
    the command cannot be executed at all. Reads the module-level
    configuration globals; when logging is enabled, appends one JSON line
    describing the whole run (all passes) to `logfile`.
    """
    resume_after = []       # Blacklist: tests already completed, to skip on a crash-resume pass
    retry_these = []        # Whitelist: reported-failed tests to rerun on the next pass
    ever_completed = False  # Has any pass finished without crashing?
    died_politely = False   # Do we know for sure the child has exited?
    proc = None
    code = None

    # Set up logging
    if logging:
        log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
               INVOKE_JSON: " ".join(command)}
    else:
        log = None

    def log_value(key, set=None, add=None, target=log): # Call to add toplevel value to log
        if not logging:
            return
        if add is not None:
            if key not in target:
                target[key] = 0
            target[key] += add
        else:
            target[key] = set

    def log_test(testname, key, set=None, add=None): # Call to add test-level value to log
        if not logging:
            return
        if TESTS_JSON not in log:
            log[TESTS_JSON] = {}
        if testname not in log[TESTS_JSON]:
            log[TESTS_JSON][testname] = {}
        log_value(key, set=set, add=add, target=log[TESTS_JSON][testname])

    # Ready to run tests
    try:
        while True: # One iteration per pass over the test suite
            env = copy.copy(global_env)
            if ever_completed:
                retry_next = []
            else: # Persist reported failures list until first non-crash run
                retry_next = retry_these

            log_value(COUNT_JSON, add=1)

            # Prepare environment/filesystem
            if babysitting:
                for key in [CURRENT_TEST_KEY, RAN_TEST_KEY, FAILED_TEST_KEY]:
                    attemptDelete(env[key]) # Stale report files would corrupt this pass
                if resume_after:
                    env[RUN_KEY] = ";".join(resume_after)
                    env[RUN_MODE_KEY] = "EXCLUDE"
                elif retry_these:
                    env[RUN_KEY] = ";".join(retry_these)
                    env[RUN_MODE_KEY] = "RUN"

            # Run test
            try:
                proc = subprocess.Popen(command, env=env)
            except OSError:
                died_politely = True
                sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
                sys.exit(127)

            code = wait(proc, duration)
            timed_out = code is None
            if timed_out: # Initial timeout
                proc.send_signal(timeout_signal)
                if kill_after is not None: # Kill-after timeout
                    code = wait(proc, kill_after)
                    if code is None:
                        proc.send_signal(9) # SIGKILL
                code = proc.wait() # Waits forever
                sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
            died_politely = True

            # The test has now run, and what happens next varies:
            # 1. The test either completed fully without failures, or timed out: Just quit.
            # 2. The test crashed (halted without completing):
            #    Remember any failures for later and rerun, using a blacklist of tests we have completed.
            # 3. The test completed, but there were failures reported:
            #    Rerun, using a whitelist of only reported-failed tests.
            # 4. The test crashed partway through a run with a whitelist:
            #    Rerun, using a whitelist consisting of the previous whitelist minus successful tests.

            crashed_at = attemptFirstLine(env[CURRENT_TEST_KEY])
            failed_tests = attemptLines(env[FAILED_TEST_KEY])
            ran_tests = attemptLines(env[RAN_TEST_KEY])
            bailout = False

            if crashed_at or failed_tests or ran_tests: # Test suite follows the babysitter protocol
                log_value(SUPPORT_JSON, True)

            if not crashed_at and not ever_completed: # The resume_after whitelist is only
                verbose_print ("--- First completion") # used before the first noncrashing run
                resume_after = []
                ever_completed = True

            if timed_out: # Currently no retries after timeout
                bailout = True
                code = 124 # See GNU timeout manpage

            if (code or crashed_at) and babysitting: # Process failures
                # Handle crash failures
                if crashed_at and not timed_out:
                    log_test(crashed_at, TEST_CRASH_FAILURES, add=1)
                    if not crash_resuming:
                        bailout = True

                    if failure_may_retry(crashed_at):
                        verbose_print( "--- CRASH FAIL on %s (will retry)" % (crashed_at) )
                        if ever_completed: # Rerun with whitelist next time
                            for test in retry_these: # Prepopulate with last whitelist minus run tests
                                if test == crashed_at or test not in ran_tests: # (plus crashed test)
                                    retry_next.append(test)
                        else: # Rerun with blacklist next time
                            for test in ran_tests: # Add tests we just ran to blacklist
                                if test != crashed_at: # (except for the crashed test)
                                    resume_after.append(test)
                    else:
                        verbose_print( "--- CRASH FAIL on %s (will NOT retry)" % (crashed_at) )
                        bailout = True

                # Handle reported failures
                for test in failed_tests:
                    log_test(test, TEST_FAILURES, add=1)
                    if failure_may_retry(test):
                        verbose_print( "--- REPORTED FAIL on %s (will retry)" % (test) )
                        retry_next.append(test)
                    else:
                        verbose_print( "--- REPORTED FAIL on %s (will NOT retry)" % (test) )
                        bailout = True

            if bailout or not (babysitting and (resume_after or retry_next)): # If not retrying
                return code

            # If we got here, a retry is occurring
            retry_these = retry_next

            if resume_after:
                print "Babysitter script will rerun, resuming at crashed test %s" % (crashed_at)
            else:
                print "Babysitter script will rerun, running %d failed tests" % (len(retry_these))
            verbose_print( "--- Tests pending to rerun: %s" % (retry_these) )
    finally:
        # Emergency: Ensure command does not outlive this script
        if proc is not None and not died_politely:
            proc.send_signal(9) # SIGKILL

        # Write out logs
        log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)
        if logging:
            with open(logfile, "a") as f:
                f.write(json.dumps(log) + "\n")
+
+sys.exit( run() )