Diffstat (limited to 'scripts/ci/babysitter')
-rwxr-xr-x | scripts/ci/babysitter | 409 |
1 files changed, 409 insertions, 0 deletions
diff --git a/scripts/ci/babysitter b/scripts/ci/babysitter
new file mode 100755
index 00000000000..f8b5a6c345c
--- /dev/null
+++ b/scripts/ci/babysitter
@@ -0,0 +1,409 @@
+#!/usr/bin/env python
+
+# Mimics GNU timeout, but does some fancy tracking based on custom features in mono nunit24.
+
+import argparse
+import subprocess
+import re
+import signal
+import time
+import sys
+import os.path
+import copy
+import tempfile
+import calendar
+import json
+
+### Constants
+
+# Here is how the communication with nunit works. It has to work with two constraints:
+# - We don't invoke nunit directly. We invoke some Makefile which invokes some other Makefile,
+#   and at some point down the line someone calls nunit.
+# - nunit has to be able to report back to us even if (especially if) it terminates improperly.
+# To deal with all this, communication babysitter->nunit is done by environment variables,
+# and communication nunit->babysitter is done by leaving behind files in known locations.
+
+# Filenames
+
+CURRENT_TEST_FILE = "babysitter_report_current_test_file.txt"
+RAN_TEST_FILE = "babysitter_report_ran_test_file.txt"
+FAILED_TEST_FILE = "babysitter_report_failed_test_file.txt"
+LOGGING_FILE = "babysitter_report.json_lines"
+
+# Environment keys
+
+# Keys used for Babysitter<->Nunit IPC
+CURRENT_TEST_KEY = 'MONO_BABYSITTER_NUNIT_CURRENT_TEST_FILE' # Tell nunit where to leave files
+RAN_TEST_KEY = 'MONO_BABYSITTER_NUNIT_RAN_TEST_FILE'
+FAILED_TEST_KEY = 'MONO_BABYSITTER_NUNIT_FAILED_TEST_FILE'
+RUN_KEY = 'MONO_BABYSITTER_NUNIT_RUN_TEST' # Semicolon-separated list of test names
+RUN_MODE_KEY = 'MONO_BABYSITTER_NUNIT_RUN_MODE' # Equal to either RUN or EXCLUDE (see below)
+
+# Keys used for script configuration (see --help text)
+LOG_FILE_KEY = 'MONO_BABYSITTER_LOG_FILE' # Path
+RETRY_KEY = 'MONO_BABYSITTER_RETRY' # Equal to an integer
+VERBOSE_KEY = 'MONO_BABYSITTER_VERBOSE' # "Undocumented"-- used for debugging babysitter
+
+# JSON keys
+
+DATE_JSON = 'date' # POSIX timestamp of test suite run
+INVOKE_JSON = 'invocation'
+COUNT_JSON = 'iteration' # How many times was command executed?
+LIMIT_JSON = 'failure_max'
+SUPPORT_JSON = 'babysitter_protocol' # Was the test suite running with a babysitter-aware nunit?
+FINAL_CODE_JSON = 'final_code'
+TESTS_JSON = 'tests' # Holds dictionary of (test case name)->(dict with TEST_ keys below)
+TEST_FAILURES = 'normal_failures'
+TEST_CRASH_FAILURES = 'crash_failures'
+TEST_TIMEOUT_FAILURES = 'timeout_failures'
+
+### Interpret arguments
+
+scriptname = sys.argv[0]
+
+# This is very silly: This program needs to switch from argparse to manual parsing
+# after the second positional argument, but argparse doesn't let you do this.
+# I work around this with a custom subclass which can selectively swallow errors:
+class Hesitate(Exception):
+	pass
+class HesitantParser(argparse.ArgumentParser):
+	def __init__(s, *args, **kwargs):
+		s.hesitating = True # Errors will be swallowed until this is set false
+		argparse.ArgumentParser.__init__(s, *args, **kwargs)
+	def error(s, *args, **kwargs):
+		if s.hesitating:
+			raise Hesitate() # Bail out before errors are printed.
+		argparse.ArgumentParser.error(s, *args, **kwargs)
+
+# Define args
+argparser = HesitantParser(description="""\
+Run a test suite with a timeout.\n
+Durations are floating point numbers followed by an optional unit:\n
+'s' for seconds (the default)
+'m' for minutes
+'h' for hours
+'d' for days\n
+supported environment variables:
+  %s: File to write logs to (as line-delimited JSON)
+  %s: If set to a number, failed test cases will be rerun this many times (NUnit test suites only)""" %
+	(LOG_FILE_KEY, RETRY_KEY),
+	formatter_class=argparse.RawTextHelpFormatter)
+argparser.add_argument('-s', '--signal', dest='signal', metavar='signal', default='TERM',
+	help="Send this signal to the command on timeout, instead of TERM.")
+argparser.add_argument('-k', '--kill-after-duration', dest='kill_after', metavar='duration',
+	help="If process continues running after signal, send KILL after this duration.")
+argparser.add_argument('duration',
+	help="Time to run before sending signal.")
+argparser.add_argument('command', nargs="+", help="Command+args to run.")
+
+# Repeatedly parse the given args until we find the shortest prefix for which parsing succeeds.
+argc = len(sys.argv)
+extra_args = []
+for limit in range(1,argc+1):
+	try:
+		if limit == argc: # On the final pass, parse for real
+			argparser.hesitating = False
+		args = argparser.parse_args(sys.argv[1:limit])
+		# If we're still here, parse_args succeeded.
+		# The final parsed arg is the command; remaining argv items are its args.
+		extra_args = sys.argv[limit:]
+		break
+	except Hesitate: # Before the final pass, if parse_args fails, skip
+		pass
+
+argparser.hesitating = False # Just act like a normal argparser from here
+
+durationre = re.compile(r'(\d+(?:\.\d+)?)([smhd]?)') # Accept floats, as the help text promises
+def parse_duration(duration): # Accept units
+	match = durationre.match(duration)
+	if not match:
+		argparser.error("Could not understand duration %s" % duration)
+	time, units = match.group(1), match.group(2)
+	time = float(time)
+	if units == 'm':
+		time *= 60
+	elif units == 'h':
+		time *= 60*60
+	elif units == 'd':
+		time *= 60*60*24
+	return time
+
+def parse_signal(sig): # Accept names
+	if sig.isdigit():
+		return int(sig)
+	for k,v in signal.__dict__.items(): # items() rather than iteritems(), so Python 3 works too
+		if k == ("SIG%s" % sig):
+			return v
+	argparser.error("Could not understand signal name %s" % sig)
+
+# Final interpretation of arguments
+duration = parse_duration(args.duration)
+kill_after = parse_duration(args.kill_after) if args.kill_after is not None else None
+timeout_signal = parse_signal(args.signal)
+command = args.command + extra_args
+
+# Process environment
+global_env = copy.deepcopy( os.environ )
+
+verbose = VERBOSE_KEY in global_env
+logging = LOG_FILE_KEY in global_env
+logfile = global_env[LOG_FILE_KEY] if logging else None
+crash_resuming = True # TODO: Consider exposing this option, or adding a retry_on_crash option.
+failmax = int(global_env[RETRY_KEY]) if RETRY_KEY in global_env else 0
+babysitting = True # If false, babysitter becomes a timeout clone with no env manipulation or anything.
+if babysitting:
+	babysitter_dir = tempfile.mkdtemp()
+	global_env[CURRENT_TEST_KEY] = os.path.join(babysitter_dir, CURRENT_TEST_FILE)
+	global_env[RAN_TEST_KEY] = os.path.join(babysitter_dir, RAN_TEST_FILE)
+	global_env[FAILED_TEST_KEY] = os.path.join(babysitter_dir, FAILED_TEST_FILE)
+
+have_unix_process_groups = 'killpg' in os.__dict__
+have_windows_process_groups = 'CREATE_NEW_PROCESS_GROUP' in subprocess.__dict__
+
+### Timeout implementation
+
+def wait(proc, duration):
+	# TODO: If we detect Python 3.3, Popen objects have a wait(timeout) method we can use
+	start = time.time()
+	while True:
+		code = proc.poll()
+		if code is not None:
+			return code
+		if time.time()-start > duration:
+			return None
+		time.sleep(0.05)
+
+# Popen and send_signal can't be called in their basic forms because we want to
+# send signals to all children, not just to the immediately spawned process.
+# Unfortunately the way to do this varies by operating system.
+def popen(*args, **kwargs):
+	if have_unix_process_groups: # Call function on spawn to become process group leader
+		kwargs['preexec_fn'] = os.setsid
+	elif have_windows_process_groups: # Set magic flag for Windows process groups
+		kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
+	return subprocess.Popen(*args, **kwargs)
+
+def send_signal(proc, sig):
+	if have_unix_process_groups: # UNIX
+		# For compatibility with GNU timeout, pre-send the signal to just the monitored process
+		os.kill(proc.pid, sig)
+		# Send signal to entire group
+		os.killpg(proc.pid, sig)
+		# For compatibility with GNU timeout, send a SIGCONT after the signal
+		# (so delivery has a chance to occur even for stopped processes)
+		if sig != signal.SIGKILL and sig != signal.SIGCONT:
+			os.kill(proc.pid, signal.SIGCONT)
+	elif have_windows_process_groups: # Windows with Python 2.7 or better
+		os.kill(proc.pid, sig) # Because of CREATE_NEW_PROCESS_GROUP, this will go to the entire group
+	else: # Windows with Python 2.6-- CREATE_NEW_PROCESS_GROUP not supported
+		proc.send_signal(sig) # No way to contact group, just kill process
+
+### Utility functions
+
+def attemptDelete(path):
+	try:
+		os.remove(path)
+	except OSError:
+		pass
+
+def attemptLines(path):
+	try:
+		with open(path) as f:
+			return [s.strip('\r\n') for s in f.readlines()] # A real list, so len() and 'in' work on Python 3
+	except (OSError, IOError):
+		return []
+
+def attemptFirstLine(path):
+	lines = attemptLines(path)
+	if lines:
+		return lines[0]
+	return None
+
+def posixtime(): # Amazingly, time.time() does not guarantee an epoch in the docs. However, this does:
+	return calendar.timegm(time.gmtime())
+
+failcount = {}
+def failure_may_retry(test):
+	if test not in failcount:
+		failcount[test] = 0
+	failcount[test] += 1
+	return failcount[test] < failmax
+
+def verbose_print(arg):
+	if verbose:
+		print(arg)
+
+def failure_annotate(test):
+	return "%s (failure #%d of %d allowed)" % (test, failcount[test], failmax)
+
+def pluralize(lst):
+	return "s" if len(lst) > 1 else ""
+
+### Run command
+
+def run(): # Returns exit code
+	resume_after = []
+	retry_these = []
+	ever_completed = False
+	died_politely = False
+	proc = None
+	code = None
+
+	# Set up logging
+	log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
+		INVOKE_JSON: " ".join(command)}
+
+	def log_value(key, set=None, add=None, target=log): # Call to add toplevel value to log
+		if add is not None:
+			if key not in target:
+				target[key] = 0
+			target[key] += add
+		else:
+			target[key] = set
+
+	def log_test(testname, key, set=None, add=None): # Call to add test-case-level value to log
+		if TESTS_JSON not in log:
+			log[TESTS_JSON] = {}
+		if testname not in log[TESTS_JSON]:
+			log[TESTS_JSON][testname] = {}
+		log_value(key, set=set, add=add, target=log[TESTS_JSON][testname])
+
+	# Ready to run tests
+	try:
+		while True:
+			env = copy.copy(global_env)
+			if ever_completed:
+				retry_next = []
+			else: # Persist reported failures list until first non-crash run
+				retry_next = retry_these
+
+			log_value(COUNT_JSON, add=1)
+
+			# Prepare environment/filesystem
+			if babysitting:
+				for key in [CURRENT_TEST_KEY, RAN_TEST_KEY, FAILED_TEST_KEY]:
+					attemptDelete(env[key])
+				if resume_after:
+					env[RUN_KEY] = ";".join(resume_after)
+					env[RUN_MODE_KEY] = "EXCLUDE"
+				elif retry_these:
+					env[RUN_KEY] = ";".join(retry_these)
+					env[RUN_MODE_KEY] = "RUN"
+
+			# Run test suite
+			try:
+				proc = popen(command, env=env)
+			except OSError:
+				died_politely = True
+				sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
+				sys.exit(127)
+
+			code = wait(proc, duration)
+			timed_out = code is None
+			if timed_out: # Initial timeout
+				send_signal(proc, timeout_signal)
+				if kill_after is not None: # Kill-after timeout
+					code = wait(proc, kill_after)
+					if code is None:
+						send_signal(proc, signal.SIGKILL)
+				code = proc.wait() # Waits forever
+				sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
+				died_politely = True
+
+			# The test suite has now run, and what happens next varies:
+			# 1. The suite either completed fully without failures, or timed out: Just quit.
+			# 2. The suite crashed (halted without completing):
+			#    Remember any failures for later and rerun, using a blacklist of testcases we have completed.
+			# 3. The suite completed, but there were failures reported:
+			#    Rerun, using a whitelist of only reported-failed testcases.
+			# 4. The suite crashed partway through a run with a whitelist:
+			#    Rerun, using a whitelist consisting of the previous whitelist minus successful testcases.
+
+			crashed_at = attemptFirstLine(env[CURRENT_TEST_KEY])
+			failed_tests = attemptLines(env[FAILED_TEST_KEY])
+			ran_tests = attemptLines(env[RAN_TEST_KEY])
+			bailout = False
+
+			if crashed_at or failed_tests or ran_tests: # Test suite follows the babysitter protocol
+				log_value(SUPPORT_JSON, True)
+
+			if not crashed_at and not ever_completed: # The resume_after whitelist is only used before the first noncrashing run
+				resume_after = []
+				ever_completed = True
+
+			if timed_out: # Currently no retries after timeout
+				bailout = True
+				code = 124 # See GNU timeout manpage
+
+			if code or crashed_at: # Process failures
+				# Handle crash failures
+				if crashed_at and not timed_out:
+					log_test(crashed_at, TEST_CRASH_FAILURES, add=1)
+					if not crash_resuming:
+						bailout = True
+
+					if failure_may_retry(crashed_at):
+						if ever_completed: # Rerun with whitelist next time
+							for test in retry_these: # Prepopulate with last whitelist minus run testcases
+								if test == crashed_at or test not in ran_tests: # (plus crashed testcase)
+									retry_next.append(test)
+						else: # Rerun with blacklist next time
+							for test in ran_tests: # Add testcases we just ran to blacklist
+								if test != crashed_at: # (except for the crashed testcase)
+									resume_after.append(test)
+					else:
+						bailout = True
+
+				# Handle reported failures
+				for test in failed_tests:
+					log_test(test, TEST_FAILURES, add=1)
+					if failure_may_retry(test):
+						retry_next.append(test)
+					else:
+						bailout = True
+
+				# Report human-readable failures for stdout log.
+				message = "%s:" % (scriptname)
+				if timed_out:
+					message += " Saw timeout in test case %s (never allowed)." % (crashed_at)
+				elif crashed_at:
+					message += " Saw crash in test case %s." % (failure_annotate(crashed_at))
+				if failed_tests:
+					message += " Saw test failure in test case%s %s." % (pluralize(failed_tests), "; ".join(map(failure_annotate, failed_tests)))
+				if not (timed_out or crashed_at or failed_tests):
+					message += " Test suite terminated with code %d, " % (code)
+					if log[SUPPORT_JSON]:
+						message += "but failure did not occur during a test case. Halting."
+					else:
+						message += "and suite cannot report test case data. Halting."
+				elif bailout:
+					message += " Will halt testing."
+				print(message)
+
+			if bailout or not (resume_after or retry_next): # If not retrying
+				return code
+
+			# If we got here, a retry is occurring
+			retry_these = retry_next
+
+			# Report human-readable retry notice for stdout log.
+			message = "%s: Will rerun test suite" % (scriptname)
+			if log[COUNT_JSON] > 1:
+				message += " (iteration #%d)" % (log[COUNT_JSON])
+			if resume_after:
+				message += ", resuming at crashed testcase %s." % (crashed_at)
+			else:
+				message += ", running previously failed testcase%s: %s." % (pluralize(retry_these), "; ".join(retry_these))
+			print(message)
+	finally:
+		# Emergency: Ensure command does not outlive this script
+		if proc is not None and not died_politely:
+			send_signal(proc, signal.SIGKILL)
+
+		# Write out logs
+		log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)
+		if logging:
+			with open(logfile, "a") as f:
+				f.write(json.dumps(log) + os.linesep)
+
+sys.exit( run() )
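
Usage note: the flags above compose like GNU timeout's, a duration, an optional signal, an optional kill-after duration, then the command, with the MONO_BABYSITTER_* variables supplying configuration. A typical CI invocation might look like the following (values illustrative only, not taken from this commit):

    MONO_BABYSITTER_LOG_FILE=babysitter_report.json_lines \
    MONO_BABYSITTER_RETRY=3 \
    scripts/ci/babysitter -s ABRT -k 60s 300m make check

With MONO_BABYSITTER_RETRY=3, failure_may_retry() allows a given test case to be rerun after its first and second failures; a third failure of the same case halts the run.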
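The nunit side of the protocol is not part of this diff; in mono it lives in the babysitter-aware nunit24 harness. As a rough Python sketch of the contract the script above implies (helper names hypothetical, not real nunit API), a cooperating runner would behave like this:

    import os

    # Where the babysitter asked us to leave reports (set under "if babysitting:" above)
    current_file = os.environ['MONO_BABYSITTER_NUNIT_CURRENT_TEST_FILE']
    ran_file = os.environ['MONO_BABYSITTER_NUNIT_RAN_TEST_FILE']
    failed_file = os.environ['MONO_BABYSITTER_NUNIT_FAILED_TEST_FILE']

    run_list = os.environ.get('MONO_BABYSITTER_NUNIT_RUN_TEST', '')
    run_mode = os.environ.get('MONO_BABYSITTER_NUNIT_RUN_MODE', '')
    selected = run_list.split(';') if run_list else []

    def should_run(test):
        if run_mode == 'RUN': # Whitelist: run only the named tests
            return test in selected
        if run_mode == 'EXCLUDE': # Blacklist: skip tests that already completed
            return test not in selected
        return True

    def run_one(test, execute): # 'execute' is a hypothetical test-invoking callback
        with open(current_file, 'w') as f: # Mark the test as in flight; if we crash,
            f.write(test + '\n')           # the babysitter reads this back as crashed_at
        ok = execute(test)
        open(current_file, 'w').close()    # Clear the in-flight marker: no crash here
        with open(ran_file, 'a') as f:     # Record completion for the blacklist logic
            f.write(test + '\n')
        if not ok:
            with open(failed_file, 'a') as f: # Record ordinary (non-crash) failures
                f.write(test + '\n')

The clearing step is an inference rather than something visible in this diff: attemptFirstLine(env[CURRENT_TEST_KEY]) is interpreted as a crash location, so a runner that left the marker in place after a clean run would look crashed.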
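When MONO_BABYSITTER_LOG_FILE is set, each invocation of run() appends one JSON object per line; the LOGGING_FILE constant suggests the conventional filename. A minimal sketch for summarizing such a log, using only the key names defined at the top of the script:

    import json

    with open("babysitter_report.json_lines") as f: # Path is whatever LOG_FILE_KEY pointed at
        for line in f:
            entry = json.loads(line)
            print("%s -> final code %s after %d iteration(s)" %
                  (entry['invocation'], entry['final_code'], entry['iteration']))
            # 'tests' maps test case names to failure counters
            # (normal_failures / crash_failures / timeout_failures)
            for test, counts in sorted(entry.get('tests', {}).items()):
                print("  %s: %s" % (test, counts))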