Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mono/mono.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/ci/babysitter')
-rwxr-xr-xscripts/ci/babysitter409
1 files changed, 409 insertions, 0 deletions
diff --git a/scripts/ci/babysitter b/scripts/ci/babysitter
new file mode 100755
index 00000000000..f8b5a6c345c
--- /dev/null
+++ b/scripts/ci/babysitter
@@ -0,0 +1,409 @@
+#!/usr/bin/env python
+
+# Mimics GNU timeout, but does some fancy tracking based on custom features in mono nunit24.
+
+import argparse
+import subprocess
+import re
+import signal
+import time
+import sys
+import os.path
+import copy
+import tempfile
+import calendar
+import json
+
+### Constants
+
+# Here is how the communication with nunit works. It has to work with two constraints:
+# - We don't invoke nunit directly. We invoke some Makefile which invokes some other Makefile
+# and at some point down the line someone calls nunit.
+# - nunit has to be able to report back to us even if (especially if) it terminates improperly.
+# To deal with all this, communication babysitter->nunit is done by environment variables,
+# and communication nunit->babysitter is done by leaving behind files in known locations.
+
+# Filenames
+
# Scratch-file names. nunit drops these files into a temp directory (see
# babysitter_dir below) so that state survives even a hard crash of the runner.
CURRENT_TEST_FILE = "babysitter_report_current_test_file.txt"  # name of the test currently executing
RAN_TEST_FILE = "babysitter_report_ran_test_file.txt"          # tests that completed
FAILED_TEST_FILE = "babysitter_report_failed_test_file.txt"    # tests that failed
# NOTE(review): LOGGING_FILE is not referenced elsewhere in this script --
# presumably a default consumed by callers; confirm before removing.
LOGGING_FILE = "babysitter_report.json_lines"

# Environment keys

# Keys used for Babysitter<->Nunit IPC
CURRENT_TEST_KEY = 'MONO_BABYSITTER_NUNIT_CURRENT_TEST_FILE' # Tell nunit where to leave files
RAN_TEST_KEY = 'MONO_BABYSITTER_NUNIT_RAN_TEST_FILE'
FAILED_TEST_KEY = 'MONO_BABYSITTER_NUNIT_FAILED_TEST_FILE'
RUN_KEY = 'MONO_BABYSITTER_NUNIT_RUN_TEST' # Semicolon-separated list of test names
RUN_MODE_KEY = 'MONO_BABYSITTER_NUNIT_RUN_MODE' # Equal to either RUN or AFTER

# Keys used for script configuration (see --help text)
LOG_FILE_KEY = 'MONO_BABYSITTER_LOG_FILE' # Path
RETRY_KEY = 'MONO_BABYSITTER_RETRY' # Equal to an integer
VERBOSE_KEY = 'MONO_BABYSITTER_VERBOSE' # "Undocumented"-- used for debugging babysitter

# JSON keys (one JSON object is appended per babysitter invocation; see run())

DATE_JSON = 'date' # POSIX timestamp of test suite run
INVOKE_JSON = 'invocation'
COUNT_JSON = 'iteration' # How many times was command executed?
LIMIT_JSON = 'failure_max'
SUPPORT_JSON = 'babysitter_protocol' # Was the test suite running with a babysitter-aware nunit?
FINAL_CODE_JSON = 'final_code'
TESTS_JSON = 'tests' # Holds dictionary of (test case name)->(dict with TEST_ keys below)
TEST_FAILURES = 'normal_failures'
TEST_CRASH_FAILURES = 'crash_failures'
TEST_TIMEOUT_FAILURES = 'timeout_failures'

### Interpret arguments

scriptname = sys.argv[0]  # used as a prefix for all human-readable messages
+
+# This is very silly: This program needs to switch from argparse to manual parsing
+# after the second positional argument, but argparse doesn't let you do this.
+# I work around this with a custom subclass which can selectively swallow errors:
class Hesitate(Exception):
    """Raised by HesitantParser.error() while the parser is still hesitating."""
    pass

class HesitantParser(argparse.ArgumentParser):
    """An ArgumentParser whose errors can be temporarily suppressed.

    While self.hesitating is True, error() raises Hesitate instead of
    printing a usage message and exiting.  This lets the caller probe
    successively longer argv prefixes until one parses cleanly (argparse
    offers no way to stop consuming after the second positional argument).
    """
    def __init__(self, *args, **kwargs):
        # Idiomatic `self` instead of the original's `s`; errors are
        # swallowed until hesitating is set False.
        self.hesitating = True
        argparse.ArgumentParser.__init__(self, *args, **kwargs)

    def error(self, *args, **kwargs):
        if self.hesitating:
            raise Hesitate()  # Bail out before any error text is printed.
        argparse.ArgumentParser.error(self, *args, **kwargs)
+
+# Define args
# Define args.  The description string is shown verbatim by --help
# (RawTextHelpFormatter below), so its formatting is deliberate.
argparser = HesitantParser(description="""\
Run a test suite with a timeout.\n
Durations are floating point numbers followed by an optional unit:\n
's' for seconds (the default)
'm' for minutes
'h' for hours
'd' for days\n
supported environment variables:
  %s: File to write logs to (as line-delimited JSON)
  %s: If set to a number, failed test cases will be rerun this many times (NUnit test suites only)""" %
        (LOG_FILE_KEY, RETRY_KEY),
    formatter_class=argparse.RawTextHelpFormatter)
# Mirrors GNU timeout's -s/--signal option.
argparser.add_argument('-s', '--signal', dest='signal', metavar='signal', default='TERM',
    help="Send this signal to the command on timeout, instead of TERM.")
# Mirrors GNU timeout's -k/--kill-after option.
argparser.add_argument('-k', '--kill-after-duration', dest='kill_after', metavar='duration',
    help="If process continues running after signal, send KILL after this duration.")
argparser.add_argument('duration',
    help="Time to run before sending signal.")
# Everything after `duration` is the command; trailing argv items that argparse
# refuses are collected separately by the prefix-probing loop below.
argparser.add_argument('command', nargs="+", help="Command+args to run.")
+
# Repeatedly parse the given args until we find the shortest prefix for which parsing succeeds.
# (argparse cannot be told "stop consuming at the second positional", so we
# probe argv prefixes of increasing length; Hesitate unwinds each failure.)
argc = len(sys.argv)
extra_args = []
for limit in range(1,argc+1):
    try:
        if limit == argc: # On the final pass, parse for real
            argparser.hesitating = False
        args = argparser.parse_args(sys.argv[1:limit])
        # If we're still here, parse_args succeeded.
        # The final parsed arg is the command; remaining argv items are its args.
        extra_args = sys.argv[limit:]
        break
    except Hesitate: # Before the final pass, if parse_args fails, skip
        pass

argparser.hesitating = False # Just act like a normal argparser from here
+
# Anchored with \Z so trailing garbage ("5xyz") is rejected instead of being
# silently truncated; the fractional part matches the --help text, which
# promises "floating point numbers".
durationre = re.compile(r'(\d+(?:\.\d+)?)([smhd]?)\Z')
def parse_duration(duration): # Accept units
    """Convert a duration string such as "90", "1.5m" or "2h" to seconds.

    The optional unit suffix is 's' (default), 'm', 'h' or 'd'.  Returns an
    int when the result is a whole number of seconds (preserving the
    original behavior for integer inputs), otherwise a float.  On malformed
    input, calls argparser.error(), which prints usage and exits.
    """
    match = durationre.match(duration)
    if not match:
        argparser.error("Could not understand duration %s" % duration)
    value, units = match.group(1), match.group(2)
    seconds = float(value) * {'': 1, 's': 1, 'm': 60, 'h': 60*60, 'd': 60*60*24}[units]
    return int(seconds) if seconds == int(seconds) else seconds
+
def parse_signal(sig): # Accept names
    """Convert a signal argument ("9", "KILL", "TERM") to its number.

    Accepts either a raw signal number or a signal name without the SIG
    prefix.  On an unrecognized name, calls argparser.error(), which
    prints usage and exits.
    """
    if sig.isdigit():
        return int(sig)
    # getattr lookup replaces the original Python-2-only
    # signal.__dict__.iteritems() scan and works on both Python 2 and 3.
    value = getattr(signal, "SIG%s" % sig, None)
    if isinstance(value, int):  # signal constants are ints (IntEnum on py3.5+)
        return value
    argparser.error("Could not understand signal name %s" % sig)
+
# Final interpretation of arguments
duration = parse_duration(args.duration)      # main timeout, in seconds
kill_after = parse_duration(args.kill_after) if args.kill_after is not None else None
timeout_signal = parse_signal(args.signal)    # sent when `duration` elapses
command = args.command + extra_args           # full command line to babysit

# Process environment (deep-copied so our IPC keys never leak into os.environ)
global_env = copy.deepcopy( os.environ )

verbose = VERBOSE_KEY in global_env
# NOTE(review): `logging` shadows the stdlib module name; harmless here since
# the module is never imported, but worth renaming if it ever is.
logging = LOG_FILE_KEY in global_env
logfile = global_env[LOG_FILE_KEY] if logging else None
crash_resuming = True # TODO: Consider exposing this option, or adding a retry_on_crash option.
failmax = int(global_env[RETRY_KEY]) if RETRY_KEY in global_env else 0
babysitting = True # If false, babysitter becomes a timeout clone with no env manipulation or anything.
if babysitting:
    # Point nunit at per-run scratch files via the IPC environment keys.
    babysitter_dir = tempfile.mkdtemp()
    global_env[CURRENT_TEST_KEY] = os.path.join(babysitter_dir, CURRENT_TEST_FILE)
    global_env[RAN_TEST_KEY] = os.path.join(babysitter_dir, RAN_TEST_FILE)
    global_env[FAILED_TEST_KEY] = os.path.join(babysitter_dir, FAILED_TEST_FILE)

# Feature detection for process-group signaling; consumed by popen()/send_signal().
have_unix_process_groups = 'killpg' in os.__dict__
have_windows_process_groups = 'CREATE_NEW_PROCESS_GROUP' in subprocess.__dict__
+
+### Timeout implementation
+
def wait(proc, duration):
    """Poll `proc` until it exits or `duration` seconds pass.

    Returns the process exit code, or None on timeout.
    """
    # TODO: If we detect Python 3.3, Popen objects have a wait(timeout) method we can use
    deadline = time.time() + duration
    while True:
        status = proc.poll()
        if status is not None:
            return status
        if time.time() > deadline:
            return None
        time.sleep(0.05)  # Cheap busy-wait; granularity of 50ms is plenty here.
+
+# Popen and send_signal can't be called in their basic forms because we want to
+# send signals to all children, not just to the immediately spawned process.
+# Unfortunately the way to do this varies by operating system.
def popen(*args, **kwargs):
    """subprocess.Popen wrapper that makes the child a process-group leader.

    On UNIX the child calls os.setsid() after fork; on Windows with Python
    2.7+ the CREATE_NEW_PROCESS_GROUP creation flag is set.  Either way,
    send_signal() below can then reach the child's entire process tree.
    """
    if have_unix_process_groups: # Call function on spawn to become process group leader
        kwargs['preexec_fn'] = os.setsid
    elif have_windows_process_groups: # Set magic flag for Windows process groups
        kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
    return subprocess.Popen(*args, **kwargs)
+
def send_signal(proc, sig):
    """Deliver `sig` to proc and, where the OS allows, its whole process group."""
    if have_unix_process_groups: # UNIX
        # For compatibility with GNU timeout, pre-send the signal to just the monitored process
        os.kill(proc.pid, sig)
        # Send signal to entire group (proc is the group leader; see popen)
        os.killpg(proc.pid, sig)
        # For compatibility with GNU Timeout, send a SIGCONT after the signal
        # (so delivery has a chance to occur even for stopped processes)
        if sig != signal.SIGKILL and sig != signal.SIGCONT:
            os.kill(proc.pid, signal.SIGCONT)
    elif have_windows_process_groups: # Windows with Python 2.7 or better
        os.kill(proc.pid, sig) # Because of CREATE_NEW_PROCESS_GROUP, will go to entire group
    else: # Windows with Python 2.6-- CREATE_NEW_PROCESS_GROUP not supported
        proc.send_signal(sig) # No way to contact group, just kill process
+
+### Utility functions
+
def attemptDelete(path):
    """Best-effort removal of `path`; a missing or undeletable file is fine."""
    try:
        os.remove(path)
    except OSError:
        pass  # The file may simply never have been created.
+
def attemptLines(path):
    """Return the lines of the file at `path` (newlines stripped) as a list.

    Returns [] if the file is missing or unreadable.  Uses a list
    comprehension instead of map() so the result is a real list on
    Python 3 as well -- callers rely on truthiness, indexing ([0] in
    attemptFirstLine) and repeated `in` membership tests, all of which
    break on a py3 map iterator.
    """
    try:
        with open(path) as f:
            return [line.strip('\r\n') for line in f.readlines()]
    except (OSError, IOError):
        return []
+
def attemptFirstLine(path):
    """First line of the file at `path`, or None if it is missing or empty."""
    lines = attemptLines(path)
    return lines[0] if lines else None
+
def posixtime():
    """Seconds since the Unix epoch.

    Amazingly, time.time() does not guarantee an epoch in the docs, but
    calendar.timegm over UTC struct_time does.
    """
    now_utc = time.gmtime()
    return calendar.timegm(now_utc)
+
# Per-testcase failure tally, shared across all iterations of run().
failcount = {}
def failure_may_retry(test):
    """Record one failure of `test`; True while it is still under the failmax limit."""
    failcount[test] = failcount.get(test, 0) + 1
    return failcount[test] < failmax
+
def verbose_print(arg):
    """Print debug chatter only when MONO_BABYSITTER_VERBOSE is set."""
    if not verbose:
        return
    print(arg)
+
+def failure_annotate(test):
+ return "%s (failure #%d of %d allowed)" % (test, failcount[test], failmax)
+
def pluralize(lst):
    """Return "s" when `lst` warrants a plural noun, else ""."""
    return "" if len(lst) <= 1 else "s"
+
+### Run command
+
def run(): # Returns exit code
    """Run `command` under the timeout, rerunning failed or crashed NUnit
    test cases up to `failmax` times, then append one JSON record to the log.

    Reads the module-level configuration set up above (command, duration,
    kill_after, timeout_signal, failmax, babysitting, global_env, ...).
    """
    resume_after = []       # Blacklist: testcases to skip when resuming after a crash
    retry_these = []        # Whitelist: reported-failed testcases to rerun
    ever_completed = False  # Has the suite ever finished without crashing?
    died_politely = False   # True once the child process is known to be dead
    proc = None
    code = None             # Exit code of the most recent run

    # Set up logging
    log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
           INVOKE_JSON: " ".join(command)}

    def log_value(key, set=None, add=None, target=log): # Call to add toplevel value to log
        if add is not None:
            if key not in target:
                target[key] = 0
            target[key] += add
        else:
            target[key] = set

    def log_test(testname, key, set=None, add=None): # Call to add test-case-level value to log
        if TESTS_JSON not in log:
            log[TESTS_JSON] = {}
        if testname not in log[TESTS_JSON]:
            log[TESTS_JSON][testname] = {}
        log_value(key, set=set, add=add, target=log[TESTS_JSON][testname])

    # Ready to run tests
    try:
        while True:
            env = copy.copy(global_env)
            if ever_completed:
                retry_next = []
            else: # Persist reported failures list until first non-crash run
                retry_next = retry_these

            log_value(COUNT_JSON, add=1)

            # Prepare environment/filesystem: clear out stale IPC files, then
            # tell nunit which testcases to run or skip this iteration.
            if babysitting:
                for key in [CURRENT_TEST_KEY, RAN_TEST_KEY, FAILED_TEST_KEY]:
                    attemptDelete(env[key])
                if resume_after:
                    env[RUN_KEY] = ";".join(resume_after)
                    env[RUN_MODE_KEY] = "EXCLUDE"
                elif retry_these:
                    env[RUN_KEY] = ";".join(retry_these)
                    env[RUN_MODE_KEY] = "RUN"

            # Run test suite
            try:
                proc = popen(command, env=env)
            except OSError:
                died_politely = True
                sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
                sys.exit(127)

            code = wait(proc, duration)
            timed_out = code is None
            if timed_out: # Initial timeout
                send_signal(proc, timeout_signal)
                if kill_after is not None: # Kill-after timeout
                    code = wait(proc, kill_after)
                    if code is None:
                        send_signal(proc, signal.SIGKILL)
                code = proc.wait() # Waits forever
                sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
            died_politely = True

            # The test suite has now run, and what happens next varies:
            # 1. The suite either completed fully without failures, or timed out: Just quit.
            # 2. The suite crashed (halted without completing):
            #    Remember any failures for later and rerun, using a blacklist of testcases we have completed.
            # 3. The suite completed, but there were failures reported:
            #    Rerun, using a whitelist of only reported-failed testcases.
            # 4. The suite crashed partway through a run with a whitelist:
            #    Rerun, using a whitelist consisting of the previous whitelist minus successful testcases.

            crashed_at = attemptFirstLine(env[CURRENT_TEST_KEY])
            failed_tests = attemptLines(env[FAILED_TEST_KEY])
            ran_tests = attemptLines(env[RAN_TEST_KEY])
            bailout = False

            if crashed_at or failed_tests or ran_tests: # Test suite follows the babysitter protocol
                log_value(SUPPORT_JSON, True)

            if not crashed_at and not ever_completed: # The resume_after whitelist is only
                resume_after = []                     # used before the first noncrashing run
                ever_completed = True

            if timed_out: # Currently no retries after timeout
                bailout = True
                code = 124 # See GNU timeout manpage

            if code or crashed_at: # Process failures
                # Handle crash failures
                if crashed_at and not timed_out:
                    log_test(crashed_at, TEST_CRASH_FAILURES, add=1)
                    if not crash_resuming:
                        bailout = True

                    if failure_may_retry(crashed_at):
                        if ever_completed: # Rerun with whitelist next time
                            for test in retry_these: # Prepopulate with last whitelist minus run testcases
                                if test == crashed_at or test not in ran_tests: # (plus crashed testcase)
                                    retry_next.append(test)
                        else: # Rerun with blacklist next time
                            for test in ran_tests: # Add testcases we just ran to blacklist
                                if test != crashed_at: # (except for the crashed testcase)
                                    resume_after.append(test)
                    else:
                        bailout = True

                # Handle reported failures
                for test in failed_tests:
                    log_test(test, TEST_FAILURES, add=1)
                    if failure_may_retry(test):
                        retry_next.append(test)
                    else:
                        bailout = True

                # Report human-readable failures for stdout log.
                message = "%s:" % (scriptname)
                if timed_out:
                    message += " Saw timeout in test case %s (never allowed)." % (crashed_at)
                elif crashed_at:
                    message += " Saw crash in test case %s." % (failure_annotate(crashed_at))
                if failed_tests:
                    message += " Saw test failure in test case%s %s." % (pluralize(failed_tests), "; ".join(map(failure_annotate, failed_tests)))
                if not (timed_out or crashed_at or failed_tests):
                    message += " Test suite terminated with code %d, " % (code)
                    if log[SUPPORT_JSON]:
                        message += "but failure did not occur during a test case. Halting."
                    else:
                        message += "and suite cannot report test case data. Halting."
                elif bailout:
                    message += " Will halt testing."
                print(message)

            if bailout or not (resume_after or retry_next): # If not retrying
                return code

            # If we got here, a retry is occurring
            retry_these = retry_next

            # Report human-readable retry notice for stdout log.
            message = "%s: Will rerun test suite" % (scriptname)
            if log[COUNT_JSON] > 1:
                message += " (iteration #%d)" % (log[COUNT_JSON])
            if resume_after:
                message += ", resuming at crashed testcase %s." % (crashed_at)
            else:
                message += ", running previously failed testcase%s: %s." % (pluralize(retry_these), "; ".join(retry_these))
            print(message)
    finally:
        # Emergency: Ensure command does not outlive this script
        if proc is not None and not died_politely:
            send_signal(proc, signal.SIGKILL)

        # Write out logs (one JSON object per line, appended)
        log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)
        if logging:
            with open(logfile, "a") as f:
                f.write(json.dumps(log) + os.linesep)
+
+sys.exit( run() )