#!/usr/bin/env python3
#
# Copyright (c) 2015 Jon Turney
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#

#
# parser for .hint files
#

import argparse
import re
from collections import OrderedDict

# license_expression is an optional dependency: 'licensing' is either an
# SPDX licensing object built from it, or None when it isn't installed
try:
    import license_expression
except ModuleNotFoundError:
    # without license_expression, the 'license' key is accepted but its value
    # is not validated (hint_file_parse only validates if licensing is truthy)
    licensing = None
else:
    # reach inside license_expression to add custom license ids we permit
    #
    # (note that 'json' here is the license index returned by
    # license_expression, used below as a list of dicts, not the stdlib module)
    json = license_expression.get_license_index()
    extra_licenses = [
        'Linux-man-pages-copyleft',  # requires SPDX license-list 3.15
        'OFSFDL',                    # "Old FSF documentation license"
        'Public-Domain',
    ]
    # only append an id which the index doesn't already contain
    for l in extra_licenses:
        if not any(j["spdx_license_key"] == l for j in json):
            json.append({"spdx_license_key": l})
    licensing = license_expression.build_spdx_licensing(json)

# types of key:
# 'multilineval' - always have a value, which may be multiline
# 'val'          - always have a value
# 'optval'       - may have an empty value
# 'noval'        - always have an empty value
keytypes = ['multilineval', 'val', 'optval', 'noval']

# kinds of hint file, and their allowed keys
pvr, override, spvr = range(3)

# keys which are permitted in both pvr and spvr hints
commonkeys = {
    'ldesc': 'multilineval',
    'category': 'val',
    'sdesc': 'val',
    'test': 'noval',   # mark the package as a test version
    'version': 'val',  # version override
    'disable-check': 'val',
    'notes': 'val',    # tool notes; not significant to calm itself
}

# map from kind of hint file to a dict of {permitted key: type of key}
hintkeys = {
    pvr: {
        **commonkeys,
        'message': 'multilineval',
        'external-source': 'val',
        'requires': 'optval',
        'obsoletes': 'optval',
        'provides': 'val',
        'conflicts': 'val',
    },
    spvr: {
        **commonkeys,
        'skip': 'noval',   # in all spvr hints, but ignored
        'homepage': 'val',
        'build-depends': 'optval',
        'license': 'val',
    },
    override: {
        'keep': 'val',
        'keep-count': 'val',
        'keep-count-test': 'val',
        'keep-days': 'val',
        'keep-superseded-test': 'noval',
        'disable-check': 'val',
        'replace-versions': 'val',
        'noretain': 'val',
    },
}

# valid categories
categories = [
    'accessibility',
    'admin',
    'archive',
    'audio',
    'base',
    'comm',
    'database',
    'debug',
    'devel',
    'doc',
    'editors',
    'games',
    'gnome',
    'graphics',
    'interpreters',
    'kde',
    'libs',
    'lua',
    'lxde',
    'mail',
    'mate',
    'math',
    'net',
    'ocaml',
    'office',
    'perl',
    'php',
    'publishing',
    'python',
    'ruby',
    'scheme',
    'science',
    'security',
    'shells',
    # 'source' is added to all source packages created by deduplicator to
    # ensure they have a category
    'source',
    'sugar',
    'system',
    'tcl',
    'text',
    'utils',
    'video',
    'virtual',
    'web',
    'x11',
    'xfce',
    '_obsolete',
]


#
# A simple lexer to handle multi-line quoted values
#
# Historically, a multi-line quote is terminated only by a quote at the end of
# the line, and embedded quotes are transformed to single quotes.  So there is
# no escaping of embedded quotes, and no way to represent one.
#
# XXX: Fix the few packages which use embedded quotes, then we can switch this
# to a simpler character by character lexer, which just reads until next
# newline, and next quote when we encounter a quote.
#
def item_lexer(c):
    """
    Split the text |c| of a hint file into items, joining up multi-line
    quoted values.

    This is a generator, yielding a tuple (index, item, error) per item:

    index  0-based index of the last line of |c| consumed for the item
    item   text of the item, with the lines of a multi-line value joined
           by '\\n'
    error  None, or "unterminated quote" if the text ran out before a line
           ending with a closing quote was found
    """
    # a single iterator is shared by both loops, so lines consumed as the
    # continuation of a quoted value aren't seen again by the outer loop
    numbered = enumerate(c.splitlines())

    for lineno, text in numbered:
        # discard lines starting with '#' (before any indentation is removed)
        if text.startswith('#'):
            continue

        item = text.strip()

        # discard empty lines
        if not item:
            continue

        # an unquoted line, or a line containing a complete quoted text
        quotes = item.count('"')
        if quotes == 0 or quotes == 2:
            yield (lineno, item, None)
            continue

        # otherwise, the line contains an opening quote: continue reading
        # lines till one ends with the closing quote
        error = "unterminated quote"
        for lineno, text in numbered:
            # multi-line quoted text preserves any leading space used for
            # indentation, but removes any trailing space
            item = item + '\n' + text.rstrip()
            # multi-line quoted text is only terminated by a quote at the end
            # of the line
            if item.endswith('"'):
                error = None
                break

        yield (lineno, item, error)


def split_trim_sort_join(hint, splitchar, joinchar=None):
    """
    Return the elements of the string |hint| as a sorted, normalized list.

    |hint| is split on |splitchar| (or on runs of whitespace, if that is
    None), each element has surrounding whitespace removed, and the sorted
    elements are joined with |joinchar|.

    If |joinchar| is None, |splitchar| followed by a space is used, so
    |joinchar| must be given when |splitchar| is None.
    """
    separator = splitchar + ' ' if joinchar is None else joinchar

    elements = [element.strip() for element in hint.split(splitchar)]
    elements.sort()

    return separator.join(elements)


# parse the file |fn| as a .hint file of kind |kind|
def hint_file_parse(fn, kind, strict=False):
    """
    Parse the file |fn| as a .hint file of kind |kind|.

    fn      pathname of the .hint file to read
    kind    pvr, spvr or override, selecting the set of permitted keys and
            their value types; if None, the per-key checks are skipped and
            any key is accepted
    strict  if True, a spvr hint must also have a 'homepage' key, and gets a
            warning if it has no 'license' key

    Returns an OrderedDict of key: value, in order of first appearance.  Some
    values are normalized on the way (typos in sdesc/ldesc are fixed, and
    build-depends, obsoletes and replace-versions are sorted).

    Problems with the contents of the file don't raise an exception, but are
    returned as lists of strings under the keys 'parse-errors' and
    'parse-warnings', which are only present if non-empty.
    """
    hints = OrderedDict()
    errors = []
    warnings = []

    assert (kind in hintkeys) or (kind is None)

    with open(fn, 'rb') as f:
        raw = f.read()

    # validate that .hint file is UTF-8 encoded
    #
    # (only the decoding is guarded, so an error from anything else can't be
    # misreported as an encoding problem)
    try:
        text = raw.decode('utf-8')
    except UnicodeDecodeError:
        errors.append('invalid UTF-8')
        hints['parse-errors'] = errors
        return hints

    # parse as key:value items
    #
    # NOTE(review): i is the 0-based index yielded by item_lexer for the last
    # line of the item, so the 'at line' numbers reported below are one less
    # than an editor would show -- confirm if that's intended before changing
    for (i, item, error) in item_lexer(text):

        if error:
            errors.append('%s at line %d' % (error, i))

        if item.count('"') not in (0, 2):
            errors.append('double-quote within double-quotes at line %d (hint files have no escape character)' % (i))

        # key:value
        match = re.match(r'^([^:\s]+):\s*(.*)$', item, re.DOTALL)
        if not match:
            errors.append("unknown construct '%s' at line %d" % (item, i))
            continue

        key = match.group(1)
        value = match.group(2)

        if kind is not None:
            # a key which isn't permitted in this kind of hint is reported,
            # and not stored
            if key not in hintkeys[kind]:
                errors.append('unknown key %s at line %d' % (key, i))
                continue
            valtype = hintkeys[kind][key]

            # check if the key occurs more than once
            if key in hints:
                errors.append('duplicate key %s' % (key))

            # check the value meets any key-specific constraints
            if (valtype == 'val') and (len(value) == 0):
                errors.append('%s has empty value' % (key))

            if (valtype == 'noval') and (len(value) != 0):
                errors.append("%s has non-empty value '%s'" % (key, value))

            # only 'ldesc' and 'message' are allowed a multi-line value
            if (valtype != 'multilineval') and (len(value.splitlines()) > 1):
                errors.append("key %s has multi-line value" % (key))

        # validate all categories are in the category list (case-insensitively)
        if key == 'category':
            for cat in value.split():
                if cat.lower() not in categories:
                    errors.append("unknown category '%s'" % (cat))

        if key in ['sdesc', 'ldesc']:
            # verify that value for ldesc or sdesc is quoted (genini
            # forces this)
            if not (value.startswith('"') and value.endswith('"')):
                errors.append("%s value '%s' should be quoted" % (key, value))

            # warn about and fix common typos in ldesc/sdesc
            value, msg = typofix(value)
            if msg:
                warnings.append("%s in %s" % (','.join(msg), key))

        if key == 'sdesc':
            # if sdesc ends with a '.', warn and fix it
            if re.search(r'\."$', value):
                warnings.append("sdesc ends with '.'")
                value = re.sub(r'\."$', '"', value)

            # if sdesc contains '  ', warn and fix it
            if '  ' in value:
                warnings.append("sdesc contains '  '")
                value = value.replace('  ', ' ')

        # message must have an id and some text
        if key == 'message':
            if not re.match(r'(\S+)\s+(\S.*)', value):
                errors.append('message value must have id and text')

        # license must be a valid spdx license expression
        #
        # (this is skipped if license_expression isn't available)
        if key == 'license' and licensing:
            try:
                licensing.parse(value, strict=True)
                le = licensing.validate(value, strict=True)
            except license_expression.ExpressionParseError as e:
                errors.append('errors parsing license expression: %s' % (e))
            except license_expression.ExpressionError as e:
                errors.append('errors validating license expression: %s' % (e))
            else:
                if not le.normalized_expression:
                    warnings.append('errors in license expression: %s' % (le.errors))
                elif le.original_expression != le.normalized_expression:
                    warnings.append("license expression: '%s' normalizes to '%s'" % (value, le.normalized_expression))

        # warn if value starts with a quote followed by whitespace
        if re.match(r'^"[ \t]+', value):
            warnings.append('value for key %s starts with quoted whitespace' % (key))

        # store the key:value
        hints[key] = value

    if ('skip' in hints) and (len(hints) == 1):
        errors.append("hint only contains skip: key, please update to cygport >= 0.22.0")

    # for the pvr kind, 'category' and 'sdesc' must be present
    # (genini also requires 'requires' but that seems wrong)
    # for the spvr kind, 'homepage' must be present for new packages
    if (kind == pvr) or (kind == spvr):
        mandatory = ['category', 'sdesc']
        if (kind == spvr) and strict:
            mandatory.append('homepage')

        for k in mandatory:
            if k not in hints:
                errors.append("required key '%s' missing" % (k))

        suggested = []
        if (kind == spvr) and strict:
            suggested.append('license')

        for k in suggested:
            if k not in hints:
                warnings.append("key '%s' missing" % (k))

    # warn if ldesc and sdesc seem transposed
    #
    # (Unfortunately we can't be totally strict about this, as some
    # packages like to repeat the basic description in ldesc in every
    # subpackage, but add to sdesc to distinguish the subpackages)
    #
    # sdesc may be absent (that's already been reported above, where it's
    # mandatory), in which case there's nothing to compare
    if ('ldesc' in hints) and ('sdesc' in hints):
        if len(hints['sdesc']) > 2 * len(hints['ldesc']):
            warnings.append('sdesc is much longer than ldesc')

    # sort these hints, as differences in ordering are uninteresting
    if 'build-depends' in hints:
        if ',' in hints['build-depends']:
            hints['build-depends'] = split_trim_sort_join(hints['build-depends'], ',')
        else:
            hints['build-depends'] = split_trim_sort_join(hints['build-depends'], None, ', ')

    if 'obsoletes' in hints:
        # obsoletes is specified as comma separated, but cygport writes it space separated at the moment...
        if ',' in hints['obsoletes']:
            hints['obsoletes'] = split_trim_sort_join(hints['obsoletes'], ',')
        else:
            hints['obsoletes'] = split_trim_sort_join(hints['obsoletes'], None, ', ')

    if 'replace-versions' in hints:
        hints['replace-versions'] = split_trim_sort_join(hints['replace-versions'], None, ' ')

    if errors:
        hints['parse-errors'] = errors

    if warnings:
        hints['parse-warnings'] = warnings

    return hints


# write hints |hints| to file |fn|
def hint_file_write(fn, hints):
    """
    Write the hints |hints| to the file |fn|, replacing any existing contents.

    |hints| is a dict of key: value, and is written as one 'key: value' line
    per item, in the dict's iteration order.
    """
    # hint files must be UTF-8 encoded (anything else is rejected when a hint
    # is parsed), so say so explicitly, rather than writing in whatever the
    # encoding of the current locale happens to be
    with open(fn, 'w', encoding='utf-8') as f:
        for k, v in hints.items():
            print("%s: %s" % (k, v), file=f)


#
# words that Cygwin package maintainers apparently can't spell correctly
#

# each entry is (misspelling, correction); both include the surrounding
# spaces, so only whole words with a space on either side are matched
words = [
    (' accomodates ', ' accommodates '),
    (' consistant ', ' consistent '),
    (' examing ', ' examining '),
    (' extremly ', ' extremely '),
    (' interm ', ' interim '),
    (' procesors ', ' processors '),
    (' utilitzed ', ' utilized '),
    (' utilties ', ' utilities '),
]


def typofix(v):
    """
    Correct any of the known misspellings in |words| which occur in the
    string |v|.

    Returns a tuple of the corrected string, and a list of 'wrong -> right'
    descriptions, one for each misspelling corrected (empty if there were
    none).
    """
    fixed = []

    for misspelt, corrected in words:
        if misspelt not in v:
            continue

        v = v.replace(misspelt, corrected)
        fixed.append('%s -> %s' % (misspelt.strip(), corrected.strip()))

    return v, fixed


#
#
#

def main(args):
    """
    Validate each of the hint files named in args.files, reporting any
    problems found on stdout.

    A file with a name ending 'src.hint' is parsed as a spvr hint, anything
    else as a pvr hint.  Errors are always reported; the parsed hints and any
    warnings are only output when args.verbose is at least 1.

    Returns the exit status: 255 if any file had errors, otherwise 1 if any
    file had warnings, otherwise 0.
    """
    status = 0

    for fn in args.files:
        hints = hint_file_parse(fn, spvr if fn.endswith('src.hint') else pvr)

        if args.verbose >= 1:
            print(hints)

        if 'parse-warnings' in hints:
            if args.verbose > 0:
                for warning in hints['parse-warnings']:
                    print('%s: %s' % (fn, warning))
            # only ever raise the status, so warnings in this file can't hide
            # errors found in a previous one
            status = max(status, 1)

        if 'parse-errors' in hints:
            for error in hints['parse-errors']:
                print('%s: %s' % (fn, error))
            status = 255

    return status


#
#
#

# when run as a script, validate the hint files named on the command line
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='.hint file validator')
    parser.add_argument('files', nargs='*', metavar='filename', help='list of files')
    parser.add_argument('-v', '--verbose', action='count', dest='verbose', help='verbose output', default=0)

    args = parser.parse_args()

    # exit() is a convenience for interactive use which is added by the site
    # module, so isn't there when that's not loaded (e.g. 'python -S'); raising
    # SystemExit is what it does anyhow, and is always available
    raise SystemExit(main(args))