#!/usr/bin/python3
"""
Universal tokenizer

This code was highly inspired by Laurent Pointal's TreeTagger wrapper:
https://perso.limsi.fr/pointal/dev:treetaggerwrapper

(c) 2009 Jan Pomikalek <jan.pomikalek@gmail.com>
Jan Michelfeit, Vit Suchomel <name.surname@sketchengine.co.uk> 2011-2015
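
Usage (the script filename is illustrative; CONFIG_FILE is a Python module
defining re_list, see import_config below):

    python3 unitok.py CONFIG_FILE < input.txt > output.txt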
"""

import io

__version__ = '4.0'

GLUE_TAG = '<g/>'  # emitted between tokens not separated by whitespace in the input


def tokenize_recursively(text, re_list, offset, depth=0):
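    """Split text into (offset, token_type, value) triples.

    re_list is a priority-ordered list of (token_type, compiled_regex)
    pairs. The regex at the current depth is matched repeatedly; any gaps
    between its matches are handed to the next regex in the list, and text
    that no regex matches is emitted with the catch-all type '*'. The
    regexes are assumed never to match the empty string (a zero-width
    match would not advance pos and the loop would never terminate).

    Illustrative example (not one of the shipped configurations):

    >>> import re
    >>> tokenize_recursively('ab cd', [('WORD', re.compile('[a-z]+'))], 0)
    [(0, 'WORD', 'ab'), (2, '*', ' '), (3, 'WORD', 'cd')]
    """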
    if depth >= len(re_list):
        return [(offset, '*', text)]
    token_type, regular_expr = re_list[depth]
    tokens = []
    pos = 0
    while pos < len(text):
        m = regular_expr.search(text, pos)
        if not m:
            tokens.extend(tokenize_recursively(text[pos:], re_list, offset + pos, depth+1))
            break
        else:
            startpos, endpos = m.span()
            if startpos > pos:
                tokens.extend(tokenize_recursively(text[pos:startpos], re_list, offset + pos, depth+1))
            tokens.append((offset + startpos, token_type, text[startpos:endpos]))
            pos = endpos
    return tokens


def tokenize(text, configuration, offset):
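    """Tokenize text using configuration.re_list; see tokenize_recursively.

    The configuration is any object (here a loaded config module) exposing
    re_list, a priority-ordered list of (token_type, compiled_regex) pairs.
    """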
    re_list = configuration.re_list
    return tokenize_recursively(text, re_list, offset)


def print_token(off, typ, val, debug, offsets):
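    """Format one token as an output line: optional offset and type columns followed by the value."""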
    rv = ''
    if offsets:
        rv += '%d\t' % off
    if debug:
        rv += '%s\t' % typ
    rv += '%s\n' % val
    return rv


def print_tokens(tokens, out, add_glue=True, trim=None, debug=False, offsets=False):
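    """Write tokens one per line, handling whitespace, glue and trimming.

    With add_glue enabled, whitespace tokens are dropped and a GLUE_TAG
    line is emitted between adjacent tokens that had no whitespace between
    them; closing SGML tags never get glue in front of them. For example,
    with a configuration that splits punctuation, 'Hello, <b>world</b>!'
    comes out as:

        Hello
        <g/>
        ,
        <b>
        world
        </b>
        <g/>
        !
    """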
    glue_here = False
    for off, typ, val in tokens:
        # replace newlines with spaces
        val = val.replace('\r', ' ').replace('\n', ' ')
        if typ == 'WHITESPACE':
            if not add_glue:
                out.write(print_token(off, typ, val, debug, offsets))
            glue_here = False
        elif typ == 'SGML_TAG':
            if val.startswith('</'):
                out.write(print_token(off, typ, val, debug, offsets))
            else:
                if add_glue and glue_here:
                    out.write(print_token(off, 'GLUE', GLUE_TAG, debug, offsets))
                out.write(print_token(off, typ, val, debug, offsets))
                glue_here = False
        else:
            if trim and len(val) > trim:
                # over-long tokens keep roughly their first and last trim//2 characters
                val = val[:trim//2] + val[-trim//2:]
            val = xml_unescape(val)
            if add_glue and glue_here:
                out.write(print_token(off, 'GLUE', GLUE_TAG, debug, offsets))
            out.write(print_token(off, typ, val, debug, offsets))
            glue_here = True


def xml_unescape(text):
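    """Unescape the five predefined XML entities.

    '&amp;' is replaced last so that double-escaped input such as
    '&amp;lt;' yields '&lt;' rather than '<'.
    """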
    if '&' not in text:
        return text
    text = text.replace('&lt;', '<').replace('&gt;', '>').replace('&quot;', '"').replace('&apos;', "'").replace('&amp;', '&')
    return text


def import_config(config_path):
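    r"""Load the tokenizer configuration as a Python module (Python >= 3.5).

    The configuration file must define re_list, a priority-ordered list of
    (token_type, compiled_regex) pairs; the token types WHITESPACE and
    SGML_TAG are treated specially by print_tokens. A minimal illustrative
    sketch (real configurations define many more patterns):

        import re
        re_list = [
            ('SGML_TAG',   re.compile(r'<[^>]+>')),
            ('WHITESPACE', re.compile(r'\s+')),
            ('WORD',       re.compile(r'\w+')),
        ]
    """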
    from importlib import machinery, util
    loader = machinery.SourceFileLoader('configuration', config_path)
    spec = util.spec_from_loader('configuration', loader)
    module = util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


if __name__ == "__main__":
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""Description:
- splits input text into tokens (one token per line)
- for specified languages recognizes abbreviations and clitics (such as 've
  or n't in English)
- preserves SGML markup
- recognizes URLs, e-mail addresses, DNS domains, IP addresses
- adds glue (<g/>) tags between tokens not separated by space
- the output can be tagged with the TreeTagger part-of-speech tagger
    """)
    parser.add_argument("-t", "--trim", help="maximum token length in characters", type=int, default=None)
    parser.add_argument("-n", "--no-glue", help="keep whitespace and don't add glue (<g/>) tags", action="store_true")
    parser.add_argument("-w", "--whole", help="read whole input at once (preserves multi-line tags; memory hungry)", action="store_true")
    parser.add_argument("-d", "--debug", help="show token types for debugging", action="store_true")
    parser.add_argument("-o", "--offsets", help="print token offsets", action="store_true")
    parser.add_argument("-v", "--version", action="version", version='%(prog)s '+ __version__)
    parser.add_argument("CONFIG_FILE")
    args = parser.parse_args()
    try:
        configuration = import_config(args.CONFIG_FILE)
    except Exception as exc:
        sys.stderr.write('Invalid configuration file: %s\n' % exc)
        sys.exit(2)

    stdin = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
    stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')
    if args.whole:
        input_data = [stdin.read()]
    else:
        input_data = stdin

    offset = 0
    for line in input_data:
        tokens = tokenize(line, configuration, offset)
        print_tokens(tokens, stdout, not args.no_glue, args.trim, args.debug, args.offsets)
        offset += len(line)

