# coding=utf-8

# Sources: majka_pipe/unitok_czech_v2.py, rftagger_pipe/unitok/slovak_1.py

import re

# XML/SGML markup: either a complete comment or a tag/directive whose
# attribute values, when present, are enclosed in single or double quotes.
# Written for re.VERBOSE, so the layout whitespace and "#" notes inside the
# pattern are not part of the expression.
SGML_TAG = r"""
    <!-- .*? -->                # XML/SGML comment
    |                           # -- OR --
    <[!?/]?(?!\d)\w[-\.:\w]*    # Start of tag/directive
    (                           # Attributes
        [^>'"]*                 # - attribute name (+whitespace +equal sign)
        ('[^']*'|"[^"]*")       # - attribute value
    )*
    \s*                         # Spaces at the end
    /?                          # Forward slash at the end of singleton tags
    \s*                         # More spaces at the end
    >                           # +End of tag/directive
"""
# DOTALL lets ".*?" inside a comment span line breaks.
SGML_TAG_RE = re.compile(
    SGML_TAG,
    flags=re.DOTALL | re.VERBOSE | re.UNICODE,
)

# The compound "set-top-box" with up to two hyphens, en dashes or spaces
# between its parts (or none at all), matched case-insensitively. It must
# start after whitespace or at the start of the text and must not be followed
# by a hyphen or a word character.
SETTOPBOX = r"""
(?<!\S)
set[-– ]{0,2}top[-– ]{0,2}box
(?![-\w])
"""
SETTOPBOX_RE = re.compile(
    SETTOPBOX,
    flags=re.VERBOSE | re.IGNORECASE | re.UNICODE,
)

# Ordinal number written as digits plus a trailing period (e.g. "1.").
# At least one digit is required: the previous "\d*" also accepted a bare
# period after whitespace, so a lone "." (or the first dot of " ...") was
# classified as an ordinal.
ORDINAL = r"""
(?<!\S) # preceded by whitespace or start of text
    \d+\.
(?!\s[A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽÄĹĽÔŔ]) # not followed by space and uppercase letter
"""
ORDINAL_RE = re.compile(ORDINAL, re.UNICODE | re.VERBOSE)

# Multi-part number: an optional sign, then at least two digit groups joined
# by a single comma, period, space, slash, plus or minus. A plain run of
# digits with no separator is not matched here. The match must start after
# whitespace or at the start of the text and must not be followed by a hyphen
# or a word character.
NUMBER = r"""
(?<!\S)
    [-+]?
    \d+([,. /+-]\d+)+
(?![-\w])
"""
NUMBER_RE = re.compile(NUMBER, flags=re.VERBOSE | re.UNICODE)

# Context-dependent single-letter abbreviations. Each alternative matches one
# "<letter>." piece and uses lookaround to require its neighbours, so that
# e.g. "s. r. o." yields the three pieces "s.", "r." and "o.".
# Every abbreviation period is escaped: the earlier unescaped "r." and "s."
# matched any character after the letter (e.g. "r." in " sxr.o.").
COMPLEX_ABBR = r"""
(?<!\w) [sr]\. (?=\s\d) # "strana" (page) 1, "rok" (year) 2000
|
(?<=\s) n\. (?=\s?l\.) # "n." of "n. l." ("naseho letopoctu", of our era)
|
(?<=\sn\.\s) l\. | (?<=\sn\.) l\. # "l." of "n. l."
|
(?<=\s) s\. (?=\s?r\.\s?o\.) # "s." of "s. r. o." (limited liability company)
|
(?<=\ss\.) r\. (?=\s?o\.) | (?<=\s) r\. (?=\s?o\.) # "r." of "s. r. o."
|
(?<=[.\s]r\.\s) o\. | (?<=[.\s]r\.) o\. # "o." of "s. r. o."
|
(?<=\s) a\. (?=\s?s\.) # "a." of "a. s." (joint-stock company)
|
(?<=\sa\.\s) s\. | (?<=\sa\.) s\. # "s." of "a. s."
"""
COMPLEX_ABBR_RE = re.compile(COMPLEX_ABBR, re.UNICODE | re.VERBOSE)

# One or more consecutive whitespace characters.
WHITESPACE = r"\s+"
WHITESPACE_RE = re.compile(pattern=WHITESPACE)

# DNS host name: one or more dot-terminated labels followed by a top-level
# domain of at least two letters. Shared by URL and EMAIL below.
DNS_HOST = r"(([-a-z0-9]+\.)+[a-z]{2,})"

# URL starting with an explicit scheme or with a bare "www.".
# - The user name in the optional credentials may be any length; previously
#   it was limited to a single character, so "http://user:pass@host" never
#   matched.
# - Every piece of the concatenation is a raw string, so "\.", "\w" and "\$"
#   are no longer invalid string escape sequences.
URL = r"""
    (
    # scheme://[user[:password]@]
    (ftps?|https?|file)://([-a-z0-9_;?&=]+(:[-a-z0-9_;?&=]*)?@)?
    # or "www" without the scheme part
    |www\.
    )
    # DNS host / localhost / IP
    (""" + DNS_HOST + r"""
    | localhost |
    ([0-9]{1,3}\.){3}[0-9]{1,3})
    # Port specification (optional)
    (:[0-9]+)?
    # Scheme specific extension (optional)
    (/[-\w;/?:@=&\$_.+!*'(~#%,]*)?
"""
URL_RE = re.compile(URL, re.VERBOSE | re.IGNORECASE | re.UNICODE)

# E-mail address: local part of ASCII letters, digits, ".", "_", "-" and "'",
# then "@" and a DNS host name.
EMAIL = r"[-a-z0-9._']+@" + DNS_HOST
EMAIL_RE = re.compile(EMAIL, re.UNICODE | re.IGNORECASE)

# HTML/XML character reference: numeric ("&#233;", "&#xE9;", "&#xe9;") or
# named ("&amp;"). Hex digits and the "x" marker are accepted in either case;
# the previous class [0-9A-F] rejected lowercase hex such as "&#xe9;".
HTMLENTITY = r"&(#[xX]?[0-9A-Fa-f]+|\w+);"
HTMLENTITY_RE = re.compile(HTMLENTITY)

# Hashtag: "#" not preceded by a word character, then an ASCII letter and at
# least one more ASCII letter, digit or underscore (case-insensitive).
HASHTAG = r"(?<!\w)#[a-z][a-z0-9_]+"
HASHTAG_RE = re.compile(HASHTAG, flags=re.IGNORECASE | re.UNICODE)

# Bare domain name without a scheme: one or two dot-terminated labels followed
# by one of the listed top-level domains, delimited by non-word characters on
# both sides (case-insensitive).
DOTCOM = r"""
(?<!\w)
    ([-a-z0-9]+\.){1,2}(com|org|cz|sk|eu)
(?!\w)
"""
DOTCOM_RE = re.compile(
    DOTCOM,
    flags=re.VERBOSE | re.IGNORECASE | re.UNICODE,
)

# Fixed list of abbreviations (listed in explicit lower- and upper-case
# variants) followed by a period. The abbreviation must not be preceded by a
# word character. Matching is case-sensitive.
ABBREVIATION = r"""
(?<!\w) (aj|ap|apod|atd|CSc|čl|Čl|čs|Čs|čsl|Čsl|doc|Doc|dr|Dr|DrSc|gen|Gen|Ch|ing|Ing|JUDr|kl|Kl|kupř|kupr|Kupř|Kupr|max|Max|Mgr|min|Min|mj|Mj|mjr|Mjr|MUDr|MVDr|např|napr|Např|Napr|nar|Nar|npor|Npor|odd|Odd|PaedDr|Ph|PhDr|plk|Plk|popř|popr|Popř|Popr|pozn|Pozn|pplk|Pplk|ppor|Ppor|prof|Prof|př|pr|Př|Pr|příp|príp|Příp|Príp|resp|Resp|RNDr|RSDr|sb|Sb|soc|Soc|spol|Spol|srov|Srov|st|stol|St|str|Str|sv|Sv|tab|Tab|tč|Tč|tel|Tel|tj|Tj|tř|tr|Tř|Tr|tzn|Tzn|tzv|Tzv|ul|Ul|zkr|Zkr|zn|Zn|zvl|Zvl)\.
"""
ABBREVIATION_RE = re.compile(ABBREVIATION, flags=re.VERBOSE | re.UNICODE)

# Dotted initialism such as "U.S.A." or a single initial such as "J.": one or
# more "<uppercase letter>." pairs, optionally followed by one final uppercase
# letter that is not followed by a word character. Must not be preceded by a
# word character.
USA = r"""
(?<!\w)
    ([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽÄĹĽÔŔ]\.)+([A-ZÁČĎÉĚÍŇÓŘŠŤÚŮÝŽÄĹĽÔŔ](?!\w))?
"""
USA_RE = re.compile(USA, flags=re.VERBOSE | re.UNICODE)

# The clitic "-li" attached directly after a word character and not followed
# by another word character (e.g. the "-li" in "bude-li").
CLITIC_RE = re.compile(pattern=r"(?<=\w)-li(?!\w)", flags=re.UNICODE)

# Word characters on both sides of one or more asterisks, e.g. "h***o".
EUPHEMISM = r"\w+\*+\w+"
EUPHEMISM_RE = re.compile(EUPHEMISM, flags=re.UNICODE)

# A word: either a run of word characters and hyphens that both starts and
# ends with a word character, or a single word character.
WORD = r"\w[\w-]*\w|\w"
WORD_RE = re.compile(WORD, flags=re.UNICODE)

# Punctuation that may span several characters: any run of "?" and "!" marks,
# or two consecutive apostrophes.
MULTICHAR_PUNCTUATION = r"([?!]+|'')"
MULTICHAR_PUNCTUATION_RE = re.compile(pattern=MULTICHAR_PUNCTUATION)

# Exactly one character from an explicit set of code points; the set includes
# ASCII brackets "()[]{}", the guillemets \xab/\xbb and the curly quotes
# \u2018, \u2019, \u201c, \u201d, among many others.
SINGLECHAR_PUNCTUATION = r"[\u0028\u005b\u007b\u0029\u005d\u007d\u2985\u2989\u3008\u298d\u300c\u2991\u3010\u2995\u3014\u2018\u169b\u201c\xab\u23b5\xbb\u0f3a\ufd3e\u29d9\u27e9\u276a\u276e\u2772\u29fd\u2986\u300b\u298a\u300f\u298e\u2992\u3017\u3018\u301b\u169c\u301f\u0f3d\u29da\u27e6\u2769\u27ea\u276d\u2771\u2775\u2983\u2987\u298b\u300a\u298f\u300e\u2993\u2997\u3016\u301a\u301e\u203a\u0f3c\u2046\u29db\u27e7\u2768\u27eb\u276c\u2770\u2774\u3019\u2984\u3009\u2988\u2996\u300d\u298c\u3011\u2990\u3015\u2994\u2019\u2998\u201d\u301d\u23b4\u2039\u0f3b\ufd3f\u2045\u29d8\u27e8\u276b\u276f\u2773\u29fc]"
SINGLECHAR_PUNCTUATION_RE = re.compile(
    SINGLECHAR_PUNCTUATION,
    flags=re.UNICODE,
)

# Any single character (except a newline, since DOTALL is not set) together
# with all immediate repetitions of that same character.
ANY_SEQUENCE = r"(.)\1*"
ANY_SEQUENCE_RE = re.compile(pattern=ANY_SEQUENCE)

# (token type name, compiled pattern) pairs for every pattern defined above.
# NOTE(review): the consumer of this list is not in this file; presumably the
# patterns are tried in this order, so more specific ones come before generic
# ones -- confirm before reordering.
re_list = [
    # markup and special-cased compounds
    ("SGML_TAG", SGML_TAG_RE),
    ("SETTOPBOX", SETTOPBOX_RE),
    # numbers and context-dependent abbreviations
    ("ORDINAL", ORDINAL_RE),
    ("NUMBER", NUMBER_RE),
    ("COMPLEX_ABBR", COMPLEX_ABBR_RE),
    ("WHITESPACE", WHITESPACE_RE),
    # web-related tokens
    ("URL", URL_RE),
    ("EMAIL", EMAIL_RE),
    ("HTMLENTITY", HTMLENTITY_RE),
    ("HASHTAG", HASHTAG_RE),
    ("DOTCOM", DOTCOM_RE),
    # abbreviations and words
    ("ABBREVIATION", ABBREVIATION_RE),
    ("USA", USA_RE),
    ("CLITIC", CLITIC_RE),
    ("EUPHEMISM", EUPHEMISM_RE),
    ("WORD", WORD_RE),
    # punctuation and the catch-all
    ("MULTICHAR_PUNCTUATION", MULTICHAR_PUNCTUATION_RE),
    ("SINGLECHAR_PUNCTUATION", SINGLECHAR_PUNCTUATION_RE),
    ("ANY_SEQUENCE", ANY_SEQUENCE_RE),
]
