Source code for kwx.languages

"""
languages
---------

Module for organizing language dependencies for text cleaning.

The following languages have been selected because their stopwords can be removed via https://github.com/stopwords-iso/stopwords-iso/tree/master/python.

Contents:
    lem_abbr_dict,
    stem_abbr_dict,
    sw_abbr_dict
"""


[docs]def lem_abbr_dict():
    """
    Calls a dictionary of languages and their abbreviations for lemmatization.

    Notes
    -----
        These languages can be lemmatized via https://spacy.io/usage/models, and are also those that can have their words ordered by parts of speech.

    Returns
    -------
        lem_abbr_dict : dict
            A dictionary with languages as keys and their abbreviations as items.
    """
    return {
        "chinese": "zh",
        "danish": "da",
        "dutch": "nl",
        "english": "en",
        "french": "fr",
        "german": "de",
        "greek": "el",
        "italian": "it",
        "japanese": "ja",
        "lithuanian": "lt",
        "norwegian": "nb",
        "polish": "pl",
        "portuguese": "pt",
        "romanian": "ro",
        "spanish": "es",
    }


[docs]def stem_abbr_dict():
    """
    Calls a dictionary of languages and their abbreviations for stemming.

    Notes
    -----
        These languages don't have good lemmatizers, and will thus be stemmed via https://www.nltk.org/api/nltk.stem.html.

    Returns
    -------
        stem_abbr_dict : dict
            A dictionary with languages as keys and their abbreviations as items.
    """
    return {
        "arabic": "ar",
        "finnish": "fi",
        "hungarian": "hu",
        "swedish": "sv",
    }


[docs]def sw_abbr_dict():
    """
    Calls a dictionary of languages and their abbreviations for stop word removal.

    Notes
    -----
        These languages can only have their stopwords removed via https://github.com/stopwords-iso/stopwords-iso).

    Returns
    -------
        sw_abbr_dict : dict
            A dictionary with languages as keys and their abbreviations as items.
    """
    return {
        "afrikaans": "af",
        "bulgarian": "bg",
        "bengali": "bn",
        "breton": "br",
        "catalan": "ca",
        "czech": "cs",
        "esperanto": "eo",
        "estonian": "et",
        "basque": "eu",
        "farsi": "fa",
        "persian": "fa",
        "irish": "ga",
        "galician": "gl",
        "gujarati": "gu",
        "hausa": "ha",
        "hebrew": "he",
        "hindi": "hi",
        "croatian": "hr",
        "armenian": "hy",
        "indonesian": "id",
        "korean": "ko",
        "kurdish": "ku",
        "latin": "la",
        "latvian": "lv",
        "marathi": "mr",
        "malay": "ms",
        "norwegian": "no",
        "russian": "ru",
        "slovak": "sk",
        "slovenian": "sl",
        "somali": "so",
        "sotho": "st",
        "swahili": "sw",
        "thai": "th",
        "tagalog": "tl",
        "turkish": "tr",
        "ukrainian": "uk",
        "urdu": "ur",
        "vietnamese": "vi",
        "yoruba": "yo",
        "zulu": "zu",
    }