Module deeptranslit.deeptranslit

Expand source code
import os
import re
import pickle
import string
import pydload
import logging
import itertools

from txt2txt import infer, build_model

# KenLM is an optional dependency: when it cannot be imported the module
# still loads and ranking falls back to the 'simple' scorer.
kenlm_available = True

try:
    import kenlm
except ImportError:
    logging.warning('KenLm not installed. Simple scoring will be used.')
    kenlm_available = False

# Download locations of the model artefacts, keyed by language code and then
# by artefact name ('checkpoint', 'params', 'words', 'lm').
# NOTE(review): the URL values are empty in this copy of the source --
# presumably stripped; restore the real links before relying on downloads.
model_links = {
    'hi': {
        'checkpoint': '',
        'params': '',
        'words': '',
        'lm': '',
    },
}

# Maps a human-readable language name to the code used as key in model_links.
lang_code_mapping = {
    'hindi': 'hi',
}

class DeepTranslit():
    """Transliterate sentences with a txt2txt model and rank the candidates.

    The model, its parameters, the vocabulary and the optional KenLM model
    are stored on the class itself, so they are shared by every instance.
    """

    params = None  # second value returned by build_model; 'input_encoding' is read from it
    model = None   # first value returned by build_model
    words = None   # vocabulary unpickled from the 'words' file ('simple' ranking)
    lm = None      # kenlm.Model, set only when KenLM ranking is enabled
    rank = 'auto'  # ranking mode: 'simple', 'kenlm' or 'auto'

    def __init__(self, lang_code, rank='auto'):
        """Initialize deeptranslit.

        Downloads any missing model file into ``~/.DeepTranslit_<code>`` and
        loads the model, the vocabulary and (optionally) the KenLM model.

        Args:
            lang_code (str): Name or code of the language. (Currently
                supported: hindi/hi)
            rank (str): Mode of ranking. In default mode ('auto') kenlm will
                be used if available. (simple|kenlm|auto are the supported
                options)
        """
        if lang_code in lang_code_mapping:
            lang_code = lang_code_mapping[lang_code]

        if lang_code not in model_links:
            # Best-effort behaviour kept from the original: report and leave
            # the instance without a loaded model instead of raising.
            print("DeepTranslit doesn't support '" + lang_code + "' yet.")
            print("Please raise a issue at to add this language into future checklist.")
            return None

        # loading the model
        home = os.path.expanduser("~")
        lang_path = os.path.join(home, '.DeepTranslit_' + lang_code)
        checkpoint_path = os.path.join(lang_path, 'checkpoint')
        params_path = os.path.join(lang_path, 'params')
        words_path = os.path.join(lang_path, 'words')
        lm_path = os.path.join(lang_path, 'lm')

        # exist_ok avoids failing when the directory appears between a
        # separate existence check and the creation.
        os.makedirs(lang_path, exist_ok=True)

        downloads = (
            ('checkpoint', 'Downloading checkpoint', checkpoint_path),
            ('params', 'Downloading model params', params_path),
            ('words', 'Downloading words', words_path),
            ('lm', 'Downloading lm', lm_path),
        )
        for key, message, path in downloads:
            if not os.path.exists(path):
                url = model_links[lang_code][key]
                print(message, url, 'to', path)
                pydload.dload(url=url, save_to_path=path, max_time=None)

        DeepTranslit.model, DeepTranslit.params = build_model(
            params_path=params_path, enc_lstm_units=64, use_gru=True,
            display_summary=False)

        # NOTE: pickle.load can execute arbitrary code; this file comes from
        # the download above, so the download source must be trusted.
        with open(words_path, 'rb') as words_file:
            DeepTranslit.words = pickle.load(words_file)

        if kenlm_available and rank in {'auto', 'kenlm'}:
            logging.warning('Loading KenLM.')
            DeepTranslit.lm = kenlm.Model(lm_path)
            DeepTranslit.rank = rank
        elif rank == 'simple':
            # Previously an explicit 'simple' request was never stored, so
            # with KenLM installed 'auto' later picked a model that was
            # never loaded.
            DeepTranslit.rank = rank

    def transliterate(self, sent, top=3):
        """Transliterate an input sentence while preserving punctuation at word or sentence endings.

        Args:
            sent (str): Sentence to be transliterated.
            top (int): top-n results to be returned. if 0 or None, all
                results will be returned.

        Returns:
            list: list of tuples of size 2 with first element of each tuple
            being the transliterated sentence and second element being the
            "score". When 'kenlm' ranking is requested but no KenLM model is
            usable, an error is logged and the unranked, untruncated list of
            (sentence, per-word probabilities) tuples is returned instead.
        """
        rank = DeepTranslit.rank

        # re.escape keeps ']', '\\', '^' and '-' literal inside the class;
        # unescaped, the '\\]' pair is read as an escaped ']' and a backslash
        # is never stripped.
        punctuation_re = re.compile('[' + re.escape(string.punctuation) + ']')

        # Lower-case each token and strip its punctuation, remembering one
        # trailing punctuation mark so it can be re-attached afterwards.
        words = []
        puncs = []
        for token in sent.strip().split():
            word = punctuation_re.sub('', token.lower())
            if not word:
                # Token consisted of punctuation only: it is dropped.
                continue
            words.append(word)
            puncs.append(token[-1] if token[-1] in string.punctuation else None)

        # Words with characters outside the model's input encoding are not
        # sent to the model; they are copied to the output unchanged.
        encoding = DeepTranslit.params['input_encoding']
        passthrough = []  # (index in `words`, word)
        model_words = []
        for i, word in enumerate(words):
            if any(c not in encoding for c in word):
                passthrough.append((i, word))
            else:
                model_words.append(word)

        # Skip the model call when nothing is left to transliterate.
        preds = []
        if model_words:
            preds = list(infer(model_words, DeepTranslit.model, DeepTranslit.params))

        # Re-insert in ascending order at the ORIGINAL index: every earlier
        # insertion has already shifted the list, so the original index is
        # the correct slot (the former filtered-list index misplaced the
        # second and later pass-through words).
        for i, word in passthrough:
            preds.insert(i, [{'sequence': word, 'prob': 1}])

        # One candidate sentence per combination of per-word candidates.
        resp = []
        for pred in itertools.product(*preds):
            out_words = [
                cand['sequence'] + punc if punc else cand['sequence']
                for cand, punc in zip(pred, puncs)
            ]
            probs = [cand['prob'] for cand in pred]
            resp.append((' '.join(out_words), probs))

        lm_ready = kenlm_available and DeepTranslit.lm is not None
        if rank == 'auto':
            rank = 'kenlm' if lm_ready else 'simple'

        if rank == 'simple':
            # Score = number of output tokens found in the vocabulary.
            # NOTE(review): tokens carry their re-attached punctuation here,
            # so they only match if the vocabulary has such entries -- confirm.
            vocabulary = DeepTranslit.words
            resp = [
                (candidate, sum(1 for w in candidate.split() if w in vocabulary))
                for candidate, _ in resp
            ]
            resp.sort(key=lambda x: x[1], reverse=True)
        elif rank == 'kenlm':
            if not lm_ready:
                logging.error("KenLm not available")
                return resp

            score = DeepTranslit.lm.score
            resp = [(candidate, score(candidate)) for candidate, _ in resp]
            resp.sort(key=lambda x: x[1], reverse=True)

        if top:
            resp = resp[:top]

        return resp


class DeepTranslit (lang_code, rank='auto')

Initialize deeptranslit


lang_code (str): Name or code of the language. (Currently supported: hindi/hi)

rank (str): Mode of ranking. In default mode ('auto') kenlm will be used if available. (simple|kenlm|auto are the supported options)

Expand source code
class DeepTranslit():
    """Transliterate sentences with a txt2txt model and rank the candidates.

    The model, its parameters, the vocabulary and the optional KenLM model
    are stored on the class itself, so they are shared by every instance.
    """

    params = None  # second value returned by build_model; 'input_encoding' is read from it
    model = None   # first value returned by build_model
    words = None   # vocabulary unpickled from the 'words' file ('simple' ranking)
    lm = None      # kenlm.Model, set only when KenLM ranking is enabled
    rank = 'auto'  # ranking mode: 'simple', 'kenlm' or 'auto'

    def __init__(self, lang_code, rank='auto'):
        """Initialize deeptranslit.

        Downloads any missing model file into ``~/.DeepTranslit_<code>`` and
        loads the model, the vocabulary and (optionally) the KenLM model.

        Args:
            lang_code (str): Name or code of the language. (Currently
                supported: hindi/hi)
            rank (str): Mode of ranking. In default mode ('auto') kenlm will
                be used if available. (simple|kenlm|auto are the supported
                options)
        """
        if lang_code in lang_code_mapping:
            lang_code = lang_code_mapping[lang_code]

        if lang_code not in model_links:
            # Best-effort behaviour kept from the original: report and leave
            # the instance without a loaded model instead of raising.
            print("DeepTranslit doesn't support '" + lang_code + "' yet.")
            print("Please raise a issue at to add this language into future checklist.")
            return None

        # loading the model
        home = os.path.expanduser("~")
        lang_path = os.path.join(home, '.DeepTranslit_' + lang_code)
        checkpoint_path = os.path.join(lang_path, 'checkpoint')
        params_path = os.path.join(lang_path, 'params')
        words_path = os.path.join(lang_path, 'words')
        lm_path = os.path.join(lang_path, 'lm')

        # exist_ok avoids failing when the directory appears between a
        # separate existence check and the creation.
        os.makedirs(lang_path, exist_ok=True)

        downloads = (
            ('checkpoint', 'Downloading checkpoint', checkpoint_path),
            ('params', 'Downloading model params', params_path),
            ('words', 'Downloading words', words_path),
            ('lm', 'Downloading lm', lm_path),
        )
        for key, message, path in downloads:
            if not os.path.exists(path):
                url = model_links[lang_code][key]
                print(message, url, 'to', path)
                pydload.dload(url=url, save_to_path=path, max_time=None)

        DeepTranslit.model, DeepTranslit.params = build_model(
            params_path=params_path, enc_lstm_units=64, use_gru=True,
            display_summary=False)

        # NOTE: pickle.load can execute arbitrary code; this file comes from
        # the download above, so the download source must be trusted.
        with open(words_path, 'rb') as words_file:
            DeepTranslit.words = pickle.load(words_file)

        if kenlm_available and rank in {'auto', 'kenlm'}:
            logging.warning('Loading KenLM.')
            DeepTranslit.lm = kenlm.Model(lm_path)
            DeepTranslit.rank = rank
        elif rank == 'simple':
            # Previously an explicit 'simple' request was never stored, so
            # with KenLM installed 'auto' later picked a model that was
            # never loaded.
            DeepTranslit.rank = rank

    def transliterate(self, sent, top=3):
        """Transliterate an input sentence while preserving punctuation at word or sentence endings.

        Args:
            sent (str): Sentence to be transliterated.
            top (int): top-n results to be returned. if 0 or None, all
                results will be returned.

        Returns:
            list: list of tuples of size 2 with first element of each tuple
            being the transliterated sentence and second element being the
            "score". When 'kenlm' ranking is requested but no KenLM model is
            usable, an error is logged and the unranked, untruncated list of
            (sentence, per-word probabilities) tuples is returned instead.
        """
        rank = DeepTranslit.rank

        # re.escape keeps ']', '\\', '^' and '-' literal inside the class;
        # unescaped, the '\\]' pair is read as an escaped ']' and a backslash
        # is never stripped.
        punctuation_re = re.compile('[' + re.escape(string.punctuation) + ']')

        # Lower-case each token and strip its punctuation, remembering one
        # trailing punctuation mark so it can be re-attached afterwards.
        words = []
        puncs = []
        for token in sent.strip().split():
            word = punctuation_re.sub('', token.lower())
            if not word:
                # Token consisted of punctuation only: it is dropped.
                continue
            words.append(word)
            puncs.append(token[-1] if token[-1] in string.punctuation else None)

        # Words with characters outside the model's input encoding are not
        # sent to the model; they are copied to the output unchanged.
        encoding = DeepTranslit.params['input_encoding']
        passthrough = []  # (index in `words`, word)
        model_words = []
        for i, word in enumerate(words):
            if any(c not in encoding for c in word):
                passthrough.append((i, word))
            else:
                model_words.append(word)

        # Skip the model call when nothing is left to transliterate.
        preds = []
        if model_words:
            preds = list(infer(model_words, DeepTranslit.model, DeepTranslit.params))

        # Re-insert in ascending order at the ORIGINAL index: every earlier
        # insertion has already shifted the list, so the original index is
        # the correct slot (the former filtered-list index misplaced the
        # second and later pass-through words).
        for i, word in passthrough:
            preds.insert(i, [{'sequence': word, 'prob': 1}])

        # One candidate sentence per combination of per-word candidates.
        resp = []
        for pred in itertools.product(*preds):
            out_words = [
                cand['sequence'] + punc if punc else cand['sequence']
                for cand, punc in zip(pred, puncs)
            ]
            probs = [cand['prob'] for cand in pred]
            resp.append((' '.join(out_words), probs))

        lm_ready = kenlm_available and DeepTranslit.lm is not None
        if rank == 'auto':
            rank = 'kenlm' if lm_ready else 'simple'

        if rank == 'simple':
            # Score = number of output tokens found in the vocabulary.
            # NOTE(review): tokens carry their re-attached punctuation here,
            # so they only match if the vocabulary has such entries -- confirm.
            vocabulary = DeepTranslit.words
            resp = [
                (candidate, sum(1 for w in candidate.split() if w in vocabulary))
                for candidate, _ in resp
            ]
            resp.sort(key=lambda x: x[1], reverse=True)
        elif rank == 'kenlm':
            if not lm_ready:
                logging.error("KenLm not available")
                return resp

            score = DeepTranslit.lm.score
            resp = [(candidate, score(candidate)) for candidate, _ in resp]
            resp.sort(key=lambda x: x[1], reverse=True)

        if top:
            resp = resp[:top]

        return resp

Class variables

var lm
var model
var params
var rank

str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str

Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.__str__() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.

var words


def transliterate(self, sent, top=3)

Transliterate an input sentence while preserving punctuation at word or sentence endings.


sent (str): Sentence to be transliterated.

top (int): top-n results to be returned. if 0 or None, all results will be returned.


list: a list of 2-tuples; the first element of each tuple is the transliterated sentence and the second element is its "score".

Expand source code
def transliterate(self, sent, top=3):
    """Transliterate an input sentence while preserving punctuation at word or sentence endings.

    Args:
        sent (str): Sentence to be transliterated.
        top (int): top-n results to be returned. if 0 or None, all results
            will be returned.

    Returns:
        list: list of tuples of size 2 with first element of each tuple
        being the transliterated sentence and second element being the
        "score". When 'kenlm' ranking is requested but no KenLM model is
        usable, an error is logged and the unranked, untruncated list of
        (sentence, per-word probabilities) tuples is returned instead.
    """
    rank = DeepTranslit.rank

    # re.escape keeps ']', '\\', '^' and '-' literal inside the class;
    # unescaped, the '\\]' pair is read as an escaped ']' and a backslash is
    # never stripped.
    punctuation_re = re.compile('[' + re.escape(string.punctuation) + ']')

    # Lower-case each token and strip its punctuation, remembering one
    # trailing punctuation mark so it can be re-attached afterwards.
    words = []
    puncs = []
    for token in sent.strip().split():
        word = punctuation_re.sub('', token.lower())
        if not word:
            # Token consisted of punctuation only: it is dropped.
            continue
        words.append(word)
        puncs.append(token[-1] if token[-1] in string.punctuation else None)

    # Words with characters outside the model's input encoding are not sent
    # to the model; they are copied to the output unchanged.
    encoding = DeepTranslit.params['input_encoding']
    passthrough = []  # (index in `words`, word)
    model_words = []
    for i, word in enumerate(words):
        if any(c not in encoding for c in word):
            passthrough.append((i, word))
        else:
            model_words.append(word)

    # Skip the model call when nothing is left to transliterate.
    preds = []
    if model_words:
        preds = list(infer(model_words, DeepTranslit.model, DeepTranslit.params))

    # Re-insert in ascending order at the ORIGINAL index: every earlier
    # insertion has already shifted the list, so the original index is the
    # correct slot (the former filtered-list index misplaced the second and
    # later pass-through words).
    for i, word in passthrough:
        preds.insert(i, [{'sequence': word, 'prob': 1}])

    # One candidate sentence per combination of per-word candidates.
    resp = []
    for pred in itertools.product(*preds):
        out_words = [
            cand['sequence'] + punc if punc else cand['sequence']
            for cand, punc in zip(pred, puncs)
        ]
        probs = [cand['prob'] for cand in pred]
        resp.append((' '.join(out_words), probs))

    lm_ready = kenlm_available and DeepTranslit.lm is not None
    if rank == 'auto':
        rank = 'kenlm' if lm_ready else 'simple'

    if rank == 'simple':
        # Score = number of output tokens found in the vocabulary.
        # NOTE(review): tokens carry their re-attached punctuation here, so
        # they only match if the vocabulary has such entries -- confirm.
        vocabulary = DeepTranslit.words
        resp = [
            (candidate, sum(1 for w in candidate.split() if w in vocabulary))
            for candidate, _ in resp
        ]
        resp.sort(key=lambda x: x[1], reverse=True)
    elif rank == 'kenlm':
        if not lm_ready:
            logging.error("KenLm not available")
            return resp

        score = DeepTranslit.lm.score
        resp = [(candidate, score(candidate)) for candidate, _ in resp]
        resp.sort(key=lambda x: x[1], reverse=True)

    if top:
        resp = resp[:top]

    return resp