Module deeptranslit.deeptranslit
Expand source code
import os
import re
import pickle
import string
import pydload
import logging
import itertools
from txt2txt import infer, build_model
kenlm_available = True
try:
import kenlm
except:
logging.warn('KenLm not installed. Simple scoring will be used.')
kenlm_available = False
model_links = {
'hi': {
'checkpoint': 'https://github.com/bedapudi6788/DeepTranslit/releases/download/v0.5/en_hi_checkpoint',
'params': 'https://github.com/bedapudi6788/DeepTranslit/releases/download/v0.5/en_hi_params',
'words': 'https://github.com/bedapudi6788/DeepTranslit/releases/download/v0.5/hi_words',
'lm': 'https://github.com/bedapudi6788/DeepTranslit/releases/download/v0.5/hi_lm.bin'
},
}
lang_code_mapping = {
'hindi': 'hi',
}
class DeepTranslit():
params = None
model = None
words = None
lm = None
rank = 'auto'
def __init__(self, lang_code, rank='auto'):
"""
Initialize deeptranslit
Parameters:
lang_code (str): Name or code of the language. (Currently supported: hindi/hi)
rank (str): Mode of ranking. In default mode ('auto') kenlm will be used if available. (simple|kenlm|auto are the supported options)
"""
if lang_code in lang_code_mapping:
lang_code = lang_code_mapping[lang_code]
if lang_code not in model_links:
print("DeepTranslit doesn't support '" + lang_code + "' yet.")
print("Please raise a issue at https://github.com/bedapudi6788/deeptranslit to add this language into future checklist.")
return None
# loading the model
home = os.path.expanduser("~")
lang_path = os.path.join(home, '.DeepTranslit_' + lang_code)
checkpoint_path = os.path.join(lang_path, 'checkpoint')
params_path = os.path.join(lang_path, 'params')
words_path = os.path.join(lang_path, 'words')
lm_path = os.path.join(lang_path, 'lm')
if not os.path.exists(lang_path):
os.mkdir(lang_path)
if not os.path.exists(checkpoint_path):
print('Downloading checkpoint', model_links[lang_code]['checkpoint'], 'to', checkpoint_path)
pydload.dload(url=model_links[lang_code]['checkpoint'], save_to_path=checkpoint_path, max_time=None)
if not os.path.exists(params_path):
print('Downloading model params', model_links[lang_code]['params'], 'to', params_path)
pydload.dload(url=model_links[lang_code]['params'], save_to_path=params_path, max_time=None)
if not os.path.exists(words_path):
print('Downloading words', model_links[lang_code]['words'], 'to', words_path)
pydload.dload(url=model_links[lang_code]['words'], save_to_path=words_path, max_time=None)
if not os.path.exists(lm_path):
print('Downloading lm', model_links[lang_code]['lm'], 'to', lm_path)
pydload.dload(url=model_links[lang_code]['lm'], save_to_path=lm_path, max_time=None)
DeepTranslit.model, DeepTranslit.params = build_model(params_path=params_path, enc_lstm_units=64, use_gru=True, display_summary=False)
DeepTranslit.model.load_weights(checkpoint_path)
DeepTranslit.words = pickle.load(open(words_path, 'rb'))
if kenlm_available and rank in {'auto', 'kenlm'}:
logging.warn('Loading KenLM.')
DeepTranslit.lm = kenlm.Model(lm_path)
DeepTranslit.rank = rank
def transliterate(self, sent, top=3):
"""
Transliterate an input sentence while preserving punctuation at word or sentence endings.
Parameters:
sent (str): Sentence to be transliterated.
top (int): top-n results to be returned. if 0 or None, all results will be returned.
Returns:
list: returns list of tuples of size 2 with first element of each tuple being the transliterated sentence and second element being the "score"
"""
rank = DeepTranslit.rank
words = sent.strip().split()
puncs = []
for i, word in enumerate(words):
words[i] = re.sub('[' + string.punctuation + ']', '', word.lower())
if not words[i]:
continue
punc = None
if word[-1] in string.punctuation:
punc = word[-1]
puncs.append(punc)
words = [w for w in words if w]
np_words = []
for i, word in enumerate(words):
if [c for c in word if c not in DeepTranslit.params['input_encoding']]:
np_words.append((i - len(np_words), word))
words[i] = None
words = [w for w in words if w]
preds = infer(words, DeepTranslit.model, DeepTranslit.params)
for posi, np_word in np_words:
preds = preds[:posi] + [[{'sequence': np_word, 'prob': 1}]] + preds[posi:]
resp = []
preds = list(itertools.product(*preds))
for pred in preds:
words = [w['sequence'] for w in pred]
for i, word in enumerate(words):
if puncs[i]:
word = word + puncs[i]
words[i] = word
probs = [w['prob'] for w in pred]
sent = ' '.join(words)
resp.append((sent, probs))
if rank == 'auto':
if kenlm_available:
rank = 'kenlm'
else:
rank = 'simple'
if rank == 'simple':
for i, (sent, probs) in enumerate(resp):
words = sent.split()
score = sum([1 for word in words if word in DeepTranslit.words])
resp[i] = (sent, score)
resp = sorted(resp, key=lambda x: x[1], reverse=True)
elif rank == 'kenlm':
if not kenlm_available:
logging.error("KenLm not available")
return resp
for i, (sent, probs) in enumerate(resp):
score = DeepTranslit.lm.score(sent)
resp[i] = (sent, score)
resp = sorted(resp, key=lambda x: x[1], reverse=True)
if top:
resp = resp[:top]
return resp
Classes
class DeepTranslit (lang_code, rank='auto')
-
Initialize deeptranslit
Parameters:
lang_code (str): Name or code of the language. (Currently supported: hindi/hi)
rank (str): Mode of ranking. In default mode ('auto') kenlm will be used if available. (simple|kenlm|auto are the supported options)
Expand source code
class DeepTranslit(): params = None model = None words = None lm = None rank = 'auto' def __init__(self, lang_code, rank='auto'): """ Initialize deeptranslit Parameters: lang_code (str): Name or code of the language. (Currently supported: hindi/hi) rank (str): Mode of ranking. In default mode ('auto') kenlm will be used if available. (simple|kenlm|auto are the supported options) """ if lang_code in lang_code_mapping: lang_code = lang_code_mapping[lang_code] if lang_code not in model_links: print("DeepTranslit doesn't support '" + lang_code + "' yet.") print("Please raise a issue at https://github.com/bedapudi6788/deeptranslit to add this language into future checklist.") return None # loading the model home = os.path.expanduser("~") lang_path = os.path.join(home, '.DeepTranslit_' + lang_code) checkpoint_path = os.path.join(lang_path, 'checkpoint') params_path = os.path.join(lang_path, 'params') words_path = os.path.join(lang_path, 'words') lm_path = os.path.join(lang_path, 'lm') if not os.path.exists(lang_path): os.mkdir(lang_path) if not os.path.exists(checkpoint_path): print('Downloading checkpoint', model_links[lang_code]['checkpoint'], 'to', checkpoint_path) pydload.dload(url=model_links[lang_code]['checkpoint'], save_to_path=checkpoint_path, max_time=None) if not os.path.exists(params_path): print('Downloading model params', model_links[lang_code]['params'], 'to', params_path) pydload.dload(url=model_links[lang_code]['params'], save_to_path=params_path, max_time=None) if not os.path.exists(words_path): print('Downloading words', model_links[lang_code]['words'], 'to', words_path) pydload.dload(url=model_links[lang_code]['words'], save_to_path=words_path, max_time=None) if not os.path.exists(lm_path): print('Downloading lm', model_links[lang_code]['lm'], 'to', lm_path) pydload.dload(url=model_links[lang_code]['lm'], save_to_path=lm_path, max_time=None) DeepTranslit.model, DeepTranslit.params = build_model(params_path=params_path, enc_lstm_units=64, use_gru=True, display_summary=False) DeepTranslit.model.load_weights(checkpoint_path) DeepTranslit.words = pickle.load(open(words_path, 'rb')) if kenlm_available and rank in {'auto', 'kenlm'}: logging.warn('Loading KenLM.') DeepTranslit.lm = kenlm.Model(lm_path) DeepTranslit.rank = rank def transliterate(self, sent, top=3): """ Transliterate an input sentence while preserving punctuation at word or sentence endings. Parameters: sent (str): Sentence to be transliterated. top (int): top-n results to be returned. if 0 or None, all results will be returned. Returns: list: returns list of tuples of size 2 with first element of each tuple being the transliterated sentence and second element being the "score" """ rank = DeepTranslit.rank words = sent.strip().split() puncs = [] for i, word in enumerate(words): words[i] = re.sub('[' + string.punctuation + ']', '', word.lower()) if not words[i]: continue punc = None if word[-1] in string.punctuation: punc = word[-1] puncs.append(punc) words = [w for w in words if w] np_words = [] for i, word in enumerate(words): if [c for c in word if c not in DeepTranslit.params['input_encoding']]: np_words.append((i - len(np_words), word)) words[i] = None words = [w for w in words if w] preds = infer(words, DeepTranslit.model, DeepTranslit.params) for posi, np_word in np_words: preds = preds[:posi] + [[{'sequence': np_word, 'prob': 1}]] + preds[posi:] resp = [] preds = list(itertools.product(*preds)) for pred in preds: words = [w['sequence'] for w in pred] for i, word in enumerate(words): if puncs[i]: word = word + puncs[i] words[i] = word probs = [w['prob'] for w in pred] sent = ' '.join(words) resp.append((sent, probs)) if rank == 'auto': if kenlm_available: rank = 'kenlm' else: rank = 'simple' if rank == 'simple': for i, (sent, probs) in enumerate(resp): words = sent.split() score = sum([1 for word in words if word in DeepTranslit.words]) resp[i] = (sent, score) resp = sorted(resp, key=lambda x: x[1], reverse=True) elif rank == 'kenlm': if not kenlm_available: logging.error("KenLm not available") return resp for i, (sent, probs) in enumerate(resp): score = DeepTranslit.lm.score(sent) resp[i] = (sent, score) resp = sorted(resp, key=lambda x: x[1], reverse=True) if top: resp = resp[:top] return resp
Class variables
var lm
var model
var params
var rank
-
str(object='') -> str str(bytes_or_buffer[, encoding[, errors]]) -> str
Create a new string object from the given object. If encoding or errors is specified, then the object must expose a data buffer that will be decoded using the given encoding and error handler. Otherwise, returns the result of object.str() (if defined) or repr(object). encoding defaults to sys.getdefaultencoding(). errors defaults to 'strict'.
var words
Methods
def transliterate(self, sent, top=3)
-
Transliterate an input sentence while preserving punctuation at word or sentence endings.
Parameters:
sent (str): Sentence to be transliterated.
top (int): top-n results to be returned. if 0 or None, all results will be returned.
Returns:
list: returns list of tuples of size 2 with first element of each tuple being the transliterated sentence and second element being the "score"
Expand source code
def transliterate(self, sent, top=3): """ Transliterate an input sentence while preserving punctuation at word or sentence endings. Parameters: sent (str): Sentence to be transliterated. top (int): top-n results to be returned. if 0 or None, all results will be returned. Returns: list: returns list of tuples of size 2 with first element of each tuple being the transliterated sentence and second element being the "score" """ rank = DeepTranslit.rank words = sent.strip().split() puncs = [] for i, word in enumerate(words): words[i] = re.sub('[' + string.punctuation + ']', '', word.lower()) if not words[i]: continue punc = None if word[-1] in string.punctuation: punc = word[-1] puncs.append(punc) words = [w for w in words if w] np_words = [] for i, word in enumerate(words): if [c for c in word if c not in DeepTranslit.params['input_encoding']]: np_words.append((i - len(np_words), word)) words[i] = None words = [w for w in words if w] preds = infer(words, DeepTranslit.model, DeepTranslit.params) for posi, np_word in np_words: preds = preds[:posi] + [[{'sequence': np_word, 'prob': 1}]] + preds[posi:] resp = [] preds = list(itertools.product(*preds)) for pred in preds: words = [w['sequence'] for w in pred] for i, word in enumerate(words): if puncs[i]: word = word + puncs[i] words[i] = word probs = [w['prob'] for w in pred] sent = ' '.join(words) resp.append((sent, probs)) if rank == 'auto': if kenlm_available: rank = 'kenlm' else: rank = 'simple' if rank == 'simple': for i, (sent, probs) in enumerate(resp): words = sent.split() score = sum([1 for word in words if word in DeepTranslit.words]) resp[i] = (sent, score) resp = sorted(resp, key=lambda x: x[1], reverse=True) elif rank == 'kenlm': if not kenlm_available: logging.error("KenLm not available") return resp for i, (sent, probs) in enumerate(resp): score = DeepTranslit.lm.score(sent) resp[i] = (sent, score) resp = sorted(resp, key=lambda x: x[1], reverse=True) if top: resp = resp[:top] return resp