Module embeddingsprep.preprocessing.utils.structure
Structure Utils
Functions changing the text data structure
Expand source code
"""
# Structure Utils
Functions changing the text data structure
"""
from nltk import ngrams
from collections import Counter
def melt_vocab_dic(vocab1, vocab2):
    """Merge the word counts of ``vocab2`` into ``vocab1``.

    Both vocabularies must map a word to its number of occurrences:
    vocab = {'word' : count_occurrences_word}

    Args:
        vocab1 : dic
            vocabulary receiving the counts; it is updated in place
        vocab2 : dic
            vocabulary whose counts are added to ``vocab1``

    Returns:
        vocab : dic
            ``vocab1`` itself, now holding the summed counts
    """
    for token, count in vocab2.items():
        # Sum with the existing count when the word is already known,
        # otherwise register the word with the incoming count.
        vocab1[token] = vocab1[token] + count if token in vocab1 else count
    return vocab1
def get_unigram_voc(text):
    """
    Count how often each unigram occurs in a text.

    Args:
        text : str
            a text that is assumed to be cleaned with the preprocessor's
            cleaning method, so that words are separated by single spaces

    Returns:
        vocabulary : dic
            a dictionary {'word' : count_word}
    """
    # Tokenisation is a plain split on the space character: consecutive
    # spaces (or an empty text) therefore produce '' tokens, which is why
    # the text has to be cleaned beforehand.
    counts = Counter()
    counts.update(text.split(" "))
    return dict(counts)
def get_bigram_voc(text, parsing_char):
    """
    Count how often each bigram (pair of consecutive words) occurs in a text.

    Args:
        text : str
            a text that is assumed to be cleaned with the preprocessor
            cleaning method, so that words are separated by single spaces
        parsing_char : str
            the separator placed between the two words of a bigram to
            build its dictionary key

    Returns:
        vocab : dic
            a dictionary {'bigram' : count_bigram}; empty when the text
            holds fewer than two words
    """
    words = text.split(" ")
    # Pair every word with its successor and feed the joined pairs straight
    # into the Counter: no intermediate list of bigram strings is built,
    # which keeps peak memory low on large batches.
    return dict(
        Counter(parsing_char.join(pair) for pair in zip(words, words[1:]))
    )
Functions
def get_bigram_voc(text, parsing_char)
-
Builds a dictionary of unique bigrams and their occurrences.
Args
text
:str
- a text that is assumed to be cleaned with the preprocessor cleaning method
Returns
vocab
:dic
- a dictionary {'bigram' : count_bigram}
Expand source code
def get_bigram_voc(text, parsing_char):
    """
    Count how often each bigram (pair of consecutive words) occurs in a text.

    Args:
        text : str
            a text that is assumed to be cleaned with the preprocessor
            cleaning method, so that words are separated by single spaces
        parsing_char : str
            the separator placed between the two words of a bigram to
            build its dictionary key

    Returns:
        vocab : dic
            a dictionary {'bigram' : count_bigram}; empty when the text
            holds fewer than two words
    """
    words = text.split(" ")
    # Pair every word with its successor and feed the joined pairs straight
    # into the Counter: no intermediate list of bigram strings is built,
    # which keeps peak memory low on large batches.
    return dict(
        Counter(parsing_char.join(pair) for pair in zip(words, words[1:]))
    )
def get_unigram_voc(text)
-
Builds a dictionary, batch per batch, of unique unigrams and their occurrences.
Args
text
:str
- a text that is assumed to be cleaned with the preprocessors cleaning method
Returns
vocabulary
:dic
- a dictionary {'word' : count_word}
Expand source code
def get_unigram_voc(text):
    """
    Count how often each unigram occurs in a text.

    Args:
        text : str
            a text that is assumed to be cleaned with the preprocessor's
            cleaning method, so that words are separated by single spaces

    Returns:
        vocabulary : dic
            a dictionary {'word' : count_word}
    """
    # Tokenisation is a plain split on the space character: consecutive
    # spaces (or an empty text) therefore produce '' tokens, which is why
    # the text has to be cleaned beforehand.
    counts = Counter()
    counts.update(text.split(" "))
    return dict(counts)
def melt_vocab_dic(vocab1, vocab2)
-
Melts (merges) two vocabulary dictionaries by adding up the counts of each word. Vocabulary dictionaries must have the form: vocab = {'word' : count_occurrences_word}
Args
vocab1
:dic
vocab2
:dic
Returns
vocab
:dic
Expand source code
def melt_vocab_dic(vocab1, vocab2):
    """Merge the word counts of ``vocab2`` into ``vocab1``.

    Both vocabularies must map a word to its number of occurrences:
    vocab = {'word' : count_occurrences_word}

    Args:
        vocab1 : dic
            vocabulary receiving the counts; it is updated in place
        vocab2 : dic
            vocabulary whose counts are added to ``vocab1``

    Returns:
        vocab : dic
            ``vocab1`` itself, now holding the summed counts
    """
    for token, count in vocab2.items():
        # Sum with the existing count when the word is already known,
        # otherwise register the word with the incoming count.
        vocab1[token] = vocab1[token] + count if token in vocab1 else count
    return vocab1