Module embeddingsprep.models.utils
Utils for Word2Vec wrapper
Expand source code
"""
Utils for Word2Vec wrapper
"""
import json
def read_files(filenames):
    """
    Reads every given file and tokenizes it line by line

    Args:
        filenames : list of str
            the paths to the text files used to learn the embeddings;
            each file is opened in text mode and decoded as UTF-8
    Returns:
        sents : list of list of str
            one list of words per line, files taken in the given order
            and lines in file order
    """
    sents = []
    for path in filenames:
        with open(path, "r", encoding="utf-8") as handle:
            # Splitting on a literal single space (not on arbitrary
            # whitespace) keeps empty strings for blank lines and for
            # consecutive spaces.
            sents.extend(
                row.replace("\n", "").split(" ") for row in handle
            )
    return sents
def open_vocab(vocab_path):
    """
    Opens the json containing the vocabulary used for the word2vec

    Args:
        vocab_path : str
            the path where the vocab json file is stored
    Returns:
        vocab_dic : dic
            the parsed content of the json file (the vocab dictionary)
    Raises:
        OSError: if the file cannot be opened
        json.JSONDecodeError: if the file does not contain valid json
    """
    # The UTF-8 decoding is done by open(). json.load() must not receive an
    # ``encoding`` keyword: it was removed in Python 3.9 and passing it
    # raises TypeError there.
    with open(vocab_path, "r", encoding="utf-8") as vo:
        vocab_dic = json.load(vo)
    return vocab_dic
Functions
def open_vocab(vocab_path)
-
Opens the json containing the vocabulary used for the word2vec
Args
vocab_path
:str
- the path where the vocab json file is stored
Returns
vocab_dic
:dic
- the vocab dictionary
Expand source code
def open_vocab(vocab_path):
    """
    Opens the json containing the vocabulary used for the word2vec

    Args:
        vocab_path : str
            the path where the vocab json file is stored
    Returns:
        vocab_dic : dic
            the parsed content of the json file (the vocab dictionary)
    Raises:
        OSError: if the file cannot be opened
        json.JSONDecodeError: if the file does not contain valid json
    """
    with open(vocab_path, "r", encoding="utf-8") as vo:
        # No ``encoding`` keyword here: json.load() dropped it in Python 3.9
        # (TypeError if passed); the file object already yields decoded text.
        vocab_dic = json.load(vo)
    return vocab_dic
def read_files(filenames)
-
Reads a file line by line
Args
filenames
:list
of str
- a list of strings containing the paths to the files used to learn the embeddings
Returns
sents
:list
of list
of str
- a list containing the words, line per line
Expand source code
def read_files(filenames):
    """
    Reads every given file and tokenizes it line by line

    Args:
        filenames : list of str
            the paths to the text files used to learn the embeddings;
            each file is opened in text mode and decoded as UTF-8
    Returns:
        sents : list of list of str
            one list of words per line, files taken in the given order
            and lines in file order
    """
    sents = []
    for name in filenames:
        with open(name, "r", encoding="utf-8") as stream:
            # A literal single-space split keeps empty strings for blank
            # lines and repeated spaces.
            sents += [raw.replace("\n", "").split(" ") for raw in stream]
    return sents