Module embeddingsprep.models.word2vec
Word2Vec
Wrapper around gensim's Word2Vec implementation.
Outputs embeddings in a TensorFlow Projector-readable format.
Expand source code
"""
# Word2Vec
Wrapper for gensim's implementation.
Outputs embeddings to a tensorflow projector-readable format.
"""
import os
import gensim
from glob import glob
from multiprocessing import cpu_count

from .utils import read_files, open_vocab
class Word2Vec(object):
    """
    The class implementing a word2vec model.
    """

    def __init__(
        self,
        emb_size=200,
        window=10,
        epochs=10,
        min_count=1,
        restore_from=False,
    ):
        """Instantiate a word2vec model.

        Args:
            emb_size : int
                The size/dimension of the embedding to train. default : 200
            window : int
                The size of the window to train the embeddings on. default : 10
            epochs : int
                Number of epochs for training. default : 10
            min_count : int
                The minimum number of occurrences a word must have to be
                included in the vocabulary. default : 1
            restore_from : bool or str
                False if the model is initialized from scratch. Path to a
                gensim word2vec model if the model is to be restored.
        """
        self.emb_size = emb_size
        self.window = window
        self.epochs = epochs
        self.min_count = min_count
        self.restore_from = restore_from
    def train(self, filenames):
        """Trains the model with the specified params.

        Args:
            filenames : str or list of str
                Path to a file or directory containing the files to train
                the model on, or a list of such paths. Files should consist
                of plain, raw text, preprocessed with the Preprocessor.
        """
        if isinstance(filenames, str):
            if os.path.isdir(filenames):
                filenames = glob(os.path.join(filenames, "*"))
            elif os.path.isfile(filenames):
                filenames = [filenames]
            else:
                raise FileNotFoundError(
                    "No such file or directory: {0}".format(filenames)
                )
        elif not isinstance(filenames, list):
            raise TypeError("filenames must be a str or a list of str")
        sents = read_files(filenames)
        if self.restore_from:
            assert os.path.isfile(
                self.restore_from
            ), "The path to restore the model from is not a file. Aborting."
            self.model = gensim.models.Word2Vec.load(self.restore_from)
        else:
            # gensim < 4.0 API: `size` was renamed `vector_size` in 4.0.
            # The corpus is not passed to the constructor, so training
            # happens exactly once, in the train() call below.
            self.model = gensim.models.Word2Vec(
                size=self.emb_size,
                window=self.window,
                min_count=self.min_count,
                workers=cpu_count(),
            )
            self.model.build_vocab(sents)
        self.model.train(sents, total_examples=len(sents), epochs=self.epochs)
    def save(self, dir):
        """
        Saves the model in the given dir, together with the matching
        embeddings.tsv and metadata.tsv files.

        Args:
            dir : str
                the path to the saving directory
        """
        if not os.path.isdir(dir):
            os.mkdir(dir)
        self.model.save(os.path.join(dir, "word2vec.model"))
        try:
            # Reuse an existing vocabulary file if one is present in dir.
            vocab_path = glob(os.path.join(dir, "*.json"))[0]
            vocab = open_vocab(vocab_path)
        except IndexError:
            # No vocabulary file found: build one from the trained model.
            # `wv.vocab` is the gensim < 4.0 API (4.0+ uses `wv.key_to_index`).
            vocab = {word: i for i, word in enumerate(self.model.wv.vocab)}
        inv_vocab = {v: k for k, v in vocab.items()}
        metadata_path = os.path.join(dir, "metadata.tsv")
        vectors_path = os.path.join(dir, "embeddings.tsv")
        with open(metadata_path, "w", encoding="utf-8") as metadata, open(
            vectors_path, "w", encoding="utf-8"
        ) as vectors:
            metadata.write("WORD\tINDEX\n")
            for i in range(len(vocab)):
                try:
                    vector = self.model.wv[inv_vocab[i]]
                except KeyError:
                    print("{0} not in vocabulary. Skipping.".format(inv_vocab[i]))
                    continue
                metadata.write("{0}\t{1}\n".format(inv_vocab[i], i))
                vectors.write("\t".join(str(x) for x in vector) + "\n")
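A minimal usage sketch of the full train-then-save flow; the data and output paths below are illustrative:

    from embeddingsprep.models.word2vec import Word2Vec

    w2v = Word2Vec(emb_size=300, window=5, epochs=10)
    w2v.train("./data/preprocessed/")  # a file, a directory, or a list of files
    w2v.save("./w2v_model/")  # writes word2vec.model, embeddings.tsv and metadata.tsv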
Classes
class Word2Vec (emb_size=200, window=10, epochs=10, min_count=1, restore_from=False)
The class implementing a word2vec model.
Instantiate a word2vec model.
Args
    emb_size : int
        The size/dimension of the embedding to train. default : 200
    window : int
        The size of the window to train the embeddings on. default : 10
    epochs : int
        Number of epochs for training. default : 10
    min_count : int
        The minimum number of occurrences a word must have to be included in the vocabulary. default : 1
    restore_from : bool or str
        False if the model is initialized from scratch. Path to a gensim word2vec model if the model is to be restored.
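A sketch of restoring a previously saved model and continuing training; the paths are illustrative:

    from embeddingsprep.models.word2vec import Word2Vec

    w2v = Word2Vec(restore_from="./w2v_model/word2vec.model")
    w2v.train("./data/new_texts/")  # loads the saved model, then trains further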
Methods
def save(self, dir)
Saves the model in the given dir, together with the matching embeddings.tsv and metadata.tsv files.
Args
    dir : str
        the path to the saving directory
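The two files follow the TensorFlow Embedding Projector convention: embeddings.tsv holds one tab-separated vector per line, and metadata.tsv one labeled row per word under a WORD/INDEX header. A sketch of reading them back, assuming numpy is available:

    import numpy as np

    vectors = np.loadtxt("w2v_model/embeddings.tsv", delimiter="\t")  # shape (vocab_size, emb_size)
    with open("w2v_model/metadata.tsv", encoding="utf-8") as f:
        words = [line.split("\t")[0] for line in f.readlines()[1:]]  # skip the header row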
def train(self, filenames)
Trains the model with the specified params.
Args
    filenames : str or list of str
        Path to a file or directory containing the files to train the model on, or a list of such paths. Files should consist of plain, raw text, preprocessed with the Preprocessor.
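Given a Word2Vec instance w2v, all three accepted input forms (paths illustrative):

    w2v.train("./corpus/doc.txt")  # a single file
    w2v.train("./corpus/")  # every file in a directory
    w2v.train(["./corpus/a.txt", "./corpus/b.txt"])  # an explicit list of files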