-
Notifications
You must be signed in to change notification settings - Fork 0
/
token_embedding.py
146 lines (124 loc) · 5.21 KB
/
token_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env python3.7
"""Token embedding for functional signatures, using probabilistic distribution of features."""
# Build-in modules
import argparse
import os
# Third-party modules
import numpy as np
import pickle
from tensorflow.keras.initializers import he_uniform
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
# -- File info --
__version__ = '0.2.1'
__copyright__ = 'Andrzej Kucik 2019'
__author__ = 'Andrzej Kucik'
__maintainer__ = 'Andrzej Kucik'
__email__ = 'andrzej.kucik@gmail.com'
__date__ = '2019-08-09'
def embed_functions(path_to_output, path_to_models, path_to_histories=None, autoencoder=False,
dimension=256, epochs=150, batch_size=2048):
"""Function projecting functional signatures of premises into a lower dimensional space.
Arguments:
path_to_output - path to premise count functional signatures
autoencoder - if True, then autoencoder embedding will be the method of projection,
dimension - dimension of projection,
epochs - number of training epochs for the embedding model,
batch_size - batch size for the training of the embedding model,
save_model - if True, then model is saved to
path_to_current_working_directory/models/embedding_model.h5,
save_histories - if True then training history is saved to
path_to_current_working_directory/histories/embedding_model.pickle,
"""
# Get functional signatures
output_tensor = np.load(path_to_output, mmap_mode='r')
# Number of unique functions
num_fun = output_tensor.shape[-1]
# Define model parameters
if autoencoder:
output_tensor = output_tensor/ np.max(output_tensor, axis=-1)
input_tensor = output_tensor
activation = 'sigmoid'
loss = 'mse'
else:
input_tensor = np.identity(num_fun)
activation = 'softmax'
loss = 'categorical_crossentropy'
# Embedding model
model_input = Input(shape=(num_fun,), name='input')
embedding = Dense(dimension, kernel_initializer=he_uniform(), activation='tanh', name='embedding')(model_input)
model_output = Dense(num_fun, kernel_initializer=he_uniform(), activation=activation, name='output')(embedding)
model = Model(model_input, model_output, name='encoder_decoder')
model.summary()
# Compile model
model.compile(optimizer=RMSprop(decay=1e-8), loss=loss, metrics=['accuracy'])
# Train model
history = model.fit(input_tensor, output_tensor, epochs=epochs, batch_size=batch_size, shuffle=True)
# Save history
if path_to_histories is not None:
with open(os.path.join(path_to_histories, 'embedding_model_history.pickle'), 'wb') as dictionary:
pickle.dump(history.history, dictionary, protocol=pickle.HIGHEST_PROTOCOL)
# Save trained model
model.save(os.path.join(path_to_models, 'embedding_model.h5'))
def main():
# Argument parser
parser = argparse.ArgumentParser(description='Process arguments')
# Argument for recording path (SEF format)
parser.add_argument('-f',
'--functions',
required=True,
help='Path to functional signatures (numpy file)',
type=str)
parser.add_argument('-c',
'--context',
required=True,
help='Path to context vector',
type=str,
default=None)
parser.add_argument('-dim',
'--dimension',
required=False,
help='Dimension of the embedding.',
type=int,
default=256)
parser.add_argument('-e',
'--epochs',
required=False,
help='Number of training epochs.',
type=int,
default=150)
parser.add_argument('-bs',
'--batch_size',
required=False,
help='Embedding batch size.',
type=int,
default=2048)
# Parse arguments
args = vars(parser.parse_args())
functions_path = args['functions']
context_path = args['context']
dim = args['dimension']
epochs = args['epochs']
batch_size = args['batch_size']
# Checks
if not os.path.isfile(functions_path):
exit('Path to functions is not a file!')
if not functions_path[-4:] == '.npy':
exit('Path to functions is not a numpy array file!')
if type(dim) is not int:
exit('Dimension must be an integer.')
if dim <= 0:
exit('Dimension must be a positive integer.')
cwd = os.getcwd()
try:
os.mkdir('models')
except OSError:
pass
try:
os.mkdir('histories')
except OSError:
pass
embed_functions(path_to_output=functions_path, path_to_models=os.path.join(cwd, 'models'),
path_to_histories=os.path.join(cwd, 'histories'),
dimension=dim, epochs=epochs, batch_size=batch_size)