import h5py
import numpy as np

###### Parameters #####
TEXT_FILE = 'bookcorpus.txt'  # One sentence per line; sentences must already be tokenized.
MAX_SENTENCE_LENGTH = 100
DATASET_FILE = 'bookcorpus.hdf5'
VOCABULARY_FILE = 'vocabulary.txt'
TRAIN_RATIO = 0.8
VALIDATION_RATIO = 0.1  # The test ratio is the remainder, here 0.1.

###### Load text file #####
voc = {}
sentences = {}  # Maps sentence length -> list of sentences of that length.
for i in range(2, MAX_SENTENCE_LENGTH + 1):
    sentences[i] = []
with open(TEXT_FILE, 'r') as f:
    for line in f:
        sent = line.rstrip().split()
        len_sent = len(sent)
        if 2 <= len_sent <= MAX_SENTENCE_LENGTH:
            sentences[len_sent].append(sent)
            for word in sent:
                voc[word] = voc.get(word, 0) + 1
print(f'Found {len(voc)} unique tokens.')
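# Bucketing sentences by their length means every sequence dataset written
# below is a dense fixed-width matrix, so no padding tokens are needed.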
###### Load GloVe embeddings #####
glove_embeddings = {}
with open('glove.42B.300d.txt', encoding='utf-8') as f:  # GloVe files are UTF-8.
    for line in f:
        word, vec = line.split(' ', 1)
        if word in voc:
            glove_embeddings[word] = np.array(list(map(float, vec.split())))
are_in = 0
not_in = 0
for k, v in voc.items():
    if k in glove_embeddings:
        are_in += v
    else:
        not_in += v
print(f'{are_in * 100 / (are_in + not_in):.2f}% of the tokens have an embedding.')
print(f"{not_in * 100 / (are_in + not_in):.2f}% of the tokens don't.")
###### Set token ids #####
ids = {}
count = 1  # Id 0 is reserved for out-of-vocabulary tokens.
for k in glove_embeddings.keys():
    if k not in ids:
        ids[k] = count
        count += 1
print(f'{len(ids)} token ids (vocabulary size).')
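# Note: dicts preserve insertion order (Python 3.7+), so these ids are
# deterministic for a given GloVe file.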
###### Create embeddings dataset #####
dataset = h5py.File(DATASET_FILE, 'w')
emb_dataset = np.zeros((len(ids) + 1, 300), dtype='float32')
for k, v in ids.items():
    emb_dataset[v] = np.array(glove_embeddings[k], dtype='float32')
dataset.create_dataset('embeddings', data=emb_dataset)
###### Save vocabulary #####
voc_pair = []
for k, v in ids.items():
    voc_pair.append((v, k))
voc_pair.sort()
with open(VOCABULARY_FILE, 'w') as f:
    f.write('<UNKNOWN> ')
    for _id, word in voc_pair:
        f.write(f'{word} ')
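# A word's position in the space-separated vocabulary file equals its id,
# with <UNKNOWN> at position 0.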
###### Save sequences to dataset #####
for k, v in sentences.items():
    ided_sentences = np.zeros((len(v), k), dtype=np.uint32)
    for i, sent in enumerate(v):
        for j, word in enumerate(sent):
            ided_sentences[i, j] = ids.get(word, 0)  # Unknown words map to id 0.
    # Shuffle, then split each length bucket into train/validation/test.
    np.random.shuffle(ided_sentences)
    train_size = int(ided_sentences.shape[0] * TRAIN_RATIO)
    validation_size = int(ided_sentences.shape[0] * VALIDATION_RATIO)
    if train_size > 0:
        dataset.create_dataset(f'train/{k}', data=ided_sentences[:train_size])
    if validation_size > 0:
        dataset.create_dataset(f'validation/{k}', data=ided_sentences[train_size:train_size + validation_size])
    if (train_size + validation_size) < ided_sentences.shape[0]:
        dataset.create_dataset(f'test/{k}', data=ided_sentences[train_size + validation_size:])
dataset.close()
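# A minimal sketch of reading the generated files back (assumes the default
# filenames above; the '20' group is hypothetical and stands for any sentence
# length actually present in the corpus):
#
#     import h5py
#     with h5py.File('bookcorpus.hdf5', 'r') as d:
#         embeddings = d['embeddings'][:]  # (vocab_size + 1, 300) float32
#         train_20 = d['train/20'][:]      # length-20 sentences, one id per word
#     with open('vocabulary.txt') as f:
#         words = f.read().split()         # words[0] == '<UNKNOWN>'; words[id] -> token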