I'm studying seq2seq.
All of the existing tutorials use torchtext's old Field and BucketIterator, so I wanted to try a different approach and am building this myself.
Most of them are also based on datasets that PyTorch ships, such as Multi30k, so I'm writing this with a custom Dataset, but I've run into a few problems and would like some help.
I'm not sure which part I should show to get useful feedback, so I'll post the tokenizer, the Dataset, and the DataLoader.
import re
from torchtext.vocab import build_vocab_from_iterator
def yield_token(spacy, language_dataset):
    # strip everything except letters, digits and spaces, then tokenize with the given spaCy pipeline
    for sentence in language_dataset:
        new_string = re.sub(r"[^a-zA-Z0-9 ]", "", sentence)
        yield [tok.text for tok in spacy.tokenizer(new_string)]
en_vocab = build_vocab_from_iterator(yield_token(spacy_en, dataset['Go.']), specials=['<unk>', '<sos>', '<eos>', '<pad>'])
ger_vocab = build_vocab_from_iterator(yield_token(spacy_ger, dataset['Geh.']), specials=['<unk>', '<sos>', '<eos>', '<pad>'])
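Right after building the two vocabs I also set a default index, because my understanding is that otherwise looking up a token that never appeared in the training sentences raises an error:
# map out-of-vocabulary tokens to <unk> instead of raising an error
en_vocab.set_default_index(en_vocab['<unk>'])
ger_vocab.set_default_index(ger_vocab['<unk>'])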
import re
import torch
from torch.utils.data import Dataset, DataLoader
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
class Translation_Dataset(Dataset):
    def __init__(self, data, en_corpus, ger_corpus, spacy_en, spacy_ger):
        super(Translation_Dataset, self).__init__()
        self.dataset = data
        self.en_corpus = en_corpus
        self.ger_corpus = ger_corpus
        self.spacy_en = spacy_en
        self.spacy_ger = spacy_ger
        print(self.en_corpus['Hi'])  # sanity check: index of the token 'Hi'

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, item):
        return self.tokenizer_eng(self.dataset['Go.'][item]), self.tokenizer_ger(self.dataset['Geh.'][item])

    def tokenizer_ger(self, text: str):
        # returns (decoder input with <sos> prepended, decoder target with <eos> appended)
        new_string = re.sub(r"[^a-zA-Z0-9 ]", "", text)
        return [self.ger_corpus['<sos>']] + [self.ger_corpus[word] for word in [tok.text for tok in self.spacy_ger.tokenizer(new_string)]], \
               [self.ger_corpus[word] for word in [tok.text for tok in self.spacy_ger.tokenizer(new_string)]] + [self.ger_corpus['<eos>']]
        # return new_string

    def tokenizer_eng(self, text: str):
        # returns the encoder input with <eos> appended
        new_string = re.sub(r"[^a-zA-Z0-9 ]", "", text)
        return [self.en_corpus[word] for word in [tok.text for tok in self.spacy_en.tokenizer(new_string)]] + [self.en_corpus['<eos>']]
        # return new_string
def collate_fn(batch):
    input_en_list, input_ger_list, output_ger_list = [], [], []
    for input_en, (input_ger, output_ger) in batch:
        input_en_list.append(torch.tensor(input_en))
        input_ger_list.append(torch.tensor(input_ger))
        output_ger_list.append(torch.tensor(output_ger))
    # pad every sequence in the batch to the same length; batch_first=True gives (batch, seq_len)
    input_en_tensors = pad_sequence(input_en_list, batch_first=True, padding_value=en_vocab['<pad>'])
    input_ger_tensors = pad_sequence(input_ger_list, batch_first=True, padding_value=ger_vocab['<pad>'])
    output_ger_tensors = pad_sequence(output_ger_list, batch_first=True, padding_value=ger_vocab['<pad>'])
    return input_en_tensors, input_ger_tensors, output_ger_tensors
from sklearn.model_selection import train_test_split
train_set, val_set = train_test_split(dataset, test_size=0.4, random_state=5555, shuffle=True)
val_set, test_set = train_test_split(val_set, test_size=0.5, random_state=5555, shuffle=True)
train_dataset = Translation_Dataset(train_set, en_vocab, ger_vocab, spacy_en, spacy_ger)
val_dataset = Translation_Dataset(val_set, en_vocab, ger_vocab, spacy_en, spacy_ger)
test_dataset = Translation_Dataset(test_set, en_vocab, ger_vocab, spacy_en, spacy_ger)
train_dataLoader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
val_dataLoader = DataLoader(val_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
test_dataLoader = DataLoader(test_dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)
sample_set = Translation_Dataset(dataset, en_vocab, ger_vocab, spacy_en, spacy_ger)
sampleLoader = DataLoader(sample_set, batch_size=2, shuffle=True, collate_fn=collate_fn)
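To check what collate_fn actually produces, I pulled a single batch out of the loader and printed the shapes:
# quick look at one padded batch coming out of collate_fn
en_batch, ger_in_batch, ger_out_batch = next(iter(sampleLoader))
print(en_batch.shape)       # (batch_size, en_seq_len) because pad_sequence uses batch_first=True
print(ger_in_batch.shape)   # (batch_size, ger_seq_len)
print(ger_out_batch.shape)  # (batch_size, ger_seq_len)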
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)

    def forward(self, x):
        print("in Encoder through parameter x's size:\t", x.size())
        embedding = self.embedding(x)
        outputs, (hidden, cell) = self.rnn(embedding)
        print("in Encoder output's size:\t", outputs.shape)
        print("in Encoder hidden's size:\t", hidden.shape)
        print("in Encoder cell's size:\t", cell.shape, end='\n\n')
        return hidden, cell
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, p):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        print('size of x: ', x.size())
        embedding = self.embedding(x)
        print('size of embedding: ', embedding.size())
        print('size of hidden:\t', hidden.size())
        print('size of cell:\t', cell.size())
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        raise ValueError('sorry')  # stopping here on purpose while I debug the shapes
        return x
Here is where my problem starts.
encoder_model = Encoder(input_size=len(en_vocab), embedding_size=256, hidden_size=256, num_layers=5, p=0.1)
decoder_model = Decoder(input_size=len(ger_vocab), embedding_size=256, hidden_size=256, output_size=len(ger_vocab), num_layers=5, p=0.1)

for en_input, ger_input, ger_output in sampleLoader:
    en_input = en_input.long().to(device)
    ger_input = ger_input.long().to(device)
    ger_output = ger_output.long().to(device)
    hidden, cell = encoder_model(en_input)
    output = decoder_model(ger_input, hidden, cell)
The part I'm struggling with is what happens when the hidden and cell returned by the Encoder go into the Decoder and are fed through its LSTM layer.
The Decoder should take the hidden and cell coming out of the Encoder and run them through its own LSTM, but the tensor shapes never match. As a beginner, my guess is that the problem is somewhere in the collate_fn function.
I don't know how to fix this tensor shape problem.
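In case it helps, this is my rough (possibly wrong) understanding of the shapes involved, written out as a small check:
# collate_fn pads with batch_first=True, so a batch should come out as (batch, seq_len),
# but my reading of the nn.LSTM docs is that, without batch_first=True,
# it expects input of shape (seq_len, batch, embedding_size)
# and hidden/cell of shape (num_layers, batch, hidden_size)
en_input, ger_input, ger_output = next(iter(sampleLoader))
print(en_input.shape, ger_input.shape)  # (2, en_seq_len), (2, ger_seq_len) with batch_size=2
hidden, cell = encoder_model(en_input.long())
print(hidden.shape, cell.shape)         # I expected (num_layers, 2, 256) here
Should I be permuting the batches to (seq_len, batch) before the models, or building the LSTMs with batch_first=True, or is the real problem in collate_fn?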