[17-1] LSTM / GRU with PyTorch

또르르21 2021. 2. 16. 23:34

1️⃣ Setup

 

Import the required modules.

from tqdm import tqdm

import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
# Zero padding can take up a large share of a batch and therefore wastes memory.
# pack_padded_sequence / pad_packed_sequence exist to address this problem.

 

 

2️⃣ Data preprocessing

 

Let's take a look at the sample data below.
The total vocabulary size and the id of the pad token are also given below.

vocab_size = 100
pad_id = 0    # to form a batch, all sentences must have the same length => padding


data = [
  [85,14,80,34,99,20,31,65,53,86,3,58,30,4,11,6,50,71,74,13],
  [62,76,79,66,32],
  [93,77,16,67,46,74,24,70],
  [19,83,88,22,57,40,75,82,4,46],
  [70,28,30,24,76,84,92,76,77,51,7,20,82,94,57],
  [58,13,40,61,88,18,92,89,8,14,61,67,49,59,45,12,47,5],
  [22,5,21,84,39,6,9,84,36,59,32,30,69,70,82,56,1],
  [94,21,79,24,3,86],
  [80,80,33,63,34,63],
  [87,32,79,65,2,96,43,80,85,20,41,52,95,50,35,96,24,80]
]

While padding, we also store each sequence's original (pre-padding) length.

# Padding to the maximum length would normally be enough,
# but PackedSequence also needs each sequence's original length.

max_len = len(max(data, key=len))
print(f"Maximum sequence length: {max_len}")    # 20

valid_lens = []
for i, seq in enumerate(tqdm(data)):
  valid_lens.append(len(seq))
  if len(seq) < max_len:
    data[i] = seq + [pad_id] * (max_len - len(seq))

>>> print(data)
>>> print(valid_lens)


[[85, 14, 80, 34, 99, 20, 31, 65, 53, 86, 3, 58, 30, 4, 11, 6, 50, 71, 74, 13], [62, 76, 79, 66, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [93, 77, 16, 67, 46, 74, 24, 70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [19, 83, 88, 22, 57, 40, 75, 82, 4, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [70, 28, 30, 24, 76, 84, 92, 76, 77, 51, 7, 20, 82, 94, 57, 0, 0, 0, 0, 0], [58, 13, 40, 61, 88, 18, 92, 89, 8, 14, 61, 67, 49, 59, 45, 12, 47, 5, 0, 0], [22, 5, 21, 84, 39, 6, 9, 84, 36, 59, 32, 30, 69, 70, 82, 56, 1, 0, 0, 0], [94, 21, 79, 24, 3, 86, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [80, 80, 33, 63, 34, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [87, 32, 79, 65, 2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80, 0, 0]]

[20, 5, 8, 10, 15, 18, 17, 6, 6, 18]
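As a rough illustration of why packing matters, a noticeable share of this padded batch is just padding (a quick check using the variables above; the numbers are specific to this toy batch):

total_slots = len(data) * max_len   # 10 * 20 = 200 positions in the padded batch
valid_tokens = sum(valid_lens)      # 123 real tokens
print(valid_tokens / total_slots)   # 0.615, i.e. roughly 38% of the slots are zero padding
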
# B: batch size, L: maximum sequence length

batch = torch.LongTensor(data)  # (B, L)

batch_lens = torch.LongTensor(valid_lens)  # (B)


# pack_padded_sequence expects sequences sorted by length in descending order
batch_lens, sorted_idx = batch_lens.sort(descending=True)
batch = batch[sorted_idx]


>>> print(batch)

>>> print(batch_lens)


tensor([[85, 14, 80, 34, 99, 20, 31, 65, 53, 86,  3, 58, 30,  4, 11,  6, 50, 71,
         74, 13],
        [58, 13, 40, 61, 88, 18, 92, 89,  8, 14, 61, 67, 49, 59, 45, 12, 47,  5,
          0,  0],
        [87, 32, 79, 65,  2, 96, 43, 80, 85, 20, 41, 52, 95, 50, 35, 96, 24, 80,
          0,  0],
        [22,  5, 21, 84, 39,  6,  9, 84, 36, 59, 32, 30, 69, 70, 82, 56,  1,  0,
          0,  0],
        [70, 28, 30, 24, 76, 84, 92, 76, 77, 51,  7, 20, 82, 94, 57,  0,  0,  0,
          0,  0],
        [19, 83, 88, 22, 57, 40, 75, 82,  4, 46,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [93, 77, 16, 67, 46, 74, 24, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [94, 21, 79, 24,  3, 86,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [80, 80, 33, 63, 34, 63,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0],
        [62, 76, 79, 66, 32,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0]])
tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])
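Side note: the manual sort above is needed because pack_padded_sequence assumes sequences sorted by length in descending order by default. As a quick illustration (a sketch, assuming a reasonably recent PyTorch that supports enforce_sorted=False), packing can also be applied directly to the id batch, and the packed data keeps only the non-pad tokens:

# packing works on the raw id batch too; with enforce_sorted=False (PyTorch >= 1.1)
# the manual sort above would not even be required
packed_ids = pack_padded_sequence(batch.transpose(0, 1), batch_lens, enforce_sorted=False)
print(packed_ids.data.shape)   # torch.Size([123]) -- only the 123 non-pad tokens are stored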

 

 

3️⃣ Using LSTM

 

In an LSTM, a cell state is added on top of the hidden state.

embedding_size = 256
hidden_size = 512    # hidden size of the LSTM
num_layers = 1       # number of LSTM layers to stack
num_dirs = 1         # 1: unidirectional RNN, 2: bidirectional RNN

embedding = nn.Embedding(vocab_size, embedding_size)
lstm = nn.LSTM(
    input_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    bidirectional=True if num_dirs > 1 else False
)


h_0 = torch.zeros((num_layers * num_dirs, batch.shape[0], hidden_size))  # (num_layers * num_dirs, B, d_h)
c_0 = torch.zeros((num_layers * num_dirs, batch.shape[0], hidden_size))  # (num_layers * num_dirs, B, d_h)

# d_w: word embedding size
batch_emb = embedding(batch)  # (B, L, d_w)

packed_batch = pack_padded_sequence(batch_emb.transpose(0, 1), batch_lens)

# the initial hidden state and cell state are passed together as a tuple,
# and the hidden outputs also come back as a tuple (h_n, c_n)
packed_outputs, (h_n, c_n) = lstm(packed_batch, (h_0, c_0))

>>> print(packed_outputs)

PackedSequence(data=tensor([[-0.0791, -0.0778, -0.0158,  ..., -0.0655,  0.0202, -0.0772],
        [ 0.1288, -0.0434, -0.0321,  ..., -0.1825,  0.0450,  0.0880],
        [ 0.0034,  0.0616,  0.1221,  ...,  0.0004, -0.0898,  0.0293],
        ...,
        [ 0.2479, -0.1265, -0.0316,  ...,  0.0169, -0.0107,  0.1590],
        [ 0.1038, -0.0857, -0.0716,  ...,  0.0389, -0.1226,  0.0512],
        [ 0.0316, -0.0520, -0.0757,  ..., -0.0191, -0.0528,  0.0836]],
       grad_fn=<CatBackward>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)


>>> print(packed_outputs[0].shape)

torch.Size([123, 512])

>>> print(h_n.shape)

torch.Size([1, 10, 512])

>>> print(c_n.shape)

torch.Size([1, 10, 512])
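
The 123 in the packed output is simply the total number of valid (non-pad) time steps across the batch. A quick sanity check, using the variables defined above:

assert packed_outputs.data.shape[0] == sum(valid_lens)             # 123 = 20+18+18+17+15+10+8+6+6+5
assert packed_outputs.batch_sizes.sum().item() == sum(valid_lens)  # batch_sizes sums to 123 as well
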
# use pad_packed_sequence to restore the original padded (L, B, d_h) form
outputs, output_lens = pad_packed_sequence(packed_outputs)

>>> print(outputs.shape)

torch.Size([20, 10, 512])

>>> print(output_lens)

tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])
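
Since the sequences have different lengths, each sequence's last valid output sits at a different time step of outputs. A minimal sketch of pulling it out with output_lens (this pattern is an addition for illustration, not part of the original practice code):

# outputs: (L, B, d_h), output_lens: (B)
idx = (output_lens - 1).view(1, -1, 1).expand(1, outputs.size(1), outputs.size(2))
last_outputs = outputs.gather(dim=0, index=idx).squeeze(0)   # (B, d_h): last valid output of each sequence
print(last_outputs.shape)   # torch.Size([10, 512])
# for this single-layer, unidirectional LSTM this matches h_n[0]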

 

 

4️⃣ Using GRU

 

Since GRU has no cell state, it can be used in the same way as a vanilla RNN.
Here we use the GRU for a language modeling (LM) task.

(That is, unlike before, instead of feeding the whole sequence at once, we take each step's output and feed it back in sequentially.)

gru = nn.GRU(

    input_size=embedding_size,
    
    hidden_size=hidden_size,
    
    num_layers=num_layers,
    
    bidirectional=True if num_dirs > 1 else False
    
)
output_layer = nn.Linear(hidden_size, vocab_size)
input_id = batch.transpose(0, 1)[0, :]  # (B)

hidden = torch.zeros((num_layers * num_dirs, batch.shape[0], hidden_size))  # (1, B, d_h)

Without teacher forcing (feeding the model the ground-truth token in advance), the previous step's prediction is used as the next input.

for t in range(max_len):
  input_emb = embedding(input_id).unsqueeze(0)  # (1, B, d_w)
  # input_emb has sequence length 1, so the GRU processes just a single time step here
  output, hidden = gru(input_emb, hidden)  # output: (1, B, d_h), hidden: (1, B, d_h)

  # V: vocab size
  output = output_layer(output)  # (1, B, V)
  # note: these are raw max logits (no softmax), used only to pick the most likely token
  probs, top_id = torch.max(output, dim=-1)  # probs: (1, B), top_id: (1, B)

  print("*" * 50)
  print(f"Time step: {t}")
  print(output.shape)
  print(probs.shape)
  print(top_id.shape)

  input_id = top_id.squeeze(0)  # (B)
**************************************************
Time step: 0
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 1
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 2
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
...
**************************************************
Time step: 18
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
**************************************************
Time step: 19
torch.Size([1, 10, 100])
torch.Size([1, 10])
torch.Size([1, 10])
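
For comparison, with teacher forcing the ground-truth token would be fed at every step instead of the model's own prediction. A minimal sketch reusing the names above (the loss computation here is an illustrative assumption, not something from the original code):

hidden = torch.zeros((num_layers * num_dirs, batch.shape[0], hidden_size))  # (1, B, d_h)
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_id)  # ignore padded positions

for t in range(max_len - 1):
  input_emb = embedding(batch[:, t]).unsqueeze(0)   # ground-truth token at step t: (1, B, d_w)
  output, hidden = gru(input_emb, hidden)           # (1, B, d_h)
  logits = output_layer(output).squeeze(0)          # (B, V)
  loss = loss_fn(logits, batch[:, t + 1])           # predict the ground-truth token at step t+1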

 

 

5️⃣ Using bidirectional and multiple layers

 

Using a bidirectional model with two or more stacked layers lets the network catch parts that a single pass might miss or skip over, which increases its expressive power.

num_layers = 2
num_dirs = 2     # bidirectional
dropout = 0.1


gru = nn.GRU(

    input_size=embedding_size,
    
    hidden_size=hidden_size,
    
    num_layers=num_layers,
    
    dropout=dropout,
    
    bidirectional=True if num_dirs > 1 else False
    
)

Since the GRU is now bidirectional and the number of layers has grown to 2, the shape of the hidden state becomes (4, B, d_h).

# d_w: word embedding size, num_layers: number of layers, num_dirs: number of directions
batch_emb = embedding(batch)  # (B, L, d_w)

h_0 = torch.zeros((num_layers * num_dirs, batch.shape[0], hidden_size))  # (num_layers(=2) * num_dirs(=2), B, d_h) = (4, B, d_h)
# the 4 states are: layer 1 forward, layer 1 backward, layer 2 forward, layer 2 backward

packed_batch = pack_padded_sequence(batch_emb.transpose(0, 1), batch_lens)

packed_outputs, h_n = gru(packed_batch, h_0)
>>> print(packed_outputs)

PackedSequence(data=tensor([[-0.0158,  0.0130,  0.1449,  ..., -0.0821, -0.0987, -0.2085],
        [ 0.0433, -0.0432,  0.0775,  ...,  0.1127, -0.0365,  0.0422],
        [-0.0116, -0.0718,  0.0366,  ..., -0.1872, -0.1407,  0.2594],
        ...,
        [-0.0862,  0.1056, -0.1800,  ..., -0.1264,  0.1416,  0.1343],
        [ 0.0534,  0.1532, -0.0805,  ...,  0.0393, -0.0906,  0.1223],
        [ 0.0464,  0.1583, -0.0622,  ...,  0.0473,  0.0213,  0.0725]],
       grad_fn=<CatBackward>), batch_sizes=tensor([10, 10, 10, 10, 10,  9,  7,  7,  6,  6,  5,  5,  5,  5,  5,  4,  4,  3,
         1,  1]), sorted_indices=None, unsorted_indices=None)
         
         
>>> print(packed_outputs[0].shape)

torch.Size([123, 1024])

>>> print(h_n.shape)

torch.Size([4, 10, 512])
outputs, output_lens = pad_packed_sequence(packed_outputs)


>>> print(outputs.shape)    # (L, B, num_dirs * d_h)

torch.Size([20, 10, 1024])  # 1024 because the forward and backward outputs are concatenated


>>> print(output_lens)

tensor([20, 18, 18, 17, 15, 10,  8,  6,  6,  5])

The shape of each result is as follows:

  • outputs: (max_len, batch_size, num_dirs * hidden_size)
  • h_n: (num_layers * num_dirs, batch_size, hidden_size)
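
If a single summary vector per sequence is needed, a common pattern is to reshape h_n so layers and directions are separated, then concatenate the last layer's forward and backward states (a sketch based on the shapes above, not part of the original code):

h_n = h_n.view(num_layers, num_dirs, batch.shape[0], hidden_size)   # (2, 2, B, d_h)
last_hidden = torch.cat([h_n[-1, 0], h_n[-1, 1]], dim=-1)           # (B, 2 * d_h)
print(last_hidden.shape)   # torch.Size([10, 1024])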