# IMDB dataset: 50,000 movie reviews, 50% positive / 50% negative, already preprocessed (each review is a sequence of word indices)
from tensorflow.keras.datasets import imdb
num_words = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = num_words)
print(x_train.shape, x_test.shape)

# (25000,) (25000,)

 

print(x_train[0])

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]

 

# Invert the word index (word -> rank) into a rank -> word mapping
index_to_word = {}
for key, value in imdb.get_word_index().items():
    index_to_word[value] = key
for i in range(1, 6):
    print('The word with frequency rank {} is {}'.format(i, index_to_word[i]))
    
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 0s 0us/step
The word with frequency rank 1 is the
The word with frequency rank 2 is and
The word with frequency rank 3 is a
The word with frequency rank 4 is of
The word with frequency rank 5 is to
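
The same inverted index can decode an encoded review back into text. Note that imdb.load_data reserves indices 0 (padding), 1 (start-of-sequence) and 2 (unknown), so the stored indices are offset by 3 relative to get_word_index(). A minimal sketch, not in the original post:

# Decode the first training review; indices 0-2 are reserved, hence the offset of 3
decoded = ' '.join(index_to_word.get(i - 3, '?') for i in x_train[0])
print(decoded[:100])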

 

print(y_train[:10])

# [1 0 0 1 0 0 1 0 1 0]

 

# Review length statistics: number of word indices per review
import numpy as np
lengths = np.array([len(x) for x in x_train])
print(np.mean(lengths), np.median(lengths))

# 238.71364 178.0
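
With a mean length of about 239 words and a median of 178, a cutoff of 500 covers the large majority of reviews. A quick check, assuming the lengths array from above (a sketch, not in the original post):

# Fraction of reviews that already fit within 500 words
print(np.mean(lengths <= 500))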

 

# Histogram of review lengths (word counts)
import matplotlib.pyplot as plt
plt.hist(lengths)
plt.xlabel('length')
plt.ylabel('frequency')
plt.show()

# pad_sequences example: pad or truncate sequences to a fixed length
from tensorflow.keras.preprocessing.sequence import pad_sequences
a1 = [[1,2,3]]
a2 = [[1,2,3,4,5,6,7,8]]
a1_post = pad_sequences(a1, maxlen=5, padding = 'post')
a2_post = pad_sequences(a2, maxlen=5, padding = 'post')
print(a1_post)
print(a2_post)

# [[1 2 3 0 0]]
# [[4 5 6 7 8]]
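
a2 is longer than maxlen, so it is truncated; by default pad_sequences drops values from the front (truncating='pre'), which is why [4 5 6 7 8] remains. A sketch of the truncating='post' variant, for illustration only:

# Keep the beginning of long sequences instead of the end
a2_trunc_post = pad_sequences(a2, maxlen=5, truncating='post')
print(a2_trunc_post)  # [[1 2 3 4 5]]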

 

# Pad the data so every review has the same length for training
# Padding: sequences shorter than the target length are filled with zeros
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 500
print('before pad', len(x_train[0])) # 218
# padding = 'pre' adds the zeros at the front of each sequence
pad_x_train = pad_sequences(x_train, maxlen = max_len, padding = 'pre')
pad_x_test = pad_sequences(x_test, maxlen = max_len, padding = 'pre')
print('after pad', len(pad_x_train[0]))

# before pad 218
# after pad 500
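
After padding, every review is exactly 500 indices long, so both arrays are regular 2-D tensors. A quick sanity check (a sketch):

# Both arrays should now have shape (25000, 500)
print(pad_x_train.shape, pad_x_test.shape)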

 

print('after pad',pad_x_train[0])

after pad [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    1   14   22   16   43  530  973 1622 1385   65  458 4468
   66 3941    4  173   36  256    5   25  100   43  838  112   50  670
    2    9   35  480  284    5  150    4  172  112  167    2  336  385
   39    4  172 4536 1111   17  546   38   13  447    4  192   50   16
    6  147 2025   19   14   22    4 1920 4613  469    4   22   71   87
   12   16   43  530   38   76   15   13 1247    4   22   17  515   17
   12   16  626   18    2    5   62  386   12    8  316    8  106    5
    4 2223 5244   16  480   66 3785   33    4  130   12   16   38  619
    5   25  124   51   36  135   48   25 1415   33    6   22   12  215
   28   77   52    5   14  407   16   82    2    8    4  107  117 5952
   15  256    4    2    7 3766    5  723   36   71   43  530  476   26
  400  317   46    7    4    2 1029   13  104   88    4  381   15  297
   98   32 2071   56   26  141    6  194 7486   18    4  226   22   21
  134  476   26  480    5  144   30 5535   18   51   36   28  224   92
   25  104    4  226   65   16   38 1334   88   12   16  283    5   16
 4472  113  103   32   15   16 5345   19  178   32]

 

# Embedding layer: maps each word index to a dense vector; used as the first layer of the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

model = Sequential()
# input_dim: vocabulary size (number of distinct word indices)
# output_dim: dimension of each embedding vector
# input_length: length of each input sequence (max_len)
model.add(Embedding(input_dim = num_words, output_dim = 32, input_length = max_len)) 
model.add(Flatten())
model.add(Dense(1, activation = 'sigmoid'))

 

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_1 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 16001     
=================================================================
Total params: 336,001
Trainable params: 336,001
Non-trainable params: 0
_________________________________________________________________
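
The parameter counts follow directly from the layer sizes: the Embedding table holds num_words * 32 weights, and the Dense layer has one weight per flattened input plus a bias. A quick check (a sketch):

embedding_params = num_words * 32       # 10000 * 32 = 320,000
dense_params = (max_len * 32) + 1       # 16,000 inputs + 1 bias = 16,001
print(embedding_params + dense_params)  # 336,001 total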

 

history = model.fit(pad_x_train, y_train, batch_size = 32, epochs = 30, validation_split = 0.2)


Epoch 1/30
625/625 [==============================] - 2s 3ms/step - loss: 0.4567 - acc: 0.7761 - val_loss: 0.3132 - val_acc: 0.8670
Epoch 2/30
625/625 [==============================] - 2s 3ms/step - loss: 0.2001 - acc: 0.9254 - val_loss: 0.2965 - val_acc: 0.8762
Epoch 3/30
625/625 [==============================] - 2s 3ms/step - loss: 0.1057 - acc: 0.9703 - val_loss: 0.3016 - val_acc: 0.8780
Epoch 4/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0498 - acc: 0.9912 - val_loss: 0.3101 - val_acc: 0.8838
Epoch 5/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0232 - acc: 0.9979 - val_loss: 0.3377 - val_acc: 0.8838
Epoch 6/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0116 - acc: 0.9995 - val_loss: 0.3689 - val_acc: 0.8798
Epoch 7/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0063 - acc: 0.9999 - val_loss: 0.3901 - val_acc: 0.8820
Epoch 8/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0036 - acc: 0.9999 - val_loss: 0.4229 - val_acc: 0.8776
Epoch 9/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0022 - acc: 1.0000 - val_loss: 0.4352 - val_acc: 0.8800
Epoch 10/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0014 - acc: 1.0000 - val_loss: 0.4565 - val_acc: 0.8798
Epoch 11/30
625/625 [==============================] - 2s 3ms/step - loss: 9.0908e-04 - acc: 1.0000 - val_loss: 0.4762 - val_acc: 0.8786
Epoch 12/30
625/625 [==============================] - 2s 3ms/step - loss: 6.2279e-04 - acc: 1.0000 - val_loss: 0.4971 - val_acc: 0.8778
Epoch 13/30
625/625 [==============================] - 2s 3ms/step - loss: 4.0832e-04 - acc: 1.0000 - val_loss: 0.5179 - val_acc: 0.8774
Epoch 14/30
625/625 [==============================] - 2s 3ms/step - loss: 2.8227e-04 - acc: 1.0000 - val_loss: 0.5364 - val_acc: 0.8758
Epoch 15/30
625/625 [==============================] - 2s 3ms/step - loss: 1.9264e-04 - acc: 1.0000 - val_loss: 0.5530 - val_acc: 0.8768
Epoch 16/30
625/625 [==============================] - 2s 3ms/step - loss: 1.3397e-04 - acc: 1.0000 - val_loss: 0.5750 - val_acc: 0.8752
Epoch 17/30
625/625 [==============================] - 2s 3ms/step - loss: 9.3561e-05 - acc: 1.0000 - val_loss: 0.5911 - val_acc: 0.8766
Epoch 18/30
625/625 [==============================] - 2s 3ms/step - loss: 6.5905e-05 - acc: 1.0000 - val_loss: 0.6112 - val_acc: 0.8768
Epoch 19/30
625/625 [==============================] - 2s 3ms/step - loss: 4.5675e-05 - acc: 1.0000 - val_loss: 0.6288 - val_acc: 0.8758
Epoch 20/30
625/625 [==============================] - 2s 3ms/step - loss: 3.2030e-05 - acc: 1.0000 - val_loss: 0.6464 - val_acc: 0.8760
Epoch 21/30
625/625 [==============================] - 2s 3ms/step - loss: 2.2697e-05 - acc: 1.0000 - val_loss: 0.6652 - val_acc: 0.8746
Epoch 22/30
625/625 [==============================] - 2s 3ms/step - loss: 1.6130e-05 - acc: 1.0000 - val_loss: 0.6839 - val_acc: 0.8756
Epoch 23/30
625/625 [==============================] - 2s 3ms/step - loss: 1.1474e-05 - acc: 1.0000 - val_loss: 0.7017 - val_acc: 0.8754
Epoch 24/30
625/625 [==============================] - 2s 3ms/step - loss: 8.1176e-06 - acc: 1.0000 - val_loss: 0.7189 - val_acc: 0.8746
Epoch 25/30
625/625 [==============================] - 2s 3ms/step - loss: 5.8811e-06 - acc: 1.0000 - val_loss: 0.7372 - val_acc: 0.8748
Epoch 26/30
625/625 [==============================] - 2s 3ms/step - loss: 4.2489e-06 - acc: 1.0000 - val_loss: 0.7547 - val_acc: 0.8740
Epoch 27/30
625/625 [==============================] - 2s 3ms/step - loss: 3.0487e-06 - acc: 1.0000 - val_loss: 0.7724 - val_acc: 0.8750
Epoch 28/30
625/625 [==============================] - 2s 3ms/step - loss: 2.2476e-06 - acc: 1.0000 - val_loss: 0.7881 - val_acc: 0.8748
Epoch 29/30
625/625 [==============================] - 2s 3ms/step - loss: 1.6460e-06 - acc: 1.0000 - val_loss: 0.8056 - val_acc: 0.8746
Epoch 30/30
625/625 [==============================] - 2s 3ms/step - loss: 1.2241e-06 - acc: 1.0000 - val_loss: 0.8222 - val_acc: 0.8748
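
The log shows clear overfitting: training accuracy reaches 1.0 while validation loss rises steadily after the first few epochs. One common remedy, not used in the original run and shown here only as a hedged sketch, is an EarlyStopping callback that halts training once validation loss stops improving:

from tensorflow.keras.callbacks import EarlyStopping

# Stop when val_loss has not improved for 3 epochs and restore the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(pad_x_train, y_train, batch_size=32, epochs=30,
                    validation_split=0.2, callbacks=[early_stop])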

 

import matplotlib.pyplot as plt
plt.figure(figsize = (12, 4))
plt.subplot(1,2,1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')
plt.xlabel('Epoch')
plt.legend()
plt.subplot(1,2,2)
plt.plot(history.history['acc'], 'g-', label='acc')
plt.plot(history.history['val_acc'], 'k--', label='val_acc')
plt.xlabel('Epoch')
plt.legend()
plt.show()
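
pad_x_test was prepared above but never used; evaluating on it gives an estimate of generalization performance. A minimal sketch:

# Evaluate loss and accuracy on the held-out test set
test_loss, test_acc = model.evaluate(pad_x_test, y_test)
print(test_loss, test_acc)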
