# IMDB: 50,000 movie reviews, 50% positive / 50% negative, already preprocessed: each review is a sequence of word indices
from tensorflow.keras.datasets import imdb
num_words = 10000   # keep only the 10,000 most frequent words
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = num_words)
print(x_train.shape, x_test.shape)
# (25000,) (25000,)
print(x_train[0])
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]
# invert the word index: map frequency rank (index) -> word
igwi = {}
for key, value in imdb.get_word_index().items():
    igwi[value] = key
for i in range(1, 6):
    print('Rank {} most frequent word: {}'.format(i, igwi[i]))
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
1646592/1641221 [==============================] - 0s 0us/step
Rank 1 most frequent word: the
Rank 2 most frequent word: and
Rank 3 most frequent word: a
Rank 4 most frequent word: of
Rank 5 most frequent word: to
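With the inverted index a review can be decoded back into text. A minimal sketch, relying on the fact that imdb.load_data reserves indices 0-2 (padding, start token, out-of-vocabulary) and offsets real words by 3:
# decode the first training review; reserved/unknown indices fall back to '?'
decoded = ' '.join(igwi.get(i - 3, '?') for i in x_train[0])
print(decoded[:200])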
print(y_train[:10])
# [1 0 0 1 0 0 1 0 1 0]
import numpy as np
lengths = np.array([len(x) for x in x_train])
print(np.mean(lengths), np.median(lengths))
# 238.71364 178.0
# histogram of review lengths (words per review)
import matplotlib.pyplot as plt
plt.hist(lengths)
plt.xlabel('length')
plt.ylabel('frequency')
plt.show()
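Before fixing a padding length, it is worth checking how many reviews a given cutoff covers. A minimal sketch using the 500-word cutoff adopted below:
# fraction of training reviews with at most 500 words
print(np.mean(lengths <= 500))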
# padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
a1 = [[1,2,3]]
a2 = [[1,2,3,4,5,6,7,8]]
a1_post = pad_sequences(a1, maxlen=5, padding = 'post')
a2_post = pad_sequences(a2, maxlen=5, padding = 'post')
print(a1_post)
print(a2_post)
# [[1 2 3 0 0]]
# [[4 5 6 7 8]]
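Note that a2 is longer than maxlen, so pad_sequences truncated it; by default values are dropped from the front (truncating='pre'). A minimal sketch of keeping the front instead:
a2_trunc = pad_sequences(a2, maxlen=5, truncating='post')
print(a2_trunc)
# [[1 2 3 4 5]]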
# to feed the data to a model, every sequence must have the same length : padding
# padding : sequences shorter than the target length are filled with 0s (longer ones are truncated)
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len = 500
print('before pad', len(x_train[0])) # 218
# padding = 'pre' adds the zeros at the front of each sequence
pad_x_train = pad_sequences(x_train, maxlen = max_len, padding = 'pre')
pad_x_test = pad_sequences(x_test, maxlen = max_len, padding = 'pre')
print('after pad', len(pad_x_train[0]))
# before pad 218
# after pad 500
print('after pad',pad_x_train[0])
after pad [ 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 1 14 22 16 43 530 973 1622 1385 65 458 4468
66 3941 4 173 36 256 5 25 100 43 838 112 50 670
2 9 35 480 284 5 150 4 172 112 167 2 336 385
39 4 172 4536 1111 17 546 38 13 447 4 192 50 16
6 147 2025 19 14 22 4 1920 4613 469 4 22 71 87
12 16 43 530 38 76 15 13 1247 4 22 17 515 17
12 16 626 18 2 5 62 386 12 8 316 8 106 5
4 2223 5244 16 480 66 3785 33 4 130 12 16 38 619
5 25 124 51 36 135 48 25 1415 33 6 22 12 215
28 77 52 5 14 407 16 82 2 8 4 107 117 5952
15 256 4 2 7 3766 5 723 36 71 43 530 476 26
400 317 46 7 4 2 1029 13 104 88 4 381 15 297
98 32 2071 56 26 141 6 194 7486 18 4 226 22 21
134 476 26 480 5 144 30 5535 18 51 36 28 224 92
25 104 4 226 65 16 38 1334 88 12 16 283 5 16
4472 113 103 32 15 16 5345 19 178 32]
# Embedding layer: the basic building block for text/RNN models, used as the first layer; it maps word indices to dense vectors
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten
model = Sequential()
# input_dim: vocabulary size (number of distinct word indices)
# output_dim: dimension of each embedding vector
# input_length: length of each (padded) input sequence
model.add(Embedding(input_dim = num_words, output_dim = 32, input_length = max_len))
model.add(Flatten())
model.add(Dense(1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['acc'])
model.summary()
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_1 (Embedding) (None, 500, 32) 320000
_________________________________________________________________
flatten_1 (Flatten) (None, 16000) 0
_________________________________________________________________
dense_1 (Dense) (None, 1) 16001
=================================================================
Total params: 336,001
Trainable params: 336,001
Non-trainable params: 0
_________________________________________________________________
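The parameter counts follow directly from the layer sizes: the embedding stores one 32-dimensional vector per vocabulary word, and the dense layer has one weight per flattened feature plus a bias. A quick sanity check:
print(num_words * 32)     # 320000 embedding weights
print(max_len * 32 + 1)   # 16001 dense weights + bias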
history = model.fit(pad_x_train, y_train, batch_size = 32, epochs = 30, validation_split = 0.2)
Epoch 1/30
625/625 [==============================] - 2s 3ms/step - loss: 0.4567 - acc: 0.7761 - val_loss: 0.3132 - val_acc: 0.8670
Epoch 2/30
625/625 [==============================] - 2s 3ms/step - loss: 0.2001 - acc: 0.9254 - val_loss: 0.2965 - val_acc: 0.8762
Epoch 3/30
625/625 [==============================] - 2s 3ms/step - loss: 0.1057 - acc: 0.9703 - val_loss: 0.3016 - val_acc: 0.8780
Epoch 4/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0498 - acc: 0.9912 - val_loss: 0.3101 - val_acc: 0.8838
Epoch 5/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0232 - acc: 0.9979 - val_loss: 0.3377 - val_acc: 0.8838
Epoch 6/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0116 - acc: 0.9995 - val_loss: 0.3689 - val_acc: 0.8798
Epoch 7/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0063 - acc: 0.9999 - val_loss: 0.3901 - val_acc: 0.8820
Epoch 8/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0036 - acc: 0.9999 - val_loss: 0.4229 - val_acc: 0.8776
Epoch 9/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0022 - acc: 1.0000 - val_loss: 0.4352 - val_acc: 0.8800
Epoch 10/30
625/625 [==============================] - 2s 3ms/step - loss: 0.0014 - acc: 1.0000 - val_loss: 0.4565 - val_acc: 0.8798
Epoch 11/30
625/625 [==============================] - 2s 3ms/step - loss: 9.0908e-04 - acc: 1.0000 - val_loss: 0.4762 - val_acc: 0.8786
Epoch 12/30
625/625 [==============================] - 2s 3ms/step - loss: 6.2279e-04 - acc: 1.0000 - val_loss: 0.4971 - val_acc: 0.8778
Epoch 13/30
625/625 [==============================] - 2s 3ms/step - loss: 4.0832e-04 - acc: 1.0000 - val_loss: 0.5179 - val_acc: 0.8774
Epoch 14/30
625/625 [==============================] - 2s 3ms/step - loss: 2.8227e-04 - acc: 1.0000 - val_loss: 0.5364 - val_acc: 0.8758
Epoch 15/30
625/625 [==============================] - 2s 3ms/step - loss: 1.9264e-04 - acc: 1.0000 - val_loss: 0.5530 - val_acc: 0.8768
Epoch 16/30
625/625 [==============================] - 2s 3ms/step - loss: 1.3397e-04 - acc: 1.0000 - val_loss: 0.5750 - val_acc: 0.8752
Epoch 17/30
625/625 [==============================] - 2s 3ms/step - loss: 9.3561e-05 - acc: 1.0000 - val_loss: 0.5911 - val_acc: 0.8766
Epoch 18/30
625/625 [==============================] - 2s 3ms/step - loss: 6.5905e-05 - acc: 1.0000 - val_loss: 0.6112 - val_acc: 0.8768
Epoch 19/30
625/625 [==============================] - 2s 3ms/step - loss: 4.5675e-05 - acc: 1.0000 - val_loss: 0.6288 - val_acc: 0.8758
Epoch 20/30
625/625 [==============================] - 2s 3ms/step - loss: 3.2030e-05 - acc: 1.0000 - val_loss: 0.6464 - val_acc: 0.8760
Epoch 21/30
625/625 [==============================] - 2s 3ms/step - loss: 2.2697e-05 - acc: 1.0000 - val_loss: 0.6652 - val_acc: 0.8746
Epoch 22/30
625/625 [==============================] - 2s 3ms/step - loss: 1.6130e-05 - acc: 1.0000 - val_loss: 0.6839 - val_acc: 0.8756
Epoch 23/30
625/625 [==============================] - 2s 3ms/step - loss: 1.1474e-05 - acc: 1.0000 - val_loss: 0.7017 - val_acc: 0.8754
Epoch 24/30
625/625 [==============================] - 2s 3ms/step - loss: 8.1176e-06 - acc: 1.0000 - val_loss: 0.7189 - val_acc: 0.8746
Epoch 25/30
625/625 [==============================] - 2s 3ms/step - loss: 5.8811e-06 - acc: 1.0000 - val_loss: 0.7372 - val_acc: 0.8748
Epoch 26/30
625/625 [==============================] - 2s 3ms/step - loss: 4.2489e-06 - acc: 1.0000 - val_loss: 0.7547 - val_acc: 0.8740
Epoch 27/30
625/625 [==============================] - 2s 3ms/step - loss: 3.0487e-06 - acc: 1.0000 - val_loss: 0.7724 - val_acc: 0.8750
Epoch 28/30
625/625 [==============================] - 2s 3ms/step - loss: 2.2476e-06 - acc: 1.0000 - val_loss: 0.7881 - val_acc: 0.8748
Epoch 29/30
625/625 [==============================] - 2s 3ms/step - loss: 1.6460e-06 - acc: 1.0000 - val_loss: 0.8056 - val_acc: 0.8746
Epoch 30/30
625/625 [==============================] - 2s 3ms/step - loss: 1.2241e-06 - acc: 1.0000 - val_loss: 0.8222 - val_acc: 0.8748
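The log shows clear overfitting: val_loss bottoms out around epoch 2-3 and then climbs steadily while the training loss goes to zero. A hedged sketch of stopping training automatically with an EarlyStopping callback (the patience value here is an arbitrary choice):
from tensorflow.keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor = 'val_loss', patience = 3, restore_best_weights = True)
history = model.fit(pad_x_train, y_train, batch_size = 32, epochs = 30,
                    validation_split = 0.2, callbacks = [early_stop])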
import matplotlib.pyplot as plt
plt.figure(figsize = (12, 4))
plt.subplot(1,2,1)
plt.plot(history.history['loss'], 'b-', label='loss')
plt.plot(history.history['val_loss'], 'r--', label='val_loss')
plt.xlabel('Epoch')
plt.legend()
plt.subplot(1,2,2)
plt.plot(history.history['acc'], 'g-', label='acc')
plt.plot(history.history['val_acc'], 'k--', label='val_acc')
plt.xlabel('Epoch')
plt.legend()
plt.show()
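Only the validation split is monitored above; for a final figure the held-out test set can be evaluated as well. A minimal sketch:
test_loss, test_acc = model.evaluate(pad_x_test, y_test, verbose = 0)
print('test loss: {:.4f}, test acc: {:.4f}'.format(test_loss, test_acc))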