#pycharm
#The dataset contains many movie reviews, each encoded as a list of integers (one integer per word). A dictionary maps those integers back to human-readable words.
#The goal is to predict whether a review is positive or negative.
import tensorflow as td
from tensorflow import keras
import numpy as np
# Load the IMDB movie-review dataset bundled with Keras. num_words=10000 keeps
# only the 10,000 most frequent words; rarer words are replaced by the
# out-of-vocabulary index.
data = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = data.load_data(num_words=10000)
# Each review is a list of integer word indices.
print(train_data[0])
--------------------------------------------
#logs
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50...]
----------------------------------------
#pycharm
#Build a lookup dictionary to translate the integers back into words. Indices 0-3 are reserved: 0 = padding, 1 = start of review, 2 = unknown word, 3 = unused.
# Word -> integer index mapping shipped with the dataset.
word_index = data.get_word_index()
# Shift every index up by 3 so that 0-3 are free for the special tokens below.
word_index = {word: index + 3 for word, index in word_index.items()}
word_index['<padding>'] = 0
word_index['<start>'] = 1
word_index['<unknown>'] = 2
word_index['<unused>'] = 3
# Inverse mapping (integer index -> word), used to turn a review back into text.
reverse_word_index = {index: word for word, index in word_index.items()}
def decode_review(text):
    """Translate an encoded review back into a space-separated string.

    text: iterable of integer word indices.
    Returns the words joined by single spaces; an index that is missing
    from reverse_word_index is rendered as '?'.
    """
    return ' '.join(reverse_word_index.get(i, '?') for i in text)


print(decode_review(test_data[0]))
-----------------------------------------
#logs
<start> please give this one a miss br br <unknown> <unknown> and the rest of the cast rendered terrible performances the show is flat flat flat br br i don't know how michael madison could have allowed this one on his plate he almost seemed to know this wasn't going to work ...
----------------------------------
#pycharm
# compare the word counts of the first two test reviews
print(*map(len, (test_data[0], test_data[1])))
---------------------------
#logs
#the review lengths are different
68 260
--------------------------------
#pycharm
#Normalize the reviews to a fixed length of 250 words: shorter reviews are filled with <padding> at the end, longer reviews are truncated.
...
reverse_word_index = {index: word for word, index in word_index.items()}
# Make every review exactly 250 entries long. Shorter reviews get the
# '<padding>' index appended at the end (padding='post'); longer reviews are
# cut down to 250. `truncating` is left at its Keras default, 'pre', so the
# words that get dropped come from the START of a long review.
train_data = keras.preprocessing.sequence.pad_sequences(train_data, value=word_index['<padding>'], padding='post', maxlen=250)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, value=word_index['<padding>'], padding='post', maxlen=250)
def decode_review(text):
    """Translate an encoded review back into a space-separated string.

    text: iterable of integer word indices.
    Returns the words joined by single spaces; an index that is missing
    from reverse_word_index is rendered as '?'.
    """
    return ' '.join(reverse_word_index.get(i, '?') for i in text)
...
-------------------------------------------
#logs
#review 1 is filled with padding, review 2 is truncated; both are now 250 long
...you madison fans give this a miss <padding> <padding> <padding> <padding> ...
250 250
--------------------
reference:
https://www.youtube.com/watch?v=k-_pWoy2fb4&list=PLzMcBGfZo4-lak7tiFDec5_ZMItiIIfmj&index=5
https://www.tensorflow.org/tutorials/keras/basic_text_classification
No comments:
Post a Comment