DEEP LEARNING SECTION - PREDICTING MOVIE GENRES FROM POSTERS¶

import numpy                  as np
import pandas                 as pd
import scipy                  as sp
import sklearn.neighbors      as knn
import matplotlib
import matplotlib.pyplot      as plt
import seaborn
import requests
import urllib
import joblib
import requests
import json
import os
import random
import seaborn as sns
#import statsmodels.api as sm
from matplotlib                           import rcParams
from sklearn                              import discriminant_analysis
from sklearn.decomposition                import PCA
from sklearn                              import preprocessing
from sklearn.linear_model                 import LogisticRegression as LogReg
from sklearn.discriminant_analysis        import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis        import QuadraticDiscriminantAnalysis as QDA
from scipy.stats                          import mode
from sklearn                              import linear_model
from IPython.core.interactiveshell        import InteractiveShell
from time                                 import sleep
from collections                          import Counter
from itertools                            import combinations, permutations

%matplotlib inline

Data cleaning and visualization¶

# Load the poster/genre table, index it by IMDb id, then discard incomplete
# rows and every row whose poster URL occurs more than once.

import pandas as pd

df = (
    pd.read_csv(
        'MovieGenre.csv',
        encoding="ISO-8859-1",
        usecols=["imdbId", "Title", "Genre", "Poster"],
    )
    .set_index(["imdbId"])
    .dropna()
    # keep=False removes all copies of a duplicated poster, not just the extras
    .drop_duplicates(subset="Poster", keep=False)
)
df

# Extract the release year from titles such as "Toy Story (1995)" into its
# own column.

import re

# Raw string: in a plain literal "\(" and "\d" are invalid escape sequences,
# which recent Python versions warn about.
re_year = re.compile(r"\((\d{4})\)")


def _extract_year(title):
    """Return the first parenthesised 4-digit number in *title* as an int, or None."""
    match = re_year.search(title)
    return int(match.group(1)) if match else None


# Titles without a "(YYYY)" marker map to None, so the column can contain
# missing values.
df["year"] = df.Title.map(_extract_year)
df

# Split the "|"-separated Genre string into the three genre columns; movies
# with fewer than three genres get None in the unused slots.
df[['maingenre', 'genre2', 'genre3']] = df.Genre.str.split("|", expand=True)

# Keep the first 10000 movies. .copy() makes the truncated frame independent
# of the original, so later column assignments (including re-running this
# cell) no longer raise SettingWithCopyWarning.
df = df.head(10000).copy()
df

/Users/krystal/opt/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py:3494: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]

# NOTE(review): `count` is referenced without parentheses, so this displays
# the bound method (whose repr embeds the whole frame) rather than the
# per-column non-null counts that `df.count()` would return.
df.count

<bound method DataFrame.count of                                      Title                       Genre  \
imdbId                                                                   
114709                    Toy Story (1995)  Animation|Adventure|Comedy   
113497                      Jumanji (1995)     Action|Adventure|Family   
113228             Grumpier Old Men (1995)              Comedy|Romance   
114885            Waiting to Exhale (1995)        Comedy|Drama|Romance   
113041  Father of the Bride Part II (1995)       Comedy|Family|Romance   
...                                    ...                         ...   
36260                  Phantom Lady (1944)       Crime|Drama|Film-Noir   
41088               Act of Violence (1949)    Drama|Film-Noir|Thriller   
93258                   Yan zhi kou (1987)         Drama|Fantasy|Music   
43809              Miracle in Milan (1951)        Comedy|Drama|Fantasy   
384369              Before the Fall (2004)             Drama|Sport|War   

                                                   Poster    year  maingenre  \
imdbId                                                                         
114709  https://images-na.ssl-images-amazon.com/images...  1995.0  Animation   
113497  https://images-na.ssl-images-amazon.com/images...  1995.0     Action   
113228  https://images-na.ssl-images-amazon.com/images...  1995.0     Comedy   
114885  https://images-na.ssl-images-amazon.com/images...  1995.0     Comedy   
113041  https://images-na.ssl-images-amazon.com/images...  1995.0     Comedy   
...                                                   ...     ...        ...   
36260   https://images-na.ssl-images-amazon.com/images...  1944.0      Crime   
41088   https://images-na.ssl-images-amazon.com/images...  1949.0      Drama   
93258   https://images-na.ssl-images-amazon.com/images...  1987.0      Drama   
43809   https://images-na.ssl-images-amazon.com/images...  1951.0     Comedy   
384369  https://images-na.ssl-images-amazon.com/images...  2004.0      Drama   

           genre2     genre3  
imdbId                        
114709  Adventure     Comedy  
113497  Adventure     Family  
113228    Romance       None  
114885      Drama    Romance  
113041     Family    Romance  
...           ...        ...  
36260       Drama  Film-Noir  
41088   Film-Noir   Thriller  
93258     Fantasy      Music  
43809       Drama    Fantasy  
384369      Sport        War  

[10000 rows x 7 columns]>

# Tally how many movies fall under each primary genre
genre_df = df.maingenre.value_counts()
genre_df

Comedy         2960
Drama          2418
Action         1379
Crime           768
Adventure       582
Horror          436
Biography       384
Documentary     383
Animation       251
Fantasy          88
Family           52
Western          50
Mystery          50
Thriller         48
Romance          47
Sci-Fi           47
Musical          20
Short            15
War              13
Film-Noir         5
Music             4
Name: maingenre, dtype: int64

# Turn the counts Series into a two-column frame (genre label, frequency)
genre_count = genre_df.to_frame().reset_index()
genre_count

# Give the columns the names the plot refers to
genre_count.columns = ['genre', 'count']
genre_count

# Bar chart of the primary-genre frequencies, most frequent genre first
genre_count.sort_values(by="count", ascending=False, inplace=True)

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x="genre", y="count", data=genre_count, ax=ax)
ax.tick_params(axis="x", labelrotation=90)  # vertical genre labels
ax.set_xlabel("Genre")
ax.set_ylabel("Number of movies")
ax.set_title("Number of movies per Genre")
ax.set_ylim((0, 4000));

Deep learning data pre-processing¶

#Imports
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization, Conv2D, MaxPool2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df.shape

(10000, 7)

df.columns

Index(['Title', 'Genre', 'Poster', 'year', 'maingenre', 'genre2', 'genre3'], dtype='object')

# Move imdbId out of the index into an ordinary column and keep the result
# as a separate frame, df2, for the deep-learning steps.
df2 = df.reset_index()
df2

df2.shape

(10000, 8)

df2.columns

Index(['imdbId', 'Title', 'Genre', 'Poster', 'year', 'maingenre', 'genre2',
       'genre3'],
      dtype='object')

# Store the ids as strings so they can be concatenated into poster file paths
df2['imdbId'] = df2['imdbId'].astype(str)

# NOTE(review): `info` is referenced without parentheses, so this displays
# the bound method (whose repr embeds the whole frame) instead of the
# column/dtype summary that `df2.info()` would print.
df2.info

<bound method DataFrame.info of       imdbId                               Title                       Genre  \
0     114709                    Toy Story (1995)  Animation|Adventure|Comedy   
1     113497                      Jumanji (1995)     Action|Adventure|Family   
2     113228             Grumpier Old Men (1995)              Comedy|Romance   
3     114885            Waiting to Exhale (1995)        Comedy|Drama|Romance   
4     113041  Father of the Bride Part II (1995)       Comedy|Family|Romance   
...      ...                                 ...                         ...   
9995   36260                 Phantom Lady (1944)       Crime|Drama|Film-Noir   
9996   41088              Act of Violence (1949)    Drama|Film-Noir|Thriller   
9997   93258                  Yan zhi kou (1987)         Drama|Fantasy|Music   
9998   43809             Miracle in Milan (1951)        Comedy|Drama|Fantasy   
9999  384369              Before the Fall (2004)             Drama|Sport|War   

                                                 Poster    year  maingenre  \
0     https://images-na.ssl-images-amazon.com/images...  1995.0  Animation   
1     https://images-na.ssl-images-amazon.com/images...  1995.0     Action   
2     https://images-na.ssl-images-amazon.com/images...  1995.0     Comedy   
3     https://images-na.ssl-images-amazon.com/images...  1995.0     Comedy   
4     https://images-na.ssl-images-amazon.com/images...  1995.0     Comedy   
...                                                 ...     ...        ...   
9995  https://images-na.ssl-images-amazon.com/images...  1944.0      Crime   
9996  https://images-na.ssl-images-amazon.com/images...  1949.0      Drama   
9997  https://images-na.ssl-images-amazon.com/images...  1987.0      Drama   
9998  https://images-na.ssl-images-amazon.com/images...  1951.0     Comedy   
9999  https://images-na.ssl-images-amazon.com/images...  2004.0      Drama   

         genre2     genre3  
0     Adventure     Comedy  
1     Adventure     Family  
2       Romance       None  
3         Drama    Romance  
4        Family    Romance  
...         ...        ...  
9995      Drama  Film-Noir  
9996  Film-Noir   Thriller  
9997    Fantasy      Music  
9998      Drama    Fantasy  
9999      Sport        War  

[10000 rows x 8 columns]>

#IMAGE PROCESSING CODE STARTS HERE

import os
from sklearn.model_selection import train_test_split

import urllib.request
from urllib.error import HTTPError

from PIL import Image

# Drop movies whose genre string contains the placeholder 'N/A'.
# na=False treats a missing genre as "no match" (what the former `== True`
# comparison achieved); regex=False matches the text literally.
na_rows = df2[df2.Genre.str.contains('N/A', na=False, regex=False)].index
# Renumber the rows 0..n-1 afterwards: the image-loading loop below looks
# rows up by a running integer, which fails if the index has gaps.
df2 = df2.drop(na_rows, axis=0).reset_index(drop=True)

# Display the cleaned frame
df2

Prepare X¶

# Transform images to arrays
from tqdm import tqdm

# NOTE: Keras interprets target_size as (height, width), so despite the
# variable names every poster is resized to 300 rows x 350 columns. Names
# and values are left untouched because the network's input_shape and the
# prediction cells rely on exactly this shape.
img_width = 300
img_height = 350

X = []

# Iterate over the id values themselves rather than df2['imdbId'][i]: that
# lookup is label-based and raises KeyError as soon as df2's index has gaps
# (for example after rows were dropped).
for imdb_id in tqdm(df2['imdbId']):
    path = 'posters/' + imdb_id + '.jpg'
    # target_size only needs the two spatial sizes; the channel count is not
    # part of it
    img = image.load_img(path, target_size=(img_width, img_height))
    img = image.img_to_array(img)
    img = img / 255.0  # rescale pixel values by the 8-bit maximum
    X.append(img)

X = np.array(X)

100%|██████████| 10000/10000 [00:34<00:00, 286.52it/s]

X.shape

# Show one poster together with its genre string. X is indexed by position,
# so the matching row is fetched by position too (.iloc) rather than by
# index label; the two only coincide while df2's index is gap-free.
sample_pos = 1500
plt.imshow(X[sample_pos])
print('GENRE', df2['Genre'].iloc[sample_pos])

GENRE Action|Crime|Sci-Fi

Prepare Y¶

def string_to_vector(data, val_name, token_pattern=None):
    """Turn a text column into a table of per-row token counts.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column. It is not modified.
    val_name : str
        Name of the column to vectorise.
    token_pattern : str, optional
        Regular expression handed to ``CountVectorizer``. When omitted the
        scikit-learn default applies, which breaks values at punctuation:
        "Sci-Fi" becomes the two tokens "sci" and "fi". Pass for example
        ``r"[^|]+"`` to keep "|"-separated values whole.

    Returns
    -------
    pandas.DataFrame
        One column per (lower-cased) token and one row per row of ``data``,
        sharing ``data``'s index.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    # Replace missing values by the string 'nan' on a copy of the column.
    # The former chained assignment data[val_name][mask] = 'nan' wrote into
    # the caller's frame and triggered SettingWithCopyWarning.
    text = data[val_name].fillna('nan')

    vectorizer_args = dict(
        analyzer="word",
        tokenizer=None,
        preprocessor=None,
        stop_words=None,
        max_features=50000,
    )
    if token_pattern is not None:
        vectorizer_args["token_pattern"] = token_pattern
    vectorizer = CountVectorizer(**vectorizer_args)

    val_data = vectorizer.fit_transform(text)

    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when the installed version offers it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    df_val = pd.DataFrame(val_data.toarray(), index=data.index, columns=feature_names)

    return df_val

import pandas as pd
# Build the per-movie token-count table from the "|"-separated Genre strings
df_genres = string_to_vector(df2, "Genre")
# Display it (head(10000) shows at most the first 10000 rows)
df_genres.head(10000)

/Users/krystal/opt/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

# Put the genre indicator columns next to the movie metadata
df_new = pd.concat([df2, df_genres], axis=1)
df_new.head()

# Strip the metadata so only the raw Genre string and the indicators remain
df_new2 = df_new.drop(
    columns=['imdbId', 'Title', 'Poster', 'year', 'maingenre', 'genre2', 'genre3']
)
df_new2

# Remove the Genre string as well; what is left is the label table
y = df_new2.drop(columns=['Genre'])
y

# Convert the labels to a plain NumPy array for training
y = y.to_numpy()
y.shape

(10000, 27)

# Hold out 20% of the samples; the fixed seed makes the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# image shape
X_train[0].shape

(300, 350, 3)

# genre shape
y_train[0].shape

(27,)

y

array([[0, 1, 1, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0]])

Deep learning module¶

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization

Using TensorFlow backend.

# Full model: four Conv2D -> BatchNorm -> MaxPool -> Dropout stages followed
# by two dense stages and one sigmoid output per label column.

# Take the shapes from the data instead of hard-coding (300, 350, 3) and 27,
# so the network stays in sync with the image arrays and the label matrix.
input_shape = X_train.shape[1:]
num_classes = y_train.shape[1]

model = Sequential()

# (number of filters, dropout rate) of each convolutional stage
conv_stages = [(16, 0.3), (32, 0.3), (64, 0.4), (128, 0.5)]
for stage, (filters, drop_rate) in enumerate(conv_stages):
    if stage == 0:
        # Only the first layer declares the input shape
        model.add(Conv2D(filters, (3, 3), activation='relu', input_shape=input_shape))
    else:
        model.add(Conv2D(filters, (3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(2, 2))
    model.add(Dropout(drop_rate))

model.add(Flatten())

# Two identical fully connected stages
for _ in range(2):
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

# Sigmoid rather than softmax: a movie can carry several genres at once, so
# every label gets its own independent probability.
model.add(Dense(num_classes, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
conv2d_1 (Conv2D)            (None, 298, 348, 16)      448       
_________________________________________________________________
batch_normalization_1 (Batch (None, 298, 348, 16)      64        
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 149, 174, 16)      0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 149, 174, 16)      0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 147, 172, 32)      4640      
_________________________________________________________________
batch_normalization_2 (Batch (None, 147, 172, 32)      128       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 73, 86, 32)        0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 73, 86, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 71, 84, 64)        18496     
_________________________________________________________________
batch_normalization_3 (Batch (None, 71, 84, 64)        256       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 35, 42, 64)        0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 35, 42, 64)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 33, 40, 128)       73856     
_________________________________________________________________
batch_normalization_4 (Batch (None, 33, 40, 128)       512       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 16, 20, 128)       0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 16, 20, 128)       0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 40960)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               5243008   
_________________________________________________________________
batch_normalization_5 (Batch (None, 128)               512       
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               16512     
_________________________________________________________________
batch_normalization_6 (Batch (None, 128)               512       
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 27)                3483      
=================================================================
Total params: 5,362,427
Trainable params: 5,361,435
Non-trainable params: 992
_________________________________________________________________

# Binary cross-entropy scores every sigmoid output as its own yes/no decision.
# NOTE(review): 'accuracy' presumably resolves to per-label binary accuracy
# here. Most labels in a row are 0, so a high value is reachable without
# predicting any genre correctly; the logged val_accuracy stays at 0.9141 in
# every epoch. Consider a multi-label metric (TODO confirm).
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

# NOTE(review): the held-out test split doubles as validation data, so the
# val_* figures are not an independent estimate if they guide model choices.
history = model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

Train on 8000 samples, validate on 2000 samples
Epoch 1/5
8000/8000 [==============================] - 2741s 343ms/step - loss: 0.6139 - accuracy: 0.7005 - val_loss: 0.2611 - val_accuracy: 0.9141
Epoch 2/5
8000/8000 [==============================] - 2746s 343ms/step - loss: 0.2717 - accuracy: 0.9069 - val_loss: 0.2710 - val_accuracy: 0.9141
Epoch 3/5
8000/8000 [==============================] - 2881s 360ms/step - loss: 0.2498 - accuracy: 0.9116 - val_loss: 0.2580 - val_accuracy: 0.9141
Epoch 4/5
8000/8000 [==============================] - 2118s 265ms/step - loss: 0.2430 - accuracy: 0.9133 - val_loss: 0.2445 - val_accuracy: 0.9141
Epoch 5/5
8000/8000 [==============================] - 1886s 236ms/step - loss: 0.2386 - accuracy: 0.9150 - val_loss: 0.2519 - val_accuracy: 0.9141

def plot_learningCurve(history, epoch):
    """Draw the accuracy and loss curves of a training run, one figure each.

    history: object whose ``history`` dict holds the per-epoch 'accuracy',
             'val_accuracy', 'loss' and 'val_loss' lists.
    epoch:   number of epochs to put on the x axis (1..epoch).
    """
    epoch_range = range(1, epoch + 1)

    # (training key, validation key, figure title, y-axis label)
    panels = (
        ('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Model loss', 'Loss'),
    )
    for train_key, val_key, title, ylabel in panels:
        plt.plot(epoch_range, history.history[train_key])
        plt.plot(epoch_range, history.history[val_key])
        plt.title(title)
        plt.ylabel(ylabel)
        plt.xlabel('Epoch')
        plt.legend(['Train', 'TEST'], loc='upper left')
        plt.show()

# Take the epoch count from the recorded history instead of repeating the
# literal 5, so the plot keeps working if the number of epochs is changed.
plot_learningCurve(history, len(history.history['loss']))

Poster prediction¶

def predict_top_genres(poster_path, top_k=5):
    """Show the poster at *poster_path* and print its *top_k* most likely genres.

    Prints the array of class names, then one "genre (probability)" line per
    prediction, highest probability first. Returns the (genre, probability)
    pairs in that same order.
    """
    poster = image.load_img(poster_path, target_size=(img_width, img_height))
    plt.imshow(poster)

    # Same scaling as the training images, plus a leading batch axis
    batch = np.expand_dims(image.img_to_array(poster) / 255.0, axis=0)

    # Column 0 of df_new2 is the raw Genre string; the rest are the labels
    classes = np.array(df_new2.columns[1:])
    print(classes)

    probs = model.predict(batch)[0]
    ranked = np.argsort(probs)[::-1][:top_k]  # indices, most probable first
    results = [(classes[k], probs[k]) for k in ranked]
    for genre, prob in results:
        print('{} ({:.3})'.format(genre, prob))
    return results


# successful case
top_genres = predict_top_genres('predict/test2.jpg')

['action' 'adventure' 'animation' 'biography' 'comedy' 'crime'
 'documentary' 'drama' 'family' 'fantasy' 'fi' 'film' 'history' 'horror'
 'music' 'musical' 'mystery' 'noir' 'romance' 'sci' 'short' 'show' 'sport'
 'talk' 'thriller' 'war' 'western']
drama (0.782)
romance (0.238)
comedy (0.237)
crime (0.0744)
thriller (0.0539)

# successful case
poster = image.load_img('predict/test7.jpg', target_size=(img_width, img_height, 3))
plt.imshow(poster)
# Scale like the training arrays, then add the leading batch dimension
img = (image.img_to_array(poster) / 255.0).reshape(1, img_width, img_height, 3)

# Column 0 of df_new2 is the raw Genre string; the rest are the labels
classes = np.array(df_new2.columns[1:])
print(classes)
y_prob = model.predict(img)
# Indices of the five largest probabilities, largest first
top5 = np.argsort(y_prob[0])[:-6:-1]

for idx in top5:
    print('{}'.format(classes[idx])+' ({:.3})'.format(y_prob[0][idx]))

['action' 'adventure' 'animation' 'biography' 'comedy' 'crime'
 'documentary' 'drama' 'family' 'fantasy' 'fi' 'film' 'history' 'horror'
 'music' 'musical' 'mystery' 'noir' 'romance' 'sci' 'short' 'show' 'sport'
 'talk' 'thriller' 'war' 'western']
drama (0.634)
horror (0.237)
crime (0.171)
thriller (0.168)
action (0.15)

#semi successful case
poster = image.load_img('predict/test11.jpg', target_size=(img_width, img_height, 3))
plt.imshow(poster)
# Scale like the training arrays, then add the leading batch dimension
img = (image.img_to_array(poster) / 255.0).reshape(1, img_width, img_height, 3)

# Column 0 of df_new2 is the raw Genre string; the rest are the labels
classes = np.array(df_new2.columns[1:])
print(classes)
y_prob = model.predict(img)
# Indices of the five largest probabilities, largest first
top5 = np.argsort(y_prob[0])[:-6:-1]

for idx in top5:
    print('{}'.format(classes[idx])+' ({:.3})'.format(y_prob[0][idx]))

['action' 'adventure' 'animation' 'biography' 'comedy' 'crime'
 'documentary' 'drama' 'family' 'fantasy' 'fi' 'film' 'history' 'horror'
 'music' 'musical' 'mystery' 'noir' 'romance' 'sci' 'short' 'show' 'sport'
 'talk' 'thriller' 'war' 'western']
drama (0.657)
horror (0.198)
crime (0.171)
thriller (0.149)
romance (0.128)

# case that failed
poster = image.load_img('predict/test17.jpg', target_size=(img_width, img_height, 3))
plt.imshow(poster)
# Scale like the training arrays, then add the leading batch dimension
img = (image.img_to_array(poster) / 255.0).reshape(1, img_width, img_height, 3)

# Column 0 of df_new2 is the raw Genre string; the rest are the labels
classes = np.array(df_new2.columns[1:])
print(classes)
y_prob = model.predict(img)
# Indices of the five largest probabilities, largest first
top5 = np.argsort(y_prob[0])[:-6:-1]

for idx in top5:
    print('{}'.format(classes[idx])+' ({:.3})'.format(y_prob[0][idx]))

['action' 'adventure' 'animation' 'biography' 'comedy' 'crime'
 'documentary' 'drama' 'family' 'fantasy' 'fi' 'film' 'history' 'horror'
 'music' 'musical' 'mystery' 'noir' 'romance' 'sci' 'short' 'show' 'sport'
 'talk' 'thriller' 'war' 'western']
drama (0.756)
comedy (0.385)
romance (0.303)
crime (0.0806)
adventure (0.0499)

	Title	Genre	Poster
imdbId
114709	Toy Story (1995)	Animation\|Adventure\|Comedy	https://images-na.ssl-images-amazon.com/images...
113497	Jumanji (1995)	Action\|Adventure\|Family	https://images-na.ssl-images-amazon.com/images...
113228	Grumpier Old Men (1995)	Comedy\|Romance	https://images-na.ssl-images-amazon.com/images...
114885	Waiting to Exhale (1995)	Comedy\|Drama\|Romance	https://images-na.ssl-images-amazon.com/images...
113041	Father of the Bride Part II (1995)	Comedy\|Family\|Romance	https://images-na.ssl-images-amazon.com/images...
...	...	...	...
83291	Los violadores (1981)	Action\|Thriller	https://images-na.ssl-images-amazon.com/images...
83168	Tanya's Island (1980)	Drama	https://images-na.ssl-images-amazon.com/images...
82875	Pacific Banana (1981)	Comedy	https://images-na.ssl-images-amazon.com/images...
815258	Werewolf in a Womens Prison (2006)	Horror	https://images-na.ssl-images-amazon.com/images...
79142	Xiao zi ming da (1979)	Action\|Comedy	https://images-na.ssl-images-amazon.com/images...

	index	maingenre
0	Comedy	2960
1	Drama	2418
2	Action	1379
3	Crime	768
4	Adventure	582
5	Horror	436
6	Biography	384
7	Documentary	383
8	Animation	251
9	Fantasy	88
10	Family	52
11	Western	50
12	Mystery	50
13	Thriller	48
14	Romance	47
15	Sci-Fi	47
16	Musical	20
17	Short	15
18	War	13
19	Film-Noir	5
20	Music	4

	action	adventure	animation	biography	comedy	crime	documentary	drama	family	fantasy	...	noir	romance	sci	short	show	sport	talk	thriller	war	western
0	0	1	1	0	1	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
1	1	1	0	0	0	0	0	0	1	0	...	0	0	0	0	0	0	0	0	0	0
2	0	0	0	0	1	0	0	0	0	0	...	0	1	0	0	0	0	0	0	0	0
3	0	0	0	0	1	0	0	1	0	0	...	0	1	0	0	0	0	0	0	0	0
4	0	0	0	0	1	0	0	0	1	0	...	0	1	0	0	0	0	0	0	0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
9995	0	0	0	0	0	1	0	1	0	0	...	1	0	0	0	0	0	0	0	0	0
9996	0	0	0	0	0	0	0	1	0	0	...	1	0	0	0	0	0	0	1	0	0
9997	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
9998	0	0	0	0	1	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
9999	0	0	0	0	0	0	0	1	0	0	...	0	0	0	0	0	1	0	0	1	0

DEEP LEARNING SECTION - PREDICTING MOVIE GENRES FROM POSTERS¶

Data cleaning and visualization¶

Deep learning data pre-processing¶

Prepare X¶

Prepare Y¶

Deep learning module¶

Poster prediction¶

Final thoughts: the predictions depend heavily on the genre distribution and the release years represented in the training dataset¶