import numpy as np
import pandas as pd
import scipy as sp
import sklearn.neighbors as knn
import matplotlib
import matplotlib.pyplot as plt
import seaborn
import requests
import urllib
import joblib
import requests
import json
import os
import random
import seaborn as sns
#import statsmodels.api as sm
from matplotlib import rcParams
from sklearn import discriminant_analysis
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from scipy.stats import mode
from sklearn import linear_model
from IPython.core.interactiveshell import InteractiveShell
from time import sleep
from collections import Counter
from itertools import combinations, permutations
%matplotlib inline
# --- Load the poster metadata, then drop unusable rows -----------------------
import pandas as pd

# Latin-1 encoding: the CSV contains non-UTF-8 characters in some titles.
df = pd.read_csv(
    'MovieGenre.csv',
    encoding="ISO-8859-1",
    usecols=["imdbId", "Title", "Genre", "Poster"],
)
df = df.set_index("imdbId")
df = df.dropna()
# keep=False: when two movies share a poster URL, discard BOTH copies.
df = df.drop_duplicates(subset="Poster", keep=False)
df
# extract movie's year information as a separate column
import re

# Raw-string pattern: "\((\d{4})\)" without the r-prefix raises an
# invalid-escape SyntaxWarning on Python 3.12+. str.extract takes the first
# match per row (same as findall(...)[0]) and runs the regex once per row,
# where the original lambda called findall twice. Rows without a (YYYY)
# group become NaN, so the column is float64 — identical to the original
# int-or-None map result.
df["year"] = df.Title.str.extract(r"\((\d{4})\)", expand=False).astype(float)
df
# separate genre types to multiple columns
df.columns
df[['maingenre', 'genre2', 'genre3']] = df.Genre.str.split("|", expand=True)
# cap the working set at the first 10k movies
df = df.head(10000)
df
df.count
# count how many movies fall under each main genre
genre_df = df['maingenre'].value_counts()
genre_df
# materialize the counts Series as a two-column DataFrame
genre_count = genre_df.reset_index()
genre_count
# give the columns plotting-friendly names
genre_count.columns = ['genre', 'count']
genre_count
# plotting: movies per genre, most common genre first
genre_count = genre_count.sort_values("count", ascending=False)
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x="genre", y="count", data=genre_count, ax=ax)
ax.set_xlabel("Genre")
ax.set_ylabel("Number of movies")
ax.set_title("Number of movies per Genre")
ax.set_ylim(0, 4000)
plt.xticks(rotation="vertical")
#Imports
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout, BatchNormalization, Conv2D, MaxPool2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df.shape
df.columns
# Move imdbId back out of the index so rows carry a plain 0..n-1 integer index
df2 = df.reset_index()
df2
df2.shape
df2.columns
# imdbId doubles as the poster filename stem below, so hold it as a string
df2 = df2.astype({'imdbId': str})
df2.info
#IMAGE PROCESSING CODE STARTS HERE
import os
from sklearn.model_selection import train_test_split
import urllib.request
from urllib.error import HTTPError
from PIL import Image
# Drop rows whose Genre field is the literal placeholder string 'N/A'.
na_rows = df2[df2.Genre.str.contains('N/A') == True].index
# reset_index(drop=True) keeps the integer labels contiguous after the drop;
# the image loop below does df2['imdbId'][i] for i in range(len) and would
# KeyError on any gap the drop left behind.
df2 = df2.drop(na_rows, axis=0).reset_index(drop=True)
# Select the top 10000 rows of the Dataframe for 2 columns only
df2
# Transform images to arrays
from tqdm import tqdm
# Canvas every poster is resized to; must match the CNN input_shape below.
# NOTE(review): Keras load_img's target_size is (height, width), so passing
# (img_width, img_height, 3) yields 300-tall x 350-wide images — the variable
# names are swapped, but this is consistent with input_shape=(300, 350, 3).
img_width = 300
img_height = 350
X = []
for i in tqdm(range(df2.shape[0])):
    # assumes df2's integer index is contiguous 0..n-1 — TODO confirm no rows
    # were dropped without a reset_index, or df2['imdbId'][i] will KeyError
    path = 'posters/' + df2['imdbId'][i] + '.jpg'
    img = image.load_img(path, target_size=(img_width, img_height, 3))
    img = image.img_to_array(img)
    img = img/255.0  # scale pixel values into [0, 1]
    X.append(img)
X = np.array(X)
X.shape
# spot-check one poster and its genre label
plt.imshow(X[1500])
print('GENRE',df2['Genre'][1500])
def string_to_vector(data, val_name):
    """Bag-of-words encode the string column ``val_name`` of ``data``.

    Returns a DataFrame aligned on ``data.index`` with one column per
    vocabulary token and integer token counts as values.

    Side effect (kept from the original): missing values in
    ``data[val_name]`` are replaced in-place with the string 'nan'.
    """
    from sklearn.feature_extraction.text import CountVectorizer

    # .loc assignment replaces the original chained indexing
    # (data[val_name][mask] = ...), which triggers SettingWithCopyWarning
    # and can silently write to a temporary copy instead of `data`.
    data.loc[pd.isnull(data[val_name]), val_name] = 'nan'
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=50000)
    val_data = vectorizer.fit_transform(data[val_name])
    df_val = pd.DataFrame(val_data.toarray())
    # get_feature_names() was removed in scikit-learn 1.2; prefer the new
    # API and fall back for older installations.
    try:
        df_val.columns = vectorizer.get_feature_names_out()
    except AttributeError:
        df_val.columns = vectorizer.get_feature_names()
    df_val.index = data.index
    return df_val
import pandas as pd
# Token-count matrix over the raw Genre strings (one column per genre word)
df_genres = string_to_vector(df2, "Genre")
df_genres.head(10000)
df_new = pd.concat([df2, df_genres], axis=1)
df_new.head()
# Keep only Genre plus the vectorized genre columns as the label frame
meta_cols = ['imdbId', 'Title', 'Poster', 'year',
             'maingenre', 'genre2', 'genre3']
df_new2 = df_new.drop(meta_cols, axis=1)
df_new2
# Multi-label target: every column except the raw Genre string
y = df_new2.drop(['Genre'], axis=1)
y
y = y.to_numpy()
y.shape
# Train-test split: 80/20, fixed seed so the split is reproducible
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2)
X_train[0].shape  # shape of one image
y_train[0].shape  # shape of one genre label vector
y
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
# full model: four conv blocks -> flatten -> two dense blocks -> output.
# Each conv block is Conv2D -> BatchNorm -> MaxPool -> Dropout; filters
# double per block while dropout ramps up to fight overfitting.
model = Sequential()
conv_blocks = [(16, 0.3), (32, 0.3), (64, 0.4), (128, 0.5)]
for block_idx, (n_filters, drop_rate) in enumerate(conv_blocks):
    if block_idx == 0:
        # first layer fixes the input shape (300 x 350 RGB posters)
        model.add(Conv2D(n_filters, (3, 3), activation='relu',
                         input_shape=(300, 350, 3)))
    else:
        model.add(Conv2D(n_filters, (3, 3), activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(2, 2))
    model.add(Dropout(drop_rate))
model.add(Flatten())
for _ in range(2):
    model.add(Dense(128, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
# sigmoid (not softmax): the 27 genre outputs are independent labels,
# paired with binary cross-entropy below for multi-label classification
model.add(Dense(27, activation='sigmoid'))
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])
history = model.fit(X_train, y_train, epochs=5,
                    validation_data=(X_test, y_test))
def plot_learningCurve(history, epoch):
    """Plot training/validation accuracy and loss over epochs.

    Parameters
    ----------
    history : keras History object returned by model.fit
    epoch : int, number of epochs trained (x-axis length)
    """
    epoch_range = range(1, epoch + 1)
    # Keras renamed the metric key 'acc' -> 'accuracy' in TF 2.x; support
    # both so the function works regardless of the installed version.
    acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
    # Plot training & validation accuracy values
    plt.plot(epoch_range, history.history[acc_key])
    plt.plot(epoch_range, history.history['val_' + acc_key])
    plt.title('Model accuracy')
    plt.ylabel('Accuracy')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'TEST'], loc='upper left')
    plt.show()
    # Plot training & validation loss values
    plt.plot(epoch_range, history.history['loss'])
    plt.plot(epoch_range, history.history['val_loss'])
    plt.title('Model loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(['Train', 'TEST'], loc='upper left')
    plt.show()

plot_learningCurve(history, 5)
def _show_top5_prediction(poster_path):
    """Display a poster image and print the model's 5 most probable genres.

    The printed output is identical to the original per-poster cells:
    the full class array, then one 'genre (prob)' line per top-5 genre.
    """
    img = image.load_img(poster_path,
                         target_size=(img_width, img_height, 3))
    plt.imshow(img)
    arr = image.img_to_array(img) / 255.0
    # add the batch dimension the model expects
    arr = arr.reshape(1, img_width, img_height, 3)
    # column 0 of df_new2 is the raw 'Genre' string; the rest are the labels
    classes = np.array(df_new2.columns[1:])
    print(classes)
    y_prob = model.predict(arr)
    # indices of the 5 largest probabilities, in descending order
    top5 = np.argsort(y_prob[0])[:-6:-1]
    for idx in top5:
        print('{}'.format(classes[idx]) + ' ({:.3})'.format(y_prob[0][idx]))

# successful case
_show_top5_prediction('predict/test2.jpg')
# successful case
_show_top5_prediction('predict/test7.jpg')
# semi successful case
_show_top5_prediction('predict/test11.jpg')
# case that failed
_show_top5_prediction('predict/test17.jpg')