"Rain Man", 1987
# Dependencies
%matplotlib inline
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
from numpy import NaN
import pandas as pd
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Silence pandas' chained-assignment warning for the rest of the notebook
# (the pandas default for this option is 'warn').
pd.options.mode.chained_assignment = None

# All four IMDB dumps are tab-separated and are decoded as 'utf-8-sig',
# so the reads share one set of parser options.
imdb_tsv_options = {'delimiter': '\t', 'encoding': 'utf-8-sig'}

# IMDB: one row per title (movies, shorts, episodes, ...)
title_basics = pd.read_csv('../data/title.basics.tsv', low_memory=False, **imdb_tsv_options)
# IMDB: average rating and vote count per title
title_ratings = pd.read_csv('../data/title.ratings.tsv', **imdb_tsv_options)
# IMDB: crew per title
title_crew = pd.read_csv('../data/title.crew.tsv', **imdb_tsv_options)
# IMDB: basic information per person
name_basics = pd.read_csv('../data/name.basics.tsv', **imdb_tsv_options)

# BoxOfficeMojo: box office data
box_office = pd.read_csv('../data/boxoffice.csv')
# Oscar data (https://en.wikipedia.org/wiki/List_of_Academy_Award-winning_films)
oscars = pd.read_csv('../data/oscars_cleaned.csv')
# Keep only rows that are movies AND not flagged as adult
is_movie = title_basics['titleType'] == 'movie'
title_filtered = title_basics[is_movie]
not_adult = title_filtered['isAdult'] == 0
title_filtered = title_filtered[not_adult]

# The comma-separated genres string becomes up to three columns (0, 1, 2),
# which are then attached to the filtered titles by index.
genres_split = title_filtered["genres"].str.split(",", n=2, expand=True)
joined = title_filtered.join(genres_split)

# Friendlier column names; the integer keys are the genre columns created above
column_names = {
    'tconst': 'IMDB ID',
    'titleType': 'Type',
    'primaryTitle': 'Title',
    'originalTitle': 'Title (original)',
    'startYear': 'Year',
    'runtimeMinutes': 'Runtime (min)',
    0: 'Genre (main)',
    1: 'Genre (sub 1)',
    2: 'Genre (sub 2)',
}
unused_columns = ['endYear', 'genres', 'Genre (sub 1)', 'Genre (sub 2)', 'Title (original)']
cleaned = joined.rename(columns=column_names)
cleaned = cleaned.drop(columns=unused_columns)

# Attach ratings by IMDB title id (left join: titles without ratings are kept)
ratings_by_id = title_ratings.set_index('tconst')
with_ratings = cleaned.set_index('IMDB ID').join(ratings_by_id)
with_ratings = with_ratings.rename(
    columns={'averageRating': 'Rating (avg.)', 'numVotes': 'Votes'}
)
# Merge box office and Oscars on the film title. The outer join keeps films
# that appear in only one of the two sources.
merged = pd.merge(box_office, oscars, left_on='title', right_on='Film', how='outer')

# Attach box office / Oscar data to the IMDB set (left join keeps every IMDB row).
# NOTE(review): rows are matched on title text only, so distinct films that share
# a title would receive the same box office / Oscar record -- confirm whether
# the release year should be part of the join key.
combined = with_ratings.merge(merged, how='left', left_on='Title', right_on='title')

# Drop helper/duplicate columns and give the remaining ones consistent names
combined = combined.drop(['Type', 'isAdult', 'Year_y', 'year', 'rank', 'title', 'Film'], axis=1)
combined = combined.rename(columns={'Year_x': 'Year', 'studio': 'Studio', 'lifetime_gross': 'Gross (lifetime)'})

# Cells holding the literal text \N mark missing values. Convert them to a real
# NaN (not the string 'NaN') so that dropna() recognises them in every column;
# previously a missing Year slipped through and only surfaced as a float NaN
# after the type conversion below.
combined = combined.replace('\\N', np.nan)
combined = combined.dropna(subset=['Runtime (min)', 'Genre (main)'])

# A film without an Oscar record has zero awards and zero nominations
combined['Awards'] = combined['Awards'].fillna(0)
combined['Nominations'] = combined['Nominations'].fillna(0)

# Drop rows we still lack data for (e.g. Studio or Gross (lifetime)).
# .copy() gives an independent frame, so later column assignments on `dropped`
# are not chained assignments on a slice of `combined`.
dropped = combined.dropna(axis='rows').copy()

# Numeric dtypes for everything the plots and models consume
dropped = dropped.astype({
    'Year': float,
    'Runtime (min)': float,
    'Votes': int,
    'Awards': int,
    'Nominations': int,
    'Gross (lifetime)': float,
})
# Distinct main genres present after cleaning (displayed as cell output)
genre_list = dropped['Genre (main)'].unique()
genre_list

# Genres that receive a numeric code; the code is the 1-based position in this list
coded_genres = [
    'Action', 'Adventure', 'Biography', 'Comedy', 'Crime', 'Drama', 'Family',
    'Fantasy', 'Film-Noir', 'History', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western',
]
genre_column = dropped['Genre (main)']
for genre_code, genre_pattern in enumerate(coded_genres, start=1):
    # Each name is applied as a regex pattern, one genre at a time, in list order
    genre_column = genre_column.replace(genre_pattern, genre_code, regex=True)
dropped['Genre (main)'] = genre_column

# Studio is not used from here on
dropped = dropped.drop(columns=['Studio'])

# These three genres were not given a code above, so they are still text; remove them
uncoded_genres = ['Animation', 'Documentary', 'Music']
dropped = dropped[~dropped['Genre (main)'].isin(uncoded_genres)]

dropped.head()
dropped.count()
"Sneakers", 1992
# Reset index to avoid sklearn problems: rows get a fresh 0..n-1 index and the
# old row labels are discarded.
dropped = dropped.reset_index(drop=True)

# (x column, y column) pairs to inspect visually for a linear trend, all movies
column_pairs = [
    ('Rating (avg.)', 'Votes'),
    ('Year', 'Rating (avg.)'),
    ('Year', 'Gross (lifetime)'),
    ('Year', 'Nominations'),
    ('Year', 'Awards'),
]
for x_col, y_col in column_pairs:
    # Sklearn requires a two-dimensional array of values, hence reshape(-1, 1)
    X = dropped[x_col].values.reshape(-1, 1)
    y = dropped[y_col].values.reshape(-1, 1)
    print("Shape: ", X.shape, y.shape)

    # One figure per pair: without this, every scatter lands on the same axes
    # when the code runs top to bottom, and each label overwrites the last.
    plt.figure()
    plt.scatter(X, y)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()
# Filter to one genre: 6 is the numeric code assigned to Drama
drama = dropped[dropped['Genre (main)'] == 6]
# Reset index to avoid sklearn problems (old row labels are discarded)
drama = drama.reset_index(drop=True)
drama.count()

# (x column, y column) pairs to inspect visually for a linear trend, Drama only
column_pairs = [
    ('Rating (avg.)', 'Votes'),
    ('Year', 'Rating (avg.)'),
    ('Year', 'Gross (lifetime)'),
    ('Year', 'Nominations'),
    ('Year', 'Awards'),
]
for x_col, y_col in column_pairs:
    # Sklearn requires a two-dimensional array of values, hence reshape(-1, 1)
    X = drama[x_col].values.reshape(-1, 1)
    y = drama[y_col].values.reshape(-1, 1)
    print("Shape: ", X.shape, y.shape)

    # One figure per pair: without this, every scatter lands on the same axes
    # when the code runs top to bottom, and each label overwrites the last.
    plt.figure()
    plt.scatter(X, y)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    plt.show()
"Star Trek", 1967
# Each entry describes one 3D scatter: (source frame, y-axis column, z-axis column).
# The x axis is always 'Year'.
scatter_specs = [
    (drama, 'Runtime (min)', 'Gross (lifetime)'),
    (drama, 'Runtime (min)', 'Rating (avg.)'),
    (dropped, 'Genre (main)', 'Rating (avg.)'),
]
for frame, y_col, z_col in scatter_specs:
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    x = frame['Year']
    y = frame[y_col]
    z = frame[z_col]
    ax.scatter(x, y, z, c='b')

    # Axis labels are the column names themselves
    ax.set_xlabel('Year')
    ax.set_ylabel(y_col)
    ax.set_zlabel(z_col)

    plt.show()
"Tron", 1982
# Single-feature linear regressions on the Drama subset: predict Votes from
# Rating (avg.), then from Awards. Both runs follow the same steps.
target_col = 'Votes'
for feature_col in ('Rating (avg.)', 'Awards'):
    print(f"Feature: {feature_col} -> Target: {target_col}")

    # Drop NaN BEFORE splitting, and from both columns at once, so the model
    # never sees missing values and X/y rows stay aligned. (Masking X and y
    # separately after the split had no effect on the fitted data.)
    pair = drama[[feature_col, target_col]].dropna()

    # Sklearn requires a two-dimensional array of values, hence reshape(-1, 1)
    X = pair[feature_col].values.reshape(-1, 1)
    y = pair[target_col].values.reshape(-1, 1)

    # Split data into training and testing sets (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    # Create the model and fit it to the training data only
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Mean squared error and r-squared on the held-out testing data
    predicted = model.predict(X_test)
    mse = mean_squared_error(y_test, predicted)
    r2 = r2_score(y_test, predicted)
    print(f"Mean Squared Error (MSE): {mse}")
    print(f"R-squared (R2): {r2}")

    # R-squared on both splits, to compare fit on seen vs. unseen data
    training_score = model.score(X_train, y_train)
    testing_score = model.score(X_test, y_test)
    print(f"Training score: {training_score}")
    print(f"Testing score: {testing_score}")

    # Residual plot (prediction minus actual) on its own figure
    train_predicted = model.predict(X_train)
    plt.figure()
    plt.scatter(train_predicted, train_predicted - y_train, c="blue", label="Training data")
    plt.scatter(predicted, predicted - y_test, c="red", label="Testing data")
    plt.legend()
    plt.title("Residual plot")
    plt.show()
"Spaceballs", 1987
# Reference: https://towardsdatascience.com/a-beginners-guide-to-linear-regression-in-python-with-scikit-learn-83a8f7ae2b4f
dropped.head()

# Multiple linear regression: predict lifetime gross from three features
feature_cols = ['Runtime (min)', 'Genre (main)', 'Rating (avg.)']
X = dropped[feature_cols].values
y = dropped['Gross (lifetime)'].values

# Look at the distribution of the Gross (lifetime) column
plt.figure()
seabornInstance.distplot(dropped['Gross (lifetime)'])

# Split 80% of the data to the training set and 20% as test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit on the training split only, so the test split stays unseen. Fitting on
# the full X, y would mean the test rows were part of the training data.
model = LinearRegression()
model.fit(X_train, y_train)
training_score = model.score(X_train, y_train)
score = model.score(X_test, y_test)
print(f"Training R2 Score: {training_score}")
print(f"R2 Score: {score}")

# Use a residual plot to check predictions (since it is difficult to plot a line
# in 3D space). A new figure keeps it off the distribution plot above.
predictions = model.predict(X)
plt.figure()
plt.scatter(predictions, predictions - y)
plt.hlines(y=0, xmin=predictions.min(), xmax=predictions.max())
plt.show()

# Do prediction on test data
y_pred = model.predict(X_test)

# Look at the differences between actual and predicted gross
differences = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
differences.head()
Additional potential analyses
Refine/refactor
More inspiring words: