Movie Industry Analysis: A Beginner Data Science Project

Image for post
Image for post
Photo by Jake Hills on Unsplash

The Project

The Data

The Process

import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load every raw dataset from the zippedData directory into its own DataFrame.
# Each file is gzip-compressed; one read per line keeps the statements valid
# Python (they were previously fused onto a single line).

# bom.* file
movie_gross_df = pd.read_csv('zippedData/bom.movie_gross.csv.gz', compression='gzip')

# imdb.* files
# NOTE: despite its name, movie_basics_df is read from imdb.name.basics,
# while movie_title_basics_df is read from imdb.title.basics.
movie_basics_df = pd.read_csv('zippedData/imdb.name.basics.csv.gz', compression='gzip')
movie_title_akas_df = pd.read_csv('zippedData/imdb.title.akas.csv.gz', compression='gzip')
movie_title_basics_df = pd.read_csv('zippedData/imdb.title.basics.csv.gz', compression='gzip')
movie_title_crew_df = pd.read_csv('zippedData/imdb.title.crew.csv.gz', compression='gzip')
movie_title_principals_df = pd.read_csv('zippedData/imdb.title.principals.csv.gz', compression='gzip')
movie_title_ratings_df = pd.read_csv('zippedData/imdb.title.ratings.csv.gz', compression='gzip')

# rt.* files are tab-separated; the reviews file is additionally decoded as cp1252.
movie_info_df = pd.read_csv('zippedData/rt.movie_info.tsv.gz', compression='gzip', sep='\t')
movie_reviews_df = pd.read_csv(
    'zippedData/rt.reviews.tsv.gz', compression='gzip', sep='\t', encoding='cp1252'
)

# tmdb.* and tn.* files
tmdb_movies_df = pd.read_csv('zippedData/tmdb.movies.csv.gz', compression='gzip')
movie_budgets_df = pd.read_csv('zippedData/tn.movie_budgets.csv.gz', compression='gzip')
# Every loaded DataFrame, paired position-by-position with a readable label in
# df_names. The two lists must stay in the same order.
all_dfs = [
    movie_gross_df, movie_basics_df, movie_title_akas_df, movie_title_basics_df,
    movie_title_crew_df, movie_title_principals_df, movie_title_ratings_df, movie_info_df,
    movie_reviews_df, tmdb_movies_df, movie_budgets_df,
]
df_names = [
    'movie_gross', 'movie_basics', 'movie_title_akas', 'movie_title_basics',
    'movie_title_crew', 'movie_title_principals', 'movie_title_ratings', 'movie_info',
    'movie_reviews', 'tmdb_movies', 'movie_budgets',
]

# Print a labelled overview (column summary + first rows) of each dataset.
for name, df in zip(df_names, all_dfs):
    print(name, '\n')
    # DataFrame.info() prints its summary itself and returns None, so it is
    # called directly rather than wrapped in display(), which would only add
    # a stray "None" to the output.
    df.info()
    display(df.head())
Image for post
Image for post
# Summarise the gross table's columns, then preview its first rows.
# info() prints directly and returns None, so building a tuple of
# (info(), head()) only yields "(None, ...)"; show the preview explicitly
# with display(), as the overview loop earlier in the notebook does.
movie_gross_df.info()
display(movie_gross_df.head())
# Drop columns that are not used in the rest of the analysis.
# Each drop mutates its DataFrame in place; one statement per line
# (they were previously fused onto a single line).
movie_basics_df.drop(['birth_year', 'death_year'], axis=1, inplace=True)
movie_title_akas_df.drop(['language', 'attributes', 'types'], axis=1, inplace=True)
movie_title_principals_df.drop(['job', 'characters'], axis=1, inplace=True)
# Clean the two gross columns of movie_gross_df.
#
# Results are assigned back to the DataFrame instead of calling
# fillna/replace with inplace=True on a selected column: that chained
# in-place form operates on an intermediate object and does not update the
# parent frame under pandas Copy-on-Write.

# Fill missing domestic grosses with the column's own median.
movie_gross_df['domestic_gross'] = movie_gross_df['domestic_gross'].fillna(
    movie_gross_df['domestic_gross'].median()
)

# Remove extraneous commas, then change foreign_gross to float64.
# This must happen before imputing: the median can only be computed once the
# column is numeric.
foreign_gross = movie_gross_df['foreign_gross'].replace(',', '', regex=True)
foreign_gross = foreign_gross.astype('float64')

# Fill missing foreign grosses with the foreign median (previously the
# domestic median was used here, which mixes two different distributions).
movie_gross_df['foreign_gross'] = foreign_gross.fillna(foreign_gross.median())
# Build one combined table by chaining three inner joins; rows without a
# match on either side are dropped at each step.

# 1. Budgets -> title basics, matched on the movie title.
genres_df = movie_budgets_df.merge(
    movie_title_basics_df, left_on='movie', right_on='original_title', how='inner'
)

# 2. Add crew information via the shared tconst key.
crew_df = genres_df.merge(
    movie_title_crew_df, left_on='tconst', right_on='tconst', how='inner'
)

# 3. Attach the name record whose nconst equals the title's directors value.
# NOTE(review): if `directors` can hold several ids in one cell, this equality
# join only matches titles with a single director — confirm against the data.
combined_df = crew_df.merge(
    movie_basics_df, left_on='directors', right_on='nconst', how='inner'
)
# Plot net profit vs. count of movies directed
plt.figure(figsize=(12, 8))
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and later made them keyword-only, so the positional form fails on current
# releases.
sns.boxplot(x=top_directors_df.movie, y=top_directors_df.net_profit)
plt.title('Net Profit vs. Movies Directed', fontsize=16)
plt.xlabel('Total Movies Directed')
plt.ylabel('Net Profit (in billions)')
plt.tight_layout()
plt.show()
Image for post
Image for post
# Plot bar graph of average profit per director
plt.figure(figsize=(12, 8))
# x and y are passed by keyword: seaborn deprecated positional x/y in 0.11
# and later made them keyword-only, so the positional form fails on current
# releases.
sns.barplot(x=four_or_more_movies.key_0, y=four_or_more_movies.average_profit)
# Slant the tick labels so they do not overlap.
plt.xticks(rotation=35, horizontalalignment='right')
# Blank out the x-axis label explicitly; the tick labels already identify
# each bar.
plt.xlabel('')
plt.ylabel('Average Profit (in 100 millions)')
plt.title('Top 25 Directors with Four or More Movies', fontsize=16)
plt.tight_layout()
plt.show()
Image for post
Image for post

Aspiring Data Scientist, recent graduate of Flatiron School’s online data science program.

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store