Data Visualization - Python Project

Analysis of IMDb Movie Dataset (1920-2020)

This data set, found on Kaggle, lists the top 1000 movies and TV shows of the past 100 years. This analysis explores trends in directors, IMDb scores, and genre popularity.

Horizontal Bar Chart of the Top 10 IMDb Scores of All Time

Looking at the top 10 IMDb scores of all time, the rarity of a score greater than or equal to 9.0 is extremely pronounced: only 5 movies have ever attained a score at that level.

#imports
import pandas as pd
import numpy as np
from statistics import mean
import string
import wget
import os
import matplotlib.pyplot as plt
import warnings
import matplotlib.patches as mpatches

#location of the data set on the local machine
path = "C:/Users/Perry/OneDrive - University of Pittsburgh/Desktop/Loyola Classes/DS 736/IMDB Movies Dataset/"
filename = f"{path}imdb_top_1000.csv"

#reading into data frame, loading only the columns used by the visuals
df = pd.read_csv(
    filename,
    usecols=[
        'Series_Title',
        'Released_Year',
        'IMDB_Rating',
        'Meta_score',
        'Director',
        'No_of_Votes',
        'Gross',
        'Genre',
    ],
)

#separate data frame ordered from the highest IMDb rating to the lowest
#(sort_values returns a new frame, so df itself keeps its original order)
imdb = df.sort_values('IMDB_Rating', ascending=False)

#the default cut-offs (top 10 percent and then 60 percent) are the values that were
#hard coded for the top 10 graph so that the colors are represented correctly
def pick_colors_imdb_t10(data, top_pct=0.10, mid_pct=0.60,
                         palette=('gold', 'silver', 'peru')):
    """Return one color per row of *data*, tiered by row position.

    Only ``len(data)`` is used; the values in *data* are never read, so the
    rows are expected to already be in the order they will be drawn.

    Args:
        data: Any sized object (data frame, list, ...).
        top_pct: Rows whose position / len(data) is below this fraction get
            the first palette color.
        mid_pct: Remaining rows whose position / len(data) is below this
            fraction get the second palette color; all others get the third.
        palette: The three colors for the top, middle and bottom tier.

    Returns:
        A list of ``len(data)`` colors (empty when *data* is empty).
    """
    top_color, mid_color, low_color = palette
    n = len(data)
    colors = []
    # range(n) is empty when n == 0, so the division below never sees a zero.
    for position in range(n):
        pct = position / n
        if pct < top_pct:
            colors.append(top_color)
        elif pct < mid_pct:
            colors.append(mid_color)
        else:
            colors.append(low_color)
    return colors

# Creating a bar graph to show the ratings for top 10
# imdb keeps its original row labels after sorting, so the rows are taken by
# position: a label slice such as .loc[0:9] returns everything between labels
# 0 and 9 in sorted order, which is not guaranteed to be exactly 10 rows.
# .copy() makes t10 an independent frame so the title edit below does not
# write into a slice of imdb (avoids SettingWithCopyWarning).
t10 = imdb.head(10).copy()

#editing the longer names: continue the title on a new line after a colon
t10['Series_Title'] = t10['Series_Title'].str.replace(r':\s*', ':\n', regex=True)

colors = pick_colors_imdb_t10(t10)


#legend text is built from the colors actually assigned so it always matches the bars
def _rank_label(bar_colors, color):
    """Return 'Top a-b' for the 1-based bar positions drawn in *color*."""
    ranks = [rank for rank, bar_color in enumerate(bar_colors, start=1) if bar_color == color]
    if not ranks:
        return 'Not shown'
    if ranks[0] == ranks[-1]:
        return f'Top {ranks[0]}'
    return f'Top {ranks[0]}-{ranks[-1]}'


Top = mpatches.Patch(color='gold', label=_rank_label(colors, 'gold'))
Middle = mpatches.Patch(color='silver', label=_rank_label(colors, 'silver'))
Bottom = mpatches.Patch(color='peru', label=_rank_label(colors, 'peru'))

fig1 = plt.figure(figsize=(23,12))
fig1.suptitle('Top 10 IMDB Ratings for Movies of All Time', fontsize=24, fontweight='bold')

ax1 = fig1.add_subplot(1,1,1)

ax1.barh(t10.Series_Title, t10.IMDB_Rating, label='IMDB Rating', color=colors)
#the x axis starts at 7 so the differences between the ratings are visible
ax1.set_xlim(7, 10)

ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)

#writing each rating just past the end of its bar
for row_counter, row_counter_value in enumerate(t10.IMDB_Rating):
    ax1.text(row_counter_value + 0.05, row_counter, str(round(row_counter_value, 2)), color='black',
             fontsize=14, fontweight='bold', ha='center', va='bottom')

ax1.legend(loc='lower right', handles=[Top, Middle, Bottom], fontsize=14)

plt.yticks(fontsize=14)

plt.xticks(fontsize=14)

#highest rating at the top of the chart
ax1.invert_yaxis()

plt.show()

Scatter Plot of Movies Created by Top 10 Directors Over Time

The scatter plot shows an interesting trend among the top directors: they very rarely work on more than 1 movie per year. In only 3 instances did a director work on 2 movies in the same year: Alfred Hitchcock once and Steven Spielberg twice. One important note about this data is that the year is the release year, so it is very possible that these directors were working on multiple movies at once but only finished production on one per year.

#count the titles per director and keep the ten directors that show up the most
director_counts = imdb['Director'].value_counts()
top_directors = director_counts.head(10).index

#rows of imdb that belong to one of those directors, copied into their own data frame
in_top_directors = imdb['Director'].isin(top_directors)
df_top_directors = imdb[in_top_directors].copy()

#number of titles for every director and release year combination
df_agg = (
    df_top_directors
    .groupby(['Director', 'Released_Year'])
    .size()
    .reset_index(name='Movie_Count')
)

#a newline in place of every space stacks the name so it fits on the x axis without rotation
df_agg['Director_label'] = df_agg['Director'].str.replace(' ', '\n')

#y axis ticks: every tenth year starting at the decade of the earliest release
min_year = df_agg['Released_Year'].min()
max_year = df_agg['Released_Year'].max()
first_decade = min_year - min_year % 10
decades = list(range(first_decade, max_year + 10, 10))


#creating scatter plot
fig2, ax2 = plt.subplots(figsize=(18, 10))
fig2.suptitle('Directors', fontsize=28, fontweight='bold')

#both the marker size and the marker color follow the number of titles in that year
movie_counts = df_agg['Movie_Count']
scatter = ax2.scatter(
    x=df_agg['Director_label'],
    y=df_agg['Released_Year'],
    marker='8',
    c=movie_counts,
    s=movie_counts * 300,
    cmap='summer',
    edgecolor='black',
)

ax2.set_ylabel('Year', fontsize=18)
ax2.set_xlabel('Director', fontsize=18, labelpad=15)
ax2.set_title('Movies by Top 10 Most Common Directors Over Time', fontsize=28, fontweight='bold')

plt.xticks(rotation=0, ha='center', fontsize=14, color='black')

plt.yticks(decades, fontsize=14, color='black')

#color bar explaining the marker colors
cbar = plt.colorbar(scatter, ax=ax2)
cbar.set_label('Number of Movies', rotation=270, fontsize=14, labelpad=30)

plt.show()

Line Plot of Average IMDb Score Over Time

The trend in the average IMDb score over time shows that movies have become more consistent as a whole in the past 20 years, hovering around an average score of 7.9. In previous decades there was a lot of volatility in the average IMDb rating, with a notable spike in the late 1930s going from ~7.8 up to ~8.5 and back down to ~7.9 two years later.

# line graph
#data selection: the mean IMDb rating of every release year, one row per year
yearly_mean = df.groupby('Released_Year')['IMDB_Rating'].mean()
rating_trend = yearly_mean.reset_index()

years = rating_trend['Released_Year']
ratings = rating_trend['IMDB_Rating']

#imdb score limits
y_min, y_max = ratings.min(), ratings.max()

#decade limits: start of the first decade and start of the decade after the last one
x_min = (years.min() // 10) * 10
x_max = ((years.max() // 10) + 1) * 10

#creating linegraph visual
fig3, ax = plt.subplots(figsize=(18, 10))

ax.plot(years, ratings, marker='8', label='Average IMDb Rating', color='green', linewidth=2)

ax.set_title('Average IMDb Rating of Movies by Year', fontsize=18)
ax.set_xlabel('Release Year', fontsize=14)
ax.set_ylabel('Average IMDb Rating', fontsize=14)

#same tick label settings on both axes
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=14, rotation=0)

#a little room above the highest and below the lowest average
ax.set_ylim(y_min - 0.1, y_max + 0.1)

#one tick per decade
ax.set_xticks(np.arange(x_min, x_max, 10))

ax.legend(loc='best', fontsize=14)

plt.show()

Donut Chart of Top 10 Movie Genres in the Top 1000 IMDb Rated Movies

At first glance at this graphic, you may question how there could be 2,103 movies in a list of 1000 movies. The reason is that many movies are listed under multiple genres. When the genre column was split into its individual genres, each movie was counted once per genre, so a movie with 4 genres counts as 4 movies. Notably, 50% of movies have a genre of Drama, Comedy, or Crime.

#data cleaning for the genre column
#the genres are comma-separated: turn each entry into a list, give every genre
#its own row, and trim the whitespace left around each genre name
df['Genre_List'] = df['Genre'].str.split(',')
df = df.explode('Genre_List')
df['Genre_List'] = df['Genre_List'].str.strip()

#create decade column (release year rounded down to a multiple of 10)
decade_number = df['Released_Year'] // 10
df['Decade'] = decade_number * 10

#the ten genres with the most rows, and the number of rows they cover together
genre_counts_all = df['Genre_List'].value_counts()
top_genres = genre_counts_all.head(10)

total_movies = top_genres.sum()

#creating Donut visual
fig4, ax = plt.subplots(figsize=(10, 10))

#colors for Donut: one tab10 color per genre
colormap = plt.get_cmap("tab10")
colors = colormap(np.arange(top_genres.size))

#rows that belong to one of the top genres, counted per genre
in_top_genres = df['Genre_List'].isin(top_genres.index)
df_top = df[in_top_genres]
genre_sizes = df_top.groupby('Genre_List').size()


#wedge text: the percentage and, below it, the count that percentage stands for
def _pct_and_count(x):
    return '{:.1f}%\n({:,})'.format(x, int(round((x/100) * top_genres.sum())))


#Donut Chart, drawn on the axes created above
genre_sizes.plot(
    kind='pie',
    ax=ax,
    radius=1,
    colors=colors,
    pctdistance=0.85,
    labeldistance=1.1,
    wedgeprops=dict(edgecolor='w'),
    textprops={'fontsize':14},
    autopct=_pct_and_count,
    startangle=90,
)

#adding hole in the middle by covering the center with a white circle
hole = plt.Circle((0,0), 0.3, fc='white')
figh = plt.gcf()
figh.gca().add_artist(hole)

#adding text in middle
ax.text(0, 0, f'Total Movies \n{total_movies:,}', ha='center', va='center', fontsize=14)

#getting rid of y axis
ax.yaxis.set_visible(False)

#adjusting labels and adding title
ax.set_title('Top 10 Genres on IMDb List \n Percent of Total Movies and Number of Movies on IMDb Top 1000', fontsize=18)
ax.axis('equal')

plt.tight_layout()

plt.show()

Bump Chart of Most Popular Genre Over the Decades

Following the trend from the most popular genres overall, Drama ranks as the most popular genre in every decade. Genres such as Biographies, Animation, and Mystery are consistently lower in the rankings. Some of the more inconsistently popular genres are Comedy and Romance. A note on this data: there are very few movies in the 2020 decade because the data set includes only one year from it.

# bump chart
#data cleaning into df
#the ten genres with the most rows
top10_genre = df['Genre_List'].value_counts().head(10).index

df_top10_genre = df[df['Genre_List'].isin(top10_genre)]

#number of rows per decade and genre
genre_counts = (df_top10_genre.groupby(['Decade', 'Genre_List']).size().reset_index(name='Count'))

#one row per decade, one column per genre; a genre missing from a decade counts as 0
bump_df = genre_counts.pivot(index='Decade', columns='Genre_List', values='Count')
bump_df = bump_df.fillna(0)

#rank the genres within each decade (axis=1): the largest count gets rank 1 and
#method='first' breaks ties by column order so every rank is used exactly once
bump_df_ranked = bump_df.rank(axis=1, ascending=False, method='first')

#visual
fig5 = plt.figure(figsize=(20,10))
ax = fig5.add_subplot(1, 1, 1)

#the basic bump plot: large white markers leave room for the counts written further down
bump_df_ranked.plot(kind='line', ax=ax, marker='o', markeredgewidth=1, linewidth=5, markersize=40, markerfacecolor='white')

ax.invert_yaxis()  # rank 1 at top

num_cols = bump_df_ranked.shape[1]

#titles
plt.title('Top 10 Genre Rankings of Total Movies by Decade', fontsize=22, pad=15)
plt.xlabel('Decade', fontsize=22)
plt.ylabel('Ranking', fontsize=22)
plt.xticks(bump_df_ranked.index, bump_df_ranked.index, fontsize=18)

plt.yticks(range(1, num_cols+1, 1), fontsize=18)

#legend
#entries are listed in reverse plotting order; slicing works for any number of
#genre lines, where fixed indices 9..0 would fail with fewer than ten
handles, labels = ax.get_legend_handles_labels()
handles = handles[::-1]
labels = labels[::-1]
ax.legend(handles, labels, bbox_to_anchor=(1.01, 1.01), fontsize=14, labelspacing=1, markerscale=.4, borderpad=1, handletextpad=0.8)

#data in circles: write each genre's count for the decade inside its marker
for genre in bump_df_ranked.columns:
    for decade in bump_df_ranked.index:
        rank = bump_df_ranked.loc[decade, genre]
        value = bump_df.loc[decade, genre]
        ax.text(decade, rank, str(int(value)), ha='center', va='center', fontsize=14)

plt.show()