##By: Joseph Carnovale
##Introduction
The dataset chosen for this analysis is titled Movie dataset on kaggle. It details movies that were released between 1980 and 2019. Variables include: Name of Movie, genre, maturity rating, score, director, writer, release year, and star actor. The aim of this analysis is to determine which genre of movie has been the most popular throughout the years and whether there is any correlation between the most popular genre and the highest grossing films.
##Vis 1
This visualization depicts the top 20 highest grossing films of all time. The immediate standouts are Avatar and Avengers: Endgame (no surprise there). One interesting observation is that over half of the top twenty movies fall below the average. It seems that the top 5 movies account for a large amount of the total revenue brought in by the top twenty movies. With many of the top movies being action movies, I am interested to see if action movies are the dominant genre in future visualizations.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
# Absolute (local Windows) path of the movies CSV; both names refer to it.
filename = path = r"C:\Users\Joe\OneDrive\Desktop\ST473data\CSV_FIles\movie data\movies.csv"

# Load every movie, then rank the rows from highest to lowest gross revenue.
revenuedf = pd.read_csv(filename)
revenuedf_sorted = revenuedf.sort_values('gross', ascending=False)

# The 20 highest-grossing rows and the mean of their gross revenue.
top_20_movies = revenuedf_sorted.iloc[:20]
avg = top_20_movies['gross'].mean()
def pick_colors_according_to_mean_count(top_20_movies, tolerance=0.01):
    """Return one bar color per row based on its gross relative to the mean.

    Args:
        top_20_movies: Object exposing a numeric ``gross`` attribute that is
            iterable and has a ``mean()`` method (e.g. a pandas DataFrame
            with a ``gross`` column).
        tolerance: Half-width of the "at the average" band, expressed as a
            fraction of the mean. Defaults to 0.01 (within 1% of the mean).

    Returns:
        A list of color names, one per value of ``gross`` and in the same
        order: ``'lightcoral'`` above the band, ``'green'`` below it and
        ``'black'`` inside it.
    """
    avg = top_20_movies.gross.mean()
    # Scaling a negative mean by (1 + tolerance) moves it *down*, which would
    # swap the two bounds; sorting keeps lower <= upper for either sign.
    lower, upper = sorted((avg * (1 - tolerance), avg * (1 + tolerance)))
    return [
        'lightcoral' if value > upper else 'green' if value < lower else 'black'
        for value in top_20_movies.gross
    ]
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
# One color per bar, chosen by where the movie sits relative to the mean gross.
mycolors1 = pick_colors_according_to_mean_count(top_20_movies)

# Proxy patches so the legend explains the three color categories.
Above = mpatches.Patch(label='Above Average', color='lightcoral')
At = mpatches.Patch(label='Within 1% of the Average', color='black')
Below = mpatches.Patch(label='Below Average', color='green')

fig = plt.figure(figsize=(18, 16))
fig.suptitle('Gross Revenue Analysis of top 20 Movies')

# Only the first cell of the 2x1 grid is populated.
ax1 = fig.add_subplot(2, 1, 1)
ax1.bar(top_20_movies.name, top_20_movies.gross, color=mycolors1, label='Gross')
ax1.legend(fontsize=14, handles=[Above, At, Below])

# Dashed horizontal reference line at the mean gross of the plotted movies.
plt.axhline(top_20_movies.gross.mean(), linestyle='dashed', color='black')

# Hide the right and top borders of the plot area.
for side in ('right', 'top'):
    ax1.spines[side].set_visible(False)

# Vertical movie names on the x-axis.
ax1.set_xticklabels(top_20_movies.name, ha='center', rotation=90)
##Vis 2
This visualization further explores the top 20 movies using a dual bar chart. In addition to gross revenue, the critic score is given for each movie. Interestingly enough, there is not much difference between ratings, and the top grossing movie of all time (Avatar) does not have the highest rating. It seems that a movie making money does not correlate to it being a good movie.
def autolabel(these_bars, this_ax, place_of_decimals, symbol):
    """Write each bar's height as a text label just past the end of the bar.

    Args:
        these_bars: Iterable of bar objects exposing ``get_height()``,
            ``get_x()`` and ``get_width()``.
        this_ax: Object whose ``text()`` method draws the label (the axes
            the bars belong to).
        place_of_decimals: Format spec applied to the height, e.g. ``'.1f'``;
            an empty string uses the value's default formatting.
        symbol: String prepended to the formatted height, e.g. ``'$'``.
    """
    for each_bar in these_bars:
        height = each_bar.get_height()
        x_center = each_bar.get_x() + each_bar.get_width() / 2
        # A negative bar ends below zero, so its label must hang beneath the
        # tip; anchoring it by the bottom edge would draw it over the bar.
        vertical_anchor = 'top' if height < 0 else 'bottom'
        this_ax.text(
            x_center,
            height * 1.01,
            symbol + format(height, place_of_decimals),
            fontsize=11,
            color='black',
            ha='center',
            va=vertical_anchor,
        )
fig = plt.figure(figsize=(18, 10))
ax1 = fig.add_subplot(1, 1, 1)
# Twin axes: same x-axis, independent y-scale for the scores.
ax2 = ax1.twinx()

bar_width = 0.4
# One slot per plotted movie, sized from the data instead of a literal 20 so
# the chart still works when fewer rows are available.
x_pos = np.arange(len(top_20_movies))

# Each movie gets two bars side by side: gross on the left axis, score on the right.
gross_bars = ax1.bar(x_pos - (0.5 * bar_width), top_20_movies.gross, bar_width,
                     color='grey', edgecolor='black', label='Gross Revenue')
score_bars = ax2.bar(x_pos + (0.5 * bar_width), top_20_movies.score, bar_width,
                     color='green', edgecolor='black', label='Rating')

ax1.set_xlabel('Movie Name', fontsize=16, labelpad=20)
ax1.set_ylabel('Gross Revenue (Billions)', fontsize=16, labelpad=20)
ax2.set_ylabel('IMDB Score', rotation=270, fontsize=16, labelpad=20)
ax1.tick_params(axis='y', labelsize=14)
ax2.tick_params(axis='y', labelsize=14)
plt.title('Gross Revenue and IMDB Score\n Top 20 Most Grossing Movies', fontsize=18)

# Fix the tick positions first, then label them once with all text properties.
ax1.set_xticks(x_pos)
ax1.set_xticklabels(top_20_movies.name, fontsize=14, rotation=90, ha='center')

# Merge the legend entries of both axes into a single legend box.
gross_color, gross_label = ax1.get_legend_handles_labels()
score_color, score_label = ax2.get_legend_handles_labels()
combined_handles = gross_color + score_color
combined_labels = gross_label + score_label
legend = ax1.legend(combined_handles, combined_labels, loc='upper right', frameon=True,
                    ncol=1, borderpad=1, fontsize=14)

# Print each score above its bar (default number formatting, no prefix).
autolabel(score_bars, ax2, '', '')
##Vis 3
In the next visualization we begin to analyze the genres of movies. For this vis the top 8 genres were chosen, as genres further down than the top 8 contributed very little to the overall percentage of movies. Here we can see that comedy movies account for nearly 30% of all movies in the past 40 years, with action coming in second with just over 20%. This leads me to the question of how this trend has changed over time and if any other factors contribute to comedy movies being so popular.
# Movie count per genre, most frequent first; only the top 8 are charted.
genre_counts = revenuedf['genre'].value_counts()
top_8_genres = genre_counts.head(8)
total_movies = top_8_genres.sum()
number_outside_colors = len(genre_counts.index)

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(1, 1, 1)

colormap = plt.get_cmap("tab20c")
# tab20c holds 20 colors laid out as 5 hue groups x 4 shades. A plain stride
# of 4 runs past index 19 once there are more than 5 wedges, and out-of-range
# integer lookups all clamp to the same final color. Cycle through the hue
# groups first and only then step to the next shade, so every wedge is distinct.
wedge_slots = np.arange(len(top_8_genres))
outside_color_ref_number = (wedge_slots % 5) * 4 + (wedge_slots // 5) % 4
outer_colors = colormap(outside_color_ref_number)

top_8_genres.plot(
    kind='pie', ax=ax, radius=1, colors=outer_colors, pctdistance=0.8, labeldistance=1.1,
    wedgeprops=dict(edgecolor='black'), textprops={'fontsize': 18},
    # Show both the share and the movie count it corresponds to.
    autopct=lambda p: '{:.2f}%\n({:.0f} movies)'.format(p, (p / 100) * total_movies)
)

# Shrink only the percentage annotations; the genre labels keep their size.
for text in ax.texts:
    if '%' in text.get_text():
        text.set_fontsize(9)

# White disc in the middle turns the pie into a donut with a total in the hole.
hole = plt.Circle((0, 0), 0.3, fc='white')
ax.add_artist(hole)
ax.text(0, 0, f'Total:\n{total_movies} Movies', ha='center', va='center', fontsize=14, color='black')
plt.title('Total movies per Genre')
plt.show()
##Vis 4
In this visualization we explore the changes in movie trends from 1980-2019 using a multi-line plot. As in the previous visualization, comedy is dominant until around 2005, when action begins to take first place. Another interesting observation is that action, comedy, and drama are all much higher than the other entries in the top 8.
# Names of the eight most frequent genres, and the movies belonging to them.
top_8_genres = revenuedf['genre'].value_counts().head(8).index
top_8_df = revenuedf[revenuedf['genre'].isin(top_8_genres)]

# Long table of (genre, year, movie_count), then reshaped to one row per year
# and one column per genre; missing genre/year pairs become 0.
genre_year_counts = (
    top_8_df.groupby(['genre', 'year'])
    .size()
    .reset_index(name='movie_count')
)
genre_year = genre_year_counts.pivot(
    index='year', columns='genre', values='movie_count'
).fillna(0)

# Fixed line color for each genre.
my_colors = dict(
    Action='red',
    Comedy='blue',
    Drama='purple',
    Crime='gray',
    Biography='green',
    Adventure='gold',
    Animation='orange',
    Horror='black',
)

fig, ax = plt.subplots(figsize=(12, 6))

# One line per genre; any genre without an assigned color falls back to gray.
for genre in top_8_genres:
    line_color = my_colors.get(genre, 'gray')
    ax.plot(genre_year.index, genre_year[genre], marker='o', color=line_color, label=genre)

ax.set_xlabel('Year')
ax.set_ylabel('Number of Movies')
ax.set_title('Number of Movies per Genre by Year (Top 8 Genres)')
ax.legend(title='Genre')

plt.tight_layout()
plt.show()
##Vis 5
This final visualization uses a heatmap to get a more accurate view on the trends movie genres took over the years. It is immediately apparent that comedy was at its most popular in the 1990s with a high in 1994. Action and Drama gradually get more popular starting in the early 2000s while all other genres stay relatively consistent throughout the years.
import seaborn as sns
from matplotlib.ticker import FuncFormatter
# Keep only the 1999-2019 window named in the plot title.
twenty_year_df = revenuedf[(revenuedf['year'] >= 1999) & (revenuedf['year'] <= 2019)]
twenty_year_counts = twenty_year_df.groupby(['genre', 'year']).size().reset_index(name='movie_count')

# The eight genres with the most movies inside that window.
top_8_genres = twenty_year_counts.groupby('genre')['movie_count'].sum().nlargest(8).index

# Build the heatmap from the windowed counts. Using the all-years table from
# the line plot here would draw every year in the dataset and contradict the
# 1999-2019 title.
# NOTE(review): any accompanying text that discusses pre-1999 years was
# written against the unfiltered plot and should be revisited.
top_8_genres_df = twenty_year_counts[twenty_year_counts['genre'].isin(top_8_genres)]
hm_df = pd.pivot_table(top_8_genres_df, index='genre', columns='year', values='movie_count')

fig = plt.figure(figsize=(50, 10))
ax = fig.add_subplot(1, 1, 1)

# Thousands separators on the colorbar ticks.
comma_fmt = FuncFormatter(lambda x, p: format(int(x), ','))
ax = sns.heatmap(hm_df, linewidth=0.2, annot=True, cmap='coolwarm', fmt=',.0f',
                 annot_kws={'size': 18},
                 cbar_kws={'format': comma_fmt, 'orientation': 'vertical'})

plt.tight_layout(pad=10.0)
plt.subplots_adjust(top=0.90, bottom=0.10)
plt.title('Heatmap of the Number of movies per genre from 1999-2019', fontsize=18, pad=15)
plt.xlabel('Year', fontsize=18, labelpad=10)
plt.ylabel('Genre', fontsize=18, labelpad=18)
# Keep genre names horizontal so they stay readable.
plt.yticks(rotation=0, size=14)
plt.xticks(size=14)
plt.show()
###Conclusion
A majority of the highest grossing movies of all time fall under the action genre, but the most popular and most numerous genre is comedy. This trend, however, seems to be changing, and it looks like action movies will continue to grow in popularity. I think further analysis could be done to explore what exactly made certain movies more successful than others, such as specific star actors/directors or what month the movie was released.