Data Analysis on Disney Movies

Why Disney movies?

Many people think that cartoon movies are only for children. But that’s not true. I strongly believe that in this competitive and busy world, we are all stressed at times. Instead of turning to unhealthy ways of relieving stress, watching a cartoon movie is one of the best ways to refresh ourselves. When we watch cartoon movies, we get the message that any problem can be solved.

All our dreams can come true, if we have the courage to pursue them. It’s kind of fun to do the impossible. The way to get started is to quit talking and begin doing. - Walt Disney


About the Dataset

I found the dataset on the website data.world. The link for the dataset :

https://data.world/dot2/disney-character-data-set-project/workspace/dataset?agentid=kgarrett&datasetid=disney-character-success-00-16


Let’s get started with our analysis

I have done the data analysis in the R programming language.

These are the questions we are going to answer with the help of this dataset:

  1. Which genre movie is earning the highest Total Gross Income?
  2. Which movie earns the highest income?
  3. What is the trend of movies over the years(1937-2016)?

Step1 : Getting the data

# Load the Disney gross-income CSV; it is read as UTF-8 with a byte-order mark.
Gross_Income <- read.csv(
  file = "disney_movies_total_gross.csv",
  header = TRUE,
  fileEncoding = "UTF-8-BOM"
)

Step2 : Load the packages you need to analyze your data

library(shiny)
library(ggplot2)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(NLP)
library(RColorBrewer)
library(date)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date

Step 3: Let’s take a look at our dataset

# Preview the first six rows of the raw data.
head(Gross_Income, n = 6L)
##                       movie_title release_date     genre MPAA_rating
## 1 Snow White and the Seven Dwarfs Dec 21, 1937   Musical           G
## 2                       Pinocchio  Feb 9, 1940 Adventure           G
## 3                        Fantasia Nov 13, 1940   Musical           G
## 4               Song of the South Nov 12, 1946 Adventure           G
## 5                      Cinderella Feb 15, 1950     Drama           G
## 6    20,000 Leagues Under the Sea Dec 23, 1954 Adventure            
##    total_gross inflation_adjusted_gross
## 1 $184,925,485           $5,228,953,251
## 2  $84,300,000           $2,188,229,052
## 3  $83,320,000           $2,187,090,808
## 4  $65,000,000           $1,078,510,579
## 5  $85,000,000             $920,608,730
## 6  $28,200,000             $528,279,994

Step 4: Start analyzing the data

Let’s start the analysis of the data and answer the questions we discussed earlier.

1. Which genre movie is earning the highest Total Gross Income?

# Strip the "$" and "," formatting from total_gross and convert it to numeric.
topDataGross <- as.numeric(gsub("[$,]", "", Gross_Income$total_gross))
Gross_Income$total_gross <- topDataGross

# Keep only the movies that have a genre recorded.
subsetData <- subset(Gross_Income, genre != "")

# Sum the total gross per genre and add the same figure expressed in billions.
group_genre <- aggregate(total_gross ~ genre, data = subsetData, FUN = sum)
currencybillion <- group_genre$total_gross / 1e9
group_genre$currency <- currencybillion

head(group_genre)
##                 genre total_gross    currency
## 1              Action  4184563282  4.18456328
## 2           Adventure 16389069453 16.38906945
## 3        Black Comedy    97543212  0.09754321
## 4              Comedy  8119619678  8.11961968
## 5 Concert/Performance   103456466  0.10345647
## 6         Documentary   180685619  0.18068562
# Order the genre factor by total gross so the bars are drawn from the
# lowest-earning genre to the highest-earning one.
group_genre$genre <- factor(
  group_genre$genre,
  levels = group_genre$genre[order(group_genre$currency)]
)

# Bar chart of total gross (in billions) per genre.
# y is mapped to the `currency` column of group_genre rather than to the
# free-standing `currencybillion` vector, so the plot depends only on the data
# frame it is given and stays correct if group_genre is rebuilt later.
ggplot(group_genre, aes(x = genre, y = currency)) +
  geom_bar(stat = "identity", aes(fill = genre)) +
  theme_minimal() +
  xlab("Genre") +
  ylab("Total Gross(billion)") +
  theme(
    panel.grid.major = element_blank(),
    axis.text.x = element_text(angle = 75, vjust = 0.7, size = 9)
  )

Result : As we can see from the bar chart, movies in the “Adventure” genre earn the highest total gross income and movies in the “Horror” genre earn the lowest.

  • Let’s see if this insight also holds for the inflation-adjusted gross income
 # Strip the "$" and "," formatting from inflation_adjusted_gross and convert
 # it to numeric.
 topDataGross1 <- as.numeric(
   gsub("[$,]", "", Gross_Income$inflation_adjusted_gross)
 )
 Gross_Income$inflation_adjusted_gross <- topDataGross1

 # Drop movies without a genre, then sum the adjusted gross per genre.
 subsetData <- subset(Gross_Income, genre != "")
 group_genre <- aggregate(
   inflation_adjusted_gross ~ genre,
   data = subsetData,
   FUN = sum
 )

 # Express the per-genre totals in billions and show the first rows.
 currencybillion1 <- group_genre$inflation_adjusted_gross / 1e9
 group_genre$currency <- currencybillion1
 head(group_genre)
##                 genre inflation_adjusted_gross   currency
## 1              Action               5498936786  5.4989368
## 2           Adventure              24561266158 24.5612662
## 3        Black Comedy                156730475  0.1567305
## 4              Comedy              15409526913 15.4095269
## 5 Concert/Performance                114821678  0.1148217
## 6         Documentary                203488418  0.2034884
# Re-level genre by adjusted gross so the bars appear in ascending order.
group_genre$genre <- factor(
  group_genre$genre,
  levels = group_genre$genre[order(group_genre$currency)]
)

# A large scipen makes R prefer fixed over scientific notation when printing.
options(scipen = 1000000)

# Bar chart of inflation-adjusted gross (in billions) per genre.
ggplot(group_genre, aes(x = genre, y = currency)) +
  geom_bar(aes(fill = genre), stat = "identity") +
  theme_minimal() +
  xlab("Genre") +
  ylab("Inflated Adjusted Gross(billion)") +
  theme(
    panel.grid.major = element_blank(),
    axis.text.x = element_text(angle = 75, vjust = 0.7, size = 9)
  )

Result : As we can see from the bar chart, movies in the “Adventure” genre earn the highest inflation-adjusted gross income and movies in the “Concert/Performance” genre earn the lowest.

### 2. Which movie earns the highest income? - I am going to create a WORDCLOUD to answer this question, i.e. the bigger the word, the higher the income.

      # Take the 20 highest-grossing movies. rank() assigns 1 to the smallest
      # value, so the top earner gets the largest rank, which is later used as
      # the word "frequency" in the cloud.
      datacloud <- head(arrange(Gross_Income, desc(total_gross)), n = 20)
      datacloud$Rank <- rank(datacloud$total_gross)

      # Put the titles into a tm corpus so they can be cleaned.
      docs <- Corpus(VectorSource(datacloud$movie_title))

      # Replace the separator characters "/", "@" and "|" with a space, one
      # pattern after the other.
      toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
      docs <- Reduce(
        function(corpus, pattern) tm_map(corpus, toSpace, pattern),
        c("/", "@", "\\|"),
        docs
      )

      # Normalise the text: lower case, then remove numbers, common English
      # stopwords, punctuation and surplus whitespace (in that order).
      docs <- tm_map(docs, content_transformer(tolower))
      docs <- tm_map(docs, removeNumbers)
      docs <- tm_map(docs, removeWords, stopwords("english"))
      docs <- tm_map(docs, removePunctuation)
      docs <- tm_map(docs, stripWhitespace)

      # Write the cleaned titles back over the original movie titles.
      dataframe <- data.frame(
        text = sapply(docs, identity),
        stringsAsFactors = FALSE
      )
      datacloud$movie_title <- dataframe$text

      # Draw the cloud with each cleaned title sized by its rank.
      wordcloud(
        words = datacloud$movie_title,
        freq = datacloud$Rank,
        min.freq = 1,
        scale = c(1.5, 0.7),
        max.words = 200,
        random.order = FALSE,
        rot.per = 0.5,
        colors = brewer.pal(8, "Dark2")
      )

Result : As we can see from the word cloud, the movie “star wars ep vii force awakens” has the highest total gross income.

3. What is the trend of movies over the years(1937-2016)?

# Build a per-year, per-genre table of total gross and plot it over time.

# Select the needed columns by name instead of by position, so the selection
# does not silently change if the column order of the CSV changes.
trendData <- Gross_Income[, c("release_date", "genre", "total_gross")]
trendData$total_gross <- topDataGross

# Parse the release date and keep only the release year.
trendData$release_date <- parse_date_time(
  Gross_Income$release_date,
  orders = c("ymd", "dmy", "mdy")
)
trendData$year <- year(trendData$release_date)
trendData$release_date <- NULL

# Drop movies without a genre, sum the gross per year and genre, and discard
# year/genre combinations whose total is zero.
trendData1 <- subset(trendData, genre != "")
group_genre_date <- aggregate(
  total_gross ~ year + genre,
  data = trendData1,
  FUN = sum
)
subseTrendData <- subset(group_genre_date, total_gross != 0)

# A large scipen makes R prefer fixed over scientific notation when printing.
options(scipen = 1000000)

# Convert to billions (1e9) so the values match the "(billion)" axis label and
# the unit used by the genre charts. The previous divisor of 100000000 (1e8)
# produced hundreds of millions, i.e. values ten times too large for the label.
currency <- subseTrendData$total_gross / 1e9
subseTrendData$sample <- currency

# Map y to the `sample` column of the plotted data frame rather than to the
# free-standing `currency` vector.
ggplot(subseTrendData, aes(x = year, y = sample, color = genre)) +
  geom_point(size = 1.9) +
  theme_minimal() +
  ylab("Total_gross(billion)")

Result: As we can see, until 1980 movies were limited to a few genres, and before 1940 there were only musical movies. We can also see a spike in the income of Adventure movies after 2000, which suggests that the love for adventure movies is increasing day by day.