#** Packages installed for project **#
# Install only the packages that are missing, so re-running the script does
# not reinstall everything each time.
required_packages <- c(
  "ggplot2",   # plotting
  "rvest",     # used to scrape data from web
  "xml2",
  "tm",        # corpus / document-term matrix
  "SnowballC", # stemming support for tm
  "wordcloud"
)
for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Load the packages.
library(ggplot2)
library(rvest)
library(xml2)
library(tm)
library(SnowballC)
library(wordcloud)
#** Csv file imported **#
# file.choose() asks the user to pick the CSV file, so this step is meant to
# be run interactively.
Moviedata <- read.csv(file.choose())

#** Getting Summary, determining NA/Null Values **#
View(Moviedata)
head(Moviedata)
tail(Moviedata)
str(Moviedata)
summary(Moviedata)
#** Removing NA/Null Values **#
# Count the missing values per column and per row to see where the gaps are.
colSums(is.na(Moviedata))
rowSums(is.na(Moviedata))
View(Moviedata)

# Keep only the rows that have no missing value in any column.
Moviedata3 <- Moviedata[complete.cases(Moviedata), ]

# Confirm that no NA is left, then look at the cleaned data.
colSums(is.na(Moviedata3))
View(Moviedata3)
summary(Moviedata3)
#** Renaming Columns/Variables **#
# New names are applied by position, so the order must match the CSV columns.
movie_colnames <- c(
  "DayofWeek", "Director", "Genre", "MovieTitle", "ReleaseDate", "Studio",
  "AdjGrossMil", "BudgetMil", "GrossMil", "IMDbRating", "MovieLensRating",
  "OverseasMil", "OverseasPER", "ProfitMil", "ProfitPER", "RuntimeMin",
  "USMil", "USGrossPER"
)
# Fail early if the file does not have the expected number of columns rather
# than silently mislabelling them.
stopifnot(ncol(Moviedata3) == length(movie_colnames))
colnames(Moviedata3) <- movie_colnames
#** Proper formatting of Variables **#
# Convert the money columns to numeric and store the result back in the data
# frame, so the plots that map these columns see numbers.
# as.character() makes a factor column convert by its label rather than its
# internal level code; thousands separators are stripped so that a value such
# as "1,234.5" does not turn into NA.
money_columns <- c("ProfitMil", "GrossMil", "OverseasMil", "AdjGrossMil")
Moviedata3[money_columns] <- lapply(
  Moviedata3[money_columns],
  function(x) as.numeric(gsub(",", "", as.character(x), fixed = TRUE))
)

# Stand-alone copies under the names this script originally defined.
ProfitMil <- Moviedata3$ProfitMil
GrossMil <- Moviedata3$GrossMil
OverseasMil <- Moviedata3$OverseasMil
AdjGrossMil <- Moviedata3$AdjGrossMil
#****** Visualizations ********###
# 1. Year on year number of movies (Histogram)
# Characters 7-10 of ReleaseDate are taken as the four-digit year
# (assumes a dd/mm/yyyy-style layout -- TODO confirm against the CSV).
# as.character() keeps this correct when the column was read as a factor.
year_num <- as.numeric(substring(as.character(Moviedata3$ReleaseDate), 7, 10))
class(year_num)
head(year_num)

# One bar per release year.
ggplot(data.frame(ReleaseYear = year_num), aes(x = ReleaseYear)) +
  geom_histogram(binwidth = 1)
# Findings: the number of movies increases steadily until the start of the
# 1990s. During the 1990s the number of movies doubled, and after 2000 it
# increased exponentially.
# 2. Let's discover which studio has been most profitable (Barplot)
# Distribution of profit % over all movies. geom_bar() has no binwidth
# argument; geom_histogram() is the binned equivalent.
ggplot(Moviedata3, aes(x = ProfitPER)) +
  geom_histogram(binwidth = 1)

sort(Moviedata3$ProfitPER, decreasing = FALSE)

# Profit % stacked per studio, drawn as horizontal bars (profit % on the
# horizontal axis, studio on the vertical axis).
p <- ggplot(data = Moviedata3, aes(x = Studio, y = ProfitPER)) +
  geom_bar(stat = "identity") +
  coord_flip()
p
# 3. Is there any correlation between Runtime and Profit %?
#    (Corrplot and Boxplot)
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)

plot(Moviedata3$RuntimeMin, Moviedata3$ProfitPER)
summary(Moviedata3$RuntimeMin)
boxplot(Moviedata3$RuntimeMin)
# Findings: most of the movies have a runtime between 100 and 131 minutes.
# There seems to be no relation between profit and runtime.
# 4. Does a big budget ensure a good rating? (Scatterplot)
# Columns are referenced by bare name inside aes() so they are looked up in
# the data passed to ggplot().
r <- ggplot(Moviedata3, aes(x = BudgetMil, y = IMDbRating)) +
  geom_point()
r
# 6. Does a movie being profitable in the US ensure overseas success?
#    (Scatterplot)
ggplot(Moviedata3, aes(x = OverseasPER, y = USGrossPER)) +
  geom_point()
boxplot(Moviedata3$OverseasPER)
boxplot(Moviedata3$USGrossPER)
# Findings: negative correlation between US Gross % and Overseas %.
# Meaning: movies that profit in the US do not do well overseas.
# NOTE(review): if these two columns are shares of the same total gross they
# sum to 100 by construction, which would make the negative correlation
# automatic -- confirm against the data before drawing conclusions.
# 5. Which studios have an overseas presence, and how much?
#    (Violin graph and barplot)
g <- ggplot(Moviedata3, aes(Studio, OverseasMil))
g +
  geom_violin() +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
# Findings: even with the negative correlation between USGross % and
# Overseas %, Art, Veston, UA and Dreamworks remain the most profitable in
# both cases.

# Overseas % stacked per studio.
ggplot(data = Moviedata3, aes(x = Studio, y = OverseasPER)) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  labs(title = "Studios Overseas") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
# 6. Most number of movies produced per genre (Word cloud)
# Install whatever is missing. grDevices ships with R and cannot be installed
# from CRAN, so it is only loaded.
for (pkg in c("data.table", "wordcloud", "tm", "NLP", "RColorBrewer")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}
library(data.table)
library(wordcloud)
library(tm)
library(NLP)
library(grDevices)
library(RColorBrewer)

# Combine all genres into one string.
data_no_na_2 <- paste(Moviedata3$Genre, collapse = " ")

# Set up source and corpus.
data_no_na_source <- VectorSource(data_no_na_2)
corpus <- Corpus(data_no_na_source)

# Cleaning.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)

# Make a document-term matrix.
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)

# Find the most frequent terms.
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing = TRUE)
words <- names(frequency)

# Pass words and frequencies of the same length; head() also copes with
# fewer than 15 distinct terms.
wordcloud(head(words, 15), head(frequency, 15))
# Findings (number of movies produced per genre): most movies produced are
# Action, Animation, Comedy and Drama; the rest follow. This may be the
# reason Action movies have the major share of profit.
# 7. Common words used in movie titles
# Combine all titles into one string.
data_no_na_2 <- paste(Moviedata3$MovieTitle, collapse = " ")

# Set up source and corpus.
data_no_na_source <- VectorSource(data_no_na_2)
corpus <- Corpus(data_no_na_source)

# Cleaning.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
# Drop English stop words so that "the", "of", "and" ... do not dominate the
# cloud.
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
# Stem the corpus built above.
corpus <- tm_map(corpus, stemDocument)

# Make a document-term matrix.
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)

# Find the most frequent terms.
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing = TRUE)
words <- names(frequency)

# Pass words and frequencies of the same length; head() also copes with
# fewer than 20 distinct terms.
wordcloud(head(words, 20), head(frequency, 20))
# Findings: the most common words in titles are Family, Young, World, Friend,
# Find and Live.
# 9. Which genre is most popular as per IMDb ratings?
#    (Multiple boxplots in one chart and geom_count)
# Overall spread of IMDb ratings as a single box. A constant x is used
# because a box plot needs a discrete grouping on the x axis.
ggplot(Moviedata3, aes(x = "", y = IMDbRating)) +
  geom_boxplot()

summary(Moviedata3$IMDbRating)
# Findings: most ratings given by IMDb are between 6.3 and 7.6. 50 % of the
# movies have a rating above 6.9 and the other half are below the median.
# Mean and median are very close, with very few outliers rated below 4.3.

# One box per genre; box width reflects the number of movies in the genre.
g <- ggplot(Moviedata3, aes(IMDbRating, Genre))
g + geom_boxplot(varwidth = TRUE, fill = "tomato")
# Findings: most of the movies in these genres get a rating between 6 and 8.
# Horror, scifi, thriller, crime and romance have higher IQRs. These genres
# are the least produced, with ratings ranging from 5.5 to 8; very few movies
# reach a rating above 8 in both cases.

# MovieLens vs IMDb rating.
ggplot(Moviedata3, aes(IMDbRating, MovieLensRating)) +
  geom_count()
# Findings: the IMDb rating runs parallel to the recommendations made by
# MovieLens, with very few outliers.
# 10. Most and least popular genre as per profit % (Barplot)
# Profit % stacked per genre.
ggplot(data = Moviedata3, aes(x = Genre, y = ProfitPER)) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  labs(title = "Popular Genre") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
# Findings: Action, Comedy, Horror and Drama are the popular genres.
# Averagely profitable ones are Adventure and Animation.
# The least profitable ones are Mystery, Fantasy and Documentary.
# 11. Best director ensuring most profit (Plot and calculations)
library(data.table)

# Number of movies per director, as a bar plot and as a table.
director_count <- barplot(table(Moviedata3$Director))
director_list <- as.data.frame(table(Moviedata3$Director))
director_list

# One row per movie: its director and its profit %.
df <- data.frame(
  Director = Moviedata3$Director,
  Profit. = Moviedata3$ProfitPER
)

# Per director: number of movies (Count) and summed profit % (Profit.).
# The result gets its own name so that it does not shadow aggregate().
director_totals <- aggregate(
  cbind(Count, Profit.) ~ Director,
  data = transform(df, Count = 1),
  FUN = sum
)

sorted_aggregate <- director_totals[
  order(director_totals$Profit., decreasing = TRUE),
]
head(sorted_aggregate, n = 10)
tail(sorted_aggregate, n = 10)

# Ten directors with the highest summed profit %, largest at the top.
ggplot(
  head(sorted_aggregate, n = 10),
  aes(x = reorder(Director, Profit.), y = Profit.)
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = "Director", y = "Profit %")
# Findings: the top directors as per profit % are Daniel Myrick, Eduardo
# Sanchez and Steven Spielberg.
# 10. Year-wise budget vs profit (Geom jitter)
library(ggplot2)

# Characters 7-10 of ReleaseDate are taken as the four-digit year
# (assumes a dd/mm/yyyy-style layout -- TODO confirm against the CSV).
release_year <- as.numeric(
  substring(as.character(Moviedata3$ReleaseDate), 7, 10)
)

# The year is attached to a copy of the data so every aesthetic is looked up
# in the plot's own data frame rather than in a global vector.
g <- ggplot(
  transform(Moviedata3, ReleaseYear = release_year),
  aes(x = ReleaseYear, y = BudgetMil)
) +
  geom_jitter(aes(col = Genre, size = ProfitMil)) +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
g
# 11. Studio vs Genre (number of movies)
# Count of movies per genre, with each bar split by studio.
g <- ggplot(Moviedata3, aes(Genre))
g + geom_bar(aes(fill = Studio), width = 0.5)