#** Packages installed for project **#

# Packages used by this script. Missing ones are installed once; packages
# that are already available are not reinstalled on every run.
required_packages <- c(
  "ggplot2",    # plotting
  "rvest",      # used to scrape data from web
  "xml2",
  "tm",
  "SnowballC",
  "wordcloud"
)
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

# Attach the packages (one call per line so none is swallowed by a comment).
library(ggplot2)
library(rvest)
library(xml2)
library(tm)
library(SnowballC)
library(wordcloud)

#** Csv file imported **#

# Ask the user to pick the CSV file interactively and read it into a
# data frame.
Moviedata <- read.csv(file.choose())

#** Getting Summary, determining NA/Null Values **#

# Inspect the raw data: spreadsheet view, first/last rows, column types and
# per-column summary statistics (the summary also reports NA counts).
View(Moviedata)
head(Moviedata)
tail(Moviedata)
str(Moviedata)
summary(Moviedata)

#** Removing NA/Null Values **#

# Count missing values per column and per row of the raw data.
colSums(is.na(Moviedata))
rowSums(is.na(Moviedata))
View(Moviedata)

# Keep only the rows that contain no missing value at all.
Moviedata3 <- Moviedata[complete.cases(Moviedata), ]

# Every column of the cleaned data should now report zero NAs.
colSums(is.na(Moviedata3))

View(Moviedata3)
summary(Moviedata3)

#** Renaming Columns/Variables **#

# Short, syntactic column names, listed in the column order of the CSV.
movie_column_names <- c(
  "DayofWeek", "Director", "Genre", "MovieTitle", "ReleaseDate", "Studio",
  "AdjGrossMil", "BudgetMil", "GrossMil", "IMDbRating", "MovieLensRating",
  "OverseasMil", "OverseasPER", "ProfitMil", "ProfitPER", "RuntimeMin",
  "USMil", "USGrossPER"
)

# Fail early if the file does not have the expected number of columns;
# otherwise the names would be assigned to the wrong columns or padded
# with NA.
stopifnot(ncol(Moviedata3) == length(movie_column_names))
colnames(Moviedata3) <- movie_column_names

#** Proper formatting of Variables **#

# Convert the monetary columns to numeric.
# The values are converted inside Moviedata3 as well, because the plots
# below read these columns from the data frame, not from the standalone
# vectors.
# NOTE(review): commas are stripped on the assumption that large amounts
# may be stored as text with thousands separators (e.g. "1,234.5") --
# confirm against the CSV. Columns that are already numeric are unchanged.
to_numeric <- function(x) {
  as.numeric(gsub(",", "", as.character(x), fixed = TRUE))
}

money_columns <- c("ProfitMil", "GrossMil", "OverseasMil", "AdjGrossMil")
Moviedata3[money_columns] <- lapply(Moviedata3[money_columns], to_numeric)

# Standalone copies, kept for code that refers to them by bare name.
ProfitMil <- Moviedata3$ProfitMil
GrossMil <- Moviedata3$GrossMil
OverseasMil <- Moviedata3$OverseasMil
AdjGrossMil <- Moviedata3$AdjGrossMil

#****** Visualizations ********###

#1. Year on year number of movies (Histogram)

# Extract the release year from ReleaseDate.
# NOTE(review): substring(., 7, 10) assumes a fixed-width date whose year
# occupies characters 7-10 (e.g. dd/mm/yyyy) -- confirm against the CSV.
a <- as.character(Moviedata3$ReleaseDate)
b <- substring(a, 7, 10)
View(b)
class(b)
year_num <- as.numeric(b)

# One bar per release year.
ggplot(data.frame(year_num = year_num), aes(x = year_num)) +
  geom_histogram(binwidth = 1)

# Histogram of "Years movies have been released": the 80s saw a rise in
# the release of movies, which kept increasing steadily until the 90s
# started. In the 90s the number of movies doubled; after 2000 the number
# of movies increased exponentially.

#2. Let’s discover which studio has been most profitable (Barplot) **#

# Distribution of profit % (binwidth belongs to geom_histogram, not
# geom_bar).
ggplot(Moviedata3, aes(x = ProfitPER)) +
  geom_histogram(binwidth = 1)

# Profit % values in ascending order, for a quick look at the extremes.
sort(Moviedata3$ProfitPER, decreasing = FALSE)

# Profit % per studio. With stat = "identity" the bars of one studio are
# stacked, so each bar shows the studio's summed profit %.
p <- ggplot(Moviedata3, aes(x = ProfitPER, y = Studio)) +
  geom_bar(stat = "identity")
p

# Paramount Pictures, Universal, Fox and WB are amongst the most
# profitable studios over the years.

#3. ** Is there any correlation between Runtime and Profit % ** ### (Corrplot and Boxplot)

# Install corrplot only when it is missing, then attach it.
if (!requireNamespace("corrplot", quietly = TRUE)) {
  install.packages("corrplot")
}
library(corrplot)

# Scatter plot of runtime against profit %, plus the runtime distribution.
plot(Moviedata3$RuntimeMin, Moviedata3$ProfitPER)
summary(Moviedata3$RuntimeMin)
boxplot(Moviedata3$RuntimeMin)

# Most of the movies have a runtime between 100 and 131 minutes.
# There seems to be no relation between profit and runtime.

#4. ** Does a Big Budget ensure a Good Rating ** ### (Scatterplot)

# Scatter plot of budget against IMDb rating. Columns are referenced by
# bare name inside aes() so ggplot reads them from Moviedata3.
r <- ggplot(Moviedata3, aes(x = BudgetMil, y = IMDbRating)) +
  geom_point()
r

# Lower budget movies garner better ratings than high budget movies
# (more than 200 mil).
# Observation: shows a curvilinear relation -- the lower the rating, the
# higher the budget.

#6. ** Does a movie's profitability in the US ensure overseas success ** ### (Scatterplot)

# Overseas share against US share of the gross, one point per movie.
ggplot(Moviedata3, aes(x = OverseasPER, y = USGrossPER)) +
  geom_point()

# Distribution of each share on its own.
boxplot(Moviedata3$OverseasPER)
boxplot(Moviedata3$USGrossPER)

# Negative correlation between US Gross % and Overseas %.
# Meaning: movies profiting in the US do not do well overseas.

#5. ** Which Studios have overseas Presence and how much (violin graph and barplot)

# Distribution of overseas gross per studio; the x labels are rotated so
# the studio names do not overlap.
g <- ggplot(Moviedata3, aes(Studio, OverseasMil))
g +
  geom_violin() +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

# Even after the negative correlation between USGross % and Overseas %,
# Art, Veston, UA and Dreamworks remain most profitable in both cases.

# Overseas % per studio. With stat = "identity" the bars of one studio are
# stacked, so each bar shows the studio's summed overseas %.
ggplot(Moviedata3, aes(x = Studio, y = OverseasPER)) +
  geom_bar(stat = "identity", width = 0.5, fill = "tomato3") +
  labs(title = "Studios Overseas") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

# This may infer that studios produce a range of Genre movies,
# instead of sticking to a few Genres.

#6. Most number of Movies Production as per Genre (WORD cloud) ###

# Install whatever is missing, then attach. grDevices ships with R and
# cannot be installed from CRAN, so it is only attached.
wordcloud_packages <- c("data.table", "wordcloud", "tm", "NLP", "RColorBrewer")
missing_wordcloud_packages <- wordcloud_packages[
  !vapply(wordcloud_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_wordcloud_packages) > 0) {
  install.packages(missing_wordcloud_packages)
}
library(data.table)
library(wordcloud)
library(tm)
library(NLP)
library(grDevices)
library(RColorBrewer)

# Combine all genres into one string.
data_no_na_2 <- paste(Moviedata3$Genre, collapse = " ")

# Set up source and corpus.
data_no_na_source <- VectorSource(data_no_na_2)
corpus <- Corpus(data_no_na_source)

# Cleaning: lower-case, drop punctuation, collapse repeated whitespace.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)

# Make a document-term matrix.
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)

# Find the most frequent terms.
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing = TRUE)
words <- names(frequency)

# Plot at most the 15 most frequent terms. The words and their counts are
# taken from the same slice so both arguments have the same length, and
# head() does not pad with NA when there are fewer than 15 terms.
top_genre_terms <- head(frequency, 15)
wordcloud(names(top_genre_terms), top_genre_terms)

## -- Number of Movies Produced in a Genre --
# Most movies produced are Action, Animation, Comedy and Drama; the rest
# follow.
# This may be the reason Action movies have the major profit share.

#7. #- Common words used in Movie Titles -##

# Combine all movie titles into one string.
data_no_na_2 <- paste(Moviedata3$MovieTitle, collapse = " ")

# Set up source and corpus.
data_no_na_source <- VectorSource(data_no_na_2)
corpus <- Corpus(data_no_na_source)

# Cleaning: lower-case, drop punctuation, collapse repeated whitespace,
# then reduce words to their stems.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, stemDocument)

# Make a document-term matrix.
dtm <- DocumentTermMatrix(corpus)
dtm2 <- as.matrix(dtm)

# Find the most frequent terms.
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing = TRUE)
words <- names(frequency)

# Plot at most the 20 most frequent terms. The words and their counts are
# taken from the same slice so both arguments have the same length, and
# head() does not pad with NA when there are fewer than 20 terms.
top_title_terms <- head(frequency, 20)
wordcloud(names(top_title_terms), top_title_terms)

## -- Most common words in titles are Family, Young, World, Friend, Find,
## Live.

#9. ** Which Genre is most popular as per IMDB Ratings ** ### (Multiple boxplot in one chart and geom Count)

# NOTE(review): USMil is presumably continuous, so without a grouping this
# draws a single box over all movies; the per-genre comparison is the
# boxplot further down -- confirm this plot is still wanted.
ggplot(Moviedata3, aes(x = USMil, y = IMDbRating)) +
  geom_boxplot()

summary(Moviedata3$IMDbRating)
# Most ratings given by IMDb are 6.3 to 7.6.
# 50 % of movies have been given a rating above 6.9; the other half are
# below the median.
# Mean and median are very close, with very few outliers getting a rating
# below 4.3.

# One box per genre; varwidth scales each box by its number of movies.
g <- ggplot(Moviedata3, aes(IMDbRating, Genre))
g + geom_boxplot(varwidth = TRUE, fill = "tomato")

# Action, Adventure, Animation and Comedy have close medians, above 6.5,
# with the least IQR -- suggesting most of the movies in these genres get
# ratings between 6 and 8.
# Horror, Sci-fi, Thriller, Crime and Romance have higher IQRs -- these
# genres are the least produced, with ratings ranging from 5.5 to 8.
# Very few movies reach a rating above 8 in both cases.

#* MovieLens vs IMDb rating
ggplot(Moviedata3, aes(IMDbRating, MovieLensRating)) +
  geom_count()

#* IMDb Rating is parallel to the recommendations made by MovieLens, with
#* very few outliers.

#10. ** Most and least popular genre as per profit% ** ### (barplot)

# Profit % per genre. With stat = "identity" the bars of one genre are
# stacked, so each bar shows the genre's summed profit %.
ggplot(Moviedata3, aes(x = Genre, y = ProfitPER)) +
  geom_bar(stat = "identity", width = 0.5, fill = "tomato3") +
  labs(title = "Popular Genre") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))

## -- Action, Comedy, Horror and Drama are the popular genres.
## -- Averagely profitable ones are Adventure and Animation.
## -- Least profitable ones are Mystery, Fantasy and Documentary.

#11. ***** Best Director ensuring Most Profit ******** # (Plot and calculations)

# Number of movies per director, as a bar plot and as a table.
director_count <- barplot(table(Moviedata3$Director))
director_list <- as.data.frame(table(Moviedata3$Director))
director_list

# Profit % per director, read straight from the data frame.
ggplot(Moviedata3, aes(x = Director, y = ProfitPER)) +
  geom_bar(stat = "identity")

# "Profit." is the syntactic name data.frame() derives from "Profit%";
# it is spelled out here so the column name is explicit.
df <- data.frame(
  Director = Moviedata3$Director,
  Profit. = Moviedata3$ProfitPER
)

library(data.table)

# Per director: number of movies (Count) and summed profit %. The result
# gets its own name so it does not shadow stats::aggregate().
director_totals <- aggregate(
  cbind(Count, Profit.) ~ Director,
  data = transform(df, Count = 1),
  FUN = sum
)

# Directors ordered from highest to lowest summed profit %.
sorted_aggregate <- director_totals[
  order(director_totals$Profit., decreasing = TRUE),
]
head(sorted_aggregate, n = 10)
tail(sorted_aggregate, n = 10)

## -- Top directors as per Profit % are Daniel Myrick, Eduardo Sanchez and
## Steven Spielberg.

#10. ** Yearwise budget vs profit *** # (Geom jitter)

library(ggplot2)

# Release year per movie.
# NOTE(review): substring(., 7, 10) assumes a fixed-width date whose year
# occupies characters 7-10 (e.g. dd/mm/yyyy) -- confirm against the CSV.
a <- as.character(Moviedata3$ReleaseDate)
b <- substring(a, 7, 10)
b <- as.numeric(b)

# Budget per release year; colour encodes genre and point size encodes
# profit. The complete plot is stored in g so that printing g shows the
# jittered points rather than an empty panel.
g <- ggplot(Moviedata3, aes(b, BudgetMil)) +
  geom_jitter(aes(col = Genre, size = ProfitMil)) +
  labs(x = "Release year") +
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
g

#11. Studio vs Genre (number of movies)

# Number of movies per genre, each bar split by studio.
# NOTE(review): the original call read "geom_(" with the geom name cut
# off; geom_bar matches the "number of movies" heading and the width
# argument -- confirm.
g <- ggplot(Moviedata3, aes(Genre))
g + geom_bar(aes(fill = Studio), width = 0.5)