The data used for analysis in this project will be from the IMDB Database. Data has always been at the heart of the movie/film industry. We are going to analyze a number of factors that make a movie successful and eventually help in gaining a high IMDB Score. We will also study the features (like actors, directors, movie names, genres, social media feedback, location, language, etc.) associated with each movie in this project.
library(ggplot2)
library(ggrepel)
library(ggthemes)
library(scales)
library(dplyr)
library(VIM)
library(data.table)
library(formattable)
library(plotly)
library(corrplot)
library(GGally)
library(caret)
library(car)
library(DataExplorer)
library(flextable)
library(knitr)
library(stringr)
library(formattable)
library(corrplot)
library(broom)
library(rpart)
library(rpart.plot)
library(randomForest)
library(caret)
library(e1071)
library(gbm)
# Load the raw IMDB movie metadata.
# stringsAsFactors = TRUE is spelled out because later steps call levels() on
# text columns and the recorded str() output shows factors; since R 4.0.0
# read.csv() would otherwise return character columns.
# NOTE(review): the path is machine-specific; adjust before running elsewhere.
movie <- read.csv("C:/Users/Swagatam/Desktop/MS Capstone 2/movie_metadata.csv",
                  stringsAsFactors = TRUE)
str(movie)
## 'data.frame': 5043 obs. of 28 variables:
## $ color : Factor w/ 3 levels ""," Black and White",..: 3 3 3 3 1 3 3 3 3 3 ...
## $ director_name : Factor w/ 2399 levels "","A. Raven Cruz",..: 929 801 2027 380 606 109 2030 1652 1228 554 ...
## $ num_critic_for_reviews : int 723 302 602 813 NA 462 392 324 635 375 ...
## $ duration : int 178 169 148 164 NA 132 156 100 141 153 ...
## $ director_facebook_likes : int 0 563 0 22000 131 475 0 15 0 282 ...
## $ actor_3_facebook_likes : int 855 1000 161 23000 NA 530 4000 284 19000 10000 ...
## $ actor_2_name : Factor w/ 3033 levels "","50 Cent","A. Michael Baldwin",..: 1408 2218 2489 534 2433 2549 1228 801 2440 653 ...
## $ actor_1_facebook_likes : int 1000 40000 11000 27000 131 640 24000 799 26000 25000 ...
## $ gross : int 760505847 309404152 200074175 448130642 NA 73058679 336530303 200807262 458991599 301956980 ...
## $ genres : Factor w/ 914 levels "Action","Action|Adventure",..: 107 101 128 288 754 126 120 308 126 447 ...
## $ actor_1_name : Factor w/ 2098 levels "","50 Cent","A.J. Buckley",..: 305 983 355 1968 528 443 787 223 338 35 ...
## $ movie_title : Factor w/ 4917 levels "#Horror ","[Rec] 2 ",..: 398 2731 3279 3707 3332 1961 3289 3459 399 1631 ...
## $ num_voted_users : int 886204 471220 275868 1144337 8 212204 383056 294810 462669 321795 ...
## $ cast_total_facebook_likes: int 4834 48350 11700 106759 143 1873 46055 2036 92000 58753 ...
## $ actor_3_name : Factor w/ 3522 levels "","50 Cent","A.J. Buckley",..: 3442 1395 3134 1771 1 2714 1970 2163 3018 2941 ...
## $ facenumber_in_poster : int 0 0 1 0 0 1 0 1 4 3 ...
## $ plot_keywords : Factor w/ 4761 levels "","10 year old|dog|florida|girl|supermarket",..: 1320 4283 2076 3484 1 651 4745 29 1142 2005 ...
## $ movie_imdb_link : Factor w/ 4919 levels "http://www.imdb.com/title/tt0006864/?ref_=fn_tt_tt_1",..: 2965 2721 4533 3756 4918 2476 2526 2458 4546 2551 ...
## $ num_user_for_reviews : int 3054 1238 994 2701 NA 738 1902 387 1117 973 ...
## $ language : Factor w/ 48 levels "","Aboriginal",..: 13 13 13 13 1 13 13 13 13 13 ...
## $ country : Factor w/ 66 levels "","Afghanistan",..: 65 65 63 65 1 65 65 65 65 63 ...
## $ content_rating : Factor w/ 19 levels "","Approved",..: 10 10 10 10 1 10 10 9 10 9 ...
## $ budget : num 2.37e+08 3.00e+08 2.45e+08 2.50e+08 NA ...
## $ title_year : int 2009 2007 2015 2012 NA 2012 2007 2010 2015 2009 ...
## $ actor_2_facebook_likes : int 936 5000 393 23000 12 632 11000 553 21000 11000 ...
## $ imdb_score : num 7.9 7.1 6.8 8.5 7.1 6.6 6.2 7.8 7.5 7.5 ...
## $ aspect_ratio : num 1.78 2.35 2.35 2.35 NA 2.35 2.35 1.85 2.35 2.35 ...
## $ movie_facebook_likes : int 33000 0 85000 164000 0 24000 0 29000 118000 10000 ...
# Count fully duplicated rows
sum(duplicated(movie))
## [1] 45
# Drop the duplicated rows, keeping the first occurrence of each
movie <- movie[!duplicated(movie), ]
## Removing special characters in the movie_title column
# Strip the stray "Â" from the titles, then trim trailing whitespace. The
# trimmed vector is assigned back; calling str_trim() without assignment
# leaves the column unchanged.
movie$movie_title <- gsub("Â", "", as.character(movie$movie_title))
movie$movie_title <- str_trim(movie$movie_title, side = "right")
# Splitting the genres
# genre.split keeps genres and imdb_score and gains one 0/1 column per genre
# label; a column is 1 when the label occurs in the movie's genres value.
genre.labels <- c(
  "Action", "Adventure", "Animation", "Biography", "Comedy", "Crime",
  "Documentary", "Drama", "Family", "Fantasy", "Film-Noir", "History",
  "Horror", "Musical", "Mystery", "News", "Romance", "Sci-Fi", "Short",
  "Sport", "War", "Western"
)
genre.split <- select(movie, genres, imdb_score)
for (genre.label in genre.labels) {
  genre.split[[genre.label]] <- ifelse(grepl(genre.label, genre.split$genres), 1, 0)
}
# Genre-wise mean IMDB score
# Reshape the 0/1 genre indicators to long form, keep the rows where the genre
# applies, and average imdb_score within each genre.
# Bars are ordered with reorder(): Genre_Type is a character column, so
# arrange() alone does not change the order in which ggplot draws the bars.
genre.split %>%
  tidyr::gather(Genre_Type, Binary, Action:Western) %>%
  filter(Binary == 1) %>%
  select(-c(Binary, genres)) %>%
  group_by(Genre_Type) %>%
  summarise(Mean_Score = mean(imdb_score)) %>%
  ggplot(aes(x = reorder(Genre_Type, Mean_Score), y = Mean_Score, fill = Genre_Type)) +
  geom_bar(stat = "identity", color = "black") +
  xlab("Genre_Type") +
  coord_flip()
# Drop the raw genres column
movie <- select(movie, -genres)
# Number of missing values per column
colSums(is.na(movie))
## color director_name
## 0 0
## num_critic_for_reviews duration
## 49 15
## director_facebook_likes actor_3_facebook_likes
## 103 23
## actor_2_name actor_1_facebook_likes
## 0 7
## gross actor_1_name
## 874 0
## movie_title num_voted_users
## 0 0
## cast_total_facebook_likes actor_3_name
## 0 0
## facenumber_in_poster plot_keywords
## 13 0
## movie_imdb_link num_user_for_reviews
## 0 21
## language country
## 0 0
## content_rating budget
## 0 487
## title_year actor_2_facebook_likes
## 107 13
## imdb_score aspect_ratio
## 0 327
## movie_facebook_likes
## 0
# Missing-value overview plot; variables are sorted by their share of missing values
missing.values <- aggr(
  movie,
  sortVars = TRUE, prop = TRUE, sortCombs = TRUE,
  cex.lab = 1.5, cex.axis = .6, cex.numbers = 5,
  combined = FALSE, gap = -.2
)
##
## Variables sorted by number of missings:
## Variable Count
## gross 0.174869948
## budget 0.097438976
## aspect_ratio 0.065426170
## title_year 0.021408563
## director_facebook_likes 0.020608243
## num_critic_for_reviews 0.009803922
## actor_3_facebook_likes 0.004601841
## num_user_for_reviews 0.004201681
## duration 0.003001200
## facenumber_in_poster 0.002601040
## actor_2_facebook_likes 0.002601040
## actor_1_facebook_likes 0.001400560
## color 0.000000000
## director_name 0.000000000
## actor_2_name 0.000000000
## actor_1_name 0.000000000
## movie_title 0.000000000
## num_voted_users 0.000000000
## cast_total_facebook_likes 0.000000000
## actor_3_name 0.000000000
## plot_keywords 0.000000000
## movie_imdb_link 0.000000000
## language 0.000000000
## country 0.000000000
## content_rating 0.000000000
## imdb_score 0.000000000
## movie_facebook_likes 0.000000000
# Keep only movies that have both gross and budget, then report the new size
movie <- movie[!(is.na(movie$gross) | is.na(movie$budget)), ]
dim(movie)
## [1] 3857 27
# Rows without any missing value
sum(complete.cases(movie))
## [1] 3768
# Missing values per column after the filter
colSums(is.na(movie))
## color director_name
## 0 0
## num_critic_for_reviews duration
## 1 1
## director_facebook_likes actor_3_facebook_likes
## 0 10
## actor_2_name actor_1_facebook_likes
## 0 3
## gross actor_1_name
## 0 0
## movie_title num_voted_users
## 0 0
## cast_total_facebook_likes actor_3_name
## 0 0
## facenumber_in_poster plot_keywords
## 6 0
## movie_imdb_link num_user_for_reviews
## 0 0
## language country
## 0 0
## content_rating budget
## 0 0
## title_year actor_2_facebook_likes
## 0 5
## imdb_score aspect_ratio
## 0 74
## movie_facebook_likes
## 0
summary(movie$aspect_ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.180 1.850 2.350 2.109 2.350 16.000 74
# Aspect ratio analysis
# Missing ratios are coded as 0, which places them in the "neither 1.85 nor
# 2.35" group of the comparison below.
movie$aspect_ratio <- replace(movie$aspect_ratio, is.na(movie$aspect_ratio), 0)
with(movie, mean(imdb_score[aspect_ratio == 1.85]))
## [1] 6.373938
with(movie, mean(imdb_score[aspect_ratio == 2.35]))
## [1] 6.508471
with(movie, mean(imdb_score[!(aspect_ratio == 1.85 | aspect_ratio == 2.35)]))
## [1] 6.672519
# Drop the aspect_ratio column
movie <- movie[, names(movie) != "aspect_ratio"]
# Replace missing facenumber_in_poster with the rounded column mean
movie$facenumber_in_poster[is.na(movie$facenumber_in_poster)] <-
  round(mean(movie$facenumber_in_poster, na.rm = TRUE))
# Treat 0 as missing for the Facebook-like counts. The columns are addressed
# by name instead of by position (previously 5, 6, 8, 13, 24, 26) so the step
# does not depend on which columns were dropped earlier.
likes.cols <- c(
  "director_facebook_likes", "actor_3_facebook_likes",
  "actor_1_facebook_likes", "cast_total_facebook_likes",
  "actor_2_facebook_likes", "movie_facebook_likes"
)
movie[likes.cols] <- lapply(movie[likes.cols], function(x) {
  replace(x, which(x == 0), NA)
})
# Impute the remaining missing values with the rounded column mean
mean.impute.cols <- c("num_critic_for_reviews", "duration", likes.cols)
movie[mean.impute.cols] <- lapply(movie[mean.impute.cols], function(x) {
  replace(x, is.na(x), round(mean(x, na.rm = TRUE)))
})
# Content ratings: frequency of each raw category
Movie_Ratings <- movie %>%
  group_by(content_rating) %>%
  summarise(Count = n())
Movie_Ratings.df <- as.data.frame(Movie_Ratings)
# Remove observations with a blank content rating
movie <- movie[!is.element(movie$content_rating, ""), ]
# Collapse the older / unrated labels into G, PG, PG-13, R and NC-17
movie$content_rating[movie$content_rating %in% c("M", "GP")] <- "PG"
movie$content_rating[movie$content_rating %in% "X"] <- "NC-17"
movie$content_rating[
  movie$content_rating %in% c("Approved", "Not Rated", "Passed", "Unrated")
] <- "R"
# Re-create the factor so the emptied levels are dropped
movie$content_rating <- factor(movie$content_rating)
table(movie$content_rating)
##
## G NC-17 PG PG-13 R
## 91 16 576 1314 1809
# Profit and percentage return on investment
movie <- mutate(movie, profit = gross - budget)
movie <- mutate(movie, return_on_investment_perc = (profit / budget) * 100)
# Remove the color and language columns
movie <- subset(movie, select = -c(color, language))
# Reduce country to three categories: USA, UK and Others
# "Others" is added as a level first so it can be assigned into the factor.
levels(movie$country) <- c(levels(movie$country), "Others")
movie$country[!(movie$country == "USA" | movie$country == "UK")] <- "Others"
movie$country <- factor(movie$country)
## Distribution of the IMDB score, with the mean marked by a vertical line
ggplot(movie, aes(x = imdb_score)) +
  geom_density(fill = "red", alpha = 0.6) +
  geom_vline(xintercept = mean(movie$imdb_score), color = "blue") +
  coord_cartesian(xlim = c(0, 10))
summary(movie$imdb_score)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.60 5.90 6.60 6.46 7.20 9.30
sd(movie$imdb_score)
## [1] 1.054605
# Most profitable movies: titles with non-missing profit, largest first.
# top_n(20) is called without a weight, so it ranks by the last column (profit).
profit.movie <- movie %>%
  filter(!is.na(profit)) %>%
  select(movie_title, profit) %>%
  arrange(desc(profit)) %>%
  top_n(20)
# Horizontal bar chart of profit in millions, bars ordered by profit
p1 <- ggplot(
  profit.movie,
  aes(x = reorder(movie_title, profit / 1000000), y = profit / 1000000,
      fill = factor(movie_title))
) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top Profitable Movies", x = "Movie Name", y = "Profit in Million $") +
  theme_bw()
p1
# Budget against profit (both in millions) for the 20 most profitable movies
# (a year filter, title_year %in% c(2000:2016), was left disabled here)
movie %>%
  arrange(desc(profit)) %>%
  top_n(20, profit) %>%
  ggplot(aes(x = budget / 1000000, y = profit / 1000000)) +
  geom_point(size = 3) +
  geom_smooth(size = 2) +
  geom_text_repel(aes(label = movie_title)) +
  xlab("Budget in Million $") +
  ylab("Profit in Million $") +
  ggtitle("Top 20 Profitable Movies") +
  theme(plot.title = element_text(hjust = 0.5))
# Budget against percentage return on investment for the 20 most profitable
# movies. profit and return_on_investment_perc are already columns of movie
# (created in the "Profit Column" step), so they are not recomputed here.
movie %>%
  arrange(desc(profit)) %>%
  top_n(20, profit) %>%
  ggplot(aes(x = budget / 1000000, y = return_on_investment_perc)) +
  geom_point(size = 3) +
  geom_smooth(size = 2) +
  geom_text_repel(aes(label = movie_title), size = 3) +
  labs(x = "Budget in Million $", y = "Percentage Return on Investment")
## Number of voters against IMDB score, one colour and linear fit per content rating
p <- ggplot(movie, aes(x = imdb_score, y = num_voted_users, group = content_rating)) +
  geom_point(aes(color = content_rating), size = 0.7) +
  geom_smooth(aes(color = content_rating), method = "lm", se = FALSE) +
  scale_color_brewer(palette = "Dark2") +
  labs(x = "IMDB Score", y = "Number of Voters", color = "Rating\n")
ggplotly(p)
# Gross (in millions) against IMDB score for the 20 most profitable movies;
# point size encodes profit, colour encodes content rating. The reference
# lines at gross = 550 and score = 7.75 mark off the annotated corner.
movie %>%
  top_n(20, profit) %>%
  ggplot(aes(x = imdb_score, y = gross / 10^6, size = profit / 10^6,
             color = content_rating)) +
  geom_point() +
  geom_hline(aes(yintercept = 550)) +
  geom_vline(aes(xintercept = 7.75)) +
  geom_text_repel(aes(label = movie_title), size = 4) +
  annotate("text", x = 8.5, y = 700, label = "High IMDB Score & High Gross", size = 5) +
  labs(
    x = "IMDB Score",
    y = "Gross Money Earned(in million dollars)",
    title = "Commercial Success Vs Critical Acclaim"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
# Time series of the mean IMDB score per release year
imdb.ts <- movie %>%
  group_by(title_year) %>%
  summarise(IMDB_Rating = mean(imdb_score))
# Yearly means as points and a connecting line, with a red smoothed trend
plot.ts1 <- ggplot(imdb.ts, aes(x = title_year, y = IMDB_Rating)) +
  geom_point(size = 3) +
  geom_line(size = 1) +
  geom_smooth(col = "red") +
  labs(x = "Year of Release", y = "IMDB Rating")
ggplotly(plot.ts1)
* The above trend shows the fluctuation of average yearly IMDB Scores of movies released from 1926 to 2016. There’s a clear fall in the trend over the years. The reason behind the highly fluctuating trend in the early years (i.e. between 1925 and 1960) is the lack of data points. We don’t have data for all of those years, and most of the movies we do have are probably highly successful ones. However, we have consistent data for the years after 1965, and it shows a declining trend in that period, suggesting that there are very few highly rated movies in recent years.
# Time series of the mean percentage return on investment per release year
roi.ts <- movie %>%
  group_by(title_year) %>%
  summarise(ROI = mean(return_on_investment_perc))
# Yearly means as points and a connecting line, with a green smoothed trend
plot.ts2 <- ggplot(roi.ts, aes(x = title_year, y = ROI)) +
  geom_point(size = 3) +
  geom_line(size = 1) +
  geom_smooth(col = "green") +
  labs(x = "Year of Release", y = "Return on Investment")
ggplotly(plot.ts2)
* The above trend shows the average Return on Investment in percentage for the years 1926 to 2016. We can observe that, except for a few outliers, the returns are quite stable over the years. Some of the years had exceptional Box Office hits, which contributed to the high Return on Investment. Again, the data is consistent after the year 1960, so the years after 1960 are represented more accurately.
# Directors ranked by the mean IMDB score of their movies.
# top_n(20) ranks by the last column (Average_IMDB_Rating) and keeps ties.
director.imdb <- movie %>%
  group_by(director_name) %>%
  summarise(Average_IMDB_Rating = mean(imdb_score)) %>%
  arrange(desc(Average_IMDB_Rating)) %>%
  top_n(20)
director.df <- as.data.frame(director.imdb)
# Only director_name needs a display name; Average_IMDB_Rating is kept as is
names(director.df) <- sub("^director_name$", "Director", names(director.df))
director.table <- formattable(director.df, list(Average_IMDB_Rating = color_bar("lightgreen")))
director.table
| Director | Average_IMDB_Rating |
|---|---|
| Akira Kurosawa | 8.700000 |
| Charles Chaplin | 8.600000 |
| Tony Kaye | 8.600000 |
| Alfred Hitchcock | 8.500000 |
| Damien Chazelle | 8.500000 |
| Majid Majidi | 8.500000 |
| Ron Fricke | 8.500000 |
| Sergio Leone | 8.433333 |
| Christopher Nolan | 8.425000 |
| Asghar Farhadi | 8.400000 |
| Marius A. Markevicius | 8.400000 |
| Richard Marquand | 8.400000 |
| Billy Wilder | 8.300000 |
| Fritz Lang | 8.300000 |
| Lee Unkrich | 8.300000 |
| Lenny Abrahamson | 8.300000 |
| Pete Docter | 8.233333 |
| Hayao Miyazaki | 8.225000 |
| Elia Kazan | 8.200000 |
| George Roy Hill | 8.200000 |
| Joshua Oppenheimer | 8.200000 |
| Juan José Campanella | 8.200000 |
| Quentin Tarantino | 8.200000 |
# Number of user reviews against IMDB score, coloured by country, with one
# linear fit per country; the y axis is limited to 0-1500
imdb.user <- ggplot(movie, aes(x = imdb_score, y = num_user_for_reviews,
                               colour = factor(country))) +
  geom_point(aes(colour = factor(country)), size = 0.7) +
  geom_smooth(method = "lm", se = FALSE) +
  xlab("IMDB Score") +
  ylab("Number of User Reviews") +
  ylim(0, 1500) +
  labs(color = "Country\n")
ggplotly(imdb.user)
# Lead-actor Facebook likes against movie Facebook likes, with a linear fit
ggplot(movie, aes(x = actor_1_facebook_likes, y = movie_facebook_likes)) +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE) +
  xlim(0, 50000) +
  ylim(0, 200000)
# Unique director and actor names
# Number of distinct directors
uniqueN(movie$director_name)
## [1] 1709
# NOTE(review): uniqueN() on a data frame counts distinct rows, so this is the
# number of distinct (actor_1, actor_2, actor_3) combinations rather than the
# number of distinct actor names; confirm which figure is intended.
uniqueN(movie[, c("actor_1_name", "actor_2_name", "actor_3_name")])
## [1] 3713
# Drop the name, text and link columns and the two derived profit columns
movie <- movie %>%
  select(
    -actor_1_name, -actor_2_name, -actor_3_name, -director_name,
    -plot_keywords, -movie_imdb_link, -movie_title,
    -profit, -return_on_investment_perc
  )
# Correlation plot of the remaining variables
ggcorr(movie, label = TRUE, label_round = 3, label_size = 3, size = 2, hjust = .85) +
  labs(title = "Correlation between continuous variables") +
  theme(plot.title = element_text(hjust = 0.5))
# Derived predictors: likes of the second and third actor are summed, and the
# critic review count is expressed relative to the user review count. The
# columns they are built from, and cast_total_facebook_likes, are dropped.
movie <- movie %>%
  mutate(
    other_actor_facebook_likes = actor_2_facebook_likes + actor_3_facebook_likes,
    critic_total_ratio = num_critic_for_reviews / num_user_for_reviews
  ) %>%
  select(
    -cast_total_facebook_likes, -actor_2_facebook_likes, -actor_3_facebook_likes,
    -num_critic_for_reviews, -num_user_for_reviews
  )
# Score categories: (0,4] LOW, (4,7] MEDIUM, (7,9] HIGH, (9,10] EXCELLENT
movie <- mutate(
  movie,
  Rating_Category = cut(
    imdb_score,
    breaks = c(0, 4, 7, 9, 10),
    labels = c("LOW", "MEDIUM", "HIGH", "EXCELLENT")
  )
)
# Modelling table: the category replaces the raw score
movie.final <- select(movie, -imdb_score)
## Splitting data: 80% of the rows go to training, the rest to testing
set.seed(12941211)
training.samples <- createDataPartition(movie.final$Rating_Category, p = 0.8, list = FALSE)
train.data <- movie.final[training.samples, ]
test.data <- movie.final[-training.samples, ]
## Multinomial logistic regression on all remaining predictors
# NOTE(review): the recorded fit log ends with "stopped after 100 iterations",
# i.e. the optimiser hit its iteration limit.
model.multi <- nnet::multinom(Rating_Category ~ ., data = train.data)
## # weights: 76 (54 variable)
## initial value 4222.652624
## iter 10 value 2920.708543
## iter 20 value 2665.806594
## iter 30 value 2516.030026
## iter 40 value 2001.743398
## iter 50 value 1910.007769
## iter 60 value 1841.789704
## iter 70 value 1789.350358
## iter 80 value 1773.504074
## iter 90 value 1731.416160
## iter 100 value 1699.949605
## final value 1699.949605
## stopped after 100 iterations
# Coefficient table for the multinomial model, one row per outcome level and term.
# NOTE(review): the estimates printed below equal exp() of the coefficients shown
# by summary(model.multi) (e.g. 0.989 vs -0.0108 for the MEDIUM intercept), so they
# appear to be on the odds-ratio scale; confirm for the installed broom version.
tidy(model.multi)
## # A tibble: 54 x 6
## y.level term estimate std.error statistic p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 MEDIUM (Intercept) 0.989 3.52e-11 -3.07e 8 0.
## 2 MEDIUM duration 1.04 3.83e- 9 1.09e 7 0.
## 3 MEDIUM director_facebook_likes 1.00 2.92e- 8 3.08e 4 0.
## 4 MEDIUM actor_1_facebook_likes 1.00 4.50e- 7 2.98e 1 1.24e-195
## 5 MEDIUM gross 1.00 3.97e- 9 4.54e 0 5.50e- 6
## 6 MEDIUM num_voted_users 1.00 9.95e- 7 3.21e 0 1.33e- 3
## 7 MEDIUM facenumber_in_poster 0.970 4.84e-11 -6.31e 8 0.
## 8 MEDIUM countryUSA 1.46 2.15e-11 1.77e10 0.
## 9 MEDIUM countryOthers 0.864 7.92e-12 -1.85e10 0.
## 10 MEDIUM content_ratingNC-17 0.972 1.74e-13 -1.64e11 0.
## # ... with 44 more rows
# The same coefficient table, rendered in full as a formatted table
formattable(tidy(model.multi))
| y.level | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| MEDIUM | (Intercept) | 0.9892445 | 3.517646e-11 | -3.074156e+08 | 0.000000e+00 |
| MEDIUM | duration | 1.0426545 | 3.827218e-09 | 1.091390e+07 | 0.000000e+00 |
| MEDIUM | director_facebook_likes | 1.0008991 | 2.921973e-08 | 3.075556e+04 | 0.000000e+00 |
| MEDIUM | actor_1_facebook_likes | 1.0000134 | 4.497398e-07 | 2.983838e+01 | 1.242201e-195 |
| MEDIUM | gross | 1.0000000 | 3.973005e-09 | 4.544628e+00 | 5.503245e-06 |
| MEDIUM | num_voted_users | 1.0000032 | 9.954347e-07 | 3.209122e+00 | 1.331411e-03 |
| MEDIUM | facenumber_in_poster | 0.9699032 | 4.841334e-11 | -6.312103e+08 | 0.000000e+00 |
| MEDIUM | countryUSA | 1.4632500 | 2.154944e-11 | 1.766450e+10 | 0.000000e+00 |
| MEDIUM | countryOthers | 0.8638422 | 7.916096e-12 | -1.848956e+10 | 0.000000e+00 |
| MEDIUM | content_ratingNC-17 | 0.9718679 | 1.743993e-13 | -1.636211e+11 | 0.000000e+00 |
| MEDIUM | content_ratingPG | 0.7614068 | 6.527097e-12 | -4.176244e+10 | 0.000000e+00 |
| MEDIUM | content_ratingPG-13 | 1.3367575 | 6.915194e-12 | 4.197234e+10 | 0.000000e+00 |
| MEDIUM | content_ratingR | 1.1863947 | 2.198379e-11 | 7.774775e+09 | 0.000000e+00 |
| MEDIUM | budget | 1.0000000 | 8.054651e-10 | 1.427681e-01 | 8.864733e-01 |
| MEDIUM | title_year | 0.9991708 | 7.070155e-08 | -1.173341e+04 | 0.000000e+00 |
| MEDIUM | movie_facebook_likes | 0.9999518 | 1.573514e-06 | -3.062019e+01 | 6.593292e-206 |
| MEDIUM | other_actor_facebook_likes | 1.0000329 | 1.360915e-07 | 2.421073e+02 | 0.000000e+00 |
| MEDIUM | critic_total_ratio | 1.1175039 | 4.607877e-11 | 2.411035e+09 | 0.000000e+00 |
| HIGH | (Intercept) | 1.0117834 | 3.370811e-11 | 3.475271e+08 | 0.000000e+00 |
| HIGH | duration | 1.0704804 | 3.698383e-09 | 1.841548e+07 | 0.000000e+00 |
| HIGH | director_facebook_likes | 1.0009535 | 2.924924e-08 | 3.258406e+04 | 0.000000e+00 |
| HIGH | actor_1_facebook_likes | 1.0000093 | 4.382051e-07 | 2.129586e+01 | 1.240183e-100 |
| HIGH | gross | 1.0000000 | 4.028574e-09 | 2.249960e+00 | 2.445148e-02 |
| HIGH | num_voted_users | 1.0000177 | 8.749810e-07 | 2.024578e+01 | 3.870046e-91 |
| HIGH | facenumber_in_poster | 0.8632704 | 4.589145e-11 | -3.203806e+09 | 0.000000e+00 |
| HIGH | countryUSA | 0.6533679 | 2.007724e-11 | -2.119888e+10 | 0.000000e+00 |
| HIGH | countryOthers | 1.2105530 | 7.973191e-12 | 2.396497e+10 | 0.000000e+00 |
| HIGH | content_ratingNC-17 | 1.0406276 | 1.603659e-13 | 2.483323e+11 | 0.000000e+00 |
| HIGH | content_ratingPG | 1.2231320 | 6.229889e-12 | 3.233040e+10 | 0.000000e+00 |
| HIGH | content_ratingPG-13 | 0.5768620 | 6.045868e-12 | -9.099639e+10 | 0.000000e+00 |
| HIGH | content_ratingR | 1.1819153 | 2.154356e-11 | 7.758062e+09 | 0.000000e+00 |
| HIGH | budget | 1.0000000 | 1.446472e-09 | -2.359209e+00 | 1.831395e-02 |
| HIGH | title_year | 0.9972185 | 6.775464e-08 | -4.110981e+04 | 0.000000e+00 |
| HIGH | movie_facebook_likes | 0.9999549 | 1.569223e-06 | -2.874106e+01 | 1.171551e-181 |
| HIGH | other_actor_facebook_likes | 1.0000213 | 1.331229e-07 | 1.598463e+02 | 0.000000e+00 |
| HIGH | critic_total_ratio | 1.3445071 | 4.416486e-11 | 6.702782e+09 | 0.000000e+00 |
| EXCELLENT | (Intercept) | 0.9999076 | 9.937065e-13 | -9.301134e+07 | 0.000000e+00 |
| EXCELLENT | duration | 1.1050211 | 6.963051e-11 | 1.434206e+09 | 0.000000e+00 |
| EXCELLENT | director_facebook_likes | 1.0006762 | 2.034928e-09 | 3.322070e+05 | 0.000000e+00 |
| EXCELLENT | actor_1_facebook_likes | 0.9997078 | 6.475285e-09 | -4.513499e+04 | 0.000000e+00 |
| EXCELLENT | gross | 1.0000000 | 1.747296e-08 | -1.663309e+00 | 9.625058e-02 |
| EXCELLENT | num_voted_users | 1.0000265 | 1.316155e-06 | 2.012671e+01 | 4.306684e-90 |
| EXCELLENT | facenumber_in_poster | 0.9251474 | 9.939533e-13 | -7.827550e+10 | 0.000000e+00 |
| EXCELLENT | countryUSA | 1.0027126 | 9.179291e-14 | 2.951105e+10 | 0.000000e+00 |
| EXCELLENT | countryOthers | 1.0025158 | 6.455600e-13 | 3.892264e+09 | 0.000000e+00 |
| EXCELLENT | content_ratingNC-17 | 0.9993790 | 8.514029e-15 | -7.295928e+10 | 0.000000e+00 |
| EXCELLENT | content_ratingPG | 1.0012310 | 3.901479e-13 | 3.153344e+09 | 0.000000e+00 |
| EXCELLENT | content_ratingPG-13 | 1.0017692 | 2.251502e-13 | 7.850927e+09 | 0.000000e+00 |
| EXCELLENT | content_ratingR | 0.9976242 | 8.531311e-13 | -2.788156e+09 | 0.000000e+00 |
| EXCELLENT | budget | 0.9999994 | 1.996783e-07 | -2.989241e+00 | 2.796711e-03 |
| EXCELLENT | title_year | 0.9943303 | 1.976160e-09 | -2.877214e+06 | 0.000000e+00 |
| EXCELLENT | movie_facebook_likes | 0.9999698 | 4.502810e-08 | -6.702105e+02 | 0.000000e+00 |
| EXCELLENT | other_actor_facebook_likes | 1.0002910 | 2.080441e-09 | 1.398712e+05 | 0.000000e+00 |
| EXCELLENT | critic_total_ratio | 1.0300457 | 8.158123e-13 | 3.628673e+10 | 0.000000e+00 |
# Summarize the model: coefficients and standard errors per outcome level,
# followed by the residual deviance and AIC
summary(model.multi)
## Call:
## nnet::multinom(formula = Rating_Category ~ ., data = train.data)
##
## Coefficients:
## (Intercept) duration director_facebook_likes
## MEDIUM -1.081380e-02 0.04176988 0.0008986693
## HIGH 1.171448e-02 0.06810750 0.0009530592
## EXCELLENT -9.242597e-05 0.09986446 0.0006760173
## actor_1_facebook_likes gross num_voted_users
## MEDIUM 1.341951e-05 1.805583e-08 3.194471e-06
## HIGH 9.331952e-06 9.064131e-09 1.771467e-05
## EXCELLENT -2.922619e-04 -2.906294e-08 2.648986e-05
## facenumber_in_poster countryUSA countryOthers
## MEDIUM -0.03055900 0.380659992 -0.14636514
## HIGH -0.14702730 -0.425614918 0.19107726
## EXCELLENT -0.07780219 0.002708905 0.00251269
## content_ratingNC-17 content_ratingPG content_ratingPG-13
## MEDIUM -0.0285354089 -0.272587502 0.290246895
## HIGH 0.0398240272 0.201414817 -0.550152167
## EXCELLENT -0.0006211775 0.001230271 0.001767638
## content_ratingR budget title_year movie_facebook_likes
## MEDIUM 0.170919016 1.149947e-10 -0.000829570 -4.818130e-05
## HIGH 0.167136300 -3.412529e-09 -0.002785380 -4.510114e-05
## EXCELLENT -0.002378662 -5.968867e-07 -0.005685836 -3.017831e-05
## other_actor_facebook_likes critic_total_ratio
## MEDIUM 3.294875e-05 0.11109752
## HIGH 2.127920e-05 0.29602748
## EXCELLENT 2.909938e-04 0.02960317
##
## Std. Errors:
## (Intercept) duration director_facebook_likes
## MEDIUM 3.517646e-11 3.827218e-09 2.921973e-08
## HIGH 3.370811e-11 3.698383e-09 2.924924e-08
## EXCELLENT 9.937065e-13 6.963051e-11 2.034928e-09
## actor_1_facebook_likes gross num_voted_users
## MEDIUM 4.497398e-07 3.973005e-09 9.954347e-07
## HIGH 4.382051e-07 4.028574e-09 8.749810e-07
## EXCELLENT 6.475285e-09 1.747296e-08 1.316155e-06
## facenumber_in_poster countryUSA countryOthers
## MEDIUM 4.841334e-11 2.154944e-11 7.916096e-12
## HIGH 4.589145e-11 2.007724e-11 7.973191e-12
## EXCELLENT 9.939533e-13 9.179291e-14 6.455600e-13
## content_ratingNC-17 content_ratingPG content_ratingPG-13
## MEDIUM 1.743993e-13 6.527097e-12 6.915194e-12
## HIGH 1.603659e-13 6.229889e-12 6.045868e-12
## EXCELLENT 8.514029e-15 3.901479e-13 2.251502e-13
## content_ratingR budget title_year movie_facebook_likes
## MEDIUM 2.198379e-11 8.054651e-10 7.070155e-08 1.573514e-06
## HIGH 2.154356e-11 1.446472e-09 6.775464e-08 1.569223e-06
## EXCELLENT 8.531311e-13 1.996783e-07 1.976160e-09 4.502810e-08
## other_actor_facebook_likes critic_total_ratio
## MEDIUM 1.360915e-07 4.607877e-11
## HIGH 1.331229e-07 4.416486e-11
## EXCELLENT 2.080441e-09 8.158123e-13
##
## Residual Deviance: 3399.899
## AIC: 3507.899
# Predict the rating category of the held-out rows
predicted.classes <- predict(model.multi, test.data)
head(predicted.classes)
## [1] HIGH HIGH HIGH HIGH MEDIUM MEDIUM
## Levels: LOW MEDIUM HIGH EXCELLENT
# Model accuracy: share of test rows whose predicted category is the true one
mean(predicted.classes == test.data$Rating_Category)
## [1] 0.7565789
# Classification tree on all predictors, with its complexity-parameter plot
rpart.fit <- rpart(Rating_Category ~ ., data = train.data, method = "class")
plotcp(rpart.fit)
# Prune at cp = 0.01 and draw the pruned tree
rpart.fit.2 <- prune(rpart.fit, cp = 0.01)
rpart.plot(rpart.fit.2, extra = 104)
# Prediction on the test set and confusion matrix (rows: actual, columns: predicted)
predict_unseen <- predict(rpart.fit.2, newdata = test.data, type = "class")
table_mat <- table(test.data$Rating_Category, predict_unseen)
table_mat
## predict_unseen
## LOW MEDIUM HIGH EXCELLENT
## LOW 0 19 0 0
## MEDIUM 0 465 46 0
## HIGH 0 106 124 0
## EXCELLENT 0 0 0 0
# Accuracy: correctly classified rows (the diagonal) over all test rows
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
print(paste("Accuracy for test", accuracy_Test))
## [1] "Accuracy for test 0.775"
# Hyperparameter tuning
# Accuracy of a fitted classifier on a labelled data set.
#   fit  : model object whose predict() method accepts type = 'class'
#   data : data frame holding the predictors and the true Rating_Category
#          column; defaults to the global test.data, so existing calls of the
#          form accuracy_tune(fit) behave as before
# Returns the diagonal of the actual-vs-predicted table divided by its total,
# i.e. the share of rows classified correctly.
accuracy_tune <- function(fit, data = test.data) {
  predicted <- predict(fit, data, type = 'class')
  confusion <- table(data$Rating_Category, predicted)
  sum(diag(confusion)) / sum(confusion)
}
# Tree fitted with explicit control settings, scored on the test set
control <- rpart.control(
  minsplit = 20, minbucket = round(20 / 3), maxdepth = 20, cp = 0.01
)
tune_fit <- rpart(
  Rating_Category ~ .,
  data = train.data, method = "class", control = control
)
accuracy_tune(tune_fit)
## [1] 0.775
# Random forest baseline: 10-fold cross-validation with caret's default mtry grid
trControl <- trainControl(method = "cv", number = 10, search = "grid")
rf_default <- train(
  Rating_Category ~ .,
  data = train.data,
  method = "rf",
  metric = "Accuracy",
  trControl = trControl
)
print(rf_default)
## Random Forest
##
## 3046 samples
## 13 predictor
## 4 classes: 'LOW', 'MEDIUM', 'HIGH', 'EXCELLENT'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2742, 2742, 2741, 2741, 2741, 2740, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8023682 0.5085675
## 9 0.8161400 0.5579987
## 17 0.8154885 0.5579506
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 9.
# Best mtry: cross-validate mtry = 1..10 with nodesize and ntree held fixed
tuneGrid <- expand.grid(.mtry = seq_len(10))
rf_mtry <- train(
  Rating_Category ~ .,
  data = train.data, method = "rf", metric = "Accuracy",
  tuneGrid = tuneGrid, trControl = trControl,
  importance = TRUE, nodesize = 14, ntree = 300
)
print(rf_mtry)
## Random Forest
##
## 3046 samples
## 13 predictor
## 4 classes: 'LOW', 'MEDIUM', 'HIGH', 'EXCELLENT'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2742, 2740, 2742, 2741, 2742, 2742, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.7337704 0.2587794
## 2 0.7980966 0.5003992
## 3 0.8053120 0.5314711
## 4 0.8040015 0.5292556
## 5 0.8095775 0.5440927
## 6 0.8072889 0.5390819
## 7 0.8030126 0.5290586
## 8 0.8085896 0.5430727
## 9 0.8072856 0.5400455
## 10 0.8092464 0.5451680
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 5.
# mtry selected by caret, and the best cross-validated accuracy of the search
best_mtry <- rf_mtry$bestTune[["mtry"]]
best_mtry
## [1] 5
max(rf_mtry$results[["Accuracy"]])
## [1] 0.8095775
# Best max nodes
# Refit the forest for maxnodes = 5..30 with mtry fixed at best_mtry. The seed
# is reset before every fit; each model is stored under its maxnodes value.
store_maxnode <- list()
tuneGrid <- expand.grid(.mtry = best_mtry)
for (maxnodes in seq(5, 30)) {
  set.seed(1234)
  rf_maxnode <- train(
    Rating_Category ~ .,
    data = train.data, method = "rf", metric = "Accuracy",
    tuneGrid = tuneGrid, trControl = trControl,
    importance = TRUE, nodesize = 14, maxnodes = maxnodes, ntree = 300
  )
  current_iteration <- as.character(maxnodes)
  store_maxnode[[current_iteration]] <- rf_maxnode
}
results_mtry <- resamples(store_maxnode)
# Original remark: best max node = 27.
# NOTE(review): in the summary recorded below, the highest mean accuracy is at
# maxnodes = 29 (0.7830) rather than 27 (0.7784); confirm the choice.
summary(results_mtry)
##
## Call:
## summary.resamples(object = results_mtry)
##
## Models: 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.7311475 0.7335526 0.7450658 0.7442499 0.7502063 0.7647059 0
## 6 0.7311475 0.7391340 0.7458935 0.7488488 0.7506148 0.7777778 0
## 7 0.7180328 0.7450820 0.7508277 0.7498421 0.7590461 0.7712418 0
## 8 0.7278689 0.7415930 0.7458935 0.7498421 0.7598684 0.7712418 0
## 9 0.7278689 0.7418033 0.7508277 0.7504978 0.7615132 0.7712418 0
## 10 0.7278689 0.7477472 0.7516447 0.7537776 0.7608930 0.7810458 0
## 11 0.7377049 0.7457088 0.7500000 0.7544366 0.7658272 0.7777778 0
## 12 0.7344262 0.7510246 0.7586227 0.7590397 0.7672697 0.7788779 0
## 13 0.7344262 0.7532895 0.7536939 0.7580475 0.7604589 0.7810458 0
## 14 0.7377049 0.7506188 0.7664474 0.7623206 0.7670217 0.7843137 0
## 15 0.7344262 0.7561436 0.7622951 0.7626517 0.7689145 0.7854785 0
## 16 0.7377049 0.7543114 0.7619068 0.7616745 0.7656250 0.7920792 0
## 17 0.7278689 0.7567785 0.7668302 0.7636417 0.7754934 0.7821782 0
## 18 0.7409836 0.7606557 0.7639426 0.7652768 0.7754934 0.7821782 0
## 19 0.7377049 0.7575753 0.7717591 0.7679063 0.7754934 0.7920792 0
## 20 0.7442623 0.7571856 0.7763158 0.7692263 0.7801472 0.7854785 0
## 21 0.7344262 0.7639803 0.7717591 0.7679116 0.7743250 0.7887789 0
## 22 0.7409836 0.7666388 0.7750431 0.7721761 0.7820724 0.7887789 0
## 23 0.7442623 0.7661524 0.7763158 0.7734800 0.7826062 0.7908497 0
## 24 0.7409836 0.7677917 0.7796053 0.7764438 0.7865879 0.7953795 0
## 25 0.7540984 0.7676084 0.7796053 0.7764384 0.7841369 0.7953795 0
## 26 0.7442623 0.7722039 0.7832506 0.7770995 0.7841369 0.7920792 0
## 27 0.7475410 0.7763158 0.7816059 0.7784153 0.7857166 0.7986799 0
## 28 0.7508197 0.7766813 0.7823208 0.7793892 0.7861842 0.7934426 0
## 29 0.7508197 0.7828947 0.7865347 0.7830109 0.7906782 0.7986799 0
## 30 0.7442623 0.7770492 0.7786915 0.7797301 0.7906508 0.8019802 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5 0.3038080 0.3110557 0.3211435 0.3293072 0.3389558 0.3820263 0
## 6 0.2994005 0.3266093 0.3404446 0.3466744 0.3469747 0.4200346 0
## 7 0.2605435 0.3196957 0.3513013 0.3456145 0.3804703 0.3953593 0
## 8 0.2840578 0.3256981 0.3347067 0.3418222 0.3728684 0.3991922 0
## 9 0.2840578 0.3104011 0.3449943 0.3438882 0.3735345 0.4067139 0
## 10 0.2840578 0.3338539 0.3477464 0.3550096 0.3817340 0.4303576 0
## 11 0.3164883 0.3328161 0.3455571 0.3587727 0.3882184 0.4236650 0
## 12 0.3057468 0.3493581 0.3732836 0.3739647 0.4044428 0.4272502 0
## 13 0.3057468 0.3545436 0.3601373 0.3689012 0.3770265 0.4303576 0
## 14 0.3164883 0.3483595 0.3870311 0.3810177 0.4018390 0.4406160 0
## 15 0.3013094 0.3623680 0.3827244 0.3827178 0.4104582 0.4360773 0
## 16 0.3121335 0.3550282 0.3804558 0.3771852 0.3889947 0.4534288 0
## 17 0.2840578 0.3731348 0.3832702 0.3848875 0.4228788 0.4306959 0
## 18 0.3185610 0.3663294 0.4005031 0.3901713 0.4178062 0.4292320 0
## 19 0.3164883 0.3789645 0.3957188 0.4000552 0.4316920 0.4603511 0
## 20 0.3335761 0.3828554 0.4118310 0.4043745 0.4408083 0.4432194 0
## 21 0.3101282 0.3911393 0.4110093 0.4005630 0.4229905 0.4465280 0
## 22 0.3228889 0.3999536 0.4161073 0.4141306 0.4458515 0.4535156 0
## 23 0.3377686 0.4018489 0.4295561 0.4180323 0.4408083 0.4609414 0
## 24 0.3228889 0.4061652 0.4363068 0.4269853 0.4574324 0.4774352 0
## 25 0.3612298 0.4026166 0.4407121 0.4268119 0.4444558 0.4705932 0
## 26 0.3335761 0.4184364 0.4414539 0.4297669 0.4583489 0.4637469 0
## 27 0.3400309 0.4190926 0.4402783 0.4327542 0.4550086 0.4807709 0
## 28 0.3506639 0.4214303 0.4522172 0.4358184 0.4607222 0.4627279 0
## 29 0.3506639 0.4389885 0.4535088 0.4455447 0.4692568 0.4838014 0
## 30 0.3250298 0.4227438 0.4423825 0.4372653 0.4708082 0.4908847 0
# Tune ntree: refit the random forest once per candidate tree count and compare
# the cross-validated Accuracy / Kappa of all fits side by side.
# nodesize = 14 and maxnodes = 27 are held fixed; tuneGrid and trControl are
# defined outside this block.
store_maxtrees <- list()
for (ntree in c(250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000)) {
  # Re-seed before every fit so each train() call draws the same resampling
  # folds. resamples() compares the models resample-by-resample, which is only
  # meaningful when they share folds (unless trControl already pins them via
  # `index`). The seed value itself is arbitrary.
  set.seed(5678)
  rf_maxtrees <- train(Rating_Category ~ .,
    data = train.data,
    method = "rf",
    metric = "Accuracy",
    tuneGrid = tuneGrid,
    trControl = trControl,
    importance = TRUE,
    nodesize = 14,
    maxnodes = 27,
    ntree = ntree
  )
  # List names must be character, so key each fit by its tree count as a string.
  key <- toString(ntree)
  store_maxtrees[[key]] <- rf_maxtrees
}
results_tree <- resamples(store_maxtrees)
# Read the Mean column of the Accuracy table to choose ntree
# (the final model fitted below uses ntree = 600).
summary(results_tree)
##
## Call:
## summary.resamples(object = results_tree)
##
## Models: 250, 300, 350, 400, 450, 500, 550, 600, 800, 1000, 2000
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 250 0.7442623 0.7600572 0.7766825 0.7741184 0.7867100 0.8006536 0
## 300 0.7467105 0.7692800 0.7855995 0.7787249 0.7893000 0.7927632 0
## 350 0.7385621 0.7555409 0.7858260 0.7761217 0.7954001 0.8000000 0
## 400 0.7540984 0.7666388 0.7812747 0.7800304 0.7957318 0.8032787 0
## 450 0.7524752 0.7749613 0.7790528 0.7767523 0.7818951 0.7960526 0
## 500 0.7540984 0.7608959 0.7796053 0.7757882 0.7870559 0.7960526 0
## 550 0.7532895 0.7665783 0.7783218 0.7790623 0.7911184 0.8059211 0
## 600 0.7532895 0.7694888 0.7774135 0.7770800 0.7871872 0.7967213 0
## 800 0.7598684 0.7713115 0.7803363 0.7793692 0.7891690 0.7967213 0
## 1000 0.7467105 0.7604834 0.7774188 0.7757508 0.7918033 0.8026316 0
## 2000 0.7483660 0.7656250 0.7750377 0.7784110 0.7893443 0.8151815 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 250 0.3335761 0.3918098 0.4253758 0.4199746 0.4579638 0.5002008 0
## 300 0.3267372 0.4166568 0.4439783 0.4320734 0.4637282 0.4748029 0
## 350 0.3435237 0.3748983 0.4433312 0.4259692 0.4749451 0.4963591 0
## 400 0.3551252 0.3957342 0.4406518 0.4363755 0.4777855 0.4940778 0
## 450 0.3534667 0.4148998 0.4316204 0.4265428 0.4425902 0.4776777 0
## 500 0.3571730 0.3823004 0.4351721 0.4240755 0.4539143 0.4873106 0
## 550 0.3848395 0.3979648 0.4348898 0.4336783 0.4608371 0.4885366 0
## 600 0.3646901 0.3885564 0.4420480 0.4281268 0.4589142 0.4873106 0
## 800 0.3657979 0.4125172 0.4338222 0.4349548 0.4644857 0.4927575 0
## 1000 0.3324969 0.3775106 0.4307925 0.4241780 0.4642420 0.5214985 0
## 2000 0.3411995 0.3954195 0.4208439 0.4310572 0.4574885 0.5336027 0
# Final random forest: refit with the chosen hyperparameters
# (ntree = 600, nodesize = 14, maxnodes = 27), then score the hold-out set.
# tuneGrid and trControl come from outside this block.
fit_rf <- train(
  Rating_Category ~ .,
  data = train.data,
  method = "rf",
  metric = "Accuracy",
  trControl = trControl,
  tuneGrid = tuneGrid,
  ntree = 600,
  nodesize = 14,
  maxnodes = 27,
  importance = TRUE
)

# Predicted classes for the test rows, cross-tabulated against the true labels.
prediction.rf <- predict(fit_rf, newdata = test.data)
confusionMatrix(data = prediction.rf, reference = test.data$Rating_Category)
## Confusion Matrix and Statistics
##
## Reference
## Prediction LOW MEDIUM HIGH EXCELLENT
## LOW 0 0 0 0
## MEDIUM 19 484 117 0
## HIGH 0 27 113 0
## EXCELLENT 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7855
## 95% CI : (0.7546, 0.8142)
## No Information Rate : 0.6724
## P-Value [Acc > NIR] : 3.65e-12
##
## Kappa : 0.458
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: LOW Class: MEDIUM Class: HIGH Class: EXCELLENT
## Sensitivity 0.000 0.9472 0.4913 NA
## Specificity 1.000 0.4538 0.9491 1
## Pos Pred Value NaN 0.7806 0.8071 NA
## Neg Pred Value 0.975 0.8071 0.8113 NA
## Prevalence 0.025 0.6724 0.3026 0
## Detection Rate 0.000 0.6368 0.1487 0
## Detection Prevalence 0.000 0.8158 0.1842 0
## Balanced Accuracy 0.500 0.7005 0.7202 NA
#78.55% Accuracy
# Per-class variable importance of the caret-tuned forest (fit_rf); the printed
# table is sorted by each variable's maximum importance across the classes.
varImp(fit_rf)
## rf variable importance
##
## variables are sorted by maximum importance across the classes
## LOW MEDIUM HIGH EXCELLENT
## num_voted_users 5.507 72.322 100.000 9.905
## budget 2.050 50.393 19.465 4.341
## director_facebook_likes 4.910 11.302 42.414 7.370
## duration 8.121 37.454 36.981 7.842
## movie_facebook_likes 9.225 22.174 30.460 6.087
## gross 6.540 28.109 1.046 4.341
## countryUSA 1.942 1.152 27.687 4.341
## title_year 0.000 24.371 17.739 6.812
## critic_total_ratio 9.302 10.574 22.247 9.615
## other_actor_facebook_likes 5.749 17.832 22.054 4.341
## content_ratingPG-13 5.497 21.144 12.749 6.087
## actor_1_facebook_likes 6.285 13.300 17.250 6.087
## content_ratingR 6.810 15.505 11.705 6.812
## countryOthers 4.341 6.805 13.599 4.341
## facenumber_in_poster 6.087 10.775 7.187 4.341
## content_ratingPG 4.341 3.034 8.378 4.341
## content_ratingNC-17 4.341 1.045 3.353 4.341
# Stand-alone randomForest fit (mtry = 4, other settings left at the package
# defaults) used here to chart Gini-based variable importance.
rf <- randomForest(Rating_Category ~ ., data = train.data, mtry = 4)

# Importance matrix of the fit. The variable shares its name with the
# importance() function; R still resolves calls to the function.
importance <- importance(rf)

# One row per predictor: its name and its mean decrease in Gini (2 decimals).
varImportance <- data.frame(
  Variables = rownames(importance),
  Importance = round(importance[, "MeanDecreaseGini"], 2)
)

# Attach a "#k" label; dense ranking on descending importance, so ties share
# a rank and the most important variable is "#1".
rankImportance <- mutate(
  varImportance,
  Rank = paste0("#", dense_rank(desc(Importance)))
)

# Horizontal bar chart, bars ordered by importance, rank label printed in red
# near the base of each bar.
ggplot(
  data = rankImportance,
  mapping = aes(
    x = reorder(Variables, Importance),
    y = Importance,
    fill = Importance
  )
) +
  geom_col() +
  geom_text(
    mapping = aes(x = Variables, y = 0.5, label = Rank),
    colour = "red",
    size = 4,
    hjust = 0,
    vjust = 0.55
  ) +
  labs(x = "Variables") +
  coord_flip() +
  theme_few()
# Gradient boosting via caret (method = "gbm"), tuned over caret's default
# grid with 10-fold cross-validation.
# NOTE(review): method = "repeatedcv" is used without `repeats`, so this looks
# like a single pass of 10-fold CV -- set `repeats` explicitly if repetition
# was intended.
tc <- trainControl(method = "repeatedcv", number = 10)

# verbose = FALSE is not a train() option: it is passed through `...` to gbm
# and silences the per-iteration deviance trace that otherwise floods the
# output for every resample fit.
gbm.model <- train(Rating_Category ~ .,
  data = train.data,
  method = "gbm",
  trControl = tc,
  verbose = FALSE
)
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2960
## 2 1.2127 nan 0.1000 0.2067
## 3 1.0895 nan 0.1000 0.1532
## 4 0.9963 nan 0.1000 0.1120
## 5 0.9244 nan 0.1000 0.0874
## 6 0.8694 nan 0.1000 0.0704
## 7 0.8249 nan 0.1000 0.0569
## 8 0.7889 nan 0.1000 0.0451
## 9 0.7595 nan 0.1000 0.0376
## 10 0.7335 nan 0.1000 0.0330
## 20 0.6023 nan 0.1000 0.0061
## 40 0.5251 nan 0.1000 -0.0005
## 60 0.4944 nan 0.1000 0.0009
## 80 0.4755 nan 0.1000 -0.0005
## 100 0.4592 nan 0.1000 -0.0005
## 120 0.4469 nan 0.1000 -0.0019
## 140 0.4381 nan 0.1000 -0.0013
## 150 0.4342 nan 0.1000 -0.0009
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2866
## 2 1.2288 nan 0.1000 0.2260
## 3 1.0923 nan 0.1000 0.1660
## 4 0.9899 nan 0.1000 0.1213
## 5 0.9133 nan 0.1000 0.0948
## 6 0.8518 nan 0.1000 0.0758
## 7 0.8023 nan 0.1000 0.0606
## 8 0.7618 nan 0.1000 0.0506
## 9 0.7284 nan 0.1000 0.0447
## 10 0.6980 nan 0.1000 0.0301
## 20 0.5503 nan 0.1000 0.0077
## 40 0.4675 nan 0.1000 -0.0005
## 60 0.4328 nan 0.1000 -0.0004
## 80 0.4088 nan 0.1000 -0.0016
## 100 0.3909 nan 0.1000 -0.0013
## 120 0.3733 nan 0.1000 -0.0011
## 140 0.3597 nan 0.1000 -0.0019
## 150 0.3523 nan 0.1000 -0.0028
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3202
## 2 1.1965 nan 0.1000 0.2222
## 3 1.0621 nan 0.1000 0.1686
## 4 0.9581 nan 0.1000 0.1227
## 5 0.8820 nan 0.1000 0.0960
## 6 0.8203 nan 0.1000 0.0743
## 7 0.7708 nan 0.1000 0.0605
## 8 0.7295 nan 0.1000 0.0524
## 9 0.6950 nan 0.1000 0.0385
## 10 0.6659 nan 0.1000 0.0337
## 20 0.5154 nan 0.1000 0.0072
## 40 0.4284 nan 0.1000 -0.0014
## 60 0.3905 nan 0.1000 -0.0029
## 80 0.3610 nan 0.1000 -0.0006
## 100 0.3377 nan 0.1000 -0.0022
## 120 0.3167 nan 0.1000 -0.0031
## 140 0.2972 nan 0.1000 -0.0033
## 150 0.2902 nan 0.1000 -0.0025
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2731
## 2 1.2319 nan 0.1000 0.2144
## 3 1.1001 nan 0.1000 0.1504
## 4 1.0017 nan 0.1000 0.1152
## 5 0.9290 nan 0.1000 0.0881
## 6 0.8728 nan 0.1000 0.0699
## 7 0.8269 nan 0.1000 0.0559
## 8 0.7906 nan 0.1000 0.0478
## 9 0.7594 nan 0.1000 0.0368
## 10 0.7329 nan 0.1000 0.0316
## 20 0.6032 nan 0.1000 0.0107
## 40 0.5250 nan 0.1000 0.0006
## 60 0.4973 nan 0.1000 0.0011
## 80 0.4789 nan 0.1000 -0.0016
## 100 0.4638 nan 0.1000 -0.0012
## 120 0.4528 nan 0.1000 -0.0015
## 140 0.4435 nan 0.1000 -0.0016
## 150 0.4399 nan 0.1000 -0.0009
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3107
## 2 1.2033 nan 0.1000 0.2204
## 3 1.0722 nan 0.1000 0.1563
## 4 0.9738 nan 0.1000 0.1202
## 5 0.8988 nan 0.1000 0.0972
## 6 0.8363 nan 0.1000 0.0708
## 7 0.7887 nan 0.1000 0.0612
## 8 0.7502 nan 0.1000 0.0523
## 9 0.7158 nan 0.1000 0.0410
## 10 0.6875 nan 0.1000 0.0340
## 20 0.5486 nan 0.1000 0.0065
## 40 0.4711 nan 0.1000 -0.0007
## 60 0.4362 nan 0.1000 -0.0006
## 80 0.4126 nan 0.1000 -0.0031
## 100 0.3950 nan 0.1000 -0.0036
## 120 0.3803 nan 0.1000 -0.0014
## 140 0.3655 nan 0.1000 -0.0014
## 150 0.3597 nan 0.1000 -0.0015
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3217
## 2 1.1934 nan 0.1000 0.2246
## 3 1.0558 nan 0.1000 0.1580
## 4 0.9572 nan 0.1000 0.1239
## 5 0.8786 nan 0.1000 0.0909
## 6 0.8164 nan 0.1000 0.0745
## 7 0.7663 nan 0.1000 0.0614
## 8 0.7248 nan 0.1000 0.0474
## 9 0.6915 nan 0.1000 0.0370
## 10 0.6629 nan 0.1000 0.0343
## 20 0.5179 nan 0.1000 0.0055
## 40 0.4322 nan 0.1000 -0.0018
## 60 0.3929 nan 0.1000 -0.0017
## 80 0.3635 nan 0.1000 -0.0029
## 100 0.3418 nan 0.1000 -0.0023
## 120 0.3218 nan 0.1000 -0.0019
## 140 0.3043 nan 0.1000 -0.0026
## 150 0.2971 nan 0.1000 -0.0031
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2911
## 2 1.2091 nan 0.1000 0.2092
## 3 1.0873 nan 0.1000 0.1505
## 4 0.9959 nan 0.1000 0.1103
## 5 0.9262 nan 0.1000 0.0880
## 6 0.8709 nan 0.1000 0.0734
## 7 0.8260 nan 0.1000 0.0545
## 8 0.7883 nan 0.1000 0.0465
## 9 0.7573 nan 0.1000 0.0371
## 10 0.7324 nan 0.1000 0.0307
## 20 0.6007 nan 0.1000 0.0052
## 40 0.5225 nan 0.1000 0.0028
## 60 0.4907 nan 0.1000 -0.0015
## 80 0.4718 nan 0.1000 -0.0013
## 100 0.4570 nan 0.1000 -0.0009
## 120 0.4471 nan 0.1000 -0.0003
## 140 0.4385 nan 0.1000 -0.0012
## 150 0.4343 nan 0.1000 -0.0014
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2848
## 2 1.2301 nan 0.1000 0.2243
## 3 1.0891 nan 0.1000 0.1650
## 4 0.9881 nan 0.1000 0.1228
## 5 0.9103 nan 0.1000 0.0958
## 6 0.8478 nan 0.1000 0.0791
## 7 0.7966 nan 0.1000 0.0595
## 8 0.7565 nan 0.1000 0.0500
## 9 0.7218 nan 0.1000 0.0406
## 10 0.6935 nan 0.1000 0.0314
## 20 0.5444 nan 0.1000 0.0064
## 40 0.4633 nan 0.1000 0.0000
## 60 0.4294 nan 0.1000 -0.0032
## 80 0.4081 nan 0.1000 -0.0011
## 100 0.3907 nan 0.1000 -0.0029
## 120 0.3742 nan 0.1000 -0.0021
## 140 0.3615 nan 0.1000 -0.0020
## 150 0.3564 nan 0.1000 -0.0013
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3216
## 2 1.1930 nan 0.1000 0.2258
## 3 1.0564 nan 0.1000 0.1586
## 4 0.9562 nan 0.1000 0.1207
## 5 0.8816 nan 0.1000 0.0930
## 6 0.8183 nan 0.1000 0.0764
## 7 0.7680 nan 0.1000 0.0607
## 8 0.7261 nan 0.1000 0.0469
## 9 0.6930 nan 0.1000 0.0471
## 10 0.6613 nan 0.1000 0.0320
## 20 0.5145 nan 0.1000 0.0097
## 40 0.4296 nan 0.1000 -0.0023
## 60 0.3952 nan 0.1000 -0.0013
## 80 0.3647 nan 0.1000 -0.0023
## 100 0.3434 nan 0.1000 -0.0004
## 120 0.3212 nan 0.1000 -0.0010
## 140 0.3052 nan 0.1000 -0.0028
## 150 0.2978 nan 0.1000 -0.0019
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2714
## 2 1.2333 nan 0.1000 0.2128
## 3 1.1070 nan 0.1000 0.1565
## 4 1.0096 nan 0.1000 0.1143
## 5 0.9388 nan 0.1000 0.0909
## 6 0.8814 nan 0.1000 0.0723
## 7 0.8337 nan 0.1000 0.0551
## 8 0.7970 nan 0.1000 0.0482
## 9 0.7660 nan 0.1000 0.0390
## 10 0.7406 nan 0.1000 0.0349
## 20 0.6022 nan 0.1000 0.0068
## 40 0.5245 nan 0.1000 0.0012
## 60 0.4935 nan 0.1000 -0.0001
## 80 0.4749 nan 0.1000 0.0006
## 100 0.4609 nan 0.1000 -0.0014
## 120 0.4496 nan 0.1000 -0.0006
## 140 0.4405 nan 0.1000 -0.0012
## 150 0.4355 nan 0.1000 -0.0018
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3060
## 2 1.2048 nan 0.1000 0.2125
## 3 1.0724 nan 0.1000 0.1504
## 4 0.9756 nan 0.1000 0.1135
## 5 0.9019 nan 0.1000 0.0968
## 6 0.8419 nan 0.1000 0.0751
## 7 0.7929 nan 0.1000 0.0607
## 8 0.7542 nan 0.1000 0.0514
## 9 0.7213 nan 0.1000 0.0449
## 10 0.6910 nan 0.1000 0.0366
## 20 0.5487 nan 0.1000 0.0063
## 40 0.4678 nan 0.1000 0.0002
## 60 0.4342 nan 0.1000 -0.0005
## 80 0.4086 nan 0.1000 -0.0018
## 100 0.3908 nan 0.1000 -0.0016
## 120 0.3752 nan 0.1000 -0.0011
## 140 0.3600 nan 0.1000 -0.0023
## 150 0.3532 nan 0.1000 -0.0023
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3172
## 2 1.1948 nan 0.1000 0.2154
## 3 1.0612 nan 0.1000 0.1610
## 4 0.9616 nan 0.1000 0.1185
## 5 0.8849 nan 0.1000 0.0958
## 6 0.8223 nan 0.1000 0.0775
## 7 0.7715 nan 0.1000 0.0629
## 8 0.7310 nan 0.1000 0.0467
## 9 0.6957 nan 0.1000 0.0402
## 10 0.6670 nan 0.1000 0.0381
## 20 0.5176 nan 0.1000 0.0063
## 40 0.4320 nan 0.1000 -0.0012
## 60 0.3952 nan 0.1000 -0.0001
## 80 0.3670 nan 0.1000 -0.0022
## 100 0.3411 nan 0.1000 -0.0017
## 120 0.3227 nan 0.1000 -0.0021
## 140 0.3052 nan 0.1000 -0.0020
## 150 0.2972 nan 0.1000 -0.0027
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2978
## 2 1.2091 nan 0.1000 0.2049
## 3 1.0844 nan 0.1000 0.1482
## 4 0.9930 nan 0.1000 0.1122
## 5 0.9220 nan 0.1000 0.0906
## 6 0.8654 nan 0.1000 0.0692
## 7 0.8220 nan 0.1000 0.0552
## 8 0.7854 nan 0.1000 0.0438
## 9 0.7560 nan 0.1000 0.0354
## 10 0.7313 nan 0.1000 0.0346
## 20 0.5989 nan 0.1000 0.0081
## 40 0.5243 nan 0.1000 0.0017
## 60 0.4938 nan 0.1000 -0.0004
## 80 0.4751 nan 0.1000 -0.0003
## 100 0.4599 nan 0.1000 -0.0010
## 120 0.4500 nan 0.1000 -0.0014
## 140 0.4413 nan 0.1000 -0.0011
## 150 0.4369 nan 0.1000 -0.0007
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2860
## 2 1.2283 nan 0.1000 0.2225
## 3 1.0921 nan 0.1000 0.1654
## 4 0.9905 nan 0.1000 0.1189
## 5 0.9147 nan 0.1000 0.0968
## 6 0.8513 nan 0.1000 0.0806
## 7 0.7992 nan 0.1000 0.0601
## 8 0.7586 nan 0.1000 0.0498
## 9 0.7236 nan 0.1000 0.0399
## 10 0.6945 nan 0.1000 0.0335
## 20 0.5472 nan 0.1000 0.0038
## 40 0.4676 nan 0.1000 -0.0015
## 60 0.4333 nan 0.1000 -0.0021
## 80 0.4098 nan 0.1000 -0.0007
## 100 0.3909 nan 0.1000 -0.0026
## 120 0.3756 nan 0.1000 -0.0014
## 140 0.3622 nan 0.1000 -0.0031
## 150 0.3567 nan 0.1000 -0.0015
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3020
## 2 1.2196 nan 0.1000 0.2311
## 3 1.0775 nan 0.1000 0.1703
## 4 0.9726 nan 0.1000 0.1289
## 5 0.8894 nan 0.1000 0.0968
## 6 0.8261 nan 0.1000 0.0781
## 7 0.7747 nan 0.1000 0.0587
## 8 0.7334 nan 0.1000 0.0504
## 9 0.6969 nan 0.1000 0.0422
## 10 0.6680 nan 0.1000 0.0378
## 20 0.5179 nan 0.1000 0.0059
## 40 0.4323 nan 0.1000 -0.0016
## 60 0.3918 nan 0.1000 -0.0019
## 80 0.3629 nan 0.1000 -0.0009
## 100 0.3397 nan 0.1000 -0.0033
## 120 0.3214 nan 0.1000 -0.0037
## 140 0.3058 nan 0.1000 -0.0040
## 150 0.2969 nan 0.1000 -0.0017
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2947
## 2 1.2084 nan 0.1000 0.2032
## 3 1.0859 nan 0.1000 0.1485
## 4 0.9939 nan 0.1000 0.1024
## 5 0.9271 nan 0.1000 0.0881
## 6 0.8707 nan 0.1000 0.0690
## 7 0.8258 nan 0.1000 0.0559
## 8 0.7901 nan 0.1000 0.0470
## 9 0.7583 nan 0.1000 0.0392
## 10 0.7324 nan 0.1000 0.0322
## 20 0.6008 nan 0.1000 0.0056
## 40 0.5228 nan 0.1000 -0.0003
## 60 0.4916 nan 0.1000 -0.0013
## 80 0.4715 nan 0.1000 -0.0004
## 100 0.4581 nan 0.1000 -0.0006
## 120 0.4464 nan 0.1000 -0.0002
## 140 0.4371 nan 0.1000 -0.0011
## 150 0.4328 nan 0.1000 -0.0017
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3034
## 2 1.2026 nan 0.1000 0.2111
## 3 1.0717 nan 0.1000 0.1525
## 4 0.9753 nan 0.1000 0.1217
## 5 0.8979 nan 0.1000 0.0887
## 6 0.8388 nan 0.1000 0.0774
## 7 0.7897 nan 0.1000 0.0615
## 8 0.7494 nan 0.1000 0.0484
## 9 0.7166 nan 0.1000 0.0425
## 10 0.6878 nan 0.1000 0.0312
## 20 0.5462 nan 0.1000 0.0075
## 40 0.4695 nan 0.1000 0.0006
## 60 0.4345 nan 0.1000 -0.0011
## 80 0.4115 nan 0.1000 -0.0015
## 100 0.3944 nan 0.1000 -0.0015
## 120 0.3772 nan 0.1000 -0.0019
## 140 0.3624 nan 0.1000 -0.0030
## 150 0.3553 nan 0.1000 -0.0021
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3220
## 2 1.1973 nan 0.1000 0.2164
## 3 1.0636 nan 0.1000 0.1689
## 4 0.9614 nan 0.1000 0.1225
## 5 0.8840 nan 0.1000 0.1012
## 6 0.8197 nan 0.1000 0.0774
## 7 0.7699 nan 0.1000 0.0596
## 8 0.7296 nan 0.1000 0.0515
## 9 0.6932 nan 0.1000 0.0404
## 10 0.6641 nan 0.1000 0.0377
## 20 0.5147 nan 0.1000 0.0033
## 40 0.4284 nan 0.1000 -0.0016
## 60 0.3882 nan 0.1000 -0.0038
## 80 0.3600 nan 0.1000 -0.0016
## 100 0.3376 nan 0.1000 -0.0021
## 120 0.3181 nan 0.1000 -0.0027
## 140 0.3026 nan 0.1000 -0.0032
## 150 0.2946 nan 0.1000 -0.0019
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2763
## 2 1.2351 nan 0.1000 0.2153
## 3 1.1050 nan 0.1000 0.1539
## 4 1.0071 nan 0.1000 0.1126
## 5 0.9337 nan 0.1000 0.0892
## 6 0.8765 nan 0.1000 0.0696
## 7 0.8306 nan 0.1000 0.0573
## 8 0.7935 nan 0.1000 0.0454
## 9 0.7627 nan 0.1000 0.0399
## 10 0.7367 nan 0.1000 0.0301
## 20 0.6006 nan 0.1000 0.0082
## 40 0.5262 nan 0.1000 0.0005
## 60 0.4936 nan 0.1000 -0.0010
## 80 0.4738 nan 0.1000 -0.0000
## 100 0.4599 nan 0.1000 -0.0003
## 120 0.4478 nan 0.1000 -0.0010
## 140 0.4377 nan 0.1000 -0.0035
## 150 0.4333 nan 0.1000 -0.0017
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3040
## 2 1.2006 nan 0.1000 0.2122
## 3 1.0683 nan 0.1000 0.1528
## 4 0.9733 nan 0.1000 0.1216
## 5 0.8979 nan 0.1000 0.0848
## 6 0.8397 nan 0.1000 0.0777
## 7 0.7916 nan 0.1000 0.0602
## 8 0.7518 nan 0.1000 0.0485
## 9 0.7186 nan 0.1000 0.0427
## 10 0.6896 nan 0.1000 0.0332
## 20 0.5473 nan 0.1000 0.0053
## 40 0.4655 nan 0.1000 -0.0008
## 60 0.4330 nan 0.1000 -0.0018
## 80 0.4088 nan 0.1000 -0.0013
## 100 0.3895 nan 0.1000 -0.0031
## 120 0.3752 nan 0.1000 -0.0010
## 140 0.3621 nan 0.1000 -0.0028
## 150 0.3568 nan 0.1000 -0.0015
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3221
## 2 1.1960 nan 0.1000 0.2193
## 3 1.0627 nan 0.1000 0.1643
## 4 0.9583 nan 0.1000 0.1169
## 5 0.8802 nan 0.1000 0.0927
## 6 0.8204 nan 0.1000 0.0773
## 7 0.7695 nan 0.1000 0.0660
## 8 0.7269 nan 0.1000 0.0528
## 9 0.6923 nan 0.1000 0.0441
## 10 0.6632 nan 0.1000 0.0355
## 20 0.5147 nan 0.1000 0.0052
## 40 0.4317 nan 0.1000 -0.0012
## 60 0.3928 nan 0.1000 -0.0026
## 80 0.3613 nan 0.1000 -0.0028
## 100 0.3374 nan 0.1000 -0.0017
## 120 0.3189 nan 0.1000 -0.0030
## 140 0.3037 nan 0.1000 -0.0011
## 150 0.2960 nan 0.1000 -0.0023
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2760
## 2 1.2344 nan 0.1000 0.2158
## 3 1.1038 nan 0.1000 0.1570
## 4 1.0096 nan 0.1000 0.1164
## 5 0.9345 nan 0.1000 0.0917
## 6 0.8761 nan 0.1000 0.0680
## 7 0.8299 nan 0.1000 0.0570
## 8 0.7932 nan 0.1000 0.0461
## 9 0.7612 nan 0.1000 0.0353
## 10 0.7366 nan 0.1000 0.0328
## 20 0.6041 nan 0.1000 0.0095
## 40 0.5268 nan 0.1000 -0.0019
## 60 0.4959 nan 0.1000 -0.0006
## 80 0.4774 nan 0.1000 -0.0022
## 100 0.4640 nan 0.1000 -0.0009
## 120 0.4533 nan 0.1000 -0.0027
## 140 0.4433 nan 0.1000 -0.0016
## 150 0.4395 nan 0.1000 -0.0016
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3078
## 2 1.2013 nan 0.1000 0.2127
## 3 1.0718 nan 0.1000 0.1600
## 4 0.9755 nan 0.1000 0.1233
## 5 0.8989 nan 0.1000 0.0895
## 6 0.8405 nan 0.1000 0.0700
## 7 0.7928 nan 0.1000 0.0581
## 8 0.7527 nan 0.1000 0.0500
## 9 0.7188 nan 0.1000 0.0393
## 10 0.6907 nan 0.1000 0.0333
## 20 0.5482 nan 0.1000 0.0042
## 40 0.4698 nan 0.1000 -0.0003
## 60 0.4354 nan 0.1000 -0.0011
## 80 0.4119 nan 0.1000 -0.0012
## 100 0.3937 nan 0.1000 -0.0021
## 120 0.3764 nan 0.1000 -0.0022
## 140 0.3629 nan 0.1000 -0.0006
## 150 0.3567 nan 0.1000 -0.0003
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3190
## 2 1.1948 nan 0.1000 0.2226
## 3 1.0576 nan 0.1000 0.1647
## 4 0.9574 nan 0.1000 0.1244
## 5 0.8792 nan 0.1000 0.0972
## 6 0.8176 nan 0.1000 0.0740
## 7 0.7680 nan 0.1000 0.0648
## 8 0.7256 nan 0.1000 0.0477
## 9 0.6906 nan 0.1000 0.0399
## 10 0.6627 nan 0.1000 0.0344
## 20 0.5199 nan 0.1000 0.0083
## 40 0.4323 nan 0.1000 -0.0022
## 60 0.3936 nan 0.1000 -0.0015
## 80 0.3674 nan 0.1000 -0.0024
## 100 0.3441 nan 0.1000 -0.0017
## 120 0.3264 nan 0.1000 -0.0024
## 140 0.3076 nan 0.1000 -0.0027
## 150 0.2987 nan 0.1000 -0.0019
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3012
## 2 1.2129 nan 0.1000 0.2070
## 3 1.0856 nan 0.1000 0.1506
## 4 0.9933 nan 0.1000 0.1090
## 5 0.9233 nan 0.1000 0.0884
## 6 0.8677 nan 0.1000 0.0662
## 7 0.8257 nan 0.1000 0.0556
## 8 0.7901 nan 0.1000 0.0468
## 9 0.7597 nan 0.1000 0.0352
## 10 0.7352 nan 0.1000 0.0346
## 20 0.6038 nan 0.1000 0.0110
## 40 0.5255 nan 0.1000 0.0015
## 60 0.4948 nan 0.1000 -0.0005
## 80 0.4748 nan 0.1000 -0.0013
## 100 0.4604 nan 0.1000 -0.0017
## 120 0.4492 nan 0.1000 -0.0007
## 140 0.4383 nan 0.1000 -0.0020
## 150 0.4336 nan 0.1000 -0.0018
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3094
## 2 1.2024 nan 0.1000 0.2118
## 3 1.0731 nan 0.1000 0.1535
## 4 0.9764 nan 0.1000 0.1241
## 5 0.8984 nan 0.1000 0.0887
## 6 0.8399 nan 0.1000 0.0768
## 7 0.7919 nan 0.1000 0.0610
## 8 0.7535 nan 0.1000 0.0499
## 9 0.7183 nan 0.1000 0.0405
## 10 0.6881 nan 0.1000 0.0343
## 20 0.5469 nan 0.1000 0.0068
## 40 0.4682 nan 0.1000 -0.0014
## 60 0.4344 nan 0.1000 -0.0026
## 80 0.4142 nan 0.1000 -0.0015
## 100 0.3928 nan 0.1000 -0.0007
## 120 0.3779 nan 0.1000 -0.0015
## 140 0.3622 nan 0.1000 -0.0009
## 150 0.3565 nan 0.1000 -0.0012
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3091
## 2 1.1957 nan 0.1000 0.2238
## 3 1.0610 nan 0.1000 0.1622
## 4 0.9600 nan 0.1000 0.1262
## 5 0.8799 nan 0.1000 0.0954
## 6 0.8175 nan 0.1000 0.0821
## 7 0.7662 nan 0.1000 0.0609
## 8 0.7245 nan 0.1000 0.0498
## 9 0.6899 nan 0.1000 0.0409
## 10 0.6608 nan 0.1000 0.0328
## 20 0.5156 nan 0.1000 0.0068
## 40 0.4298 nan 0.1000 0.0006
## 60 0.3922 nan 0.1000 -0.0027
## 80 0.3639 nan 0.1000 -0.0035
## 100 0.3430 nan 0.1000 -0.0041
## 120 0.3222 nan 0.1000 -0.0013
## 140 0.3047 nan 0.1000 -0.0020
## 150 0.2968 nan 0.1000 -0.0025
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.2943
## 2 1.2063 nan 0.1000 0.2038
## 3 1.0845 nan 0.1000 0.1496
## 4 0.9936 nan 0.1000 0.1142
## 5 0.9236 nan 0.1000 0.0878
## 6 0.8665 nan 0.1000 0.0690
## 7 0.8233 nan 0.1000 0.0548
## 8 0.7877 nan 0.1000 0.0449
## 9 0.7573 nan 0.1000 0.0376
## 10 0.7314 nan 0.1000 0.0320
## 20 0.6004 nan 0.1000 0.0092
## 40 0.5233 nan 0.1000 0.0011
## 60 0.4910 nan 0.1000 -0.0001
## 80 0.4706 nan 0.1000 -0.0002
## 100 0.4573 nan 0.1000 -0.0008
## 120 0.4450 nan 0.1000 -0.0004
## 140 0.4364 nan 0.1000 -0.0016
## 150 0.4326 nan 0.1000 -0.0007
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3051
## 2 1.2019 nan 0.1000 0.2098
## 3 1.0714 nan 0.1000 0.1560
## 4 0.9732 nan 0.1000 0.1200
## 5 0.8973 nan 0.1000 0.0928
## 6 0.8388 nan 0.1000 0.0758
## 7 0.7906 nan 0.1000 0.0593
## 8 0.7507 nan 0.1000 0.0454
## 9 0.7181 nan 0.1000 0.0397
## 10 0.6902 nan 0.1000 0.0401
## 20 0.5455 nan 0.1000 0.0074
## 40 0.4682 nan 0.1000 0.0002
## 60 0.4334 nan 0.1000 -0.0008
## 80 0.4091 nan 0.1000 -0.0004
## 100 0.3912 nan 0.1000 0.0001
## 120 0.3747 nan 0.1000 -0.0020
## 140 0.3611 nan 0.1000 -0.0012
## 150 0.3527 nan 0.1000 -0.0008
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3200
## 2 1.1926 nan 0.1000 0.2170
## 3 1.0599 nan 0.1000 0.1607
## 4 0.9613 nan 0.1000 0.1234
## 5 0.8825 nan 0.1000 0.0981
## 6 0.8196 nan 0.1000 0.0745
## 7 0.7686 nan 0.1000 0.0612
## 8 0.7265 nan 0.1000 0.0508
## 9 0.6912 nan 0.1000 0.0416
## 10 0.6610 nan 0.1000 0.0350
## 20 0.5142 nan 0.1000 0.0043
## 40 0.4287 nan 0.1000 -0.0012
## 60 0.3911 nan 0.1000 -0.0032
## 80 0.3651 nan 0.1000 -0.0019
## 100 0.3409 nan 0.1000 -0.0016
## 120 0.3205 nan 0.1000 -0.0032
## 140 0.3032 nan 0.1000 -0.0020
## 150 0.2953 nan 0.1000 -0.0019
##
## Iter TrainDeviance ValidDeviance StepSize Improve
## 1 1.3863 nan 0.1000 0.3157
## 2 1.1940 nan 0.1000 0.2187
## 3 1.0609 nan 0.1000 0.1628
## 4 0.9611 nan 0.1000 0.1265
## 5 0.8826 nan 0.1000 0.0990
## 6 0.8197 nan 0.1000 0.0732
## 7 0.7698 nan 0.1000 0.0581
## 8 0.7289 nan 0.1000 0.0535
## 9 0.6937 nan 0.1000 0.0437
## 10 0.6645 nan 0.1000 0.0384
## 20 0.5183 nan 0.1000 0.0045
## 40 0.4357 nan 0.1000 -0.0013
## 60 0.3942 nan 0.1000 -0.0032
## 80 0.3689 nan 0.1000 -0.0013
## 100 0.3466 nan 0.1000 -0.0025
## 120 0.3300 nan 0.1000 -0.0017
## 140 0.3144 nan 0.1000 -0.0007
## 150 0.3072 nan 0.1000 -0.0011
# Tuning profile of the boosted model across the candidate settings.
plot(gbm.model)

# Predicted classes for the hold-out rows.
pred.gbm <- predict(gbm.model, newdata = test.data)

# Side-by-side table of actual vs. predicted class, kept for inspection.
# Only a preview is printed; the full table is one line per test row.
result <- data.frame(test.data$Rating_Category, pred.gbm)
print(head(result, 10))

# Summarise hold-out performance the same way as the random forest above, so
# the two models can be compared on accuracy, kappa and per-class statistics.
confusionMatrix(pred.gbm, test.data$Rating_Category)
## test.data.Rating_Category pred.gbm
## 1 HIGH HIGH
## 2 MEDIUM HIGH
## 3 MEDIUM MEDIUM
## 4 MEDIUM MEDIUM
## 5 MEDIUM MEDIUM
## 6 HIGH HIGH
## 7 HIGH HIGH
## 8 MEDIUM HIGH
## 9 MEDIUM MEDIUM
## 10 HIGH HIGH
## 11 HIGH HIGH
## 12 MEDIUM MEDIUM
## 13 MEDIUM MEDIUM
## 14 HIGH MEDIUM
## 15 MEDIUM MEDIUM
## 16 MEDIUM MEDIUM
## 17 MEDIUM MEDIUM
## 18 HIGH HIGH
## 19 MEDIUM MEDIUM
## 20 HIGH HIGH
## 21 HIGH HIGH
## 22 MEDIUM MEDIUM
## 23 MEDIUM MEDIUM
## 24 MEDIUM HIGH
## 25 MEDIUM MEDIUM
## 26 MEDIUM MEDIUM
## 27 HIGH HIGH
## 28 MEDIUM HIGH
## 29 HIGH HIGH
## 30 MEDIUM MEDIUM
## 31 MEDIUM MEDIUM
## 32 MEDIUM MEDIUM
## 33 MEDIUM MEDIUM
## 34 HIGH MEDIUM
## 35 MEDIUM MEDIUM
## 36 MEDIUM MEDIUM
## 37 MEDIUM MEDIUM
## 38 MEDIUM HIGH
## 39 HIGH HIGH
## 40 MEDIUM MEDIUM
## 41 MEDIUM MEDIUM
## 42 MEDIUM MEDIUM
## 43 MEDIUM MEDIUM
## 44 MEDIUM MEDIUM
## 45 HIGH HIGH
## 46 HIGH MEDIUM
## 47 MEDIUM MEDIUM
## 48 MEDIUM MEDIUM
## 49 HIGH MEDIUM
## 50 HIGH HIGH
## 51 HIGH HIGH
## 52 MEDIUM MEDIUM
## 53 HIGH HIGH
## 54 HIGH MEDIUM
## 55 MEDIUM MEDIUM
## 56 MEDIUM MEDIUM
## 57 HIGH HIGH
## 58 MEDIUM MEDIUM
## 59 MEDIUM MEDIUM
## 60 MEDIUM MEDIUM
## 61 MEDIUM MEDIUM
## 62 HIGH HIGH
## 63 MEDIUM MEDIUM
## 64 MEDIUM MEDIUM
## 65 MEDIUM MEDIUM
## 66 MEDIUM MEDIUM
## 67 HIGH HIGH
## 68 HIGH HIGH
## 69 HIGH HIGH
## 70 MEDIUM MEDIUM
## 71 MEDIUM MEDIUM
## 72 LOW MEDIUM
## 73 MEDIUM MEDIUM
## 74 HIGH HIGH
## 75 MEDIUM MEDIUM
## 76 HIGH HIGH
## 77 HIGH HIGH
## 78 HIGH HIGH
## 79 MEDIUM MEDIUM
## 80 HIGH HIGH
## 81 MEDIUM MEDIUM
## 82 MEDIUM MEDIUM
## 83 MEDIUM HIGH
## 84 MEDIUM MEDIUM
## 85 MEDIUM HIGH
## 86 HIGH HIGH
## 87 MEDIUM MEDIUM
## 88 MEDIUM MEDIUM
## 89 MEDIUM MEDIUM
## 90 MEDIUM MEDIUM
## 91 HIGH MEDIUM
## 92 MEDIUM MEDIUM
## 93 MEDIUM HIGH
## 94 MEDIUM MEDIUM
## 95 MEDIUM MEDIUM
## 96 HIGH HIGH
## 97 MEDIUM MEDIUM
## 98 MEDIUM MEDIUM
## 99 MEDIUM MEDIUM
## 100 MEDIUM MEDIUM
## 101 MEDIUM MEDIUM
## 102 MEDIUM MEDIUM
## 103 MEDIUM MEDIUM
## 104 MEDIUM MEDIUM
## 105 MEDIUM MEDIUM
## 106 MEDIUM MEDIUM
## 107 MEDIUM MEDIUM
## 108 MEDIUM MEDIUM
## 109 MEDIUM MEDIUM
## 110 MEDIUM MEDIUM
## 111 MEDIUM MEDIUM
## 112 MEDIUM MEDIUM
## 113 MEDIUM MEDIUM
## 114 HIGH HIGH
## 115 MEDIUM MEDIUM
## 116 MEDIUM LOW
## 117 MEDIUM HIGH
## 118 MEDIUM MEDIUM
## 119 MEDIUM MEDIUM
## 120 MEDIUM MEDIUM
## 121 MEDIUM MEDIUM
## 122 MEDIUM MEDIUM
## 123 MEDIUM MEDIUM
## 124 MEDIUM MEDIUM
## 125 MEDIUM MEDIUM
## 126 MEDIUM MEDIUM
## 127 HIGH MEDIUM
## 128 MEDIUM MEDIUM
## 129 HIGH HIGH
## 130 HIGH MEDIUM
## 131 MEDIUM MEDIUM
## 132 MEDIUM MEDIUM
## 133 MEDIUM MEDIUM
## 134 MEDIUM MEDIUM
## 135 MEDIUM MEDIUM
## 136 HIGH MEDIUM
## 137 MEDIUM HIGH
## 138 MEDIUM MEDIUM
## 139 HIGH HIGH
## 140 MEDIUM MEDIUM
## 141 MEDIUM MEDIUM
## 142 MEDIUM MEDIUM
## 143 LOW MEDIUM
## 144 MEDIUM MEDIUM
## 145 HIGH HIGH
## 146 HIGH HIGH
## 147 MEDIUM MEDIUM
## 148 MEDIUM MEDIUM
## 149 MEDIUM MEDIUM
## 150 MEDIUM MEDIUM
## 151 MEDIUM MEDIUM
## 152 MEDIUM MEDIUM
## 153 MEDIUM MEDIUM
## 154 MEDIUM MEDIUM
## 155 MEDIUM MEDIUM
## 156 HIGH HIGH
## 157 MEDIUM MEDIUM
## 158 MEDIUM MEDIUM
## 159 HIGH HIGH
## 160 MEDIUM MEDIUM
## 161 MEDIUM MEDIUM
## 162 MEDIUM MEDIUM
## 163 MEDIUM MEDIUM
## 164 HIGH HIGH
## 165 MEDIUM MEDIUM
## 166 HIGH HIGH
## 167 MEDIUM MEDIUM
## 168 MEDIUM MEDIUM
## 169 MEDIUM MEDIUM
## 170 MEDIUM MEDIUM
## 171 MEDIUM MEDIUM
## 172 MEDIUM MEDIUM
## 173 MEDIUM MEDIUM
## 174 HIGH HIGH
## 175 HIGH MEDIUM
## 176 HIGH MEDIUM
## 177 HIGH HIGH
## 178 MEDIUM MEDIUM
## 179 MEDIUM MEDIUM
## 180 MEDIUM MEDIUM
## 181 MEDIUM MEDIUM
## 182 MEDIUM MEDIUM
## 183 MEDIUM MEDIUM
## 184 MEDIUM HIGH
## 185 MEDIUM MEDIUM
## 186 MEDIUM MEDIUM
## 187 HIGH MEDIUM
## 188 MEDIUM MEDIUM
## 189 MEDIUM MEDIUM
## 190 MEDIUM MEDIUM
## 191 MEDIUM MEDIUM
## 192 HIGH MEDIUM
## 193 MEDIUM MEDIUM
## 194 MEDIUM MEDIUM
## 195 MEDIUM MEDIUM
## 196 MEDIUM MEDIUM
## 197 MEDIUM MEDIUM
## 198 HIGH MEDIUM
## 199 MEDIUM MEDIUM
## 200 MEDIUM MEDIUM
## 201 MEDIUM MEDIUM
## 202 MEDIUM MEDIUM
## 203 MEDIUM MEDIUM
## 204 HIGH HIGH
## 205 HIGH HIGH
## 206 MEDIUM MEDIUM
## 207 MEDIUM MEDIUM
## 208 MEDIUM MEDIUM
## 209 MEDIUM MEDIUM
## 210 MEDIUM MEDIUM
## 211 MEDIUM MEDIUM
## 212 HIGH HIGH
## 213 HIGH MEDIUM
## 214 MEDIUM MEDIUM
## 215 MEDIUM MEDIUM
## 216 MEDIUM MEDIUM
## 217 MEDIUM MEDIUM
## 218 MEDIUM MEDIUM
## 219 MEDIUM MEDIUM
## 220 MEDIUM MEDIUM
## 221 MEDIUM MEDIUM
## 222 MEDIUM HIGH
## 223 MEDIUM MEDIUM
## 224 MEDIUM MEDIUM
## 225 HIGH HIGH
## 226 HIGH HIGH
## 227 HIGH MEDIUM
## 228 MEDIUM MEDIUM
## 229 MEDIUM MEDIUM
## 230 HIGH MEDIUM
## 231 MEDIUM MEDIUM
## 232 MEDIUM MEDIUM
## 233 HIGH MEDIUM
## 234 MEDIUM MEDIUM
## 235 MEDIUM MEDIUM
## 236 MEDIUM MEDIUM
## 237 MEDIUM MEDIUM
## 238 MEDIUM MEDIUM
## 239 MEDIUM MEDIUM
## 240 MEDIUM MEDIUM
## 241 MEDIUM MEDIUM
## 242 MEDIUM MEDIUM
## 243 MEDIUM MEDIUM
## 244 MEDIUM MEDIUM
## 245 MEDIUM MEDIUM
## 246 MEDIUM MEDIUM
## 247 MEDIUM MEDIUM
## 248 HIGH HIGH
## 249 MEDIUM MEDIUM
## 250 MEDIUM MEDIUM
## 251 MEDIUM MEDIUM
## 252 MEDIUM MEDIUM
## 253 HIGH MEDIUM
## 254 MEDIUM MEDIUM
## 255 HIGH MEDIUM
## 256 HIGH HIGH
## 257 MEDIUM MEDIUM
## 258 HIGH MEDIUM
## 259 MEDIUM MEDIUM
## 260 HIGH HIGH
## 261 MEDIUM MEDIUM
## 262 MEDIUM MEDIUM
## 263 MEDIUM MEDIUM
## 264 MEDIUM MEDIUM
## 265 HIGH HIGH
## 266 HIGH MEDIUM
## 267 MEDIUM HIGH
## 268 HIGH MEDIUM
## 269 HIGH HIGH
## 270 MEDIUM MEDIUM
## 271 MEDIUM MEDIUM
## 272 HIGH HIGH
## 273 MEDIUM MEDIUM
## 274 MEDIUM MEDIUM
## 275 MEDIUM HIGH
## 276 LOW MEDIUM
## 277 MEDIUM MEDIUM
## 278 MEDIUM MEDIUM
## 279 MEDIUM MEDIUM
## 280 HIGH MEDIUM
## 281 MEDIUM MEDIUM
## 282 LOW MEDIUM
## 283 MEDIUM MEDIUM
## 284 MEDIUM MEDIUM
## 285 MEDIUM MEDIUM
## 286 MEDIUM MEDIUM
## 287 MEDIUM HIGH
## 288 MEDIUM HIGH
## 289 MEDIUM MEDIUM
## 290 MEDIUM MEDIUM
## 291 MEDIUM MEDIUM
## 292 MEDIUM MEDIUM
## 293 MEDIUM MEDIUM
## 294 HIGH MEDIUM
## 295 HIGH MEDIUM
## 296 HIGH HIGH
## 297 HIGH HIGH
## 298 HIGH HIGH
## 299 HIGH HIGH
## 300 MEDIUM MEDIUM
## 301 HIGH HIGH
## 302 HIGH MEDIUM
## 303 MEDIUM MEDIUM
## 304 HIGH HIGH
## 305 HIGH HIGH
## 306 MEDIUM MEDIUM
## 307 MEDIUM MEDIUM
## 308 MEDIUM MEDIUM
## 309 MEDIUM MEDIUM
## 310 MEDIUM MEDIUM
## 311 HIGH MEDIUM
## 312 HIGH HIGH
## 313 MEDIUM MEDIUM
## 314 HIGH MEDIUM
## 315 MEDIUM MEDIUM
## 316 HIGH MEDIUM
## 317 MEDIUM MEDIUM
## 318 MEDIUM MEDIUM
## 319 MEDIUM MEDIUM
## 320 MEDIUM MEDIUM
## 321 MEDIUM MEDIUM
## 322 MEDIUM MEDIUM
## 323 MEDIUM MEDIUM
## 324 HIGH MEDIUM
## 325 MEDIUM MEDIUM
## 326 HIGH HIGH
## 327 MEDIUM MEDIUM
## 328 HIGH MEDIUM
## 329 MEDIUM MEDIUM
## 330 HIGH MEDIUM
## 331 HIGH HIGH
## 332 MEDIUM MEDIUM
## 333 MEDIUM MEDIUM
## 334 HIGH MEDIUM
## 335 MEDIUM MEDIUM
## 336 MEDIUM MEDIUM
## 337 MEDIUM MEDIUM
## 338 HIGH HIGH
## 339 HIGH MEDIUM
## 340 HIGH MEDIUM
## 341 MEDIUM MEDIUM
## 342 MEDIUM MEDIUM
## 343 HIGH HIGH
## 344 MEDIUM MEDIUM
## 345 MEDIUM MEDIUM
## 346 HIGH HIGH
## 347 MEDIUM MEDIUM
## 348 MEDIUM MEDIUM
## 349 MEDIUM MEDIUM
## 350 HIGH HIGH
## 351 MEDIUM MEDIUM
## 352 MEDIUM MEDIUM
## 353 MEDIUM MEDIUM
## 354 MEDIUM MEDIUM
## 355 MEDIUM MEDIUM
## 356 MEDIUM MEDIUM
## 357 HIGH MEDIUM
## 358 MEDIUM MEDIUM
## 359 MEDIUM MEDIUM
## 360 HIGH MEDIUM
## 361 MEDIUM MEDIUM
## 362 MEDIUM MEDIUM
## 363 MEDIUM MEDIUM
## 364 MEDIUM MEDIUM
## 365 MEDIUM MEDIUM
## 366 MEDIUM MEDIUM
## 367 HIGH MEDIUM
## 368 MEDIUM HIGH
## 369 MEDIUM MEDIUM
## 370 MEDIUM MEDIUM
## 371 MEDIUM MEDIUM
## 372 MEDIUM MEDIUM
## 373 MEDIUM MEDIUM
## 374 HIGH HIGH
## 375 HIGH HIGH
## 376 HIGH MEDIUM
## 377 MEDIUM MEDIUM
## 378 HIGH MEDIUM
## 379 MEDIUM HIGH
## 380 MEDIUM MEDIUM
## 381 HIGH MEDIUM
## 382 MEDIUM MEDIUM
## 383 MEDIUM MEDIUM
## 384 MEDIUM MEDIUM
## 385 MEDIUM MEDIUM
## 386 MEDIUM MEDIUM
## 387 HIGH HIGH
## 388 HIGH MEDIUM
## 389 MEDIUM HIGH
## 390 HIGH MEDIUM
## 391 HIGH HIGH
## 392 MEDIUM HIGH
## 393 HIGH MEDIUM
## 394 HIGH HIGH
## 395 MEDIUM MEDIUM
## 396 MEDIUM MEDIUM
## 397 HIGH MEDIUM
## 398 MEDIUM MEDIUM
## 399 MEDIUM MEDIUM
## 400 MEDIUM MEDIUM
## 401 LOW MEDIUM
## 402 HIGH MEDIUM
## 403 MEDIUM MEDIUM
## 404 MEDIUM MEDIUM
## 405 MEDIUM MEDIUM
## 406 LOW MEDIUM
## 407 HIGH MEDIUM
## 408 HIGH MEDIUM
## 409 MEDIUM MEDIUM
## 410 HIGH HIGH
## 411 MEDIUM MEDIUM
## 412 MEDIUM MEDIUM
## 413 MEDIUM MEDIUM
## 414 MEDIUM LOW
## 415 MEDIUM MEDIUM
## 416 HIGH MEDIUM
## 417 MEDIUM LOW
## 418 MEDIUM MEDIUM
## 419 MEDIUM MEDIUM
## 420 MEDIUM MEDIUM
## 421 MEDIUM MEDIUM
## 422 MEDIUM LOW
## 423 HIGH MEDIUM
## 424 MEDIUM MEDIUM
## 425 MEDIUM MEDIUM
## 426 HIGH MEDIUM
## 427 LOW LOW
## 428 MEDIUM MEDIUM
## 429 HIGH HIGH
## 430 MEDIUM MEDIUM
## 431 HIGH HIGH
## 432 MEDIUM MEDIUM
## 433 MEDIUM MEDIUM
## 434 MEDIUM MEDIUM
## 435 HIGH HIGH
## 436 MEDIUM MEDIUM
## 437 HIGH HIGH
## 438 MEDIUM MEDIUM
## 439 MEDIUM MEDIUM
## 440 HIGH HIGH
## 441 MEDIUM HIGH
## 442 MEDIUM MEDIUM
## 443 MEDIUM MEDIUM
## 444 MEDIUM MEDIUM
## 445 MEDIUM MEDIUM
## 446 HIGH MEDIUM
## 447 MEDIUM MEDIUM
## 448 MEDIUM MEDIUM
## 449 HIGH HIGH
## 450 LOW MEDIUM
## 451 HIGH HIGH
## 452 MEDIUM MEDIUM
## 453 MEDIUM MEDIUM
## 454 HIGH HIGH
## 455 MEDIUM MEDIUM
## 456 MEDIUM MEDIUM
## 457 LOW MEDIUM
## 458 MEDIUM MEDIUM
## 459 MEDIUM HIGH
## 460 HIGH HIGH
## 461 HIGH HIGH
## 462 MEDIUM MEDIUM
## 463 MEDIUM MEDIUM
## 464 MEDIUM MEDIUM
## 465 MEDIUM MEDIUM
## 466 HIGH MEDIUM
## 467 MEDIUM HIGH
## 468 HIGH HIGH
## 469 HIGH HIGH
## 470 HIGH MEDIUM
## 471 MEDIUM MEDIUM
## 472 MEDIUM HIGH
## 473 MEDIUM HIGH
## 474 MEDIUM MEDIUM
## 475 MEDIUM MEDIUM
## 476 HIGH HIGH
## 477 MEDIUM HIGH
## 478 MEDIUM MEDIUM
## 479 MEDIUM MEDIUM
## 480 MEDIUM MEDIUM
## 481 MEDIUM MEDIUM
## 482 MEDIUM MEDIUM
## 483 MEDIUM MEDIUM
## 484 MEDIUM MEDIUM
## 485 LOW MEDIUM
## 486 MEDIUM MEDIUM
## 487 MEDIUM MEDIUM
## 488 HIGH HIGH
## 489 MEDIUM MEDIUM
## 490 HIGH HIGH
## 491 MEDIUM MEDIUM
## 492 HIGH MEDIUM
## 493 HIGH MEDIUM
## 494 MEDIUM MEDIUM
## 495 MEDIUM MEDIUM
## 496 MEDIUM HIGH
## 497 HIGH MEDIUM
## 498 MEDIUM MEDIUM
## 499 MEDIUM MEDIUM
## 500 HIGH HIGH
## 501 HIGH MEDIUM
## 502 MEDIUM MEDIUM
## 503 MEDIUM MEDIUM
## 504 MEDIUM MEDIUM
## 505 MEDIUM MEDIUM
## 506 MEDIUM MEDIUM
## 507 MEDIUM MEDIUM
## 508 HIGH HIGH
## 509 HIGH HIGH
## 510 MEDIUM MEDIUM
## 511 MEDIUM MEDIUM
## 512 LOW MEDIUM
## 513 MEDIUM MEDIUM
## 514 MEDIUM MEDIUM
## 515 HIGH HIGH
## 516 LOW MEDIUM
## 517 HIGH HIGH
## 518 MEDIUM MEDIUM
## 519 MEDIUM MEDIUM
## 520 HIGH HIGH
## 521 MEDIUM MEDIUM
## 522 HIGH HIGH
## 523 MEDIUM MEDIUM
## 524 MEDIUM MEDIUM
## 525 MEDIUM MEDIUM
## 526 MEDIUM HIGH
## 527 MEDIUM MEDIUM
## 528 MEDIUM MEDIUM
## 529 HIGH MEDIUM
## 530 MEDIUM MEDIUM
## 531 MEDIUM MEDIUM
## 532 MEDIUM MEDIUM
## 533 HIGH HIGH
## 534 MEDIUM HIGH
## 535 MEDIUM MEDIUM
## 536 MEDIUM MEDIUM
## 537 MEDIUM MEDIUM
## 538 HIGH HIGH
## 539 MEDIUM MEDIUM
## 540 MEDIUM MEDIUM
## 541 HIGH HIGH
## 542 HIGH HIGH
## 543 MEDIUM MEDIUM
## 544 HIGH HIGH
## 545 HIGH HIGH
## 546 HIGH MEDIUM
## 547 MEDIUM MEDIUM
## 548 MEDIUM MEDIUM
## 549 MEDIUM HIGH
## 550 MEDIUM MEDIUM
## 551 MEDIUM MEDIUM
## 552 MEDIUM MEDIUM
## 553 MEDIUM MEDIUM
## 554 MEDIUM HIGH
## 555 MEDIUM MEDIUM
## 556 HIGH MEDIUM
## 557 HIGH HIGH
## 558 MEDIUM MEDIUM
## 559 MEDIUM MEDIUM
## 560 HIGH HIGH
## 561 MEDIUM MEDIUM
## 562 MEDIUM MEDIUM
## 563 MEDIUM MEDIUM
## 564 MEDIUM MEDIUM
## 565 HIGH HIGH
## 566 MEDIUM MEDIUM
## 567 MEDIUM MEDIUM
## 568 HIGH MEDIUM
## 569 HIGH MEDIUM
## 570 MEDIUM MEDIUM
## 571 MEDIUM MEDIUM
## 572 MEDIUM MEDIUM
## 573 MEDIUM MEDIUM
## 574 LOW MEDIUM
## 575 MEDIUM HIGH
## 576 LOW MEDIUM
## 577 MEDIUM MEDIUM
## 578 MEDIUM MEDIUM
## 579 MEDIUM HIGH
## 580 MEDIUM HIGH
## 581 HIGH HIGH
## 582 MEDIUM MEDIUM
## 583 MEDIUM MEDIUM
## 584 MEDIUM MEDIUM
## 585 MEDIUM HIGH
## 586 MEDIUM MEDIUM
## 587 MEDIUM MEDIUM
## 588 MEDIUM MEDIUM
## 589 MEDIUM HIGH
## 590 HIGH HIGH
## 591 HIGH HIGH
## 592 HIGH HIGH
## 593 MEDIUM MEDIUM
## 594 MEDIUM MEDIUM
## 595 MEDIUM MEDIUM
## 596 MEDIUM MEDIUM
## 597 HIGH HIGH
## 598 HIGH MEDIUM
## 599 MEDIUM MEDIUM
## 600 MEDIUM MEDIUM
## 601 LOW MEDIUM
## 602 MEDIUM MEDIUM
## 603 LOW MEDIUM
## 604 MEDIUM HIGH
## 605 MEDIUM MEDIUM
## 606 HIGH HIGH
## 607 HIGH HIGH
## 608 MEDIUM MEDIUM
## 609 MEDIUM MEDIUM
## 610 MEDIUM MEDIUM
## 611 MEDIUM MEDIUM
## 612 MEDIUM MEDIUM
## 613 HIGH MEDIUM
## 614 MEDIUM MEDIUM
## 615 HIGH HIGH
## 616 HIGH HIGH
## 617 MEDIUM MEDIUM
## 618 HIGH MEDIUM
## 619 HIGH HIGH
## 620 HIGH HIGH
## 621 HIGH HIGH
## 622 HIGH HIGH
## 623 MEDIUM MEDIUM
## 624 HIGH HIGH
## 625 MEDIUM MEDIUM
## 626 MEDIUM MEDIUM
## 627 HIGH MEDIUM
## 628 MEDIUM MEDIUM
## 629 HIGH MEDIUM
## 630 LOW MEDIUM
## 631 MEDIUM MEDIUM
## 632 MEDIUM HIGH
## 633 MEDIUM MEDIUM
## 634 MEDIUM MEDIUM
## 635 HIGH MEDIUM
## 636 HIGH HIGH
## 637 MEDIUM MEDIUM
## 638 HIGH MEDIUM
## 639 HIGH HIGH
## 640 MEDIUM MEDIUM
## 641 HIGH HIGH
## 642 MEDIUM MEDIUM
## 643 MEDIUM MEDIUM
## 644 MEDIUM MEDIUM
## 645 MEDIUM MEDIUM
## 646 MEDIUM MEDIUM
## 647 MEDIUM MEDIUM
## 648 HIGH HIGH
## 649 MEDIUM MEDIUM
## 650 MEDIUM MEDIUM
## 651 MEDIUM MEDIUM
## 652 LOW MEDIUM
## 653 MEDIUM MEDIUM
## 654 MEDIUM HIGH
## 655 MEDIUM MEDIUM
## 656 HIGH HIGH
## 657 MEDIUM MEDIUM
## 658 HIGH HIGH
## 659 HIGH HIGH
## 660 MEDIUM MEDIUM
## 661 MEDIUM HIGH
## 662 MEDIUM MEDIUM
## 663 MEDIUM MEDIUM
## 664 MEDIUM MEDIUM
## 665 MEDIUM HIGH
## 666 MEDIUM HIGH
## 667 MEDIUM MEDIUM
## 668 HIGH MEDIUM
## 669 HIGH HIGH
## 670 HIGH MEDIUM
## 671 HIGH HIGH
## 672 MEDIUM HIGH
## 673 HIGH HIGH
## 674 MEDIUM MEDIUM
## 675 HIGH HIGH
## 676 MEDIUM MEDIUM
## 677 MEDIUM MEDIUM
## 678 HIGH MEDIUM
## 679 MEDIUM MEDIUM
## 680 MEDIUM MEDIUM
## 681 MEDIUM MEDIUM
## 682 HIGH HIGH
## 683 HIGH MEDIUM
## 684 MEDIUM MEDIUM
## 685 HIGH HIGH
## 686 HIGH HIGH
## 687 MEDIUM MEDIUM
## 688 MEDIUM MEDIUM
## 689 HIGH HIGH
## 690 HIGH HIGH
## 691 HIGH HIGH
## 692 HIGH HIGH
## 693 HIGH HIGH
## 694 MEDIUM HIGH
## 695 MEDIUM HIGH
## 696 LOW MEDIUM
## 697 MEDIUM MEDIUM
## 698 MEDIUM MEDIUM
## 699 HIGH MEDIUM
## 700 HIGH MEDIUM
## 701 MEDIUM MEDIUM
## 702 MEDIUM MEDIUM
## 703 MEDIUM MEDIUM
## 704 HIGH HIGH
## 705 HIGH HIGH
## 706 MEDIUM MEDIUM
## 707 MEDIUM HIGH
## 708 MEDIUM MEDIUM
## 709 MEDIUM MEDIUM
## 710 MEDIUM MEDIUM
## 711 MEDIUM MEDIUM
## 712 MEDIUM MEDIUM
## 713 HIGH HIGH
## 714 MEDIUM MEDIUM
## 715 MEDIUM MEDIUM
## 716 HIGH HIGH
## 717 HIGH HIGH
## 718 MEDIUM HIGH
## 719 MEDIUM MEDIUM
## 720 MEDIUM MEDIUM
## 721 MEDIUM HIGH
## 722 HIGH HIGH
## 723 HIGH MEDIUM
## 724 MEDIUM MEDIUM
## 725 MEDIUM HIGH
## 726 MEDIUM LOW
## 727 MEDIUM MEDIUM
## 728 MEDIUM MEDIUM
## 729 HIGH MEDIUM
## 730 MEDIUM MEDIUM
## 731 MEDIUM MEDIUM
## 732 HIGH HIGH
## 733 HIGH HIGH
## 734 HIGH MEDIUM
## 735 MEDIUM MEDIUM
## 736 HIGH HIGH
## 737 MEDIUM MEDIUM
## 738 MEDIUM MEDIUM
## 739 MEDIUM MEDIUM
## 740 MEDIUM MEDIUM
## 741 HIGH MEDIUM
## 742 MEDIUM MEDIUM
## 743 HIGH HIGH
## 744 MEDIUM MEDIUM
## 745 MEDIUM HIGH
## 746 MEDIUM MEDIUM
## 747 HIGH HIGH
## 748 MEDIUM MEDIUM
## 749 MEDIUM MEDIUM
## 750 HIGH HIGH
## 751 HIGH HIGH
## 752 MEDIUM MEDIUM
## 753 MEDIUM MEDIUM
## 754 MEDIUM MEDIUM
## 755 HIGH MEDIUM
## 756 HIGH MEDIUM
## 757 HIGH MEDIUM
## 758 MEDIUM HIGH
## 759 MEDIUM MEDIUM
## 760 MEDIUM MEDIUM
# Evaluate the GBM class predictions against the held-out test labels.
#
# caret::confusionMatrix() takes the predicted classes as `data` and the true
# classes as `reference`. Passing them the other way round transposes the
# table and swaps per-class sensitivity with positive predictive value
# (overall Accuracy and Kappa are symmetric and are not affected).
# The arguments are named here so the orientation is explicit.
#
# The predictions are re-levelled to the label levels so both factors share
# the same level set and order, even when a class is never predicted.
cm <- confusionMatrix(
  data = factor(pred.gbm, levels = levels(test.data$Rating_Category)),
  reference = test.data$Rating_Category
)
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction LOW MEDIUM HIGH EXCELLENT
## LOW 1 18 0 0
## MEDIUM 5 455 51 0
## HIGH 0 84 146 0
## EXCELLENT 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7921
## 95% CI : (0.7615, 0.8204)
## No Information Rate : 0.7329
## P-Value [Acc > NIR] : 9.288e-05
##
## Kappa : 0.5149
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: LOW Class: MEDIUM Class: HIGH Class: EXCELLENT
## Sensitivity 0.166667 0.8169 0.7411 NA
## Specificity 0.976127 0.7241 0.8508 1
## Pos Pred Value 0.052632 0.8904 0.6348 NA
## Neg Pred Value 0.993252 0.5904 0.9038 NA
## Prevalence 0.007895 0.7329 0.2592 0
## Detection Rate 0.001316 0.5987 0.1921 0
## Detection Prevalence 0.025000 0.6724 0.3026 0
## Balanced Accuracy 0.571397 0.7705 0.7960 NA