The raw training data has 27 variables and 4,943 observations; the target to predict is imdb_score.
sum(duplicated(traindata$movie_title)) # count duplicated titles
## [1] 123
traindata = traindata[!duplicated(traindata$movie_title),] # drop the 123 duplicates; 4820 rows left
traindata = traindata[,names(traindata) !="movie_title"]
traindata = traindata[,names(traindata) !="movie_imdb_link"]
traindata = traindata[,names(traindata) !="genres"]
traindata = traindata[,names(traindata) !="plot_keywords"]
####### Remove Names #######
traindata = traindata[,names(traindata) !="director_name"]
traindata = traindata[,names(traindata) !="actor_1_name"]
traindata = traindata[,names(traindata) !="actor_2_name"]
traindata = traindata[,names(traindata) !="actor_3_name"]
table(traindata$language)/nrow(traindata)
##
##      (blank)   Aboriginal       Arabic      Aramaic      Bosnian
## 0.0020746888 0.0004149378 0.0010373444 0.0002074689 0.0002074689
## Cantonese Chinese Czech Danish Dari
## 0.0022821577 0.0006224066 0.0002074689 0.0010373444 0.0004149378
## Dutch Dzongkha English Filipino French
## 0.0008298755 0.0002074689 0.9321576763 0.0002074689 0.0149377593
## German Greek Hebrew Hindi Hungarian
## 0.0039419087 0.0002074689 0.0010373444 0.0058091286 0.0002074689
## Icelandic Indonesian Italian Japanese Kannada
## 0.0004149378 0.0004149378 0.0020746888 0.0035269710 0.0002074689
## Kazakh Korean Mandarin Maya Mongolian
## 0.0002074689 0.0016597510 0.0043568465 0.0002074689 0.0002074689
## None Norwegian Panjabi Persian Polish
## 0.0004149378 0.0008298755 0.0002074689 0.0008298755 0.0006224066
## Portuguese Romanian Russian Slovenian Spanish
## 0.0016597510 0.0004149378 0.0022821577 0.0002074689 0.0082987552
## Swahili Swedish Telugu Thai Urdu
## 0.0002074689 0.0010373444 0.0002074689 0.0006224066 0.0002074689
## Vietnamese Zulu
## 0.0002074689 0.0004149378
table(traindata$country)/nrow(traindata)
##
##      (blank)  Afghanistan    Argentina
## 0.0010373444 0.0002074689 0.0008298755
## Aruba Australia Bahamas
## 0.0002074689 0.0109958506 0.0002074689
## Belgium Brazil Bulgaria
## 0.0006224066 0.0014522822 0.0002074689
## Cambodia Cameroon Canada
## 0.0002074689 0.0002074689 0.0248962656
## Chile China Colombia
## 0.0002074689 0.0053941909 0.0002074689
## Czech Republic Denmark Dominican Republic
## 0.0006224066 0.0022821577 0.0002074689
## Egypt Finland France
## 0.0002074689 0.0002074689 0.0315352697
## Georgia Germany Greece
## 0.0002074689 0.0190871369 0.0004149378
## Hong Kong Hungary Iceland
## 0.0035269710 0.0004149378 0.0006224066
## India Indonesia Iran
## 0.0068464730 0.0002074689 0.0008298755
## Ireland Israel Italy
## 0.0024896266 0.0008298755 0.0045643154
## Japan Kenya Kyrgyzstan
## 0.0045643154 0.0002074689 0.0002074689
## Mexico Netherlands New Line
## 0.0035269710 0.0010373444 0.0002074689
## New Zealand Nigeria Norway
## 0.0024896266 0.0002074689 0.0016597510
## Official site Pakistan Panama
## 0.0002074689 0.0002074689 0.0002074689
## Peru Philippines Poland
## 0.0002074689 0.0002074689 0.0008298755
## Romania Russia Slovakia
## 0.0008298755 0.0022821577 0.0002074689
## Slovenia South Africa South Korea
## 0.0002074689 0.0016597510 0.0029045643
## Soviet Union Spain Sweden
## 0.0002074689 0.0066390041 0.0012448133
## Switzerland Taiwan Thailand
## 0.0006224066 0.0002074689 0.0010373444
## Turkey UK United Arab Emirates
## 0.0002074689 0.0885892116 0.0002074689
## USA West Germany
## 0.7543568465 0.0006224066
par(mfrow=c(1,2))
plot(traindata$language);plot(traindata$country)
The vast majority of the movies are in English (93%) and made in the USA (75%) or the UK (8.9%). Thus, I decided to keep only English-language movies made in the USA for convenience of analysis.
####### USA Only #######
traindata = traindata[traindata$country == "USA",] # keep USA-made movies only
traindata = traindata[,names(traindata) !="country"]
####### English Only #######
traindata = traindata[traindata$language == "English",]
traindata = traindata[,names(traindata) !="language"]
par(mfrow=c(2,3))
hist(traindata$num_critic_for_reviews);hist(log(traindata$num_critic_for_reviews))
hist(traindata$num_user_for_reviews);hist(log(traindata$num_user_for_reviews))
hist(traindata$num_voted_users);hist(log(traindata$num_voted_users))
The three variables above show similar right-skewed distributions, so we can suspect they carry similar information; they may well be correlated.
A = traindata[,c("num_critic_for_reviews","num_user_for_reviews","num_voted_users")]
A = cbind(log(A),traindata$imdb_score);A = na.exclude(A)
cor(A)
## num_critic_for_reviews num_user_for_reviews
## num_critic_for_reviews 1.0000000 0.8218750
## num_user_for_reviews 0.8218750 1.0000000
## num_voted_users 0.8435810 0.9154764
## traindata$imdb_score 0.2964559 0.3455342
## num_voted_users traindata$imdb_score
## num_critic_for_reviews 0.8435810 0.2964559
## num_user_for_reviews 0.9154764 0.3455342
## num_voted_users 1.0000000 0.4060484
## traindata$imdb_score 0.4060484 1.0000000
The correlation matrix shows that the three "num" variables are highly correlated, with Pearson correlations above 0.8. So let's keep only the one with the strongest linear relationship with imdb_score: num_voted_users.
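As a quick visual complement to the correlation matrix, a scatterplot matrix of the same log-scale variables can be drawn with one line of base R:
pairs(A) # pairwise scatterplots of the logged count variables and imdb_score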
sum(is.na(traindata$num_voted_users))
## [1] 0
traindata = traindata[,names(traindata) !="num_critic_for_reviews"] # remove num_critic
traindata = traindata[,names(traindata) !="num_user_for_reviews"]
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
traindata = mutate(traindata,actors_facebook_like = actor_1_facebook_likes + actor_2_facebook_likes + actor_3_facebook_likes)
traindata = traindata[,names(traindata) !="actor_1_facebook_likes"] # each actor like
traindata = traindata[,names(traindata) !="actor_2_facebook_likes"] # each actor like
traindata = traindata[,names(traindata) !="actor_3_facebook_likes"] # each actor like
traindata = traindata[,names(traindata) !="cast_total_facebook_likes"] # cast total
sum(is.na(traindata$actors_facebook_like))
## [1] 13
summary(log(traindata$actors_facebook_like)) # median is about 8; the -Inf entries come from zero total likes
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -Inf 7.249 7.798 -Inf 9.538 13.392 13
traindata$actors_facebook_like[is.na(traindata$actors_facebook_like)] = round(exp(8),0) # impute the 13 NAs with exp(8), about 2981
table(traindata$color)
##
## (blank) Black and White           Color
##      10             142            3458
library(dplyr)
traindata%>%
group_by(color)%>%
summarize(median(imdb_score))
Even though there is an imbalance between the Black and White and Color groups, we keep this variable as it is, since the median score clearly differs by about one point between the two.
# Check NA
length(traindata$color[(traindata$color !="Color") & (traindata$color!="Black and White")])
## [1] 10
There are 10 blank values for this variable, so let's impute them with "Color", which accounts for the vast majority of the data.
traindata$color[(traindata$color !="Color") & (traindata$color!="Black and White")] = "Color"
par(mfrow=c(1,2))
hist(title_year)
plot(imdb_score~title_year)
summary(title_year)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1916 1999 2005 2002 2011 2016 102
The distribution of the release year is extremely left-skewed. Some of the older movies seem to have higher scores, while the scores of recent movies look scattered.
Notice that 75% of the scored movies were released after 1999, so we will transform this variable into a dummy variable: old movie or not.
Let’s impute the missing values with the median before transforming the data.
sum(is.na(traindata$title_year)) # 68 NAs
## [1] 68
traindata$title_year[is.na(traindata$title_year)] = 2005 # impute with the median year
old_year = rep(0,nrow(traindata))
for(i in 1:nrow(traindata)){
if(traindata$title_year[i]<1999) old_year[i] = 1
}
traindata = data.frame(traindata,old_year)
traindata = traindata[,names(traindata) !="title_year"]
These are the standard aspect ratios:

* 1.37 (4:3): the Academy standard film aspect ratio
* 1.85: a common US widescreen cinema standard
* 2.35: a current widescreen cinema standard
Given the background knowledge, let us look into the data.
####### aspect_ratio #######
table(traindata$aspect_ratio)
##
## 1.18 1.2 1.33 1.37 1.44 1.5 1.66 1.75 1.77 1.78 1.85 1.89 2 2.2 2.24
## 1 1 46 85 1 2 37 2 1 93 1464 1 2 10 1
## 2.35 2.39 2.4 2.55 2.76 4 16
## 1597 11 3 2 3 5 31
We may treat the ratios other than the three listed standards as typos or rare formats, since each accounts for only a small share of the data. So let's collapse the data into those three ratios.
traindata$aspect_ratio = factor(traindata$aspect_ratio)
levels(traindata$aspect_ratio) = c(rep("1.37",5),rep("1.85",7),rep("2.35",8),"1.37","1.85") # "4" and "16" at the end are presumably 4:3 and 16:9 entries
sum(is.na(traindata$aspect_ratio)) # 211 NAs
## [1] 211
table(traindata$aspect_ratio)
##
## 1.37 1.85 2.35
## 139 1631 1629
We cannot choose one representative ratio, so let's try random imputation, which keeps the distribution roughly unchanged after imputation.
# random imputation: draw from the observed (non-NA) distribution
aspect.imp = function(n){
pop = na.omit(traindata$aspect_ratio)
sample(pop,n,replace = T)
}
traindata$aspect_ratio[is.na(traindata$aspect_ratio)] = aspect.imp(sum(is.na(traindata$aspect_ratio)))
table(traindata$aspect_ratio)
##
## 1.37 1.85 2.35
## 147 1739 1724
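As a sanity check on the random imputation, the post-imputation proportions should stay close to the pre-imputation ones; a one-line sketch:
round(prop.table(table(traindata$aspect_ratio)),3) # compare with the proportions before imputation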
boxplot(imdb_score~content_rating,data = traindata)
Based on the frequencies of the content ratings, let's combine them into 4 categories: R, PG-13, PG, and others.
table(traindata$content_rating)
##
##   (blank)  Approved         G        GP         M     NC-17 Not Rated
##       148        45        83         4         2         5        47
## Passed PG PG-13 R TV-14 TV-G TV-MA
## 9 529 1141 1497 24 8 11
## TV-PG TV-Y TV-Y7 Unrated X
## 11 0 1 34 11
levels(traindata$content_rating) = c(rep("others",8),"PG","PG-13","R",rep("others",8)) # everything except PG, PG-13, R becomes "others"
table(traindata$content_rating)
##
## others PG PG-13 R
## 443 529 1141 1497
boxplot(imdb_score~content_rating,data = traindata)
par(mfrow=c(1,2))
hist(traindata$duration);hist(log(traindata$duration))
The distribution of duration is right-skewed, and there are many outliers.
bstat = boxplot(duration)
bstat$stats
## [,1]
## [1,] 58
## [2,] 93
## [3,] 103
## [4,] 118
## [5,] 155
## attr(,"class")
## 1
## "integer"
plot(imdb_score~duration,data = traindata)
abline(v=58,col="red",lty =2);abline(v=155,col="red",lty=2)
abline(h = 6,col="blue",lty=2)
Based on the boxplot statistics, outliers are movies with a duration shorter than 58 minutes or longer than 155 minutes. Movies outside these boundaries tend to have ratings above 6. So let's make a dummy variable indicating whether duration is an outlier or not.
duration_factor = rep(0,nrow(traindata))
sum(is.na(traindata$duration))
## [1] 6
traindata$duration[is.na(traindata$duration)] = 103 # median
for(i in 1:nrow(traindata)){
if(traindata$duration[i] > 155 | traindata$duration[i] <58) duration_factor[i] = 1
}
table(duration_factor)
## duration_factor
## 0 1
## 3449 161
traindata = data.frame(traindata,duration_factor)
traindata = traindata[,names(traindata) !="duration"]
hist(gross)
hist(log(gross))
plot(log(gross),imdb_score)
sum(is.na(log(gross)))
## [1] 863
summary(log(gross)) # median of log(gross) is about 17
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.088 15.485 17.053 16.381 17.948 20.450 863
gross seems to have a slight positive relationship with imdb_score, so let's keep the variable as it is. Since gross itself is extremely right-skewed, let's impute the 863 NAs with the exponential of the median of log(gross), which is about 17.
traindata$gross[is.na(traindata$gross)] = round(exp(17),0) # exp(17) is about 24.2 million
budget is handled the same way as gross: it is right-skewed, the median of log(budget) is about 17, and the NAs are imputed with exp(17).
plot(budget,imdb_score)
hist(log(budget))
summary(log(budget)) # median of log(budget) is about 17
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 5.385 15.607 16.811 16.442 17.622 23.226 477
traindata$budget[is.na(traindata$budget)] = exp(17)
####### Movie Facebook Likes #######
movie_likes = rep(0,nrow(traindata))
for(i in 1:nrow(traindata)){
if(log(traindata$movie_facebook_likes[i])>7) movie_likes[i] = 1 # 1 if likes exceed exp(7), about 1097
}
traindata = traindata[,names(traindata) !="movie_facebook_likes"]
traindata = data.frame(traindata,movie_likes)
####### Director Facebook Likes #######
sum(is.na(director_facebook_likes))
## [1] 98
traindata$director_facebook_likes[is.na(traindata$director_facebook_likes)] = round(exp(4),0) # impute NAs with exp(4), about 55
director_likes = rep(0,nrow(traindata))
for(i in 1:nrow(traindata)){
if(log(traindata$director_facebook_likes[i])>log(7)) director_likes[i] = 1 # 1 if the director has more than 7 likes
}
traindata = traindata[,names(traindata) !="director_facebook_likes"]
traindata = data.frame(traindata,director_likes)
####### facenumber_in_poster #######
table(facenumber_in_poster)
## facenumber_in_poster
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 2118 1218 703 372 202 112 73 48 37 17 10 4 4 2 1
## 15 19 31 43
## 6 1 1 1
traindata$facenumber_in_poster[is.na(traindata$facenumber_in_poster)] = 0 # impute NAs with 0, by far the most common value
sum(is.na(traindata$facenumber_in_poster))
## [1] 0
facenum = factor(traindata$facenumber_in_poster)
levels(facenum) = c("0","1",rep("2+",17)) # collapse counts of 2 and above into "2+"
traindata = data.frame(traindata,facenum)
traindata = traindata[,names(traindata) !="facenumber_in_poster"] # 3610 left, 13 variables
write.csv(traindata,"train_EDA.csv")
final = read.csv("train_EDA.csv") # re-read the cleaned data; the test set is knitted into this same format
train= read.csv("train_EDA.csv")
attach(train)
## The following objects are masked _by_ .GlobalEnv:
##
## director_likes, duration_factor, facenum, movie_likes,
## old_year
## The following objects are masked from traindata:
##
## aspect_ratio, budget, color, content_rating, gross,
## imdb_score, num_voted_users, X
library(nortest)
ad.test(train$imdb_score^2)
##
## Anderson-Darling normality test
##
## data: train$imdb_score^2
## A = 2.009, p-value = 4.096e-05
hist(train$imdb_score^2)
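The squared score is tested here, presumably because imdb_score is left-skewed and squaring pulls it toward symmetry. As a quick sketch (results not shown), the Anderson-Darling p-values of several candidate transformations could be compared directly:
# compare normality across transformations; a larger p-value means less evidence against normality
sapply(list(raw = train$imdb_score, squared = train$imdb_score^2, log = log(train$imdb_score)), function(x) ad.test(x)$p.value)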
final1 = train[,-1]
final1$budget = log(final1$budget)
final1$gross = log(final1$gross)
final1$facenum = as.factor(final1$facenum)
set.seed(5)
3610*0.7
## [1] 2527
trainset <- sample(3610,2527)
lm.train <- lm(imdb_score~.-X, data=final1, subset=trainset)
mean((imdb_score-predict(lm.train,train))[-trainset]^2)
## [1] 3.151454e+12
mean((imdb_score-predict(lm.train,train))[trainset]^2)
## [1] 3.242231e+12
These MSEs are absurdly large because predict() is applied to train, whose budget and gross columns are not log-transformed, while the model was fit on final1, where they are. The loop below predicts on final1 instead.
train_mse <- c()
test_mse <- c()
for(i in 1:30){
set.seed(i)
trainset <- sample(3610,2527)
train_mse[i] <- mean((imdb_score-predict(lm.train,final1))[trainset]^2)
test_mse[i] <- mean((imdb_score-predict(lm.train,final1))[-trainset]^2)
}
train_mse
## [1] 0.8774790 0.8596545 0.8529831 0.8831807 0.8951869 0.9145522 0.9214736
## [8] 0.8915097 0.8916611 0.8647268 0.9028516 0.8574575 0.8813133 0.8715445
## [15] 0.9071086 0.8949798 0.9028533 0.8846323 0.8695782 0.8951246 0.8471012
## [22] 0.8831854 0.8974433 0.8926559 0.8851056 0.8652052 0.9025727 0.8677095
## [29] 0.8890618 0.8849036
test_mse
## [1] 0.9009307 0.9425214 0.9580878 0.8876268 0.8596124 0.8144267 0.7982767
## [8] 0.8681924 0.8678393 0.9306860 0.8417281 0.9476476 0.8919840 0.9147779
## [15] 0.8317951 0.8600956 0.8417241 0.8842398 0.9193660 0.8597577 0.9718124
## [22] 0.8876158 0.8543473 0.8655181 0.8831355 0.9295696 0.8423788 0.9237263
## [29] 0.8739042 0.8836068
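Note that the loop above never refits lm.train; only the split changes between seeds, so the spread in these MSEs reflects resampling alone. A sketch that re-estimates the model on each split (same formula, nothing else assumed) would look like this:
train_mse2 <- test_mse2 <- numeric(30)
for(i in 1:30){
set.seed(i)
trainset <- sample(nrow(final1), 2527)
fit <- lm(imdb_score~.-X, data=final1, subset=trainset) # refit on this split
pred <- predict(fit, final1)
train_mse2[i] <- mean((final1$imdb_score - pred)[trainset]^2)
test_mse2[i] <- mean((final1$imdb_score - pred)[-trainset]^2)
}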
library(nlme)
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
score.gls<-gls(model=imdb_score^2~.,data=final1,weights=varFixed(~imdb_score)) # varFixed(~imdb_score): residual variance assumed proportional to imdb_score
score.gls
## Generalized least squares fit by REML
## Model: imdb_score^2 ~ .
## Data: final1
## Log-restricted-likelihood: -14059.46
##
## Coefficients:
## (Intercept) X colorColor
## 3.884887e+01 6.076192e-04 -7.703839e+00
## gross num_voted_users content_ratingPG
## -2.942000e-01 5.085463e-05 -3.270324e+00
## content_ratingPG-13 content_ratingR budget
## -4.862549e+00 -1.818348e+00 2.391173e-01
## aspect_ratio actors_facebook_like old_year
## 8.988883e-01 3.454743e-05 5.088566e+00
## duration_factor movie_likes director_likes
## 8.810305e+00 4.402444e+00 1.208128e+00
## facenum1 facenum2+
## 2.085816e-01 -8.368678e-01
##
## Variance function:
## Structure: fixed weights
## Formula: ~imdb_score
## Degrees of freedom: 3610 total; 3593 residual
## Residual standard error: 4.730973
plot(score.gls)
TrainSet = final1[trainset,]
ValidSet = final1[-trainset,]
library(e1071)
model <- svm(imdb_score ~ ., data=TrainSet)
predictedY <- predict(model, TrainSet) # fitted values on the training set (predict() takes newdata, not data=)
svm.test <- round(as.numeric(predict(model, ValidSet)),1)
(testmse <- mean((svm.test - ValidSet$imdb_score)^2))
## [1] 0.8163435
svm_mse = numeric(30) # renamed from "svm" so that e1071::svm is not masked
for(i in 1:30){
set.seed(i)
train <- sample(nrow(final1), 0.7*nrow(final1), replace = FALSE)
TrainSet <- final1[train,]
ValidSet <- final1[-train,]
model <- svm(imdb_score ~ ., data=TrainSet)
svm.test <- round(as.numeric(predict(model, ValidSet)),1)
svm_mse[i] <- mean((svm.test - ValidSet$imdb_score)^2)
}
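The models above rely on e1071's defaults (radial kernel, cost = 1, gamma = 1/number of inputs). As a sketch, the hyperparameters could be tuned with the package's tune.svm; the grid values here are illustrative, not tuned choices:
set.seed(1)
tuned <- tune.svm(imdb_score ~ ., data = TrainSet, gamma = 10^(-3:0), cost = 10^(0:2)) # grid search with 10-fold CV
tuned$best.parameters # gamma/cost pair with the lowest CV error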
library(gbm)
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
## Boosting
score=vector(length=30)
for (i in 1:length(score)){
set.seed(i)
train=sample(1:nrow(final1),nrow(final1)*0.7)
boost.score = gbm(imdb_score~.,data=final1[train,],distribution="gaussian",n.trees=6000,interaction.depth=8)
yhat.boost=round(predict(boost.score,newdata=final1[-train,],n.trees=6000),1)
score.test=final1[-train,"imdb_score"]
score[i]=mean((yhat.boost-score.test)^2)
}
score
## [1] 0.7780979 0.7700646 0.7482179 0.7275531 0.6960572 0.6660295 0.6606556
## [8] 0.7239520 0.7075531 0.8015789 0.6929548 0.7847738 0.7324931 0.7760018
## [15] 0.6882825 0.7013758 0.7228624 0.7184488 0.7931025 0.7127886 0.8042290
## [22] 0.7327147 0.6798707 0.7329178 0.7446999 0.7876454 0.7102216 0.7619945
## [29] 0.7160018 0.7626500
# interaction.depth: the CV below shows depth 8 gives the minimum test MSE
int.cv = vector(length = 10)
set.seed(1)
train=sample(1:nrow(final1),nrow(final1)*0.7)
for(i in 1:10){
boost.score = gbm(imdb_score~.,data=final1[train,],distribution="gaussian",n.trees=5000,interaction.depth=i)
yhat.boost=round(predict(boost.score,newdata=final1[-train,],n.trees=5000),1)
score.test=final1[-train,"imdb_score"]
int.cv[i]=mean((yhat.boost-score.test)^2)
}
plot(int.cv,type="b")
# ntree
ntree.cv = vector(length = 10)
set.seed(1)
train=sample(1:nrow(final1),nrow(final1)*0.7)
for(i in 1:10){
boost.score = gbm(imdb_score~.,data=final1[train,],distribution="gaussian",n.trees=1000*i,interaction.depth=8)
yhat.boost=round(predict(boost.score,newdata=final1[-train,],n.trees=1000*i),1)
score.test=final1[-train,"imdb_score"]
ntree.cv[i]=mean((yhat.boost-score.test)^2)
}
plot(seq(from=1000,to= 10000,by = 1000),ntree.cv,type="b",main = "ntree.cv",xlab = "n.trees")
# shrinkage
shrink.cv = vector(length = 10)
set.seed(1)
train=sample(1:nrow(final1),nrow(final1)*0.7)
for(i in 1:10){
boost.score = gbm(imdb_score~.,data=final1[train,],distribution="gaussian",n.trees=6000,interaction.depth = 8,shrinkage = 0.1*i)
yhat.boost=round(predict(boost.score,newdata=final1[-train,],n.trees=6000),1)
score.test=final1[-train,"imdb_score"]
shrink.cv[i]=mean((yhat.boost-score.test)^2)
}
plot(seq(from=0.1,to= 1,by = 0.1),shrink.cv,type="b",main = "shrink.cv",xlab = "shrinkage")
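Instead of manual grids over n.trees, gbm can select the number of trees by built-in cross-validation; a sketch using the same settings as above (cv.folds and gbm.perf are part of the gbm package):
set.seed(1)
boost.cv = gbm(imdb_score~.,data=final1[train,],distribution="gaussian",n.trees=6000,interaction.depth=8,cv.folds=5)
gbm.perf(boost.cv,method="cv") # returns the CV-optimal number of trees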
## Random Forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
set.seed(60)
train = sample(1:nrow(final1), round(nrow(final1)*0.7,0))
score.test = final1$imdb_score[-train]
set.seed(1)
rf.score=randomForest(imdb_score~.,data=final1,subset=train,mtry=3,importance=TRUE)
# mtry = sqrt(p)
yhat.rf = predict(rf.score,newdata=final1[-train,])
mean((yhat.rf-score.test)^2)
## [1] 0.7957928
importance(rf.score)
## %IncMSE IncNodePurity
## X 31.819668 321.85616
## color 10.860215 34.87457
## gross 34.723210 330.29629
## num_voted_users 73.938427 833.09014
## content_rating 24.307631 145.07818
## budget 32.927346 278.99730
## aspect_ratio 14.693526 74.41146
## actors_facebook_like 17.171702 356.51664
## old_year 32.792916 93.87336
## duration_factor 27.486251 82.43453
## movie_likes 14.762952 114.81998
## director_likes 7.255856 41.94336
## facenum 5.612132 87.39188
varImpPlot(rf.score)
Note that X is just the row index carried over from the CSV; the linear model drops it with ~ . - X, and it should arguably be excluded from the SVM, boosting, and random forest models as well.
mtry.cv = rep(NA, 6) # slot 1 (mtry = 1) is left unevaluated by the loop below
set.seed(60)
train = sample(1:nrow(final1), round(nrow(final1)*0.7,0))
score.test = final1$imdb_score[-train]
for(i in 2:6){
rf.score=randomForest(imdb_score~.,data=final1,subset=train,mtry=i,importance=TRUE)
yhat.rf = predict(rf.score,newdata=final1[-train,])
mtry.cv[i] = mean((yhat.rf-score.test)^2)
}
# mtry = 4 gives the minimum test MSE
plot(1:6,mtry.cv,type="b", main = "mtry cv")
abline(v =4,col="red",lwd=2)
ntrf.cv = vector(length = 10) # vary ntree; the randomForest default is 500
set.seed(60)
train = sample(1:nrow(final1), round(nrow(final1)*0.7,0))
score.test = final1$imdb_score[-train]
for(i in 1:10){
rf.score=randomForest(imdb_score~.,data=final1,subset=train,mtry=4,importance=TRUE,ntree = 100*i)
yhat.rf = predict(rf.score,newdata=final1[-train,])
ntrf.cv[i] = mean((yhat.rf-score.test)^2)
}
plot(seq(from=100,to=1000,by=100),ntrf.cv,type="b",main = "ntree cv",xlab = "ntree")
ranfor = vector(length = 30)
for (i in 1:30){
set.seed(i)
train = sample(1:nrow(final1), round(nrow(final1)*0.7,0))
score.test = final1$imdb_score[-train]
set.seed(1)
rf.score=randomForest(imdb_score~.,data=final1,subset=train,mtry=4,importance=TRUE)
# mtry = sqrt(p)
yhat.rf = predict(rf.score,newdata=final1[-train,])
ranfor[i] = mean((yhat.rf-score.test)^2)
}
par(mfrow=c(1,4))
boxplot(test_mse,main = "Linear Regression",ylim = c(0.5,1))
boxplot(svm_mse , main = "Support Vector Machine",ylim = c(0.5,1))
boxplot(score, main = "Boosting",ylim = c(0.5,1))
boxplot(ranfor,main = "Random Forest",ylim = c(0.5,1))
wilcox.test(score,ranfor,alternative = c("two.sided"),conf.level = 0.95)
##
## Wilcoxon rank sum test
##
## data: score and ranfor
## W = 371, p-value = 0.2478
## alternative hypothesis: true location shift is not equal to 0
With p = 0.2478, the test finds no significant difference in location between the boosting and random forest test MSEs.
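Since the same 30 seeds generate identical train/test splits for boosting and random forest, the two MSE vectors are paired; a paired Wilcoxon signed-rank test respects that structure (a sketch, not the analysis above):
wilcox.test(score, ranfor, paired = TRUE) # paired version of the comparison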