1. Exploratory Data Analysis

27 Variables and 4943 Obs -> To predict imdb_score

1. Remove Variables

1-1. Remove Duplicates

sum(duplicated(traindata$movie_title)) # remove 123 duplicated value

## [1] 123

traindata = traindata[!duplicated(traindata$movie_title),] # 4820 left

1-2. Remove movie_imdb_link: Hyperlink -> no explanatory information for response variable; identifier

traindata = traindata[,names(traindata) !="movie_title"]
traindata = traindata[,names(traindata) !="movie_imdb_link"]

1-3. Remove Text Variable: Hard to find efficient way to transform the data

traindata = traindata[,names(traindata) !="genres"]
traindata = traindata[,names(traindata) !="plot_keywords"]

####### Remove Names #######
traindata = traindata[,names(traindata) !="director_name"] 
traindata = traindata[,names(traindata) !="actor_1_name"] 
traindata = traindata[,names(traindata) !="actor_2_name"] 
traindata = traindata[,names(traindata) !="actor_3_name"]

1-4. Language and Country

table(traindata$language)/nrow(traindata)

## 
##                Aboriginal       Arabic      Aramaic      Bosnian 
## 0.0020746888 0.0004149378 0.0010373444 0.0002074689 0.0002074689 
##    Cantonese      Chinese        Czech       Danish         Dari 
## 0.0022821577 0.0006224066 0.0002074689 0.0010373444 0.0004149378 
##        Dutch     Dzongkha      English     Filipino       French 
## 0.0008298755 0.0002074689 0.9321576763 0.0002074689 0.0149377593 
##       German        Greek       Hebrew        Hindi    Hungarian 
## 0.0039419087 0.0002074689 0.0010373444 0.0058091286 0.0002074689 
##    Icelandic   Indonesian      Italian     Japanese      Kannada 
## 0.0004149378 0.0004149378 0.0020746888 0.0035269710 0.0002074689 
##       Kazakh       Korean     Mandarin         Maya    Mongolian 
## 0.0002074689 0.0016597510 0.0043568465 0.0002074689 0.0002074689 
##         None    Norwegian      Panjabi      Persian       Polish 
## 0.0004149378 0.0008298755 0.0002074689 0.0008298755 0.0006224066 
##   Portuguese     Romanian      Russian    Slovenian      Spanish 
## 0.0016597510 0.0004149378 0.0022821577 0.0002074689 0.0082987552 
##      Swahili      Swedish       Telugu         Thai         Urdu 
## 0.0002074689 0.0010373444 0.0002074689 0.0006224066 0.0002074689 
##   Vietnamese         Zulu 
## 0.0002074689 0.0004149378

table(traindata$country)/nrow(traindata)

## 
##                               Afghanistan            Argentina 
##         0.0010373444         0.0002074689         0.0008298755 
##                Aruba            Australia              Bahamas 
##         0.0002074689         0.0109958506         0.0002074689 
##              Belgium               Brazil             Bulgaria 
##         0.0006224066         0.0014522822         0.0002074689 
##             Cambodia             Cameroon               Canada 
##         0.0002074689         0.0002074689         0.0248962656 
##                Chile                China             Colombia 
##         0.0002074689         0.0053941909         0.0002074689 
##       Czech Republic              Denmark   Dominican Republic 
##         0.0006224066         0.0022821577         0.0002074689 
##                Egypt              Finland               France 
##         0.0002074689         0.0002074689         0.0315352697 
##              Georgia              Germany               Greece 
##         0.0002074689         0.0190871369         0.0004149378 
##            Hong Kong              Hungary              Iceland 
##         0.0035269710         0.0004149378         0.0006224066 
##                India            Indonesia                 Iran 
##         0.0068464730         0.0002074689         0.0008298755 
##              Ireland               Israel                Italy 
##         0.0024896266         0.0008298755         0.0045643154 
##                Japan                Kenya           Kyrgyzstan 
##         0.0045643154         0.0002074689         0.0002074689 
##               Mexico          Netherlands             New Line 
##         0.0035269710         0.0010373444         0.0002074689 
##          New Zealand              Nigeria               Norway 
##         0.0024896266         0.0002074689         0.0016597510 
##        Official site             Pakistan               Panama 
##         0.0002074689         0.0002074689         0.0002074689 
##                 Peru          Philippines               Poland 
##         0.0002074689         0.0002074689         0.0008298755 
##              Romania               Russia             Slovakia 
##         0.0008298755         0.0022821577         0.0002074689 
##             Slovenia         South Africa          South Korea 
##         0.0002074689         0.0016597510         0.0029045643 
##         Soviet Union                Spain               Sweden 
##         0.0002074689         0.0066390041         0.0012448133 
##          Switzerland               Taiwan             Thailand 
##         0.0006224066         0.0002074689         0.0010373444 
##               Turkey                   UK United Arab Emirates 
##         0.0002074689         0.0885892116         0.0002074689 
##                  USA         West Germany 
##         0.7543568465         0.0006224066

par(mfrow=c(1,2))
plot(traindata$language);plot(traindata$country)

Vast majority of the movies are in English(93%) and made in USA(75%), UK(8.8%). Thus, I decided to keep only the movies in English and made in USA for the convinience of the analysis.

####### USA Only #######
traindata = traindata[traindata$country == "USA",] # 984 other countries
traindata = traindata[,names(traindata) !="country"]

####### English Only #######
traindata = traindata[traindata$language == "English",] 
traindata = traindata[,names(traindata) !="language"]

1-5. “Num” Variables

par(mfrow=c(2,3))
hist(num_critic_for_reviews,data =traindata);hist(log(num_critic_for_reviews),data=traindata)

## Warning in plot.window(xlim, ylim, "", ...): "data"는 그래픽 매개변수가 아
## 닙니다

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(1, ...): "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(2, ...): "data"는 그래픽 매개변수가 아닙니다

## Warning in plot.window(xlim, ylim, "", ...): "data"는 그래픽 매개변수가 아
## 닙니다

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(1, ...): "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(2, ...): "data"는 그래픽 매개변수가 아닙니다

hist(num_user_for_reviews,data=traindata);hist(log(num_user_for_reviews),data=traindata)

## Warning in plot.window(xlim, ylim, "", ...): "data"는 그래픽 매개변수가 아
## 닙니다

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(1, ...): "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(2, ...): "data"는 그래픽 매개변수가 아닙니다

## Warning in plot.window(xlim, ylim, "", ...): "data"는 그래픽 매개변수가 아
## 닙니다

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(1, ...): "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(2, ...): "data"는 그래픽 매개변수가 아닙니다

hist(num_voted_users,data=traindata);hist(log(num_voted_users),data=traindata)

## Warning in plot.window(xlim, ylim, "", ...): "data"는 그래픽 매개변수가 아
## 닙니다

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(1, ...): "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(2, ...): "data"는 그래픽 매개변수가 아닙니다

## Warning in plot.window(xlim, ylim, "", ...): "data"는 그래픽 매개변수가 아
## 닙니다

## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...):
## "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(1, ...): "data"는 그래픽 매개변수가 아닙니다

## Warning in axis(2, ...): "data"는 그래픽 매개변수가 아닙니다

Those 3 variables above shows similar distribution, and we can assume these values might give similar information; they might be correlated.

A = traindata[,c("num_critic_for_reviews","num_user_for_reviews","num_voted_users")]
A = cbind(log(A),traindata$imdb_score);A = na.exclude(A)
cor(A)

##                        num_critic_for_reviews num_user_for_reviews
## num_critic_for_reviews              1.0000000            0.8218750
## num_user_for_reviews                0.8218750            1.0000000
## num_voted_users                     0.8435810            0.9154764
## traindata$imdb_score                0.2964559            0.3455342
##                        num_voted_users traindata$imdb_score
## num_critic_for_reviews       0.8435810            0.2964559
## num_user_for_reviews         0.9154764            0.3455342
## num_voted_users              1.0000000            0.4060484
## traindata$imdb_score         0.4060484            1.0000000

The correlation matrix shows the the three “num” variables are highly correlated with Pearson Correlation higher than 0.8. So let’s keep only one of them which has the strongest linear relationship with imdb_score: num_voted_users

sum(is.na(traindata$num_voted_users))

## [1] 0

traindata = traindata[,names(traindata) !="num_critic_for_reviews"] # remove num_critic
traindata = traindata[,names(traindata) !="num_user_for_reviews"]

1-6. Actor Facebook Likes

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

traindata = mutate(traindata,actors_facebook_like = traindata$actor_1_facebook_likes+traindata$actor_2_facebook_likes+traindata$actor_3_facebook_likes)
traindata = traindata[,names(traindata) !="actor_1_facebook_likes"] # each actor like
traindata = traindata[,names(traindata) !="actor_2_facebook_likes"] # each actor like
traindata = traindata[,names(traindata) !="actor_3_facebook_likes"] # each actor like
traindata = traindata[,names(traindata) !="cast_total_facebook_likes"] # cast total

sum(is.na(traindata$actors_facebook_like))

## [1] 13

summary(log(traindata$actors_facebook_like)) # 8

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    -Inf   7.249   7.798    -Inf   9.538  13.392      13

traindata$actors_facebook_like[is.na(traindata$actors_facebook_like)] = round(exp(8),0)

2. Variable Transformation

2-1. Color

table(traindata$color)

## 
##                 Black and White           Color 
##              10             142            3458

library(dplyr)
traindata%>%
  group_by(color)%>%
  summarize(median(imdb_score))

Even though there is an imbalance of the number of data between Black and White and `Color, we keep this variable as it is since we can clearly see that the score between the two differs about one point.

# Check NA
length(traindata$color[(traindata$color !="Color") & (traindata$color!="Black and White")])

## [1] 10

There is 17 NA Values for this data, so let’s do imputation with “Color” as it accounts for the vast majority of the value.

traindata$color[(traindata$color !="Color") & (traindata$color!="Black and White")] = "Color"

2-2. Title Year

par(mfrow=c(1,2))
hist(title_year)
plot(imdb_score~title_year)

summary(title_year)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1916    1999    2005    2002    2011    2016     102

The distribution of the year is extremely left-skwed. Some of the old movies seems to have hightraier score, but the distribution of imdb score for the recent movies is random.

Notice that 75% of the movies scored was released after 1999. So we are going to transform this variable to dummy vairable; old movie or not.

Let’s impute the missing values with the median before transforming the data.

sum(is.na(traindata$title_year)) # 100

## [1] 68

traindata$title_year[is.na(traindata$title_year)] = 2005

old_year = rep(0,nrow(traindata))
for(i in 1:nrow(traindata)){
  if(traindata$title_year[i]<1999) old_year[i] = 1
}

traindata = data.frame(traindata,old_year)
traindata = traindata[,names(traindata) !="title_year"]

2-3. Aspect Ratio

These are the standard types of aspect ratio. * 1.37(4:3) Academy standard film aspect ratio * 1.85(16:9) A common US widescreen cinema standard * 2.35 A current widescreen cinema standard

Given the background knowledge, let us look into the data.

####### aspect_ratio #######
table(traindata$aspect_ratio)

## 
## 1.18  1.2 1.33 1.37 1.44  1.5 1.66 1.75 1.77 1.78 1.85 1.89    2  2.2 2.24 
##    1    1   46   85    1    2   37    2    1   93 1464    1    2   10    1 
## 2.35 2.39  2.4 2.55 2.76    4   16 
## 1597   11    3    2    3    5   31

We might assume all the other numbers other than the listed 3 types are typo, since they shows relatively small portions. So let’s clean up the data into those three types of the ratio.

traindata$aspect_ratio = factor(traindata$aspect_ratio)
levels(traindata$aspect_ratio) = c(rep("1.37",5),rep("1.85",7),rep("2.35",8),"1.37","1.85")

sum(is.na(traindata$aspect_ratio)) #217 NA

## [1] 211

table(traindata$aspect_ratio)

## 
## 1.37 1.85 2.35 
##  139 1631 1629

We cannot choose one representative ratio, so let’s try random imputation, which keep the distribution as before even after the imputation.

# random imputation
aspect.imp = function(n){
  pop = c(rep("1.37",142),rep("1.85",1638),rep("2.35",1639))
  sample(pop,n,replace = T)
}

traindata$aspect_ratio[is.na(traindata$aspect_ratio)] = aspect.imp(sum(is.na(traindata$aspect_ratio)))
table(traindata$aspect_ratio)

## 
## 1.37 1.85 2.35 
##  147 1739 1724

2-4. Content Rating

boxplot(imdb_score~content_rating,data = traindata)

Based on the frequences of content rating, let’s combine into the 4 categories: R, PG-13, PG, Others.

table(traindata$content_rating)

## 
##            Approved         G        GP         M     NC-17 Not Rated 
##       148        45        83         4         2         5        47 
##    Passed        PG     PG-13         R     TV-14      TV-G     TV-MA 
##         9       529      1141      1497        24         8        11 
##     TV-PG      TV-Y     TV-Y7   Unrated         X 
##        11         0         1        34        11

levels(traindata$content_rating) = c(rep("others",8),"PG","PG-13","R",rep("others",8))

table(traindata$content_rating)

## 
## others     PG  PG-13      R 
##    443    529   1141   1497

boxplot(imdb_score~content_rating,data = traindata)

2-5. Duration

par(mfrow=c(1,2))
hist(traindata$duration);hist(log(traindata$duration))

The distribution is right skewed, and there are lots of outliers.

bstat = boxplot(duration)

bstat$stats

##      [,1]
## [1,]   58
## [2,]   93
## [3,]  103
## [4,]  118
## [5,]  155
## attr(,"class")
##         1 
## "integer"

plot(imdb_score~duration,data = traindata)
abline(v=58,col="red",lty =2);abline(v=155,col="red",lty=2)
abline(h = 6,col="blue",lty=2)

Based on the boxplot statistics, the outliers are defined as the movie with duration shorter than 58 minutes or longer than 155 minutes. The value outside the outlier boundaries tends to have movie rating higher than 6. So, let’s make dummy variable of duration whether it is outlier or not.

duration_factor = rep(0,nrow(traindata))
sum(is.na(traindata$duration))

## [1] 6

traindata$duration[is.na(traindata$duration)] = 103 # median
for(i in 1:nrow(traindata)){
  if(traindata$duration[i] > 155 | traindata$duration[i] <58) duration_factor[i] = 1
}
table(duration_factor)

## duration_factor
##    0    1 
## 3449  161

traindata = data.frame(traindata,duration_factor)
traindata = traindata[,names(traindata) !="duration"]

2-6. Gross

hist(gross)

hist(log(gross))

plot(log(gross),imdb_score)

sum(is.na(log(gross)))

## [1] 863

summary(log(gross)) # 17

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   5.088  15.485  17.053  16.381  17.948  20.450     863

gross seems to have slight positive relation with imdb_score, so let’s keep the value as it is. Since the value of gross itself is extremely right-skewed, let’s use the median of log transformation value, which is 17 to impute the NA values.

traindata$gross[is.na(traindata$gross)] = round(exp(17),0)

2-7 Budget

Same as gross

plot(budget,imdb_score)

hist(log(budget))

summary(log(budget)) # 17

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   5.385  15.607  16.811  16.442  17.622  23.226     477

traindata$budget[is.na(traindata$budget)] = exp(17)

2-8. Movie Facebook Likes

####### Movie Facebook Likes #######

movie_likes = rep(0,nrow(traindata))
for(i in 1:nrow(traindata)){
  if(log(movie_facebook_likes[i])>7) movie_likes[i] =1
}

traindata = traindata[,names(traindata) !="movie_facebook_likes"]
traindata = data.frame(traindata,movie_likes)

2-9. Director Facebook Likes

####### Director Facebook Likes #######

sum(is.na(director_facebook_likes))

## [1] 98

traindata$director_facebook_likes[is.na(traindata$director_facebook_likes)] = round(exp(4),0)

director_likes = rep(0,nrow(traindata))
for(i in 1:nrow(traindata)){
  if(log(traindata$director_facebook_likes[i])>log(7)) director_likes[i] = 1
}

traindata = traindata[,names(traindata) !="director_facebook_likes"]
traindata = data.frame(traindata,director_likes)

2-10. Face Number in the Poster

####### facenumber_in_poster #######
table(facenumber_in_poster)

## facenumber_in_poster
##    0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
## 2118 1218  703  372  202  112   73   48   37   17   10    4    4    2    1 
##   15   19   31   43 
##    6    1    1    1

traindata$facenumber_in_poster[is.na(traindata$facenumber_in_poster)] = 0
sum(is.na(traindata$facenumber_in_poster))

## [1] 0

facenum = factor(traindata$facenumber_in_poster)
levels(facenum) = c("0","1",rep("2+",17))
traindata = data.frame(traindata,facenum)
traindata = traindata[,names(traindata) !="facenumber_in_poster"] # 3610 left, 13 variables

write.csv(traindata,"train_EDA.csv")
final = read.csv("train_EDA.csv") # knit the test data to align the traindata format

Training the Model and Model Selection

1. Linear Regression

train= read.csv("train_EDA.csv")
attach(train)

## The following objects are masked _by_ .GlobalEnv:
## 
##     director_likes, duration_factor, facenum, movie_likes,
##     old_year

## The following objects are masked from traindata:
## 
##     aspect_ratio, budget, color, content_rating, gross,
##     imdb_score, num_voted_users, X

library(nortest)
ad.test(train$imdb_score^2)

## 
##  Anderson-Darling normality test
## 
## data:  train$imdb_score^2
## A = 2.009, p-value = 4.096e-05

hist(train$imdb_score^2)

final1 = train[,-1]
final1$budget = log(final1$budget)
final1$gross = log(final1$gross)
final1$facenum = as.factor(final1$facenum)

set.seed(5)
3610*0.7

## [1] 2527

trainset <- sample(3610,2527)
lm.train <- lm(imdb_score~.-X, data=final1, subset=trainset)
mean((imdb_score-predict(lm.train,train))[-trainset]^2)

## [1] 3.151454e+12

mean((imdb_score-predict(lm.train,train))[trainset]^2)

## [1] 3.242231e+12

train_mse <- c()
test_mse <- c()
for(i in 1:30){
  set.seed(i)
  trainset <- sample(3610,2527)
  train_mse[i] <- mean((imdb_score-predict(lm.train,final1))[trainset]^2)
  test_mse[i] <- mean((imdb_score-predict(lm.train,final1))[-trainset]^2)
}
train_mse

##  [1] 0.8774790 0.8596545 0.8529831 0.8831807 0.8951869 0.9145522 0.9214736
##  [8] 0.8915097 0.8916611 0.8647268 0.9028516 0.8574575 0.8813133 0.8715445
## [15] 0.9071086 0.8949798 0.9028533 0.8846323 0.8695782 0.8951246 0.8471012
## [22] 0.8831854 0.8974433 0.8926559 0.8851056 0.8652052 0.9025727 0.8677095
## [29] 0.8890618 0.8849036

test_mse

##  [1] 0.9009307 0.9425214 0.9580878 0.8876268 0.8596124 0.8144267 0.7982767
##  [8] 0.8681924 0.8678393 0.9306860 0.8417281 0.9476476 0.8919840 0.9147779
## [15] 0.8317951 0.8600956 0.8417241 0.8842398 0.9193660 0.8597577 0.9718124
## [22] 0.8876158 0.8543473 0.8655181 0.8831355 0.9295696 0.8423788 0.9237263
## [29] 0.8739042 0.8836068

2. Weighted Least Squares

library(nlme)

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:dplyr':
## 
##     collapse

score.gls<-gls(model=imdb_score^2~.,data=final1,weights=varFixed(~imdb_score))  
score.gls

## Generalized least squares fit by REML
##   Model: imdb_score^2 ~ . 
##   Data: final1 
##   Log-restricted-likelihood: -14059.46
## 
## Coefficients:
##          (Intercept)                    X           colorColor 
##         3.884887e+01         6.076192e-04        -7.703839e+00 
##                gross      num_voted_users     content_ratingPG 
##        -2.942000e-01         5.085463e-05        -3.270324e+00 
##  content_ratingPG-13      content_ratingR               budget 
##        -4.862549e+00        -1.818348e+00         2.391173e-01 
##         aspect_ratio actors_facebook_like             old_year 
##         8.988883e-01         3.454743e-05         5.088566e+00 
##      duration_factor          movie_likes       director_likes 
##         8.810305e+00         4.402444e+00         1.208128e+00 
##             facenum1            facenum2+ 
##         2.085816e-01        -8.368678e-01 
## 
## Variance function:
##  Structure: fixed weights
##  Formula: ~imdb_score 
## Degrees of freedom: 3610 total; 3593 residual
## Residual standard error: 4.730973

plot(score.gls)

3. Support Vector Machine

TrainSet = final1[trainset,]
ValidSet = final1[-trainset,]
library(e1071)
model <- svm(imdb_score ~ ., data=TrainSet)
predictedY <- predict(model, data=TrainSet)

svm.test <- round(as.numeric(predict(model, ValidSet)),1)
(testmse <- mean((svm.test - ValidSet$imdb_score)^2))

## [1] 0.8163435

svm  = vector(length = 30)
for(i in 1:30){
  set.seed(i)
  train <- sample(nrow(final1), 0.7*nrow(final1), replace = FALSE)
  TrainSet <- final1[train,]
  ValidSet <- final1[-train,]
  model <- svm(imdb_score ~ ., data=TrainSet)
  svm.test <- round(as.numeric(predict(model, ValidSet)),1)
  svm[i] <- mean((svm.test - ValidSet$imdb_score)^2)

}

4. Boosting

library(gbm)

## Loading required package: survival

## Loading required package: lattice

## Loading required package: splines

## Loading required package: parallel

## Loaded gbm 2.1.3

## Boosting
score=vector(length=30)
for (i in 1:length(score)){
  set.seed(i)
  train=sample(1:nrow(final1),nrow(final1)*0.7)
  boost.score = gbm(imdb_score~.,data=final1[train,],distribution="gaussian",n.trees=6000,interaction.depth=8)
  yhat.boost=round(predict(boost.score,newdata=final1[-train,],n.trees=6000),1)
  score.test=final1[-train,"imdb_score"]
  score[i]=mean((yhat.boost-score.test)^2)
}

score

##  [1] 0.7780979 0.7700646 0.7482179 0.7275531 0.6960572 0.6660295 0.6606556
##  [8] 0.7239520 0.7075531 0.8015789 0.6929548 0.7847738 0.7324931 0.7760018
## [15] 0.6882825 0.7013758 0.7228624 0.7184488 0.7931025 0.7127886 0.8042290
## [22] 0.7327147 0.6798707 0.7329178 0.7446999 0.7876454 0.7102216 0.7619945
## [29] 0.7160018 0.7626500

# interaction.depths #8 is minimum
int.cv = vector(length = 10)
set.seed(1)
train=sample(1:nrow(final1),nrow(final1)*0.7)
for(i in 1:10){
  boost.score = gbm(imdb_score~.,data=final1[train,],distribution="gaussian",n.trees=5000,interaction.depth=i)
  yhat.boost=round(predict(boost.score,newdata=final1[-train,],n.trees=5000),1)
  score.test=final1[-train,"imdb_score"]
  int.cv[i]=mean((yhat.boost-score.test)^2)
}
plot(int.cv,type="b")

# ntree
ntree.cv = vector(length = 10)
set.seed(1)
train=sample(1:nrow(final1),nrow(final1)*0.7)
for(i in 1:10){
  boost.score = gbm(imdb_score~.,data=final1[train,],distribution="gaussian",n.trees=1000*i,interaction.depth=8)
  yhat.boost=round(predict(boost.score,newdata=final1[-train,],n.trees=1000*i),1)
  score.test=final1[-train,"imdb_score"]
  ntree.cv[i]=mean((yhat.boost-score.test)^2)
}
plot(seq(from=1000,to= 10000,by = 1000),ntree.cv,type="b",main = "ntree.cv",xlab = "")

# shringkage
shrink.cv = vector(length = 10)
set.seed(1)
train=sample(1:nrow(final1),nrow(final1)*0.7)
for(i in 1:10){
  boost.score = gbm(imdb_score~.,data=final1[train,],distribution="gaussian",n.trees=6000,interaction.depth = 8,shrinkage = 0.1*i)
  yhat.boost=round(predict(boost.score,newdata=final1[-train,],n.trees=6000),1)
  score.test=final1[-train,"imdb_score"]
  shrink.cv[i]=mean((yhat.boost-score.test)^2)
}
plot(seq(from=0.1,to= 1,by = 0.1),shrink.cv,type="b",main = "shrink.cv",xlab = "")

5. Random Forest

## Random Forest
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

set.seed(60)
train = sample(1:nrow(final1), round(nrow(final1)*0.7,0))
score.test = final1$imdb_score[-train]
set.seed(1)
rf.score=randomForest(imdb_score~.,data=final1,subset=train,mtry=3,importance=TRUE)
# mtry = sqrt(p)
yhat.rf = predict(rf.score,newdata=final1[-train,])
mean((yhat.rf-score.test)^2)

## [1] 0.7957928

importance(rf.score)

##                        %IncMSE IncNodePurity
## X                    31.819668     321.85616
## color                10.860215      34.87457
## gross                34.723210     330.29629
## num_voted_users      73.938427     833.09014
## content_rating       24.307631     145.07818
## budget               32.927346     278.99730
## aspect_ratio         14.693526      74.41146
## actors_facebook_like 17.171702     356.51664
## old_year             32.792916      93.87336
## duration_factor      27.486251      82.43453
## movie_likes          14.762952     114.81998
## director_likes        7.255856      41.94336
## facenum               5.612132      87.39188

varImpPlot(rf.score)

mtry.cv = vector(length = 5)
set.seed(60)
train = sample(1:nrow(final1), round(nrow(final1)*0.7,0))
score.test = final1$imdb_score[-train]
for(i in 2:6){
  rf.score=randomForest(imdb_score~.,data=final1,subset=train,mtry=i,importance=TRUE)
  yhat.rf = predict(rf.score,newdata=final1[-train,])
  mtry.cv[i] = mean((yhat.rf-score.test)^2)
}
# 4 minimum
plot(1:6,mtry.cv,type="b", main = "mtry cv")
abline(v =4,col="red",lwd=2)

ntrf.cv = vector(length = 10) # ntree = 500 using default
set.seed(60)
train = sample(1:nrow(final1), round(nrow(final1)*0.7,0))
score.test = final1$imdb_score[-train]
for(i in 1:10){
  rf.score=randomForest(imdb_score~.,data=final1,subset=train,mtry=4,importance=TRUE,ntree = 100*i)
  yhat.rf = predict(rf.score,newdata=final1[-train,])
  ntrf.cv[i] = mean((yhat.rf-score.test)^2)
}

plot(ntrf.cv,type="b")

ranfor = vector(length = 30)
for (i in 1:30){
  set.seed(i)
  train = sample(1:nrow(final1), round(nrow(final1)*0.7,0))
  score.test = final1$imdb_score[-train]
  set.seed(1)
  rf.score=randomForest(imdb_score~.,data=final1,subset=train,mtry=4,importance=TRUE)
  # mtry = sqrt(p)
  yhat.rf = predict(rf.score,newdata=final1[-train,])
  ranfor[i] = mean((yhat.rf-score.test)^2)
  
}

par(mfrow=c(1,5))
boxplot(test_mse,main = "Linear Regression",ylim = c(0.5,1))
boxplot(svm , main = "Support Vector Machine",ylim = c(0.5,1))
boxplot(score, main = "Boosting",ylim = c(0.5,1))
boxplot(ranfor,main = "Random Forest",ylim = c(0.5,1))

wilcox.test(score,ranfor,alternative = c("two.sided"),conf.level  = 0.95)

## 
##  Wilcoxon rank sum test
## 
## data:  score and ranfor
## W = 371, p-value = 0.2478
## alternative hypothesis: true location shift is not equal to 0

IMDB Score Prediction

Sowon Jeong