Maria Paracha and Catherine Williams
June 5, 2019
36 predictor variables
Label is continuous = number of wins
Ground Truth
Source: https://www.kaggle.com/gabrielegalimberti/movies-example-for-machine-learning-activities/
#checks if package is installed, if not, installs it. Then loads all packages
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packages <- c("tidyverse","caret","performance","Amelia",
"rpart","rpart.plot","randomForest",
"fastDummies","corrplot")
ipak(packages)
theme_set(theme_classic()) #applies classic theme to all charts
movies <- read.csv2("MACHINE_LEARNING_FINAL.csv")
[1] "ï..title" "year" "lifetime_gross"
[4] "ratingInteger" "ratingCount" "duration"
[7] "nrOfWins" "nrOfNominations" "nrOfPhotos"
[10] "nrOfNewsArticles" "nrOfUserReviews" "nrOfGenre"
[13] "Action" "Adult" "Adventure"
[16] "Animation" "Biography" "Comedy"
[19] "Crime" "Documentary" "Drama"
[22] "Family" "Fantasy" "Horror"
[25] "Music" "Musical" "Mystery"
[28] "News" "RealityTV" "Romance"
[31] "SciFi" "Short" "Sport"
[34] "TalkShow" "Thriller" "War"
[37] "Western"
#remove title and columns that do not have more than 1 unique value
movies <- movies[, sapply(movies, function(col) length(unique(col))) > 1] %>% select(-1)
#Exclude outliers for year
outliers <- boxplot(movies$year, main="Boxplot for year")$out
movies <- movies[-which(movies$year %in% outliers),]
#Exclude outliers for ratingCount
outliers <- boxplot(movies$ratingCount, main="Boxplot for ratingCount")$out
movies <- movies[-which(movies$ratingCount %in% outliers),]
#Exclude outliers for nrOfPhotos
outliers <- boxplot(movies$nrOfPhotos, main="Boxplot for nrOfPhotos")$out
movies <- movies[-which(movies$nrOfPhotos %in% outliers),]
Function to create charts comparing all variables against the label
gg_plot <- function(x_col, y_col=movies$nrOfWins, data=movies){
if(is.numeric(data[[x_col]])){
p1 <- data %>% ggplot(mapping=aes_string(x_col, y_col))+
geom_jitter(alpha=0.5)+
geom_smooth(method="loess", se=FALSE)+
labs(title=str_c("Awards vs ", x_col), y="label")
p1 %>% print()
h <- hist(data[[x_col]], breaks = "FD", plot = FALSE) #Freedman-Diaconis rule
p2 <- ggplot(data, aes_string(x_col))+
geom_histogram(aes(y = ..density..), breaks = h$breaks, alpha = 0.3, col = "white")+
geom_density(size = 1) +
labs(title=str_c("Histogram and density for ", x_col))
p2 %>% print() }
else{
p3 <- ggplot(data, aes_string(x_col, y_col))+
geom_boxplot()+
geom_hline(yintercept=mean(y_col), color="red")+
geom_hline(yintercept=median(y_col), color="blue", linetype="dashed")+
labs(title=str_c("Awards: Number of Wins by ", x_col), subtitle="Showing mean(red), median(blue)")
p3 %>% print() }
}
Histogram to show distribution of label
#view correlations, drop the insignificant relationships, sort by highest to lowest, and visualize results graphically
corr_simple <- function(data=movies,drop="nrOfWins"){
df_cor <- data %>% mutate_if(is.factor, as.numeric) %>% select(-drop)
corr <- cor(df_cor)
corr[lower.tri(corr,diag=TRUE)] <- NA #Prepare to drop duplicates and correlations of 1
corr[corr == 1] <- NA #drop perfect correlations
corr <- as.data.frame(as.table(corr)) #Turn into a 3-column table
corr <- na.omit(corr) #remove the NA values from above
corr <- subset(corr, abs(Freq) > 0.3) #select significant values
corr <- corr[order(-abs(corr$Freq)),] #Sort by highest correlation
print(corr)
#turn corr back into matrix in order to plot with corrplot
mtx_corr <- reshape2::acast(corr, Var1~Var2, value.var="Freq")
#plot correlations visually
corrplot::corrplot(mtx_corr, is.corr=FALSE, tl.col="black", na.label=" ")
}
Var1 Var2 Freq
260 ratingCount nrOfUserReviews 0.6775490
98 lifetime_gross ratingCount 0.5786108
196 ratingCount nrOfPhotos 0.5484434
228 ratingCount nrOfNewsArticles 0.5386525
194 lifetime_gross nrOfPhotos 0.4447820
263 nrOfPhotos nrOfUserReviews 0.4181773
264 nrOfNewsArticles nrOfUserReviews 0.3741623
230 nrOfNominations nrOfNewsArticles 0.3701706
164 ratingCount nrOfNominations 0.3560082
258 lifetime_gross nrOfUserReviews 0.3557660
163 ratingInteger nrOfNominations 0.3520855
549 duration Drama 0.3501868
231 nrOfPhotos nrOfNewsArticles 0.3480180
396 Adventure Animation 0.3443654
225 year nrOfNewsArticles 0.3356237
943 Comedy Thriller -0.3125063
547 ratingInteger Drama 0.3100243
226 lifetime_gross nrOfNewsArticles 0.3072152
#feature engineering
movies <- movies %>% mutate(Popularity = ratingCount+nrOfUserReviews+nrOfNewsArticles+nrOfPhotos) %>% select(-ratingCount,-nrOfUserReviews,-nrOfNewsArticles,-nrOfPhotos)
#transformations
movies <- movies %>%
mutate(year.log=log(year),
duration.log=log(duration),
Popularity.log=log(Popularity),
lifetime_gross.log=log(lifetime_gross),
nrOfWins.sqr = sqrt(nrOfWins),
nrOfNominations.sqr = sqrt(nrOfNominations)) %>%
select(-year,-duration,-Popularity,-nrOfNominations,-lifetime_gross)
#dropped column
movies <- movies %>% select(-lifetime_gross.log)
#normalize dataframe
normalize <- function(x)(x - mean(x, na.rm=T))/sd(x, na.rm=T)
movies <- movies %>% mutate_at(vars(year.log,duration.log,Popularity.log,nrOfNominations.sqr), normalize)
Histogram to show distribution of label after transformation
Created dummy variables to convert factors into numbers for linear regression
#convert to numeric dummy variables
movies_num <- dummy_cols(movies) %>% select(-ratingInteger,-nrOfGenre)
movies_num <- movies_num %>% mutate_if(is.logical,as.integer)
#for use with linear regression
set.seed(123)
train_num <- movies_num %>% sample_frac(0.7)
test_num <- movies_num %>% setdiff(train_num)
#for use with regression trees
set.seed(123)
train <- movies %>% sample_frac(0.7)
test <- movies %>% setdiff(train)
Different Linear Regression Models
R-Squared Results
AIC R2 R2_adjusted RMSE
1 5017.302 0.7647038 0.7630284 0.7270008
names overall
13 nrOfNominations.sqr 65.142762
14 ratingInteger_8 12.249629
12 year.log 4.522397
15 ratingInteger_9 3.835766
7 Family 3.509310
11 Thriller 3.267216
1 Action 2.912037
2 Adventure 2.832014
4 Biography 2.586796
6 Documentary 2.542071
10 Sport 2.078357
3 Animation 1.965759
8 Fantasy 1.898556
9 Mystery 1.897670
5 Comedy 1.704279
16 ratingInteger_6 1.526456
AIC R2 R2_adjusted RMSE
1 5027.757 0.7631967 0.7617226 0.7293254
names overall
10 nrOfNominations.sqr 64.1876690
11 ratingInteger_8 12.6982590
9 year.log 4.4751804
12 ratingInteger_9 4.0276171
5 Family 3.2606036
8 Thriller 3.2054662
4 Documentary 2.9272752
2 Adventure 2.5750520
1 Action 2.4912836
13 ratingInteger_7 2.2800969
3 Animation 2.1772591
6 Fantasy 1.8125349
14 ratingInteger_3 1.0098226
7 Musical 0.6886044
AIC R2 R2_adjusted RMSE
1 5027.757 0.7631967 0.7617226 0.7293254
names overall
12 nrOfNominations.sqr 63.1644253
13 ratingInteger_8 12.5246257
11 year.log 4.7560935
14 ratingInteger_9 3.9357266
7 Family 3.3057406
2 Biography 3.1713532
6 Documentary 3.0497574
10 Thriller 3.0424179
4 Adventure 2.6022861
1 Action 2.5970273
5 Animation 1.8491190
8 Fantasy 1.6776378
3 War 0.9333965
15 Popularity.log 0.8682510
9 Musical 0.3862927
lmodel_AIC R-squared: 67.56 %
lmodel_p R-squared: 67.5 %
lmodel_i R-squared: 67.35 %
dtModel R-squared: 58.47 %
rfModel R-squared: 60.6 %