I want to build a machine learning model on the Graduate Admissions dataset that I found on Kaggle. The goal is to predict a student's chance of being admitted.
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------ tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'dplyr' was built under R version 3.5.2
## -- Conflicts --------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(keras)
## Warning: package 'keras' was built under R version 3.5.2
library(caret)
## Warning: package 'caret' was built under R version 3.5.2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(corrplot)
## corrplot 0.84 loaded
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.5.2
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(rpart)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.5.2
library(modelr)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.5.2
library(scales)
## Warning: package 'scales' was built under R version 3.5.2
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(readxl)
gradschool_admissions <- read_excel("D:/Working Directory/Admission_Predict.xlsx")
Let's take a look at the data.
glimpse(gradschool_admissions)
## Observations: 400
## Variables: 9
## $ `Serial No.` <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
## $ `GRE Score` <dbl> 337, 324, 316, 322, 314, 330, 321, 308, 30...
## $ `TOEFL Score` <dbl> 118, 107, 104, 110, 103, 115, 109, 101, 10...
## $ `University Rating` <dbl> 4, 4, 3, 3, 2, 5, 3, 2, 1, 3, 3, 4, 4, 3, ...
## $ SOP <dbl> 4.5, 4.0, 3.0, 3.5, 2.0, 4.5, 3.0, 3.0, 2....
## $ LOR <dbl> 4.5, 4.5, 3.5, 2.5, 3.0, 3.0, 4.0, 4.0, 1....
## $ CGPA <dbl> 9.65, 8.87, 8.00, 8.67, 8.21, 9.34, 8.20, ...
## $ Research <dbl> 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, ...
## $ `Chance of Admit` <dbl> 0.92, 0.76, 0.72, 0.80, 0.65, 0.90, 0.75, ...
summary(gradschool_admissions)
## Serial No. GRE Score TOEFL Score University Rating
## Min. : 1.0 Min. :290.0 Min. : 92.0 Min. :1.000
## 1st Qu.:100.8 1st Qu.:308.0 1st Qu.:103.0 1st Qu.:2.000
## Median :200.5 Median :317.0 Median :107.0 Median :3.000
## Mean :200.5 Mean :316.8 Mean :107.4 Mean :3.087
## 3rd Qu.:300.2 3rd Qu.:325.0 3rd Qu.:112.0 3rd Qu.:4.000
## Max. :400.0 Max. :340.0 Max. :120.0 Max. :5.000
## SOP LOR CGPA Research
## Min. :1.0 Min. :1.000 Min. :6.800 Min. :0.0000
## 1st Qu.:2.5 1st Qu.:3.000 1st Qu.:8.170 1st Qu.:0.0000
## Median :3.5 Median :3.500 Median :8.610 Median :1.0000
## Mean :3.4 Mean :3.453 Mean :8.599 Mean :0.5475
## 3rd Qu.:4.0 3rd Qu.:4.000 3rd Qu.:9.062 3rd Qu.:1.0000
## Max. :5.0 Max. :5.000 Max. :9.920 Max. :1.0000
## Chance of Admit
## Min. :0.3400
## 1st Qu.:0.6400
## Median :0.7300
## Mean :0.7244
## 3rd Qu.:0.8300
## Max. :0.9700
Thankfully there are no NAs, which makes our analysis a lot easier.
The serial number is not relevant to the modeling, so we will drop it.
keep_cols <- c("GRE Score", "TOEFL Score", "University Rating", "SOP", "LOR", "CGPA", "Research", "Chance of Admit")
gradschool_admissions1 <- gradschool_admissions[keep_cols]
summary(gradschool_admissions1)
## GRE Score TOEFL Score University Rating SOP
## Min. :290.0 Min. : 92.0 Min. :1.000 Min. :1.0
## 1st Qu.:308.0 1st Qu.:103.0 1st Qu.:2.000 1st Qu.:2.5
## Median :317.0 Median :107.0 Median :3.000 Median :3.5
## Mean :316.8 Mean :107.4 Mean :3.087 Mean :3.4
## 3rd Qu.:325.0 3rd Qu.:112.0 3rd Qu.:4.000 3rd Qu.:4.0
## Max. :340.0 Max. :120.0 Max. :5.000 Max. :5.0
## LOR CGPA Research Chance of Admit
## Min. :1.000 Min. :6.800 Min. :0.0000 Min. :0.3400
## 1st Qu.:3.000 1st Qu.:8.170 1st Qu.:0.0000 1st Qu.:0.6400
## Median :3.500 Median :8.610 Median :1.0000 Median :0.7300
## Mean :3.453 Mean :8.599 Mean :0.5475 Mean :0.7244
## 3rd Qu.:4.000 3rd Qu.:9.062 3rd Qu.:1.0000 3rd Qu.:0.8300
## Max. :5.000 Max. :9.920 Max. :1.0000 Max. :0.9700
names(gradschool_admissions1)[1] <- "GRE_Score"
names(gradschool_admissions1)[2] <- "TOEFL_Score"
names(gradschool_admissions1)[3] <- "University_Rating"
names(gradschool_admissions1)[8] <- "Admission_Probability"
glimpse(gradschool_admissions1)
## Observations: 400
## Variables: 8
## $ GRE_Score <dbl> 337, 324, 316, 322, 314, 330, 321, 308, ...
## $ TOEFL_Score <dbl> 118, 107, 104, 110, 103, 115, 109, 101, ...
## $ University_Rating <dbl> 4, 4, 3, 3, 2, 5, 3, 2, 1, 3, 3, 4, 4, 3...
## $ SOP <dbl> 4.5, 4.0, 3.0, 3.5, 2.0, 4.5, 3.0, 3.0, ...
## $ LOR <dbl> 4.5, 4.5, 3.5, 2.5, 3.0, 3.0, 4.0, 4.0, ...
## $ CGPA <dbl> 9.65, 8.87, 8.00, 8.67, 8.21, 9.34, 8.20...
## $ Research <dbl> 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1...
## $ Admission_Probability <dbl> 0.92, 0.76, 0.72, 0.80, 0.65, 0.90, 0.75...
summary(gradschool_admissions1)
## GRE_Score TOEFL_Score University_Rating SOP
## Min. :290.0 Min. : 92.0 Min. :1.000 Min. :1.0
## 1st Qu.:308.0 1st Qu.:103.0 1st Qu.:2.000 1st Qu.:2.5
## Median :317.0 Median :107.0 Median :3.000 Median :3.5
## Mean :316.8 Mean :107.4 Mean :3.087 Mean :3.4
## 3rd Qu.:325.0 3rd Qu.:112.0 3rd Qu.:4.000 3rd Qu.:4.0
## Max. :340.0 Max. :120.0 Max. :5.000 Max. :5.0
## LOR CGPA Research Admission_Probability
## Min. :1.000 Min. :6.800 Min. :0.0000 Min. :0.3400
## 1st Qu.:3.000 1st Qu.:8.170 1st Qu.:0.0000 1st Qu.:0.6400
## Median :3.500 Median :8.610 Median :1.0000 Median :0.7300
## Mean :3.453 Mean :8.599 Mean :0.5475 Mean :0.7244
## 3rd Qu.:4.000 3rd Qu.:9.062 3rd Qu.:1.0000 3rd Qu.:0.8300
## Max. :5.000 Max. :9.920 Max. :1.0000 Max. :0.9700
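As an aside, the same cleanup could be written as a single dplyr pipeline. This is just a sketch equivalent to the base R steps above, not what the rest of the post runs:
# Sketch: drop the serial number and rename the columns in one step.
gradschool_admissions1 <- gradschool_admissions %>%
  select(-`Serial No.`) %>%
  rename(GRE_Score = `GRE Score`,
         TOEFL_Score = `TOEFL Score`,
         University_Rating = `University Rating`,
         Admission_Probability = `Chance of Admit`)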
Let us examine each variable's relationship with the admission probability. A correlation plot (corrplot) is a good way to do this.
numericVars <- which(sapply(gradschool_admissions1, is.numeric))
all_numVar <- gradschool_admissions1[, numericVars]
cor_numVar <- cor(all_numVar, use = "pairwise.complete.obs")
# Sort on decreasing correlations with Admission Probability
cor_sorted <- as.matrix(sort(cor_numVar[, "Admission_Probability"], decreasing = TRUE))
# Selecting high correlations
Cor_High <- names(which(apply(cor_sorted, 1, function(x) abs(x) > 0.25)))
cor_numVar <- cor_numVar[Cor_High, Cor_High]
corrplot.mixed(cor_numVar, tl.col = "black", tl.pos = "lt")
It looks like CGPA (Cumulative Grade Point Average), essentially your college grades, is the variable most strongly correlated with the chance of admission.
ggplot(gradschool_admissions1, aes(x = CGPA, y = Admission_Probability)) +
  geom_point(col = "orchid") +
  geom_smooth(method = "lm", se = FALSE, col = "red") +
  labs(x = "\n CGPA \n", y = "Probability of Admission",
       title = "College GPA vs Admission Probability") +
  theme_solarized_2()
So the better your GPA, the higher your chance of being admitted.
Now let us split the data into training and test sets. For convenience, I will take rows 1:69 as the training set and use the remaining 331 rows for testing.
index <- 1:69
training <- gradschool_admissions1[index,]
testing <- gradschool_admissions1[-index,]
glimpse(training)
## Observations: 69
## Variables: 8
## $ GRE_Score <dbl> 337, 324, 316, 322, 314, 330, 321, 308, ...
## $ TOEFL_Score <dbl> 118, 107, 104, 110, 103, 115, 109, 101, ...
## $ University_Rating <dbl> 4, 4, 3, 3, 2, 5, 3, 2, 1, 3, 3, 4, 4, 3...
## $ SOP <dbl> 4.5, 4.0, 3.0, 3.5, 2.0, 4.5, 3.0, 3.0, ...
## $ LOR <dbl> 4.5, 4.5, 3.5, 2.5, 3.0, 3.0, 4.0, 4.0, ...
## $ CGPA <dbl> 9.65, 8.87, 8.00, 8.67, 8.21, 9.34, 8.20...
## $ Research <dbl> 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1...
## $ Admission_Probability <dbl> 0.92, 0.76, 0.72, 0.80, 0.65, 0.90, 0.75...
glimpse(testing)
## Observations: 331
## Variables: 8
## $ GRE_Score <dbl> 328, 332, 336, 321, 314, 314, 329, 327, ...
## $ TOEFL_Score <dbl> 115, 118, 112, 111, 108, 106, 114, 112, ...
## $ University_Rating <dbl> 4, 5, 5, 5, 4, 3, 2, 3, 2, 2, 1, 3, 4, 5...
## $ SOP <dbl> 4.5, 5.0, 5.0, 5.0, 4.5, 3.0, 2.0, 3.0, ...
## $ LOR <dbl> 4.0, 5.0, 5.0, 5.0, 4.0, 5.0, 4.0, 3.0, ...
## $ CGPA <dbl> 9.16, 9.64, 9.76, 9.45, 9.04, 8.90, 8.56...
## $ Research <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1...
## $ Admission_Probability <dbl> 0.78, 0.94, 0.96, 0.93, 0.84, 0.74, 0.72...
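Note that taking the first 69 rows is not a randomized split. Since caret is already loaded, a randomized split could look like the sketch below (train_idx and the seed are my own choices; I keep the sequential split in this post so the printed output stays as shown):
# Sketch (not used below): a randomized 80/20 split with caret,
# grouped on the outcome so both sets cover its range.
set.seed(123)  # hypothetical seed for reproducibility
train_idx <- createDataPartition(gradschool_admissions1$Admission_Probability, p = 0.8, list = FALSE)
training_random <- gradschool_admissions1[train_idx, ]
testing_random <- gradschool_admissions1[-train_idx, ]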
This next step standardizes all the predictors to a common scale, with a mean of zero and a standard deviation (SD) of one.
I picked this approach up from a Rick Scavetta machine learning tutorial.
training %>%
mutate_at(vars(-Admission_Probability), scale) -> training
testing %>%
mutate_at(vars(-Admission_Probability), scale) -> testing
Let's assume a normal distribution and start with a linear model.
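(One caveat on the scaling step before we fit anything: standardizing the test set with its own means and SDs, as above, lets test-set statistics influence preprocessing. A common refinement, sketched here under the assumption that train_raw and test_raw are unscaled copies of the split, is to reuse the training-set statistics for both sets.)
# Sketch: scale BOTH sets with training-set statistics (assumes
# train_raw / test_raw are unscaled copies of the split above).
predictor_cols <- setdiff(names(train_raw), "Admission_Probability")
train_means <- sapply(train_raw[predictor_cols], mean)
train_sds <- sapply(train_raw[predictor_cols], sd)
train_raw[predictor_cols] <- scale(train_raw[predictor_cols], center = train_means, scale = train_sds)
test_raw[predictor_cols] <- scale(test_raw[predictor_cols], center = train_means, scale = train_sds)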
#LM model
fit_lm <- lm(Admission_Probability~., data = training)
summary(fit_lm)
##
## Call:
## lm(formula = Admission_Probability ~ ., data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19622 -0.04564 0.01261 0.05182 0.13746
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.676667 0.009743 69.449 < 2e-16 ***
## GRE_Score 0.008978 0.023960 0.375 0.70917
## TOEFL_Score 0.019598 0.022351 0.877 0.38402
## University_Rating 0.076811 0.022569 3.403 0.00118 **
## SOP -0.014626 0.015514 -0.943 0.34952
## LOR 0.036605 0.015077 2.428 0.01816 *
## CGPA 0.021622 0.020466 1.056 0.29491
## Research 0.017736 0.010740 1.651 0.10379
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08093 on 61 degrees of freedom
## Multiple R-squared: 0.7788, Adjusted R-squared: 0.7534
## F-statistic: 30.68 on 7 and 61 DF, p-value: < 2.2e-16
#GLM model
fit_glm <- glm(Admission_Probability~., data = training, family = "binomial")
## Warning in eval(family$initialize): non-integer #successes in a binomial
## glm!
fit_glm
##
## Call: glm(formula = Admission_Probability ~ ., family = "binomial",
## data = training)
##
## Coefficients:
## (Intercept) GRE_Score TOEFL_Score
## 0.827288 0.001672 0.106549
## University_Rating SOP LOR
## 0.400954 -0.084226 0.176093
## CGPA Research
## 0.148194 0.088945
##
## Degrees of Freedom: 68 Total (i.e. Null); 61 Residual
## Null Deviance: 9.081
## Residual Deviance: 2.323 AIC: 74.52
summary(fit_glm)
##
## Call:
## glm(formula = Admission_Probability ~ ., family = "binomial",
## data = training)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.47517 -0.10621 0.04003 0.15431 0.33182
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.827288 0.279600 2.959 0.00309 **
## GRE_Score 0.001672 0.648566 0.003 0.99794
## TOEFL_Score 0.106549 0.600600 0.177 0.85919
## University_Rating 0.400954 0.629728 0.637 0.52431
## SOP -0.084226 0.410318 -0.205 0.83736
## LOR 0.176093 0.415558 0.424 0.67175
## CGPA 0.148194 0.555266 0.267 0.78956
## Research 0.088945 0.285940 0.311 0.75575
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9.0806 on 68 degrees of freedom
## Residual deviance: 2.3227 on 61 degrees of freedom
## AIC: 74.519
##
## Number of Fisher Scoring iterations: 4
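By the way, the "non-integer #successes" warning appeared because family = "binomial" expects 0/1 outcomes or success counts, while Admission_Probability is a proportion. A quasi-binomial fit models a proportion directly and avoids the warning; a sketch (the coefficient estimates come out the same as above, only the standard errors change):
# Sketch: quasibinomial accepts a proportion response in [0, 1]
# without the non-integer #successes warning.
fit_glm_q <- glm(Admission_Probability ~ ., data = training, family = quasibinomial(link = "logit"))
summary(fit_glm_q)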
For the LM model:
predict_lm <- predict(fit_lm, newdata = testing)
print(head(predict_lm))
## 1 2 3 4 5 6
## 0.8159765 0.9487884 0.9370567 0.9103262 0.7778091 0.7253543
# Printing the MAE (Mean Absolute Error)
MAE_lm <- mean(abs(predict_lm - testing$Admission_Probability))
print(MAE_lm)
## [1] 0.07931887
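Since we compute this quantity for every model, a small helper keeps it consistent. A sketch only: mae_vec is my own name, and the tree models below use modelr::mae() instead.
# Sketch: a reusable mean-absolute-error helper for vector predictions.
mae_vec <- function(predicted, actual) mean(abs(predicted - actual))
mae_vec(predict_lm, testing$Admission_Probability)  # same value as MAE_lm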
For the GLM model. Note a bug in the original call: it passed `data = testing`, which predict() silently ignores, so the values and warning below reflect predictions on the 69 training rows. The correct argument is newdata:
predict_glm <- predict(fit_glm, newdata = testing, type = "response")
print(head(predict_glm))
## 1 2 3 4 5 6
## 0.8487804 0.8042260 0.6780792 0.6811538 0.5634878 0.8403876
# Printing the MAE (Mean Absolute Error)
MAE_glm <- mean(abs(predict_glm - testing$Admission_Probability))
## Warning in predict_glm - testing$Admission_Probability: longer object
## length is not a multiple of shorter object length
print(MAE_glm)
## [1] 0.1779941
The recycling warning (69 predictions matched against 331 test values) is a symptom of that bug, so this MAE should be treated as unreliable; with newdata the lengths match and the warning disappears.
Creating the decision tree model
fit_rpart <- rpart(Admission_Probability~., training)
summary(fit_rpart)
## Call:
## rpart(formula = Admission_Probability ~ ., data = training)
## n= 69
##
## CP nsplit rel error xerror xstd
## 1 0.59166251 0 1.0000000 1.0094516 0.11530831
## 2 0.08638143 1 0.4083375 0.5109719 0.08694378
## 3 0.07015371 2 0.3219561 0.4966292 0.08138155
## 4 0.02852592 3 0.2518023 0.4316126 0.07313549
## 5 0.02412725 4 0.2232764 0.4012983 0.06809630
## 6 0.01000000 5 0.1991492 0.3857499 0.06817522
##
## Variable importance
## University_Rating CGPA TOEFL_Score GRE_Score
## 23 19 17 17
## SOP LOR Research
## 12 11 1
##
## Node number 1: 69 observations, complexity param=0.5916625
## mean=0.6766667, MSE=0.02617874
## left son=2 (45 obs) right son=3 (24 obs)
## Primary splits:
## University_Rating < 0.2847977 to the left, improve=0.5916625, (0 missing)
## CGPA < 0.8101575 to the left, improve=0.5652179, (0 missing)
## GRE_Score < 0.623711 to the left, improve=0.5134222, (0 missing)
## TOEFL_Score < 0.09556002 to the left, improve=0.4716635, (0 missing)
## LOR < 0.4423485 to the left, improve=0.4559058, (0 missing)
## Surrogate splits:
## TOEFL_Score < 0.4211719 to the left, agree=0.913, adj=0.750, (0 split)
## GRE_Score < 0.4510488 to the left, agree=0.899, adj=0.708, (0 split)
## CGPA < 0.5238535 to the left, agree=0.899, adj=0.708, (0 split)
## SOP < 0.8907507 to the left, agree=0.812, adj=0.458, (0 split)
## LOR < 0.4423485 to the left, agree=0.783, adj=0.375, (0 split)
##
## Node number 2: 45 observations, complexity param=0.07015371
## mean=0.5857778, MSE=0.01128217
## left son=4 (21 obs) right son=5 (24 obs)
## Primary splits:
## University_Rating < -0.5514169 to the left, improve=0.2495993, (0 missing)
## GRE_Score < -0.5849246 to the left, improve=0.2491616, (0 missing)
## LOR < 0.4423485 to the left, improve=0.2445901, (0 missing)
## CGPA < -1.208286 to the left, improve=0.1542805, (0 missing)
## TOEFL_Score < 0.09556002 to the left, improve=0.1357766, (0 missing)
## Surrogate splits:
## GRE_Score < -0.2396001 to the left, agree=0.778, adj=0.524, (0 split)
## SOP < -0.7482306 to the left, agree=0.711, adj=0.381, (0 split)
## TOEFL_Score < -1.206888 to the left, agree=0.689, adj=0.333, (0 split)
## LOR < -1.150106 to the left, agree=0.689, adj=0.333, (0 split)
## CGPA < -0.7788301 to the left, agree=0.689, adj=0.333, (0 split)
##
## Node number 3: 24 observations, complexity param=0.08638143
## mean=0.8470833, MSE=0.009578993
## left son=6 (7 obs) right son=7 (17 obs)
## Primary splits:
## CGPA < 0.6670055 to the left, improve=0.6787146, (0 missing)
## GRE_Score < 0.623711 to the left, improve=0.4051526, (0 missing)
## University_Rating < 1.121012 to the left, improve=0.3658545, (0 missing)
## TOEFL_Score < 1.072396 to the left, improve=0.2676994, (0 missing)
## LOR < 0.4423485 to the left, improve=0.2649722, (0 missing)
## Surrogate splits:
## GRE_Score < 0.623711 to the left, agree=0.833, adj=0.429, (0 split)
## LOR < 0.4423485 to the left, agree=0.833, adj=0.429, (0 split)
## TOEFL_Score < 0.258366 to the left, agree=0.792, adj=0.286, (0 split)
## SOP < 0.3444236 to the left, agree=0.792, adj=0.286, (0 split)
## University_Rating < 1.121012 to the left, agree=0.750, adj=0.143, (0 split)
##
## Node number 4: 21 observations, complexity param=0.02852592
## mean=0.5290476, MSE=0.006475283
## left son=8 (11 obs) right son=9 (10 obs)
## Primary splits:
## LOR < -0.6192879 to the left, improve=0.3789304, (0 missing)
## CGPA < -1.065134 to the left, improve=0.3097248, (0 missing)
## TOEFL_Score < -1.206888 to the left, improve=0.1184074, (0 missing)
## GRE_Score < -0.930249 to the left, improve=0.0504285, (0 missing)
## SOP < -0.7482306 to the left, improve=0.0203775, (0 missing)
## Surrogate splits:
## SOP < -0.7482306 to the left, agree=0.762, adj=0.5, (0 split)
## CGPA < -1.208286 to the left, agree=0.762, adj=0.5, (0 split)
## University_Rating < -1.387631 to the left, agree=0.714, adj=0.4, (0 split)
## GRE_Score < -1.318739 to the left, agree=0.667, adj=0.3, (0 split)
## TOEFL_Score < -1.369694 to the left, agree=0.667, adj=0.3, (0 split)
##
## Node number 5: 24 observations, complexity param=0.02412725
## mean=0.6354167, MSE=0.01020816
## left son=10 (11 obs) right son=11 (13 obs)
## Primary splits:
## Research < -0.1904462 to the left, improve=0.17788810, (0 missing)
## LOR < -0.0884697 to the left, improve=0.17511520, (0 missing)
## GRE_Score < -0.2827657 to the left, improve=0.10103060, (0 missing)
## CGPA < 0.2017614 to the left, improve=0.06094098, (0 missing)
## SOP < -0.2019035 to the right, improve=0.04952579, (0 missing)
## Surrogate splits:
## LOR < 0.4423485 to the left, agree=0.708, adj=0.364, (0 split)
## CGPA < -0.06306988 to the right, agree=0.667, adj=0.273, (0 split)
## TOEFL_Score < -0.8812757 to the left, agree=0.625, adj=0.182, (0 split)
## GRE_Score < -0.930249 to the left, agree=0.583, adj=0.091, (0 split)
## SOP < -0.2019035 to the left, agree=0.583, adj=0.091, (0 split)
##
## Node number 6: 7 observations
## mean=0.7214286, MSE=0.005669388
##
## Node number 7: 17 observations
## mean=0.8988235, MSE=0.002010381
##
## Node number 8: 11 observations
## mean=0.4818182, MSE=0.003505785
##
## Node number 9: 10 observations
## mean=0.581, MSE=0.004589
##
## Node number 10: 11 observations
## mean=0.5890909, MSE=0.006026446
##
## Node number 11: 13 observations
## mean=0.6746154, MSE=0.01039408
# Using the model to predict
pred_rpart <- predict(fit_rpart, testing)
print(head(pred_rpart))
## 1 2 3 4 5 6
## 0.8988235 0.8988235 0.8988235 0.8988235 0.8988235 0.5890909
# Finding the Mean Absolute Error
MAE_rpart <- mae(model = fit_rpart, testing)
print(MAE_rpart)
## [1] 0.09328619
Let's first graph the rpart decision tree.
library(rattle)
## Warning: package 'rattle' was built under R version 3.5.3
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:randomForest':
##
## importance
fancyRpartPlot(fit_rpart, main = "Admission Data Tree Graph")
Next, let us use a loop to find a maximum depth that trims the decision tree to a more manageable size without hurting accuracy.
get_mae <- function(maxdepth, target, predictors, training_data, testing_data){
predictors <- paste(predictors, collapse = "+")
formula <- as.formula(paste(target, "~", predictors, sep = ""))
model <- rpart(formula, data = training_data, control = rpart.control(maxdepth = maxdepth))
mae <- mae(model, testing_data)
return(mae)
}
target <- "Admission_Probability"
predictors <- c("GRE_Score", "TOEFL_Score", "University_Rating", "SOP", "LOR", "CGPA", "Research")
for(i in 1:10){
mae <- get_mae(maxdepth = i, target = target, predictors = predictors, training_data = training, testing_data = testing)
print(glue::glue("Maxdepth:", i, "\t MAE:", mae ))
}
## Maxdepth:1 MAE:0.109416582745888
## Maxdepth:2 MAE:0.0975920625047602
## Maxdepth:3 MAE:0.0932861923929988
## Maxdepth:4 MAE:0.0932861923929988
## Maxdepth:5 MAE:0.0932861923929988
## Maxdepth:6 MAE:0.0932861923929988
## Maxdepth:7 MAE:0.0932861923929988
## Maxdepth:8 MAE:0.0932861923929988
## Maxdepth:9 MAE:0.0932861923929988
## Maxdepth:10 MAE:0.0932861923929988
A maximum depth of 3 turns out to be enough: deeper trees do not improve the test MAE, so depth 3 trims the tree without losing accuracy.
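Accordingly, we can refit the tree capped at that depth; a sketch reusing the rpart.control() call from the helper above (fit_rpart3 is my own name):
# Sketch: refit the tree at the depth the loop identified.
fit_rpart3 <- rpart(Admission_Probability ~ ., data = training, control = rpart.control(maxdepth = 3))
mae(model = fit_rpart3, data = testing)  # should match the ~0.0933 above
Finally, let's see whether a random forest can do better.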
fit_rf <- randomForest(Admission_Probability~., data = training)
pred_rf <- predict(fit_rf, testing)
print(head(pred_rf))
## 1 2 3 4 5 6
## 0.8622355 0.9373236 0.9213833 0.8834181 0.7587553 0.6856683
mae_rf <- mae(model = fit_rf, testing)
print(mae_rf)
## [1] 0.07577058
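The random forest gives the lowest test MAE of the four models. To wrap up, here is a sketch (not in the original post) that collects the MAEs reported above and checks which predictors the forest relies on; note that rattle masks randomForest's importance(), hence the explicit namespace:
# Sketch: side-by-side test MAEs from the models above.
# (MAE_glm is unreliable; see the note on the predict() bug earlier.)
tibble(model = c("lm", "glm", "rpart", "random forest"),
       test_MAE = c(MAE_lm, MAE_glm, MAE_rpart, mae_rf))
# Which predictors does the forest lean on?
randomForest::importance(fit_rf)
varImpPlot(fit_rf, main = "Variable Importance (Random Forest)")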