Robbie Beane
ss <- read.table('data/student_success_data.csv', header=TRUE, sep=',')
ss <- na.omit(ss)
ss <- ss[(ss$G3 <= 20) & (ss$G3 >= 0), ]
ss$Passed <- factor(ifelse(ss$G3 < 10, 0, 1))
ss$G1 <- NULL
ss$G2 <- NULL
ss$G3 <- NULL
ss$absences <- NULL
summary(ss)## school sex age address famsize Pstatus
## GP :462 F:297 Min. :15.00 R:155 GT3:393 A: 45
## MHS:101 M:266 1st Qu.:16.00 U:408 LE3:170 T:518
## Median :16.00
## Mean :16.61
## 3rd Qu.:18.00
## Max. :22.00
## Medu Fedu Mjob Fjob
## Min. :1.000 Min. :1.000 at_home : 66 at_home : 31
## 1st Qu.:2.000 1st Qu.:2.000 health : 39 health : 25
## Median :3.000 Median :3.000 other :184 other :275
## Mean :2.874 Mean :2.686 services:182 services:173
## 3rd Qu.:4.000 3rd Qu.:4.000 teacher : 92 teacher : 59
## Max. :4.000 Max. :4.000
## reason guardian traveltime studytime
## course :215 father:129 Min. :1.000 Min. :1.000
## home :158 mother:398 1st Qu.:1.000 1st Qu.:1.000
## other : 48 other : 36 Median :1.000 Median :2.000
## reputation:142 Mean :1.481 Mean :1.986
## 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :4.000 Max. :4.000
## failures schoolsup famsup paid activities nursery
## Min. :0.0000 no :507 no :254 no :332 no :262 no :102
## 1st Qu.:0.0000 yes: 56 yes:309 yes:231 yes:301 yes:461
## Median :0.0000
## Mean :0.2842
## 3rd Qu.:0.0000
## Max. :3.0000
## higher internet romantic famrel freetime
## no : 22 no : 93 no :341 Min. :1.000 Min. :1.000
## yes:541 yes:470 yes:222 1st Qu.:4.000 1st Qu.:3.000
## Median :4.000 Median :3.000
## Mean :3.938 Mean :3.213
## 3rd Qu.:5.000 3rd Qu.:4.000
## Max. :5.000 Max. :5.000
## goout Dalc Walc health Passed
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00 0:200
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.00 1:363
## Median :3.000 Median :1.000 Median :2.000 Median :4.00
## Mean :3.021 Mean :1.401 Mean :2.176 Mean :3.67
## 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:5.00
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.00
##
## Call:
## randomForest(formula = Passed ~ ., data = ss, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 16.7%
## Confusion matrix:
## 0 1 class.error
## 0 142 58 0.29000000
## 1 36 327 0.09917355
m1_opt_ntrees <- which.min(m1$err.rate[,'OOB'])
m1_opt_err_rate <- min(m1$err.rate[,'OOB'])
cat("Optimal Number of Trees: ", m1_opt_ntrees, "\n",
"Minimum Error Rate: ", m1_opt_err_rate, sep="")## Optimal Number of Trees: 124
## Minimum Error Rate: 0.1527531
##
## Call:
## randomForest(formula = Passed ~ ., data = ss, ntree = m1_opt_ntrees, importance = TRUE)
## Type of random forest: classification
## Number of trees: 124
## No. of variables tried at each split: 5
##
## OOB estimate of error rate: 15.28%
## Confusion matrix:
## 0 1 class.error
## 0 146 54 0.27000000
## 1 32 331 0.08815427
oob_acc_list <- c()
opt_ntree_list <- c()
for(i in 1:29){
set.seed(1)
temp_mod <- randomForest(Passed ~ ., ss, ntree=500, importance=TRUE, mtry=i)
oob_acc_list <- c(oob_acc_list, min(temp_mod$err.rate[,'OOB']))
opt_ntree_list <- c(opt_ntree_list, which.min(temp_mod$err.rate[,'OOB']))
}
opt_mtry <- which.min(oob_acc_list)
opt_ntree <- opt_ntree_list[opt_mtry]
min_oob_acc <- min(oob_acc_list)
cat("Optimal Value of mtry: ", opt_mtry, "\n",
"Optimal Value of ntree: ", opt_ntree, "\n",
"Minimum OOB Accuracy: ", min_oob_acc, sep="")## Optimal Value of mtry: 3
## Optimal Value of ntree: 445
## Minimum OOB Accuracy: 0.1420959
plot(1:29, oob_acc_list, xlab="Value of mtry", ylab="Minimum OOB Accuracy Score")
lines(1:29, oob_acc_list)
abline(v=which.min(oob_acc_list), col="red", lty=2, lwd=1)##
## Call:
## randomForest(formula = Passed ~ ., data = ss, ntree = 445, mtry = 3, importance = TRUE)
## Type of random forest: classification
## Number of trees: 445
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 14.21%
## Confusion matrix:
## 0 1 class.error
## 0 145 55 0.27500000
## 1 25 338 0.06887052
set.seed(1)
train(Passed ~ ., ss, method="rf", ntree=445,
trControl = trainControl(method="cv", number=20),
tuneGrid = expand.grid(mtry=c(3)))## Random Forest
##
## 563 samples
## 29 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (20 fold)
## Summary of sample sizes: 535, 535, 535, 535, 535, 534, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8349754 0.6226192
##
## Tuning parameter 'mtry' was held constant at a value of 3
set.seed(1)
randomForest(Passed ~ ., ss, ntree=445, mtry=3, importance=TRUE,
replace=FALSE, sampsize=floor(0.8*nrow(ss)))##
## Call:
## randomForest(formula = Passed ~ ., data = ss, ntree = 445, mtry = 3, importance = TRUE, replace = FALSE, sampsize = floor(0.8 * nrow(ss)))
## Type of random forest: classification
## Number of trees: 445
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 14.92%
## Confusion matrix:
## 0 1 class.error
## 0 147 53 0.26500000
## 1 31 332 0.08539945
error_rate_list <- c()
for (i in 1:20){
set.seed(i)
temp_mod <- randomForest(Passed ~ ., ss, ntree=445, mtry=3, importance=TRUE,
replace=FALSE, sampsize=floor(0.8*nrow(ss)))
error_rate_list <- c(error_rate_list, temp_mod$err.rate[445,"OOB"])
}
mean(error_rate_list)## [1] 0.1590586
## 0 1 MeanDecreaseAccuracy
## school 0.0111720672 0.0025577942 0.005694658
## sex 0.0233632155 0.0012521691 0.009095097
## age 0.0264599152 0.0032338034 0.011469429
## address 0.0149627498 0.0023202583 0.006837335
## famsize 0.0199481289 0.0008092124 0.007596645
## Pstatus 0.0046869379 0.0001193006 0.001765697
## Medu 0.0541793952 0.0174901597 0.030577784
## Fedu 0.0491084012 0.0177485314 0.028881618
## Mjob 0.0599594814 0.0131151192 0.029743839
## Fjob 0.0293255539 0.0039668412 0.013065376
## reason 0.0354658291 0.0025441594 0.014226229
## guardian 0.0156615270 0.0019149910 0.006817719
## traveltime 0.0184574985 0.0006176917 0.006975296
## studytime 0.0255475734 0.0029151586 0.010989474
## failures 0.0492611771 0.0219989065 0.031765734
## schoolsup -0.0004538367 0.0037351486 0.002217851
## famsup 0.0399601697 0.0120905935 0.021994149
## paid 0.0226342909 0.0050237302 0.011328281
## activities 0.0156916157 0.0038482942 0.008056800
## nursery 0.0085281869 0.0028213463 0.004885437
## higher 0.0031016391 0.0013766224 0.001956130
## internet 0.0218552420 0.0046808013 0.010802754
## romantic 0.0175522927 0.0052956381 0.009702732
## famrel 0.0382173216 0.0047969335 0.016753294
## freetime 0.0336506282 0.0042443630 0.014790784
## goout 0.0454265167 0.0091057532 0.022041033
## Dalc 0.0188676402 0.0076632578 0.011721844
## Walc 0.0266031162 0.0072780700 0.014199180
## health 0.0290641642 0.0062458468 0.014451498
## MeanDecreaseGini
## school 3.155759
## sex 5.437597
## age 11.435674
## address 4.639802
## famsize 4.387064
## Pstatus 2.380665
## Medu 14.968563
## Fedu 15.012602
## Mjob 17.688261
## Fjob 10.509067
## reason 11.883354
## guardian 6.183318
## traveltime 6.389089
## studytime 9.277214
## failures 16.271369
## schoolsup 3.580239
## famsup 7.981877
## paid 5.063839
## activities 5.261459
## nursery 4.128403
## higher 2.496847
## internet 5.095232
## romantic 4.948335
## famrel 10.655698
## freetime 11.351832
## goout 14.025130
## Dalc 7.596320
## Walc 11.637881
## health 11.363448
ames <- read.table('data/AmesHousing.txt', header=TRUE, sep='\t')
sapply(ames, function(x) sum(is.na(x)))## Order PID MS.SubClass MS.Zoning
## 0 0 0 0
## Lot.Frontage Lot.Area Street Alley
## 490 0 0 2732
## Lot.Shape Land.Contour Utilities Lot.Config
## 0 0 0 0
## Land.Slope Neighborhood Condition.1 Condition.2
## 0 0 0 0
## Bldg.Type House.Style Overall.Qual Overall.Cond
## 0 0 0 0
## Year.Built Year.Remod.Add Roof.Style Roof.Matl
## 0 0 0 0
## Exterior.1st Exterior.2nd Mas.Vnr.Type Mas.Vnr.Area
## 0 0 0 23
## Exter.Qual Exter.Cond Foundation Bsmt.Qual
## 0 0 0 79
## Bsmt.Cond Bsmt.Exposure BsmtFin.Type.1 BsmtFin.SF.1
## 79 79 79 1
## BsmtFin.Type.2 BsmtFin.SF.2 Bsmt.Unf.SF Total.Bsmt.SF
## 79 1 1 1
## Heating Heating.QC Central.Air Electrical
## 0 0 0 0
## X1st.Flr.SF X2nd.Flr.SF Low.Qual.Fin.SF Gr.Liv.Area
## 0 0 0 0
## Bsmt.Full.Bath Bsmt.Half.Bath Full.Bath Half.Bath
## 2 2 0 0
## Bedroom.AbvGr Kitchen.AbvGr Kitchen.Qual TotRms.AbvGrd
## 0 0 0 0
## Functional Fireplaces Fireplace.Qu Garage.Type
## 0 0 1422 157
## Garage.Yr.Blt Garage.Finish Garage.Cars Garage.Area
## 159 157 1 1
## Garage.Qual Garage.Cond Paved.Drive Wood.Deck.SF
## 158 158 0 0
## Open.Porch.SF Enclosed.Porch X3Ssn.Porch Screen.Porch
## 0 0 0 0
## Pool.Area Pool.QC Fence Misc.Feature
## 0 2917 2358 2824
## Misc.Val Mo.Sold Yr.Sold Sale.Type
## 0 0 0 0
## Sale.Condition SalePrice
## 0 0
ames$Order <- NULL
ames$PID <- NULL
ames$Lot.Frontage <- NULL
ames$Alley <- NULL
ames$Misc.Feature <- NULL
ames$Pool.QC <- NULL
ames$Fireplace.Qu <- NULL
ames$Fence <- NULL
ames <- na.omit(ames)
nrow(ames)## [1] 2683
set.seed(1)
en_mod <- train(SalePrice ~ ., ames, method="glmnet", metric="Rsquared",
trControl = trainControl(method="cv", number=10),
tuneGrid = expand.grid(alpha=seq(0, 1, by=0.2),
lambda=exp(seq(2,14,length=100))
)
)
best_ix <- which.max(en_mod$results$Rsquared)
en_mod$results[best_ix, ]## alpha lambda RMSE Rsquared MAE RMSESD RsquaredSD
## 149 0.2 2485.382 29257.91 0.8655234 16441.26 7145.613 0.06469346
## MAESD
## 149 1612.336
##
## Call:
## randomForest(formula = SalePrice ~ ., data = ames, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 24
##
## Mean of squared residuals: 607927012
## % Var explained: 90.37