The dataset is from the UCI Machine Learning Repository and can be found by clicking the following link: https://archive.ics.uci.edu/ml/datasets/Student+Performance
In this step we need to get a few packages for our analysis. We can do this by using the following code. The dependencies = TRUE ensures we install all of the necessary packages the other packages depend on.
# install.packages(c("shiny","ggvis","reshape2","dplyr","gam","tree","randomForest"),dependencies = TRUE)
Now we are going to load our libraries so we can perform our analysis.
library(shiny)
library(ggvis)
library(reshape2)
library(dplyr)
library(splines)
library(gam)
library(ISLR)
library(tree)
library(randomForest)
We need to import our data into R. We can do this in several ways. In this tutorial we will use the read.csv() function that is available in base R. Setting the header argument to true tells R that the first row of the CSV file contains the column names, and sep = “,” tells R that our values are separated by commas. Other separators, such as a tab ("\t") or a semicolon (";"), can also be passed to the ‘sep =’ argument. We then assign the imported data to the name ‘Student_Performance_Ben_Gonzalez’, which creates an object in R we can use throughout our analysis. The attach() function then adds the dataset to R's search path so we can refer to its columns directly by name.
## Student Performance Dataset - Ben Gonzalez ##########################
# Read the CSV file into a data frame.
# stringsAsFactors = TRUE is passed explicitly because R >= 4.0 no longer
# converts text columns to factors by default, while the str() output shown
# in this tutorial lists columns such as school, sex and Mjob as factors.
Student_Performance_Ben_Gonzalez <- read.csv(
  "~/Datasets/Student_Performance_Ben_Gonzalez.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
# attach() puts the columns on the search path so that calls can refer to
# them by bare name (e.g. plot(Mjob, G3)).
attach(Student_Performance_Ben_Gonzalez)
This will give us an overview of what variables are in our dataset. I recommend doing this to help better understand our dataset.
# List the column names of the data frame.
names(Student_Performance_Ben_Gonzalez)
## [1] "school" "sex" "age" "address" "famsize"
## [6] "Pstatus" "Medu" "Fedu" "Mjob" "Fjob"
## [11] "reason" "guardian" "traveltime" "studytime" "failures"
## [16] "schoolsup" "famsup" "paid" "activities" "nursery"
## [21] "higher" "internet" "romantic" "famrel" "freetime"
## [26] "goout" "Dalc" "Walc" "health" "absences"
## [31] "G1" "G2" "G3"
The summary overview gives us a solid understanding of our data; it may also highlight extreme values for us.
# Print a per-column summary: quantiles and mean for numeric columns,
# level counts for factor columns.
summary(Student_Performance_Ben_Gonzalez)
## school sex age address famsize Pstatus Medu
## GP:349 F:208 Min. :15.0 R: 88 GT3:281 A: 41 Min. :0.000
## MS: 46 M:187 1st Qu.:16.0 U:307 LE3:114 T:354 1st Qu.:2.000
## Median :17.0 Median :3.000
## Mean :16.7 Mean :2.749
## 3rd Qu.:18.0 3rd Qu.:4.000
## Max. :22.0 Max. :4.000
## Fedu Mjob Fjob reason
## Min. :0.000 at_home : 59 at_home : 20 course :145
## 1st Qu.:2.000 health : 34 health : 18 home :109
## Median :2.000 other :141 other :217 other : 36
## Mean :2.522 services:103 services:111 reputation:105
## 3rd Qu.:3.000 teacher : 58 teacher : 29
## Max. :4.000
## guardian traveltime studytime failures schoolsup
## father: 90 Min. :1.000 Min. :1.000 Min. :0.0000 no :344
## mother:273 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 yes: 51
## other : 32 Median :1.000 Median :2.000 Median :0.0000
## Mean :1.448 Mean :2.035 Mean :0.3342
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
## famsup paid activities nursery higher internet romantic
## no :153 no :214 no :194 no : 81 no : 20 no : 66 no :263
## yes:242 yes:181 yes:201 yes:314 yes:375 yes:329 yes:132
##
##
##
##
## famrel freetime goout Dalc
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000
## Median :4.000 Median :3.000 Median :3.000 Median :1.000
## Mean :3.944 Mean :3.235 Mean :3.109 Mean :1.481
## 3rd Qu.:5.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## Walc health absences G1
## Min. :1.000 Min. :1.000 Min. : 0.000 Min. : 3.00
## 1st Qu.:1.000 1st Qu.:3.000 1st Qu.: 0.000 1st Qu.: 8.00
## Median :2.000 Median :4.000 Median : 4.000 Median :11.00
## Mean :2.291 Mean :3.554 Mean : 5.709 Mean :10.91
## 3rd Qu.:3.000 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:13.00
## Max. :5.000 Max. :5.000 Max. :75.000 Max. :19.00
## G2 G3
## Min. : 0.00 Min. : 0.00
## 1st Qu.: 9.00 1st Qu.: 8.00
## Median :11.00 Median :11.00
## Mean :10.71 Mean :10.42
## 3rd Qu.:13.00 3rd Qu.:14.00
## Max. :19.00 Max. :20.00
This step allows us to look at the structure of our data and which variables are integers or factors. This will help us in determining what statistical techniques we can utilize on our data.
# Show the type and the first few values of every column.
str(Student_Performance_Ben_Gonzalez)
## 'data.frame': 395 obs. of 33 variables:
## $ school : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
## $ age : int 18 17 15 15 16 16 16 17 15 15 ...
## $ address : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
## $ famsize : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
## $ Pstatus : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
## $ Medu : int 4 1 1 4 3 4 2 4 3 3 ...
## $ Fedu : int 4 1 1 2 3 3 2 4 2 4 ...
## $ Mjob : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
## $ Fjob : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
## $ reason : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
## $ guardian : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
## $ traveltime: int 2 1 1 1 1 1 1 2 1 1 ...
## $ studytime : int 2 2 2 3 2 2 2 2 2 2 ...
## $ failures : int 0 0 3 0 0 0 0 0 0 0 ...
## $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
## $ famsup : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
## $ paid : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
## $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
## $ nursery : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
## $ higher : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ internet : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
## $ romantic : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ famrel : int 4 5 4 3 4 5 4 4 4 5 ...
## $ freetime : int 3 3 3 2 3 4 4 1 2 5 ...
## $ goout : int 4 3 2 2 2 2 4 4 2 1 ...
## $ Dalc : int 1 1 2 1 1 1 1 1 1 1 ...
## $ Walc : int 1 1 3 1 2 2 1 1 1 1 ...
## $ health : int 3 3 3 5 5 5 3 1 1 5 ...
## $ absences : int 6 4 10 2 4 10 0 6 0 0 ...
## $ G1 : int 5 5 7 15 6 15 12 6 16 14 ...
## $ G2 : int 6 5 8 14 10 15 12 5 18 15 ...
## $ G3 : int 6 6 10 15 10 15 11 6 19 15 ...
The most important step in any analysis is plotting data. This step gives us an overview of our data, but also allows us to see things such as extreme values that can affect the statistical techniques we choose to utilize.
## Plots for Student Performance
# with() evaluates each plot() call inside the data frame, so the columns are
# found without depending on an earlier attach().
# Final grade G3 against mother's job.
with(Student_Performance_Ben_Gonzalez, plot(Mjob, G3))
# Final grade G3 against the health rating.
with(Student_Performance_Ben_Gonzalez, plot(health, G3))
Plots with labels:
# Same kind of plot, now with explicit axis labels. with() looks the columns
# up in the data frame instead of relying on an earlier attach().
with(
  Student_Performance_Ben_Gonzalez,
  plot(Medu, G3, xlab = "Mother's Education", ylab = "Final Grade")
)
with(
  Student_Performance_Ben_Gonzalez,
  plot(Fedu, G3, xlab = "Father's Education", ylab = "Final Grade")
)
Here we ‘model’ our data or create the ‘algorithm’ that will help us see the impact of our independent variables on our dependent variable. In this analysis our dependent (Y) variable is G3.
Model 1
## Simple Linear Regression
# Regress the final grade G3 on school alone, then print the fitted
# coefficients.
schoolmodel.1 <- lm(
  formula = G3 ~ school,
  data = Student_Performance_Ben_Gonzalez
)
print(schoolmodel.1)
##
## Call:
## lm(formula = G3 ~ school, data = Student_Performance_Ben_Gonzalez)
##
## Coefficients:
## (Intercept) schoolMS
## 10.4900 -0.6421
Model 2
# Regress the final grade G3 on sex alone, then print the fitted coefficients.
schoolmodel.2 <- lm(
  formula = G3 ~ sex,
  data = Student_Performance_Ben_Gonzalez
)
print(schoolmodel.2)
##
## Call:
## lm(formula = G3 ~ sex, data = Student_Performance_Ben_Gonzalez)
##
## Coefficients:
## (Intercept) sexM
## 9.9663 0.9481
Here we will use more than one variable to predict the outcome of G3. We use the ‘.’ after the ~ to tell R we want to use all the other variables in the dataset as predictors. We can also use + to add variables and the minus “-” symbol to remove variables. These operations allow us some flexibility in forming the lm() formula. The lm() function is used for both simple and multiple linear regression. We then use the summary() function to look at our results. We can also “pull” certain pieces of the summary using the **dollar sign** operator on the end of the summary object (e.g. summary(schoolmodelmultiple1.1)$adj.r.squared). We can name our object whatever we choose. I recommend naming your object something meaningful you can remember for each particular dataset.
## Student Performance Multiple Linear Regression Models ##################################
# "G3 ~ ." regresses G3 on every other column of the data frame.
schoolmodelmultiple1.1 <- lm(
  formula = G3 ~ .,
  data = Student_Performance_Ben_Gonzalez
)
# Print the residual quantiles, coefficient table, R-squared and F-statistic.
print(summary(schoolmodelmultiple1.1))
##
## Call:
## lm(formula = G3 ~ ., data = Student_Performance_Ben_Gonzalez)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9339 -0.5532 0.2680 0.9689 4.6461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.115488 2.116958 -0.527 0.598573
## schoolMS 0.480742 0.366512 1.312 0.190485
## sexM 0.174396 0.233588 0.747 0.455805
## age -0.173302 0.100780 -1.720 0.086380 .
## addressU 0.104455 0.270791 0.386 0.699922
## famsizeLE3 0.036512 0.226680 0.161 0.872128
## PstatusT -0.127673 0.335626 -0.380 0.703875
## Medu 0.129685 0.149999 0.865 0.387859
## Fedu -0.133940 0.128768 -1.040 0.298974
## Mjobhealth -0.146426 0.518491 -0.282 0.777796
## Mjobother 0.074088 0.332044 0.223 0.823565
## Mjobservices 0.046956 0.369587 0.127 0.898973
## Mjobteacher -0.026276 0.481632 -0.055 0.956522
## Fjobhealth 0.330948 0.666601 0.496 0.619871
## Fjobother -0.083582 0.476796 -0.175 0.860945
## Fjobservices -0.322142 0.493265 -0.653 0.514130
## Fjobteacher -0.112364 0.601448 -0.187 0.851907
## reasonhome -0.209183 0.256392 -0.816 0.415123
## reasonother 0.307554 0.380214 0.809 0.419120
## reasonreputation 0.129106 0.267254 0.483 0.629335
## guardianmother 0.195741 0.252672 0.775 0.439046
## guardianother 0.006565 0.463650 0.014 0.988710
## traveltime 0.096994 0.157800 0.615 0.539170
## studytime -0.104754 0.134814 -0.777 0.437667
## failures -0.160539 0.161006 -0.997 0.319399
## schoolsupyes 0.456448 0.319538 1.428 0.154043
## famsupyes 0.176870 0.224204 0.789 0.430710
## paidyes 0.075764 0.222100 0.341 0.733211
## activitiesyes -0.346047 0.205938 -1.680 0.093774 .
## nurseryyes -0.222716 0.254184 -0.876 0.381518
## higheryes 0.225921 0.500398 0.451 0.651919
## internetyes -0.144462 0.287528 -0.502 0.615679
## romanticyes -0.272008 0.219732 -1.238 0.216572
## famrel 0.356876 0.114124 3.127 0.001912 **
## freetime 0.047002 0.110209 0.426 0.670021
## goout 0.012007 0.105230 0.114 0.909224
## Dalc -0.185019 0.153124 -1.208 0.227741
## Walc 0.176772 0.114943 1.538 0.124966
## health 0.062995 0.074800 0.842 0.400259
## absences 0.045879 0.013412 3.421 0.000698 ***
## G1 0.188847 0.062373 3.028 0.002645 **
## G2 0.957330 0.053460 17.907 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.901 on 353 degrees of freedom
## Multiple R-squared: 0.8458, Adjusted R-squared: 0.8279
## F-statistic: 47.21 on 41 and 353 DF, p-value: < 2.2e-16
This step allows us to utilize a subset selection method. The regsubsets() function can search with a forward or a backward approach (method = "forward" or method = "backward"); when no method is given, as in the code below, it performs an exhaustive search over the possible combinations. The forward method adds one variable in each step, keeping the one that improves the fit the most. The backward method is the opposite: it starts with all of the variables and drops one in each step. There is also an nvmax = argument where we set the maximum number of variables allowed in a model (8 by default), and R then finds the best fitting model of each size.
## Subset selection modeling for Student Performance Dataset #############
library(leaps)
# No method or nvmax is passed, so regsubsets() runs with its defaults.
studentmodel.1 <- regsubsets(
  G3 ~ .,
  data = Student_Performance_Ben_Gonzalez
)
# Student Model 1: keep the summary object so its components (rsq, adjr2,
# bic, rss) can be inspected afterwards.
student.summary1 <- summary(studentmodel.1)
print(student.summary1)
## Subset selection object
## Call: regsubsets.formula(G3 ~ ., data = Student_Performance_Ben_Gonzalez)
## 41 Variables (and intercept)
## Forced in Forced out
## schoolMS FALSE FALSE
## sexM FALSE FALSE
## age FALSE FALSE
## addressU FALSE FALSE
## famsizeLE3 FALSE FALSE
## PstatusT FALSE FALSE
## Medu FALSE FALSE
## Fedu FALSE FALSE
## Mjobhealth FALSE FALSE
## Mjobother FALSE FALSE
## Mjobservices FALSE FALSE
## Mjobteacher FALSE FALSE
## Fjobhealth FALSE FALSE
## Fjobother FALSE FALSE
## Fjobservices FALSE FALSE
## Fjobteacher FALSE FALSE
## reasonhome FALSE FALSE
## reasonother FALSE FALSE
## reasonreputation FALSE FALSE
## guardianmother FALSE FALSE
## guardianother FALSE FALSE
## traveltime FALSE FALSE
## studytime FALSE FALSE
## failures FALSE FALSE
## schoolsupyes FALSE FALSE
## famsupyes FALSE FALSE
## paidyes FALSE FALSE
## activitiesyes FALSE FALSE
## nurseryyes FALSE FALSE
## higheryes FALSE FALSE
## internetyes FALSE FALSE
## romanticyes FALSE FALSE
## famrel FALSE FALSE
## freetime FALSE FALSE
## goout FALSE FALSE
## Dalc FALSE FALSE
## Walc FALSE FALSE
## health FALSE FALSE
## absences FALSE FALSE
## G1 FALSE FALSE
## G2 FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
## schoolMS sexM age addressU famsizeLE3 PstatusT Medu Fedu
## 1 ( 1 ) " " " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " " " " " "
## 5 ( 1 ) " " " " "*" " " " " " " " " " "
## 6 ( 1 ) " " " " "*" " " " " " " " " " "
## 7 ( 1 ) " " " " "*" " " " " " " " " " "
## 8 ( 1 ) "*" " " "*" " " " " " " " " " "
## Mjobhealth Mjobother Mjobservices Mjobteacher Fjobhealth
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " "
## Fjobother Fjobservices Fjobteacher reasonhome reasonother
## 1 ( 1 ) " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " "
## 6 ( 1 ) " " "*" " " " " " "
## 7 ( 1 ) " " "*" " " " " " "
## 8 ( 1 ) " " "*" " " " " " "
## reasonreputation guardianmother guardianother traveltime
## 1 ( 1 ) " " " " " " " "
## 2 ( 1 ) " " " " " " " "
## 3 ( 1 ) " " " " " " " "
## 4 ( 1 ) " " " " " " " "
## 5 ( 1 ) " " " " " " " "
## 6 ( 1 ) " " " " " " " "
## 7 ( 1 ) " " " " " " " "
## 8 ( 1 ) " " " " " " " "
## studytime failures schoolsupyes famsupyes paidyes activitiesyes
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " "
## 4 ( 1 ) " " " " " " " " " " " "
## 5 ( 1 ) " " " " " " " " " " " "
## 6 ( 1 ) " " " " " " " " " " " "
## 7 ( 1 ) " " " " " " " " " " " "
## 8 ( 1 ) " " " " " " " " " " " "
## nurseryyes higheryes internetyes romanticyes famrel freetime
## 1 ( 1 ) " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " "*" " "
## 3 ( 1 ) " " " " " " " " "*" " "
## 4 ( 1 ) " " " " " " " " "*" " "
## 5 ( 1 ) " " " " " " " " "*" " "
## 6 ( 1 ) " " " " " " " " "*" " "
## 7 ( 1 ) " " " " " " " " "*" " "
## 8 ( 1 ) " " " " " " " " "*" " "
## goout Dalc Walc health absences G1 G2
## 1 ( 1 ) " " " " " " " " " " " " "*"
## 2 ( 1 ) " " " " " " " " " " " " "*"
## 3 ( 1 ) " " " " " " " " "*" " " "*"
## 4 ( 1 ) " " " " " " " " "*" "*" "*"
## 5 ( 1 ) " " " " " " " " "*" "*" "*"
## 6 ( 1 ) " " " " " " " " "*" "*" "*"
## 7 ( 1 ) " " " " "*" " " "*" "*" "*"
## 8 ( 1 ) " " " " "*" " " "*" "*" "*"
# R-squared of the selected model of each size.
student.summary1$rsq
## [1] 0.8187861 0.8233988 0.8277751 0.8306520 0.8336020 0.8351525 0.8367070
## [8] 0.8379842
# Adjusted R-squared of the selected model of each size.
student.summary1$adjr2
## [1] 0.8183250 0.8224978 0.8264537 0.8289151 0.8314632 0.8326033 0.8337533
## [8] 0.8346263
# BIC of the selected model of each size (lower is better).
student.summary1$bic
## [1] -662.7327 -666.9385 -670.8713 -671.5463 -672.5089 -670.2279 -667.9914
## [8] -665.1142
# The residual sum of squares measures lack of fit, so the best-fitting size
# is the one that minimises it; which.max() would return the worst-fitting
# model instead.
which.min(student.summary1$rss)
## [1] 8
# Model size preferred by BIC.
which.min(student.summary1$bic)
## [1] 5
# Model size preferred by adjusted R-squared.
which.max(student.summary1$adjr2)
## [1] 8
In this step we can create a more flexible model to fit our data. The gam package allows us to fit generalized additive models, in which each predictor can enter through a smooth, non-linear function such as the smoothing spline s() used below. Depending on the type of data we have this can be quite a useful technique.
## Student Performance Generalized Additive Models ######################
library(gam)
# Randomly flag each row as TRUE (training) or FALSE. The argument name
# `replace` is spelled out instead of relying on partial matching of `rep`.
train <- sample(
  c(TRUE, FALSE),
  nrow(Student_Performance_Ben_Gonzalez),
  replace = TRUE
)
# Logical complement of train. Unary minus would coerce the logicals to the
# numbers 0 / -1 rather than flipping TRUE and FALSE.
test <- !train
## Student Performance GAM Spline Based Approach
# activities enters the model directly; age, famrel, absences, G1 and G2 each
# enter through an s(., 2) smooth term. The model is fitted on the full
# data frame.
gam.studentperformance1 <- gam(
  G3 ~ activities + s(age, 2) + s(famrel, 2) + s(absences, 2) +
    s(G1, 2) + s(G2, 2),
  data = Student_Performance_Ben_Gonzalez
)
# One panel per term, with standard-error bands, drawn in red.
plot(gam.studentperformance1, se = TRUE, col = "red")
print(summary(gam.studentperformance1))
##
## Call: gam(formula = G3 ~ activities + s(age, 2) + s(famrel, 2) + s(absences,
## 2) + s(G1, 2) + s(G2, 2), data = Student_Performance_Ben_Gonzalez)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.5749 -0.4565 0.3059 0.9689 4.1166
##
## (Dispersion Parameter for gaussian family taken to be 3.2955)
##
## Null Deviance: 8269.909 on 394 degrees of freedom
## Residual Deviance: 1262.196 on 383.0002 degrees of freedom
## AIC: 1605.841
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## activities 1 1.2 1.2 0.364 0.5466777
## s(age, 2) 1 220.5 220.5 66.922 4.211e-15 ***
## s(famrel, 2) 1 36.8 36.8 11.163 0.0009166 ***
## s(absences, 2) 1 40.6 40.6 12.311 0.0005038 ***
## s(G1, 2) 1 5138.7 5138.7 1559.287 < 2.2e-16 ***
## s(G2, 2) 1 1434.6 1434.6 435.304 < 2.2e-16 ***
## Residuals 383 1262.2 3.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## activities
## s(age, 2) 1 0.8872 0.34683
## s(famrel, 2) 1 0.2173 0.64136
## s(absences, 2) 1 25.9843 5.426e-07 ***
## s(G1, 2) 1 1.0798 0.29939
## s(G2, 2) 1 2.9437 0.08702 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
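This step fits random forests to our data. A random forest is built from many ‘decision trees’ whose predictions are combined. The loop below tries every combination of the number of predictors considered at each split and the number of trees, and records the test error of each combination. Tree-based methods are very useful in that a single tree can be drawn as a picture that a layperson audience can follow. This helps individuals to understand our data and what impact each variable has on the outcome.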
#### Student Performance Dataset Tree Based Approaches ####
## Student Performance Random Forest ####
library(tree)
names(Student_Performance_Ben_Gonzalez)
library(randomForest)
set.seed(1)
# Every column except the response G3 is a candidate predictor.
maxnumpreds <- ncol(Student_Performance_Ben_Gonzalez) - 1
maxnumtrees <- 10
n_obs <- nrow(Student_Performance_Ben_Gonzalez)
# One test MSE per (mtry, ntree) pair, stored in loop order: ntree varies
# fastest. Preallocated instead of grown with rbind() on every iteration.
lstMSEs <- numeric(maxnumpreds * maxnumtrees)
run <- 0
for (numpreds in seq_len(maxnumpreds)) {
  for (numtrees in seq_len(maxnumtrees)) {
    run <- run + 1
    # Draw a fresh random half of the rows for training; the remaining rows
    # form the test set for this fit.
    train <- sample(seq_len(n_obs), n_obs / 2)
    model.bagged <- randomForest(
      G3 ~ .,
      data = Student_Performance_Ben_Gonzalez,
      subset = train,
      mtry = numpreds,
      ntree = numtrees,
      importance = TRUE
    )
    # The comma matters: [-train, ] drops the training ROWS. Without it the
    # data frame is indexed by column and the test rows are never selected.
    pred.vals.bagged <- predict(
      model.bagged,
      newdata = Student_Performance_Ben_Gonzalez[-train, ]
    )
    testvals <- Student_Performance_Ben_Gonzalez$G3[-train]
    mse <- mean((pred.vals.bagged - testvals)^2)
    lstMSEs[run] <- mse
    print(paste(" Processed Trees:", numtrees))
  }
  print(paste(" Processed Predictors:", numpreds))
}
# byrow = TRUE matches the recording order above, so that row i holds
# mtry = i and column j holds ntree = j.
matMSEs <- matrix(
  lstMSEs,
  nrow = maxnumpreds,
  ncol = maxnumtrees,
  byrow = TRUE
)
# Lowest test MSE found, read from the vector and from the matrix.
min(lstMSEs)
min(matMSEs)
# MSEs of the first ten fits.
lstMSEs[1:10]
# Row/column position(s) of the minimum; which() returns one row per tie.
loc <- which(matMSEs == min(matMSEs), arr.ind = TRUE)
loc
# Report the first minimiser. Indexing by column name stays correct when the
# minimum is tied (plain loc[2] would then be a second row index, not a
# column index).
best_preds <- loc[1, "row"]
best_trees <- loc[1, "col"]
print(paste("The optimal configuration is", best_preds, "predictors and", best_trees, "trees"))
# Number of fits that were evaluated.
length(lstMSEs)
matMSEs[best_preds, best_trees]
# Importance measures of model.bagged, i.e. the forest fitted in the final
# loop iteration.
importance(model.bagged)
We can also use the tree() function to fit a single decision tree, rather than a random forest.
# Fit a tree model for G3 using all other columns as candidate predictors.
tree.student1 <- tree(G3 ~ ., data = Student_Performance_Ben_Gonzalez)
# Note: this plots the randomForest object model.bagged, not the tree fitted
# on the line above.
plot(model.bagged)
Let’s create a useful visualization by plotting our tree. This code is more concise than the code used above.
# Draw the tree structure first, then add the text labels on top of it.
plot(tree.student1)
# NOTE(review): pretty = 0 presumably keeps factor level names unabbreviated
# in the split labels - confirm in ?text.tree.
text(tree.student1,pretty = 0)
Now let’s look at which variables matter most, i.e. which ones cause the largest increase in our MSE when their values are shuffled.
# Plot the variable-importance measures of model.bagged.
varImpPlot(model.bagged)
# Print the forest object itself.
model.bagged
# Smallest test MSE recorded in lstMSEs.
min(lstMSEs)
This concludes the tutorial utilizing the Student Performance Dataset.