The dataset is from the UCI Machine Learning Repository and can be found by clicking the following link: https://archive.ics.uci.edu/ml/datasets/Student+Performance

Step 1: Load Libraries

In this step we need to get a few packages for our analysis. We can do this by using the following code. The dependencies = TRUE ensures we install all of the necessary packages the other packages depend on.

# install.packages(c("shiny","ggvis","reshape2","dplyr","gam","ISLR","leaps","tree","randomForest"),dependencies = TRUE)

Now we are going to load our libraries so we can perform our analysis.

library(shiny)
library(ggvis)
library(reshape2)
library(dplyr)
library(splines)
library(gam)
library(ISLR)
library(tree)
library(randomForest)

Step 2: Import the data into R

We need to import our data into R. We can do this in several ways. In this tutorial we will use the read.csv() function that is available in base R. The header argument tells R that the first row of our CSV file contains the column names, and sep = “,” tells R that our values are separated by commas. Other separators, such as a tab or a semicolon (;), can be supplied through the ‘sep =’ argument as well. We store the imported data in an object named ‘Student_Performance_Ben_Gonzalez’, which we can use throughout our analysis. The attach() function then adds the columns of our dataset to R's search path so we can refer to them directly by name (e.g. G3).

##Student Performance Dataset-Ben Gonzalez ##########################

# Read the CSV into a data frame. stringsAsFactors = TRUE keeps text columns as
# factors on R >= 4.0.0 (where the default became FALSE), matching the str()
# output shown in Step 5. TRUE is spelled out because T can be reassigned.
Student_Performance_Ben_Gonzalez <- read.csv(
  "~/Datasets/Student_Performance_Ben_Gonzalez.csv",
  header = TRUE,
  sep = ",",
  stringsAsFactors = TRUE
)
# attach() puts the columns on the search path so that later calls such as
# plot(Mjob, G3) can refer to them by bare name.
attach(Student_Performance_Ben_Gonzalez)

Step 3: Look at variable names

This will give us an overview of what variables are in our dataset. I recommend doing this to help better understand our dataset.

# Print the column names of the imported data frame.
names(Student_Performance_Ben_Gonzalez)
##  [1] "school"     "sex"        "age"        "address"    "famsize"   
##  [6] "Pstatus"    "Medu"       "Fedu"       "Mjob"       "Fjob"      
## [11] "reason"     "guardian"   "traveltime" "studytime"  "failures"  
## [16] "schoolsup"  "famsup"     "paid"       "activities" "nursery"   
## [21] "higher"     "internet"   "romantic"   "famrel"     "freetime"  
## [26] "goout"      "Dalc"       "Walc"       "health"     "absences"  
## [31] "G1"         "G2"         "G3"

Step 4: Look at a summary of our data

The summary overview gives us a solid understanding of our data, it also may highlight extreme values for us as well.

# Per-column summary: level counts for factors, quartiles and mean for numbers.
summary(Student_Performance_Ben_Gonzalez)
##  school   sex          age       address famsize   Pstatus      Medu      
##  GP:349   F:208   Min.   :15.0   R: 88   GT3:281   A: 41   Min.   :0.000  
##  MS: 46   M:187   1st Qu.:16.0   U:307   LE3:114   T:354   1st Qu.:2.000  
##                   Median :17.0                             Median :3.000  
##                   Mean   :16.7                             Mean   :2.749  
##                   3rd Qu.:18.0                             3rd Qu.:4.000  
##                   Max.   :22.0                             Max.   :4.000  
##       Fedu             Mjob           Fjob            reason   
##  Min.   :0.000   at_home : 59   at_home : 20   course    :145  
##  1st Qu.:2.000   health  : 34   health  : 18   home      :109  
##  Median :2.000   other   :141   other   :217   other     : 36  
##  Mean   :2.522   services:103   services:111   reputation:105  
##  3rd Qu.:3.000   teacher : 58   teacher : 29                   
##  Max.   :4.000                                                 
##    guardian     traveltime      studytime        failures      schoolsup
##  father: 90   Min.   :1.000   Min.   :1.000   Min.   :0.0000   no :344  
##  mother:273   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000   yes: 51  
##  other : 32   Median :1.000   Median :2.000   Median :0.0000            
##               Mean   :1.448   Mean   :2.035   Mean   :0.3342            
##               3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:0.0000            
##               Max.   :4.000   Max.   :4.000   Max.   :3.0000            
##  famsup     paid     activities nursery   higher    internet  romantic 
##  no :153   no :214   no :194    no : 81   no : 20   no : 66   no :263  
##  yes:242   yes:181   yes:201    yes:314   yes:375   yes:329   yes:132  
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##      famrel         freetime         goout            Dalc      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:4.000   1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000  
##  Median :4.000   Median :3.000   Median :3.000   Median :1.000  
##  Mean   :3.944   Mean   :3.235   Mean   :3.109   Mean   :1.481  
##  3rd Qu.:5.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##       Walc           health         absences            G1       
##  Min.   :1.000   Min.   :1.000   Min.   : 0.000   Min.   : 3.00  
##  1st Qu.:1.000   1st Qu.:3.000   1st Qu.: 0.000   1st Qu.: 8.00  
##  Median :2.000   Median :4.000   Median : 4.000   Median :11.00  
##  Mean   :2.291   Mean   :3.554   Mean   : 5.709   Mean   :10.91  
##  3rd Qu.:3.000   3rd Qu.:5.000   3rd Qu.: 8.000   3rd Qu.:13.00  
##  Max.   :5.000   Max.   :5.000   Max.   :75.000   Max.   :19.00  
##        G2              G3       
##  Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 9.00   1st Qu.: 8.00  
##  Median :11.00   Median :11.00  
##  Mean   :10.71   Mean   :10.42  
##  3rd Qu.:13.00   3rd Qu.:14.00  
##  Max.   :19.00   Max.   :20.00

Step 5: Look at the structure of our data

This step allows us to look at the structure of our data and which variables are integers or factors. This will help us in determining what statistical techniques we can utilize on our data.

# Compact display of each column's type and first few values.
str(Student_Performance_Ben_Gonzalez)
## 'data.frame':    395 obs. of  33 variables:
##  $ school    : Factor w/ 2 levels "GP","MS": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex       : Factor w/ 2 levels "F","M": 1 1 1 1 1 2 2 1 2 2 ...
##  $ age       : int  18 17 15 15 16 16 16 17 15 15 ...
##  $ address   : Factor w/ 2 levels "R","U": 2 2 2 2 2 2 2 2 2 2 ...
##  $ famsize   : Factor w/ 2 levels "GT3","LE3": 1 1 2 1 1 2 2 1 2 1 ...
##  $ Pstatus   : Factor w/ 2 levels "A","T": 1 2 2 2 2 2 2 1 1 2 ...
##  $ Medu      : int  4 1 1 4 3 4 2 4 3 3 ...
##  $ Fedu      : int  4 1 1 2 3 3 2 4 2 4 ...
##  $ Mjob      : Factor w/ 5 levels "at_home","health",..: 1 1 1 2 3 4 3 3 4 3 ...
##  $ Fjob      : Factor w/ 5 levels "at_home","health",..: 5 3 3 4 3 3 3 5 3 3 ...
##  $ reason    : Factor w/ 4 levels "course","home",..: 1 1 3 2 2 4 2 2 2 2 ...
##  $ guardian  : Factor w/ 3 levels "father","mother",..: 2 1 2 2 1 2 2 2 2 2 ...
##  $ traveltime: int  2 1 1 1 1 1 1 2 1 1 ...
##  $ studytime : int  2 2 2 3 2 2 2 2 2 2 ...
##  $ failures  : int  0 0 3 0 0 0 0 0 0 0 ...
##  $ schoolsup : Factor w/ 2 levels "no","yes": 2 1 2 1 1 1 1 2 1 1 ...
##  $ famsup    : Factor w/ 2 levels "no","yes": 1 2 1 2 2 2 1 2 2 2 ...
##  $ paid      : Factor w/ 2 levels "no","yes": 1 1 2 2 2 2 1 1 2 2 ...
##  $ activities: Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 1 1 1 2 ...
##  $ nursery   : Factor w/ 2 levels "no","yes": 2 1 2 2 2 2 2 2 2 2 ...
##  $ higher    : Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ internet  : Factor w/ 2 levels "no","yes": 1 2 2 2 1 2 2 1 2 2 ...
##  $ romantic  : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ famrel    : int  4 5 4 3 4 5 4 4 4 5 ...
##  $ freetime  : int  3 3 3 2 3 4 4 1 2 5 ...
##  $ goout     : int  4 3 2 2 2 2 4 4 2 1 ...
##  $ Dalc      : int  1 1 2 1 1 1 1 1 1 1 ...
##  $ Walc      : int  1 1 3 1 2 2 1 1 1 1 ...
##  $ health    : int  3 3 3 5 5 5 3 1 1 5 ...
##  $ absences  : int  6 4 10 2 4 10 0 6 0 0 ...
##  $ G1        : int  5 5 7 15 6 15 12 6 16 14 ...
##  $ G2        : int  6 5 8 14 10 15 12 5 18 15 ...
##  $ G3        : int  6 6 10 15 10 15 11 6 19 15 ...

Step 6: Plot our data

The most important step in any analysis is plotting data. This step gives us an overview of our data, but also allows us to see things such as extreme values that can affect the statistical techniques we choose to utilize.

## Plots for Student Performance

# Final grade by mother's job. Mjob is a factor, so plot() draws box plots.
with(Student_Performance_Ben_Gonzalez, plot(Mjob, G3))

# Final grade against the health rating. Both are numeric, so this is a scatter plot.
with(Student_Performance_Ben_Gonzalez, plot(health, G3))

Plots with labels:

# Mother's education level against the final grade, with explicit axis labels.
with(
  Student_Performance_Ben_Gonzalez,
  plot(Medu, G3, xlab = "Mother's Education", ylab = "Final Grade")
)

# Father's education level against the final grade, with explicit axis labels.
with(
  Student_Performance_Ben_Gonzalez,
  plot(Fedu, G3, xlab = "Father's Education", ylab = "Final Grade")
)

Step 7: Simple Linear Regression

Here we ‘model’ our data or create the ‘algorithm’ that will help us see the impact of our independent variables on our dependent variable. In this analysis our dependent (Y) variable is G3.

Model 1

# Simple linear regression: final grade (G3) explained by school alone.
schoolmodel.1 <- lm(G3 ~ school, data = Student_Performance_Ben_Gonzalez)
print(schoolmodel.1)
## 
## Call:
## lm(formula = G3 ~ school, data = Student_Performance_Ben_Gonzalez)
## 
## Coefficients:
## (Intercept)     schoolMS  
##     10.4900      -0.6421

Model 2

# Simple linear regression: final grade (G3) explained by sex alone.
schoolmodel.2 <- lm(G3 ~ sex, data = Student_Performance_Ben_Gonzalez)
print(schoolmodel.2)
## 
## Call:
## lm(formula = G3 ~ sex, data = Student_Performance_Ben_Gonzalez)
## 
## Coefficients:
## (Intercept)         sexM  
##      9.9663       0.9481

Step 8: Multiple Linear Regression

Here we will use more than one variable to predict the outcome of G3. We use the ‘.’ after the ~ to tell R we want to use all the other variables in the dataset as predictors. We can also use the + to add variables and the minus “-” symbol to remove variables. These operations allow us some flexibility in forming the lm() formula. The lm() function is used for both simple and multiple linear regression. We then use the summary() function to look at our results. We can also filter or “pull” certain summary data by using the **dollar sign** operator on the summary of our object (e.g. summary(schoolmodelmultiple1.1)$adj.r.squared). We can name our object whatever we choose. I recommend naming your object something meaningful you can remember for each particular dataset.

# Multiple linear regression for the Student Performance dataset:
# G3 is regressed on every other column (the "." on the right of the formula).
schoolmodelmultiple1.1 <- lm(G3 ~ ., data = Student_Performance_Ben_Gonzalez)
print(summary(schoolmodelmultiple1.1))
## 
## Call:
## lm(formula = G3 ~ ., data = Student_Performance_Ben_Gonzalez)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9339 -0.5532  0.2680  0.9689  4.6461 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1.115488   2.116958  -0.527 0.598573    
## schoolMS          0.480742   0.366512   1.312 0.190485    
## sexM              0.174396   0.233588   0.747 0.455805    
## age              -0.173302   0.100780  -1.720 0.086380 .  
## addressU          0.104455   0.270791   0.386 0.699922    
## famsizeLE3        0.036512   0.226680   0.161 0.872128    
## PstatusT         -0.127673   0.335626  -0.380 0.703875    
## Medu              0.129685   0.149999   0.865 0.387859    
## Fedu             -0.133940   0.128768  -1.040 0.298974    
## Mjobhealth       -0.146426   0.518491  -0.282 0.777796    
## Mjobother         0.074088   0.332044   0.223 0.823565    
## Mjobservices      0.046956   0.369587   0.127 0.898973    
## Mjobteacher      -0.026276   0.481632  -0.055 0.956522    
## Fjobhealth        0.330948   0.666601   0.496 0.619871    
## Fjobother        -0.083582   0.476796  -0.175 0.860945    
## Fjobservices     -0.322142   0.493265  -0.653 0.514130    
## Fjobteacher      -0.112364   0.601448  -0.187 0.851907    
## reasonhome       -0.209183   0.256392  -0.816 0.415123    
## reasonother       0.307554   0.380214   0.809 0.419120    
## reasonreputation  0.129106   0.267254   0.483 0.629335    
## guardianmother    0.195741   0.252672   0.775 0.439046    
## guardianother     0.006565   0.463650   0.014 0.988710    
## traveltime        0.096994   0.157800   0.615 0.539170    
## studytime        -0.104754   0.134814  -0.777 0.437667    
## failures         -0.160539   0.161006  -0.997 0.319399    
## schoolsupyes      0.456448   0.319538   1.428 0.154043    
## famsupyes         0.176870   0.224204   0.789 0.430710    
## paidyes           0.075764   0.222100   0.341 0.733211    
## activitiesyes    -0.346047   0.205938  -1.680 0.093774 .  
## nurseryyes       -0.222716   0.254184  -0.876 0.381518    
## higheryes         0.225921   0.500398   0.451 0.651919    
## internetyes      -0.144462   0.287528  -0.502 0.615679    
## romanticyes      -0.272008   0.219732  -1.238 0.216572    
## famrel            0.356876   0.114124   3.127 0.001912 ** 
## freetime          0.047002   0.110209   0.426 0.670021    
## goout             0.012007   0.105230   0.114 0.909224    
## Dalc             -0.185019   0.153124  -1.208 0.227741    
## Walc              0.176772   0.114943   1.538 0.124966    
## health            0.062995   0.074800   0.842 0.400259    
## absences          0.045879   0.013412   3.421 0.000698 ***
## G1                0.188847   0.062373   3.028 0.002645 ** 
## G2                0.957330   0.053460  17.907  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.901 on 353 degrees of freedom
## Multiple R-squared:  0.8458, Adjusted R-squared:  0.8279 
## F-statistic: 47.21 on 41 and 353 DF,  p-value: < 2.2e-16

Step 9: Subset Selection Method

This step allows us to utilize a subset selection method. Subset selection can be run as an exhaustive (best subset) search, or with a forward or backward stepwise approach. The forward method starts with no variables and, in each run, adds the variable that improves the fit the most. The backward method is the opposite: it starts with all of the variables and drops one in each run. By default, as in the code below, regsubsets() performs an exhaustive search and reports the best model of each size up to eight variables. The nvmax = argument sets the maximum number of variables we want to allow in the model, and the method = argument selects forward or backward selection instead.

# Subset selection for the Student Performance dataset
library(leaps)

# Student Model 1: with its default arguments regsubsets() runs an exhaustive
# search and keeps the best model of each size, up to 8 predictors.
studentmodel.1 <- regsubsets(G3 ~ ., data = Student_Performance_Ben_Gonzalez)
student.summary1 <- summary(studentmodel.1)
print(student.summary1)
## Subset selection object
## Call: regsubsets.formula(G3 ~ ., data = Student_Performance_Ben_Gonzalez)
## 41 Variables  (and intercept)
##                  Forced in Forced out
## schoolMS             FALSE      FALSE
## sexM                 FALSE      FALSE
## age                  FALSE      FALSE
## addressU             FALSE      FALSE
## famsizeLE3           FALSE      FALSE
## PstatusT             FALSE      FALSE
## Medu                 FALSE      FALSE
## Fedu                 FALSE      FALSE
## Mjobhealth           FALSE      FALSE
## Mjobother            FALSE      FALSE
## Mjobservices         FALSE      FALSE
## Mjobteacher          FALSE      FALSE
## Fjobhealth           FALSE      FALSE
## Fjobother            FALSE      FALSE
## Fjobservices         FALSE      FALSE
## Fjobteacher          FALSE      FALSE
## reasonhome           FALSE      FALSE
## reasonother          FALSE      FALSE
## reasonreputation     FALSE      FALSE
## guardianmother       FALSE      FALSE
## guardianother        FALSE      FALSE
## traveltime           FALSE      FALSE
## studytime            FALSE      FALSE
## failures             FALSE      FALSE
## schoolsupyes         FALSE      FALSE
## famsupyes            FALSE      FALSE
## paidyes              FALSE      FALSE
## activitiesyes        FALSE      FALSE
## nurseryyes           FALSE      FALSE
## higheryes            FALSE      FALSE
## internetyes          FALSE      FALSE
## romanticyes          FALSE      FALSE
## famrel               FALSE      FALSE
## freetime             FALSE      FALSE
## goout                FALSE      FALSE
## Dalc                 FALSE      FALSE
## Walc                 FALSE      FALSE
## health               FALSE      FALSE
## absences             FALSE      FALSE
## G1                   FALSE      FALSE
## G2                   FALSE      FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: exhaustive
##          schoolMS sexM age addressU famsizeLE3 PstatusT Medu Fedu
## 1  ( 1 ) " "      " "  " " " "      " "        " "      " "  " " 
## 2  ( 1 ) " "      " "  " " " "      " "        " "      " "  " " 
## 3  ( 1 ) " "      " "  " " " "      " "        " "      " "  " " 
## 4  ( 1 ) " "      " "  " " " "      " "        " "      " "  " " 
## 5  ( 1 ) " "      " "  "*" " "      " "        " "      " "  " " 
## 6  ( 1 ) " "      " "  "*" " "      " "        " "      " "  " " 
## 7  ( 1 ) " "      " "  "*" " "      " "        " "      " "  " " 
## 8  ( 1 ) "*"      " "  "*" " "      " "        " "      " "  " " 
##          Mjobhealth Mjobother Mjobservices Mjobteacher Fjobhealth
## 1  ( 1 ) " "        " "       " "          " "         " "       
## 2  ( 1 ) " "        " "       " "          " "         " "       
## 3  ( 1 ) " "        " "       " "          " "         " "       
## 4  ( 1 ) " "        " "       " "          " "         " "       
## 5  ( 1 ) " "        " "       " "          " "         " "       
## 6  ( 1 ) " "        " "       " "          " "         " "       
## 7  ( 1 ) " "        " "       " "          " "         " "       
## 8  ( 1 ) " "        " "       " "          " "         " "       
##          Fjobother Fjobservices Fjobteacher reasonhome reasonother
## 1  ( 1 ) " "       " "          " "         " "        " "        
## 2  ( 1 ) " "       " "          " "         " "        " "        
## 3  ( 1 ) " "       " "          " "         " "        " "        
## 4  ( 1 ) " "       " "          " "         " "        " "        
## 5  ( 1 ) " "       " "          " "         " "        " "        
## 6  ( 1 ) " "       "*"          " "         " "        " "        
## 7  ( 1 ) " "       "*"          " "         " "        " "        
## 8  ( 1 ) " "       "*"          " "         " "        " "        
##          reasonreputation guardianmother guardianother traveltime
## 1  ( 1 ) " "              " "            " "           " "       
## 2  ( 1 ) " "              " "            " "           " "       
## 3  ( 1 ) " "              " "            " "           " "       
## 4  ( 1 ) " "              " "            " "           " "       
## 5  ( 1 ) " "              " "            " "           " "       
## 6  ( 1 ) " "              " "            " "           " "       
## 7  ( 1 ) " "              " "            " "           " "       
## 8  ( 1 ) " "              " "            " "           " "       
##          studytime failures schoolsupyes famsupyes paidyes activitiesyes
## 1  ( 1 ) " "       " "      " "          " "       " "     " "          
## 2  ( 1 ) " "       " "      " "          " "       " "     " "          
## 3  ( 1 ) " "       " "      " "          " "       " "     " "          
## 4  ( 1 ) " "       " "      " "          " "       " "     " "          
## 5  ( 1 ) " "       " "      " "          " "       " "     " "          
## 6  ( 1 ) " "       " "      " "          " "       " "     " "          
## 7  ( 1 ) " "       " "      " "          " "       " "     " "          
## 8  ( 1 ) " "       " "      " "          " "       " "     " "          
##          nurseryyes higheryes internetyes romanticyes famrel freetime
## 1  ( 1 ) " "        " "       " "         " "         " "    " "     
## 2  ( 1 ) " "        " "       " "         " "         "*"    " "     
## 3  ( 1 ) " "        " "       " "         " "         "*"    " "     
## 4  ( 1 ) " "        " "       " "         " "         "*"    " "     
## 5  ( 1 ) " "        " "       " "         " "         "*"    " "     
## 6  ( 1 ) " "        " "       " "         " "         "*"    " "     
## 7  ( 1 ) " "        " "       " "         " "         "*"    " "     
## 8  ( 1 ) " "        " "       " "         " "         "*"    " "     
##          goout Dalc Walc health absences G1  G2 
## 1  ( 1 ) " "   " "  " "  " "    " "      " " "*"
## 2  ( 1 ) " "   " "  " "  " "    " "      " " "*"
## 3  ( 1 ) " "   " "  " "  " "    "*"      " " "*"
## 4  ( 1 ) " "   " "  " "  " "    "*"      "*" "*"
## 5  ( 1 ) " "   " "  " "  " "    "*"      "*" "*"
## 6  ( 1 ) " "   " "  " "  " "    "*"      "*" "*"
## 7  ( 1 ) " "   " "  "*"  " "    "*"      "*" "*"
## 8  ( 1 ) " "   " "  "*"  " "    "*"      "*" "*"
# R-squared of the best model at each size (1 to 8 predictors).
student.summary1$rsq
## [1] 0.8187861 0.8233988 0.8277751 0.8306520 0.8336020 0.8351525 0.8367070
## [8] 0.8379842
# Adjusted R-squared, which penalises the number of predictors.
student.summary1$adjr2
## [1] 0.8183250 0.8224978 0.8264537 0.8289151 0.8314632 0.8326033 0.8337533
## [8] 0.8346263
# BIC: lower is better.
student.summary1$bic
## [1] -662.7327 -666.9385 -670.8713 -671.5463 -672.5089 -670.2279 -667.9914
## [8] -665.1142
# The best fit minimises the residual sum of squares. RSS falls as R-squared
# rises, so the minimum here is the largest model.
which.min(student.summary1$rss)
## [1] 8
# Model size preferred by BIC.
which.min(student.summary1$bic)
## [1] 5
# Model size preferred by adjusted R-squared.
which.max(student.summary1$adjr2)
## [1] 8

Step 10: Generalized Additive Models

In this step we can create a more flexible model to fit our data. The gam package allows us to include smooth, non-linear terms in our models, such as the smoothing splines created by s() in the code below. Depending on the type of data we have, this can be quite a useful technique.

##Student Performance Generalized Additive Models ######################
library(gam)
# Random train/test indicators, one logical per row. They are not used by the
# fit below, which is estimated on all rows. The seed makes the split repeatable.
set.seed(1)
train <- sample(
  c(TRUE, FALSE),
  nrow(Student_Performance_Ben_Gonzalez),
  replace = TRUE
)
# Logical complement of train; -train would give 0/-1 integers, not a mask.
test <- !train
## Student Performance GAM Spline Based Approach
# activities enters as a plain term; the numeric predictors enter as
# smoothing splines with 2 degrees of freedom each.
gam.studentperformance1 <- gam(
  G3 ~ activities + s(age, 2) + s(famrel, 2) + s(absences, 2) + s(G1, 2) + s(G2, 2),
  data = Student_Performance_Ben_Gonzalez
)
# One panel per term, with standard-error bands.
plot(gam.studentperformance1, se = TRUE, col = "red")

summary(gam.studentperformance1)
## 
## Call: gam(formula = G3 ~ activities + s(age, 2) + s(famrel, 2) + s(absences, 
##     2) + s(G1, 2) + s(G2, 2), data = Student_Performance_Ben_Gonzalez)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5749 -0.4565  0.3059  0.9689  4.1166 
## 
## (Dispersion Parameter for gaussian family taken to be 3.2955)
## 
##     Null Deviance: 8269.909 on 394 degrees of freedom
## Residual Deviance: 1262.196 on 383.0002 degrees of freedom
## AIC: 1605.841 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##                 Df Sum Sq Mean Sq  F value    Pr(>F)    
## activities       1    1.2     1.2    0.364 0.5466777    
## s(age, 2)        1  220.5   220.5   66.922 4.211e-15 ***
## s(famrel, 2)     1   36.8    36.8   11.163 0.0009166 ***
## s(absences, 2)   1   40.6    40.6   12.311 0.0005038 ***
## s(G1, 2)         1 5138.7  5138.7 1559.287 < 2.2e-16 ***
## s(G2, 2)         1 1434.6  1434.6  435.304 < 2.2e-16 ***
## Residuals      383 1262.2     3.3                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df  Npar F     Pr(F)    
## (Intercept)                                 
## activities                                  
## s(age, 2)            1  0.8872   0.34683    
## s(famrel, 2)         1  0.2173   0.64136    
## s(absences, 2)       1 25.9843 5.426e-07 ***
## s(G1, 2)             1  1.0798   0.29939    
## s(G2, 2)             1  2.9437   0.08702 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Step 11: Random Forest

This step creates a random forest, which is a collection of many ‘decision trees’, for our data. A single decision tree is a very useful tool in that it can be plotted to visualize our data for a layperson audience. This helps individuals to understand our data and what impact each variable has on the outcome.

####Student Performance Dataset Tree Based Approaches########################################


## Student Performance Random Forest #############################################
library(tree)
library(randomForest)
names(Student_Performance_Ben_Gonzalez)

# Grid search over mtry (predictors tried at each split) and ntree (number of
# trees). Every configuration is fitted on a fresh random half of the rows and
# scored by mean squared error on the remaining rows.
set.seed(1)
n_obs <- nrow(Student_Performance_Ben_Gonzalez)
maxnumpreds <- ncol(Student_Performance_Ben_Gonzalez) - 1  # all columns but G3
maxnumtrees <- 10

# Preallocated result grid: matMSEs[p, t] is the test MSE of the forest fitted
# with mtry = p and ntree = t, so row/column indices map directly to settings.
matMSEs <- matrix(NA_real_, nrow = maxnumpreds, ncol = maxnumtrees)

for (numpreds in seq_len(maxnumpreds)) {
  for (numtrees in seq_len(maxnumtrees)) {
    train <- sample(seq_len(n_obs), floor(n_obs / 2))

    model.bagged <- randomForest(
      G3 ~ .,
      data = Student_Performance_Ben_Gonzalez,
      subset = train,
      mtry = numpreds,
      ntree = numtrees,
      importance = TRUE
    )

    # Hold-out ROWS: the trailing comma matters, because df[-train] would
    # drop columns instead of rows.
    pred.vals.bagged <- predict(
      model.bagged,
      newdata = Student_Performance_Ben_Gonzalez[-train, ]
    )
    testvals <- Student_Performance_Ben_Gonzalez$G3[-train]
    matMSEs[numpreds, numtrees] <- mean((pred.vals.bagged - testvals)^2)
    print(paste("     Processed Trees:", numtrees))
  }
  print(paste("     Processed Predictors:", numpreds))
}

# Flat copy of the MSEs in the order they were computed (ntree varies fastest).
lstMSEs <- as.vector(t(matMSEs))

min(lstMSEs)
min(matMSEs)
lstMSEs[1:10]

# Locate the smallest test MSE; on ties take the first match.
loc <- which(matMSEs == min(matMSEs), arr.ind = TRUE)
best_preds <- loc[1, 1]
best_trees <- loc[1, 2]
print(paste("The optimal configuration is", best_preds, "predictors and", best_trees, "trees"))
length(lstMSEs)
matMSEs[best_preds, best_trees]

which(matMSEs == min(matMSEs), arr.ind = TRUE)
# NOTE: model.bagged is the last forest fitted in the loop
# (mtry = maxnumpreds, ntree = maxnumtrees), not necessarily the optimal one.
importance(model.bagged)

We can also use the tree() function to fit a single decision tree, the building block of a random forest.

# Fit a single regression tree for G3 on all other columns.
tree.student1 <- tree(G3 ~ ., data = Student_Performance_Ben_Gonzalez)
# Plot the random forest held in model.bagged.
plot(model.bagged)

Let’s create a useful visualization by plotting our tree. This code is much more concise than the code utilized above.

# Draw the tree skeleton, then add the split labels on top of it.
# pretty = 0 asks text() to print factor level names without abbreviation.
plot(tree.student1)
text(tree.student1, pretty = 0)

Now let’s look at variable importance, that is, which variables cause the largest increase in our MSE when their values are randomly permuted.

# Variable-importance plot for the forest held in model.bagged.
varImpPlot(model.bagged)

# Print the forest itself, then the smallest test MSE found by the grid search.
print(model.bagged)
min(lstMSEs)

This concludes the tutorial utilizing the Student Performance Dataset.