set.seed(123)

mat_df <- data.frame(matrix(ncol = 3, nrow = 0) ,stringsAsFactors = FALSE)
por_df <- data.frame(matrix(ncol = 3, nrow = 0) ,stringsAsFactors = FALSE)

1. Data Exploration

mat <- read.csv("https://raw.githubusercontent.com/irene908/DATA621/main/student-mat.csv")

por <- read.csv("https://raw.githubusercontent.com/irene908/DATA621/main/student-por.csv")
summary(mat)
##     school              sex                 age         address         
##  Length:395         Length:395         Min.   :15.0   Length:395        
##  Class :character   Class :character   1st Qu.:16.0   Class :character  
##  Mode  :character   Mode  :character   Median :17.0   Mode  :character  
##                                        Mean   :16.7                     
##                                        3rd Qu.:18.0                     
##                                        Max.   :22.0                     
##    famsize            Pstatus               Medu            Fedu      
##  Length:395         Length:395         Min.   :0.000   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:2.000   1st Qu.:2.000  
##  Mode  :character   Mode  :character   Median :3.000   Median :2.000  
##                                        Mean   :2.749   Mean   :2.522  
##                                        3rd Qu.:4.000   3rd Qu.:3.000  
##                                        Max.   :4.000   Max.   :4.000  
##      Mjob               Fjob              reason            guardian        
##  Length:395         Length:395         Length:395         Length:395        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    traveltime      studytime        failures       schoolsup        
##  Min.   :1.000   Min.   :1.000   Min.   :0.0000   Length:395        
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000   Class :character  
##  Median :1.000   Median :2.000   Median :0.0000   Mode  :character  
##  Mean   :1.448   Mean   :2.035   Mean   :0.3342                     
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:0.0000                     
##  Max.   :4.000   Max.   :4.000   Max.   :3.0000                     
##     famsup              paid            activities          nursery         
##  Length:395         Length:395         Length:395         Length:395        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     higher            internet           romantic             famrel     
##  Length:395         Length:395         Length:395         Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:4.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :4.000  
##                                                           Mean   :3.944  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :5.000  
##     freetime         goout            Dalc            Walc      
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :3.000   Median :3.000   Median :1.000   Median :2.000  
##  Mean   :3.235   Mean   :3.109   Mean   :1.481   Mean   :2.291  
##  3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :5.000   Max.   :5.000   Max.   :5.000  
##      health         absences            G1              G2       
##  Min.   :1.000   Min.   : 0.000   Min.   : 3.00   Min.   : 0.00  
##  1st Qu.:3.000   1st Qu.: 0.000   1st Qu.: 8.00   1st Qu.: 9.00  
##  Median :4.000   Median : 4.000   Median :11.00   Median :11.00  
##  Mean   :3.554   Mean   : 5.709   Mean   :10.91   Mean   :10.71  
##  3rd Qu.:5.000   3rd Qu.: 8.000   3rd Qu.:13.00   3rd Qu.:13.00  
##  Max.   :5.000   Max.   :75.000   Max.   :19.00   Max.   :19.00  
##        G3       
##  Min.   : 0.00  
##  1st Qu.: 8.00  
##  Median :11.00  
##  Mean   :10.42  
##  3rd Qu.:14.00  
##  Max.   :20.00
summary(por)
##     school              sex                 age          address         
##  Length:649         Length:649         Min.   :15.00   Length:649        
##  Class :character   Class :character   1st Qu.:16.00   Class :character  
##  Mode  :character   Mode  :character   Median :17.00   Mode  :character  
##                                        Mean   :16.74                     
##                                        3rd Qu.:18.00                     
##                                        Max.   :22.00                     
##    famsize            Pstatus               Medu            Fedu      
##  Length:649         Length:649         Min.   :0.000   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:2.000   1st Qu.:1.000  
##  Mode  :character   Mode  :character   Median :2.000   Median :2.000  
##                                        Mean   :2.515   Mean   :2.307  
##                                        3rd Qu.:4.000   3rd Qu.:3.000  
##                                        Max.   :4.000   Max.   :4.000  
##      Mjob               Fjob              reason            guardian        
##  Length:649         Length:649         Length:649         Length:649        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    traveltime      studytime        failures       schoolsup        
##  Min.   :1.000   Min.   :1.000   Min.   :0.0000   Length:649        
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:0.0000   Class :character  
##  Median :1.000   Median :2.000   Median :0.0000   Mode  :character  
##  Mean   :1.569   Mean   :1.931   Mean   :0.2219                     
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:0.0000                     
##  Max.   :4.000   Max.   :4.000   Max.   :3.0000                     
##     famsup              paid            activities          nursery         
##  Length:649         Length:649         Length:649         Length:649        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     higher            internet           romantic             famrel     
##  Length:649         Length:649         Length:649         Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:4.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :4.000  
##                                                           Mean   :3.931  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :5.000  
##     freetime        goout            Dalc            Walc          health     
##  Min.   :1.00   Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000  
##  1st Qu.:3.00   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.00   1st Qu.:2.000  
##  Median :3.00   Median :3.000   Median :1.000   Median :2.00   Median :4.000  
##  Mean   :3.18   Mean   :3.185   Mean   :1.502   Mean   :2.28   Mean   :3.536  
##  3rd Qu.:4.00   3rd Qu.:4.000   3rd Qu.:2.000   3rd Qu.:3.00   3rd Qu.:5.000  
##  Max.   :5.00   Max.   :5.000   Max.   :5.000   Max.   :5.00   Max.   :5.000  
##     absences            G1             G2              G3       
##  Min.   : 0.000   Min.   : 0.0   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 0.000   1st Qu.:10.0   1st Qu.:10.00   1st Qu.:10.00  
##  Median : 2.000   Median :11.0   Median :11.00   Median :12.00  
##  Mean   : 3.659   Mean   :11.4   Mean   :11.57   Mean   :11.91  
##  3rd Qu.: 6.000   3rd Qu.:13.0   3rd Qu.:13.00   3rd Qu.:14.00  
##  Max.   :32.000   Max.   :19.0   Max.   :19.00   Max.   :19.00
#mat <- as.data.frame(mat)
mat_new <- mat %>% gather(key = 'key', value = 'value')
mat_new %>% gather() %>% ggplot(aes(x= value)) + geom_histogram(stat = "count") + facet_wrap(~key, scales = 'free', ncol=7)
## Warning: Ignoring unknown parameters: binwidth, bins, pad

#mat <- as.data.frame(mat)
por_new <- por %>% gather(key = 'key', value = 'value')
por_new %>% gather() %>% ggplot(aes(x= value)) + geom_histogram(stat = "count") + facet_wrap(~key, scales = 'free', ncol=7)
## Warning: Ignoring unknown parameters: binwidth, bins, pad

corrplot(cor(mat[,sapply(mat, is.numeric)], use = "complete.obs"), type="upper", diag = FALSE, col=brewer.pal(n=10, name="Paired"))

corrplot(cor(por[,sapply(por, is.numeric)], use = "complete.obs"), type="upper", diag = FALSE, col=brewer.pal(n=10, name="Paired"))

2. Data Preparation

#encode math data
mat_encoder <- onehot(mat, stringsAsFactors = TRUE)
mat <- predict(mat_encoder, mat)
remove <- c("school=GP", "sex=F", "address=U", "famsize=LE3", "Pstatus=T",
                        "Mjob=at_home", "Fjob=at_home","reason=course", "guardian=other",
                        "schoolsup=no", "famsup=no", "paid=no", "activities=no", "nursery=no",
                        "higher=no", "internet=no", "romantic=no")
mat <- mat[,!(colnames(mat) %in% remove)]

#encode por data
por_encoder <- onehot(por, stringsAsFactors = TRUE)
por <- predict(por_encoder, por)
por <- por[,!(colnames(por) %in% remove)]
head(mat)
##      school=MS sex=M age address=R famsize=GT3 Pstatus=A Medu Fedu Mjob=health
## [1,]         0     0  18         0           1         1    4    4           0
## [2,]         0     0  17         0           1         0    1    1           0
## [3,]         0     0  15         0           0         0    1    1           0
## [4,]         0     0  15         0           1         0    4    2           1
## [5,]         0     0  16         0           1         0    3    3           0
## [6,]         0     1  16         0           0         0    4    3           0
##      Mjob=other Mjob=services Mjob=teacher Fjob=health Fjob=other Fjob=services
## [1,]          0             0            0           0          0             0
## [2,]          0             0            0           0          1             0
## [3,]          0             0            0           0          1             0
## [4,]          0             0            0           0          0             1
## [5,]          1             0            0           0          1             0
## [6,]          0             1            0           0          1             0
##      Fjob=teacher reason=home reason=other reason=reputation guardian=father
## [1,]            1           0            0                 0               0
## [2,]            0           0            0                 0               1
## [3,]            0           0            1                 0               0
## [4,]            0           1            0                 0               0
## [5,]            0           1            0                 0               1
## [6,]            0           0            0                 1               0
##      guardian=mother traveltime studytime failures schoolsup=yes famsup=yes
## [1,]               1          2         2        0             1          0
## [2,]               0          1         2        0             0          1
## [3,]               1          1         2        3             1          0
## [4,]               1          1         3        0             0          1
## [5,]               0          1         2        0             0          1
## [6,]               1          1         2        0             0          1
##      paid=yes activities=yes nursery=yes higher=yes internet=yes romantic=yes
## [1,]        0              0           1          1            0            0
## [2,]        0              0           0          1            1            0
## [3,]        1              0           1          1            1            0
## [4,]        1              1           1          1            1            1
## [5,]        1              0           1          1            0            0
## [6,]        1              1           1          1            1            0
##      famrel freetime goout Dalc Walc health absences G1 G2 G3
## [1,]      4        3     4    1    1      3        6  5  6  6
## [2,]      5        3     3    1    1      3        4  5  5  6
## [3,]      4        3     2    2    3      3       10  7  8 10
## [4,]      3        2     2    1    1      5        2 15 14 15
## [5,]      4        3     2    1    2      5        4  6 10 10
## [6,]      5        4     2    1    2      5       10 15 15 15
head(por)
##      school=MS sex=M age address=R famsize=GT3 Pstatus=A Medu Fedu Mjob=health
## [1,]         0     0  18         0           1         1    4    4           0
## [2,]         0     0  17         0           1         0    1    1           0
## [3,]         0     0  15         0           0         0    1    1           0
## [4,]         0     0  15         0           1         0    4    2           1
## [5,]         0     0  16         0           1         0    3    3           0
## [6,]         0     1  16         0           0         0    4    3           0
##      Mjob=other Mjob=services Mjob=teacher Fjob=health Fjob=other Fjob=services
## [1,]          0             0            0           0          0             0
## [2,]          0             0            0           0          1             0
## [3,]          0             0            0           0          1             0
## [4,]          0             0            0           0          0             1
## [5,]          1             0            0           0          1             0
## [6,]          0             1            0           0          1             0
##      Fjob=teacher reason=home reason=other reason=reputation guardian=father
## [1,]            1           0            0                 0               0
## [2,]            0           0            0                 0               1
## [3,]            0           0            1                 0               0
## [4,]            0           1            0                 0               0
## [5,]            0           1            0                 0               1
## [6,]            0           0            0                 1               0
##      guardian=mother traveltime studytime failures schoolsup=yes famsup=yes
## [1,]               1          2         2        0             1          0
## [2,]               0          1         2        0             0          1
## [3,]               1          1         2        0             1          0
## [4,]               1          1         3        0             0          1
## [5,]               0          1         2        0             0          1
## [6,]               1          1         2        0             0          1
##      paid=yes activities=yes nursery=yes higher=yes internet=yes romantic=yes
## [1,]        0              0           1          1            0            0
## [2,]        0              0           0          1            1            0
## [3,]        0              0           1          1            1            0
## [4,]        0              1           1          1            1            1
## [5,]        0              0           1          1            0            0
## [6,]        0              1           1          1            1            0
##      famrel freetime goout Dalc Walc health absences G1 G2 G3
## [1,]      4        3     4    1    1      3        4  0 11 11
## [2,]      5        3     3    1    1      3        2  9 11 11
## [3,]      4        3     2    2    3      3        6 12 13 12
## [4,]      3        2     2    1    1      5        0 14 14 14
## [5,]      4        3     2    1    2      5        0 11 13 13
## [6,]      5        4     2    1    2      5        6 12 12 13

3. Build Models

Split the datasets into train and test in the ration 80:20.

#mat
mat<-as.data.frame(mat)
sample <- sample.split(mat, SplitRatio = 0.8)

mat_train <- subset(mat,sample==TRUE)
mat_test <- subset(mat,sample==FALSE)
mat_final_test <- mat_test


#por
por<-as.data.frame(por)
sample <- sample.split(por, SplitRatio = 0.8)

por_train <- subset(por,sample==TRUE)
por_test <- subset(por,sample==FALSE)
por_final_test <- por_test
#row count of mat. train, test sets
df_mat <- c(nrow(mat),nrow(mat_train),nrow(mat_test))
df_mat
## [1] 395 310  85
#row count of por. train, test sets
df_por <- c(nrow(por),nrow(por_train),nrow(por_test))
df_por
## [1] 649 511 138

MODEL 1 Linear

mat_stepAIC <- stepAIC(lm(G3~., mat_train), direction = "both", trace = FALSE)
summary(mat_stepAIC)
## 
## Call:
## lm(formula = G3 ~ `school=MS` + age + `activities=yes` + famrel + 
##     absences + G1 + G2, data = mat_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5092 -0.4793  0.2797  1.0859  3.5641 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.02171    1.65461   1.222 0.222713    
## `school=MS`       0.86791    0.37042   2.343 0.019775 *  
## age              -0.33471    0.09431  -3.549 0.000448 ***
## `activities=yes` -0.46345    0.21538  -2.152 0.032204 *  
## famrel            0.39598    0.11779   3.362 0.000874 ***
## absences          0.05176    0.01377   3.758 0.000206 ***
## G1                0.16650    0.06462   2.577 0.010449 *  
## G2                0.97267    0.05722  16.998  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.874 on 302 degrees of freedom
## Multiple R-squared:  0.8449, Adjusted R-squared:  0.8413 
## F-statistic:   235 on 7 and 302 DF,  p-value: < 2.2e-16
mat_test$lm_G3 <- predict(mat_stepAIC, mat_test, type = "response")

mat_rmse_lm <- RMSE(as.integer(mat_test$lm_G3), mat_test$G3)
mat_r2_lm <- R2(mat_test$lm_G3, mat_test$G3)

mat_df <- rbind(mat_df, c(' LM Model' ,round(mat_rmse_lm,3), round(mat_r2_lm,3)), stringsAsFactors = FALSE)
por_stepAIC <- stepAIC(lm(G3~., por_train), direction = "both", trace = FALSE)
summary(por_stepAIC)
## 
## Call:
## lm(formula = G3 ~ `address=R` + `Mjob=other` + `Fjob=services` + 
##     `reason=other` + `guardian=father` + `guardian=mother` + 
##     traveltime + failures + `schoolsup=yes` + `higher=yes` + 
##     Walc + absences + G1 + G2, data = por_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.7354 -0.4908  0.0046  0.5559  5.6645 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.53740    0.41399   1.298  0.19485    
## `address=R`       -0.28519    0.12955  -2.201  0.02817 *  
## `Mjob=other`      -0.17216    0.11382  -1.513  0.13104    
## `Fjob=services`   -0.22009    0.12710  -1.732  0.08396 .  
## `reason=other`    -0.29851    0.18231  -1.637  0.10218    
## `guardian=father` -0.38023    0.25047  -1.518  0.12963    
## `guardian=mother` -0.45029    0.23174  -1.943  0.05257 .  
## traveltime         0.18304    0.08049   2.274  0.02339 *  
## failures          -0.27084    0.10528  -2.573  0.01038 *  
## `schoolsup=yes`   -0.30984    0.19511  -1.588  0.11292    
## `higher=yes`       0.30935    0.19378   1.596  0.11104    
## Walc              -0.08220    0.04392  -1.871  0.06187 .  
## absences           0.02083    0.01214   1.715  0.08690 .  
## G1                 0.11044    0.04095   2.697  0.00724 ** 
## G2                 0.89734    0.03786  23.699  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.236 on 496 degrees of freedom
## Multiple R-squared:  0.8616, Adjusted R-squared:  0.8577 
## F-statistic: 220.5 on 14 and 496 DF,  p-value: < 2.2e-16
por_test$lm_G3 <- predict(por_stepAIC, por_test, type = "response")

por_rmse_lm <- RMSE(as.integer(por_test$lm_G3), por_test$G3)
por_r2_lm <- R2(por_test$lm_G3, por_test$G3)

por_df <- rbind(por_df, c('LM Model', round(por_rmse_lm,3), round(por_r2_lm,3)), stringsAsFactors = FALSE)
par(mfrow=c(2,2))
plot(mat_stepAIC)

par(mfrow=c(2,2))
plot(por_stepAIC)

MODEL 2 - Gaussian

mat_glm <- glm(G3~., mat_train, family = gaussian)
mat_test$glm_G3 <- predict(mat_glm, newdata = mat_test)
mat_glm <- stepAIC(mat_glm, direction = "both", trace = FALSE)
summary(mat_glm)
## 
## Call:
## glm(formula = G3 ~ `school=MS` + age + `activities=yes` + famrel + 
##     absences + G1 + G2, family = gaussian, data = mat_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.5092  -0.4793   0.2797   1.0859   3.5641  
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.02171    1.65461   1.222 0.222713    
## `school=MS`       0.86791    0.37042   2.343 0.019775 *  
## age              -0.33471    0.09431  -3.549 0.000448 ***
## `activities=yes` -0.46345    0.21538  -2.152 0.032204 *  
## famrel            0.39598    0.11779   3.362 0.000874 ***
## absences          0.05176    0.01377   3.758 0.000206 ***
## G1                0.16650    0.06462   2.577 0.010449 *  
## G2                0.97267    0.05722  16.998  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 3.510145)
## 
##     Null deviance: 6835.0  on 309  degrees of freedom
## Residual deviance: 1060.1  on 302  degrees of freedom
## AIC: 1278.9
## 
## Number of Fisher Scoring iterations: 2
mat_rmse_glm <- RMSE(as.integer(mat_test$glm_G3), mat_test$G3)
mat_r2_glm <- R2(as.integer(mat_test$glm_G3), mat_test$G3)

mat_df <- rbind(mat_df, c('GLM Model' ,round(mat_rmse_glm,3), round(mat_r2_glm,3)), stringsAsFactors = FALSE)
por_glm <- glm(G3~., por_train, family = gaussian)
por_test$glm_G3 <- predict(por_glm, newdata = por_test)
por_glm <- stepAIC(por_glm, direction = "both", trace = FALSE)
summary(por_glm)
## 
## Call:
## glm(formula = G3 ~ `address=R` + `Mjob=other` + `Fjob=services` + 
##     `reason=other` + `guardian=father` + `guardian=mother` + 
##     traveltime + failures + `schoolsup=yes` + `higher=yes` + 
##     Walc + absences + G1 + G2, family = gaussian, data = por_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -8.7354  -0.4908   0.0046   0.5559   5.6645  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        0.53740    0.41399   1.298  0.19485    
## `address=R`       -0.28519    0.12955  -2.201  0.02817 *  
## `Mjob=other`      -0.17216    0.11382  -1.513  0.13104    
## `Fjob=services`   -0.22009    0.12710  -1.732  0.08396 .  
## `reason=other`    -0.29851    0.18231  -1.637  0.10218    
## `guardian=father` -0.38023    0.25047  -1.518  0.12963    
## `guardian=mother` -0.45029    0.23174  -1.943  0.05257 .  
## traveltime         0.18304    0.08049   2.274  0.02339 *  
## failures          -0.27084    0.10528  -2.573  0.01038 *  
## `schoolsup=yes`   -0.30984    0.19511  -1.588  0.11292    
## `higher=yes`       0.30935    0.19378   1.596  0.11104    
## Walc              -0.08220    0.04392  -1.871  0.06187 .  
## absences           0.02083    0.01214   1.715  0.08690 .  
## G1                 0.11044    0.04095   2.697  0.00724 ** 
## G2                 0.89734    0.03786  23.699  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.527285)
## 
##     Null deviance: 5471.87  on 510  degrees of freedom
## Residual deviance:  757.53  on 496  degrees of freedom
## AIC: 1683.3
## 
## Number of Fisher Scoring iterations: 2
por_rmse_glm <-  RMSE(as.integer(por_test$glm_G3), por_test$G3)
por_r2_glm <- R2(as.integer(por_test$glm_G3), por_test$G3)

por_df <- rbind(por_df, c('GLM Model', round(por_rmse_glm,3), round(por_r2_glm,3)), stringsAsFactors = FALSE)
par(mfrow=c(2,2))
plot(mat_glm)

par(mfrow=c(2,2))
plot(por_glm)

MODEL 3 - Poisson

mat_pois <- glm(G3~., mat_train, family = poisson)
mat_test$glm_G3_pois <- predict(mat_pois, newdata = mat_test)
mat_pois <- stepAIC(mat_pois, direction = "both", trace = FALSE)
summary(mat_pois)
## 
## Call:
## glm(formula = G3 ~ `school=MS` + failures + `schoolsup=yes` + 
##     famrel + goout + Walc + absences + G1 + G2, family = poisson, 
##     data = mat_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.0619  -0.2330   0.1673   0.4956   1.4665  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      0.816069   0.130894   6.235 4.53e-10 ***
## `school=MS`      0.117340   0.059360   1.977  0.04807 *  
## failures        -0.061129   0.033009  -1.852  0.06404 .  
## `schoolsup=yes`  0.087165   0.057492   1.516  0.12949    
## famrel           0.046037   0.019728   2.334  0.01962 *  
## goout           -0.030011   0.019253  -1.559  0.11906    
## Walc             0.028753   0.015984   1.799  0.07204 .  
## absences         0.007101   0.002181   3.256  0.00113 ** 
## G1              -0.029678   0.012913  -2.298  0.02154 *  
## G2               0.143832   0.012436  11.566  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 964.94  on 309  degrees of freedom
## Residual deviance: 358.80  on 300  degrees of freedom
## AIC: 1562
## 
## Number of Fisher Scoring iterations: 5
mat_rmse_glm <- RMSE(as.integer(mat_test$glm_G3_pois), mat_test$G3)
mat_r2_glm <- R2(as.integer(mat_test$glm_G3_pois), mat_test$G3)

mat_df <- rbind(mat_df, c('GLM - Poisson Model' ,round(mat_rmse_glm,3), round(mat_r2_glm,3)), stringsAsFactors = FALSE)
por_pois <- glm(G3~., por_train, family = poisson)
por_test$glm_G3_pois <- predict(por_pois, newdata = por_test)
por_pois <- stepAIC(por_pois, direction = "both", trace = FALSE)
summary(por_pois)
## 
## Call:
## glm(formula = G3 ~ failures + G2, family = poisson, data = por_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.1447  -0.0992  -0.0102   0.2679   1.6973  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.456864   0.062531  23.298   <2e-16 ***
## failures    -0.039794   0.027494  -1.447    0.148    
## G2           0.085915   0.004794  17.920   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 601.0  on 510  degrees of freedom
## Residual deviance: 201.6  on 508  degrees of freedom
## AIC: 2370.6
## 
## Number of Fisher Scoring iterations: 4
por_rmse_glm <-  RMSE(as.integer(por_test$glm_G3_pois), por_test$G3)
por_r2_glm <- R2(as.integer(por_test$glm_G3_pois), por_test$G3)

por_df <- rbind(por_df, c('GLM - Poisson Model', round(por_rmse_glm,3), round(por_r2_glm,3)), stringsAsFactors = FALSE)
par(mfrow=c(2,2))
plot(mat_pois)

par(mfrow=c(2,2))
plot(por_pois)

x <- c("Model", "RMSE", "R2")

colnames(mat_df) <- x
mat_df
##                 Model  RMSE    R2
## 1            LM Model     2 0.788
## 2           GLM Model 2.041 0.775
## 3 GLM - Poisson Model 9.645 0.554
colnames(por_df) <- x
por_df
##                 Model  RMSE    R2
## 1            LM Model 1.409 0.822
## 2           GLM Model 1.412 0.825
## 3 GLM - Poisson Model 9.984 0.309

4. Model Selection

mat_final_test <- mat_final_test %>% select(-G3)

mat_final_test$G3 <- mat_test$lm_G3

por_final_test <- por_final_test %>% select(-G3)

por_final_test$G3 <- por_test$lm_G3

write.csv(mat_final_test, 'DATA621_FinalProject_Mat.csv')

write.csv(por_final_test, 'DATA621_FinalProject_Por.csv')