# Fix the RNG state so the random steps further down give repeatable results.
set.seed(123)

# Load the two raw student data sets straight from the project repository.
mat <- read.csv(
  "https://raw.githubusercontent.com/irene908/DATA621/main/student-mat.csv"
)
por <- read.csv(
  "https://raw.githubusercontent.com/irene908/DATA621/main/student-por.csv"
)

# Empty three-column frames; one row per fitted model is appended to them
# later and the columns are named once all models have been scored.
mat_df <- data.frame(matrix(nrow = 0, ncol = 3), stringsAsFactors = FALSE)
por_df <- data.frame(matrix(nrow = 0, ncol = 3), stringsAsFactors = FALSE)

# Column-wise summary of the first data set.
summary(mat)
## school sex age address
## Length:395 Length:395 Min. :15.0 Length:395
## Class :character Class :character 1st Qu.:16.0 Class :character
## Mode :character Mode :character Median :17.0 Mode :character
## Mean :16.7
## 3rd Qu.:18.0
## Max. :22.0
## famsize Pstatus Medu Fedu
## Length:395 Length:395 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:2.000 1st Qu.:2.000
## Mode :character Mode :character Median :3.000 Median :2.000
## Mean :2.749 Mean :2.522
## 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :4.000 Max. :4.000
## Mjob Fjob reason guardian
## Length:395 Length:395 Length:395 Length:395
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## traveltime studytime failures schoolsup
## Min. :1.000 Min. :1.000 Min. :0.0000 Length:395
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 Class :character
## Median :1.000 Median :2.000 Median :0.0000 Mode :character
## Mean :1.448 Mean :2.035 Mean :0.3342
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
## famsup paid activities nursery
## Length:395 Length:395 Length:395 Length:395
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## higher internet romantic famrel
## Length:395 Length:395 Length:395 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:4.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.944
## 3rd Qu.:5.000
## Max. :5.000
## freetime goout Dalc Walc
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.000
## Median :3.000 Median :3.000 Median :1.000 Median :2.000
## Mean :3.235 Mean :3.109 Mean :1.481 Mean :2.291
## 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000 Max. :5.000 Max. :5.000
## health absences G1 G2
## Min. :1.000 Min. : 0.000 Min. : 3.00 Min. : 0.00
## 1st Qu.:3.000 1st Qu.: 0.000 1st Qu.: 8.00 1st Qu.: 9.00
## Median :4.000 Median : 4.000 Median :11.00 Median :11.00
## Mean :3.554 Mean : 5.709 Mean :10.91 Mean :10.71
## 3rd Qu.:5.000 3rd Qu.: 8.000 3rd Qu.:13.00 3rd Qu.:13.00
## Max. :5.000 Max. :75.000 Max. :19.00 Max. :19.00
## G3
## Min. : 0.00
## 1st Qu.: 8.00
## Median :11.00
## Mean :10.42
## 3rd Qu.:14.00
## Max. :20.00
# Column-wise summary of the second data set (same columns as `mat`).
summary(por)
## school sex age address
## Length:649 Length:649 Min. :15.00 Length:649
## Class :character Class :character 1st Qu.:16.00 Class :character
## Mode :character Mode :character Median :17.00 Mode :character
## Mean :16.74
## 3rd Qu.:18.00
## Max. :22.00
## famsize Pstatus Medu Fedu
## Length:649 Length:649 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:2.000 1st Qu.:1.000
## Mode :character Mode :character Median :2.000 Median :2.000
## Mean :2.515 Mean :2.307
## 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :4.000 Max. :4.000
## Mjob Fjob reason guardian
## Length:649 Length:649 Length:649 Length:649
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## traveltime studytime failures schoolsup
## Min. :1.000 Min. :1.000 Min. :0.0000 Length:649
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:0.0000 Class :character
## Median :1.000 Median :2.000 Median :0.0000 Mode :character
## Mean :1.569 Mean :1.931 Mean :0.2219
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:0.0000
## Max. :4.000 Max. :4.000 Max. :3.0000
## famsup paid activities nursery
## Length:649 Length:649 Length:649 Length:649
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## higher internet romantic famrel
## Length:649 Length:649 Length:649 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:4.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.931
## 3rd Qu.:5.000
## Max. :5.000
## freetime goout Dalc Walc health
## Min. :1.00 Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000
## 1st Qu.:3.00 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.00 1st Qu.:2.000
## Median :3.00 Median :3.000 Median :1.000 Median :2.00 Median :4.000
## Mean :3.18 Mean :3.185 Mean :1.502 Mean :2.28 Mean :3.536
## 3rd Qu.:4.00 3rd Qu.:4.000 3rd Qu.:2.000 3rd Qu.:3.00 3rd Qu.:5.000
## Max. :5.00 Max. :5.000 Max. :5.000 Max. :5.00 Max. :5.000
## absences G1 G2 G3
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.:10.0 1st Qu.:10.00 1st Qu.:10.00
## Median : 2.000 Median :11.0 Median :11.00 Median :12.00
## Mean : 3.659 Mean :11.4 Mean :11.57 Mean :11.91
## 3rd Qu.: 6.000 3rd Qu.:13.0 3rd Qu.:13.00 3rd Qu.:14.00
## Max. :32.000 Max. :19.0 Max. :19.00 Max. :19.00
# Reshape to long form (one row per variable/value pair) so that every
# variable can be drawn in its own facet. Columns of mixed type are coerced
# to a common type by the reshape.
mat_new <- gather(mat, key = "key", value = "value")

# Plot the long table directly. Gathering it a second time would fold the
# `key` and `value` columns together and leave only two facets ("key" and
# "value") instead of one per variable. `geom_bar()` counts each distinct
# value, which is what `geom_histogram(stat = "count")` did while warning
# that its binning parameters were being ignored.
ggplot(mat_new, aes(x = value)) +
  geom_bar() +
  facet_wrap(~key, scales = "free", ncol = 7)
# Reshape to long form (one row per variable/value pair) so that every
# variable can be drawn in its own facet. Columns of mixed type are coerced
# to a common type by the reshape.
por_new <- gather(por, key = "key", value = "value")

# Plot the long table directly. Gathering it a second time would fold the
# `key` and `value` columns together and leave only two facets ("key" and
# "value") instead of one per variable. `geom_bar()` counts each distinct
# value, which is what `geom_histogram(stat = "count")` did while warning
# that its binning parameters were being ignored.
ggplot(por_new, aes(x = value)) +
  geom_bar() +
  facet_wrap(~key, scales = "free", ncol = 7)
# Correlation plots of the numeric columns only: upper triangle, diagonal
# hidden, same ten-colour palette for both data sets.
corr_palette <- brewer.pal(n = 10, name = "Paired")

mat_numeric <- mat[, vapply(mat, is.numeric, logical(1))]
corrplot(
  cor(mat_numeric, use = "complete.obs"),
  type = "upper", diag = FALSE, col = corr_palette
)

por_numeric <- por[, vapply(por, is.numeric, logical(1))]
corrplot(
  cor(por_numeric, use = "complete.obs"),
  type = "upper", diag = FALSE, col = corr_palette
)
# One indicator column per categorical variable is treated as the baseline
# and dropped after encoding, so the remaining indicators are not redundant.
remove <- c(
  "school=GP", "sex=F", "address=U", "famsize=LE3", "Pstatus=T",
  "Mjob=at_home", "Fjob=at_home", "reason=course", "guardian=other",
  "schoolsup=no", "famsup=no", "paid=no", "activities=no", "nursery=no",
  "higher=no", "internet=no", "romantic=no"
)

# Encode the math data: build the encoder, apply it, drop baseline columns.
mat_encoder <- onehot(mat, stringsAsFactors = TRUE)
mat <- predict(mat_encoder, mat)
mat_keep <- !(colnames(mat) %in% remove)
mat <- mat[, mat_keep]

# Encode the por data the same way, with its own encoder.
por_encoder <- onehot(por, stringsAsFactors = TRUE)
por <- predict(por_encoder, por)
por_keep <- !(colnames(por) %in% remove)
por <- por[, por_keep]

# Inspect the first rows of the encoded math matrix.
head(mat)
## school=MS sex=M age address=R famsize=GT3 Pstatus=A Medu Fedu Mjob=health
## [1,] 0 0 18 0 1 1 4 4 0
## [2,] 0 0 17 0 1 0 1 1 0
## [3,] 0 0 15 0 0 0 1 1 0
## [4,] 0 0 15 0 1 0 4 2 1
## [5,] 0 0 16 0 1 0 3 3 0
## [6,] 0 1 16 0 0 0 4 3 0
## Mjob=other Mjob=services Mjob=teacher Fjob=health Fjob=other Fjob=services
## [1,] 0 0 0 0 0 0
## [2,] 0 0 0 0 1 0
## [3,] 0 0 0 0 1 0
## [4,] 0 0 0 0 0 1
## [5,] 1 0 0 0 1 0
## [6,] 0 1 0 0 1 0
## Fjob=teacher reason=home reason=other reason=reputation guardian=father
## [1,] 1 0 0 0 0
## [2,] 0 0 0 0 1
## [3,] 0 0 1 0 0
## [4,] 0 1 0 0 0
## [5,] 0 1 0 0 1
## [6,] 0 0 0 1 0
## guardian=mother traveltime studytime failures schoolsup=yes famsup=yes
## [1,] 1 2 2 0 1 0
## [2,] 0 1 2 0 0 1
## [3,] 1 1 2 3 1 0
## [4,] 1 1 3 0 0 1
## [5,] 0 1 2 0 0 1
## [6,] 1 1 2 0 0 1
## paid=yes activities=yes nursery=yes higher=yes internet=yes romantic=yes
## [1,] 0 0 1 1 0 0
## [2,] 0 0 0 1 1 0
## [3,] 1 0 1 1 1 0
## [4,] 1 1 1 1 1 1
## [5,] 1 0 1 1 0 0
## [6,] 1 1 1 1 1 0
## famrel freetime goout Dalc Walc health absences G1 G2 G3
## [1,] 4 3 4 1 1 3 6 5 6 6
## [2,] 5 3 3 1 1 3 4 5 5 6
## [3,] 4 3 2 2 3 3 10 7 8 10
## [4,] 3 2 2 1 1 5 2 15 14 15
## [5,] 4 3 2 1 2 5 4 6 10 10
## [6,] 5 4 2 1 2 5 10 15 15 15
# Inspect the first rows of the encoded `por` matrix.
head(por)
## school=MS sex=M age address=R famsize=GT3 Pstatus=A Medu Fedu Mjob=health
## [1,] 0 0 18 0 1 1 4 4 0
## [2,] 0 0 17 0 1 0 1 1 0
## [3,] 0 0 15 0 0 0 1 1 0
## [4,] 0 0 15 0 1 0 4 2 1
## [5,] 0 0 16 0 1 0 3 3 0
## [6,] 0 1 16 0 0 0 4 3 0
## Mjob=other Mjob=services Mjob=teacher Fjob=health Fjob=other Fjob=services
## [1,] 0 0 0 0 0 0
## [2,] 0 0 0 0 1 0
## [3,] 0 0 0 0 1 0
## [4,] 0 0 0 0 0 1
## [5,] 1 0 0 0 1 0
## [6,] 0 1 0 0 1 0
## Fjob=teacher reason=home reason=other reason=reputation guardian=father
## [1,] 1 0 0 0 0
## [2,] 0 0 0 0 1
## [3,] 0 0 1 0 0
## [4,] 0 1 0 0 0
## [5,] 0 1 0 0 1
## [6,] 0 0 0 1 0
## guardian=mother traveltime studytime failures schoolsup=yes famsup=yes
## [1,] 1 2 2 0 1 0
## [2,] 0 1 2 0 0 1
## [3,] 1 1 2 0 1 0
## [4,] 1 1 3 0 0 1
## [5,] 0 1 2 0 0 1
## [6,] 1 1 2 0 0 1
## paid=yes activities=yes nursery=yes higher=yes internet=yes romantic=yes
## [1,] 0 0 1 1 0 0
## [2,] 0 0 0 1 1 0
## [3,] 0 0 1 1 1 0
## [4,] 0 1 1 1 1 1
## [5,] 0 0 1 1 0 0
## [6,] 0 1 1 1 1 0
## famrel freetime goout Dalc Walc health absences G1 G2 G3
## [1,] 4 3 4 1 1 3 4 0 11 11
## [2,] 5 3 3 1 1 3 2 9 11 11
## [3,] 4 3 2 2 3 3 6 12 13 12
## [4,] 3 2 2 1 1 5 0 14 14 14
## [5,] 4 3 2 1 2 5 0 11 13 13
## [6,] 5 4 2 1 2 5 6 12 12 13
Split each dataset into training and test sets in an 80:20 ratio.
# Train/test split (80:20) for both data sets.
#
# `sample.split()` expects the vector of outcome labels and returns one
# TRUE/FALSE flag per element. Handing it the whole data frame produces one
# flag per *column*; that short vector is then recycled down the rows, so the
# rows are not drawn independently and the split is not a true 80:20. Passing
# `G3` gives one flag per row.
#
# NOTE: the row counts and model output recorded in the `##` lines of this
# file were produced with the earlier split and need to be regenerated.

# mat
mat <- as.data.frame(mat)
sample <- sample.split(mat$G3, SplitRatio = 0.8)
mat_train <- mat[sample, ]
mat_test <- mat[!sample, ]
# Copy of the test rows taken before any prediction columns are added.
mat_final_test <- mat_test

# por
por <- as.data.frame(por)
sample <- sample.split(por$G3, SplitRatio = 0.8)
por_train <- por[sample, ]
por_test <- por[!sample, ]
# Copy of the test rows taken before any prediction columns are added.
por_final_test <- por_test
# Row counts for mat: full data, training set, test set (in that order).
df_mat <- vapply(list(mat, mat_train, mat_test), nrow, integer(1))
print(df_mat)
## [1] 395 310 85
# Row counts for por: full data, training set, test set (in that order).
df_por <- vapply(list(por, por_train, por_test), nrow, integer(1))
print(df_por)
## [1] 649 511 138
MODEL 1 - Linear
# Start from the linear model of G3 on every other column of the math
# training set, then let stepAIC add and drop terms in both directions.
mat_lm_full <- lm(G3 ~ ., data = mat_train)
mat_stepAIC <- stepAIC(mat_lm_full, direction = "both", trace = FALSE)
summary(mat_stepAIC)
##
## Call:
## lm(formula = G3 ~ `school=MS` + age + `activities=yes` + famrel +
## absences + G1 + G2, data = mat_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.5092 -0.4793 0.2797 1.0859 3.5641
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.02171 1.65461 1.222 0.222713
## `school=MS` 0.86791 0.37042 2.343 0.019775 *
## age -0.33471 0.09431 -3.549 0.000448 ***
## `activities=yes` -0.46345 0.21538 -2.152 0.032204 *
## famrel 0.39598 0.11779 3.362 0.000874 ***
## absences 0.05176 0.01377 3.758 0.000206 ***
## G1 0.16650 0.06462 2.577 0.010449 *
## G2 0.97267 0.05722 16.998 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.874 on 302 degrees of freedom
## Multiple R-squared: 0.8449, Adjusted R-squared: 0.8413
## F-statistic: 235 on 7 and 302 DF, p-value: < 2.2e-16
# Score the stepwise linear model on the held-out math rows.
mat_test$lm_G3 <- predict(mat_stepAIC, newdata = mat_test, type = "response")

# Compute both metrics on the same, unmodified predictions. `as.integer()`
# truncates toward zero (10.9 becomes 10), which pushes every prediction low
# before it is scored, and previously only RMSE was computed that way.
mat_rmse_lm <- RMSE(mat_test$lm_G3, mat_test$G3)
mat_r2_lm <- R2(mat_test$lm_G3, mat_test$G3)

# Append the result row; the label no longer carries a stray leading space.
mat_df <- rbind(
  mat_df,
  c('LM Model', round(mat_rmse_lm, 3), round(mat_r2_lm, 3)),
  stringsAsFactors = FALSE
)
# Start from the linear model of G3 on every other column of the por
# training set, then let stepAIC add and drop terms in both directions.
por_lm_full <- lm(G3 ~ ., data = por_train)
por_stepAIC <- stepAIC(por_lm_full, direction = "both", trace = FALSE)
summary(por_stepAIC)
##
## Call:
## lm(formula = G3 ~ `address=R` + `Mjob=other` + `Fjob=services` +
## `reason=other` + `guardian=father` + `guardian=mother` +
## traveltime + failures + `schoolsup=yes` + `higher=yes` +
## Walc + absences + G1 + G2, data = por_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.7354 -0.4908 0.0046 0.5559 5.6645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.53740 0.41399 1.298 0.19485
## `address=R` -0.28519 0.12955 -2.201 0.02817 *
## `Mjob=other` -0.17216 0.11382 -1.513 0.13104
## `Fjob=services` -0.22009 0.12710 -1.732 0.08396 .
## `reason=other` -0.29851 0.18231 -1.637 0.10218
## `guardian=father` -0.38023 0.25047 -1.518 0.12963
## `guardian=mother` -0.45029 0.23174 -1.943 0.05257 .
## traveltime 0.18304 0.08049 2.274 0.02339 *
## failures -0.27084 0.10528 -2.573 0.01038 *
## `schoolsup=yes` -0.30984 0.19511 -1.588 0.11292
## `higher=yes` 0.30935 0.19378 1.596 0.11104
## Walc -0.08220 0.04392 -1.871 0.06187 .
## absences 0.02083 0.01214 1.715 0.08690 .
## G1 0.11044 0.04095 2.697 0.00724 **
## G2 0.89734 0.03786 23.699 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.236 on 496 degrees of freedom
## Multiple R-squared: 0.8616, Adjusted R-squared: 0.8577
## F-statistic: 220.5 on 14 and 496 DF, p-value: < 2.2e-16
# Score the stepwise linear model on the held-out por rows.
por_test$lm_G3 <- predict(por_stepAIC, newdata = por_test, type = "response")

# Compute both metrics on the same, unmodified predictions. `as.integer()`
# truncates toward zero (10.9 becomes 10), which pushes every prediction low
# before it is scored, and previously only RMSE was computed that way.
por_rmse_lm <- RMSE(por_test$lm_G3, por_test$G3)
por_r2_lm <- R2(por_test$lm_G3, por_test$G3)

# Append the result row to the por comparison table.
por_df <- rbind(
  por_df,
  c('LM Model', round(por_rmse_lm, 3), round(por_r2_lm, 3)),
  stringsAsFactors = FALSE
)
# Diagnostic plots for each stepwise linear fit, laid out on a 2x2 grid
# (math model first, then por).
invisible(lapply(
  list(mat_stepAIC, por_stepAIC),
  function(lm_fit) {
    par(mfrow = c(2, 2))
    plot(lm_fit)
  }
))
MODEL 2 - Gaussian
# Gaussian GLM of G3 on every other column, reduced by stepwise AIC selection.
mat_glm <- glm(G3 ~ ., data = mat_train, family = gaussian)
mat_glm <- stepAIC(mat_glm, direction = "both", trace = FALSE)

# Predict only after selection so that the model being scored is the same
# model that is summarised; previously the unreduced model was scored.
mat_test$glm_G3 <- predict(mat_glm, newdata = mat_test, type = "response")
summary(mat_glm)
##
## Call:
## glm(formula = G3 ~ `school=MS` + age + `activities=yes` + famrel +
## absences + G1 + G2, family = gaussian, data = mat_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.5092 -0.4793 0.2797 1.0859 3.5641
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.02171 1.65461 1.222 0.222713
## `school=MS` 0.86791 0.37042 2.343 0.019775 *
## age -0.33471 0.09431 -3.549 0.000448 ***
## `activities=yes` -0.46345 0.21538 -2.152 0.032204 *
## famrel 0.39598 0.11779 3.362 0.000874 ***
## absences 0.05176 0.01377 3.758 0.000206 ***
## G1 0.16650 0.06462 2.577 0.010449 *
## G2 0.97267 0.05722 16.998 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 3.510145)
##
## Null deviance: 6835.0 on 309 degrees of freedom
## Residual deviance: 1060.1 on 302 degrees of freedom
## AIC: 1278.9
##
## Number of Fisher Scoring iterations: 2
# Score the Gaussian GLM predictions as they are. Wrapping them in
# `as.integer()` truncates toward zero (10.9 becomes 10) and so biases every
# prediction low before the error is measured.
mat_rmse_glm <- RMSE(mat_test$glm_G3, mat_test$G3)
mat_r2_glm <- R2(mat_test$glm_G3, mat_test$G3)

# Append the result row to the math comparison table.
mat_df <- rbind(
  mat_df,
  c('GLM Model', round(mat_rmse_glm, 3), round(mat_r2_glm, 3)),
  stringsAsFactors = FALSE
)
# Gaussian GLM of G3 on every other column, reduced by stepwise AIC selection.
por_glm <- glm(G3 ~ ., data = por_train, family = gaussian)
por_glm <- stepAIC(por_glm, direction = "both", trace = FALSE)

# Predict only after selection so that the model being scored is the same
# model that is summarised; previously the unreduced model was scored.
por_test$glm_G3 <- predict(por_glm, newdata = por_test, type = "response")
summary(por_glm)
##
## Call:
## glm(formula = G3 ~ `address=R` + `Mjob=other` + `Fjob=services` +
## `reason=other` + `guardian=father` + `guardian=mother` +
## traveltime + failures + `schoolsup=yes` + `higher=yes` +
## Walc + absences + G1 + G2, family = gaussian, data = por_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.7354 -0.4908 0.0046 0.5559 5.6645
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.53740 0.41399 1.298 0.19485
## `address=R` -0.28519 0.12955 -2.201 0.02817 *
## `Mjob=other` -0.17216 0.11382 -1.513 0.13104
## `Fjob=services` -0.22009 0.12710 -1.732 0.08396 .
## `reason=other` -0.29851 0.18231 -1.637 0.10218
## `guardian=father` -0.38023 0.25047 -1.518 0.12963
## `guardian=mother` -0.45029 0.23174 -1.943 0.05257 .
## traveltime 0.18304 0.08049 2.274 0.02339 *
## failures -0.27084 0.10528 -2.573 0.01038 *
## `schoolsup=yes` -0.30984 0.19511 -1.588 0.11292
## `higher=yes` 0.30935 0.19378 1.596 0.11104
## Walc -0.08220 0.04392 -1.871 0.06187 .
## absences 0.02083 0.01214 1.715 0.08690 .
## G1 0.11044 0.04095 2.697 0.00724 **
## G2 0.89734 0.03786 23.699 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.527285)
##
## Null deviance: 5471.87 on 510 degrees of freedom
## Residual deviance: 757.53 on 496 degrees of freedom
## AIC: 1683.3
##
## Number of Fisher Scoring iterations: 2
# Score the Gaussian GLM predictions as they are. Wrapping them in
# `as.integer()` truncates toward zero (10.9 becomes 10) and so biases every
# prediction low before the error is measured.
por_rmse_glm <- RMSE(por_test$glm_G3, por_test$G3)
por_r2_glm <- R2(por_test$glm_G3, por_test$G3)

# Append the result row to the por comparison table.
por_df <- rbind(
  por_df,
  c('GLM Model', round(por_rmse_glm, 3), round(por_r2_glm, 3)),
  stringsAsFactors = FALSE
)
# Diagnostic plots for each stepwise Gaussian GLM, laid out on a 2x2 grid
# (math model first, then por).
invisible(lapply(
  list(mat_glm, por_glm),
  function(glm_fit) {
    par(mfrow = c(2, 2))
    plot(glm_fit)
  }
))
MODEL 3 - Poisson
# Poisson GLM of G3 on every other column, reduced by stepwise AIC selection.
mat_pois <- glm(G3 ~ ., data = mat_train, family = poisson)
mat_pois <- stepAIC(mat_pois, direction = "both", trace = FALSE)

# `predict()` on a GLM defaults to the link scale, which for the Poisson
# family is log(G3); comparing that with raw grades inflates the error.
# Request the response scale, and predict from the selected model so the
# scored model matches the summary.
mat_test$glm_G3_pois <- predict(
  mat_pois,
  newdata = mat_test,
  type = "response"
)
summary(mat_pois)
##
## Call:
## glm(formula = G3 ~ `school=MS` + failures + `schoolsup=yes` +
## famrel + goout + Walc + absences + G1 + G2, family = poisson,
## data = mat_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.0619 -0.2330 0.1673 0.4956 1.4665
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.816069 0.130894 6.235 4.53e-10 ***
## `school=MS` 0.117340 0.059360 1.977 0.04807 *
## failures -0.061129 0.033009 -1.852 0.06404 .
## `schoolsup=yes` 0.087165 0.057492 1.516 0.12949
## famrel 0.046037 0.019728 2.334 0.01962 *
## goout -0.030011 0.019253 -1.559 0.11906
## Walc 0.028753 0.015984 1.799 0.07204 .
## absences 0.007101 0.002181 3.256 0.00113 **
## G1 -0.029678 0.012913 -2.298 0.02154 *
## G2 0.143832 0.012436 11.566 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 964.94 on 309 degrees of freedom
## Residual deviance: 358.80 on 300 degrees of freedom
## AIC: 1562
##
## Number of Fisher Scoring iterations: 5
# Score the Poisson predictions as they are: `as.integer()` truncates toward
# zero and biases every prediction low. The metrics get their own names so
# the Gaussian values in `mat_rmse_glm` / `mat_r2_glm` are not overwritten.
mat_rmse_pois <- RMSE(mat_test$glm_G3_pois, mat_test$G3)
mat_r2_pois <- R2(mat_test$glm_G3_pois, mat_test$G3)

# Append the result row to the math comparison table.
mat_df <- rbind(
  mat_df,
  c('GLM - Poisson Model', round(mat_rmse_pois, 3), round(mat_r2_pois, 3)),
  stringsAsFactors = FALSE
)
# Poisson GLM of G3 on every other column, reduced by stepwise AIC selection.
por_pois <- glm(G3 ~ ., data = por_train, family = poisson)
por_pois <- stepAIC(por_pois, direction = "both", trace = FALSE)

# `predict()` on a GLM defaults to the link scale, which for the Poisson
# family is log(G3); comparing that with raw grades inflates the error.
# Request the response scale, and predict from the selected model so the
# scored model matches the summary.
por_test$glm_G3_pois <- predict(
  por_pois,
  newdata = por_test,
  type = "response"
)
summary(por_pois)
##
## Call:
## glm(formula = G3 ~ failures + G2, family = poisson, data = por_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.1447 -0.0992 -0.0102 0.2679 1.6973
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.456864 0.062531 23.298 <2e-16 ***
## failures -0.039794 0.027494 -1.447 0.148
## G2 0.085915 0.004794 17.920 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 601.0 on 510 degrees of freedom
## Residual deviance: 201.6 on 508 degrees of freedom
## AIC: 2370.6
##
## Number of Fisher Scoring iterations: 4
# Score the Poisson predictions as they are: `as.integer()` truncates toward
# zero and biases every prediction low. The metrics get their own names so
# the Gaussian values in `por_rmse_glm` / `por_r2_glm` are not overwritten.
por_rmse_pois <- RMSE(por_test$glm_G3_pois, por_test$G3)
por_r2_pois <- R2(por_test$glm_G3_pois, por_test$G3)

# Append the result row to the por comparison table.
por_df <- rbind(
  por_df,
  c('GLM - Poisson Model', round(por_rmse_pois, 3), round(por_r2_pois, 3)),
  stringsAsFactors = FALSE
)
# Diagnostic plots for each stepwise Poisson GLM, laid out on a 2x2 grid
# (math model first, then por).
invisible(lapply(
  list(mat_pois, por_pois),
  function(pois_fit) {
    par(mfrow = c(2, 2))
    plot(pois_fit)
  }
))
# Column names shared by both model-comparison tables.
x <- c("Model", "RMSE", "R2")
colnames(mat_df) <- x

# Each row was appended as a character vector (label plus two numbers), so the
# metrics are stored as text. Convert them back so they print with consistent
# precision and can be sorted or compared numerically.
mat_df[c("RMSE", "R2")] <- lapply(mat_df[c("RMSE", "R2")], as.numeric)
mat_df
## Model RMSE R2
## 1 LM Model 2 0.788
## 2 GLM Model 2.041 0.775
## 3 GLM - Poisson Model 9.645 0.554
# Reuse the shared column names held in `x` for the por comparison table.
colnames(por_df) <- x

# Each row was appended as a character vector (label plus two numbers), so the
# metrics are stored as text. Convert them back so they print with consistent
# precision and can be sorted or compared numerically.
por_df[c("RMSE", "R2")] <- lapply(por_df[c("RMSE", "R2")], as.numeric)
por_df
## Model RMSE R2
## 1 LM Model 1.409 0.822
## 2 GLM Model 1.412 0.825
## 3 GLM - Poisson Model 9.984 0.309
# Math: replace the observed G3 in the saved test rows with the linear-model
# prediction, then write the result to disk.
mat_final_test <- select(mat_final_test, -G3)
mat_final_test$G3 <- mat_test$lm_G3
write.csv(mat_final_test, file = 'DATA621_FinalProject_Mat.csv')

# Por: same replacement and export.
por_final_test <- select(por_final_test, -G3)
por_final_test$G3 <- por_test$lm_G3
write.csv(por_final_test, file = 'DATA621_FinalProject_Por.csv')