library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.1 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Read the semicolon-delimited student data; the first row holds column names.
d1 <- read.table(file = "student-mat.csv", header = TRUE, sep = ";")
#view(d1)
#str(d1) -- 395 participants
#Regression
#A: My response variable is going to be final GPA (G3). This variable is measured on a scale of 0-20 GPA points, with higher GPAs = higher academic performance
#Numeric predictor variable: study hours on a scale of 1-4 (1 = low amount of time spent studying, 4 = high amount of time spent studying)
#Categorical explanatory variables: family size (binary -- GT3 = more than 3 people in the family, LE3 = 3 or fewer) -- will include gender in the final analysis
#B:
# Simple linear regression of final grade (G3) on study time.
SLRGPAstudyhours <- lm(formula = G3 ~ studytime, data = d1)
summary(object = SLRGPAstudyhours)
##
## Call:
## lm(formula = G3 ~ studytime, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.4643 -1.8623 0.5357 3.0697 9.1377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.3283 0.6033 15.463 <2e-16 ***
## studytime 0.5340 0.2741 1.949 0.0521 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.565 on 393 degrees of freedom
## Multiple R-squared: 0.009569, Adjusted R-squared: 0.007049
## F-statistic: 3.797 on 1 and 393 DF, p-value: 0.05206
# Scatter plot of G3 against study time with a least-squares line (no SE band).
ggplot(data = d1, mapping = aes(x = studytime, y = G3)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#study time is only marginally significant -- just above the 0.05 cutoff (p = 0.052)
#C:
# Factor copy of famsize with GT3 listed first (the reference level), followed
# by the dummy coding R assigns to it.
d1[["famsize1"]] <- factor(d1[["famsize"]], levels = c("GT3", "LE3"))
contrasts(d1[["famsize1"]]) # run on gender also
## LE3
## GT3 0
## LE3 1
#D:
# Regression of G3 on family size alone.
modwithFamSize <- lm(formula = G3 ~ famsize, data = d1)
summary(object = modwithFamSize)
##
## Call:
## lm(formula = G3 ~ famsize, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.0000 -2.1779 0.8221 3.0000 9.8221
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.1779 0.2727 37.317 <2e-16 ***
## famsizeLE3 0.8221 0.5077 1.619 0.106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.572 on 393 degrees of freedom
## Multiple R-squared: 0.006627, Adjusted R-squared: 0.004099
## F-statistic: 2.622 on 1 and 393 DF, p-value: 0.1062
# ANOVA table for the family-size-only model.
anova(object = modwithFamSize)
## Analysis of Variance Table
##
## Response: G3
## Df Sum Sq Mean Sq F value Pr(>F)
## famsize 1 54.8 54.806 2.6218 0.1062
## Residuals 393 8215.1 20.904
#There is no significant difference between these two levels
# Box plots of G3 for each family-size level.
ggplot(data = d1, mapping = aes(x = famsize, y = G3)) +
  geom_boxplot()

#E:
# Additive model: G3 on study time plus family size (no interaction).
modFamsizeandstudytime <- lm(formula = G3 ~ studytime + famsize, data = d1)
summary(object = modFamsizeandstudytime)
##
## Call:
## lm(formula = G3 ~ studytime + famsize, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.1746 -2.0350 0.2949 3.2949 8.7251
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.9958 0.6301 14.276 <2e-16 ***
## studytime 0.5698 0.2740 2.079 0.0383 *
## famsizeLE3 0.8996 0.5069 1.775 0.0767 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.553 on 392 degrees of freedom
## Multiple R-squared: 0.01746, Adjusted R-squared: 0.01245
## F-statistic: 3.483 on 2 and 392 DF, p-value: 0.03165
# Sequential ANOVA table for the additive model.
anova(object = modFamsizeandstudytime)
## Analysis of Variance Table
##
## Response: G3
## Df Sum Sq Mean Sq F value Pr(>F)
## studytime 1 79.1 79.132 3.8176 0.05143 .
## famsize 1 65.3 65.281 3.1494 0.07673 .
## Residuals 392 8125.5 20.728
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatter plot of G3 against study time with two reference lines:
#   * geom_smooth(): the pooled G3 ~ studytime least-squares fit over all rows;
#   * red geom_abline(): the additive model's fitted line for the LE3 group,
#     i.e. intercept = (Intercept) + famsizeLE3, slope = studytime.
# The red line is built from the fitted coefficients instead of hand-copied
# numbers, so it always matches modFamsizeandstudytime (the hard-coded
# intercept used before, 9.8956, did not equal 8.9958 + 0.8996 = 9.8954).
additive_coefs <- coef(modFamsizeandstudytime)
le3_intercept <- additive_coefs[["(Intercept)"]] +
  additive_coefs[["famsizeLE3"]]
ggplot(d1, aes(studytime, G3)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  geom_abline(
    slope = additive_coefs[["studytime"]],
    intercept = le3_intercept,
    color = "firebrick1"
  )
## `geom_smooth()` using formula 'y ~ x'

#GT3(0):Yhat = 8.9958 + 0.5698x
#LE3(1): Yhat = (8.9958 + 0.8996) + 0.5698x = 9.8954 + 0.5698x
#F:
# Interaction model: study time, family size, and their product term.
modinteraction <- lm(formula = G3 ~ studytime * famsize, data = d1)
summary(object = modinteraction)
##
## Call:
## lm(formula = G3 ~ studytime * famsize, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.3369 -2.0028 0.2263 3.1621 9.0980
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.3976 0.7124 13.192 <2e-16 ***
## studytime 0.3761 0.3175 1.185 0.237
## famsizeLE3 -0.5953 1.3385 -0.445 0.657
## studytime:famsizeLE3 0.7575 0.6278 1.207 0.228
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.55 on 391 degrees of freedom
## Multiple R-squared: 0.02111, Adjusted R-squared: 0.0136
## F-statistic: 2.81 on 3 and 391 DF, p-value: 0.03927
# Sequential ANOVA table for the interaction model.
anova(object = modinteraction)
## Analysis of Variance Table
##
## Response: G3
## Df Sum Sq Mean Sq F value Pr(>F)
## studytime 1 79.1 79.132 3.8220 0.05130 .
## famsize 1 65.3 65.281 3.1530 0.07656 .
## studytime:famsize 1 30.1 30.141 1.4558 0.22833
## Residuals 391 8095.4 20.704
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatter plot coloured by family size, with a separate least-squares line
# for each family-size level.
ggplot(data = d1, mapping = aes(x = studytime, y = G3, color = famsize)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#GT3(0): Yhat = 9.3976 + 0.3761x
#LE3(1): Yhat = (9.3976 - 0.5953) + (0.3761 + 0.7575)x = 8.8023 + 1.134x
#G:
#SLRGPAstudyhours has an MSE of 20.7361
# Store the simple-regression summary (its residuals are reused for the MSE)
# and display it.
mod_sum1 <- summary(object = SLRGPAstudyhours)
mod_sum1
##
## Call:
## lm(formula = G3 ~ studytime, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.4643 -1.8623 0.5357 3.0697 9.1377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.3283 0.6033 15.463 <2e-16 ***
## studytime 0.5340 0.2741 1.949 0.0521 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.565 on 393 degrees of freedom
## Multiple R-squared: 0.009569, Adjusted R-squared: 0.007049
## F-statistic: 3.797 on 1 and 393 DF, p-value: 0.05206
# Mean of the squared residuals for the simple regression.
mean(mod_sum1[["residuals"]]^2)
## [1] 20.73614
#modwithFamSize has an MSE of 20.79773
# Mean of the squared residuals for the family-size-only model.
mod_sum2 <- summary(object = modwithFamSize)
mean(mod_sum2[["residuals"]]^2)
## [1] 20.79773
#modFamsizeandstudytime has an MSE of 20.57088
# Mean of the squared residuals for the additive model.
mod_sum3 <- summary(object = modFamsizeandstudytime)
mean(mod_sum3[["residuals"]]^2)
## [1] 20.57088
#modinteraction has an MSE of 20.49457
# Mean of the squared residuals for the interaction model.
mod_sum4 <- summary(object = modinteraction)
mean(mod_sum4[["residuals"]]^2)
## [1] 20.49457
#H:
#The relationship in the SLR (study hours vs GPA) model was almost significant (t=1.95, p=0.052)
#The relationship between GPA and family size is not significant (t=1.619, p=0.106)
#The relationship between study hours and GPA when controlling for family size is significant (t=2.079, p=0.0383). Famsize becomes marginally significant when study hours are controlled for (p=0.0767)
#The interaction between study hours and famsize is not significant (t=1.207, p=0.228)
#The interaction model had the lowest MSE but modFamsizeandstudytime was the most significant
#Interpretation: GPA is somewhat positively related to study hours, but not to the size of the student's family. Study time becomes less effective if the family is bigger (lower GPAs on average), though the interaction itself is not significant.
#Final analysis: Gender Component
# Regression of G3 on sex alone.
modgender1 <- lm(formula = G3 ~ sex, data = d1)
summary(object = modgender1)
##
## Call:
## lm(formula = G3 ~ sex, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9144 -1.9663 0.0856 3.0856 9.0856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.9663 0.3164 31.503 <2e-16 ***
## sexM 0.9481 0.4598 2.062 0.0399 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.563 on 393 degrees of freedom
## Multiple R-squared: 0.0107, Adjusted R-squared: 0.008186
## F-statistic: 4.252 on 1 and 393 DF, p-value: 0.03987
# Box plots of G3 for each sex.
ggplot(data = d1, mapping = aes(x = sex, y = G3)) +
  geom_boxplot()

#Boys tend to get higher grades than girls
# Regression of study time on sex.
modgender2 <- lm(formula = studytime ~ sex, data = d1)
summary(object = modgender2)
##
## Call:
## lm(formula = studytime ~ sex, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2789 -0.7647 -0.2788 0.2353 2.2353
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.27885 0.05546 41.086 < 2e-16 ***
## sexM -0.51414 0.08061 -6.378 5.05e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7999 on 393 degrees of freedom
## Multiple R-squared: 0.0938, Adjusted R-squared: 0.09149
## F-statistic: 40.68 on 1 and 393 DF, p-value: 5.045e-10
#Boys study less than girls
# Box plots of study time for each sex.
ggplot(data = d1, mapping = aes(x = sex, y = studytime)) +
  geom_boxplot()

#Does the relationship between study hours and GPA differ between boys and girls for large vs small family sizes?
# G3 against study time, coloured by sex, one panel per family size, with a
# least-squares line for each sex inside each panel.
ggplot(data = d1, mapping = aes(x = studytime, y = G3, color = sex)) +
  geom_point() +
  facet_wrap(~famsize) +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

#study time and GPAmath seem to be related in the same way for both genders in smaller families but related differently by gender in bigger families
# Additive model: G3 on study time plus sex.
modgender3 <- lm(formula = G3 ~ studytime + sex, data = d1)
summary(object = modgender3)
##
## Call:
## lm(formula = G3 ~ studytime + sex, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.6583 -2.0980 0.2512 3.2512 9.0313
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.1885 0.7221 11.340 < 2e-16 ***
## studytime 0.7801 0.2854 2.734 0.00655 **
## sexM 1.3492 0.4791 2.816 0.00510 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.526 on 392 degrees of freedom
## Multiple R-squared: 0.02921, Adjusted R-squared: 0.02426
## F-statistic: 5.898 on 2 and 392 DF, p-value: 0.002996
# Interaction model: study time, sex, and their product term.
modgenderinteraction1 <- lm(formula = G3 ~ studytime * sex, data = d1)
summary(object = modgenderinteraction1)
##
## Call:
## lm(formula = G3 ~ studytime * sex, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.1054 -2.1451 0.1989 3.1989 8.8351
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.6156 0.9586 8.987 <2e-16 ***
## studytime 0.5927 0.3975 1.491 0.137
## sexM 0.5691 1.2465 0.457 0.648
## studytime:sexM 0.3874 0.5715 0.678 0.498
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.529 on 391 degrees of freedom
## Multiple R-squared: 0.03035, Adjusted R-squared: 0.02291
## F-statistic: 4.08 on 3 and 391 DF, p-value: 0.00716
#significant without the interaction, but not significant with the interaction
# Additive model with all three predictors: study time, sex, family size.
modgender4 <- lm(formula = G3 ~ studytime + sex + famsize, data = d1)
summary(object = modgender4)
##
## Call:
## lm(formula = G3 ~ studytime + sex + famsize, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.4414 -2.0347 0.4573 3.1631 9.2603
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.9374 0.7377 10.760 < 2e-16 ***
## studytime 0.8022 0.2852 2.813 0.00515 **
## sexM 1.2951 0.4793 2.702 0.00720 **
## famsizeLE3 0.8030 0.5042 1.593 0.11202
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.517 on 391 degrees of freedom
## Multiple R-squared: 0.03547, Adjusted R-squared: 0.02807
## F-statistic: 4.793 on 3 and 391 DF, p-value: 0.002723
#everything significant except famsize
# Study time x sex interaction, with family size as an additive term.
modgenderinteraction2 <- lm(formula = G3 ~ studytime * sex + famsize, data = d1)
summary(object = modgenderinteraction2)
##
## Call:
## lm(formula = G3 ~ studytime * sex + famsize, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.9165 -1.8920 0.3025 3.1326 9.0071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.3908 0.9668 8.679 <2e-16 ***
## studytime 0.6021 0.3967 1.518 0.130
## sexM 0.4603 1.2457 0.369 0.712
## famsizeLE3 0.8137 0.5047 1.612 0.108
## studytime:sexM 0.4142 0.5705 0.726 0.468
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.519 on 390 degrees of freedom
## Multiple R-squared: 0.03677, Adjusted R-squared: 0.02689
## F-statistic: 3.722 on 4 and 390 DF, p-value: 0.005489
#insignificant
# Additive model: G3 on sex plus family size.
modgender5 <- lm(formula = G3 ~ sex + famsize, data = d1)
summary(object = modgender5)
##
## Call:
## lm(formula = G3 ~ sex + famsize, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.405 -1.783 0.329 3.217 9.329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.7828 0.3405 28.731 <2e-16 ***
## sexM 0.8882 0.4610 1.927 0.0547 .
## famsizeLE3 0.7341 0.5080 1.445 0.1492
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.556 on 392 degrees of freedom
## Multiple R-squared: 0.01595, Adjusted R-squared: 0.01092
## F-statistic: 3.176 on 2 and 392 DF, p-value: 0.04283
# Interaction model: sex, family size, and their product term.
modgenderinteraction3 <- lm(formula = G3 ~ sex * famsize, data = d1)
summary(object = modgenderinteraction3)
##
## Call:
## lm(formula = G3 ~ sex * famsize, data = d1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.0161 -1.9808 0.3718 3.1360 9.3718
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6282 0.3646 26.406 <2e-16 ***
## sexM 1.2358 0.5467 2.261 0.0243 *
## famsizeLE3 1.3526 0.7292 1.855 0.0644 .
## sexM:famsizeLE3 -1.2004 1.0160 -1.182 0.2381
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.554 on 391 degrees of freedom
## Multiple R-squared: 0.01945, Adjusted R-squared: 0.01192
## F-statistic: 2.585 on 3 and 391 DF, p-value: 0.05291
#famsize becomes marginally significant
# Box plots of G3 by sex, one panel per family size, with a straight line
# joining the two sexes inside each panel.
# geom_smooth() is given aes(group = 1) because, on a discrete x axis, ggplot2
# groups by x value by default; each group then has a single x position, no
# line can be fitted, and the layer draws nothing. Grouping all rows of a panel
# together lets the "lm" line run across the two sexes.
ggplot(d1, aes(sex, G3)) +
  geom_boxplot() +
  geom_smooth(aes(group = 1), method = "lm", se = FALSE) +
  facet_wrap(~famsize)
## `geom_smooth()` using formula 'y ~ x'
