library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.1     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Load the student math data: a semicolon-separated file with a header row,
# looked up relative to the current working directory.
d1 <- read.table("student-mat.csv", sep = ";", header = TRUE)

#view(d1)
#str(d1) -- 395 participants 

#Regression 
  #A: My response variable is going to be final GPA (G3). In this data it is measured on a scale of 0-20 points, with higher GPAs = higher academic performance 
      #Numeric predictor variable: study hours, coded 1-4 in this data (1 = least time spent studying, 4 = most time spent studying)
      #Categorical explanatory variable: family size (binary -- GT3 = more than 3 people in the family, LE3 = 3 or fewer) -- will include gender in the final analysis 
  #B:  
# Simple linear regression: G3 explained by studytime alone.
SLRGPAstudyhours <- lm(formula = G3 ~ studytime, data = d1)
# Print the coefficient table and fit statistics.
summary(object = SLRGPAstudyhours)
## 
## Call:
## lm(formula = G3 ~ studytime, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.4643  -1.8623   0.5357   3.0697   9.1377 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.3283     0.6033  15.463   <2e-16 ***
## studytime     0.5340     0.2741   1.949   0.0521 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.565 on 393 degrees of freedom
## Multiple R-squared:  0.009569,   Adjusted R-squared:  0.007049 
## F-statistic: 3.797 on 1 and 393 DF,  p-value: 0.05206
  # Scatter of G3 against studytime, overlaid with the least-squares line
  # (confidence band switched off).
  ggplot(data = d1, mapping = aes(x = studytime, y = G3)) +
    geom_point() +
    geom_smooth(se = FALSE, method = "lm")
## `geom_smooth()` using formula 'y ~ x'

    #study time is not significant at the 0.05 level, but it is very close (marginal, p = 0.052) 
  #C:
# Factor copy of famsize with the level order spelled out, so GT3 is the
# reference (first) level.
d1[["famsize1"]] <- factor(d1[["famsize"]], levels = c("GT3", "LE3"))
# Display the dummy coding R uses for that factor.
contrasts(d1[["famsize1"]]) #run on gender also
##     LE3
## GT3   0
## LE3   1
  #D:
# Regress G3 on family size alone. The model is fitted on the raw famsize
# column (not famsize1), so the coefficient is reported as famsizeLE3.
modwithFamSize <- lm(formula = G3 ~ famsize, data = d1)
summary(object = modwithFamSize)
## 
## Call:
## lm(formula = G3 ~ famsize, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0000  -2.1779   0.8221   3.0000   9.8221 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.1779     0.2727  37.317   <2e-16 ***
## famsizeLE3    0.8221     0.5077   1.619    0.106    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.572 on 393 degrees of freedom
## Multiple R-squared:  0.006627,   Adjusted R-squared:  0.004099 
## F-statistic: 2.622 on 1 and 393 DF,  p-value: 0.1062
# ANOVA table for the family-size-only model.
anova(object = modwithFamSize)
## Analysis of Variance Table
## 
## Response: G3
##            Df Sum Sq Mean Sq F value Pr(>F)
## famsize     1   54.8  54.806  2.6218 0.1062
## Residuals 393 8215.1  20.904
    #There is no significant difference between these two levels
  # Side-by-side boxplots of G3 for the two family-size groups.
  ggplot(data = d1, mapping = aes(x = famsize, y = G3)) +
    geom_boxplot()

  #E: 
# Additive model: studytime and famsize as main effects (no interaction),
# i.e. one common studytime slope with a separate intercept per family size.
modFamsizeandstudytime <- lm(formula = G3 ~ studytime + famsize, data = d1)
summary(object = modFamsizeandstudytime)
## 
## Call:
## lm(formula = G3 ~ studytime + famsize, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.1746  -2.0350   0.2949   3.2949   8.7251 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.9958     0.6301  14.276   <2e-16 ***
## studytime     0.5698     0.2740   2.079   0.0383 *  
## famsizeLE3    0.8996     0.5069   1.775   0.0767 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.553 on 392 degrees of freedom
## Multiple R-squared:  0.01746,    Adjusted R-squared:  0.01245 
## F-statistic: 3.483 on 2 and 392 DF,  p-value: 0.03165
# ANOVA table for the additive model; terms are listed in formula order
# (studytime first, then famsize).
anova(object = modFamsizeandstudytime)
## Analysis of Variance Table
## 
## Response: G3
##            Df Sum Sq Mean Sq F value  Pr(>F)  
## studytime   1   79.1  79.132  3.8176 0.05143 .
## famsize     1   65.3  65.281  3.1494 0.07673 .
## Residuals 392 8125.5  20.728                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  # Scatter of G3 against studytime with two fitted lines:
  #   - geom_smooth: the pooled simple regression of G3 on studytime
  #   - red geom_abline: the additive model's line for famsize = LE3
  # The red line is derived from the fitted coefficients rather than typed in
  # by hand. The previously hard-coded intercept (9.8956) did not equal
  # 8.9958 + 0.8996 = 9.8954, and rounded constants go stale if the model or
  # the data change.
  ggplot(d1, aes(studytime, G3)) +
    geom_point() +
    geom_smooth(method = "lm", se = FALSE) +
    geom_abline(
      # Common studytime slope shared by both family-size groups.
      slope = coef(modFamsizeandstudytime)[["studytime"]],
      # LE3 intercept = baseline (GT3) intercept + LE3 offset.
      intercept = sum(
        coef(modFamsizeandstudytime)[c("(Intercept)", "famsizeLE3")]
      ),
      color = "firebrick1"
    )
## `geom_smooth()` using formula 'y ~ x'

      #GT3(0): Yhat = 8.9958 + 0.5698x
      #LE3(1): Yhat = (8.9958 + 0.8996) + 0.5698x = 9.8954 + 0.5698x
  #F: 
# Interaction model: studytime, famsize and their product term, which lets
# the studytime slope differ between the two family-size groups.
modinteraction <- lm(formula = G3 ~ studytime * famsize, data = d1)
summary(object = modinteraction)
## 
## Call:
## lm(formula = G3 ~ studytime * famsize, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.3369  -2.0028   0.2263   3.1621   9.0980 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            9.3976     0.7124  13.192   <2e-16 ***
## studytime              0.3761     0.3175   1.185    0.237    
## famsizeLE3            -0.5953     1.3385  -0.445    0.657    
## studytime:famsizeLE3   0.7575     0.6278   1.207    0.228    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.55 on 391 degrees of freedom
## Multiple R-squared:  0.02111,    Adjusted R-squared:  0.0136 
## F-statistic:  2.81 on 3 and 391 DF,  p-value: 0.03927
# ANOVA table for the interaction model (main effects, then the interaction).
anova(object = modinteraction)
## Analysis of Variance Table
## 
## Response: G3
##                    Df Sum Sq Mean Sq F value  Pr(>F)  
## studytime           1   79.1  79.132  3.8220 0.05130 .
## famsize             1   65.3  65.281  3.1530 0.07656 .
## studytime:famsize   1   30.1  30.141  1.4558 0.22833  
## Residuals         391 8095.4  20.704                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  # Scatter of G3 against studytime, coloured by family size; mapping colour
  # to famsize makes geom_smooth fit one least-squares line per group.
  ggplot(data = d1, mapping = aes(x = studytime, y = G3, color = famsize)) +
    geom_point() +
    geom_smooth(se = FALSE, method = "lm")
## `geom_smooth()` using formula 'y ~ x'

      #GT3(0): Yhat = 9.3976 + 0.376x 
      #LE3(1): Yhat = (9.3976 - 0.5953) + (0.3761 + 0.7575)x = 8.8023 + 1.134x
  #G:
    #SLRGPAstudyhours has an MSE of 20.7361
      # Store the summary object so its residuals can be reused below,
      # then print it.
      mod_sum1 <- summary(object = SLRGPAstudyhours)
      mod_sum1
## 
## Call:
## lm(formula = G3 ~ studytime, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.4643  -1.8623   0.5357   3.0697   9.1377 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.3283     0.6033  15.463   <2e-16 ***
## studytime     0.5340     0.2741   1.949   0.0521 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.565 on 393 degrees of freedom
## Multiple R-squared:  0.009569,   Adjusted R-squared:  0.007049 
## F-statistic: 3.797 on 1 and 393 DF,  p-value: 0.05206
      # "MSE" in this section is the plain mean of the squared residuals
      # (SSE / n). That is why these values are slightly smaller than the
      # Residuals "Mean Sq" in the anova() tables, which divide by the
      # residual degrees of freedom instead.
      mean(mod_sum1[["residuals"]]^2)
## [1] 20.73614
    #modwithFamsize has an MSE of 20.79773
      mod_sum2 <- summary(object = modwithFamSize)
      mean(mod_sum2[["residuals"]]^2)
## [1] 20.79773
    #modFamsizeandstudytime has an MSE of 20.57088
      mod_sum3 <- summary(object = modFamsizeandstudytime)
      mean(mod_sum3[["residuals"]]^2)
## [1] 20.57088
    #modinteraction has a MSE of 20.49457
      mod_sum4 <- summary(object = modinteraction)
      mean(mod_sum4[["residuals"]]^2)
## [1] 20.49457
  #H:
    #The relationship in the SLR (study hours vs GPA) model was almost significant (t=1.95, p=0.052)
    #The relationship between GPA and family size is not significant 
    #The relationship between study hours and GPA when controlling for family size is significant (t=2.079, p=0.0383). Famsize becomes marginally significant when study hours are controlled for (p=0.0767)
    #The interaction between study hours and famsize is not significant
    #The interaction model had the lowest MSE but modFamsizeandstudytime was the most significant
    #Interpretation: GPA is somewhat positively related to study hours, but not to the size of the student's family. Study time appears less effective if the family is bigger (lower GPAs on average), although this interaction is not significant. 
#Final analysis: Gender Component
# Regress G3 on sex alone; the sexM coefficient is the male-minus-female
# difference in mean G3.
modgender1 <- lm(formula = G3 ~ sex, data = d1)
summary(object = modgender1)
## 
## Call:
## lm(formula = G3 ~ sex, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.9144  -1.9663   0.0856   3.0856   9.0856 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.9663     0.3164  31.503   <2e-16 ***
## sexM          0.9481     0.4598   2.062   0.0399 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.563 on 393 degrees of freedom
## Multiple R-squared:  0.0107, Adjusted R-squared:  0.008186 
## F-statistic: 4.252 on 1 and 393 DF,  p-value: 0.03987
# Boxplots of G3 split by sex.
ggplot(data = d1, mapping = aes(x = sex, y = G3)) +
  geom_boxplot()

    #Boys tend to get higher grades than girls

# Switch the response: does reported study time differ by sex?
modgender2 <- lm(formula = studytime ~ sex, data = d1)
summary(object = modgender2)
## 
## Call:
## lm(formula = studytime ~ sex, data = d1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2789 -0.7647 -0.2788  0.2353  2.2353 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.27885    0.05546  41.086  < 2e-16 ***
## sexM        -0.51414    0.08061  -6.378 5.05e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7999 on 393 degrees of freedom
## Multiple R-squared:  0.0938, Adjusted R-squared:  0.09149 
## F-statistic: 40.68 on 1 and 393 DF,  p-value: 5.045e-10
    #Boys study less than girls 

# Boxplots of studytime split by sex.
ggplot(data = d1, mapping = aes(x = sex, y = studytime)) +
  geom_boxplot()

#Does the relationship between study hours and GPA differ between boys and girls for large vs small family sizes? 
# One panel per family size; within each panel the colour mapping gives a
# separate least-squares line for each sex.
ggplot(data = d1, mapping = aes(x = studytime, y = G3, color = sex)) +
  geom_point() +
  facet_wrap(facets = ~famsize) +
  geom_smooth(se = FALSE, method = "lm")
## `geom_smooth()` using formula 'y ~ x'

#Study time and math GPA seem to be related in the same way for both genders in smaller families, but related differently by gender in bigger families

# Additive model: studytime and sex as main effects.
modgender3 <- lm(formula = G3 ~ studytime + sex, data = d1)
summary(object = modgender3)
## 
## Call:
## lm(formula = G3 ~ studytime + sex, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.6583  -2.0980   0.2512   3.2512   9.0313 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.1885     0.7221  11.340  < 2e-16 ***
## studytime     0.7801     0.2854   2.734  0.00655 ** 
## sexM          1.3492     0.4791   2.816  0.00510 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.526 on 392 degrees of freedom
## Multiple R-squared:  0.02921,    Adjusted R-squared:  0.02426 
## F-statistic: 5.898 on 2 and 392 DF,  p-value: 0.002996
# Same predictors plus the studytime-by-sex interaction, allowing the
# studytime slope to differ between the sexes.
modgenderinteraction1 <- lm(formula = G3 ~ studytime * sex, data = d1)
summary(object = modgenderinteraction1)
## 
## Call:
## lm(formula = G3 ~ studytime * sex, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.1054  -2.1451   0.1989   3.1989   8.8351 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.6156     0.9586   8.987   <2e-16 ***
## studytime        0.5927     0.3975   1.491    0.137    
## sexM             0.5691     1.2465   0.457    0.648    
## studytime:sexM   0.3874     0.5715   0.678    0.498    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.529 on 391 degrees of freedom
## Multiple R-squared:  0.03035,    Adjusted R-squared:  0.02291 
## F-statistic:  4.08 on 3 and 391 DF,  p-value: 0.00716
      #significant without the interaction, but not significant with the interaction


# Three main effects: studytime, sex and famsize, with no interactions.
modgender4 <- lm(formula = G3 ~ studytime + sex + famsize, data = d1)
summary(object = modgender4)
## 
## Call:
## lm(formula = G3 ~ studytime + sex + famsize, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.4414  -2.0347   0.4573   3.1631   9.2603 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.9374     0.7377  10.760  < 2e-16 ***
## studytime     0.8022     0.2852   2.813  0.00515 ** 
## sexM          1.2951     0.4793   2.702  0.00720 ** 
## famsizeLE3    0.8030     0.5042   1.593  0.11202    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.517 on 391 degrees of freedom
## Multiple R-squared:  0.03547,    Adjusted R-squared:  0.02807 
## F-statistic: 4.793 on 3 and 391 DF,  p-value: 0.002723
    #everything significant except famsize

# studytime-by-sex interaction (with both main effects) plus famsize as an
# additional main effect.
modgenderinteraction2 <- lm(formula = G3 ~ studytime * sex + famsize, data = d1)
summary(object = modgenderinteraction2)
## 
## Call:
## lm(formula = G3 ~ studytime * sex + famsize, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9165  -1.8920   0.3025   3.1326   9.0071 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      8.3908     0.9668   8.679   <2e-16 ***
## studytime        0.6021     0.3967   1.518    0.130    
## sexM             0.4603     1.2457   0.369    0.712    
## famsizeLE3       0.8137     0.5047   1.612    0.108    
## studytime:sexM   0.4142     0.5705   0.726    0.468    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.519 on 390 degrees of freedom
## Multiple R-squared:  0.03677,    Adjusted R-squared:  0.02689 
## F-statistic: 3.722 on 4 and 390 DF,  p-value: 0.005489
    #insignificant 


# Two categorical main effects only: sex and famsize.
modgender5 <- lm(formula = G3 ~ sex + famsize, data = d1)
summary(object = modgender5)
## 
## Call:
## lm(formula = G3 ~ sex + famsize, data = d1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.405  -1.783   0.329   3.217   9.329 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   9.7828     0.3405  28.731   <2e-16 ***
## sexM          0.8882     0.4610   1.927   0.0547 .  
## famsizeLE3    0.7341     0.5080   1.445   0.1492    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.556 on 392 degrees of freedom
## Multiple R-squared:  0.01595,    Adjusted R-squared:  0.01092 
## F-statistic: 3.176 on 2 and 392 DF,  p-value: 0.04283
# sex and famsize with their interaction, so the sex difference in G3 may
# vary between the two family-size groups.
modgenderinteraction3 <- lm(formula = G3 ~ sex * famsize, data = d1)
summary(object = modgenderinteraction3)
## 
## Call:
## lm(formula = G3 ~ sex * famsize, data = d1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.0161  -1.9808   0.3718   3.1360   9.3718 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       9.6282     0.3646  26.406   <2e-16 ***
## sexM              1.2358     0.5467   2.261   0.0243 *  
## famsizeLE3        1.3526     0.7292   1.855   0.0644 .  
## sexM:famsizeLE3  -1.2004     1.0160  -1.182   0.2381    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.554 on 391 degrees of freedom
## Multiple R-squared:  0.01945,    Adjusted R-squared:  0.01192 
## F-statistic: 2.585 on 3 and 391 DF,  p-value: 0.05291
      #famsize becomes marginally significant 
# Boxplots of G3 by sex, one panel per family size, with a fitted line
# running from the female box to the male box in each panel.
# sex is a discrete x variable, so by default each sex forms its own group
# with a single x position and the smooth layer has nothing to fit a line
# through. Setting group = 1 on that layer fits one lm line across both
# categories within each panel.
ggplot(d1, aes(sex, G3)) +
  geom_boxplot() +
  geom_smooth(aes(group = 1), method = "lm", se = FALSE) +
  facet_wrap(~famsize)
## `geom_smooth()` using formula 'y ~ x'