# Start
# Load the GPA data set from the current working directory.
# NOTE: the workspace is deliberately not wiped with rm(list = ls()); that call
# only deletes user objects (it does not detach packages or reset options), so
# it destroys the caller's work without giving a truly clean session. Run the
# script in a fresh R session instead.
getwd()
## [1] "C:/Users/huyn0/Desktop/BigData"
# The file is tab-delimited despite its .csv extension, hence sep = "\t".
gpa <- read.csv("gpa1.csv", header = TRUE, sep = "\t")

# Preview the first six rows to check that the columns were parsed as expected.
head(gpa, n = 6L)
##   age soph junior senior senior5 male campus business engineer colGPA hsGPA ACT
## 1  21    0      0      1       0    0      0        1        0    3.0   3.0  21
## 2  21    0      0      1       0    0      0        1        0    3.4   3.2  24
## 3  20    0      1      0       0    0      0        1        0    3.0   3.6  26
## 4  19    1      0      0       0    1      1        1        0    3.5   3.5  27
## 5  20    0      1      0       0    0      0        1        0    3.6   3.9  28
## 6  20    0      0      1       0    1      1        1        0    3.0   3.4  25
##   job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1     0     1     1    0    0       0  0     0   1        1        0     0
## 2     0     1     1    0    0       0  0     0   1        0        1     1
## 3     1     0     0    0    1       0  0     0   1        1        0     1
## 4     1     0     0    0    1       0  0     0   0        1        0     0
## 5     0     1     0    1    0       0  0     0   1        1        1     0
## 6     0     0     0    0    1       0  0     0   1        1        0     0
##   skipped alcohol gradMI fathcoll mothcoll
## 1       2     1.0      1        0        0
## 2       0     1.0      1        1        1
## 3       0     1.0      1        1        1
## 4       0     0.0      0        0        0
## 5       0     1.5      1        1        0
## 6       0     0.0      0        1        0
# Show the storage type and leading values of every column in the data frame.
str(object = gpa)
## 'data.frame':    141 obs. of  29 variables:
##  $ age     : int  21 21 20 19 20 20 22 22 22 19 ...
##  $ soph    : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ junior  : int  0 0 1 0 1 0 0 0 0 0 ...
##  $ senior  : int  1 1 0 0 0 1 0 0 0 0 ...
##  $ senior5 : int  0 0 0 0 0 0 1 1 1 0 ...
##  $ male    : int  0 0 0 1 0 1 0 0 0 0 ...
##  $ campus  : int  0 0 0 1 0 1 0 0 0 0 ...
##  $ business: int  1 1 1 1 1 1 1 0 0 1 ...
##  $ engineer: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ colGPA  : num  3 3.4 3 3.5 3.6 ...
##  $ hsGPA   : num  3 3.2 3.6 3.5 3.9 ...
##  $ ACT     : int  21 24 26 27 28 25 25 22 21 27 ...
##  $ job19   : int  0 0 1 1 0 0 0 1 1 1 ...
##  $ job20   : int  1 1 0 0 1 0 0 0 0 0 ...
##  $ drive   : int  1 1 0 0 0 0 0 1 1 0 ...
##  $ bike    : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ walk    : int  0 0 1 1 0 1 0 0 0 1 ...
##  $ voluntr : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PC      : int  0 0 0 0 0 0 0 1 0 1 ...
##  $ greek   : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ car     : int  1 1 1 0 1 1 1 0 1 0 ...
##  $ siblings: int  1 0 1 1 1 1 1 1 1 1 ...
##  $ bgfriend: int  0 1 0 0 1 0 0 1 1 0 ...
##  $ clubs   : int  0 1 1 0 0 0 1 0 1 1 ...
##  $ skipped : num  2 0 0 0 0 0 0 3 2 0.5 ...
##  $ alcohol : num  1 1 1 0 1.5 0 2 3 2.5 0.75 ...
##  $ gradMI  : int  1 1 1 0 1 0 1 1 1 1 ...
##  $ fathcoll: int  0 1 1 0 1 1 0 1 1 0 ...
##  $ mothcoll: int  0 1 1 0 0 0 1 1 1 1 ...
# Model 1 (baseline): simple regression of colGPA on hsGPA alone.
lm.hsGPA <- lm(formula = colGPA ~ hsGPA, data = gpa)
# Report coefficient estimates, standard errors and goodness-of-fit statistics.
summary(object = lm.hsGPA)
## 
## Call:
## lm(formula = colGPA ~ hsGPA, data = gpa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85220 -0.26274 -0.04868  0.28902  0.88551 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.41543    0.30694   4.611 8.98e-06 ***
## hsGPA        0.48243    0.08983   5.371 3.21e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.34 on 139 degrees of freedom
## Multiple R-squared:  0.1719, Adjusted R-squared:  0.1659 
## F-statistic: 28.85 on 1 and 139 DF,  p-value: 3.211e-07
# Model 2: the baseline specification plus ACT as an additional regressor.
lm.act <- lm(formula = colGPA ~ hsGPA + ACT, data = gpa)
# Report coefficient estimates, standard errors and goodness-of-fit statistics.
summary(object = lm.act)
## 
## Call:
## lm(formula = colGPA ~ hsGPA + ACT, data = gpa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85442 -0.24666 -0.02614  0.28127  0.85357 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.286328   0.340822   3.774 0.000238 ***
## hsGPA       0.453456   0.095813   4.733 5.42e-06 ***
## ACT         0.009426   0.010777   0.875 0.383297    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3403 on 138 degrees of freedom
## Multiple R-squared:  0.1764, Adjusted R-squared:  0.1645 
## F-statistic: 14.78 on 2 and 138 DF,  p-value: 1.526e-06
# Model 3: model 2 plus the PC indicator column.
lm.PC <- lm(formula = colGPA ~ hsGPA + ACT + PC, data = gpa)
# Report coefficient estimates, standard errors and goodness-of-fit statistics.
summary(object = lm.PC)
## 
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + PC, data = gpa)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7901 -0.2622 -0.0107  0.2334  0.7570 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.263520   0.333125   3.793 0.000223 ***
## hsGPA       0.447242   0.093647   4.776 4.54e-06 ***
## ACT         0.008659   0.010534   0.822 0.412513    
## PC          0.157309   0.057287   2.746 0.006844 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3325 on 137 degrees of freedom
## Multiple R-squared:  0.2194, Adjusted R-squared:  0.2023 
## F-statistic: 12.83 on 3 and 137 DF,  p-value: 1.932e-07
# Model 4: model 3 plus the gradMI indicator column.
lm.gradMI <- lm(formula = colGPA ~ hsGPA + ACT + PC + gradMI, data = gpa)
# Report coefficient estimates, standard errors and goodness-of-fit statistics.
summary(object = lm.gradMI)
## 
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + PC + gradMI, data = gpa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81644 -0.22752 -0.01169  0.21927  0.73319 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.08297    0.33876   3.197  0.00173 ** 
## hsGPA        0.44379    0.09239   4.804 4.06e-06 ***
## ACT          0.01015    0.01041   0.975  0.33137    
## PC           0.15087    0.05658   2.666  0.00860 ** 
## gradMI       0.18204    0.08306   2.192  0.03010 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.328 on 136 degrees of freedom
## Multiple R-squared:  0.246,  Adjusted R-squared:  0.2238 
## F-statistic: 11.09 on 4 and 136 DF,  p-value: 8.114e-08
# install.packages('stargazer')
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3

개인적 판단

  1. lm.act 모형에 추가한 ACT는 강의 내용과 같이 조정된 결정계수가 하락(0.1659 → 0.1645)했고 계수도 유의하지 않으므로(p = 0.383), 마냥 좋은 변수라고만은 할 수 없습니다.

  2. lm.PC는 결정계수가 늘어났을 뿐만 아니라, 무엇보다도 조정된 결정계수가 약 0.04만큼(0.1645 → 0.2023) 늘어났기 때문에 PC는 유의미한 통제변수라고 생각합니다. (p값 또한 0.007로, stargazer 표 기준 ***(p<0.01)이므로 유의미하다고 생각합니다.)

  3. hsGPA의 계수 값이 소폭 하락했지만(0.447 → 0.444, 영향력 미비) lm.gradMI 또한 결정계수 및 조정된 결정계수 값이 상승했습니다. 때문에 유의미한 결과를 얻을 수 있다고 판단하여 gradMI 변수를 추가하였습니다:

# Print the four nested models side by side as a plain-text table, keeping only
# the observation count, R-squared and adjusted R-squared summary rows.
stargazer(
  lm.hsGPA, lm.act, lm.PC, lm.gradMI,
  type = "text",
  keep.stat = c("n", "rsq", "adj.rsq")
)
## 
## ================================================
##                      Dependent variable:        
##              -----------------------------------
##                            colGPA               
##                (1)      (2)      (3)      (4)   
## ------------------------------------------------
## hsGPA        0.482*** 0.453*** 0.447*** 0.444***
##              (0.090)  (0.096)  (0.094)  (0.092) 
##                                                 
## ACT                    0.009    0.009    0.010  
##                       (0.011)  (0.011)  (0.010) 
##                                                 
## PC                             0.157*** 0.151***
##                                (0.057)  (0.057) 
##                                                 
## gradMI                                  0.182** 
##                                         (0.083) 
##                                                 
## Constant     1.415*** 1.286*** 1.264*** 1.083***
##              (0.307)  (0.341)  (0.333)  (0.339) 
##                                                 
## ------------------------------------------------
## Observations   141      141      141      141   
## R2            0.172    0.176    0.219    0.246  
## Adjusted R2   0.166    0.164    0.202    0.224  
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01