# Start
# Load the GPA data set from the current working directory.
#
# NOTE: the former `rm(list = ls())` was removed on purpose. It only deletes
# user-created objects (attached packages and options survive), so it does not
# produce a clean session, and it silently wipes the workspace of anyone who
# sources this script. Restart R when a fresh session is needed.
getwd()
## [1] "C:/Users/huyn0/Desktop/BigData"
# The file is read with a tab separator even though it is named .csv.
# read.delim() is read.csv() with sep = "\t" as its default; header = TRUE is
# the default for both, so the parsed data frame is the same as before.
gpa <- read.delim("gpa1.csv")
# Preview the first six rows.
head(gpa)
## age soph junior senior senior5 male campus business engineer colGPA hsGPA ACT
## 1 21 0 0 1 0 0 0 1 0 3.0 3.0 21
## 2 21 0 0 1 0 0 0 1 0 3.4 3.2 24
## 3 20 0 1 0 0 0 0 1 0 3.0 3.6 26
## 4 19 1 0 0 0 1 1 1 0 3.5 3.5 27
## 5 20 0 1 0 0 0 0 1 0 3.6 3.9 28
## 6 20 0 0 1 0 1 1 1 0 3.0 3.4 25
## job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1 0 1 1 0 0 0 0 0 1 1 0 0
## 2 0 1 1 0 0 0 0 0 1 0 1 1
## 3 1 0 0 0 1 0 0 0 1 1 0 1
## 4 1 0 0 0 1 0 0 0 0 1 0 0
## 5 0 1 0 1 0 0 0 0 1 1 1 0
## 6 0 0 0 0 1 0 0 0 1 1 0 0
## skipped alcohol gradMI fathcoll mothcoll
## 1 2 1.0 1 0 0
## 2 0 1.0 1 1 1
## 3 0 1.0 1 1 1
## 4 0 0.0 0 0 0
## 5 0 1.5 1 1 0
## 6 0 0.0 0 1 0
# Compact overview of the loaded data frame: dimensions and per-column types.
utils::str(gpa)
## 'data.frame': 141 obs. of 29 variables:
## $ age : int 21 21 20 19 20 20 22 22 22 19 ...
## $ soph : int 0 0 0 1 0 0 0 0 0 1 ...
## $ junior : int 0 0 1 0 1 0 0 0 0 0 ...
## $ senior : int 1 1 0 0 0 1 0 0 0 0 ...
## $ senior5 : int 0 0 0 0 0 0 1 1 1 0 ...
## $ male : int 0 0 0 1 0 1 0 0 0 0 ...
## $ campus : int 0 0 0 1 0 1 0 0 0 0 ...
## $ business: int 1 1 1 1 1 1 1 0 0 1 ...
## $ engineer: int 0 0 0 0 0 0 0 0 0 0 ...
## $ colGPA : num 3 3.4 3 3.5 3.6 ...
## $ hsGPA : num 3 3.2 3.6 3.5 3.9 ...
## $ ACT : int 21 24 26 27 28 25 25 22 21 27 ...
## $ job19 : int 0 0 1 1 0 0 0 1 1 1 ...
## $ job20 : int 1 1 0 0 1 0 0 0 0 0 ...
## $ drive : int 1 1 0 0 0 0 0 1 1 0 ...
## $ bike : int 0 0 0 0 1 0 1 0 0 0 ...
## $ walk : int 0 0 1 1 0 1 0 0 0 1 ...
## $ voluntr : int 0 0 0 0 0 0 0 0 0 0 ...
## $ PC : int 0 0 0 0 0 0 0 1 0 1 ...
## $ greek : int 0 0 0 0 0 0 1 0 0 0 ...
## $ car : int 1 1 1 0 1 1 1 0 1 0 ...
## $ siblings: int 1 0 1 1 1 1 1 1 1 1 ...
## $ bgfriend: int 0 1 0 0 1 0 0 1 1 0 ...
## $ clubs : int 0 1 1 0 0 0 1 0 1 1 ...
## $ skipped : num 2 0 0 0 0 0 0 3 2 0.5 ...
## $ alcohol : num 1 1 1 0 1.5 0 2 3 2.5 0.75 ...
## $ gradMI : int 1 1 1 0 1 0 1 1 1 1 ...
## $ fathcoll: int 0 1 1 0 1 1 0 1 1 0 ...
## $ mothcoll: int 0 1 1 0 0 0 1 1 1 1 ...
# Model 1 (baseline): regress colGPA on hsGPA only.
lm.hsGPA <- lm(formula = colGPA ~ hsGPA, data = gpa)
# Coefficient table, residual quantiles and (adjusted) R-squared.
summary(lm.hsGPA)
##
## Call:
## lm(formula = colGPA ~ hsGPA, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85220 -0.26274 -0.04868 0.28902 0.88551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.41543 0.30694 4.611 8.98e-06 ***
## hsGPA 0.48243 0.08983 5.371 3.21e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.34 on 139 degrees of freedom
## Multiple R-squared: 0.1719, Adjusted R-squared: 0.1659
## F-statistic: 28.85 on 1 and 139 DF, p-value: 3.211e-07
# Model 2: baseline plus ACT as an additional regressor.
lm.act <- lm(formula = colGPA ~ hsGPA + ACT, data = gpa)
# Compare the adjusted R-squared with the baseline model's summary.
summary(lm.act)
##
## Call:
## lm(formula = colGPA ~ hsGPA + ACT, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85442 -0.24666 -0.02614 0.28127 0.85357
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.286328 0.340822 3.774 0.000238 ***
## hsGPA 0.453456 0.095813 4.733 5.42e-06 ***
## ACT 0.009426 0.010777 0.875 0.383297
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3403 on 138 degrees of freedom
## Multiple R-squared: 0.1764, Adjusted R-squared: 0.1645
## F-statistic: 14.78 on 2 and 138 DF, p-value: 1.526e-06
# Model 3: model 2 plus the PC indicator (0/1 column in gpa).
lm.PC <- lm(formula = colGPA ~ hsGPA + ACT + PC, data = gpa)
# Coefficient table and fit statistics for model 3.
summary(lm.PC)
##
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + PC, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7901 -0.2622 -0.0107 0.2334 0.7570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.263520 0.333125 3.793 0.000223 ***
## hsGPA 0.447242 0.093647 4.776 4.54e-06 ***
## ACT 0.008659 0.010534 0.822 0.412513
## PC 0.157309 0.057287 2.746 0.006844 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3325 on 137 degrees of freedom
## Multiple R-squared: 0.2194, Adjusted R-squared: 0.2023
## F-statistic: 12.83 on 3 and 137 DF, p-value: 1.932e-07
# Model 4: model 3 plus the gradMI indicator (0/1 column in gpa).
lm.gradMI <- lm(formula = colGPA ~ hsGPA + ACT + PC + gradMI, data = gpa)
# Coefficient table and fit statistics for model 4.
summary(lm.gradMI)
##
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + PC + gradMI, data = gpa)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81644 -0.22752 -0.01169 0.21927 0.73319
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08297 0.33876 3.197 0.00173 **
## hsGPA 0.44379 0.09239 4.804 4.06e-06 ***
## ACT 0.01015 0.01041 0.975 0.33137
## PC 0.15087 0.05658 2.666 0.00860 **
## gradMI 0.18204 0.08306 2.192 0.03010 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.328 on 136 degrees of freedom
## Multiple R-squared: 0.246, Adjusted R-squared: 0.2238
## F-statistic: 11.09 on 4 and 136 DF, p-value: 8.114e-08
# install.packages('stargazer')
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.0.3
lm.act에 추가한 ACT는 강의 내용과 같이 조정된 결정계수 값이 하락(0.1659 → 0.1645)하고 계수도 통계적으로 유의하지 않아(p = 0.383), 마냥 좋은 변수라고만은 할 수 없습니다.
lm.PC는 결정계수가 늘어났을 뿐만 아니라, 무엇보다도 조정된 결정계수가 약 0.04만큼 늘어났기 때문에(0.1645 → 0.2023) PC는 유의미한 통제변수라고 생각합니다. (p값 또한 0.007로, 아래 stargazer 표의 기준(p<0.01)에서 ***에 해당하여 유의미하다고 생각합니다.)
hsGPA의 계수 값이 소폭 하락했지만(영향력 미비) lm.gradMI 또한 결정계수 및 조정된 결정계수 값이 상승했습니다. 때문에 유의미한 결과를 얻을 수 있다고 판단하여 gradMI 변수를 추가하였습니다.
# Side-by-side comparison of the four nested models, printed as plain text.
# Only the observation count, R-squared and adjusted R-squared are kept as
# summary statistics.
# The significance stars in this table use the cutoffs printed in its note
# (* p<0.1, ** p<0.05, *** p<0.01), which differ from the summary() legend.
stargazer(
  lm.hsGPA, lm.act, lm.PC, lm.gradMI,
  type = "text",
  keep.stat = c("n", "rsq", "adj.rsq")
)
##
## ================================================
## Dependent variable:
## -----------------------------------
## colGPA
## (1) (2) (3) (4)
## ------------------------------------------------
## hsGPA 0.482*** 0.453*** 0.447*** 0.444***
## (0.090) (0.096) (0.094) (0.092)
##
## ACT 0.009 0.009 0.010
## (0.011) (0.011) (0.010)
##
## PC 0.157*** 0.151***
## (0.057) (0.057)
##
## gradMI 0.182**
## (0.083)
##
## Constant 1.415*** 1.286*** 1.264*** 1.083***
## (0.307) (0.341) (0.333) (0.339)
##
## ------------------------------------------------
## Observations 141 141 141 141
## R2 0.172 0.176 0.219 0.246
## Adjusted R2 0.166 0.164 0.202 0.224
## ================================================
## Note: *p<0.1; **p<0.05; ***p<0.01