# For the preparation, import the data and rename it.
library(haven)
HW3_data<- read_sav("Desktop/asgusam5(regression).sav")
head(HW3_data)
## # A tibble: 6 x 58
## id gender month year language book home_computer home_desk
## <dbl> <dbl+l> <dbl+lb> <dbl+l> <dbl+lb> <dbl+l> <dbl+lbl> <dbl+lbl>
## 1 1 1 [GIR… 1 [JAN… 5 [200… 1 [ALWA… 4 [TWO… 1 [YES] 0 [NO]
## 2 2 0 [BOY] 9 [SEP… 4 [200… 1 [ALWA… 3 [ONE… 1 [YES] 1 [YES]
## 3 3 0 [BOY] 10 [OCT… 4 [200… 1 [ALWA… 4 [TWO… 1 [YES] 1 [YES]
## 4 4 1 [GIR… 8 [AUG… 4 [200… 1 [ALWA… 3 [ONE… 1 [YES] 1 [YES]
## 5 5 0 [BOY] 8 [AUG… 4 [200… 1 [ALWA… 5 [THR… 1 [YES] 1 [YES]
## 6 6 0 [BOY] 11 [NOV… 4 [200… 1 [ALWA… 3 [ONE… 1 [YES] 1 [YES]
## # … with 50 more variables: home_book <dbl+lbl>, home_room <dbl+lbl>,
## # home_internet <dbl+lbl>, computer_home <dbl+lbl>,
## # computer_school <dbl+lbl>, computer_some <dbl+lbl>,
## # parentsupport1 <dbl+lbl>, parentsupport2 <dbl+lbl>,
## # parentsupport3 <dbl+lbl>, parentsupport4 <dbl+lbl>, school1 <dbl+lbl>,
## # school2 <dbl+lbl>, school3 <dbl+lbl>, studentbullied1 <dbl+lbl>,
## # studentbullied2 <dbl+lbl>, studentbullied3 <dbl+lbl>,
## # studentbullied4 <dbl+lbl>, studentbullied5 <dbl+lbl>,
## # studentbullied6 <dbl+lbl>, learning1 <dbl+lbl>, learning2 <dbl+lbl>,
## # learning3 <dbl+lbl>, learning4 <dbl+lbl>, learning5 <dbl+lbl>,
## # learning6 <dbl+lbl>, learning7 <dbl+lbl>, engagement1 <dbl+lbl>,
## # engagement2 <dbl+lbl>, engagement3 <dbl+lbl>, engagement4 <dbl+lbl>,
## # engagement5 <dbl+lbl>, confidence1 <dbl+lbl>, confidence2 <dbl+lbl>,
## # confidence3 <dbl+lbl>, confidence4 <dbl+lbl>, confidence5 <dbl+lbl>,
## # confidence6 <dbl+lbl>, score1 <dbl>, score2 <dbl>, score3 <dbl>,
## # score4 <dbl>, score5 <dbl>, ParentSupport <dbl>, Home <dbl>,
## # school <dbl>, StudentBullied <dbl>, learning <dbl>, engagement <dbl>,
## # confidence <dbl>, ScienceScore <dbl>
HW3_simpleReg <- HW3_data[,c("Home","ScienceScore")]
# Draw the scatter plot using 'home' & 'Science Score'
plot(HW3_simpleReg$Home,HW3_simpleReg$ScienceScore,xlab="Sum of Home Stuff",ylab="Science Score")
summary(HW3_simpleReg) # In there we can see there are 121 NA's in the Home variable.
## Home ScienceScore
## Min. :0.000 Min. :276.7
## 1st Qu.:4.000 1st Qu.:492.7
## Median :4.000 Median :546.8
## Mean :4.162 Mean :541.4
## 3rd Qu.:5.000 3rd Qu.:594.0
## Max. :5.000 Max. :774.0
## NA's :121
sd(na.omit(HW3_simpleReg$Home)) # Need to remove the NAs by using omit.na() function to calculate the sd.
## [1] 1.049018
sd(HW3_simpleReg$ScienceScore) # There is no NA in Science Score, so we don't need to change anything.
## [1] 74.52246
SimReg <- lm(ScienceScore~Home,HW3_simpleReg)
summary(SimReg)
##
## Call:
## lm(formula = ScienceScore ~ Home, data = HW3_simpleReg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -283.948 -46.359 3.969 49.166 216.981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 440.9921 2.5138 175.43 <2e-16 ***
## Home 24.1780 0.5857 41.28 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.91 on 12946 degrees of freedom
## (121 observations deleted due to missingness)
## Multiple R-squared: 0.1163, Adjusted R-squared: 0.1163
## F-statistic: 1704 on 1 and 12946 DF, p-value: < 2.2e-16
# Expected Answers: According to the R output in the previous section, when students have more home stuffs, they earned higher science score.
# Science Score is the dependent variable. Home Stuffs Possession is the independent variable.
# Since the R^2=.116, Home Stuffs can account for 11.6% of the variation in Science Score.
# From the ANOVA table, We can conclude that our regression model results in significantly better prediction of science score than if we used the mean value of science score. In short, the regression model overall predicts Science Score significantly well. (F=1704.176)
# β_0=440.992, which means that when the sum of home stuffs is equal to zero, the model predicts that science score was 440.992.
# β_1=24.178, represents the regression slope. The value indicates the change in the outcome associated with a unit change in the predictor.
Note: Choose science score as a dependent variable and 3 independent variables you want to study. (Hint: To select the independent variables, run correlation and choose highly correlated variables with science score.
# In this example, we choose the variable Home Stuff, Student Bullied and Sum of Engagement.
# Firstly, we need to prepare the dataset.
HW3_MultiReg <- HW3_data[,c("ScienceScore","Home","StudentBullied","engagement")]
# Perform descriptive analysis
summary(HW3_MultiReg)
## ScienceScore Home StudentBullied engagement
## Min. :276.7 Min. :0.000 Min. : 1.00 Min. : 1.000
## 1st Qu.:492.7 1st Qu.:4.000 1st Qu.:16.00 1st Qu.: 5.000
## Median :546.8 Median :4.000 Median :21.00 Median : 7.000
## Mean :541.4 Mean :4.162 Mean :19.13 Mean : 7.745
## 3rd Qu.:594.0 3rd Qu.:5.000 3rd Qu.:23.00 3rd Qu.: 9.000
## Max. :774.0 Max. :5.000 Max. :24.00 Max. :20.000
## NA's :121 NA's :142 NA's :214
# No sd? No worry, let's remove the NA first and use the sd() function
sd(HW3_MultiReg$ScienceScore)
## [1] 74.52246
sd(na.omit(HW3_MultiReg$Home))
## [1] 1.049018
sd(na.omit(HW3_MultiReg$StudentBullied))
## [1] 4.823351
sd(na.omit(HW3_MultiReg$engagement))
## [1] 2.87122
# In the following operation, we still need use cleaned data set (without NA's) couple times, so we made a cleaned version for the further application.
cleaned_HW3_MultiReg <- na.omit(HW3_MultiReg)
# In this example, we centered the engagement. (the reason will be explianed in the following section)
mean(cleaned_HW3_MultiReg$engagement)
## [1] 7.747064
cleaned_HW3_MultiReg$engagement <- cleaned_HW3_MultiReg$engagement-7.75
# Run the multiple regression
MultiReg <- lm(ScienceScore~Home+StudentBullied+engagement,cleaned_HW3_MultiReg)
summary(MultiReg)
##
## Call:
## lm(formula = ScienceScore ~ Home + StudentBullied + engagement,
## data = cleaned_HW3_MultiReg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -248.927 -45.114 3.389 47.410 210.588
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 402.0056 3.3076 121.54 <2e-16 ***
## Home 22.1075 0.5813 38.03 <2e-16 ***
## StudentBullied 2.5166 0.1264 19.91 <2e-16 ***
## engagement -3.1257 0.2113 -14.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 67.95 on 12683 degrees of freedom
## Multiple R-squared: 0.158, Adjusted R-squared: 0.1578
## F-statistic: 793.3 on 3 and 12683 DF, p-value: < 2.2e-16
# R^2=.158. In our model, Home Stuffs, Being Bullied and Engagement levels can account for 15.8% of the variation in Science Score.
# Check the F-statistic, we can conclude that our regression model results in significantly better prediction of science score than if we used the mean value of science score. In short, the regression model overall predicts Science Score significantly well. (F=793.3, p<0.001)
# Check the coefficient table, β_0=402.020, which means that when the sum of home stuffs is equal to zero, the student have not being bullied (=0) and the average engagement score is equal to 7.75, the model predicts that science score was 402.020. β_1=-3.126, β_2=-22.108 and β_3=-2.517 represent the regressions’ slopes. The values indicate the change in the outcome associated with a unit change in the predictor.
# β_1, β_2 and β_3 are different from zero. We can conclude that the confidence makes a significant contribution to predict science score at α=0.05.
# When a student has higher engagement (scale is inverse), he/she has more home stuffs and is less bullied (scale is inverse), a student earn higher scores.
# Load the Hmisc package to run the correlation matrix
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
rcorr(as.matrix(cleaned_HW3_MultiReg),type="pearson") #To obtain the correlation matrix and p-value matrix
## ScienceScore Home StudentBullied engagement
## ScienceScore 1.00 0.34 0.20 -0.16
## Home 0.34 1.00 0.10 -0.07
## StudentBullied 0.20 0.10 1.00 -0.08
## engagement -0.16 -0.07 -0.08 1.00
##
## n= 12687
##
##
## P
## ScienceScore Home StudentBullied engagement
## ScienceScore 0 0 0
## Home 0 0 0
## StudentBullied 0 0 0
## engagement 0 0 0
# The correlation between de independent variables and the dependent variables are higher than 0.16.
# The correlations between the independent variables are all lower than 0.10.
# Load the lmtest package to run the Durbin-Watson test
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
dwtest(ScienceScore~Home+StudentBullied+engagement, data=cleaned_HW3_MultiReg) # Run the Durbin-Watson test
##
## Durbin-Watson test
##
## data: ScienceScore ~ Home + StudentBullied + engagement
## DW = 1.451, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is greater than 0
# The value of Durbin-Watson is 1.451 (almost 2), indicating no serial correlation among the residuals.
# Calculate the mahalanobis distance
colmean <- colMeans(na.omit(cleaned_HW3_MultiReg))
Sx <- cov(cleaned_HW3_MultiReg)
D2 <- mahalanobis(cleaned_HW3_MultiReg, colmean, Sx,inverted=FALSE)
mean(D2) #The Mean distance is 3.99, which is less than 25, which means we are safe.
## [1] 3.999685
head(sort(D2),10) # Check top 10 small value
## [1] 0.03130753 0.03717616 0.03942821 0.03942821 0.04051665 0.05627274
## [7] 0.06240742 0.06837855 0.07972976 0.08090561
tail(sort(D2),10) # Check top 10 large value
## [1] 33.77593 34.34625 34.80713 35.19361 35.19361 35.75464 36.71458
## [8] 36.88294 36.91094 37.00067
# Mahalanobis distance: There are some values that can be a problem. The maximum value is above 25.
# Check Homoscedasticity
plot(MultiReg,1)
# The graphic indicates that there is constant variation and there is no evidence of heteroscedasticity.
# Check Normality & Homoscedasticity
plot(MultiReg,2)
# Now we need standardize() function in robustHD package to standaarize our residuals.
library(robustHD)
## Loading required package: perry
## Loading required package: parallel
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
hist(standardize(MultiReg$residuals))
# The histogram and normal probability plots of the standardized residuals are almost symmetrical and lying about the diagonal line respectively. These suggest that the Normality assumption is not violated.
# Scale-Location (or Spread-Location). Used to check the homogeneity of variance of the residuals (homoscedasticity). Horizontal line with equally spread points is a good indication of homoscedasticity.
plot(MultiReg,3)
# Check Outlier
plot(MultiReg,4)
# Residuals vs Leverage. Used to identify influential cases, that is extreme values that might influence the regression results when included or excluded from the analysis.
# Cook’s Distance: There are some values that can be a problem. The maximum value is above 0.00032.
– Which is your independent variable?
# The independent variables are Home Stuffs, Student Bullied and Engagement.
– (optional) Center the independent variables based on your interest (e.g., mean, max, min, or specific value and so on)
# For Home Stuff, having zero stuffs at home is a possibility. That is why this variable was not centered.
# For Student Bullied, there is the possibility that the student have been never bullied. That is why this variable was not centered.
# For Engagement, there is no meaning of having zero engagement. That is why this variable was centered.
(Note. Your interpretation of intercept will differ from others based on the choice of centering methods) ### 7. Interpret the results and make your conclusion (Please include assumption check and outlier treatment if needed)
# For the convenience, the interpretation is provided on the previous section, along with the R outputs.