Weekly Exercise

Do three curvilinear regression analyses (linear, quadratic, and cubic) using ‘curvilinear_practice.sav’.

# Load the practice data set from the SPSS file and preview its first rows
library(haven)
HW4_data <- read_sav(file = "Desktop/curvilinear_practice.sav")
head(HW4_data, n = 6)
## # A tibble: 6 x 2
##       y     x
##   <dbl> <dbl>
## 1     4     2
## 2     6     2
## 3     5     2
## 4     7     4
## 5    10     4
## 6    10     4

Review the research question below:

Do students' scores (Y) increase when they spend more time practicing (X)?

1. Draw the scatter plot

# Draw the scatter plot of y against x, taken from the data frame columns
plot(x = HW4_data$x, y = HW4_data$y)

2. Perform curvilinear regression analyses.

Linear regression model

# Fit the simple linear model (y predicted by x) with lm() and print its summary
HW4_lm <- lm(formula = y ~ x, data = HW4_data)
summary(object = HW4_lm)
## 
## Call:
## lm(formula = y ~ x, data = HW4_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7424 -1.6130  0.1245  0.9356  5.6245 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.8646     1.0772   3.588   0.0021 ** 
## x             1.4389     0.1274  11.295 1.33e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.112 on 18 degrees of freedom
## Multiple R-squared:  0.8764, Adjusted R-squared:  0.8695 
## F-statistic: 127.6 on 1 and 18 DF,  p-value: 1.329e-09
# Both the constant (intercept) and the slope are significant.
# Scatter plot of the data with the fitted straight line drawn on top.
# The line is taken from the already-fitted model HW4_lm (y ~ x on HW4_data)
# instead of refitting the same regression a second time, so the plotted line
# always matches the summary reported for HW4_lm.
plot(y ~ x, data = HW4_data)
abline(HW4_lm)

Quadratic regression model

# A quadratic regression is fitted like a multiple regression: the second
# independent variable is the squared value of the first one.
# Add a column holding x^2
HW4_data[["x2"]] <- HW4_data[["x"]]^2
# Fit the quadratic model (y on x and x2) with lm() and print its summary
HW4_Quadm <- lm(formula = y ~ x + x2, data = HW4_data)
summary(object = HW4_Quadm)
## 
## Call:
## lm(formula = y ~ x + x2, data = HW4_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8418 -0.6870 -0.1126  0.6913  3.8206 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.77726    1.36506  -1.302 0.210296    
## x            3.43995    0.42058   8.179  2.7e-07 ***
## x2          -0.13380    0.02754  -4.858 0.000148 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.406 on 17 degrees of freedom
## Multiple R-squared:  0.9482, Adjusted R-squared:  0.9421 
## F-statistic: 155.7 on 2 and 17 DF,  p-value: 1.175e-11
# The constant (intercept) is not significant and the slopes are significant.
# load the ggplot package to use ggplot() function
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
# Overlay the fitted quadratic curve on the scatter plot.
# poly(x, 2) spans the same quadratic model space as x + x^2, so the smooth
# traces the quadratic least-squares curve. The confidence band is switched off
# with se = FALSE (spelled out, because the alias F can be reassigned).
ggplot(HW4_data, aes(x = x, y = y)) +
  geom_point() +
  stat_smooth(se = FALSE, method = "lm", formula = y ~ poly(x, 2))

Cubic regression model

# A cubic regression extends the same idea: the third independent variable is
# the cubed value of the first one.
# Add a column holding x^3
HW4_data[["x3"]] <- HW4_data[["x"]]^3
# Fit the cubic model (y on x, x2 and x3) with lm() and print its summary
HW4_cubreg <- lm(formula = y ~ x + x2 + x3, data = HW4_data)
summary(object = HW4_cubreg)
## 
## Call:
## lm(formula = y ~ x + x2 + x3, data = HW4_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6033 -0.7887  0.0367  0.5494  3.7376 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.3653250  2.7034153  -0.135    0.894  
## x            2.5878285  1.4631647   1.769    0.096 .
## x2           0.0002056  0.2217818   0.001    0.999  
## x3          -0.0060313  0.0099021  -0.609    0.551  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.433 on 16 degrees of freedom
## Multiple R-squared:  0.9494, Adjusted R-squared:  0.9399 
## F-statistic: 100.1 on 3 and 16 DF,  p-value: 1.403e-10
# Neither the constant (intercept) nor any of the slopes is significant.
# ggplot2 was attached earlier in the script, so ggplot() can be used directly.
# Overlay the fitted cubic curve on the scatter plot. The plot is built from
# the data frame HW4_data (as in the quadratic plot) rather than from the
# fitted model object HW4_cubreg, so it does not depend on ggplot2 implicitly
# converting an lm object into a data frame. The confidence band is switched
# off with se = FALSE (spelled out, because the alias F can be reassigned).
ggplot(HW4_data, aes(x = x, y = y)) +
  geom_point() +
  stat_smooth(se = FALSE, method = "lm", formula = y ~ poly(x, 3))

3. Compare the Model

# The previous sections produced three regression models: linear, quadratic
# and cubic. anova() is used to compare them pairwise.
# Linear vs. quadratic regression model
anova(HW4_lm, HW4_Quadm)
## Analysis of Variance Table
## 
## Model 1: y ~ x
## Model 2: y ~ x + x2
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1     18 80.273                                  
## 2     17 33.614  1    46.659 23.597 0.0001477 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Quadratic vs. cubic regression model
anova(HW4_Quadm, HW4_cubreg)
## Analysis of Variance Table
## 
## Model 1: y ~ x + x2
## Model 2: y ~ x + x2 + x3
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1     17 33.614                          
## 2     16 32.852  1   0.76176 0.371  0.551
# Linear vs. cubic regression model
anova(HW4_lm, HW4_cubreg)
## Analysis of Variance Table
## 
## Model 1: y ~ x
## Model 2: y ~ x + x2 + x3
##   Res.Df    RSS Df Sum of Sq      F   Pr(>F)    
## 1     18 80.273                                 
## 2     16 32.852  2    47.421 11.548 0.000787 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

4. Interpretation

  • Adding a quadratic component to the model produced a significant increase in fit, F(1, 17) = 23.60, p < .001, but adding a cubic component did not, F(1, 16) = 0.37, p = .551. Accordingly, the quadratic model was adopted, F(2, 17) = 155.7, p < .001, R^2 = 0.948.
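  • Do students' scores (Y) increase when they spend more time practicing (X)? Yes, but at a decreasing rate: the fitted model is $\hat{Y} = -1.777 + 3.440X - 0.134X^{2}$, so each additional unit of practice adds less to the predicted score. For example, at $X = 8$ the predicted score is $\hat{Y} \approx 17.17$.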