SEM_hmk2
Rose Maier
Due Mon, April 14 (email me prior to class and bring a hard copy to class)

For this assignment, you should download the dataset grade_read_attend.sav (linked above). The dataset is in SPSS format. (Credit for the dataset goes to Jeremy Miles.)

This (made-up) dataset contains data from 40 students in a literature class. The dataset contains 3 variables:

In your favorite statistical package (SPSS, SAS, R, whatever) do the following:

Run a simple regression in which you regress GRADE on ATTEND. (In regressionspeak, you say “regress Y on X,” where Y is the dependent/response variable and X is the independent/input variable. Thus, I am telling you to treat GRADE as the dependent variable and ATTEND as the independent variable.) Using the output of your analysis, do the following:

library(foreign)
data <- read.spss("/Users/TARDIS/Downloads/grade_read_attend.sav", to.data.frame = TRUE)

# Run a simple regression in which you regress GRADE on ATTEND
model1 <- lm(GRADE ~ ATTEND, data = data)
summary(model1)
## 
## Call:
## lm(formula = GRADE ~ ATTEND, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -27.78 -10.90   2.02  12.43  31.76 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   36.998      8.169    4.53  5.7e-05 ***
## ATTEND         1.883      0.555    3.39   0.0016 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.8 on 38 degrees of freedom
## Multiple R-squared:  0.233,  Adjusted R-squared:  0.212 
## F-statistic: 11.5 on 1 and 38 DF,  p-value: 0.00163

# Write out the algebraic equation representing this analysis
print(paste("GRADE = ", round(model1$coeff[1], digits = 2), " + ", round(model1$coeff[2], 
    digits = 2), "*ATTEND + e", sep = ""))
## [1] "GRADE = 37 + 1.88*ATTEND + e"

# Create a graph in which the Y-axis is GRADE and the X-axis is ATTEND. Draw
# a line representing the slope of ATTEND for a realistic range of values.
library(ggplot2)
ggplot(data, aes(x = ATTEND, y = GRADE)) + geom_point(shape = 1) + geom_smooth(method = lm, 
    se = FALSE)

plot of chunk unnamed-chunk-2

Run a multiple regression in which you regress GRADE on BOOKS and ATTEND. Using the output of your analysis:


# Run a multiple regression in which you regress GRADE on BOOKS and ATTEND
model2 <- lm(GRADE ~ ATTEND + BOOKS, data = data)
summary(model2)
## 
## Call:
## lm(formula = GRADE ~ ATTEND + BOOKS, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -20.80 -13.37   0.06   9.17  32.29 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   37.379      7.745    4.83  2.4e-05 ***
## ATTEND         1.283      0.587    2.19    0.035 *  
## BOOKS          4.037      1.753    2.30    0.027 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.1 on 37 degrees of freedom
## Multiple R-squared:  0.329,  Adjusted R-squared:  0.292 
## F-statistic: 9.06 on 2 and 37 DF,  p-value: 0.000628

# Write out the algebraic equation representing this analysis
print(paste("GRADE = ", round(model2$coeff[1], digits = 2), " + ", round(model2$coeff[2], 
    digits = 2), "*ATTEND", " + ", round(model2$coeff[3], digits = 2), "*BOOKS + e", 
    sep = ""))
## [1] "GRADE = 37.38 + 1.28*ATTEND + 4.04*BOOKS + e"

# Create a graph with the same axes as 1b above. Graph the regression lines
# of ATTEND for students who have read 0, 2, and 4 books (so you should draw
# 3 lines).
ggplot(data, aes(x = ATTEND, y = GRADE)) + geom_point(shape = 1) + geom_abline(aes(color = "0 books"), 
    intercept = model2$coeff[1], slope = model2$coeff[2], show_guide = TRUE) + 
    geom_abline(aes(color = "2 books"), intercept = model2$coeff[1] + 2 * model2$coeff[3], 
        slope = model2$coeff[2], show_guide = TRUE) + geom_abline(aes(color = "4 books"), 
    intercept = model2$coeff[1] + 4 * model2$coeff[3], slope = model2$coeff[2], 
    show_guide = TRUE) + guides(color = guide_legend(title = "Number of books read"))

plot of chunk unnamed-chunk-3

Run a multiple regression in which you regress GRADE on BOOKS, ATTEND, and the interaction of BOOKS with ATTEND. Do not center any of the variables. Using the output of your analysis:


# Run a multiple regression in which you regress GRADE on BOOKS, ATTEND, and
# the interaction of BOOKS with ATTEND
model3 <- lm(GRADE ~ ATTEND * BOOKS, data = data)
summary(model3)
## 
## Call:
## lm(formula = GRADE ~ ATTEND * BOOKS, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24.84 -11.85   1.47   8.16  34.20 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    55.221     11.262    4.90    2e-05 ***
## ATTEND         -0.137      0.878   -0.16    0.877    
## BOOKS          -6.208      5.151   -1.21    0.236    
## ATTEND:BOOKS    0.735      0.349    2.10    0.042 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.4 on 36 degrees of freedom
## Multiple R-squared:  0.402,  Adjusted R-squared:  0.352 
## F-statistic: 8.07 on 3 and 36 DF,  p-value: 0.000306
b0 <- round(model3$coeff[1], digits = 2)
attend <- round(model3$coeff[2], digits = 2)
books <- round(model3$coeff[3], digits = 2)
interaction <- round(model3$coeff[4], digits = 2)

# Write the algebraic equation representing the results of this analysis
# three times. The first time, write it in standard form.
print(paste("GRADE = ", b0, " + ", attend, "*ATTEND", " + ", books, "*BOOKS + ", 
    interaction, "*ATTEND*BOOKS + e", sep = ""))
## [1] "GRADE = 55.22 + -0.14*ATTEND + -6.21*BOOKS + 0.73*ATTEND*BOOKS + e"
# The second time, rearrange it so that you can easily see the conditional
# slope of BOOKS.
print(paste("GRADE = ", b0, " + ", attend, "*ATTEND", " + (", books, " + ", 
    interaction, "*ATTEND)*BOOKS + e", sep = ""))
## [1] "GRADE = 55.22 + -0.14*ATTEND + (-6.21 + 0.73*ATTEND)*BOOKS + e"
# The third time, rearrange again so you can easily see the conditional
# slope of ATTEND.
print(paste("GRADE = ", b0, " + (", attend, " + ", interaction, "*BOOKS)*ATTEND + ", 
    books, "*BOOKS + e", sep = ""))
## [1] "GRADE = 55.22 + (-0.14 + 0.73*BOOKS)*ATTEND + -6.21*BOOKS + e"


# Draw a graph with GRADE on the Y-axis and ATTEND on the X-axis. Draw 3
# lines depicting the regressions of ATTEND for students who have read 0, 2,
# and 4 books.
ggplot(data, aes(x = ATTEND, y = GRADE)) + geom_point(shape = 1) + geom_abline(aes(color = "0 books"), 
    intercept = b0, slope = attend, show_guide = TRUE) + geom_abline(aes(color = "2 books"), 
    intercept = b0 + 2 * books, slope = attend + 2 * interaction, show_guide = TRUE) + 
    geom_abline(aes(color = "4 books"), intercept = b0 + 4 * books, slope = attend + 
        4 * interaction, show_guide = TRUE) + guides(color = guide_legend(title = "Number of books read"))

plot of chunk unnamed-chunk-4


# Draw a graph with GRADE on the Y-axis and BOOKS on the X-axis. Draw 3
# lines depicting the regressions of BOOKS for students who have attended
# 10, 15, and 20 times.
ggplot(data, aes(x = BOOKS, y = GRADE)) + geom_point(shape = 1) + geom_abline(aes(color = "Attended 10 times"), 
    intercept = b0 + 10 * attend, slope = books + 10 * interaction, show_guide = TRUE) + 
    geom_abline(aes(color = "Attended 15 times"), intercept = b0 + 15 * attend, 
        slope = books + 15 * interaction, show_guide = TRUE) + geom_abline(aes(color = "Attended 20 times"), 
    intercept = b0 + 20 * attend, slope = books + 20 * interaction, show_guide = TRUE) + 
    guides(color = guide_legend(title = "Number of classes attended"))

plot of chunk unnamed-chunk-4

Repeat the analysis you ran for #3, only this time you should first center BOOKS and ATTEND around their means, and then regress GRADE on BOOKS(centered), ATTEND(centered), and their interaction.

data$ATTENDc <- data$ATTEND - mean(data$ATTEND)
data$BOOKSc <- data$BOOKS - mean(data$BOOKS)

# Run a multiple regression in which you regress GRADE on BOOKS, ATTEND, and
# the interaction of BOOKS with ATTEND
model4 <- lm(GRADE ~ ATTENDc * BOOKSc, data = data)
summary(model4)
## 
## Call:
## lm(formula = GRADE ~ ATTENDc * BOOKSc, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24.84 -11.85   1.47   8.16  34.20 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      61.602      2.319   26.57   <2e-16 ***
## ATTENDc           1.333      0.562    2.37    0.023 *  
## BOOKSc            4.155      1.678    2.48    0.018 *  
## ATTENDc:BOOKSc    0.735      0.349    2.10    0.042 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.4 on 36 degrees of freedom
## Multiple R-squared:  0.402,  Adjusted R-squared:  0.352 
## F-statistic: 8.07 on 3 and 36 DF,  p-value: 0.000306
b0 <- round(model4$coeff[1], digits = 2)
attend <- round(model4$coeff[2], digits = 2)
books <- round(model4$coeff[3], digits = 2)
interaction <- round(model4$coeff[4], digits = 2)


# Write the algebraic equation representing the results of this analysis
# three times. The first time, write it in standard form.
print(paste("GRADE = ", b0, " + ", attend, "*ATTENDc", " + ", books, "*BOOKSc + ", 
    interaction, "*ATTENDc*BOOKSc + e", sep = ""))
## [1] "GRADE = 61.6 + 1.33*ATTENDc + 4.15*BOOKSc + 0.73*ATTENDc*BOOKSc + e"
# The second time, rearrange it so that you can easily see the conditional
# slope of BOOKSc.
print(paste("GRADE = ", b0, " + ", attend, "*ATTENDc", " + (", books, " + ", 
    interaction, "*ATTENDc)*BOOKSc + e", sep = ""))
## [1] "GRADE = 61.6 + 1.33*ATTENDc + (4.15 + 0.73*ATTENDc)*BOOKSc + e"
# The third time, rearrange again so you can easily see the conditional
# slope of ATTENDc.
print(paste("GRADE = ", b0, " + (", attend, " + ", interaction, "*BOOKSc)*ATTENDc + ", 
    books, "*BOOKSc + e", sep = ""))
## [1] "GRADE = 61.6 + (1.33 + 0.73*BOOKSc)*ATTENDc + 4.15*BOOKSc + e"


# Draw a graph with GRADE on the Y-axis and ATTENDc on the X-axis. Draw 3
# lines depicting the regressions of ATTENDc for students who have read 0,
# 2, and 4 booksc.
books0c <- 0 - mean(data$BOOKS)
books2c <- 2 - mean(data$BOOKS)
books4c <- 4 - mean(data$BOOKS)

ggplot(data, aes(x = ATTENDc, y = GRADE)) + geom_point(shape = 1) + geom_abline(aes(color = "0 books"), 
    intercept = (b0 + books0c * books), slope = (attend + books0c * interaction), 
    show_guide = TRUE) + geom_abline(aes(color = "2 books"), intercept = (b0 + 
    books2c * books), slope = (attend + books2c * interaction), show_guide = TRUE) + 
    geom_abline(aes(color = "4 books"), intercept = (b0 + books4c * books), 
        slope = (attend + books4c * interaction), show_guide = TRUE) + guides(color = guide_legend(title = "Number of books read"))

plot of chunk unnamed-chunk-5


# Draw a graph with GRADE on the Y-axis and BOOKS on the X-axis. Draw 3
# lines depicting the regressions of BOOKS for students who have attended
# 10, 15, and 20 times.
attend10c <- 10 - mean(data$ATTEND)
attend15c <- 15 - mean(data$ATTEND)
attend20c <- 20 - mean(data$ATTEND)

ggplot(data, aes(x = BOOKSc, y = GRADE)) + geom_point(shape = 1) + geom_abline(aes(color = "Attended 10 times"), 
    intercept = (b0 + attend10c * attend), slope = (books + attend10c * interaction), 
    show_guide = TRUE) + geom_abline(aes(color = "Attended 15 times"), intercept = (b0 + 
    attend15c * attend), slope = (books + attend15c * interaction), show_guide = TRUE) + 
    geom_abline(aes(color = "Attended 20 times"), intercept = (b0 + attend20c * 
        attend), slope = (books + attend20c * interaction), show_guide = TRUE) + 
    guides(color = guide_legend(title = "Number of classes attended"))

plot of chunk unnamed-chunk-5

Repeat #3, only this time you should first z-score all three variables (GRADE, BOOKS, and ATTEND), and then run the regression on the z-scored variables, including the product of z-BOOKS times z-ATTEND.

data$ATTENDz <- (data$ATTEND - mean(data$ATTEND))/sd(data$ATTEND)
data$BOOKSz <- (data$BOOKS - mean(data$BOOKS))/sd(data$BOOKS)

# Run a multiple regression in which you regress GRADE on BOOKSz, ATTENDz,
# and the interaction of BOOKSz with ATTENDz
model5 <- lm(GRADE ~ ATTENDz * BOOKSz, data = data)
summary(model5)
## 
## Call:
## lm(formula = GRADE ~ ATTENDz * BOOKSz, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -24.84 -11.85   1.47   8.16  34.20 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       61.60       2.32   26.57   <2e-16 ***
## ATTENDz            5.70       2.40    2.37    0.023 *  
## BOOKSz             5.95       2.40    2.48    0.018 *  
## ATTENDz:BOOKSz     4.50       2.14    2.10    0.042 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.4 on 36 degrees of freedom
## Multiple R-squared:  0.402,  Adjusted R-squared:  0.352 
## F-statistic: 8.07 on 3 and 36 DF,  p-value: 0.000306
b0 <- round(model5$coeff[1], digits = 2)
attend <- round(model5$coeff[2], digits = 2)
books <- round(model5$coeff[3], digits = 2)
interaction <- round(model5$coeff[4], digits = 2)


# Write the algebraic equation representing the results of this analysis
# three times. The first time, write it in standard form.
print(paste("GRADE = ", b0, " + ", attend, "*ATTENDz", " + ", books, "*BOOKSz + ", 
    interaction, "*ATTENDz*BOOKSz + e", sep = ""))
## [1] "GRADE = 61.6 + 5.7*ATTENDz + 5.95*BOOKSz + 4.5*ATTENDz*BOOKSz + e"
# The second time, rearrange it so that you can easily see the conditional
# slope of BOOKSz.
print(paste("GRADE = ", b0, " + ", attend, "*ATTENDz", " + (", books, " + ", 
    interaction, "*ATTENDz)*BOOKSz + e", sep = ""))
## [1] "GRADE = 61.6 + 5.7*ATTENDz + (5.95 + 4.5*ATTENDz)*BOOKSz + e"
# The third time, rearrange again so you can easily see the conditional
# slope of ATTENDz.
print(paste("GRADE = ", b0, " + (", attend, " + ", interaction, "*BOOKSz)*ATTENDz + ", 
    books, "*BOOKSz + e", sep = ""))
## [1] "GRADE = 61.6 + (5.7 + 4.5*BOOKSz)*ATTENDz + 5.95*BOOKSz + e"


# Draw a graph with GRADE on the Y-axis and ATTEND on the X-axis. Draw 3
# lines depicting the regressions of ATTEND: one line for students who have
# read an average number of books, one line for students whose value on
# BOOKS is 1 standard deviation below the mean, and one line for students
# whose value on BOOKS is 1 standard deviation above the mean.
ggplot(data, aes(x = ATTENDz, y = GRADE)) + geom_point(shape = 1) + geom_abline(aes(color = "low books (-1SD)"), 
    intercept = b0 + -1 * books, slope = attend + -1 * interaction, show_guide = TRUE) + 
    geom_abline(aes(color = "ave. books (mean)"), intercept = b0, slope = attend, 
        show_guide = TRUE) + geom_abline(aes(color = "high books (+1SD)"), intercept = b0 + 
    1 * books, slope = attend + 1 * interaction, show_guide = TRUE) + guides(color = guide_legend(title = "Number of books read"))

plot of chunk unnamed-chunk-6



# Draw a graph with GRADE on the Y-axis and BOOKS on the X-axis. Draw 3
# lines depicting the regressions of BOOKS: one line for students who have
# attended an average number of times, one line for students whose
# attendance is 1 standard deviation below the mean, and one line for
# students whose attendance is 1 standard deviation above the mean.
ggplot(data, aes(x = BOOKSz, y = GRADE)) + geom_point(shape = 1) + geom_abline(aes(colour = "low attendance (-1SD)"), 
    intercept = b0 + -1 * attend, slope = books + -1 * interaction, size = 1, 
    fullrange = T, show_guide = TRUE) + geom_abline(aes(colour = "ave. attendance (mean)"), 
    intercept = b0, slope = books, size = 1, fullrange = T, show_guide = TRUE) + 
    geom_abline(aes(colour = "high attendance (+1SD)"), intercept = b0 + 1 * 
        attend, slope = books + 1 * interaction, size = 1, fullrange = T, show_guide = TRUE) + 
    guides(colour = guide_legend(title = "Attendance"))

plot of chunk unnamed-chunk-6

Dataset source: http://www.jeremymiles.co.uk/regressionbook/data/