Problem statement

In RStudio, open and edit the R script from the SAT scores example so that the plot shows an overall regression line of SAT score on teacher’s salary in addition to the three separate regression lines of the same variables for states with low, medium, and high percentages of students taking the SAT exam.

Data management

# read the whitespace-delimited SAT data set from the JSE archive
dta <- utils::read.table(
  file = "http://www.amstat.org/publications/jse/datasets/sat.dat.txt"
)
# show the first 6 rows as a formatted table
knitr::kable(head(dta, n = 6L))
V1 V2 V3 V4 V5 V6 V7 V8
Alabama 4.405 17.2 31.144 8 491 538 1029
Alaska 8.963 17.6 47.951 47 445 489 934
Arizona 4.778 19.3 32.175 27 448 496 944
Arkansas 4.459 17.1 28.934 6 482 523 1005
California 4.992 24.0 41.078 45 417 485 902
Colorado 5.443 18.4 34.571 29 462 518 980
# give the eight columns descriptive names
colnames(dta) <- c(
  "State", "Expend", "Ratio", "Salary",
  "Frac", "Verbal", "Math", "Sat"
)
# inspect column types and the first few values of each column
utils::str(dta)
'data.frame':   50 obs. of  8 variables:
 $ State : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ Expend: num  4.41 8.96 4.78 4.46 4.99 ...
 $ Ratio : num  17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
 $ Salary: num  31.1 48 32.2 28.9 41.1 ...
 $ Frac  : int  8 47 27 6 45 29 81 68 48 65 ...
 $ Verbal: int  491 445 448 482 417 462 431 429 420 406 ...
 $ Math  : int  538 489 496 523 485 518 477 468 469 448 ...
 $ Sat   : int  1029 934 944 1005 902 980 908 897 889 854 ...
# show the first 6 rows again, now with the assigned column names
knitr::kable(head(dta, n = 6L))
State Expend Ratio Salary Frac Verbal Math Sat
Alabama 4.405 17.2 31.144 8 491 538 1029
Alaska 8.963 17.6 47.951 47 445 489 934
Arizona 4.778 19.3 32.175 27 448 496 944
Arkansas 4.459 17.1 28.934 6 482 523 1005
California 4.992 24.0 41.078 45 417 485 902
Colorado 5.443 18.4 34.571 29 462 518 980

Visualization

# load data management and plotting package
library(tidyverse)
# create a factor variable with 3 levels from Frac; cut() intervals are
# right-closed, so the groups are (0, 22] = Low, (22, 49] = Medium and
# (49, 81] = High
dta <- mutate(dta,
              Fracf = cut(Frac,
                          breaks = c(0, 22, 49, 81),
                          labels = c("Low", "Medium", "High")))
# plot SAT score against salary, using the state name as the point label
ggplot(data = dta, aes(x = Salary, y = Sat, label = State)) +
  # overall regression line of SAT score on salary (all states pooled):
  # black, dotted
  stat_smooth(method = "lm",
              formula = y ~ x,
              se = FALSE,
              color = "black",
              linetype = 3,
              size = rel(.5)) +
  # separate regression lines for the Low, Medium and High groups:
  # gray, dashed
  stat_smooth(aes(group = Fracf),
              method = "lm",
              formula = y ~ x,
              se = FALSE,
              color = "gray",
              linetype = 2,
              size = rel(.5)) +
  # state names colored by group; overlapping labels are dropped
  geom_text(aes(color = Fracf),
            check_overlap = TRUE,
            show.legend = FALSE,
            size = rel(2)) +
  labs(x = "Salary ($1000)",
       y = "SAT Score") +
  theme_bw()

The End