Homework 2: In RStudio, open and edit this R script from the SAT scores example to add an overall regression line of SAT score on teacher's salary to the three separate regression lines of the same variables, fitted for states with low, medium, and high percentages of students taking the SAT exam.

1 Data management

# Read the SAT data set from the JSE archive and label its eight columns
# in file order
dta <- setNames(
  read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt"),
  c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math", "Sat")
)

# Inspect dimensions and column types
str(dta)

## 'data.frame':    50 obs. of  8 variables:
##  $ State : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Expend: num  4.41 8.96 4.78 4.46 4.99 ...
##  $ Ratio : num  17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
##  $ Salary: num  31.1 48 32.2 28.9 41.1 ...
##  $ Frac  : int  8 47 27 6 45 29 81 68 48 65 ...
##  $ Verbal: int  491 445 448 482 417 462 431 429 420 406 ...
##  $ Math  : int  538 489 496 523 485 518 477 468 469 448 ...
##  $ Sat   : int  1029 934 944 1005 902 980 908 897 889 854 ...

# Preview the first six rows of the data
head(dta, n = 6)

##        State Expend Ratio Salary Frac Verbal Math  Sat
## 1    Alabama  4.405  17.2 31.144    8    491  538 1029
## 2     Alaska  8.963  17.6 47.951   47    445  489  934
## 3    Arizona  4.778  19.3 32.175   27    448  496  944
## 4   Arkansas  4.459  17.1 28.934    6    482  523 1005
## 5 California  4.992  24.0 41.078   45    417  485  902
## 6   Colorado  5.443  18.4 34.571   29    462  518  980

# load data management and plotting package
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## √ ggplot2 3.3.2     √ purrr   0.3.4
## √ tibble  3.0.4     √ dplyr   1.0.2
## √ tidyr   1.1.2     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Bin Frac into a three-level factor Fracf. cut() uses right-closed
# intervals by default: (0, 22] Low, (22, 49] Medium, (49, 81] High
dta <- dta %>%
  mutate(
    Fracf = cut(
      Frac,
      breaks = c(0, 22, 49, 81),
      labels = c("Low", "Medium", "High")
    )
  )

2 Plot

# Scatter of SAT score against teacher salary, with each state drawn as a
# text label and one least-squares line per Fracf level.
# group = Fracf in the top-level aes() is inherited by every layer; it is
# what splits stat_smooth() into separate fits.
ggplot(data = dta, aes(x = Salary, y = Sat, label = State, group = Fracf)) +
  stat_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,  # spelled out: F is an ordinary variable and can be reassigned
    color = "gray",
    linetype = 2,
    size = rel(.5)
  ) +
  geom_text(
    aes(color = Fracf),
    check_overlap = TRUE,  # skip labels that would overprint an earlier one
    show.legend = FALSE,
    size = rel(2)
  ) +
  labs(x = "Salary ($1000)", y = "SAT Score") +
  theme_bw()

3 Add an overall regression line

# Scatter of SAT score against teacher salary (states as text labels) with
# two sets of fitted lines:
#   - one least-squares line over all states (black, dotted)
#   - one least-squares line per Fracf level (gray, dashed)
# group is deliberately NOT set in the top-level aes(): the first
# stat_smooth() therefore pools every state, and only the second layer
# regroups by Fracf.
ggplot(data = dta, aes(x = Salary, y = Sat, label = State)) +
  stat_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,  # spelled out: F is an ordinary variable and can be reassigned
    color = "black",
    linetype = 3,
    size = rel(.5)
  ) +
  stat_smooth(
    aes(group = Fracf),
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    color = "gray",
    linetype = 2,
    size = rel(.5)
  ) +
  geom_text(
    aes(color = Fracf),
    check_overlap = TRUE,  # skip labels that would overprint an earlier one
    show.legend = FALSE,
    size = rel(2)
  ) +
  labs(x = "Salary ($1000)", y = "SAT Score") +
  theme_bw()

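# Add a trend line to the scatter plot (base-graphics alternative; it needs
# an existing plot of Sat against Salary, so the response Sat goes on the left)
#abline(lm(Sat ~ Salary, data = dta))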

W2 exercise2：SAT by states

Ching-Fang Wu

Thu Jan 14 13:52:43 2021

1 Data management

2 Plot

3 Add an overall regression line

3.1 THE END