# Homework 2: In RStudio, open and edit this R script from the SAT scores
# example to add an overall regression line of SAT score on teacher's salary
# to the three separate regression lines of the same variables for states with
# a low, medium, and high percentage of students taking the SAT exam.
# Read the SAT data from the JSE archive and attach the eight column names
# in a single step.
dta <- setNames(
  read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt"),
  c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math", "Sat")
)
# Inspect the structure of the data frame (recorded output follows).
str(dta)
## 'data.frame': 50 obs. of 8 variables:
## $ State : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Expend: num 4.41 8.96 4.78 4.46 4.99 ...
## $ Ratio : num 17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
## $ Salary: num 31.1 48 32.2 28.9 41.1 ...
## $ Frac : int 8 47 27 6 45 29 81 68 48 65 ...
## $ Verbal: int 491 445 448 482 417 462 431 429 420 406 ...
## $ Math : int 538 489 496 523 485 518 477 468 469 448 ...
## $ Sat : int 1029 934 944 1005 902 980 908 897 889 854 ...
# Show the first six rows (recorded output follows).
head(dta, n = 6)
## State Expend Ratio Salary Frac Verbal Math Sat
## 1 Alabama 4.405 17.2 31.144 8 491 538 1029
## 2 Alaska 8.963 17.6 47.951 47 445 489 934
## 3 Arizona 4.778 19.3 32.175 27 448 496 944
## 4 Arkansas 4.459 17.1 28.934 6 482 523 1005
## 5 California 4.992 24.0 41.078 45 417 485 902
## 6 Colorado 5.443 18.4 34.571 29 462 518 980
# load data management and plotting package
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2 √ purrr 0.3.4
## √ tibble 3.0.4 √ dplyr 1.0.2
## √ tidyr 1.1.2 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Derive Fracf, a three-level factor, by binning Frac into the right-closed
# intervals (0, 22], (22, 49] and (49, 81], labelled Low / Medium / High.
dta <- dta %>%
  mutate(
    Fracf = cut(
      Frac,
      breaks = c(0, 22, 49, 81),
      labels = c("Low", "Medium", "High")
    )
  )
# Plot SAT score against teacher salary with each state drawn as a text label,
# plus one linear regression line per Fracf level (Low / Medium / High).
# `group = Fracf` in the top-level aes() is what splits the fit into three lines.
# NOTE(review): on ggplot2 >= 3.4.0 `size` for lines is deprecated in favour of
# `linewidth`; `size` is kept here to match the ggplot2 3.3.2 session above.
ggplot(data = dta, aes(x = Salary, y = Sat, label = State, group = Fracf)) +
  # Per-group least-squares lines: dashed gray, no confidence band.
  stat_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,  # spelled out: `F` is an ordinary variable that can be reassigned
    color = "gray",
    linetype = 2,
    size = rel(.5)
  ) +
  # State names coloured by Fracf; check_overlap skips labels that would
  # overlap ones already drawn.
  geom_text(
    aes(color = Fracf),
    check_overlap = TRUE,
    show.legend = FALSE,
    size = rel(2)
  ) +
  labs(
    x = "Salary ($1000)",
    y = "SAT Score"
  ) +
  theme_bw()
# Same plot as before, now with an overall regression line added.
# The grouping is moved out of the top-level aes() and into the second
# stat_smooth() layer, so the first layer fits a single line to all states
# while the second still fits one line per Fracf level.
# NOTE(review): on ggplot2 >= 3.4.0 `size` for lines is deprecated in favour of
# `linewidth`; `size` is kept here to match the ggplot2 3.3.2 session above.
ggplot(data = dta, aes(x = Salary, y = Sat, label = State)) +
  # Overall least-squares line across all states: dotted black, no band.
  stat_smooth(
    method = "lm",
    formula = y ~ x,
    se = FALSE,  # spelled out: `F` is an ordinary variable that can be reassigned
    color = "black",
    linetype = 3,
    size = rel(.5)
  ) +
  # Separate least-squares lines per Fracf level: dashed gray, no band.
  stat_smooth(
    aes(group = Fracf),
    method = "lm",
    formula = y ~ x,
    se = FALSE,
    color = "gray",
    linetype = 2,
    size = rel(.5)
  ) +
  # State names coloured by Fracf; check_overlap skips labels that would
  # overlap ones already drawn.
  geom_text(
    aes(color = Fracf),
    check_overlap = TRUE,
    show.legend = FALSE,
    size = rel(2)
  ) +
  labs(
    x = "Salary ($1000)",
    y = "SAT Score"
  ) +
  theme_bw()
# Add a trend line to the scatter plot
#abline(lm(Salary ~ Sat))