Problem statement
In RStudio, open and edit this R script from the SAT scores example so that the plot shows an overall regression line of SAT score on teachers' salary in addition to the three separate regression lines of the same variables for states with a low, medium, and high percentage of students taking the SAT exam.
Data management
# Read the SAT data set from the JSE data archive into a data frame
dta <- read.table(
  file = "http://www.amstat.org/publications/jse/datasets/sat.dat.txt"
)
# Preview the first six rows as a formatted table
knitr::kable(head(dta, n = 6L))
Alabama |
4.405 |
17.2 |
31.144 |
8 |
491 |
538 |
1029 |
Alaska |
8.963 |
17.6 |
47.951 |
47 |
445 |
489 |
934 |
Arizona |
4.778 |
19.3 |
32.175 |
27 |
448 |
496 |
944 |
Arkansas |
4.459 |
17.1 |
28.934 |
6 |
482 |
523 |
1005 |
California |
4.992 |
24.0 |
41.078 |
45 |
417 |
485 |
902 |
Colorado |
5.443 |
18.4 |
34.571 |
29 |
462 |
518 |
980 |
# Replace the default column names with descriptive ones, in column order
dta <- setNames(
  dta,
  c(
    "State", "Expend", "Ratio", "Salary",
    "Frac", "Verbal", "Math", "Sat"
  )
)
# Inspect column types and the first few values of each column
str(dta)
'data.frame': 50 obs. of 8 variables:
$ State : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
$ Expend: num 4.41 8.96 4.78 4.46 4.99 ...
$ Ratio : num 17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
$ Salary: num 31.1 48 32.2 28.9 41.1 ...
$ Frac : int 8 47 27 6 45 29 81 68 48 65 ...
$ Verbal: int 491 445 448 482 417 462 431 429 420 406 ...
$ Math : int 538 489 496 523 485 518 477 468 469 448 ...
$ Sat : int 1029 934 944 1005 902 980 908 897 889 854 ...
# Preview the first six rows again, now with the assigned column names
knitr::kable(head(dta, n = 6L))
Alabama |
4.405 |
17.2 |
31.144 |
8 |
491 |
538 |
1029 |
Alaska |
8.963 |
17.6 |
47.951 |
47 |
445 |
489 |
934 |
Arizona |
4.778 |
19.3 |
32.175 |
27 |
448 |
496 |
944 |
Arkansas |
4.459 |
17.1 |
28.934 |
6 |
482 |
523 |
1005 |
California |
4.992 |
24.0 |
41.078 |
45 |
417 |
485 |
902 |
Colorado |
5.443 |
18.4 |
34.571 |
29 |
462 |
518 |
980 |
Visualization
# load data management and plotting package
library(tidyverse)
# Bin Frac into a three-level factor; cut() uses right-closed intervals by
# default, so the bins are (0, 22], (22, 49] and (49, 81].
dta <- dta %>%
  mutate(
    Fracf = cut(
      Frac,
      breaks = c(0, 22, 49, 81),
      labels = c("Low", "Medium", "High")
    )
  )
# Plot SAT score against salary, drawing each state as a text label coloured
# by its Fracf category, with one pooled least-squares line and one
# least-squares line per Fracf category.
ggplot(data = dta, aes(x = Salary, y = Sat, label = State)) +
  # Overall regression line of SAT score on salary (no grouping: all states)
  stat_smooth(method = "lm",
              formula = y ~ x,
              se = FALSE,
              color = "black",
              linetype = 3,
              linewidth = rel(.5)) +
  # Separate regression lines, one per Fracf level (Low / Medium / High)
  stat_smooth(aes(group = Fracf),
              method = "lm",
              formula = y ~ x,
              se = FALSE,
              color = "gray",
              linetype = 2,
              linewidth = rel(.5)) +
  # State names serve as the plotting symbols; overlapping labels are dropped
  geom_text(aes(color = Fracf),
            check_overlap = TRUE,
            show.legend = FALSE,
            size = rel(2)) +
  labs(x = "Salary ($1000)",
       y = "SAT Score") +
  theme_bw()

The End