# Read the whitespace-delimited SAT data from the URL below, supplying the
# eight column names at read time instead of renaming afterwards.
dta <- read.table(
  "http://www.amstat.org/publications/jse/datasets/sat.dat.txt",
  col.names = c("State", "Expend", "Ratio", "Salary",
                "Frac", "Verbal", "Math", "Sat")
)
# Print the type and leading values of every column.
str(object = dta)
## 'data.frame': 50 obs. of 8 variables:
## $ State : chr "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ Expend: num 4.41 8.96 4.78 4.46 4.99 ...
## $ Ratio : num 17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
## $ Salary: num 31.1 48 32.2 28.9 41.1 ...
## $ Frac : int 8 47 27 6 45 29 81 68 48 65 ...
## $ Verbal: int 491 445 448 482 417 462 431 429 420 406 ...
## $ Math : int 538 489 496 523 485 518 477 468 469 448 ...
## $ Sat : int 1029 934 944 1005 902 980 908 897 889 854 ...
# Preview the first six rows (n spelled out; 6 is also head()'s default).
head(dta, n = 6L)
## State Expend Ratio Salary Frac Verbal Math Sat
## 1 Alabama 4.405 17.2 31.144 8 491 538 1029
## 2 Alaska 8.963 17.6 47.951 47 445 489 934
## 3 Arizona 4.778 19.3 32.175 27 448 496 944
## 4 Arkansas 4.459 17.1 28.934 6 482 523 1005
## 5 California 4.992 24.0 41.078 45 417 485 902
## 6 Colorado 5.443 18.4 34.571 29 462 518 980
# load data management and plotting package
library(tidyverse)
## -- Attaching packages -------------------------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2 √ purrr 0.3.4
## √ tibble 3.0.3 √ dplyr 1.0.2
## √ tidyr 1.1.2 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## -- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Bin Frac into a three-level factor, Fracf.
# cut() intervals are right-closed, so the levels are
#   Low:    Frac <= 22
#   Medium: 22 < Frac <= 49
#   High:   Frac > 49
# The outer breaks are open-ended (-Inf / Inf) rather than the hard-coded
# 0 and 81, so a value outside (0, 81] still receives a level instead of
# silently becoming NA. Missing Frac values remain NA.
dta <- mutate(
  dta,
  Fracf = cut(
    Frac,
    breaks = c(-Inf, 22, 49, Inf),
    labels = c("Low", "Medium", "High")
  )
)
# Plot Sat against Salary, drawing each state as its name and adding one
# dashed gray least-squares line per Fracf level.
# group = Fracf is what makes stat_smooth() fit a separate line for each
# level; to show a single overall regression line instead, remove
# group = Fracf from aes().
ggplot(data = dta, aes(x = Salary, y = Sat, label = State, group = Fracf)) +
  stat_smooth(
    method = "lm",
    formula = y ~ x,
    # FALSE is spelled out: F is an ordinary variable that can be reassigned.
    se = FALSE,
    color = "gray",
    linetype = 2,
    # NOTE(review): ggplot2 >= 3.4.0 prefers linewidth for lines; size is kept
    # because the session output in this file shows ggplot2 3.3.2.
    size = rel(.5)
  ) +
  geom_text(
    aes(color = Fracf),     # text color distinguishes the three Fracf levels
    check_overlap = TRUE,   # skip labels that would overlap an earlier label
    show.legend = FALSE,
    size = rel(2)
  ) +
  labs(
    x = "Salary ($1000)",
    y = "SAT Score"
  ) +
  theme_bw()

# end