# Load the SAT data set from the JSE archive and label its columns.
# read.table() is called with its defaults, so the eight column names are
# attached (in file order) in the same expression that reads the file.
dta <- setNames(
  read.table("http://www.amstat.org/publications/jse/datasets/sat.dat.txt"),
  c("State", "Expend", "Ratio", "Salary", "Frac", "Verbal", "Math", "Sat")
)

# Inspect the structure of the data frame: dimensions, column types and a
# preview of each column's values (recorded console output follows as `##`).
str(dta)
## 'data.frame':    50 obs. of  8 variables:
##  $ State : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ Expend: num  4.41 8.96 4.78 4.46 4.99 ...
##  $ Ratio : num  17.2 17.6 19.3 17.1 24 18.4 14.4 16.6 19.1 16.3 ...
##  $ Salary: num  31.1 48 32.2 28.9 41.1 ...
##  $ Frac  : int  8 47 27 6 45 29 81 68 48 65 ...
##  $ Verbal: int  491 445 448 482 417 462 431 429 420 406 ...
##  $ Math  : int  538 489 496 523 485 518 477 468 469 448 ...
##  $ Sat   : int  1029 934 944 1005 902 980 908 897 889 854 ...
# Print the first six rows (six is also head()'s default row count).
head(dta, n = 6)
##        State Expend Ratio Salary Frac Verbal Math  Sat
## 1    Alabama  4.405  17.2 31.144    8    491  538 1029
## 2     Alaska  8.963  17.6 47.951   47    445  489  934
## 3    Arizona  4.778  19.3 32.175   27    448  496  944
## 4   Arkansas  4.459  17.1 28.934    6    482  523 1005
## 5 California  4.992  24.0 41.078   45    417  485  902
## 6   Colorado  5.443  18.4 34.571   29    462  518  980
# load data management and plotting package
library(tidyverse)
## -- Attaching packages -------------------------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2     √ purrr   0.3.4
## √ tibble  3.0.3     √ dplyr   1.0.2
## √ tidyr   1.1.2     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0
## -- Conflicts ----------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Bin Frac into a three-level factor, Fracf:
#   Low = [0, 22], Medium = (22, 49], High = (49, 100].
# The upper break is 100 instead of the previously hard-coded 81, and the
# lowest edge is closed (include.lowest = TRUE), so a Frac of exactly 0 or
# anything above 81 is still classified rather than silently becoming NA.
# Values in (0, 81] are binned exactly as before.
# Assumes Frac is a percentage on a 0-100 scale -- TODO confirm against the
# data set documentation.
dta <- mutate(dta, Fracf = cut(Frac, breaks = c(0, 22, 49, 100),
                               labels = c("Low", "Medium", "High"),
                               include.lowest = TRUE))

# Plot Sat against Salary, drawing each state as its name coloured by Fracf
# level, with a separate linear fit (y ~ x) for each of the three levels.
# To show a single overall regression line instead, drop `group = Fracf` from
# aes() so that stat_smooth() fits all states together.
ggplot(data = dta, aes(x = Salary, y = Sat, label = State, group = Fracf)) +
  # One dashed gray lm line per Fracf group, without a confidence band.
  # `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
  # NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for lines in favour of
  # `linewidth`; `size` is kept because the session output recorded in this
  # file shows ggplot2 3.3.2.
  stat_smooth(method = "lm",
              formula = y ~ x,
              se = FALSE,
              color = "gray",
              linetype = 2,
              size = rel(.5)) +
  # State names as the plotting symbol; check_overlap = TRUE skips any label
  # that would overlap one already drawn, so some states may not appear.
  geom_text(aes(color = Fracf),
            check_overlap = TRUE,
            show.legend = FALSE,
            size = rel(2)) +
  labs(x = "Salary ($1000)",
       y = "SAT Score") +
  theme_bw()

# end