# Installing required packages
# Try to attach dplyr; install it from the configured repository if that fails.
if (!require("dplyr")) {
  install.packages("dplyr")
}
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Install tidyverse when it cannot be attached, then attach it.
# require() only attaches a package that is already installed, so without the
# explicit library() call a fresh install would leave tidyverse unattached for
# the rest of this run, and the later view() call (provided through the
# tidyverse's tibble package) would not be found.
if (!require("tidyverse")) {
  install.packages("tidyverse")
  library(tidyverse)
}
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Attach dplyr and ggplot2 (ggplot2 draws the histograms below).
library("dplyr")
library("ggplot2")

# Read the data
# Download the CSV into the data frame `mydata`.
# Edit the URL below to point at your own CSV file.
mydata <- read.csv(
  file = "https://raw.githubusercontent.com/drkblake/Data/main/Educ_Income_2022.csv"
)
# Preview the first 10 rows.
head(mydata, n = 10)
##    GEOID          County     State PctCollege FamIncome
## 1  47001 Anderson County Tennessee       24.5     75637
## 2  47003  Bedford County Tennessee       17.1     71159
## 3  47005   Benton County Tennessee       11.1     65800
## 4  47007  Bledsoe County Tennessee       10.3     59695
## 5  47009   Blount County Tennessee       26.0     85194
## 6  47011  Bradley County Tennessee       23.9     75270
## 7  47013 Campbell County Tennessee       12.9     61629
## 8  47015   Cannon County Tennessee       17.5     71000
## 9  47017  Carroll County Tennessee       19.8     68542
## 10 47019   Carter County Tennessee       21.1     61776
# Specify the DV and IV
# DV (dependent variable) is a copy of FamIncome; edit the column name to
# analyse a different outcome.
mydata[["DV"]] <- mydata[["FamIncome"]]
# IV (independent variable) is a copy of PctCollege; edit the column name to
# analyse a different predictor.
mydata[["IV"]] <- mydata[["PctCollege"]]

# Look at the DV and IV
# Histogram of the DV, using ggplot2's default binning.
ggplot(data = mydata, mapping = aes(x = DV)) +
  geom_histogram(colour = "black", fill = "#1f78b4")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of the IV, using ggplot2's default binning.
ggplot(data = mydata, mapping = aes(x = IV)) +
  geom_histogram(colour = "black", fill = "#1f78b4")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Fit the initial regression model, myreg, and draw a scatterplot of the IV
# against the DV to check for bivariate outliers.
# A large scipen penalty makes R prefer fixed over scientific notation.
options(scipen = 999)
myreg <- lm(formula = DV ~ IV, data = mydata)
plot(x = mydata$IV, y = mydata$DV)


# Scatterplot of the IV against the DV with the fitted regression line,
# followed by the model summary.
# mydata$DV, mydata$IV and myreg are already defined earlier in the script, so
# they are reused here instead of being re-created, and abline() draws the line
# straight from myreg rather than fitting the same model again.
plot(mydata$IV, mydata$DV)
abline(myreg)

summary(myreg)
## 
## Call:
## lm(formula = DV ~ IV, data = mydata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15380.9  -5904.4    538.2   4730.4  17582.3 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 44664.55    1903.16   23.47 <0.0000000000000002 ***
## IV           1349.18      90.88   14.85 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7516 on 93 degrees of freedom
## Multiple R-squared:  0.7032, Adjusted R-squared:  0.7001 
## F-statistic: 220.4 on 1 and 93 DF,  p-value: < 0.00000000000000022
# Leverage (hat value) of every observation in myreg, opened for inspection.
leverage <- as.data.frame(hatvalues(myreg))
view(leverage)

# Drop the outlier by county name rather than by row position, so the intended
# county is removed even if the rows are reordered or filtered upstream.
# NOTE(review): the original removed row 94; this assumes that row is
# Williamson County, as the write-up states -- confirm against the data.
is_outlier <- mydata$County %in% "Williamson County"
# Fail loudly if the county is missing or duplicated instead of silently
# leaving the data unchanged or dropping several rows.
stopifnot(sum(is_outlier) == 1L)
mydata <- mydata[!is_outlier, ]


# Refit the regression on the current mydata (after the outlier row has been
# dropped) and redraw the scatterplot with the new fitted line.
# The DV and IV columns survive a row subset, so they are not re-created here.
myreg <- lm(DV ~ IV,
            data = mydata)
plot(mydata$IV, mydata$DV)
# Draw the line from myreg instead of fitting the same model a second time.
abline(myreg)

summary(myreg)
## 
## Call:
## lm(formula = DV ~ IV, data = mydata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15004.2  -4596.6    161.3   4151.4  18594.8 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  47333.4     2057.6   23.00 <0.0000000000000002 ***
## IV            1196.4      102.6   11.66 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7242 on 92 degrees of freedom
## Multiple R-squared:  0.5965, Adjusted R-squared:  0.5921 
## F-statistic:   136 on 1 and 92 DF,  p-value: < 0.00000000000000022

The data show a significant positive relationship between family income and the percentage of people in college. The scatter plot shows that higher-income counties generally have more of their residents in college. The one outlier was Williamson County, which was very high in both the level of college students and family income. When it was omitted, the overall pattern in the scatter plot did not change much: the relationship remained statistically significant, although the slope fell from about 1,349 to about 1,196 and R-squared fell from 0.70 to 0.60. The conclusion to be reached here is that the more money a county has, the more of its citizens end up with a college education.