# Installing required packages
# requireNamespace() only checks whether a package is installed; it does not
# attach it. Attaching is done by the explicit library() calls below, so the
# packages are available even on a run where they had to be installed first.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
# Attaching packages (recorded console output follows each call)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# ggplot2 is already attached by tidyverse; kept explicit because the plots
# below depend on it directly.
library(ggplot2)
# Load the data set from its CSV source and preview the first ten rows
mydata <- read.csv(
  file = "https://raw.githubusercontent.com/drkblake/Data/main/Educ_Income_2022.csv"
) # Edit YOURFILENAME.csv
head(mydata, n = 10)
## GEOID County State PctCollege FamIncome
## 1 47001 Anderson County Tennessee 24.5 75637
## 2 47003 Bedford County Tennessee 17.1 71159
## 3 47005 Benton County Tennessee 11.1 65800
## 4 47007 Bledsoe County Tennessee 10.3 59695
## 5 47009 Blount County Tennessee 26.0 85194
## 6 47011 Bradley County Tennessee 23.9 75270
## 7 47013 Campbell County Tennessee 12.9 61629
## 8 47015 Cannon County Tennessee 17.5 71000
## 9 47017 Carroll County Tennessee 19.8 68542
## 10 47019 Carter County Tennessee 21.1 61776
# Copy the chosen columns into generic DV / IV columns used by the rest of
# the script
mydata[["DV"]] <- mydata[["FamIncome"]] # Edit YOURDVNAME
mydata[["IV"]] <- mydata[["PctCollege"]] # Edit YOURIVNAME
# Histograms of the DV and the IV, one plot each
ggplot(data = mydata, mapping = aes(x = DV)) +
  geom_histogram(fill = "#1f78b4", color = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(data = mydata, mapping = aes(x = IV)) +
  geom_histogram(fill = "#1f78b4", color = "black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Creating and summarizing an initial regression model called myreg, and checking for bivariate outliers.
# A large scipen penalty makes R print numbers in fixed rather than
# scientific notation (see the p-values in the summary below).
options(scipen = 999)
# The model is fitted once; the DV/IV columns already exist on mydata, so
# they are not re-assigned here.
myreg <- lm(DV ~ IV, data = mydata)
# Scatterplot of DV against IV, with the fitted line taken from myreg
# instead of refitting the same model a second time.
plot(mydata$IV, mydata$DV)
abline(reg = myreg)
summary(myreg)
##
## Call:
## lm(formula = DV ~ IV, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15380.9 -5904.4 538.2 4730.4 17582.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44664.55 1903.16 23.47 <0.0000000000000002 ***
## IV 1349.18 90.88 14.85 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7516 on 93 degrees of freedom
## Multiple R-squared: 0.7032, Adjusted R-squared: 0.7001
## F-statistic: 220.4 on 1 and 93 DF, p-value: < 0.00000000000000022
# Leverage (hat value) of every observation in the initial model; the row
# names of `leverage` are the row names of the observations used in the fit.
leverage <- as.data.frame(hatvalues(myreg))
# Namespace-qualified so the call does not depend on tibble being attached.
tibble::view(leverage)
# Drop the single highest-leverage observation, matched by row name rather
# than by a hard-coded row position.
# NOTE(review): the original removed row 94 directly; the recorded output
# below assumes the highest-leverage observation is that same row — confirm.
outlier_id <- rownames(leverage)[which.max(leverage[[1]])]
mydata <- mydata[rownames(mydata) != outlier_id, ]
# Refit on the reduced data. The DV/IV columns survive the row subsetting,
# so they do not need to be re-created.
myreg <- lm(DV ~ IV, data = mydata)
# Scatterplot with the fitted line from the refitted model
plot(mydata$IV, mydata$DV)
abline(reg = myreg)
summary(myreg)
##
## Call:
## lm(formula = DV ~ IV, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15004.2 -4596.6 161.3 4151.4 18594.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47333.4 2057.6 23.00 <0.0000000000000002 ***
## IV 1196.4 102.6 11.66 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7242 on 92 degrees of freedom
## Multiple R-squared: 0.5965, Adjusted R-squared: 0.5921
## F-statistic: 136 on 1 and 92 DF, p-value: < 0.00000000000000022
The data show a significant positive relationship between family income and the percentage of people in college. The scatter plot shows that higher-income counties generally have a higher percentage of people in college. The one outlier was Williamson County, which was very high in both the level of college students and family income. When it was omitted, the overall pattern in the scatter plot did not change much: the relationship remained positive and statistically significant, although R-squared dropped from about 0.70 to about 0.60. The conclusion to be reached here is that the more money a county has, the more of its citizens end up with a college education.