# Installing required packages
# Install dplyr only when it is not already available, then attach it.
# requireNamespace() checks for the package without attaching it, and
# library() stops with an error if the package still cannot be loaded,
# instead of silently returning FALSE the way require() does.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Install tidyverse only when it is not already available, then attach it.
# With the previous require()-based check, a fresh install left the package
# unattached for the rest of the session; library() attaches it in both the
# already-installed and the just-installed case, and errors if loading fails.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
# Load the dataset from its CSV address into the data frame `mydata`.
# NOTE: You may edit the URL to load a different dataset
mydata <- read.csv(
  file = "https://raw.githubusercontent.com/drkblake/Data/main/Educ_Income_2022.csv"
)
# Preview the first ten rows to confirm the columns were read as expected.
head(mydata, n = 10)
## GEOID County State PctCollege FamIncome
## 1 47001 Anderson County Tennessee 24.5 75637
## 2 47003 Bedford County Tennessee 17.1 71159
## 3 47005 Benton County Tennessee 11.1 65800
## 4 47007 Bledsoe County Tennessee 10.3 59695
## 5 47009 Blount County Tennessee 26.0 85194
## 6 47011 Bradley County Tennessee 23.9 75270
## 7 47013 Campbell County Tennessee 12.9 61629
## 8 47015 Cannon County Tennessee 17.5 71000
## 9 47017 Carroll County Tennessee 19.8 68542
## 10 47019 Carter County Tennessee 21.1 61776
# Copy the analysis variables into generic DV / IV columns so the rest of
# the script can refer to them without knowing the original column names.
mydata[["DV"]] <- mydata[["PctCollege"]] #Edit YOURDVNAME
mydata[["IV"]] <- mydata[["FamIncome"]] #Edit YOURIVNAME
# Histogram of the dependent variable (DV) to inspect its distribution.
ggplot(data = mydata, mapping = aes(x = DV)) +
  geom_histogram(color = "black", fill = "#1f78b4")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of the independent variable (IV) to inspect its distribution.
ggplot(data = mydata, mapping = aes(x = IV)) +
  geom_histogram(color = "black", fill = "#1f78b4")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Creating and summarizing an initial regression model called myreg, and checking for bivariate outliers.
# A large scipen makes R strongly prefer fixed over scientific notation when
# printing, so coefficients and p-values appear as plain decimals.
options(scipen = 999)
# Regress the dependent variable (DV) on the independent variable (IV).
myreg <- lm(DV ~ IV, data = mydata)
# Scatterplot with IV on the x-axis and DV on the y-axis; points far from the
# fitted line are candidate bivariate outliers.
plot(mydata$IV, mydata$DV)
# Draw the fitted line from myreg itself rather than refitting the model, so
# the plotted line and the summary below always come from the same fit.
abline(myreg)
summary(myreg)
##
## Call:
## lm(formula = DV ~ IV, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0510 -3.1047 -0.3815 3.1929 15.9863
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.59952875 2.52111063 -6.981 0.000000000428 ***
## IV 0.00052124 0.00003511 14.846 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.672 on 93 degrees of freedom
## Multiple R-squared: 0.7032, Adjusted R-squared: 0.7001
## F-statistic: 220.4 on 1 and 93 DF, p-value: < 0.00000000000000022
Set up and run a regression analysis in R that investigates the
relationship between PctCollege and FamIncome,
then incorporate your code and output into an RMarkdown document, along
with a short description of what the results of the analysis mean.
Publish your RMarkdown document on your RPubs site, then submit the
published document’s URL using the Week
13 Lab drop box. The assignment is due by 11:59 p.m. on
Friday.
The code here fits a regression line and helps to identify bivariate outliers across a large group of counties, which is appropriate when both variables are continuous (family income and the percentage of people with a college degree). The results establish that, on average, counties with higher family income also have a higher percentage of college-educated residents. (The p-value for the IV is reported as < .0000000000000002, with the percentage of college graduates rising by about 0.0005 points for each additional dollar of family income,