library(LearnEDAfunctions)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ✔ readr 2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(vcd)
## Warning: package 'vcd' was built under R version 4.3.2
## Loading required package: grid
library(ggplot2)
ohio <- read.csv("C:/Users/eclai/Downloads/ohio.csv")
For this project, I will be using a data set called ohio which consists of three variables, each with 88 observations. County - this variable holds all the counties in Ohio Oct.2023 - this variable holds the number of people that are employed in October of 2023 Oct.2020 - this variable holds the number of people that are employed in October of 2020
Firstly, we will look at the data distribution We will use histograms and stemplots to do this and record our observations
head(ohio)
## County Oct.2023 Oct.2020
## 1 ADAMS COUNTY 10600 10200
## 2 ALLEN COUNTY 44400 45400
## 3 ASHLAND COUNTY 25000 25700
## 4 ASHTABULA COUNTY 40700 41400
## 5 ATHENS COUNTY 25700 25000
## 6 AUGLAIZE COUNTY 23700 23500
ggplot(ohio, aes(Oct.2023)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(ohio, aes(Oct.2020)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
aplpack::stem.leaf(ohio$Oct.2023, depth=FALSE)
## 1 | 2: represents 12000
## leaf unit: 1000
## n: 88
## 0 | 4456688
## 1 | 001222222334566667778899
## 2 | 0011122234555555556889
## 3 | 04589
## 4 | 022478
## 5 | 28
## 6 | 06
## 7 | 78
## 8 | 029
## 9 | 36
## 10 | 6
## 11 | 2
## 12 | 13
## HI: 150900 174400 194000 194700 238700 256900 412300 588800 685900
aplpack::stem.leaf(ohio$Oct.2020, depth=FALSE)
## 1 | 2: represents 12000
## leaf unit: 1000
## n: 88
## 0 | 4556688
## 1 | 001222223334566667788889
## 2 | 0001122235555556667788
## 3 | 0348
## 4 | 0124458
## 5 | 189
## 6 | 7
## 7 | 589
## 8 | 179
## 9 | 3
## 10 | 19
## 11 | 45
## HI: 140000 174500 185400 195100 236100 253300 392800 546100 669500
hist(ohio$Oct.2023)
Quickly, we see that our data is pretty right skewed. We also notice some unusually high outliers.
Let us quickly calculate some letter values and find the fences:
fivenum(ohio$Oct.2023)
## [1] 4300 16800 25400 59550 685900
fivenum(ohio$Oct.2020)
## [1] 4300 16900 25500 59150 669500
mean(ohio$Oct.2023)
## [1] 63420.45
mean(ohio$Oct.2020)
## [1] 61846.59
for Oct.2023: median (M) = 25400
lower fourth (Fl): 16800
upper fourth (Fu): 59550 smallest observation (LO): 4300
highest observation (HI): 685900 mean = 63420.45 fourth spread = 59550 -
16800 = 42750 Step = 1.5 * 42750 = 64125 inner lower fence: Fl - step =
16800 - 64125 = -47325 inner upper fence: Fu - step = 59550 + 64125 =
123675 outer lower fence: Fl - 2 * step = 16800 - 2 * 64125 = -111450
outer upper fence: Fu - 2 * step = 59550 + 2 * 64125 = 187800
for Oct.2020: median (M) = 25500
lower fourth (Fl): 16900
upper fourth (Fu): 59150
smallest observation (LO): 4300
highest observation (HI): 669500 mean = 61846.59 fourth spread = 59150 -
16900 = 42250 Step = 1.5 * 42750 = 63375 inner lower fence: Fl - step =
16900 - 63375 = -46475 inner upper fence: Fu - step = 59150 + 63375 =
122525 outer lower fence: Fl - 2 * step = 16900 - 2 * 63375 = -109850
outer upper fence: Fu - 2 * step = 59150 + 2 * 63375 = 185900
hist(ohio$Oct.2023)
hist(ohio$Oct.2020)
Again, looking at both of our variables, it is clear that our data is not symmetric but rather right skewed.
roots23 <- sqrt(ohio$Oct.2023)
logs23 <- log(ohio$Oct.2023)
recroots23 <- - 1 / sqrt(ohio$Oct.2023)
hinkley(ohio$Oct.2023)
## h
## 0.9025627
hinkley(roots23)
## h
## 0.4407146
hinkley(logs23)
## h
## 0.1842239
hinkley(recroots23)
## h
## -0.0108887
symplot(ohio$Oct.2023)
symplot(roots23)
symplot(logs23)
symplot(recroots23)
roots20 <- sqrt(ohio$Oct.2020)
logs20 <- log(ohio$Oct.2020)
recroots20 <- - 1 / sqrt(ohio$Oct.2020)
hinkley(ohio$Oct.2020)
## h
## 0.8628271
hinkley(roots20)
## h
## 0.4249418
hinkley(logs20)
## h
## 0.1764004
hinkley(recroots20)
## h
## -0.01604501
symplot(ohio$Oct.2020)
symplot(roots20)
symplot(logs20)
symplot(recroots20)
hist(recroots23)
hist(recroots20)
Our goal is to find a Reexpression to achieve approximate symmetry. Using the hinkley function, we compute Hinkley’s measure for the roots, logs, and reciprocal roots. For both of our variables, Oct.2023 and Oct.2020, we see that the recroots reexpression was the most successful in achieving symmetry. Of course, it is still not entirely symmetric especially due to the high outliers, but there definitely is a sizeable improvement as compared to our raw data.
Since our two variables Oct.2023 and Oct.2020 hold the number of people employed for 2023 and 2020 in various counties in ohio, it seems reasonable to assume that there might be a relationship between them.
ggplot(ohio, aes(Oct.2023, Oct.2020)) +
geom_point()
ggplot(ohio, aes(recroots23, recroots20)) +
geom_point()
ggplot(ohio, aes(logs23, logs20)) +
geom_point()
rline(Oct.2023 ~ Oct.2020, ohio, 5)[c("a", "b", "xC")]
## $a
## [1] 25660.6
##
## $b
## [1] 1.015993
##
## $xC
## [1] 25500
ggplot(ohio, aes(Oct.2023, Oct.2020)) +
geom_point() +
geom_abline(slope = 1.015993,
intercept = -25500 * 1.015993 + 25660.6)
Clearly, there appears to be a linear relationship between Oct.2023 and Oct.2020, it seems reasonable to fit a resistant line here. y = a0 + b0(x − xC) y = 25660.6 + 1.015993(x - 25500) y = -247.2215 + 1.015993x