Project

library(LearnEDAfunctions)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Loading required package: ggplot2

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(vcd)

## Warning: package 'vcd' was built under R version 4.3.2

## Loading required package: grid

library(ggplot2)
ohio <- read.csv("C:/Users/eclai/Downloads/ohio.csv")

For this project, I will be using a data set called ohio which consists of three variables, each with 88 observations. County - this variable holds all the counties in Ohio Oct.2023 - this variable holds the number of people that are employed in October of 2023 Oct.2020 - this variable holds the number of people that are employed in October of 2020

Firstly, we will look at the data distribution We will use histograms and stemplots to do this and record our observations

head(ohio)

##             County Oct.2023 Oct.2020
## 1     ADAMS COUNTY    10600    10200
## 2     ALLEN COUNTY    44400    45400
## 3   ASHLAND COUNTY    25000    25700
## 4 ASHTABULA COUNTY    40700    41400
## 5    ATHENS COUNTY    25700    25000
## 6  AUGLAIZE COUNTY    23700    23500

ggplot(ohio, aes(Oct.2023)) +
geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(ohio, aes(Oct.2020)) +
geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

aplpack::stem.leaf(ohio$Oct.2023, depth=FALSE)

## 1 | 2: represents 12000
##  leaf unit: 1000
##             n: 88
##     0 | 4456688
##     1 | 001222222334566667778899
##     2 | 0011122234555555556889
##     3 | 04589
##     4 | 022478
##     5 | 28
##     6 | 06
##     7 | 78
##     8 | 029
##     9 | 36
##    10 | 6
##    11 | 2
##    12 | 13
## HI: 150900 174400 194000 194700 238700 256900 412300 588800 685900

aplpack::stem.leaf(ohio$Oct.2020, depth=FALSE)

## 1 | 2: represents 12000
##  leaf unit: 1000
##             n: 88
##     0 | 4556688
##     1 | 001222223334566667788889
##     2 | 0001122235555556667788
##     3 | 0348
##     4 | 0124458
##     5 | 189
##     6 | 7
##     7 | 589
##     8 | 179
##     9 | 3
##    10 | 19
##    11 | 45
## HI: 140000 174500 185400 195100 236100 253300 392800 546100 669500

hist(ohio$Oct.2023)

Quickly, we see that our data is pretty right skewed. We also notice some unusually high outliers.

Let us quickly calculate some letter values and find the fences:

fivenum(ohio$Oct.2023)

## [1]   4300  16800  25400  59550 685900

fivenum(ohio$Oct.2020)

## [1]   4300  16900  25500  59150 669500

mean(ohio$Oct.2023)

## [1] 63420.45

mean(ohio$Oct.2020)

## [1] 61846.59

for Oct.2023: median (M) = 25400
lower fourth (Fl): 16800
upper fourth (Fu): 59550 smallest observation (LO): 4300
highest observation (HI): 685900 mean = 63420.45 fourth spread = 59550 - 16800 = 42750 Step = 1.5 * 42750 = 64125 inner lower fence: Fl - step = 16800 - 64125 = -47325 inner upper fence: Fu - step = 59550 + 64125 = 123675 outer lower fence: Fl - 2 * step = 16800 - 2 * 64125 = -111450 outer upper fence: Fu - 2 * step = 59550 + 2 * 64125 = 187800

for Oct.2020: median (M) = 25500
lower fourth (Fl): 16900
upper fourth (Fu): 59150
smallest observation (LO): 4300
highest observation (HI): 669500 mean = 61846.59 fourth spread = 59150 - 16900 = 42250 Step = 1.5 * 42750 = 63375 inner lower fence: Fl - step = 16900 - 63375 = -46475 inner upper fence: Fu - step = 59150 + 63375 = 122525 outer lower fence: Fl - 2 * step = 16900 - 2 * 63375 = -109850 outer upper fence: Fu - 2 * step = 59150 + 2 * 63375 = 185900

hist(ohio$Oct.2023)

hist(ohio$Oct.2020)

Again, looking at both of our variables, it is clear that our data is not symmetric but rather right skewed.

roots23 <- sqrt(ohio$Oct.2023)
logs23 <- log(ohio$Oct.2023)
recroots23 <- - 1 / sqrt(ohio$Oct.2023)
hinkley(ohio$Oct.2023)

##         h 
## 0.9025627

hinkley(roots23)

##         h 
## 0.4407146

hinkley(logs23)

##         h 
## 0.1842239

hinkley(recroots23)

##          h 
## -0.0108887

symplot(ohio$Oct.2023)

symplot(roots23)

symplot(logs23)

symplot(recroots23)

roots20 <- sqrt(ohio$Oct.2020)
logs20 <- log(ohio$Oct.2020)
recroots20 <- - 1 / sqrt(ohio$Oct.2020)
hinkley(ohio$Oct.2020)

##         h 
## 0.8628271

hinkley(roots20)

##         h 
## 0.4249418

hinkley(logs20)

##         h 
## 0.1764004

hinkley(recroots20)

##           h 
## -0.01604501

symplot(ohio$Oct.2020)

symplot(roots20)

symplot(logs20)

symplot(recroots20)

hist(recroots23)

hist(recroots20)

Our goal is to find a Reexpression to achieve approximate symmetry. Using the hinkley function, we compute Hinkley’s measure for the roots, logs, and reciprocal roots. For both of our variables, Oct.2023 and Oct.2020, we see that the recroots reexpression was the most successful in achieving symmetry. Of course, it is still not entirely symmetric especially due to the high outliers, but there definitely is a sizeable improvement as compared to our raw data.

Since our two variables Oct.2023 and Oct.2020 hold the number of people employed for 2023 and 2020 in various counties in ohio, it seems reasonable to assume that there might be a relationship between them.

ggplot(ohio, aes(Oct.2023, Oct.2020)) +
geom_point()

ggplot(ohio, aes(recroots23, recroots20)) +
geom_point()

ggplot(ohio, aes(logs23, logs20)) +
geom_point()

rline(Oct.2023 ~ Oct.2020, ohio, 5)[c("a", "b", "xC")]

## $a
## [1] 25660.6
## 
## $b
## [1] 1.015993
## 
## $xC
## [1] 25500

ggplot(ohio, aes(Oct.2023, Oct.2020)) +
geom_point() +
geom_abline(slope = 1.015993,
intercept = -25500 * 1.015993 + 25660.6)

Clearly, there appears to be a linear relationship between Oct.2023 and Oct.2020, it seems reasonable to fit a resistant line here. y = a0 + b0(x − xC) y = 25660.6 + 1.015993(x - 25500) y = -247.2215 + 1.015993x

Project

Emily Weiland

2023-11-26