# Install the tidyverse only when it is missing. requireNamespace() checks
# for the package without attaching it, so attaching happens exactly once,
# in the library() call below.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Read the 2022 and 2017 income tables straight from their CSV URLs.
Inc2022 <- read.csv("https://drkblake.com/wp-content/uploads/2024/02/Inc2022.csv")
Inc2017 <- read.csv("https://drkblake.com/wp-content/uploads/2024/02/Inc2017.csv")
Task 1: With these data frames created, you can now perform various
operations such as merging them, performing calculations, or visualizing
the data using functions provided by the
tidyverse package.
# Keep every row of Inc2017 and attach the Inc2022 columns that share its
# GEOID, then preview the first ten merged rows.
MergedData <- Inc2017 %>%
  left_join(Inc2022, by = join_by(GEOID))
MergedData %>%
  head(10)
## GEOID HHInc2017 District County HHInc2022 Significance
## 1 4702190022 44091 District 1 Cheatham County 59741 Significant
## 2 4702190212 55247 District 2 Cheatham County 73634 Significant
## 3 4702190402 73802 District 3 Cheatham County 89861 Nonsignificant
## 4 4702190592 49731 District 4 Cheatham County 73293 Significant
## 5 4702190782 60793 District 5 Cheatham County 78380 Significant
## 6 4702190972 66750 District 6 Cheatham County 92305 Significant
## 7 4703790038 56258 District 1 Davidson County 75038 Significant
## 8 4703790228 32487 District 2 Davidson County 54346 Significant
## 9 4703790418 49402 District 3 Davidson County 67953 Significant
## 10 4703790608 79381 District 4 Davidson County 106260 Significant
Task 2: The left_join() function is
used to merge the Inc2017 and
Inc2022 data frames. In a left join, all
rows from the left data frame (Inc2017)
are kept, and matching rows from the right data frame
(Inc2022) are attached.
# Change is the 2022 income minus the 2017 income for each district.
# Direction labels the sign of that change. The cut point is 0: any negative
# change is a loss, any positive change is a gain. The income columns hold
# whole-dollar values, so an exact comparison with 0 is safe here.
MergedData <- MergedData %>%
  mutate(
    Change = HHInc2022 - HHInc2017,
    Direction = case_when(
      Change < 0 ~ "Loss",
      Change == 0 ~ "No change",
      Change > 0 ~ "Gain",
      # Reached only when Change is NA (a district missing either estimate).
      .default = "Error"
    )
  )
Task 3: I subtracted the income estimates for 2017
(HHInc2017) from the income estimates for
2022 (HHInc2022) for each district.
# Add a Level column that flags whether each district's 2022 household income
# is at least $100,000. The comparison targets the HHInc2022 column only;
# comparing the whole data frame would test every column and overwrite GEOID.
MergedData <- MergedData %>%
  mutate(Level = ifelse(HHInc2022 >= 100000, "$100k+", "<$100k"))
head(MergedData, 10)
## GEOID HHInc2017 District County HHInc2022 Significance
## 1 4702190022 44091 District 1 Cheatham County 59741 Significant
## 2 4702190212 55247 District 2 Cheatham County 73634 Significant
## 3 4702190402 73802 District 3 Cheatham County 89861 Nonsignificant
## 4 4702190592 49731 District 4 Cheatham County 73293 Significant
## 5 4702190782 60793 District 5 Cheatham County 78380 Significant
## 6 4702190972 66750 District 6 Cheatham County 92305 Significant
## 7 4703790038 56258 District 1 Davidson County 75038 Significant
## 8 4703790228 32487 District 2 Davidson County 54346 Significant
## 9 4703790418 49402 District 3 Davidson County 67953 Significant
## 10 4703790608 79381 District 4 Davidson County 106260 Significant
## Change Direction Level
## 1 15650 Gain <$100k
## 2 18387 Gain <$100k
## 3 16059 Gain <$100k
## 4 23562 Gain <$100k
## 5 17587 Gain <$100k
## 6 25555 Gain <$100k
## 7 18780 Gain <$100k
## 8 21859 Gain <$100k
## 9 18551 Gain <$100k
## 10 26879 Gain $100k+
Task 4: The result is a modified MergedData data frame that includes the newly added “Level” variable indicating whether each district’s HHInc2022 figure is $100,000 or more, or less than $100,000.
# Count the districts in each county at each income level. Level is derived
# here from HHInc2022, so the tally groups on the income category itself and
# does not rely on any column created elsewhere.
LevelbyCounty <- MergedData %>%
  mutate(Level = ifelse(HHInc2022 >= 100000, "$100k+", "<$100k")) %>%
  group_by(County, Level) %>%
  # .groups = "drop" returns an ungrouped table with County, Level and Count.
  summarize(Count = n(), .groups = "drop")
head(LevelbyCounty, 10)
# NOTE(review): the table printed here by the earlier run was grouped on the
# GEOID column rather than on the income level; re-knit the document to print
# the County / Level / Count rows.
Task 5: This task was to create a new data frame. I used
summarize(Count = n()) to calculate the
count of districts in each county for each level category. The
n() function counts the number of
observations in each group.
# Keep only the districts whose 2022 household income is at least $100,000.
# The test is on HHInc2022 because District holds names such as "District 1"
# and can never equal the "$100k+" label.
RichDistricts <- filter(MergedData, HHInc2022 >= 100000)
head(RichDistricts, 10)
# NOTE(review): the earlier run printed zero rows here because of the District
# comparison; re-knit the document to print the matching districts.
Task 6: I used the filter() function because it keeps only the rows that meet a condition, which gives me just the rich districts.
Task 8: One thing that I noticed was that Davidson County has more wealthy areas than not, unlike Wilson County, which doesn’t have any wealthy areas. It was really interesting to see all the different incomes within one county, and it was also interesting to see the jumps from 2017 income to 2022 income.