#ANgel Gallardo
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
nycemp <- read.csv("https://raw.githubusercontent.com/Angelogallardo05/Week6/main/nychist.csv")
head(nycemp)
## AREA YEAR SERIESCODE AREANAME INDUSTRY_TITLE JAN
## 1 93561 2023 0 New York City Total Nonfarm 4,575.1
## 2 93561 2023 5000000 New York City Total Private 4,004.1
## 3 93561 2023 6000000 New York City Goods Producing 197.7
## 4 93561 2023 7000000 New York City Service-Providing 4,377.4
## 5 93561 2023 8000000 New York City Private Service Providing 3,806.4
## 6 93561 2023 15000000 New York City Mining, Logging and Construction 141.6
## FEB MAR APR MAY JUN JUL AUG SEP OCT
## 1 4,623.3 4,649.7 4,647.8 4,679.5 4,695.2 4,681.4 4,665.2 4,690.1 4,725.7
## 2 4,052.3 4,075.3 4,076.5 4,107.7 4,126.6 4,103.1 4,088.7 4,113.5 4,149.3
## 3 202.5 203.8 206.6 211.0 213.8 215.1 215.4 217.5 219.5
## 4 4,420.8 4,445.9 4,441.2 4,468.5 4,481.4 4,466.3 4,449.8 4,472.6 4,506.2
## 5 3,849.8 3,871.5 3,869.9 3,896.7 3,912.8 3,888.0 3,873.3 3,896.0 3,929.8
## 6 145.6 146.4 149.2 153.5 155.7 157.7 157.9 159.8 161.7
## NOV DEC ANNUAL X JAN.1 FEB.1 MAR.1 APR.1 MAY.1 JUN.1 JUL.1 AUG.1
## 1 4,743.9 4,756.6 NA 215.4 190.2 181.0 150.6 159.6 149.1 50.4 39.7
## 2 4,171.2 4,181.5 NA 211.0 185.9 176.6 145.7 155.0 144.8 127.6 116.8
## 3 212.0 210.5 NA 5.3 7.2 6.4 6.4 9.6 10.1 13.0 13.2
## 4 4,531.9 4,546.1 NA 210.1 183.0 174.6 144.2 150.0 139.0 37.4 26.5
## 5 3,959.2 3,971.0 NA 205.7 178.7 170.2 139.3 145.4 134.7 114.6 103.6
## 6 154.4 153.7 NA 4.9 7.0 5.9 6.6 10.0 10.4 13.6 13.5
## SEP.1 OCT.1 NOV.1 DEC.1 AVG X.1 JAN.2 FEB.2 MAR.2 APR.2 MAY.2 JUN.2 JUL.2
## 1 127.0 95.9 72.6 53.6 NA NA 4.9% 4.3% 4.1% 3.3% 3.5% 3.3% 1.1%
## 2 112.3 91.2 73.6 54.9 NA NA 5.6% 4.8% 4.5% 3.7% 3.9% 3.6% 3.2%
## 3 14.8 16.0 5.1 5.8 NA NA 2.8% 3.7% 3.2% 3.2% 4.8% 5.0% 6.4%
## 4 112.2 79.9 67.5 47.8 NA NA 5.0% 4.3% 4.1% 3.4% 3.5% 3.2% 0.8%
## 5 97.5 75.2 68.5 49.1 NA NA 5.7% 4.9% 4.6% 3.7% 3.9% 3.6% 3.0%
## 6 15.0 16.3 6.1 7.5 NA NA 3.6% 5.1% 4.2% 4.6% 7.0% 7.2% 9.4%
## AUG.2 SEP.2 OCT.2 NOV.2 DEC.2 AVG.1 X.2
## 1 0.9% 2.8% 2.1% 1.6% 1.1% NA
## 2 2.9% 2.8% 2.2% 1.8% 1.3% NA
## 3 6.5% 7.3% 7.9% 2.5% 2.8% NA
## 4 0.6% 2.6% 1.8% 1.5% 1.1% NA
## 5 2.7% 2.6% 2.0% 1.8% 1.3% NA
## 6 9.3% 10.4% 11.2% 4.1% 5.1% NA
nycemp <- nycemp %>%
mutate(across(starts_with("JAN"):ends_with("ANNUAL"), ~as.numeric(gsub(",", "", .))))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(...)`.
## Caused by warning in `x:y`:
## ! numerical expression has 3 elements: only the first used
head(nycemp)
## AREA YEAR SERIESCODE AREANAME INDUSTRY_TITLE JAN
## 1 93561 2023 0 New York City Total Nonfarm 4575.1
## 2 93561 2023 5000000 New York City Total Private 4004.1
## 3 93561 2023 6000000 New York City Goods Producing 197.7
## 4 93561 2023 7000000 New York City Service-Providing 4377.4
## 5 93561 2023 8000000 New York City Private Service Providing 3806.4
## 6 93561 2023 15000000 New York City Mining, Logging and Construction 141.6
## FEB MAR APR MAY JUN JUL AUG SEP OCT NOV DEC
## 1 4623.3 4649.7 4647.8 4679.5 4695.2 4681.4 4665.2 4690.1 4725.7 4743.9 4756.6
## 2 4052.3 4075.3 4076.5 4107.7 4126.6 4103.1 4088.7 4113.5 4149.3 4171.2 4181.5
## 3 202.5 203.8 206.6 211.0 213.8 215.1 215.4 217.5 219.5 212.0 210.5
## 4 4420.8 4445.9 4441.2 4468.5 4481.4 4466.3 4449.8 4472.6 4506.2 4531.9 4546.1
## 5 3849.8 3871.5 3869.9 3896.7 3912.8 3888.0 3873.3 3896.0 3929.8 3959.2 3971.0
## 6 145.6 146.4 149.2 153.5 155.7 157.7 157.9 159.8 161.7 154.4 153.7
## ANNUAL X JAN.1 FEB.1 MAR.1 APR.1 MAY.1 JUN.1 JUL.1 AUG.1 SEP.1 OCT.1 NOV.1
## 1 NA NA 215.4 190.2 181.0 150.6 159.6 149.1 50.4 39.7 127.0 95.9 72.6
## 2 NA NA 211.0 185.9 176.6 145.7 155.0 144.8 127.6 116.8 112.3 91.2 73.6
## 3 NA NA 5.3 7.2 6.4 6.4 9.6 10.1 13.0 13.2 14.8 16.0 5.1
## 4 NA NA 210.1 183.0 174.6 144.2 150.0 139.0 37.4 26.5 112.2 79.9 67.5
## 5 NA NA 205.7 178.7 170.2 139.3 145.4 134.7 114.6 103.6 97.5 75.2 68.5
## 6 NA NA 4.9 7.0 5.9 6.6 10.0 10.4 13.6 13.5 15.0 16.3 6.1
## DEC.1 AVG X.1 JAN.2 FEB.2 MAR.2 APR.2 MAY.2 JUN.2 JUL.2 AUG.2 SEP.2 OCT.2
## 1 53.6 NA NA 4.9% 4.3% 4.1% 3.3% 3.5% 3.3% 1.1% 0.9% 2.8% 2.1%
## 2 54.9 NA NA 5.6% 4.8% 4.5% 3.7% 3.9% 3.6% 3.2% 2.9% 2.8% 2.2%
## 3 5.8 NA NA 2.8% 3.7% 3.2% 3.2% 4.8% 5.0% 6.4% 6.5% 7.3% 7.9%
## 4 47.8 NA NA 5.0% 4.3% 4.1% 3.4% 3.5% 3.2% 0.8% 0.6% 2.6% 1.8%
## 5 49.1 NA NA 5.7% 4.9% 4.6% 3.7% 3.9% 3.6% 3.0% 2.7% 2.6% 2.0%
## 6 7.5 NA NA 3.6% 5.1% 4.2% 4.6% 7.0% 7.2% 9.4% 9.3% 10.4% 11.2%
## NOV.2 DEC.2 AVG.1 X.2
## 1 1.6% 1.1% NA
## 2 1.8% 1.3% NA
## 3 2.5% 2.8% NA
## 4 1.5% 1.1% NA
## 5 1.8% 1.3% NA
## 6 4.1% 5.1% NA
nycemp <- nycemp %>%
select(-19:-47)
#fill the average colu,m
nycemp <- nycemp %>%
rowwise() %>%
mutate(ANNUAL = ifelse(is.na(ANNUAL), mean(c_across(JAN:DEC), na.rm = TRUE), ANNUAL)) %>%
ungroup()
head(nycemp)
## # A tibble: 6 × 18
## AREA YEAR SERIESCODE AREANAME INDUSTRY_TITLE JAN FEB MAR APR MAY
## <int> <int> <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 93561 2023 0 New York … Total Nonfarm 4575. 4623. 4650. 4648. 4680.
## 2 93561 2023 5000000 New York … Total Private 4004. 4052. 4075. 4076. 4108.
## 3 93561 2023 6000000 New York … Goods Produci… 198. 202. 204. 207. 211
## 4 93561 2023 7000000 New York … Service-Provi… 4377. 4421. 4446. 4441. 4468.
## 5 93561 2023 8000000 New York … Private Servi… 3806. 3850. 3872. 3870. 3897.
## 6 93561 2023 15000000 New York … Mining, Loggi… 142. 146. 146. 149. 154.
## # ℹ 8 more variables: JUN <dbl>, JUL <dbl>, AUG <dbl>, SEP <dbl>, OCT <dbl>,
## # NOV <dbl>, DEC <dbl>, ANNUAL <dbl>
nycemp_year <- nycemp %>%
group_by(INDUSTRY_TITLE, AREANAME, YEAR) %>%
summarise(ANNUAL = mean(ANNUAL)) %>%
select(INDUSTRY_TITLE, AREANAME, YEAR, ANNUAL)
## `summarise()` has grouped output by 'INDUSTRY_TITLE', 'AREANAME'. You can
## override using the `.groups` argument.
nycemp_filtered <- nycemp_year %>%
pivot_wider(names_from = YEAR, values_from = ANNUAL) %>%
mutate_all(~ replace(., is.na(.), 0)) %>%
select(INDUSTRY_TITLE, AREANAME, order(as.numeric(colnames(.))))
## `mutate_all()` ignored the following grouping variables:
## • Columns `INDUSTRY_TITLE`, `AREANAME`
## ℹ Use `mutate_at(df, vars(-group_cols()), myoperation)` to silence the message.
## Warning in order(as.numeric(colnames(.))): NAs introduced by coercion
last_10_years <- nycemp_filtered %>%
select(INDUSTRY_TITLE, AREANAME, `2014`:`2023`)
# Calculate percentage changes for each year
percentage_changes <- last_10_years %>%
mutate(
Percentage_Change_2015 = ((`2015` - `2014`) / `2014`) * 100,
Percentage_Change_2016 = ((`2016` - `2015`) / `2015`) * 100,
Percentage_Change_2017 = ((`2017` - `2016`) / `2016`) * 100,
Percentage_Change_2018 = ((`2018` - `2017`) / `2017`) * 100,
Percentage_Change_2019 = ((`2019` - `2018`) / `2018`) * 100,
Percentage_Change_2020 = ((`2020` - `2019`) / `2019`) * 100,
Percentage_Change_2021 = ((`2021` - `2020`) / `2020`) * 100,
Percentage_Change_2022 = ((`2022` - `2021`) / `2021`) * 100,
Percentage_Change_2023 = ((`2023` - `2022`) / `2022`) * 100
) %>%
select(-matches("^\\d{4}$")) # Remove original columns
highest_growth_2023 <- percentage_changes %>%
filter(Percentage_Change_2023 == max(Percentage_Change_2023)) %>%
arrange(desc(Percentage_Change_2023))
glimpse(percentage_changes)
## Rows: 122
## Columns: 11
## Groups: INDUSTRY_TITLE, AREANAME [122]
## $ INDUSTRY_TITLE <chr> "Accommodation", "Accommodation and Food Servic…
## $ AREANAME <chr> "New York City", "New York City", "New York Cit…
## $ Percentage_Change_2015 <dbl> 1.1857708, 5.1650367, 7.9497908, 3.5517759, 3.6…
## $ Percentage_Change_2016 <dbl> 2.3437500, 2.9061319, 5.2325581, 3.1400966, 3.0…
## $ Percentage_Change_2017 <dbl> 0.9541985, 3.9819260, 0.5524862, 2.9976581, 2.8…
## $ Percentage_Change_2018 <dbl> 0.3780718, 0.6518197, -1.6483516, 6.4574807, 6.…
## $ Percentage_Change_2019 <dbl> 0.5649718, 0.6475985, 3.1657356, 6.6211021, 6.3…
## $ Percentage_Change_2020 <dbl> -46.067416, -41.715818, -3.429603, -12.620192, …
## $ Percentage_Change_2021 <dbl> -3.4722222, 13.3394664, 4.2990654, 3.1636864, 2…
## $ Percentage_Change_2022 <dbl> 42.805755, 31.939935, 3.942652, 9.377778, 9.373…
## $ Percentage_Change_2023 <dbl> 12.8253568, 9.3227725, -2.9022989, 1.9233374, 2…
percentage_changes <- percentage_changes %>%
rename_with(~ gsub("Percentage_Change_", "", .), starts_with("Percentage_Change_"))
top_5_industries <- highest_growth_2023$INDUSTRY_TITLE[1:5]
top_5_growth <- percentage_changes %>%
filter(INDUSTRY_TITLE %in% top_5_industries) %>%
pivot_longer(cols = starts_with("2"),
names_to = "Year",
values_to = "Percentage_Change")
# Plot
ggplot(top_5_growth, aes(x = Year, y = Percentage_Change, group = INDUSTRY_TITLE, color = INDUSTRY_TITLE)) +
geom_line() +
geom_point() +
labs(title = "Top 5 Industries with Highest Growth Percentage (2014-2023)",
x = "Year",
y = "Percentage Change") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))