# Load Libraries
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse) # This includes both dplyr and readr
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the data into a tibble
data <- read_csv("Data Manipulation with Dplyr.csv")
## Rows: 6048 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Indicator, Year, State, County, Race/Ethnicity, Sex, Cases
## dbl (1): FIPS
## num (1): Population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Cleaning the data
data_cleaned <- data %>%
mutate(
Year = gsub(" \\(COVID-19 Pandemic\\)", "", Year), # Remove "(COVID-19 Pandemic)" from Year
Year = as.numeric(Year), # Convert Year to numeric
Cases = as.character(Cases), # Convert Cases to character for cleaning
Cases = ifelse(Cases == "Data suppressed" | Cases == "Data not available", NA_character_, Cases),
Cases = as.numeric(Cases), # Convert Cases back to numeric, NA introduced for non-numeric values
Population = as.numeric(Population) # Ensure Population is numeric, convert NA to NA_character_ if necessary
) %>%
filter(!is.na(Population)) # Remove rows where Population is NA
# Continue with the data manipulation
# Adding a new variable for case rate per 100,000 population
data_cleaned <- data_cleaned %>%
mutate(Case_Rate = Cases / Population * 100000)
# Selecting relevant columns
selected_data <- data_cleaned %>%
select(Year, State, County, Case_Rate)
# Filtering the dataset for the year 2021
filtered_data_2021 <- selected_data %>%
filter(Year == 2021)
# Summarizing to find the total cases by State and County
total_cases_by_location <- data_cleaned %>%
group_by(State, County) %>%
summarise(Total_Cases = sum(Cases, na.rm = TRUE))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
# Arranging the dataset based on the Case Rate in descending order
arranged_by_case_rate <- selected_data %>%
arrange(desc(Case_Rate))
# Print the first few rows of the arranged data to check
print(arranged_by_case_rate)
## # A tibble: 5,922 × 4
## Year State County Case_Rate
## <dbl> <chr> <chr> <dbl>
## 1 2020 IA Des Moines-West Des Moines 820.
## 2 2019 NC Winston-Salem 794.
## 3 2021 WI Milwaukee-Waukesha 725.
## 4 2020 IN Indianapolis-Carmel-Anderson 717.
## 5 2021 NC Winston-Salem 606.
## 6 2019 NE-IA Omaha-Council Bluffs 439.
## 7 2021 NC Raleigh-Cary 412.
## 8 2019 IN Indianapolis-Carmel-Anderson 395.
## 9 2020 PA Montgomery County-Bucks County-Chester County 392.
## 10 2020 FL Miami-Miami Beach-Kendall 361.
## # ℹ 5,912 more rows