# Load Libraries
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse) # This includes both dplyr and readr
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the data into a tibble
data <- read_csv("Data Manipulation with Dplyr.csv")
## Rows: 6048 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Indicator, Year, State, County, Race/Ethnicity, Sex, Cases
## dbl (1): FIPS
## num (1): Population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Cleaning the data
data_cleaned <- data %>%
  mutate(
    Year = gsub(" \\(COVID-19 Pandemic\\)", "", Year),  # Remove "(COVID-19 Pandemic)" from Year
    Year = as.numeric(Year),                            # Convert Year to numeric
    Cases = as.character(Cases),                        # Convert Cases to character for cleaning
    Cases = ifelse(Cases == "Data suppressed" | Cases == "Data not available", NA_character_, Cases),
    Cases = as.numeric(Cases),                          # Convert Cases back to numeric, NA introduced for non-numeric values
    Population = as.numeric(Population)                 # Ensure Population is numeric, convert NA to NA_character_ if necessary
  ) %>%
  filter(!is.na(Population))  # Remove rows where Population is NA


# Continue with the data manipulation
# Adding a new variable for case rate per 100,000 population
data_cleaned <- data_cleaned %>%
  mutate(Case_Rate = Cases / Population * 100000)

# Selecting relevant columns
selected_data <- data_cleaned %>%
  select(Year, State, County, Case_Rate)

# Filtering the dataset for the year 2021
filtered_data_2021 <- selected_data %>%
  filter(Year == 2021)

# Summarizing to find the total cases by State and County
total_cases_by_location <- data_cleaned %>%
  group_by(State, County) %>%
  summarise(Total_Cases = sum(Cases, na.rm = TRUE))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
# Arranging the dataset based on the Case Rate in descending order
arranged_by_case_rate <- selected_data %>%
  arrange(desc(Case_Rate))

# Print the first few rows of the arranged data to check
print(arranged_by_case_rate)
## # A tibble: 5,922 × 4
##     Year State County                                        Case_Rate
##    <dbl> <chr> <chr>                                             <dbl>
##  1  2020 IA    Des Moines-West Des Moines                         820.
##  2  2019 NC    Winston-Salem                                      794.
##  3  2021 WI    Milwaukee-Waukesha                                 725.
##  4  2020 IN    Indianapolis-Carmel-Anderson                       717.
##  5  2021 NC    Winston-Salem                                      606.
##  6  2019 NE-IA Omaha-Council Bluffs                               439.
##  7  2021 NC    Raleigh-Cary                                       412.
##  8  2019 IN    Indianapolis-Carmel-Anderson                       395.
##  9  2020 PA    Montgomery County-Bucks County-Chester County      392.
## 10  2020 FL    Miami-Miami Beach-Kendall                          361.
## # ℹ 5,912 more rows