#load the data
#im using the code below to make it easier for R to look for my files to work with

setwd("C:/Users/Roxana/OneDrive - Latino Commission on AIDS/Documents/UTSA Adolph Delgado/Week 3")
getwd()
## [1] "C:/Users/Roxana/OneDrive - Latino Commission on AIDS/Documents/UTSA Adolph Delgado/Week 3"
#install and load the required packages
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages("tidyverse") load the package and import the data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Data <- read_csv("Data Manipulation.csv")
## Rows: 6048 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Indicator, Year, State, County, Race/Ethnicity, Sex, Cases
## dbl (1): FIPS
## num (1): Population
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Data)

#cleaning the data

Data_cleaned <- Data %>%
  mutate(
    Year = gsub("\\(COVID-19 Pandemic\\)", "", Year), # remove the "(COVID-19 Pandemic)" from Year
    Year = as.numeric(Year), #Convert Year to numeric
    Cases = as.character(Cases), #Convert Cases to character/text for cleaning
    Cases = ifelse(Cases == "Data suppressed" | Cases == "Data not available", NA_character_, Cases), #cleaning cells with "Data suppressed" and "Data not available"
    Cases = as.numeric(Cases), # converting Cases back from character/text to numeric values 
    Population = as.numeric(Population) #Ensure population is numeric
  ) %>%
  filter(!is.na(Population)) # Remove rows where Population is NA

View(Data_cleaned)

# Prompt 1 
#using Mutate() to create a new variable "Case_Rate" for case rate per 100,000 of population

Data_cleaned <- Data_cleaned %>%
  mutate(Case_Rate = (Cases / Population) * 100000)
head(Data_cleaned)
## # A tibble: 6 × 10
##   Indicator     Year State County   FIPS `Race/Ethnicity` Sex   Cases Population
##   <chr>        <dbl> <chr> <chr>   <dbl> <chr>            <chr> <dbl>      <dbl>
## 1 HIV diagnos…  2021 OH    Akron  1.04e9 Multiracial      Fema…     1       5960
## 2 HIV diagnos…  2021 OH    Akron  1.04e9 Multiracial      Male      1       5487
## 3 HIV diagnos…  2021 OH    Akron  1.04e9 White            Fema…     3     246500
## 4 HIV diagnos…  2021 OH    Akron  1.04e9 White            Male     21     235597
## 5 HIV diagnos…  2021 OH    Akron  1.04e9 Native Hawaiian… Fema…    NA         87
## 6 HIV diagnos…  2021 OH    Akron  1.04e9 Native Hawaiian… Male     NA         92
## # ℹ 1 more variable: Case_Rate <dbl>
#Prompt 2
#Selecting specific columns with select()

Select_data <- Data_cleaned %>%
  select(Year, State, County, Case_Rate)

head(Select_data)
## # A tibble: 6 × 4
##    Year State County Case_Rate
##   <dbl> <chr> <chr>      <dbl>
## 1  2021 OH    Akron      16.8 
## 2  2021 OH    Akron      18.2 
## 3  2021 OH    Akron       1.22
## 4  2021 OH    Akron       8.91
## 5  2021 OH    Akron      NA   
## 6  2021 OH    Akron      NA
#Prompt 3: Filtering Rows with filter()
#filtering the dataset for the year 2021

Filtered_data_2021 <- Select_data %>%
  filter(Year==2021)
View(Filtered_data_2021)

#Prompt 4: Summarizng data with summarise()
#summarizing to find the total cases by State and County

Total_cases_by_location <- Data_cleaned %>%
  group_by(State, County) %>%
  summarise(Total_Cases = sum(Cases, na.rm = TRUE))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
head(Total_cases_by_location)
## # A tibble: 6 × 3
## # Groups:   State [3]
##   State County                               Total_Cases
##   <chr> <chr>                                      <dbl>
## 1 AL    Birmingham-Hoover                            462
## 2 AL    Huntsville                                    52
## 3 AR    Fayetteville-Springdale-Rogers                98
## 4 AR    Little Rock-North Little Rock-Conway         345
## 5 AZ    Phoenix-Mesa-Scottsdale                     1672
## 6 AZ    Tucson                                       315
#Prompt 5: Sorting Data with arrange()
#arranging the data in descending order based on the Case_Rate

Arrange_by_case_rate <- Select_data %>%
  arrange(desc(Case_Rate))
head(Arrange_by_case_rate)
## # A tibble: 6 × 4
##    Year State County                       Case_Rate
##   <dbl> <chr> <chr>                            <dbl>
## 1  2020 IA    Des Moines-West Des Moines        820.
## 2  2019 NC    Winston-Salem                     794.
## 3  2021 WI    Milwaukee-Waukesha                725.
## 4  2020 IN    Indianapolis-Carmel-Anderson      717.
## 5  2021 NC    Winston-Salem                     606.
## 6  2019 NE-IA Omaha-Council Bluffs              439.
#Print the first few rows of the arranged data to check 

print(Arrange_by_case_rate)
## # A tibble: 5,922 × 4
##     Year State County                                        Case_Rate
##    <dbl> <chr> <chr>                                             <dbl>
##  1  2020 IA    Des Moines-West Des Moines                         820.
##  2  2019 NC    Winston-Salem                                      794.
##  3  2021 WI    Milwaukee-Waukesha                                 725.
##  4  2020 IN    Indianapolis-Carmel-Anderson                       717.
##  5  2021 NC    Winston-Salem                                      606.
##  6  2019 NE-IA Omaha-Council Bluffs                               439.
##  7  2021 NC    Raleigh-Cary                                       412.
##  8  2019 IN    Indianapolis-Carmel-Anderson                       395.
##  9  2020 PA    Montgomery County-Bucks County-Chester County      392.
## 10  2020 FL    Miami-Miami Beach-Kendall                          361.
## # ℹ 5,912 more rows