#load the data
#im using the code below to make it easier for R to look for my files to work with
setwd("C:/Users/Roxana/OneDrive - Latino Commission on AIDS/Documents/UTSA Adolph Delgado/Week 3")
getwd()
## [1] "C:/Users/Roxana/OneDrive - Latino Commission on AIDS/Documents/UTSA Adolph Delgado/Week 3"
#install and load the required packages
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("tidyverse") load the package and import the data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Data <- read_csv("Data Manipulation.csv")
## Rows: 6048 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Indicator, Year, State, County, Race/Ethnicity, Sex, Cases
## dbl (1): FIPS
## num (1): Population
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Data)
#cleaning the data
Data_cleaned <- Data %>%
mutate(
Year = gsub("\\(COVID-19 Pandemic\\)", "", Year), # remove the "(COVID-19 Pandemic)" from Year
Year = as.numeric(Year), #Convert Year to numeric
Cases = as.character(Cases), #Convert Cases to character/text for cleaning
Cases = ifelse(Cases == "Data suppressed" | Cases == "Data not available", NA_character_, Cases), #cleaning cells with "Data suppressed" and "Data not available"
Cases = as.numeric(Cases), # converting Cases back from character/text to numeric values
Population = as.numeric(Population) #Ensure population is numeric
) %>%
filter(!is.na(Population)) # Remove rows where Population is NA
View(Data_cleaned)
# Prompt 1
#using Mutate() to create a new variable "Case_Rate" for case rate per 100,000 of population
Data_cleaned <- Data_cleaned %>%
mutate(Case_Rate = (Cases / Population) * 100000)
head(Data_cleaned)
## # A tibble: 6 × 10
## Indicator Year State County FIPS `Race/Ethnicity` Sex Cases Population
## <chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl> <dbl>
## 1 HIV diagnos… 2021 OH Akron 1.04e9 Multiracial Fema… 1 5960
## 2 HIV diagnos… 2021 OH Akron 1.04e9 Multiracial Male 1 5487
## 3 HIV diagnos… 2021 OH Akron 1.04e9 White Fema… 3 246500
## 4 HIV diagnos… 2021 OH Akron 1.04e9 White Male 21 235597
## 5 HIV diagnos… 2021 OH Akron 1.04e9 Native Hawaiian… Fema… NA 87
## 6 HIV diagnos… 2021 OH Akron 1.04e9 Native Hawaiian… Male NA 92
## # ℹ 1 more variable: Case_Rate <dbl>
#Prompt 2
#Selecting specific columns with select()
Select_data <- Data_cleaned %>%
select(Year, State, County, Case_Rate)
head(Select_data)
## # A tibble: 6 × 4
## Year State County Case_Rate
## <dbl> <chr> <chr> <dbl>
## 1 2021 OH Akron 16.8
## 2 2021 OH Akron 18.2
## 3 2021 OH Akron 1.22
## 4 2021 OH Akron 8.91
## 5 2021 OH Akron NA
## 6 2021 OH Akron NA
#Prompt 3: Filtering Rows with filter()
#filtering the dataset for the year 2021
Filtered_data_2021 <- Select_data %>%
filter(Year==2021)
View(Filtered_data_2021)
#Prompt 4: Summarizng data with summarise()
#summarizing to find the total cases by State and County
Total_cases_by_location <- Data_cleaned %>%
group_by(State, County) %>%
summarise(Total_Cases = sum(Cases, na.rm = TRUE))
## `summarise()` has grouped output by 'State'. You can override using the
## `.groups` argument.
head(Total_cases_by_location)
## # A tibble: 6 × 3
## # Groups: State [3]
## State County Total_Cases
## <chr> <chr> <dbl>
## 1 AL Birmingham-Hoover 462
## 2 AL Huntsville 52
## 3 AR Fayetteville-Springdale-Rogers 98
## 4 AR Little Rock-North Little Rock-Conway 345
## 5 AZ Phoenix-Mesa-Scottsdale 1672
## 6 AZ Tucson 315
#Prompt 5: Sorting Data with arrange()
#arranging the data in descending order based on the Case_Rate
Arrange_by_case_rate <- Select_data %>%
arrange(desc(Case_Rate))
head(Arrange_by_case_rate)
## # A tibble: 6 × 4
## Year State County Case_Rate
## <dbl> <chr> <chr> <dbl>
## 1 2020 IA Des Moines-West Des Moines 820.
## 2 2019 NC Winston-Salem 794.
## 3 2021 WI Milwaukee-Waukesha 725.
## 4 2020 IN Indianapolis-Carmel-Anderson 717.
## 5 2021 NC Winston-Salem 606.
## 6 2019 NE-IA Omaha-Council Bluffs 439.
#Print the first few rows of the arranged data to check
print(Arrange_by_case_rate)
## # A tibble: 5,922 × 4
## Year State County Case_Rate
## <dbl> <chr> <chr> <dbl>
## 1 2020 IA Des Moines-West Des Moines 820.
## 2 2019 NC Winston-Salem 794.
## 3 2021 WI Milwaukee-Waukesha 725.
## 4 2020 IN Indianapolis-Carmel-Anderson 717.
## 5 2021 NC Winston-Salem 606.
## 6 2019 NE-IA Omaha-Council Bluffs 439.
## 7 2021 NC Raleigh-Cary 412.
## 8 2019 IN Indianapolis-Carmel-Anderson 395.
## 9 2020 PA Montgomery County-Bucks County-Chester County 392.
## 10 2020 FL Miami-Miami Beach-Kendall 361.
## # ℹ 5,912 more rows