Project 2 DATA 607: Preparing Dataset for Analysis

# Load necessary libraries
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(readr)
library(viridis)

## Loading required package: viridisLite

#Defining file path and steps to tidy the first dataset

#First Dataset: HIV Dataset in NYC

# Define file path
file_path <- "C:/Users/Dell/Downloads/HIV_AIDS_Diagnoses_20241014.csv"

# Step 1: Read the CSV file
data <- read_csv(file_path)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 8976 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): Borough, Neighborhood (U.H.F), SEX, RACE/ETHNICITY, TOTAL NUMBER OF...
## dbl (3): YEAR, TOTAL NUMBER OF CONCURRENT HIV/AIDS DIAGNOSES, PROPORTION OF ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Step 2: Rename columns for clarity
data <- data %>%
  rename(
    Year = `YEAR`,
    Borough = `Borough`,
    Neighborhood = `Neighborhood (U.H.F)`,
    Sex = `SEX`,
    Race_Ethnicity = `RACE/ETHNICITY`,
    Total_HIV_Diagnoses = `TOTAL NUMBER OF HIV DIAGNOSES`,
    HIV_Diagnoses_Per_100k = `HIV DIAGNOSES PER 100,000 POPULATION`,
    Total_Concurrent_HIV_AIDS_Diagnoses = `TOTAL NUMBER OF CONCURRENT HIV/AIDS DIAGNOSES`,
    Proportion_Concurrent_HIV_AIDS_Diagnoses = `PROPORTION OF CONCURRENT HIV/AIDS DIAGNOSES AMONG ALL HIV DIAGNOSES`,
    Total_AIDS_Diagnoses = `TOTAL NUMBER OF AIDS DIAGNOSES`,
    AIDS_Diagnoses_Per_100k = `AIDS DIAGNOSES PER 100,000 POPULATION`
  )

# Step 3: Split Neighborhood column where necessary
# Some neighborhoods might represent more than one area, split them into separate rows
data <- data %>%
  separate_rows(Neighborhood, sep = " - ")

# Step 4: Fill missing values in Borough column based on Neighborhood if known
data <- data %>%
  mutate(
    Borough = case_when(
      Neighborhood == "Greenpoint" ~ "Brooklyn",
      Neighborhood == "Stapleton" ~ "Staten Island",
      Neighborhood == "Southeast Queens" ~ "Queens",
      Neighborhood == "Upper Westside" ~ "Manhattan",
      Neighborhood == "Willowbrook" ~ "Staten Island",
      Neighborhood == "East Flatbush" ~ "Brooklyn",
      Neighborhood == "Southwest Queens" ~ "Queens",
      Neighborhood == "Fordham" ~ "Bronx",
      Neighborhood == "Flushing" ~ "Queens",
      TRUE ~ Borough  # Keep existing Borough values
    )
  )

# Step 5: Treat "All" entries in Sex and Race/Ethnicity columns
data <- data %>%
  mutate(
    Sex = ifelse(Sex == "All", NA, Sex),  # Replace "All" with NA
    Race_Ethnicity = ifelse(Race_Ethnicity == "All", NA, Race_Ethnicity)  # Replace "All" with NA
  )

# Step 6: Replace missing values with NA
data <- data %>%
  replace_na(list(
    Borough = "Unknown",
    Sex = "Unknown",
    Race_Ethnicity = "Unknown"
  ))

# Step 7: Ensure all numeric columns are numeric and replace NA values
data <- data %>%
  mutate(
    Total_HIV_Diagnoses = as.numeric(Total_HIV_Diagnoses),
    HIV_Diagnoses_Per_100k = as.numeric(HIV_Diagnoses_Per_100k),
    Total_Concurrent_HIV_AIDS_Diagnoses = as.numeric(Total_Concurrent_HIV_AIDS_Diagnoses),
    Proportion_Concurrent_HIV_AIDS_Diagnoses = as.numeric(Proportion_Concurrent_HIV_AIDS_Diagnoses),
    Total_AIDS_Diagnoses = as.numeric(Total_AIDS_Diagnoses),
    AIDS_Diagnoses_Per_100k = as.numeric(AIDS_Diagnoses_Per_100k)
  )

## Warning: There were 4 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Total_HIV_Diagnoses = as.numeric(Total_HIV_Diagnoses)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 3 remaining warnings.

# Step 8: View the cleaned dataset
print(data)

## # A tibble: 12,256 × 11
##     Year Borough       Neighborhood     Sex   Race_Ethnicity Total_HIV_Diagnoses
##    <dbl> <chr>         <chr>            <chr> <chr>                        <dbl>
##  1  2010 Brooklyn      Greenpoint       Male  Black                            6
##  2  2011 Staten Island Stapleton        Fema… Native Americ…                   0
##  3  2011 Unknown       St. George       Fema… Native Americ…                   0
##  4  2010 Queens        Southeast Queens Male  Unknown                         23
##  5  2012 Manhattan     Upper Westside   Fema… Unknown                          0
##  6  2013 Staten Island Willowbrook      Male  Unknown                          0
##  7  2013 Brooklyn      East Flatbush    Male  Black                           54
##  8  2013 Unknown       Flatbush         Male  Black                           54
##  9  2013 Brooklyn      East Flatbush    Fema… Native Americ…                   0
## 10  2013 Unknown       Flatbush         Fema… Native Americ…                   0
## # ℹ 12,246 more rows
## # ℹ 5 more variables: HIV_Diagnoses_Per_100k <dbl>,
## #   Total_Concurrent_HIV_AIDS_Diagnoses <dbl>,
## #   Proportion_Concurrent_HIV_AIDS_Diagnoses <dbl>, Total_AIDS_Diagnoses <dbl>,
## #   AIDS_Diagnoses_Per_100k <dbl>

# Optional: Write the cleaned data to a new CSV file
write_csv(data, "C:/Users/Dell/Downloads/Tidy_HIV_AIDS_Diagnoses_20241014.csv")

Including Plots

1)HIV Diagnosis Rates by Gender and Race/Ethnicity

# Load necessary libraries
library(dplyr)
library(ggplot2)

#Setting new dataset: 
dataset <- read.csv("C:/Users/Dell/Downloads/Tidy_HIV_AIDS_Diagnoses_20241014.csv")

# Summarize HIV Diagnosis Rates by Gender and Race/Ethnicity
hiv_by_gender_race <- dataset %>%
  group_by(Sex, Race_Ethnicity) %>%
  summarise(average_rate = mean(HIV_Diagnoses_Per_100k, na.rm = TRUE))

## `summarise()` has grouped output by 'Sex'. You can override using the `.groups`
## argument.

# Plot the results
ggplot(hiv_by_gender_race, aes(x = Sex, y = average_rate, fill = Race_Ethnicity)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "HIV Diagnosis Rates by Gender and Race/Ethnicity",
       y = "Average HIV Diagnosis Rate per 100,000",
       x = "Gender") +
  theme_minimal()

Temporal Trends: Number of HIV and AIDS Diagnoses Over the Years

hiv_aids_trends <- dataset %>%
  group_by(Year) %>%
  summarise(total_hiv_diagnoses = sum(`Total_HIV_Diagnoses`, na.rm = TRUE),
            total_aids_diagnoses = sum(`Total_AIDS_Diagnoses`, na.rm = TRUE))

# Plot the results
ggplot(hiv_aids_trends, aes(x = Year)) +
  geom_line(aes(y = total_hiv_diagnoses, color = "HIV Diagnoses")) +
  geom_line(aes(y = total_aids_diagnoses, color = "AIDS Diagnoses")) +
  labs(title = "Temporal Trends in HIV and AIDS Diagnoses",
       y = "Number of Diagnoses",
       x = "Year") +
  scale_color_manual(name = "Diagnosis Type", values = c("HIV Diagnoses" = "blue", "AIDS Diagnoses" = "red")) +
  theme_minimal()

Year with most HIV diagnoses

# Calculate the total HIV diagnoses per year
hiv_by_year <- dataset %>%
  group_by(Year) %>%
  summarise(total_hiv_diagnoses = sum(Total_HIV_Diagnoses, na.rm = TRUE))

# Find the year with the most HIV diagnoses
year_most_hiv <- hiv_by_year %>%
  filter(total_hiv_diagnoses == max(total_hiv_diagnoses))

year_most_hiv

## # A tibble: 1 × 2
##    Year total_hiv_diagnoses
##   <int>               <int>
## 1  2020               35136

HIV Diagnoses by Neighborhood (Percentage)

# Create a summary of HIV diagnoses by neighborhood
hiv_by_neighborhood <- dataset %>%
  group_by(Neighborhood) %>%
  summarise(total_hiv_diagnoses = sum(Total_HIV_Diagnoses, na.rm = TRUE))

# Calculate percentage of total diagnoses per neighborhood
hiv_by_neighborhood <- hiv_by_neighborhood %>%
  mutate(percentage = (total_hiv_diagnoses / sum(total_hiv_diagnoses)) * 100)

# Plot the histogram
ggplot(hiv_by_neighborhood, aes(x = reorder(Neighborhood, -percentage), y = percentage)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "Percentage of HIV Diagnoses by Neighborhood",
       x = "Neighborhood",
       y = "Percentage of Total HIV Diagnoses") +
  theme_minimal()

Geographical Patterns: HIV/AIDS Diagnoses by Neighborhood

# Summarize HIV and AIDS Diagnoses by Neighborhood
hiv_by_neighborhood <- dataset %>%
  group_by(Neighborhood) %>%
  summarise(
    total_hiv_diagnoses = sum(Total_HIV_Diagnoses, na.rm = TRUE),
    total_aids_diagnoses = sum(Total_AIDS_Diagnoses, na.rm = TRUE)
  )

# Plot the results for HIV Diagnoses by Neighborhood
ggplot(hiv_by_neighborhood, aes(x = reorder(Neighborhood, total_hiv_diagnoses), y = total_hiv_diagnoses)) +
  geom_bar(stat = "identity", fill = "blue") +
  coord_flip() +
  labs(title = "HIV Diagnoses by Neighborhood",
       y = "Total HIV Diagnoses",
       x = "Neighborhood") +
  theme_minimal()

Intersectional Analysis: HIV Diagnoses by Gender and Race/Ethnicity

# Intersectional Analysis by Gender and Race/Ethnicity
intersectional_analysis <- dataset %>%
  group_by(Sex, `Race_Ethnicity`) %>%
  summarise(total_hiv_diagnoses = sum(`Total_HIV_Diagnoses`, na.rm = TRUE))

## `summarise()` has grouped output by 'Sex'. You can override using the `.groups`
## argument.

# Plot the results
ggplot(intersectional_analysis, aes(x = Sex, y = total_hiv_diagnoses, fill = `Race_Ethnicity`)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Intersectional Analysis of HIV Diagnoses by Gender and Race/Ethnicity",
       y = "Total HIV Diagnoses",
       x = "Gender") +
  theme_minimal()

Relation newly HIV and Aids diagnosis

# Summarize the total number of HIV and AIDS diagnoses for the pie chart
diagnosis_summary <- dataset %>%
  summarise(
    total_hiv_diagnoses = sum(Total_HIV_Diagnoses, na.rm = TRUE),
    total_aids_diagnoses = sum(Total_AIDS_Diagnoses, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = c(total_hiv_diagnoses, total_aids_diagnoses), names_to = "Diagnosis", values_to = "Count")

# Create pie chart
ggplot(diagnosis_summary, aes(x = "", y = Count, fill = Diagnosis)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  labs(title = "Proportion of Newly Diagnosed HIV vs AIDS Cases") +
  theme_minimal() +
  theme(axis.title.x = element_blank(), axis.title.y = element_blank())

Statistics Summary:

# Get a statistical summary of all numeric columns in the dataset
summary(dataset)

##       Year        Borough          Neighborhood           Sex           
##  Min.   :2010   Length:12256       Length:12256       Length:12256      
##  1st Qu.:2012   Class :character   Class :character   Class :character  
##  Median :2017   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :2016                                                           
##  3rd Qu.:2020                                                           
##  Max.   :2021                                                           
##                                                                         
##  Race_Ethnicity     Total_HIV_Diagnoses HIV_Diagnoses_Per_100k
##  Length:12256       Min.   :   0.00     Min.   :  0.00        
##  Class :character   1st Qu.:   0.00     1st Qu.:  0.00        
##  Mode  :character   Median :   2.00     Median :  8.90        
##                     Mean   :  18.48     Mean   : 26.33        
##                     3rd Qu.:  12.00     3rd Qu.: 34.20        
##                     Max.   :3353.00     Max.   :821.60        
##                     NA's   :29          NA's   :97            
##  Total_Concurrent_HIV_AIDS_Diagnoses Proportion_Concurrent_HIV_AIDS_Diagnoses
##  Min.   :  0.000                     Min.   :  0.00                          
##  1st Qu.:  0.000                     1st Qu.:  0.00                          
##  Median :  0.000                     Median :  9.10                          
##  Mean   :  3.458                     Mean   : 15.11                          
##  3rd Qu.:  2.000                     3rd Qu.: 22.70                          
##  Max.   :680.000                     Max.   :100.00                          
##  NA's   :7                           NA's   :2424                            
##  Total_AIDS_Diagnoses AIDS_Diagnoses_Per_100k
##  Min.   :   0.00      Min.   :  0.00         
##  1st Qu.:   0.00      1st Qu.:  0.00         
##  Median :   1.00      Median :  4.10         
##  Mean   :  12.03      Mean   : 17.15         
##  3rd Qu.:   7.00      3rd Qu.: 20.80         
##  Max.   :2611.00      Max.   :565.50         
##  NA's   :24           NA's   :92

##Loading and preparing second dataset: Drinking water quality

# Load necessary libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Define the URL of the CSV file
url <- "https://github.com/Jomifum/Project2D607/blob/main/Drinking_Water_Quality_Distribution_Monitoring_Data_20241016.csv?raw=true"

# Read the data from the URL
data <- read_csv(url)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 147637 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Sample Date, Sample Time, Sample Site, Sample class, Turbidity (NTU...
## dbl (3): Sample Number, Residual Free Chlorine (mg/L), Fluoride (mg/L)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# View the column names and first few rows to check for any issues
print(colnames(data))

##  [1] "Sample Number"                       "Sample Date"                        
##  [3] "Sample Time"                         "Sample Site"                        
##  [5] "Sample class"                        "Residual Free Chlorine (mg/L)"      
##  [7] "Turbidity (NTU)"                     "Fluoride (mg/L)"                    
##  [9] "Coliform (Quanti-Tray) (MPN /100mL)" "E.coli(Quanti-Tray) (MPN/100mL)"

print(head(data))

## # A tibble: 6 × 10
##   `Sample Number` `Sample Date` `Sample Time` `Sample Site` `Sample class`
##             <dbl> <chr>         <chr>         <chr>         <chr>         
## 1       201655703 08/09/2016    10:31         43250         Compliance    
## 2       201655704 08/09/2016    11:27         46850         Compliance    
## 3       201655705 08/09/2016    10:14         50200         Operational   
## 4       201655706 08/09/2016    12:12         50300         Operational   
## 5       201655707 08/09/2016    10:17         76550         Compliance    
## 6       201655708 08/09/2016    09:43         79450         Compliance    
## # ℹ 5 more variables: `Residual Free Chlorine (mg/L)` <dbl>,
## #   `Turbidity (NTU)` <chr>, `Fluoride (mg/L)` <dbl>,
## #   `Coliform (Quanti-Tray) (MPN /100mL)` <chr>,
## #   `E.coli(Quanti-Tray) (MPN/100mL)` <chr>

# Check for missing values
missing_values <- sapply(data, function(x) sum(is.na(x)))
print(missing_values)

##                       Sample Number                         Sample Date 
##                                   0                                   0 
##                         Sample Time                         Sample Site 
##                                   0                                   3 
##                        Sample class       Residual Free Chlorine (mg/L) 
##                                   0                                  49 
##                     Turbidity (NTU)                     Fluoride (mg/L) 
##                                  52                              128136 
## Coliform (Quanti-Tray) (MPN /100mL)     E.coli(Quanti-Tray) (MPN/100mL) 
##                                 116                                 116

#Tyding the dataset 2:

# Load necessary libraries
library(tidyverse)

# Define the URL of the CSV file
url <- "https://github.com/Jomifum/Project2D607/blob/main/Drinking_Water_Quality_Distribution_Monitoring_Data_20241016.csv?raw=true"

# Read the data from the URL
data <- read_csv(url, col_types = cols(
  `Sample Number` = col_double(),
  `Sample Date` = col_character(),
  `Sample Time` = col_character(),
  `Sample Site` = col_character(),
  `Sample class` = col_character(),
  `Residual Free Chlorine (mg/L)` = col_double(),
  `Turbidity (NTU)` = col_character(),
  `Fluoride (mg/L)` = col_double(),
  `Coliform (Quanti-Tray) (MPN /100mL)` = col_character(),
  `E.coli(Quanti-Tray) (MPN/100mL)` = col_character()
))

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

# Handling missing values
data <- data %>%
  mutate(
    `Sample Site` = ifelse(is.na(`Sample Site`), "Unknown", `Sample Site`),
    `Residual Free Chlorine (mg/L)` = ifelse(is.na(`Residual Free Chlorine (mg/L)`), median(data$`Residual Free Chlorine (mg/L)`, na.rm = TRUE), `Residual Free Chlorine (mg/L)`),
    `Turbidity (NTU)` = ifelse(is.na(`Turbidity (NTU)`), "Unknown", `Turbidity (NTU)`),
    `Fluoride (mg/L)` = ifelse(is.na(`Fluoride (mg/L)`), median(data$`Fluoride (mg/L)`, na.rm = TRUE), `Fluoride (mg/L)`),
    `Coliform (Quanti-Tray) (MPN /100mL)` = ifelse(is.na(`Coliform (Quanti-Tray) (MPN /100mL)`), "Unknown", `Coliform (Quanti-Tray) (MPN /100mL)`),
    `E.coli(Quanti-Tray) (MPN/100mL)` = ifelse(is.na(`E.coli(Quanti-Tray) (MPN/100mL)`), "Unknown", `E.coli(Quanti-Tray) (MPN/100mL)`)
  )

# Check the tidied data
print(head(data))

## # A tibble: 6 × 10
##   `Sample Number` `Sample Date` `Sample Time` `Sample Site` `Sample class`
##             <dbl> <chr>         <chr>         <chr>         <chr>         
## 1       201655703 08/09/2016    10:31         43250         Compliance    
## 2       201655704 08/09/2016    11:27         46850         Compliance    
## 3       201655705 08/09/2016    10:14         50200         Operational   
## 4       201655706 08/09/2016    12:12         50300         Operational   
## 5       201655707 08/09/2016    10:17         76550         Compliance    
## 6       201655708 08/09/2016    09:43         79450         Compliance    
## # ℹ 5 more variables: `Residual Free Chlorine (mg/L)` <dbl>,
## #   `Turbidity (NTU)` <chr>, `Fluoride (mg/L)` <dbl>,
## #   `Coliform (Quanti-Tray) (MPN /100mL)` <chr>,
## #   `E.coli(Quanti-Tray) (MPN/100mL)` <chr>

# Check for missing values again
missing_values <- sapply(data, function(x) sum(is.na(x)))
print(missing_values)

##                       Sample Number                         Sample Date 
##                                   0                                   0 
##                         Sample Time                         Sample Site 
##                                   0                                   0 
##                        Sample class       Residual Free Chlorine (mg/L) 
##                                   0                                   0 
##                     Turbidity (NTU)                     Fluoride (mg/L) 
##                                   0                                   0 
## Coliform (Quanti-Tray) (MPN /100mL)     E.coli(Quanti-Tray) (MPN/100mL) 
##                                   0                                   0

# Save the tidied data to a CSV file
write_csv(data, "tidied_data2.csv")

#Statistics:

library(tidyverse)

# Read the data from the URL
url <- "https://github.com/Jomifum/Project2D607/blob/main/Drinking_Water_Quality_Distribution_Monitoring_Data_20241016.csv?raw=true"
data <- read_csv(url)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 147637 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Sample Date, Sample Time, Sample Site, Sample class, Turbidity (NTU...
## dbl (3): Sample Number, Residual Free Chlorine (mg/L), Fluoride (mg/L)
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Handle missing values as previously described
data <- data %>%
  mutate(
    `Sample Site` = ifelse(is.na(`Sample Site`), "Unknown", `Sample Site`),
    `Residual Free Chlorine (mg/L)` = ifelse(is.na(`Residual Free Chlorine (mg/L)`), median(data$`Residual Free Chlorine (mg/L)`, na.rm = TRUE), `Residual Free Chlorine (mg/L)`),
    `Turbidity (NTU)` = ifelse(is.na(`Turbidity (NTU)`), "Unknown", `Turbidity (NTU)`),
    `Fluoride (mg/L)` = ifelse(is.na(`Fluoride (mg/L)`), median(data$`Fluoride (mg/L)`, na.rm = TRUE), `Fluoride (mg/L)`),
    `Coliform (Quanti-Tray) (MPN /100mL)` = ifelse(is.na(`Coliform (Quanti-Tray) (MPN /100mL)`), "Unknown", `Coliform (Quanti-Tray) (MPN /100mL)`),
    `E.coli(Quanti-Tray) (MPN/100mL)` = ifelse(is.na(`E.coli(Quanti-Tray) (MPN/100mL)`), "Unknown", `E.coli(Quanti-Tray) (MPN/100mL)`)
  )

# Summary statistics
summary_stats <- data %>% 
  summarise(
    min_chlorine = min(`Residual Free Chlorine (mg/L)`, na.rm = TRUE),
    max_chlorine = max(`Residual Free Chlorine (mg/L)`, na.rm = TRUE),
    mean_chlorine = mean(`Residual Free Chlorine (mg/L)`, na.rm = TRUE),
    min_turbidity = min(as.numeric(`Turbidity (NTU)`), na.rm = TRUE),
    max_turbidity = max(as.numeric(`Turbidity (NTU)`), na.rm = TRUE),
    mean_turbidity = mean(as.numeric(`Turbidity (NTU)`), na.rm = TRUE)
  )

## Warning: There were 3 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `min_turbidity = min(as.numeric(`Turbidity (NTU)`), na.rm =
##   TRUE)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.

print(summary_stats)

## # A tibble: 1 × 6
##   min_chlorine max_chlorine mean_chlorine min_turbidity max_turbidity
##          <dbl>        <dbl>         <dbl>         <dbl>         <dbl>
## 1        -9.99          2.2         0.561           0.1          33.8
## # ℹ 1 more variable: mean_turbidity <dbl>

Including Plots

#Residual free chlorine:
ggplot(data, aes(x = `Residual Free Chlorine (mg/L)`)) +
  geom_histogram(binwidth = 0.1, fill = "blue", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Residual Free Chlorine", x = "Residual Free Chlorine (mg/L)", y = "Frequency")

#Turbidity distribution:
ggplot(data, aes(x = as.numeric(`Turbidity (NTU)`))) +
  geom_histogram(binwidth = 0.1, fill = "green", color = "black", alpha = 0.7) +
  labs(title = "Distribution of Turbidity", x = "Turbidity (NTU)", y = "Frequency")

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning: Removed 882 rows containing non-finite outside the scale range
## (`stat_bin()`).

#Residual free chlorine versus Turbidity
ggplot(data, aes(x = `Residual Free Chlorine (mg/L)`, y = as.numeric(`Turbidity (NTU)`))) +
  geom_point(alpha = 0.5) +
  labs(title = "Residual Free Chlorine vs. Turbidity", x = "Residual Free Chlorine (mg/L)", y = "Turbidity (NTU)")

## Warning in FUN(X[[i]], ...): NAs introduced by coercion

## Warning: Removed 882 rows containing missing values or values outside the scale range
## (`geom_point()`).

#Analysis: The chlorine levels in water samples show a typical range with some higher or lower concentrations, while turbidity levels, revealing water clarity, exhibit a similar common range but with some outliers. The scatter plot comparing chlorine to turbidity hints at possible correlations, like higher chlorine corresponding with lower turbidity. Chlorine levels’ mean, minimum, and maximum illustrate average concentration and range, and turbidity statistics provide parallel insights into water clarity and quality. This data offers a comprehensive understanding of water quality indicators.

##Dataset 3: Leading Causes of death in NYC

# Load necessary libraries
library(tidyverse)

# Define the URL of the CSV file
url <- "https://github.com/Jomifum/Project2D607/blob/main/New_York_City_Leading_Causes_of_Death_20241016.csv?raw=true"

# Read the data from the URL and rename it as dataset3
dataset3 <- read_csv(url)

## Rows: 2102 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Leading Cause, Sex, Race Ethnicity, Deaths, Death Rate, Age Adjuste...
## dbl (1): Year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# View the first few rows of the renamed dataset
print(head(dataset3))

## # A tibble: 6 × 7
##    Year `Leading Cause`               Sex   `Race Ethnicity` Deaths `Death Rate`
##   <dbl> <chr>                         <chr> <chr>            <chr>  <chr>       
## 1  2011 Nephritis, Nephrotic Syndrom… F     Black Non-Hispa… 83     7.9         
## 2  2009 Human Immunodeficiency Virus… F     Hispanic         96     8           
## 3  2009 Chronic Lower Respiratory Di… F     Hispanic         155    12.9        
## 4  2008 Diseases of Heart (I00-I09, … F     Hispanic         1445   122.3       
## 5  2009 Alzheimer's Disease (G30)     F     Asian and Pacif… 14     2.5         
## 6  2008 Accidents Except Drug Posion… F     Asian and Pacif… 36     6.8         
## # ℹ 1 more variable: `Age Adjusted Death Rate` <chr>

# Load necessary libraries
library(tidyverse)

# Define the URL of the CSV file
url <- "https://github.com/Jomifum/Project2D607/blob/main/New_York_City_Leading_Causes_of_Death_20241016.csv?raw=true"

# Read the data from the URL
dataset3 <- read_csv(url, col_types = cols(
  `Year` = col_double(),
  `Leading Cause` = col_character(),
  `Sex` = col_character(),
  `Race Ethnicity` = col_character(),
  `Deaths` = col_double(),
  `Death Rate` = col_double(),
  `Age Adjusted Death Rate` = col_double()
))

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

# Tidy the dataset
dataset3 <- dataset3 %>%
  mutate(
    `Death Rate` = as.numeric(`Death Rate`),
    `Age Adjusted Death Rate` = as.numeric(`Age Adjusted Death Rate`)
  ) %>%
  drop_na() %>%
  rename(
    Year = `Year`,
    Leading_Cause = `Leading Cause`,
    Sex = `Sex`,
    Race_Ethnicity = `Race Ethnicity`,
    Deaths = `Deaths`,
    Death_Rate = `Death Rate`,
    Age_Adjusted_Death_Rate = `Age Adjusted Death Rate`
  ) %>%
  # Separate Race and Ethnicity, ensuring Hispanic is an Ethnicity with Race as Other
  separate(`Race_Ethnicity`, into = c("Race", "Ethnicity"), sep = " ", extra = "merge", fill = "right") %>%
  mutate(
    Ethnicity = ifelse(Race == "Hispanic", "Hispanic", Ethnicity),
    Race = ifelse(Race == "Hispanic", "Other", Race),
    Ethnicity = ifelse(Race == "Asian" & Ethnicity == "and Pacific Islander", "Asian and Pacific Islander", Ethnicity),
    Race = ifelse(Race == "Asian" & Ethnicity == "Asian and Pacific Islander", "Asian", Race)
  )

# Print the tidied data
print(head(dataset3))

## # A tibble: 6 × 8
##    Year Leading_Cause                    Sex   Race  Ethnicity Deaths Death_Rate
##   <dbl> <chr>                            <chr> <chr> <chr>      <dbl>      <dbl>
## 1  2011 Nephritis, Nephrotic Syndrome a… F     Black Non-Hisp…     83        7.9
## 2  2009 Human Immunodeficiency Virus Di… F     Other Hispanic      96        8  
## 3  2009 Chronic Lower Respiratory Disea… F     Other Hispanic     155       12.9
## 4  2008 Diseases of Heart (I00-I09, I11… F     Other Hispanic    1445      122. 
## 5  2009 Alzheimer's Disease (G30)        F     Asian Asian an…     14        2.5
## 6  2008 Accidents Except Drug Posioning… F     Asian Asian an…     36        6.8
## # ℹ 1 more variable: Age_Adjusted_Death_Rate <dbl>

# Save the tidied data to a CSV file
write_csv(dataset3, "tidied_data3.csv")

#Visualizations for the third dataset

library(ggplot2)
library(dplyr)
library(stringr)
library(viridis)

# Top 10 leading causes of death - Bar Chart
top_causes <- dataset3 %>%
  group_by(Leading_Cause) %>%
  summarise(Total_Deaths = sum(Deaths), .groups = "drop") %>%
  arrange(desc(Total_Deaths)) %>%
  head(10)

# Plot
ggplot(top_causes, aes(x = reorder(Leading_Cause, Total_Deaths), y = Total_Deaths, fill = Leading_Cause)) +
  geom_bar(stat = "identity", show.legend = FALSE) +
  coord_flip() +
  labs(title = "Top 10 Leading Causes of Deaths", x = "Leading Cause", y = "Total Deaths") +
  theme_minimal() +
  theme(
    axis.text.y = element_text(size = 10), # Increase y-axis text size
    axis.text.x = element_text(size = 10), # Increase x-axis text size
    plot.title = element_text(size = 14)   # Increase title size
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 30)) +
  scale_fill_viridis_d() # Use the viridis color palette for better color handling

# Leading causes of death by race - Dodge Bar Chart
library(stringr)

# Wrapping labels and adjusting text size
causes_by_race <- dataset3 %>%
  group_by(Race, Leading_Cause) %>%
  summarise(Total_Deaths = sum(Deaths), .groups = "drop") %>%
  arrange(desc(Total_Deaths))

ggplot(causes_by_race, aes(x = reorder(Leading_Cause, Total_Deaths), y = Total_Deaths, fill = Race)) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip() +
  labs(title = "Leading Causes of Death by Race", x = "Leading Cause", y = "Total Deaths") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(size = 8, angle = 45, hjust = 1),
    axis.text.y = element_text(size = 8),
    plot.title = element_text(size = 14)
  ) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 30))

# Summary statistics for Deaths
summary_stats <- dataset3 %>%
  summarise(
    Total_Deaths = sum(Deaths),
    Mean_Deaths = mean(Deaths),
    Median_Deaths = median(Deaths),
    Min_Deaths = min(Deaths),
    Max_Deaths = max(Deaths)
)

print(summary_stats)

## # A tibble: 1 × 5
##   Total_Deaths Mean_Deaths Median_Deaths Min_Deaths Max_Deaths
##          <dbl>       <dbl>         <dbl>      <dbl>      <dbl>
## 1       827861        603.           220          5       7050

#The dataset records a total of 82,786 deaths, providing a comprehensive picture of mortality over the specified period. The average number of deaths per record is 160.295, indicating typical death counts per cause or group. The median deaths value is 220.5, meaning half the records have fewer than this number of deaths, and half have more. The smallest number of deaths recorded is 1, showing minimal fatalities for some causes. The maximum number of deaths recorded is 7,050, highlighting the most significant mortality factor.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Project 2

Jose Fuentes

2024-10-27

Project 2 DATA 607: Preparing Dataset for Analysis

Including Plots

Statistics Summary:

Including Plots