Loading in the data
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ dplyr   1.1.1
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Load the HIV/AIDS CSV from the working directory into `hiv_data`.
hiv_data <- read_csv(file = "HIV_AIDS_NY.csv")
## Rows: 6005 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Borough, UHF, Gender, Age, Race
## dbl (13): Year, HIV diagnoses, HIV diagnosis rate, Concurrent diagnoses, % l...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Cleaning the data
# Keep only rows whose rate/count columns do not hold the placeholder codes
# and whose percentage-style columns are at most 100. Rows where any of these
# comparisons is NA are dropped by subset().
# NOTE(review): the first condition tests 999999 (six 9s) while every other
# placeholder test uses 99999 (five 9s) -- confirm which code the data uses.
hiv_data <- subset(
  hiv_data,
  `HIV diagnosis rate` != 999999.0 &
    `% linked to care within 3 months` <= 100 &
    `AIDS diagnosis rate` != 99999.0 &
    `PLWDHI prevalence` != 99999.0 &
    `% viral suppression` != 99999 &
    Deaths != 99999 &
    `Death rate` <= 100 &
    `HIV-related death rate` != 99999 &
    `Non-HIV-related death rate` != 99999
)

# Remove the aggregate rows so each remaining row refers to a specific race.
is_specific_race <- hiv_data$Race != "All"
hiv_data <- hiv_data[is_specific_race, ]

1. Metadata

The dataset I used is the HIV_AIDS_NY dataset compiled by the HIV Epidemiology Program of the NYC Department of Health and Mental Hygiene, which provides data on HIV and AIDS cases in New York City for the years 2011-2015. The categorical variables of this dataset are the year, the borough and the UHF (a code for a smaller neighborhood within a borough of New York City), gender, age, and race. The quantitative variables include the number of HIV diagnoses, AIDS diagnoses, and concurrent diagnoses (those diagnosed with both diseases), the diagnosis rates for HIV and AIDS (the number of diagnoses per 100,000 people), the percentage of people diagnosed with HIV who were linked to medical care within 3 months, the PLWDHI prevalence (estimated prevalence of people living with diagnosed or undiagnosed HIV), the percent viral suppression of people diagnosed with HIV within one year of diagnosis, the number of deaths, the death rate, the HIV-related death rate, and the non-HIV-related death rate (all of which are also per 100,000 people).

2. Summary Statistics

Mode function
# Return the most frequent value(s) of a vector.
#
# x: an atomic vector (numeric, character, logical, ...). Missing values are
#    ignored, matching the default behaviour of table().
#
# Returns a vector of the same type as `x` holding every value that attains
# the highest count, in ascending order (so ties yield several values). An
# input with no non-missing values yields a zero-length vector.
calculate_mode <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) {
    return(x)
  }
  # Count occurrences against the sorted distinct values directly instead of
  # going through table() names: converting numbers to strings and back loses
  # precision and turns non-numeric values into NA.
  values <- sort(unique(x))
  counts <- tabulate(match(x, values), nbins = length(values))
  values[counts == max(counts)]
}
HIV diagnosis variable
# Sample size, five-number summary with mean, mode and standard deviation.
length(hiv_data[["HIV diagnoses"]])
## [1] 1252
summary(hiv_data[["HIV diagnoses"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00    5.00   26.23   17.00 1042.00
calculate_mode(hiv_data[["HIV diagnoses"]])
## [1] 1
sd(hiv_data[["HIV diagnoses"]])
## [1] 85.19704
HIV Diagnosis Rate Variable
# Sample size, five-number summary with mean, mode and standard deviation.
length(hiv_data[["HIV diagnosis rate"]])
## [1] 1252
summary(hiv_data[["HIV diagnosis rate"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.80   16.00   39.05   61.66   80.85  635.30
calculate_mode(hiv_data[["HIV diagnosis rate"]])
## [1]  6.1 18.9
sd(hiv_data[["HIV diagnosis rate"]])
## [1] 69.82095
Percent linked to care within 3 months variable
# Sample size, five-number summary with mean, mode and standard deviation.
length(hiv_data[["% linked to care within 3 months"]])
## [1] 1252
summary(hiv_data[["% linked to care within 3 months"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   63.00   75.00   73.05  100.00  100.00
calculate_mode(hiv_data[["% linked to care within 3 months"]])
## [1] 100
sd(hiv_data[["% linked to care within 3 months"]])
## [1] 26.06184

3. Frequency distribution and Relative Frequency Distribution

# Frequency distribution of records by race.
table(hiv_data$Race)
## 
## Asian/Pacific Islander                  Black        Latino/Hispanic 
##                    184                    343                    339 
##          Other/Unknown                  White 
##                    100                    286
# Relative frequencies. prop.table() divides by the total of the counted
# values, so the proportions sum to 1 even if Race contains missing values
# (table() omits NAs, whereas length() would still count them).
prop.table(table(hiv_data$Race))
## 
## Asian/Pacific Islander                  Black        Latino/Hispanic 
##              0.1469649              0.2739617              0.2707668 
##          Other/Unknown                  White 
##              0.0798722              0.2284345

4. Contingency Tables

# Cross-tabulate gender (rows) against age group (columns).
# NOTE(review): in the output shown below Age has the single level "All", so
# this table only separates the records by gender.
table(hiv_data[["Gender"]], hiv_data[["Age"]])
##         
##          All
##   Female 499
##   Male   753

5. Bar Graph and Pie Chart

# Bar chart of the number of records in each race category.
barplot(
  height = table(hiv_data[["Race"]]),
  main = "Distribution of Race",
  xlab = "Race",
  ylab = "Count",
  col = "steelblue"
)

# Pie chart of the number of records per gender.
# Slice labels are left to pie()'s default (the names of the table) so each
# label always matches the category it is drawn for; a hard-coded label vector
# is matched by position only and listed an "Unknown" level that the data does
# not contain. pie() recycles or truncates `col` to the number of slices.
pie(table(hiv_data$Gender),
    main = "Distribution of Gender",
    col = c("steelblue", "orange", "green"),
    cex = 0.8)

6. Histograms and Boxplots

HIV Diagnosis Rate
# Histogram of the HIV diagnosis rate across all remaining records.
hist(
  x = hiv_data[["HIV diagnosis rate"]],
  main = "HIV Diagnosis Rate",
  xlab = "HIV Diagnosis Rate",
  ylab = "Count",
  col = "steelblue",
  border = "black"
)

# One fill colour per race category, reused by the boxplots in this section.
race_colors <- c("red", "blue", "green", "yellow", "purple")

# Boxplot of the HIV diagnosis rate for each race category.
# Group labels come from the Race values themselves (formula interface), so
# they cannot drift out of step with the data and stay consistent with the
# frequency tables and bar chart (e.g. "Other/Unknown" rather than "Unknown").
boxplot(`HIV diagnosis rate` ~ Race, data = hiv_data, col = race_colors,
        main = "Boxplot of HIV Diagnosis Rate by Race",
        xlab = "Race", ylab = "HIV Diagnosis Rate")

AIDS diagnosis Rate
# Histogram of the AIDS diagnosis rate across all remaining records.
hist(
  x = hiv_data[["AIDS diagnosis rate"]],
  main = "AIDS Diagnosis Rate",
  xlab = "AIDS Diagnosis Rate",
  ylab = "Count",
  col = "steelblue",
  border = "black"
)

# Boxplot of the AIDS diagnosis rate for each race category.
# Group labels come from the Race values themselves (formula interface), so
# they cannot drift out of step with the data and stay consistent with the
# frequency tables and bar chart (e.g. "Other/Unknown" rather than "Unknown").
boxplot(`AIDS diagnosis rate` ~ Race, data = hiv_data, col = race_colors,
        main = "Boxplot of AIDS Diagnosis Rate by Race",
        xlab = "Race", ylab = "AIDS Diagnosis Rate")

7. Summary

The summary statistics show that while most of the records collected for areas of New York report very few diagnoses, the diagnosis rates for HIV are on average around 50 per 100,000 people. The frequency distributions and the bar graph reveal that, compared to the other races, Black and Hispanic/Latino people appear to account for more diagnoses of HIV. The contingency table and the pie chart show that most cases of HIV occur between ages 20-50 and that they occur in men more often than in women.

The histograms and box plots reveal that HIV is diagnosed slightly more often than AIDS; however, the two rates are still distributed in a very similar manner. They also show that the rates at which Black people are diagnosed with HIV and AIDS are higher than those of any other race, while the Asian/Pacific Islander and White diagnosis rates are among the lowest.