library(ggplot2)
library(readr)
library(knitr)
# Read the suicide death-rate table (by sex, race, Hispanic origin, and age)
# from a CSV path given relative to the working directory.
suicide_data <- read_csv(
  file = "Death_rates_for_suicide__by_sex__race__Hispanic_origin__and_age__United_States.csv"
)
## Rows: 6390 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): INDICATOR, UNIT, STUB_NAME, STUB_LABEL, AGE, FLAG
## dbl (7): UNIT_NUM, STUB_NAME_NUM, STUB_LABEL_NUM, YEAR, YEAR_NUM, AGE_NUM, E...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to check that the file parsed as expected.
head(x = suicide_data, n = 6L)
## # A tibble: 6 × 13
## INDICATOR UNIT UNIT_NUM STUB_NAME STUB_NAME_NUM STUB_LABEL STUB_LABEL_NUM
## <chr> <chr> <dbl> <chr> <dbl> <chr> <dbl>
## 1 Death rates … Deat… 1 Total 0 All perso… 0
## 2 Death rates … Deat… 1 Total 0 All perso… 0
## 3 Death rates … Deat… 1 Total 0 All perso… 0
## 4 Death rates … Deat… 1 Total 0 All perso… 0
## 5 Death rates … Deat… 1 Total 0 All perso… 0
## 6 Death rates … Deat… 1 Total 0 All perso… 0
## # ℹ 6 more variables: YEAR <dbl>, YEAR_NUM <dbl>, AGE <chr>, AGE_NUM <dbl>,
## # ESTIMATE <dbl>, FLAG <chr>
# Per-column overview: quantiles and mean for numeric columns, length and
# class for character columns, and NA counts where a column has any.
summary(object = suicide_data)
## INDICATOR UNIT UNIT_NUM STUB_NAME
## Length:6390 Length:6390 Min. :1.000 Length:6390
## Class :character Class :character 1st Qu.:2.000 Class :character
## Mode :character Mode :character Median :2.000 Mode :character
## Mean :1.873
## 3rd Qu.:2.000
## Max. :2.000
##
## STUB_NAME_NUM STUB_LABEL STUB_LABEL_NUM YEAR
## Min. : 0.000 Length:6390 Min. :0.000 Min. :1950
## 1st Qu.: 3.000 Class :character 1st Qu.:3.230 1st Qu.:1988
## Median : 5.000 Mode :character Median :5.125 Median :1999
## Mean : 4.622 Mean :4.687 Mean :1998
## 3rd Qu.: 6.000 3rd Qu.:6.153 3rd Qu.:2009
## Max. :11.000 Max. :7.235 Max. :2018
##
## YEAR_NUM AGE AGE_NUM ESTIMATE
## Min. : 1.00 Length:6390 Min. :0.000 Min. : 0.30
## 1st Qu.:12.00 Class :character 1st Qu.:0.000 1st Qu.: 5.00
## Median :23.00 Mode :character Median :3.000 Median :10.50
## Mean :22.69 Mean :2.712 Mean :13.71
## 3rd Qu.:33.00 3rd Qu.:4.100 3rd Qu.:19.50
## Max. :42.00 Max. :6.000 Max. :74.80
## NA's :906
## FLAG
## Length:6390
## Class :character
## Mode :character
##
##
##
##
# Count the missing values in each column (named vector, one entry per column).
vapply(
  suicide_data,
  function(column) sum(is.na(column)),
  FUN.VALUE = numeric(1)
)
## INDICATOR UNIT UNIT_NUM STUB_NAME STUB_NAME_NUM
## 0 0 0 0 0
## STUB_LABEL STUB_LABEL_NUM YEAR YEAR_NUM AGE
## 0 0 0 0 0
## AGE_NUM ESTIMATE FLAG
## 0 906 5484
# Histogram of the death-rate estimates across every row of the table.
hist(
  x = suicide_data[["ESTIMATE"]],
  col = "skyblue",
  main = "Distribution of Suicide Death Rates",
  xlab = "Suicide Death Rate (per 100,000)"
)

# Base-graphics scatterplot of the death-rate estimate against year.
plot(
  x = suicide_data[["YEAR"]],
  y = suicide_data[["ESTIMATE"]],
  pch = 19,
  col = "blue",
  main = "Scatterplot of Year vs Suicide Death Rate",
  xlab = "Year",
  ylab = "Suicide Death Rate"
)

# Pearson correlation between year and the death-rate estimate, computed on
# the rows where both values are present.
cor(
  x = suicide_data[["YEAR"]],
  y = suicide_data[["ESTIMATE"]],
  use = "complete.obs"
)
## [1] -0.06733087
# ggplot2 version of the year-vs-rate scatterplot. Rows with a missing
# ESTIMATE are dropped by geom_point() with a warning.
ggplot(data = suicide_data, mapping = aes(x = YEAR, y = ESTIMATE)) +
  geom_point(colour = "blue") +
  labs(
    title = "Year vs Suicide Death Rate",
    x = "Year",
    y = "Suicide Death Rate"
  ) +
  theme_minimal()
## Warning: Removed 906 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Correlation between the AGE_NUM column and the death-rate estimate, using
# only rows where both values are present.
#
# read_csv() already parses AGE_NUM as a double, so re-coercing it with
# as.numeric() and writing it back into the data frame was a no-op. Assert the
# type instead, so a differently-typed column fails loudly rather than being
# silently coerced.
#
# NOTE(review): AGE_NUM spans 0-6 with non-integer values (e.g. 4.1), so it
# looks like a coded age group rather than age in years -- confirm that a
# Pearson correlation on these codes is the intended measure.
stopifnot(is.numeric(suicide_data$AGE_NUM))
correlation_age_estimate <- cor(
  suicide_data$AGE_NUM,
  suicide_data$ESTIMATE,
  use = "complete.obs"
)
correlation_age_estimate
## [1] 0.3256933