library(ggplot2)
library(readr)
library(knitr)

# Load the suicide death-rate table into a tibble. The path is relative, so the
# CSV must be in the current working directory.
suicide_data <- read_csv(
  file = "Death_rates_for_suicide__by_sex__race__Hispanic_origin__and_age__United_States.csv"
)
## Rows: 6390 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): INDICATOR, UNIT, STUB_NAME, STUB_LABEL, AGE, FLAG
## dbl (7): UNIT_NUM, STUB_NAME_NUM, STUB_LABEL_NUM, YEAR, YEAR_NUM, AGE_NUM, E...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to check that the columns parsed as expected.
head(suicide_data, n = 6L)
## # A tibble: 6 × 13
##   INDICATOR     UNIT  UNIT_NUM STUB_NAME STUB_NAME_NUM STUB_LABEL STUB_LABEL_NUM
##   <chr>         <chr>    <dbl> <chr>             <dbl> <chr>               <dbl>
## 1 Death rates … Deat…        1 Total                 0 All perso…              0
## 2 Death rates … Deat…        1 Total                 0 All perso…              0
## 3 Death rates … Deat…        1 Total                 0 All perso…              0
## 4 Death rates … Deat…        1 Total                 0 All perso…              0
## 5 Death rates … Deat…        1 Total                 0 All perso…              0
## 6 Death rates … Deat…        1 Total                 0 All perso…              0
## # ℹ 6 more variables: YEAR <dbl>, YEAR_NUM <dbl>, AGE <chr>, AGE_NUM <dbl>,
## #   ESTIMATE <dbl>, FLAG <chr>
# Per-column summary: five-number summary and mean for numeric columns,
# length/class/mode for character columns.
summary(object = suicide_data)
##   INDICATOR             UNIT              UNIT_NUM      STUB_NAME        
##  Length:6390        Length:6390        Min.   :1.000   Length:6390       
##  Class :character   Class :character   1st Qu.:2.000   Class :character  
##  Mode  :character   Mode  :character   Median :2.000   Mode  :character  
##                                        Mean   :1.873                     
##                                        3rd Qu.:2.000                     
##                                        Max.   :2.000                     
##                                                                          
##  STUB_NAME_NUM     STUB_LABEL        STUB_LABEL_NUM       YEAR     
##  Min.   : 0.000   Length:6390        Min.   :0.000   Min.   :1950  
##  1st Qu.: 3.000   Class :character   1st Qu.:3.230   1st Qu.:1988  
##  Median : 5.000   Mode  :character   Median :5.125   Median :1999  
##  Mean   : 4.622                      Mean   :4.687   Mean   :1998  
##  3rd Qu.: 6.000                      3rd Qu.:6.153   3rd Qu.:2009  
##  Max.   :11.000                      Max.   :7.235   Max.   :2018  
##                                                                    
##     YEAR_NUM         AGE               AGE_NUM         ESTIMATE    
##  Min.   : 1.00   Length:6390        Min.   :0.000   Min.   : 0.30  
##  1st Qu.:12.00   Class :character   1st Qu.:0.000   1st Qu.: 5.00  
##  Median :23.00   Mode  :character   Median :3.000   Median :10.50  
##  Mean   :22.69                      Mean   :2.712   Mean   :13.71  
##  3rd Qu.:33.00                      3rd Qu.:4.100   3rd Qu.:19.50  
##  Max.   :42.00                      Max.   :6.000   Max.   :74.80  
##                                                     NA's   :906    
##      FLAG          
##  Length:6390       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 
# Count missing values in each column; returns a named numeric vector with one
# entry per column of suicide_data.
vapply(
  suicide_data,
  function(column) sum(is.na(column)),
  numeric(1)
)
##      INDICATOR           UNIT       UNIT_NUM      STUB_NAME  STUB_NAME_NUM 
##              0              0              0              0              0 
##     STUB_LABEL STUB_LABEL_NUM           YEAR       YEAR_NUM            AGE 
##              0              0              0              0              0 
##        AGE_NUM       ESTIMATE           FLAG 
##              0            906           5484
# Histogram of the ESTIMATE column across all rows of the table.
hist(
  x = suicide_data[["ESTIMATE"]],
  col = "skyblue",
  xlab = "Suicide Death Rate (per 100,000)",
  main = "Distribution of Suicide Death Rates"
)

# Base-graphics scatterplot of ESTIMATE against YEAR for every row.
with(
  suicide_data,
  plot(
    YEAR, ESTIMATE,
    pch = 19,
    col = "blue",
    xlab = "Year",
    ylab = "Suicide Death Rate",
    main = "Scatterplot of Year vs Suicide Death Rate"
  )
)

# Correlation between YEAR and ESTIMATE, computed only on rows where both
# values are present (ESTIMATE contains missing values).
# NOTE(review): this pools every row regardless of UNIT or STUB_NAME, so the
# coefficient mixes different population groupings - confirm that is intended.
with(suicide_data, cor(x = YEAR, y = ESTIMATE, use = "complete.obs"))
## [1] -0.06733087
# ggplot2 scatterplot of ESTIMATE against YEAR for every row.
# ESTIMATE has 906 missing values; `na.rm = TRUE` drops those rows explicitly
# and silently, instead of relying on ggplot2's default of removing them with
# a warning on every render.
ggplot(suicide_data, aes(x = YEAR, y = ESTIMATE)) +
  geom_point(color = "blue", na.rm = TRUE) +
  labs(
    title = "Year vs Suicide Death Rate",
    x = "Year",
    y = "Suicide Death Rate"
  ) +
  theme_minimal()

# Correlation between AGE_NUM and ESTIMATE, computed only on rows where both
# values are present. read_csv already parses AGE_NUM as a double, so the
# column is used as-is rather than being re-coerced and written back.
# NOTE(review): AGE_NUM appears to be a numeric code for the AGE label (it
# ranges from 0 to 6) rather than an age in years - confirm that a linear
# correlation on this code is meaningful before interpreting the result.
correlation_age_estimate <- cor(
  suicide_data$AGE_NUM,
  suicide_data$ESTIMATE,
  use = "complete.obs"
)

correlation_age_estimate
## [1] 0.3256933