This is Mental Health Care data set from Data.gov. I’m just cleaning it. some times i don’t use dplyr because there are built in functions that are shorter to type.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(writexl)
## Warning: package 'writexl' was built under R version 4.4.1
mental_health_care <- read.csv("Mental_Health_Care_in_the_Last_4_Weeks.csv")
mental_health_care %>%
summary()
## Indicator Group State Subgroup
## Length:10404 Length:10404 Length:10404 Length:10404
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Phase Time.Period Time.Period.Label Time.Period.Start.Date
## Length:10404 Min. : 1.00 Length:10404 Length:10404
## Class :character 1st Qu.:20.00 Class :character Class :character
## Mode :character Median :29.00 Mode :character Mode :character
## Mean :28.13
## 3rd Qu.:37.00
## Max. :45.00
##
## Time.Period.End.Date Value LowCI HighCI
## Length:10404 Min. : 1.40 Min. : 0.80 Min. : 2.00
## Class :character 1st Qu.:10.30 1st Qu.: 8.00 1st Qu.:12.90
## Mode :character Median :16.20 Median :13.90 Median :19.20
## Mean :17.45 Mean :14.77 Mean :20.48
## 3rd Qu.:24.00 3rd Qu.:20.80 3rd Qu.:27.40
## Max. :62.90 Max. :53.20 Max. :71.90
## NA's :490 NA's :490 NA's :490
## Confidence.Interval Quartile.Range Suppression.Flag
## Length:10404 Length:10404 Min. :1
## Class :character Class :character 1st Qu.:1
## Mode :character Mode :character Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
## NA's :10382
mental_health_care %>%
str()
## 'data.frame': 10404 obs. of 15 variables:
## $ Indicator : chr "Took Prescription Medication for Mental Health, Last 4 Weeks" "Took Prescription Medication for Mental Health, Last 4 Weeks" "Took Prescription Medication for Mental Health, Last 4 Weeks" "Took Prescription Medication for Mental Health, Last 4 Weeks" ...
## $ Group : chr "National Estimate" "By Age" "By Age" "By Age" ...
## $ State : chr "United States" "United States" "United States" "United States" ...
## $ Subgroup : chr "United States" "18 - 29 years" "30 - 39 years" "40 - 49 years" ...
## $ Phase : chr "2" "2" "2" "2" ...
## $ Time.Period : int 13 13 13 13 13 13 13 13 13 13 ...
## $ Time.Period.Label : chr "Aug 19 - Aug 31, 2020" "Aug 19 - Aug 31, 2020" "Aug 19 - Aug 31, 2020" "Aug 19 - Aug 31, 2020" ...
## $ Time.Period.Start.Date: chr "08/19/2020" "08/19/2020" "08/19/2020" "08/19/2020" ...
## $ Time.Period.End.Date : chr "08/31/2020" "08/31/2020" "08/31/2020" "08/31/2020" ...
## $ Value : num 19.4 18.7 18.3 20.4 21.2 19.7 18.4 13.8 13.7 24.6 ...
## $ LowCI : num 19 17.2 17.3 19.5 20.2 18.7 17 11 13 24 ...
## $ HighCI : num 19.8 20.3 19.2 21.3 22.2 20.7 19.9 17 14.3 25.2 ...
## $ Confidence.Interval : chr "19.0 - 19.8" "17.2 - 20.3" "17.3 - 19.2" "19.5 - 21.3" ...
## $ Quartile.Range : chr "" "" "" "" ...
## $ Suppression.Flag : num NA NA NA NA NA NA NA NA NA NA ...
missing_values <- mental_health_care %>%
summarise(across(everything()) %>%
is.na() %>%
sum())
missing_values
## across(everything()) %>% is.na() %>% sum()
## 1 11852
colnames(mental_health_care)
## [1] "Indicator" "Group" "State"
## [4] "Subgroup" "Phase" "Time.Period"
## [7] "Time.Period.Label" "Time.Period.Start.Date" "Time.Period.End.Date"
## [10] "Value" "LowCI" "HighCI"
## [13] "Confidence.Interval" "Quartile.Range" "Suppression.Flag"
mental_health_care <- mental_health_care %>%
select(-Quartile.Range, -Suppression.Flag)
colnames(mental_health_care)
## [1] "Indicator" "Group" "State"
## [4] "Subgroup" "Phase" "Time.Period"
## [7] "Time.Period.Label" "Time.Period.Start.Date" "Time.Period.End.Date"
## [10] "Value" "LowCI" "HighCI"
## [13] "Confidence.Interval"
print("No need to change letters because there are no bad inputs ")
## [1] "No need to change letters because there are no bad inputs "
mental_health_care <- mental_health_care %>%
mutate(
Indicator_Split = strsplit(Indicator, ", Last"),
Before_Comma = Indicator_Split[[1]][1],
After_Comma = Indicator_Split[[1]][2],
Recent_Prescription = After_Comma
) %>%
select(-Indicator_Split)
colnames(mental_health_care)
## [1] "Indicator" "Group" "State"
## [4] "Subgroup" "Phase" "Time.Period"
## [7] "Time.Period.Label" "Time.Period.Start.Date" "Time.Period.End.Date"
## [10] "Value" "LowCI" "HighCI"
## [13] "Confidence.Interval" "Before_Comma" "After_Comma"
## [16] "Recent_Prescription"
mental_health_care <- mental_health_care %>%
select(-Indicator, -After_Comma)
colnames(mental_health_care)
## [1] "Group" "State" "Subgroup"
## [4] "Phase" "Time.Period" "Time.Period.Label"
## [7] "Time.Period.Start.Date" "Time.Period.End.Date" "Value"
## [10] "LowCI" "HighCI" "Confidence.Interval"
## [13] "Before_Comma" "Recent_Prescription"
mental_health_care <- mental_health_care %>%
rename(Prescription_Description = Before_Comma) %>%
relocate(Prescription_Description, .before = everything())
colnames(mental_health_care)
## [1] "Prescription_Description" "Group"
## [3] "State" "Subgroup"
## [5] "Phase" "Time.Period"
## [7] "Time.Period.Label" "Time.Period.Start.Date"
## [9] "Time.Period.End.Date" "Value"
## [11] "LowCI" "HighCI"
## [13] "Confidence.Interval" "Recent_Prescription"
print("No need to combine columns because the dates are already combined. Still not going to delete the combined and the separate dates")
## [1] "No need to combine columns because the dates are already combined. Still not going to delete the combined and the separate dates"
mental_health_care <- mental_health_care %>%
mutate(
Time.Period.Start.Date = as.Date(Time.Period.Start.Date, format = "%m/%d/%Y"),
Time.Period.End.Date = as.Date(Time.Period.End.Date, format = "%m/%d/%Y")
) %>%
str()
## 'data.frame': 10404 obs. of 14 variables:
## $ Prescription_Description: chr "Took Prescription Medication for Mental Health" "Took Prescription Medication for Mental Health" "Took Prescription Medication for Mental Health" "Took Prescription Medication for Mental Health" ...
## $ Group : chr "National Estimate" "By Age" "By Age" "By Age" ...
## $ State : chr "United States" "United States" "United States" "United States" ...
## $ Subgroup : chr "United States" "18 - 29 years" "30 - 39 years" "40 - 49 years" ...
## $ Phase : chr "2" "2" "2" "2" ...
## $ Time.Period : int 13 13 13 13 13 13 13 13 13 13 ...
## $ Time.Period.Label : chr "Aug 19 - Aug 31, 2020" "Aug 19 - Aug 31, 2020" "Aug 19 - Aug 31, 2020" "Aug 19 - Aug 31, 2020" ...
## $ Time.Period.Start.Date : Date, format: "2020-08-19" "2020-08-19" ...
## $ Time.Period.End.Date : Date, format: "2020-08-31" "2020-08-31" ...
## $ Value : num 19.4 18.7 18.3 20.4 21.2 19.7 18.4 13.8 13.7 24.6 ...
## $ LowCI : num 19 17.2 17.3 19.5 20.2 18.7 17 11 13 24 ...
## $ HighCI : num 19.8 20.3 19.2 21.3 22.2 20.7 19.9 17 14.3 25.2 ...
## $ Confidence.Interval : chr "19.0 - 19.8" "17.2 - 20.3" "17.3 - 19.2" "19.5 - 21.3" ...
## $ Recent_Prescription : chr " 4 Weeks" " 4 Weeks" " 4 Weeks" " 4 Weeks" ...
#write_xlsx(mental_health_care, "mental_health_care_cleaned.xlsx")
#I might need to fix this. Well it saves to xls when i used the run chunks button but not the run all.
print("Data saved as an excel file")
## [1] "Data saved as an excel file"