This is the Anxiety or Depression dataset, which I downloaded from the data.gov website. In this script I am only going to clean it.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(writexl)
## Warning: package 'writexl' was built under R version 4.4.1
# Load the raw CSV (relative path) into a base data.frame.
anx_or_dep <- read.csv(file = "anxiety_or_depression.csv")
# Print per-column summary statistics for a first look at the data.
summary(anx_or_dep)
## Indicator Group State Subgroup
## Length:16794 Length:16794 Length:16794 Length:16794
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Phase Time.Period Time.Period.Label Time.Period.Start.Date
## Length:16794 Min. : 1.00 Length:16794 Length:16794
## Class :character 1st Qu.:17.00 Class :character Class :character
## Mode :character Median :37.00 Mode :character Mode :character
## Mean :35.92
## 3rd Qu.:55.00
## Max. :72.00
##
## Time.Period.End.Date Value Low.CI High.CI
## Length:16794 Min. : 4.60 Min. : 3.30 Min. : 6.00
## Class :character 1st Qu.:22.10 1st Qu.:18.70 1st Qu.:25.50
## Mode :character Median :27.70 Median :24.10 Median :31.60
## Mean :28.14 Mean :24.64 Mean :31.89
## 3rd Qu.:33.40 3rd Qu.:29.70 3rd Qu.:37.40
## Max. :85.20 Max. :79.90 Max. :89.50
## NA's :707 NA's :707 NA's :707
## Confidence.Interval Quartile.Range
## Length:16794 Length:16794
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
# Show the column types and the first few values of every column.
str(anx_or_dep)
## 'data.frame': 16794 obs. of 14 variables:
## $ Indicator : chr "Symptoms of Depressive Disorder" "Symptoms of Depressive Disorder" "Symptoms of Depressive Disorder" "Symptoms of Depressive Disorder" ...
## $ Group : chr "National Estimate" "By Age" "By Age" "By Age" ...
## $ State : chr "United States" "United States" "United States" "United States" ...
## $ Subgroup : chr "United States" "18 - 29 years" "30 - 39 years" "40 - 49 years" ...
## $ Phase : chr "1" "1" "1" "1" ...
## $ Time.Period : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Time.Period.Label : chr "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" ...
## $ Time.Period.Start.Date: chr "04/23/2020" "04/23/2020" "04/23/2020" "04/23/2020" ...
## $ Time.Period.End.Date : chr "05/05/2020" "05/05/2020" "05/05/2020" "05/05/2020" ...
## $ Value : num 23.5 32.7 25.7 24.8 23.2 18.4 13.6 14.4 20.8 26.1 ...
## $ Low.CI : num 22.7 30.2 24.1 23.3 21.5 17 11.8 9 19.6 25.2 ...
## $ High.CI : num 24.3 35.2 27.3 26.2 25 19.7 15.5 21.4 22 27.1 ...
## $ Confidence.Interval : chr "22.7 - 24.3" "30.2 - 35.2" "24.1 - 27.3" "23.3 - 26.2" ...
## $ Quartile.Range : chr "" "" "" "" ...
# Preview the first seven rows of the raw data.
head(anx_or_dep, n = 7)
## Indicator Group State Subgroup
## 1 Symptoms of Depressive Disorder National Estimate United States United States
## 2 Symptoms of Depressive Disorder By Age United States 18 - 29 years
## 3 Symptoms of Depressive Disorder By Age United States 30 - 39 years
## 4 Symptoms of Depressive Disorder By Age United States 40 - 49 years
## 5 Symptoms of Depressive Disorder By Age United States 50 - 59 years
## 6 Symptoms of Depressive Disorder By Age United States 60 - 69 years
## 7 Symptoms of Depressive Disorder By Age United States 70 - 79 years
## Phase Time.Period Time.Period.Label Time.Period.Start.Date
## 1 1 1 Apr 23 - May 5, 2020 04/23/2020
## 2 1 1 Apr 23 - May 5, 2020 04/23/2020
## 3 1 1 Apr 23 - May 5, 2020 04/23/2020
## 4 1 1 Apr 23 - May 5, 2020 04/23/2020
## 5 1 1 Apr 23 - May 5, 2020 04/23/2020
## 6 1 1 Apr 23 - May 5, 2020 04/23/2020
## 7 1 1 Apr 23 - May 5, 2020 04/23/2020
## Time.Period.End.Date Value Low.CI High.CI Confidence.Interval Quartile.Range
## 1 05/05/2020 23.5 22.7 24.3 22.7 - 24.3
## 2 05/05/2020 32.7 30.2 35.2 30.2 - 35.2
## 3 05/05/2020 25.7 24.1 27.3 24.1 - 27.3
## 4 05/05/2020 24.8 23.3 26.2 23.3 - 26.2
## 5 05/05/2020 23.2 21.5 25.0 21.5 - 25.0
## 6 05/05/2020 18.4 17.0 19.7 17.0 - 19.7
## 7 05/05/2020 13.6 11.8 15.5 11.8 - 15.5
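# Please do not run the chained version below: it does not behave the way you
# might expect, because each call receives the previous call's result rather
# than the original data frame.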
#anx_or_dep %>%
#summary() %>%
#str() %>%
#head(7)
# Count the NA cells across the whole data frame and keep the total in a
# one-row, one-column data frame with a readable column name.
# pick(everything()) hands all columns to is.na() as a data frame; it replaces
# the function-less across() call, which left the result column named after
# the whole piped expression.
# Only NA is counted: empty strings (such as those in Quartile.Range) are not
# missing values as far as is.na() is concerned.
missing_values <- anx_or_dep %>%
  summarise(total_missing = sum(is.na(pick(everything()))))
missing_values
## total_missing
## 1 2121
# List the column names before any columns are removed.
anx_or_dep %>%
  colnames()
## [1] "Indicator" "Group" "State"
## [4] "Subgroup" "Phase" "Time.Period"
## [7] "Time.Period.Label" "Time.Period.Start.Date" "Time.Period.End.Date"
## [10] "Value" "Low.CI" "High.CI"
## [13] "Confidence.Interval" "Quartile.Range"
# Remove the Quartile.Range column, then list the remaining column names to
# confirm it is gone.
anx_or_dep <- select(anx_or_dep, -Quartile.Range)
anx_or_dep %>%
  colnames()
## [1] "Indicator" "Group" "State"
## [4] "Subgroup" "Phase" "Time.Period"
## [7] "Time.Period.Label" "Time.Period.Start.Date" "Time.Period.End.Date"
## [10] "Value" "Low.CI" "High.CI"
## [13] "Confidence.Interval"
print("No need to change letters because there are no bad inputs ")
## [1] "No need to change letters because there are no bad inputs "
# Shorten the Indicator labels. The combined "Depressive ... or Anxiety"
# pattern is tested first because the plain "Depressive Disorder" pattern
# would also match it. Any value that matches none of the three patterns is
# kept, and then has "Symptoms of " and " Disorder" stripped from it.
anx_or_dep <- mutate(
  anx_or_dep,
  Indicator = str_replace_all(
    case_when(
      str_detect(Indicator, "Symptoms of Depressive Disorder or Anxiety Disorder") ~
        "Depression or Anxiety",
      str_detect(Indicator, "Symptoms of Depressive Disorder") ~
        "Depression",
      str_detect(Indicator, "Symptoms of Anxiety Disorder") ~
        "Anxiety",
      TRUE ~ Indicator
    ),
    "Symptoms of | Disorder",
    ""
  )
)
# Preview the first seven rows to check the new labels.
head(anx_or_dep, n = 7)
## Indicator Group State Subgroup Phase Time.Period
## 1 Depression National Estimate United States United States 1 1
## 2 Depression By Age United States 18 - 29 years 1 1
## 3 Depression By Age United States 30 - 39 years 1 1
## 4 Depression By Age United States 40 - 49 years 1 1
## 5 Depression By Age United States 50 - 59 years 1 1
## 6 Depression By Age United States 60 - 69 years 1 1
## 7 Depression By Age United States 70 - 79 years 1 1
## Time.Period.Label Time.Period.Start.Date Time.Period.End.Date Value Low.CI
## 1 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 23.5 22.7
## 2 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 32.7 30.2
## 3 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 25.7 24.1
## 4 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 24.8 23.3
## 5 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 23.2 21.5
## 6 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 18.4 17.0
## 7 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 13.6 11.8
## High.CI Confidence.Interval
## 1 24.3 22.7 - 24.3
## 2 35.2 30.2 - 35.2
## 3 27.3 24.1 - 27.3
## 4 26.2 23.3 - 26.2
## 5 25.0 21.5 - 25.0
## 6 19.7 17.0 - 19.7
## 7 15.5 11.8 - 15.5
# List the column names after the Indicator relabelling.
anx_or_dep %>%
  colnames()
## [1] "Indicator" "Group" "State"
## [4] "Subgroup" "Phase" "Time.Period"
## [7] "Time.Period.Label" "Time.Period.Start.Date" "Time.Period.End.Date"
## [10] "Value" "Low.CI" "High.CI"
## [13] "Confidence.Interval"
print("No need to combine columns because the dates are already combined. Still not going to delete the combined and the separate dates")
## [1] "No need to combine columns because the dates are already combined. Still not going to delete the combined and the separate dates"
# Parse the start and end date strings (month/day/year) into Date columns.
# The assignment must end at mutate(): str() only prints and returns NULL, so
# piping into str() inside the assignment would overwrite anx_or_dep with NULL
# and discard the cleaned data.
anx_or_dep <- anx_or_dep %>%
  mutate(
    Time.Period.Start.Date = as.Date(Time.Period.Start.Date, format = "%m/%d/%Y"),
    Time.Period.End.Date = as.Date(Time.Period.End.Date, format = "%m/%d/%Y")
  )
# Inspect the structure separately to confirm both columns are now Date.
str(anx_or_dep)
## 'data.frame': 16794 obs. of 13 variables:
## $ Indicator : chr "Depression" "Depression" "Depression" "Depression" ...
## $ Group : chr "National Estimate" "By Age" "By Age" "By Age" ...
## $ State : chr "United States" "United States" "United States" "United States" ...
## $ Subgroup : chr "United States" "18 - 29 years" "30 - 39 years" "40 - 49 years" ...
## $ Phase : chr "1" "1" "1" "1" ...
## $ Time.Period : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Time.Period.Label : chr "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" ...
## $ Time.Period.Start.Date: Date, format: "2020-04-23" "2020-04-23" ...
## $ Time.Period.End.Date : Date, format: "2020-05-05" "2020-05-05" ...
## $ Value : num 23.5 32.7 25.7 24.8 23.2 18.4 13.6 14.4 20.8 26.1 ...
## $ Low.CI : num 22.7 30.2 24.1 23.3 21.5 17 11.8 9 19.6 25.2 ...
## $ High.CI : num 24.3 35.2 27.3 26.2 25 19.7 15.5 21.4 22 27.1 ...
## $ Confidence.Interval : chr "22.7 - 24.3" "30.2 - 35.2" "24.1 - 27.3" "23.3 - 26.2" ...
# NOTE(review): the export below is commented out, so no Excel file is written
# even though the message that follows reports a save. Re-enable write_xlsx()
# or adjust the message once the intended behaviour is confirmed.
#write_xlsx(anx_or_dep, "anxiety_or_depression.xlsx")
print("Data saved as an excel file")
## [1] "Data saved as an excel file"