Introduction

This is the Anxiety or Depression dataset, which I downloaded from the data.gov website. In this document I only clean the data.

  1. Load the data, summarize it, and check for missing values.
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(writexl)
## Warning: package 'writexl' was built under R version 4.4.1
# Read the raw CSV export into a data frame.
anx_or_dep <- read.csv(file = "anxiety_or_depression.csv")

# Print a per-column summary of the freshly loaded data.
summary(anx_or_dep)
##   Indicator            Group              State             Subgroup        
##  Length:16794       Length:16794       Length:16794       Length:16794      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Phase            Time.Period    Time.Period.Label  Time.Period.Start.Date
##  Length:16794       Min.   : 1.00   Length:16794       Length:16794          
##  Class :character   1st Qu.:17.00   Class :character   Class :character      
##  Mode  :character   Median :37.00   Mode  :character   Mode  :character      
##                     Mean   :35.92                                            
##                     3rd Qu.:55.00                                            
##                     Max.   :72.00                                            
##                                                                              
##  Time.Period.End.Date     Value           Low.CI         High.CI     
##  Length:16794         Min.   : 4.60   Min.   : 3.30   Min.   : 6.00  
##  Class :character     1st Qu.:22.10   1st Qu.:18.70   1st Qu.:25.50  
##  Mode  :character     Median :27.70   Median :24.10   Median :31.60  
##                       Mean   :28.14   Mean   :24.64   Mean   :31.89  
##                       3rd Qu.:33.40   3rd Qu.:29.70   3rd Qu.:37.40  
##                       Max.   :85.20   Max.   :79.90   Max.   :89.50  
##                       NA's   :707     NA's   :707     NA's   :707    
##  Confidence.Interval Quartile.Range    
##  Length:16794        Length:16794      
##  Class :character    Class :character  
##  Mode  :character    Mode  :character  
##                                        
##                                        
##                                        
## 
# Show each column's type together with its first few values.
str(anx_or_dep)
## 'data.frame':    16794 obs. of  14 variables:
##  $ Indicator             : chr  "Symptoms of Depressive Disorder" "Symptoms of Depressive Disorder" "Symptoms of Depressive Disorder" "Symptoms of Depressive Disorder" ...
##  $ Group                 : chr  "National Estimate" "By Age" "By Age" "By Age" ...
##  $ State                 : chr  "United States" "United States" "United States" "United States" ...
##  $ Subgroup              : chr  "United States" "18 - 29 years" "30 - 39 years" "40 - 49 years" ...
##  $ Phase                 : chr  "1" "1" "1" "1" ...
##  $ Time.Period           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Time.Period.Label     : chr  "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" ...
##  $ Time.Period.Start.Date: chr  "04/23/2020" "04/23/2020" "04/23/2020" "04/23/2020" ...
##  $ Time.Period.End.Date  : chr  "05/05/2020" "05/05/2020" "05/05/2020" "05/05/2020" ...
##  $ Value                 : num  23.5 32.7 25.7 24.8 23.2 18.4 13.6 14.4 20.8 26.1 ...
##  $ Low.CI                : num  22.7 30.2 24.1 23.3 21.5 17 11.8 9 19.6 25.2 ...
##  $ High.CI               : num  24.3 35.2 27.3 26.2 25 19.7 15.5 21.4 22 27.1 ...
##  $ Confidence.Interval   : chr  "22.7 - 24.3" "30.2 - 35.2" "24.1 - 27.3" "23.3 - 26.2" ...
##  $ Quartile.Range        : chr  "" "" "" "" ...
# Preview the first seven rows of the data frame.
head(anx_or_dep, n = 7)
##                         Indicator             Group         State      Subgroup
## 1 Symptoms of Depressive Disorder National Estimate United States United States
## 2 Symptoms of Depressive Disorder            By Age United States 18 - 29 years
## 3 Symptoms of Depressive Disorder            By Age United States 30 - 39 years
## 4 Symptoms of Depressive Disorder            By Age United States 40 - 49 years
## 5 Symptoms of Depressive Disorder            By Age United States 50 - 59 years
## 6 Symptoms of Depressive Disorder            By Age United States 60 - 69 years
## 7 Symptoms of Depressive Disorder            By Age United States 70 - 79 years
##   Phase Time.Period    Time.Period.Label Time.Period.Start.Date
## 1     1           1 Apr 23 - May 5, 2020             04/23/2020
## 2     1           1 Apr 23 - May 5, 2020             04/23/2020
## 3     1           1 Apr 23 - May 5, 2020             04/23/2020
## 4     1           1 Apr 23 - May 5, 2020             04/23/2020
## 5     1           1 Apr 23 - May 5, 2020             04/23/2020
## 6     1           1 Apr 23 - May 5, 2020             04/23/2020
## 7     1           1 Apr 23 - May 5, 2020             04/23/2020
##   Time.Period.End.Date Value Low.CI High.CI Confidence.Interval Quartile.Range
## 1           05/05/2020  23.5   22.7    24.3         22.7 - 24.3               
## 2           05/05/2020  32.7   30.2    35.2         30.2 - 35.2               
## 3           05/05/2020  25.7   24.1    27.3         24.1 - 27.3               
## 4           05/05/2020  24.8   23.3    26.2         23.3 - 26.2               
## 5           05/05/2020  23.2   21.5    25.0         21.5 - 25.0               
## 6           05/05/2020  18.4   17.0    19.7         17.0 - 19.7               
## 7           05/05/2020  13.6   11.8    15.5         11.8 - 15.5
# Do not chain summary(), str(), and head() in one pipeline as shown below: each call would receive the previous call's result instead of the data frame.
#anx_or_dep %>%
  #summary() %>%
  #str() %>%
  #head(7)
  

# Count the missing (NA) cells across the whole data frame and keep the count
# in a one-row data frame under a readable column name.
# Only true NA values are counted; empty strings ("") are not treated as
# missing here.
missing_values <- anx_or_dep %>%
  summarise(total_missing = sum(is.na(pick(everything()))))

missing_values
##   total_missing
## 1          2121
# List the column names of the data frame.
names(anx_or_dep)
##  [1] "Indicator"              "Group"                  "State"                 
##  [4] "Subgroup"               "Phase"                  "Time.Period"           
##  [7] "Time.Period.Label"      "Time.Period.Start.Date" "Time.Period.End.Date"  
## [10] "Value"                  "Low.CI"                 "High.CI"               
## [13] "Confidence.Interval"    "Quartile.Range"
  1. Remove all unimportant columns and/or impute the missing values using the most frequent value within the column.
# Keep every column except Quartile.Range.
anx_or_dep <- select(anx_or_dep, !Quartile.Range)

# Confirm which columns remain.
names(anx_or_dep)
##  [1] "Indicator"              "Group"                  "State"                 
##  [4] "Subgroup"               "Phase"                  "Time.Period"           
##  [7] "Time.Period.Label"      "Time.Period.Start.Date" "Time.Period.End.Date"  
## [10] "Value"                  "Low.CI"                 "High.CI"               
## [13] "Confidence.Interval"
  1. Convert columns to upper or lower case, especially when cleaning bad inputs.
# This step makes no change to the data; it only prints a note saying so.
print("No need to change letters because there are no bad inputs ")
## [1] "No need to change letters because there are no bad inputs "
  1. Use delimiters to extract important values from emails and from the still-uncleaned data after converting the letters to upper or lower case.
  1. Delimiters
# Shorten the Indicator labels to "Depression", "Anxiety", or
# "Depression or Anxiety".
anx_or_dep <- anx_or_dep %>%
  mutate(
    Indicator = case_when(
      # Combined indicator first: a label naming both disorders must not fall
      # through to one of the single-disorder branches below. Each disorder is
      # tested separately so the label matches whichever one is listed first.
      str_detect(Indicator, "Depressive Disorder") &
        str_detect(Indicator, "Anxiety Disorder") ~ "Depression or Anxiety",
      str_detect(Indicator, "Symptoms of Depressive Disorder") ~ "Depression",
      str_detect(Indicator, "Symptoms of Anxiety Disorder") ~ "Anxiety",
      TRUE ~ Indicator
    ) %>%
      # Labels not matched above keep their text minus the boilerplate words.
      str_replace_all("Symptoms of | Disorder", "")
  )

# Preview the first seven rows to check the relabelled Indicator column.
anx_or_dep %>%
  head(7)
##    Indicator             Group         State      Subgroup Phase Time.Period
## 1 Depression National Estimate United States United States     1           1
## 2 Depression            By Age United States 18 - 29 years     1           1
## 3 Depression            By Age United States 30 - 39 years     1           1
## 4 Depression            By Age United States 40 - 49 years     1           1
## 5 Depression            By Age United States 50 - 59 years     1           1
## 6 Depression            By Age United States 60 - 69 years     1           1
## 7 Depression            By Age United States 70 - 79 years     1           1
##      Time.Period.Label Time.Period.Start.Date Time.Period.End.Date Value Low.CI
## 1 Apr 23 - May 5, 2020             04/23/2020           05/05/2020  23.5   22.7
## 2 Apr 23 - May 5, 2020             04/23/2020           05/05/2020  32.7   30.2
## 3 Apr 23 - May 5, 2020             04/23/2020           05/05/2020  25.7   24.1
## 4 Apr 23 - May 5, 2020             04/23/2020           05/05/2020  24.8   23.3
## 5 Apr 23 - May 5, 2020             04/23/2020           05/05/2020  23.2   21.5
## 6 Apr 23 - May 5, 2020             04/23/2020           05/05/2020  18.4   17.0
## 7 Apr 23 - May 5, 2020             04/23/2020           05/05/2020  13.6   11.8
##   High.CI Confidence.Interval
## 1    24.3         22.7 - 24.3
## 2    35.2         30.2 - 35.2
## 3    27.3         24.1 - 27.3
## 4    26.2         23.3 - 26.2
## 5    25.0         21.5 - 25.0
## 6    19.7         17.0 - 19.7
## 7    15.5         11.8 - 15.5
# List the column names after the Indicator relabelling.
names(anx_or_dep)
##  [1] "Indicator"              "Group"                  "State"                 
##  [4] "Subgroup"               "Phase"                  "Time.Period"           
##  [7] "Time.Period.Label"      "Time.Period.Start.Date" "Time.Period.End.Date"  
## [10] "Value"                  "Low.CI"                 "High.CI"               
## [13] "Confidence.Interval"
  1. Combine important columns into one, such as Fname, MI, and Lname into FullName.
# This step makes no change to the data; it only prints a note saying so.
print("No need to combine columns because the dates are already combined. Still not going to delete the combined and the separate dates")
## [1] "No need to combine columns because the dates are already combined. Still not going to delete the combined and the separate dates"
  1. Change the data types in the data frame and use the correct formats (such as date and time).
# Parse the period start/end columns from "mm/dd/YYYY" text into Date values.
# The result is assigned before str() is called: str() returns NULL, so ending
# the assigned pipeline with it would overwrite anx_or_dep with NULL.
anx_or_dep <- anx_or_dep %>%
  mutate(
    Time.Period.Start.Date = as.Date(Time.Period.Start.Date, format = "%m/%d/%Y"),
    Time.Period.End.Date = as.Date(Time.Period.End.Date, format = "%m/%d/%Y")
  )

# Confirm the new column types.
str(anx_or_dep)
## 'data.frame':    16794 obs. of  13 variables:
##  $ Indicator             : chr  "Depression" "Depression" "Depression" "Depression" ...
##  $ Group                 : chr  "National Estimate" "By Age" "By Age" "By Age" ...
##  $ State                 : chr  "United States" "United States" "United States" "United States" ...
##  $ Subgroup              : chr  "United States" "18 - 29 years" "30 - 39 years" "40 - 49 years" ...
##  $ Phase                 : chr  "1" "1" "1" "1" ...
##  $ Time.Period           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Time.Period.Label     : chr  "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" "Apr 23 - May 5, 2020" ...
##  $ Time.Period.Start.Date: Date, format: "2020-04-23" "2020-04-23" ...
##  $ Time.Period.End.Date  : Date, format: "2020-05-05" "2020-05-05" ...
##  $ Value                 : num  23.5 32.7 25.7 24.8 23.2 18.4 13.6 14.4 20.8 26.1 ...
##  $ Low.CI                : num  22.7 30.2 24.1 23.3 21.5 17 11.8 9 19.6 25.2 ...
##  $ High.CI               : num  24.3 35.2 27.3 26.2 25 19.7 15.5 21.4 22 27.1 ...
##  $ Confidence.Interval   : chr  "22.7 - 24.3" "30.2 - 35.2" "24.1 - 27.3" "23.3 - 26.2" ...
  1. Save to an Excel sheet.
# NOTE(review): the write_xlsx() call is commented out, so this step does not
# write a file even though the message below reports a save. Re-enable the
# call (or change the message) once the export is meant to run.
#write_xlsx(anx_or_dep, "anxiety_or_depression.xlsx")

print("Data saved as an excel file")
## [1] "Data saved as an excel file"