library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(pastecs)
##
## Attaching package: 'pastecs'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## The following object is masked from 'package:tidyr':
##
## extract
# Load the saved workspace; the code below expects it to provide the
# `puf2023_102124` data frame.
load("NSDUH_2023.Rdata")

# Keep only the two survey variables analysed in this document.
AlcoholMH <- puf2023_102124 %>%
  select(ALCYRTOT, IMPYDAYS)

# Quick five-number summary of both columns before any cleaning.
summary(AlcoholMH)
## ALCYRTOT IMPYDAYS
## Min. : 1.0 Min. : 0.0
## 1st Qu.: 36.0 1st Qu.: 1.0
## Median :200.0 Median :999.0
## Mean :466.2 Mean :554.4
## 3rd Qu.:991.0 3rd Qu.:999.0
## Max. :998.0 Max. :999.0
# Descriptive statistics for the raw (uncleaned) IMPYDAYS column.
stat.desc(AlcoholMH[["IMPYDAYS"]])
## x
## nbr.val 5.670500e+04
## nbr.null 1.352000e+04
## nbr.na 0.000000e+00
## min 0.000000e+00
## max 9.990000e+02
## range 9.990000e+02
## sum 3.143451e+07
## median 9.990000e+02
## mean 5.543517e+02
## SE.mean 2.048868e+00
## CI.mean.0.95 4.015793e+00
## var 2.380396e+05
## std.dev 4.878930e+02
## coef.var 8.801146e-01
# Keep only rows whose ALCYRTOT is a plausible day count (0-365).
# The raw column reaches 998, so values above 365 are presumably survey
# codes rather than real counts -- TODO confirm against the NSDUH codebook.
# Columns are referenced by bare name: inside filter() the columns of the
# piped-in data are already in scope, so `AlcoholMH$...` is unnecessary and
# would silently misalign if the piped input ever differed from `AlcoholMH`.
AlcoholMHClean <- AlcoholMH %>%
  filter(ALCYRTOT >= 0, ALCYRTOT <= 365)

# Apply the same 0-365 restriction to IMPYDAYS (raw maximum is 999).
AlcoholMHClean2 <- AlcoholMHClean %>%
  filter(IMPYDAYS >= 0, IMPYDAYS <= 365)

# Re-check the ranges after cleaning.
summary(AlcoholMHClean2)
## ALCYRTOT IMPYDAYS
## Min. : 1.00 Min. : 0.00
## 1st Qu.: 12.00 1st Qu.: 0.00
## Median : 48.00 Median : 0.00
## Mean : 81.61 Mean : 21.07
## 3rd Qu.:108.00 3rd Qu.: 8.00
## Max. :365.00 Max. :365.00
# Descriptive statistics for IMPYDAYS after restricting both columns to 0-365.
stat.desc(AlcoholMHClean2[["IMPYDAYS"]])
## x
## nbr.val 1.876000e+04
## nbr.null 9.976000e+03
## nbr.na 0.000000e+00
## min 0.000000e+00
## max 3.650000e+02
## range 3.650000e+02
## sum 3.951940e+05
## median 0.000000e+00
## mean 2.106578e+01
## SE.mean 4.428110e-01
## CI.mean.0.95 8.679496e-01
## var 3.678491e+03
## std.dev 6.065056e+01
## coef.var 2.879104e+00
# Histogram of the cleaned IMPYDAYS values.
hist(
  AlcoholMHClean2$IMPYDAYS,
  breaks = 20,
  main = "Number of Days Missed Work",
  xlab = "Days"
)

# Scatterplot of the two variables on their original scale.
ggplot(AlcoholMHClean2, aes(x = IMPYDAYS, y = ALCYRTOT)) +
  geom_point() +
  ggtitle("Untransformed Data")

# Same scatterplot with both axes log-transformed.
# NOTE: log(0) is -Inf, so rows where IMPYDAYS is 0 have no finite x value.
ggplot(AlcoholMHClean2, aes(x = log(IMPYDAYS), y = log(ALCYRTOT))) +
  geom_point() +
  ggtitle("Log-Transformed Data")

# Two-column table pairing IMPYDAYS with its natural log, for inspection.
AlcoholMHClean2_log <- AlcoholMHClean2 %>%
  mutate(LOG_Days = log(IMPYDAYS)) %>%
  select(IMPYDAYS, LOG_Days)

head(AlcoholMHClean2_log)
## # A tibble: 6 × 2
## IMPYDAYS LOG_Days
## <dbl> <dbl>
## 1 6 1.79
## 2 20 3.00
## 3 10 2.30
## 4 0 -Inf
## 5 0 -Inf
## 6 0 -Inf
# Show the structure (column types and attributes) of the log-transformed table.
str(AlcoholMHClean2_log)
## tibble [18,760 × 2] (S3: tbl_df/tbl/data.frame)
## $ IMPYDAYS: num [1:18760] 6 20 10 0 0 0 0 10 0 0 ...
## ..- attr(*, "label")= chr "HOW MANY DAY IN PAST YR YOU WERE UNABLE TO WORK"
## ..- attr(*, "format.sas")= chr "IMPYDAYSFMT"
## $ LOG_Days: num [1:18760] 1.79 3 2.3 -Inf -Inf ...
## ..- attr(*, "label")= chr "HOW MANY DAY IN PAST YR YOU WERE UNABLE TO WORK"
## ..- attr(*, "format.sas")= chr "IMPYDAYSFMT"
The log transform doesn’t work here because so many entries have the value zero, and log(0) is -Inf. So we are going to try the square root instead.
# Two-column table pairing IMPYDAYS with its square root; unlike the log,
# sqrt(0) is 0, so zero-day rows stay finite.
AlcoholMHClean2_sqrt <- AlcoholMHClean2 %>%
  mutate(Days_SQRT = sqrt(IMPYDAYS)) %>%
  select(IMPYDAYS, Days_SQRT)

head(AlcoholMHClean2_sqrt)
## # A tibble: 6 × 2
## IMPYDAYS Days_SQRT
## <dbl> <dbl>
## 1 6 2.45
## 2 20 4.47
## 3 10 3.16
## 4 0 0
## 5 0 0
## 6 0 0
This looks like it will work much better for a histogram.
# Plot the square-root-transformed column (Days_SQRT). Plotting IMPYDAYS here
# would simply redraw the untransformed histogram. The x-axis label names the
# transform actually applied (square root, not log).
hist(
  AlcoholMHClean2_sqrt$Days_SQRT,
  main = "Transformed Data # Days Missed Work",
  xlab = "Square Root of Days",
  breaks = 20
)
I am not sure this is useful at this point. The two histograms look the
same. I will need to do something else with this data to review it. I
will keep playing around with it.