Data Prep

Load Libraries

# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right

library(psych) # for the describe() command
library(naniar) # for the gg_miss_upset() command
library(expss) # for the cross_cases() command

## Loading required package: maditr

## 
## To aggregate data: take(mtcars, mean_mpg = mean(mpg), by = am)

## 
## Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
##  To return to the console output, use 'expss_output_default()'.

## 
## Attaching package: 'expss'

## The following object is masked from 'package:naniar':
## 
##     is_na

Import Data

# for the lab, you'll import the CSV file you downloaded along with the current file we're working in (an RMD file)
# for the homework, you'll download the CSV file from your chosen README page (should be titled arc_data_final.csv or eammi2_data_final.csv)
# header = TRUE tells read.csv() that the first row of the file holds the column names
# (TRUE is spelled out because T is an ordinary variable that can be reassigned)
df <- read.csv(file = "data/arc_data_final.csv", header = TRUE)

Viewing Data

# these are commands useful for viewing a dataframe
# you can also click the object in the environment tab to view it in a new window
# names() lists the column names of the dataframe
names(df)

##  [1] "X"                    "gender"               "trans"               
##  [4] "sexual_orientation"   "ethnicity"            "relationship_status" 
##  [7] "age"                  "urban_rural"          "income"              
## [10] "education"            "employment"           "treatment"           
## [13] "health"               "mhealth"              "sleep_hours"         
## [16] "exercise_cat"         "pet"                  "covid_pos"           
## [19] "covid_neg"            "big5_open"            "big5_con"            
## [22] "big5_agr"             "big5_neu"             "big5_ext"            
## [25] "pswq_std"             "iou"                  "mfq_26"              
## [28] "mfq_state"            "rse"                  "school_covid_support"
## [31] "school_att"           "pas_covid"            "pss"                 
## [34] "phq"                  "gad"                  "edeq12"              
## [37] "brs"                  "swemws"               "isolation"           
## [40] "support"

# head() prints the first six rows so you can spot-check the imported values
head(df)

##    X gender trans    sexual_orientation                     ethnicity
## 1  1 female    no Heterosexual/Straight White - British, Irish, other
## 2 20   male    no Heterosexual/Straight White - British, Irish, other
## 3 30 female    no Heterosexual/Straight White - British, Irish, other
## 4 31 female    no Heterosexual/Straight White - British, Irish, other
## 5 32   <NA>  <NA>                  <NA>                          <NA>
## 6 33 female    no Heterosexual/Straight White - British, Irish, other
##                        relationship_status                 age urban_rural
## 1 In a relationship/married and cohabiting                <NA>        city
## 2                        Prefer not to say          1 under 18        city
## 3                        Prefer not to say          1 under 18        city
## 4 In a relationship/married and cohabiting 4 between 36 and 45        town
## 5                                     <NA>                <NA>        <NA>
## 6 In a relationship/married and cohabiting 4 between 36 and 45        city
##     income                              education               employment
## 1   3 high            6 graduate degree or higher               3 employed
## 2     <NA>                      prefer not to say 1 high school equivalent
## 3     <NA> 2 equivalent to high school completion 1 high school equivalent
## 4 2 middle                 5 undergraduate degree               3 employed
## 5     <NA>                                   <NA>                     <NA>
## 6 2 middle            6 graduate degree or higher               3 employed
##                    treatment                           health          mhealth
## 1 no psychological disorders something else or not applicable       none or NA
## 2               in treatment something else or not applicable anxiety disorder
## 3           not in treatment something else or not applicable       none or NA
## 4 no psychological disorders                   two conditions       none or NA
## 5                       <NA>                             <NA>       none or NA
## 6           not in treatment something else or not applicable       none or NA
##   sleep_hours       exercise_cat                   pet covid_pos covid_neg
## 1 3 7-8 hours 1 less than 1 hour                   cat         0         0
## 2 2 5-6 hours        2 1-2 hours                   cat         0         0
## 3 3 7-8 hours        3 2-5 hours                   dog         0         0
## 4 2 5-6 hours        2 1-2 hours               no pets         0         0
## 5        <NA>               <NA>                  <NA>         0         0
## 6 3 7-8 hours        2 1-2 hours multiple types of pet         0         0
##   big5_open big5_con big5_agr big5_neu big5_ext   pswq_std      iou mfq_26
## 1  5.333333 6.000000 4.333333 6.000000 2.000000  2.3094514 3.185185   4.20
## 2  5.333333 3.333333 4.333333 6.666667 1.666667  0.8509744 4.000000   3.35
## 3  5.000000 5.333333 6.666667 4.000000 6.000000 -1.1235082 1.592593   4.65
## 4  6.000000 5.666667 4.666667 4.000000 5.000000  1.1626810 3.370370   4.65
## 5        NA       NA       NA       NA       NA         NA       NA     NA
## 6  5.000000 6.000000 6.333333 2.666667       NA -0.3424552 1.703704   4.50
##   mfq_state rse school_covid_support school_att pas_covid  pss      phq
## 1     3.625 2.3                   NA         NA  3.222222 3.25 1.333333
## 2     3.000 1.6                   NA         NA  4.555556 3.75 3.333333
## 3     5.875 3.9                   NA         NA  3.333333 1.00 1.000000
## 4     4.000 1.7                   NA         NA  4.222222 3.25 2.333333
## 5        NA  NA                   NA         NA        NA   NA       NA
## 6     4.625 3.9                   NA         NA  3.222222 2.00 1.111111
##        gad   edeq12 brs   swemws isolation  support
## 1 1.857143 1.583333  NA 2.857143      2.25 2.500000
## 2 3.857143 1.833333  NA 2.285714      3.50 2.166667
## 3 1.142857 1.000000  NA 4.285714      1.00 5.000000
## 4 2.000000 1.666667  NA 3.285714      2.50 2.500000
## 5       NA       NA  NA       NA        NA       NA
## 6 1.428571 1.416667  NA 4.000000      1.75 3.666667

# str() shows the size of the dataframe plus each column's type and first few values
str(df)

## 'data.frame':    2073 obs. of  40 variables:
##  $ X                   : int  1 20 30 31 32 33 48 49 57 58 ...
##  $ gender              : chr  "female" "male" "female" "female" ...
##  $ trans               : chr  "no" "no" "no" "no" ...
##  $ sexual_orientation  : chr  "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" ...
##  $ ethnicity           : chr  "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" ...
##  $ relationship_status : chr  "In a relationship/married and cohabiting" "Prefer not to say" "Prefer not to say" "In a relationship/married and cohabiting" ...
##  $ age                 : chr  NA "1 under 18" "1 under 18" "4 between 36 and 45" ...
##  $ urban_rural         : chr  "city" "city" "city" "town" ...
##  $ income              : chr  "3 high" NA NA "2 middle" ...
##  $ education           : chr  "6 graduate degree or higher" "prefer not to say" "2 equivalent to high school completion" "5 undergraduate degree" ...
##  $ employment          : chr  "3 employed" "1 high school equivalent" "1 high school equivalent" "3 employed" ...
##  $ treatment           : chr  "no psychological disorders" "in treatment" "not in treatment" "no psychological disorders" ...
##  $ health              : chr  "something else or not applicable" "something else or not applicable" "something else or not applicable" "two conditions" ...
##  $ mhealth             : chr  "none or NA" "anxiety disorder" "none or NA" "none or NA" ...
##  $ sleep_hours         : chr  "3 7-8 hours" "2 5-6 hours" "3 7-8 hours" "2 5-6 hours" ...
##  $ exercise_cat        : chr  "1 less than 1 hour" "2 1-2 hours" "3 2-5 hours" "2 1-2 hours" ...
##  $ pet                 : chr  "cat" "cat" "dog" "no pets" ...
##  $ covid_pos           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ covid_neg           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ big5_open           : num  5.33 5.33 5 6 NA ...
##  $ big5_con            : num  6 3.33 5.33 5.67 NA ...
##  $ big5_agr            : num  4.33 4.33 6.67 4.67 NA ...
##  $ big5_neu            : num  6 6.67 4 4 NA ...
##  $ big5_ext            : num  2 1.67 6 5 NA ...
##  $ pswq_std            : num  2.309 0.851 -1.124 1.163 NA ...
##  $ iou                 : num  3.19 4 1.59 3.37 NA ...
##  $ mfq_26              : num  4.2 3.35 4.65 4.65 NA 4.5 NA 4.3 5.25 4.45 ...
##  $ mfq_state           : num  3.62 3 5.88 4 NA ...
##  $ rse                 : num  2.3 1.6 3.9 1.7 NA 3.9 NA 2.4 1.8 NA ...
##  $ school_covid_support: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ school_att          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pas_covid           : num  3.22 4.56 3.33 4.22 NA ...
##  $ pss                 : num  3.25 3.75 1 3.25 NA 2 NA 2 4 1.25 ...
##  $ phq                 : num  1.33 3.33 1 2.33 NA ...
##  $ gad                 : num  1.86 3.86 1.14 2 NA ...
##  $ edeq12              : num  1.58 1.83 1 1.67 NA ...
##  $ brs                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ swemws              : num  2.86 2.29 4.29 3.29 NA ...
##  $ isolation           : num  2.25 3.5 1 2.5 NA 1.75 NA 2 1.25 1 ...
##  $ support             : num  2.5 2.17 5 2.5 NA ...

Subsetting Data

# for the HW: use the codebook you created in the codebook activity to get the names of your variables (first column)
# list those names (in quotes) inside c() below to keep only those columns from the dataframe
# variables for the lab: id, variable2, variable3, variable5, variable8, variable10, variable11
# these are the variables I chose from the ARC data set
d <- df[
  ,
  c("X", "age", "mhealth", "pas_covid", "phq", "gad", "swemws"),
  drop = FALSE
]

Recoding Variables

# categorical variables need to be stored as factors
# the values themselves stay the same, but R will treat a factor differently from plain text at times
d[c("age", "mhealth")] <- lapply(d[c("age", "mhealth")], as.factor)
# check the structure to confirm both columns are now factors
str(d)

## 'data.frame':    2073 obs. of  7 variables:
##  $ X        : int  1 20 30 31 32 33 48 49 57 58 ...
##  $ age      : Factor w/ 5 levels "1 under 18","2 between 18 and 25",..: NA 1 1 4 NA 4 4 4 4 1 ...
##  $ mhealth  : Factor w/ 8 levels "anxiety disorder",..: 5 1 5 5 5 5 3 5 1 5 ...
##  $ pas_covid: num  3.22 4.56 3.33 4.22 NA ...
##  $ phq      : num  1.33 3.33 1 2.33 NA ...
##  $ gad      : num  1.86 3.86 1.14 2 NA ...
##  $ swemws   : num  2.86 2.29 4.29 3.29 NA ...

Missing Data

We looked at the missing data in our data set and found that about 56% of the participants in our sample (1,161 of 2,073) were missing a response on at least one of our variables. We dropped these participants from our analysis, which is not advisable and runs the risk of dropping vulnerable groups or skewing results. However, we will proceed for the sake of this class using the reduced data set (n = 912).

# use the gg_miss_upset() command for a visualization of your missing data
# the X column (participant ID) is left out by name so only the six study variables are plotted
gg_miss_upset(d[setdiff(names(d), "X")], nsets = 6)

# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
d2 <- na.omit(d)
# proportion of participants retained, computed from the data rather than typed in by hand
nrow(d2) / nrow(d)

## [1] 0.4399421

# 912 of 2073 participants (about 44%) had complete data, so about 56% were dropped

Exporting Data

# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments (labs will use the files I provide you)
# make sure you give it a name that is memorable!
# and make sure you save it to your Data folder!
# row.names = FALSE keeps R's row numbers from being written out as an extra column
write.csv(d2, file = "data/mydata.csv", row.names = FALSE)
# recommend using something like this for HW

# since we've created a cleaned dataframe in d2, we'll use that for the rest of the lab/HW

Basic Statistics

Univariate Plots: Histograms & Tables

# frequency counts for each age category
table(d2$age)

## 
##          1 under 18 2 between 18 and 25 3 between 26 and 35 4 between 36 and 45 
##                 585                  54                   6                  87 
##           5 over 45 
##                 180

# frequency counts for each mhealth category
table(d2$mhealth)

## 
##              anxiety disorder                       bipolar 
##                            94                             3 
##                    depression              eating disorders 
##                            23                            17 
##                    none or NA obsessive compulsive disorder 
##                           724                            14 
##                         other                          ptsd 
##                            23                            14

# histograms of the four continuous variables, one plot per variable
hist(d2$pas_covid)

hist(d2$phq)

hist(d2$gad)

hist(d2$swemws)

Univariate Normality

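Skew and kurtosis do not apply to our categorical variables (age and mhealth), so the values describe() reports for those rows are not interpreted; those variables are summarized with frequency tables instead.

We analyzed the skew and kurtosis of our continuous variables (pas_covid, phq, gad, and swemws) and all were within the accepted range (-2/+2).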

# describe() prints descriptive statistics (n, mean, sd, skew, kurtosis, ...) for every column
# rows marked with * (age, mhealth) are the factor columns, and X is the ID column,
# so only the pas_covid, phq, gad, and swemws rows are interpreted for normality
describe(d2)

##           vars   n    mean      sd  median trimmed     mad min  max range  skew
## X            1 912 4623.25 2562.60 4717.00 4661.90 3242.45  20 8860  8840 -0.10
## age*         2 912    2.15    1.67    1.00    1.94    0.00   1    5     4  0.91
## mhealth*     3 912    4.62    1.38    5.00    4.88    0.00   1    8     7 -1.53
## pas_covid    4 912    3.22    0.68    3.22    3.24    0.66   1    5     4 -0.19
## phq          5 912    2.04    0.86    1.89    1.96    0.99   1    4     3  0.68
## gad          6 912    1.98    0.90    1.71    1.88    0.85   1    4     3  0.75
## swemws       7 912    3.18    0.83    3.29    3.20    0.85   1    5     4 -0.20
##           kurtosis    se
## X            -1.20 84.86
## age*         -1.03  0.06
## mhealth*      2.73  0.05
## pas_covid     0.13  0.02
## phq          -0.59  0.03
## gad          -0.61  0.03
## swemws       -0.36  0.03

Bivariate Plots

Crosstabs

# cross_cases() builds a crosstab of counts: age categories in the rows, mhealth categories in the columns
cross_cases(d2, age, mhealth)

	mhealth
	anxiety disorder	bipolar	depression	eating disorders	none or NA	obsessive compulsive disorder	other	ptsd
age
1 under 18	63	2	8	14	461	10	16	11
2 between 18 and 25	10	1	1	3	36	2	1
3 between 26 and 35	1				5
4 between 36 and 45	12		4		66	2	3
5 over 45	8		10		156		3	3
#Total cases	94	3	23	17	724	14	23	14

Scatterplots

# scatterplots pair two continuous variables: the first goes on the x-axis, the second on the y-axis
with(
  d2,
  plot(
    x = pas_covid,
    y = phq,
    main = "Scatterplot of pas_covid and phq",
    xlab = "pas_covid",
    ylab = "phq"
  )
)

with(
  d2,
  plot(
    x = gad,
    y = swemws,
    main = "Scatterplot of gad and swemws",
    xlab = "gad",
    ylab = "swemws"
  )
)

Boxplots

# the formula is written y ~ x: the continuous variable (y) is split by the categorical variable (x)
# titles name the actual variables, matching the scatterplot titles above
boxplot(pas_covid ~ age, data = d2,
        main = "Boxplot of pas_covid by age",
        xlab = "x (age)",
        ylab = "Y (pas_covid)")

boxplot(swemws ~ mhealth, data = d2,
        main = "Boxplot of swemws by mhealth",
        xlab = "x (mhealth)",
        ylab = "Y (swemws)")

# x is categorical and y is continuous; a boxplot always pairs one continuous variable with one categorical variable

P421 Lab - Data Prep & Basic Statistics

Emma Fox

2024-02-25