Data Prep

Load Libraries

# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right

library(tidyverse) # for the map() command

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(psych) # for the describe() command

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(naniar) # for the gg_miss_upset() command
library(expss) # for the cross_cases() command

## Loading required package: maditr
## 
## To select rows from data: rows(mtcars, am==0)
## 
## 
## Attaching package: 'maditr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, coalesce, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following object is masked from 'package:readr':
## 
##     cols
## 
## 
## Use 'expss_output_viewer()' to display tables in the RStudio Viewer.
##  To return to the console output, use 'expss_output_default()'.
## 
## 
## Attaching package: 'expss'
## 
## The following object is masked from 'package:naniar':
## 
##     is_na
## 
## The following objects are masked from 'package:stringr':
## 
##     fixed, regex
## 
## The following objects are masked from 'package:dplyr':
## 
##     compute, contains, na_if, recode, vars, where
## 
## The following objects are masked from 'package:purrr':
## 
##     keep, modify, modify_if, when
## 
## The following objects are masked from 'package:tidyr':
## 
##     contains, nest
## 
## The following object is masked from 'package:ggplot2':
## 
##     vars

Import Data

# for the lab, you'll import the CSV file you downloaded along with the current file we're working in (an RMD file)
# for the homework, you'll download the CSV file from your chosen README page (should be titled arc_data_final.csv or eammi2_data_final.csv)
# header = TRUE tells read.csv() that the first row of the file holds the column names
# (TRUE is spelled out because T is an ordinary variable that can be reassigned)
df <- read.csv(file = "Data/arc_data_final.csv", header = TRUE)

Viewing Data

# these are commands useful for viewing a dataframe
# you can also click the object in the environment tab to view it in a new window
# names() prints the column names of the dataframe
names(df)

##  [1] "X"                    "gender"               "trans"               
##  [4] "sexual_orientation"   "ethnicity"            "relationship_status" 
##  [7] "age"                  "urban_rural"          "income"              
## [10] "education"            "employment"           "treatment"           
## [13] "health"               "mhealth"              "sleep_hours"         
## [16] "exercise"             "pet"                  "covid_pos"           
## [19] "covid_neg"            "big5_open"            "big5_con"            
## [22] "big5_agr"             "big5_neu"             "big5_ext"            
## [25] "pswq"                 "iou"                  "mfq_26"              
## [28] "mfq_state"            "rse"                  "school_covid_support"
## [31] "school_att"           "pas_covid"            "pss"                 
## [34] "phq"                  "gad"                  "edeq12"              
## [37] "brs"                  "swemws"               "isolation_a"         
## [40] "isolation_c"          "support"

# head() prints the first six rows of the dataframe
head(df)

##    X gender trans    sexual_orientation                     ethnicity
## 1  1 female    no Heterosexual/Straight White - British, Irish, other
## 2 20   male    no Heterosexual/Straight White - British, Irish, other
## 3 30 female    no Heterosexual/Straight White - British, Irish, other
## 4 31 female    no Heterosexual/Straight White - British, Irish, other
## 5 32   <NA>  <NA>                  <NA>                          <NA>
## 6 33 female    no Heterosexual/Straight White - British, Irish, other
##                        relationship_status                 age urban_rural
## 1 In a relationship/married and cohabiting                <NA>        city
## 2                        Prefer not to say          1 under 18        city
## 3                        Prefer not to say          1 under 18        city
## 4 In a relationship/married and cohabiting 4 between 36 and 45        town
## 5                                     <NA>                <NA>        <NA>
## 6 In a relationship/married and cohabiting 4 between 36 and 45        city
##     income                              education               employment
## 1   3 high            6 graduate degree or higher               3 employed
## 2     <NA>                      prefer not to say 1 high school equivalent
## 3     <NA> 2 equivalent to high school completion 1 high school equivalent
## 4 2 middle                 5 undergraduate degree               3 employed
## 5     <NA>                                   <NA>                     <NA>
## 6 2 middle            6 graduate degree or higher               3 employed
##                    treatment                           health          mhealth
## 1 no psychological disorders something else or not applicable       none or NA
## 2               in treatment something else or not applicable anxiety disorder
## 3           not in treatment something else or not applicable       none or NA
## 4 no psychological disorders                   two conditions       none or NA
## 5                       <NA>                             <NA>       none or NA
## 6           not in treatment something else or not applicable       none or NA
##   sleep_hours exercise                   pet covid_pos covid_neg big5_open
## 1 3 7-8 hours      0.0                   cat         0         0  5.333333
## 2 2 5-6 hours      2.0                   cat         0         0  5.333333
## 3 3 7-8 hours      3.0                   dog         0         0  5.000000
## 4 2 5-6 hours      1.5               no pets         0         0  6.000000
## 5        <NA>       NA                  <NA>         0         0        NA
## 6 3 7-8 hours      1.0 multiple types of pet         0         0  5.000000
##   big5_con big5_agr big5_neu big5_ext     pswq      iou mfq_26 mfq_state rse
## 1 6.000000 4.333333 6.000000 2.000000 4.937500 3.185185   4.20     3.625 2.3
## 2 3.333333 4.333333 6.666667 1.666667 3.357143 4.000000   3.35     3.000 1.6
## 3 5.333333 6.666667 4.000000 6.000000 1.857143 1.592593   4.65     5.875 3.9
## 4 5.666667 4.666667 4.000000 5.000000 3.937500 3.370370   4.65     4.000 1.7
## 5       NA       NA       NA       NA       NA       NA     NA        NA  NA
## 6 6.000000 6.333333 2.666667       NA 2.625000 1.703704   4.50     4.625 3.9
##   school_covid_support school_att pas_covid  pss      phq      gad   edeq12 brs
## 1                   NA         NA  3.222222 3.25 1.333333 1.857143 1.583333  NA
## 2                   NA         NA  4.555556 3.75 3.333333 3.857143 1.833333  NA
## 3                   NA         NA  3.333333 1.00 1.000000 1.142857 1.000000  NA
## 4                   NA         NA  4.222222 3.25 2.333333 2.000000 1.666667  NA
## 5                   NA         NA        NA   NA       NA       NA       NA  NA
## 6                   NA         NA  3.222222 2.00 1.111111 1.428571 1.416667  NA
##     swemws isolation_a isolation_c  support
## 1 2.857143        2.25          NA 2.500000
## 2 2.285714          NA         3.5 2.166667
## 3 4.285714          NA         1.0 5.000000
## 4 3.285714        2.50          NA 2.500000
## 5       NA          NA          NA       NA
## 6 4.000000        1.75          NA 3.666667

# str() prints the number of observations and variables, plus each column's type and first few values
str(df)

## 'data.frame':    2073 obs. of  41 variables:
##  $ X                   : int  1 20 30 31 32 33 48 49 57 58 ...
##  $ gender              : chr  "female" "male" "female" "female" ...
##  $ trans               : chr  "no" "no" "no" "no" ...
##  $ sexual_orientation  : chr  "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" ...
##  $ ethnicity           : chr  "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" ...
##  $ relationship_status : chr  "In a relationship/married and cohabiting" "Prefer not to say" "Prefer not to say" "In a relationship/married and cohabiting" ...
##  $ age                 : chr  NA "1 under 18" "1 under 18" "4 between 36 and 45" ...
##  $ urban_rural         : chr  "city" "city" "city" "town" ...
##  $ income              : chr  "3 high" NA NA "2 middle" ...
##  $ education           : chr  "6 graduate degree or higher" "prefer not to say" "2 equivalent to high school completion" "5 undergraduate degree" ...
##  $ employment          : chr  "3 employed" "1 high school equivalent" "1 high school equivalent" "3 employed" ...
##  $ treatment           : chr  "no psychological disorders" "in treatment" "not in treatment" "no psychological disorders" ...
##  $ health              : chr  "something else or not applicable" "something else or not applicable" "something else or not applicable" "two conditions" ...
##  $ mhealth             : chr  "none or NA" "anxiety disorder" "none or NA" "none or NA" ...
##  $ sleep_hours         : chr  "3 7-8 hours" "2 5-6 hours" "3 7-8 hours" "2 5-6 hours" ...
##  $ exercise            : num  0 2 3 1.5 NA 1 NA 2 2 1.7 ...
##  $ pet                 : chr  "cat" "cat" "dog" "no pets" ...
##  $ covid_pos           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ covid_neg           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ big5_open           : num  5.33 5.33 5 6 NA ...
##  $ big5_con            : num  6 3.33 5.33 5.67 NA ...
##  $ big5_agr            : num  4.33 4.33 6.67 4.67 NA ...
##  $ big5_neu            : num  6 6.67 4 4 NA ...
##  $ big5_ext            : num  2 1.67 6 5 NA ...
##  $ pswq                : num  4.94 3.36 1.86 3.94 NA ...
##  $ iou                 : num  3.19 4 1.59 3.37 NA ...
##  $ mfq_26              : num  4.2 3.35 4.65 4.65 NA 4.5 NA 4.3 5.25 4.45 ...
##  $ mfq_state           : num  3.62 3 5.88 4 NA ...
##  $ rse                 : num  2.3 1.6 3.9 1.7 NA 3.9 NA 2.4 1.8 NA ...
##  $ school_covid_support: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ school_att          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pas_covid           : num  3.22 4.56 3.33 4.22 NA ...
##  $ pss                 : num  3.25 3.75 1 3.25 NA 2 NA 2 4 1.25 ...
##  $ phq                 : num  1.33 3.33 1 2.33 NA ...
##  $ gad                 : num  1.86 3.86 1.14 2 NA ...
##  $ edeq12              : num  1.58 1.83 1 1.67 NA ...
##  $ brs                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ swemws              : num  2.86 2.29 4.29 3.29 NA ...
##  $ isolation_a         : num  2.25 NA NA 2.5 NA 1.75 NA 2 1.25 NA ...
##  $ isolation_c         : num  NA 3.5 1 NA NA NA NA NA NA 1 ...
##  $ support             : num  2.5 2.17 5 2.5 NA ...

Subsetting Data

# for the HW: take the variable names from the first column of the codebook you made in the codebook activity
# and list them below to keep only those columns from the full dataframe
# columns kept for this lab: sleep_hours, mhealth, iou, mfq_26, pas_covid, pss
d <- df[, c("sleep_hours", "mhealth", "iou",
            "mfq_26", "pas_covid", "pss"), drop = FALSE]

Recoding Variables

# the two categorical variables are converted to factors in one step
# the values themselves do not change; R just handles a factor differently from plain text at times
d[c("sleep_hours", "mhealth")] <- lapply(d[c("sleep_hours", "mhealth")], as.factor)

# confirm the new column types
str(d)

## 'data.frame':    2073 obs. of  6 variables:
##  $ sleep_hours: Factor w/ 5 levels "1 < 5 hours",..: 3 2 3 2 NA 3 NA 3 3 4 ...
##  $ mhealth    : Factor w/ 8 levels "anxiety disorder",..: 5 1 5 5 5 5 3 5 1 5 ...
##  $ iou        : num  3.19 4 1.59 3.37 NA ...
##  $ mfq_26     : num  4.2 3.35 4.65 4.65 NA 4.5 NA 4.3 5.25 4.45 ...
##  $ pas_covid  : num  3.22 4.56 3.33 4.22 NA ...
##  $ pss        : num  3.25 3.75 1 3.25 NA 2 NA 2 4 1.25 ...

Missing Data

# use the gg_miss_upset() command for a visualization of your missing data
# the whole dataframe is passed in: d has no id column to drop, and sleep_hours (the first column)
# contains missing values, so it needs to be part of the plot
# n_var_miss() counts the variables that have at least one missing value, so each of them gets its own set
gg_miss_upset(d, nsets = n_var_miss(d))

# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
d2 <- na.omit(d)

Exporting Data

# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments (labs will use the files I provide you)
# make sure you give it a name that is memorable!
# and make sure you save it to your Data folder!
# row.names = FALSE keeps R's row numbers out of the file
# (FALSE is spelled out because F is an ordinary variable that can be reassigned)
write.csv(d2, file = "Data/arc_data_finalclean.csv", row.names = FALSE)

# since we've created a cleaned dataframe in d2, we'll use that for the rest of the lab/HW

Basic Statistics

Univariate Plots: Histograms & Tables

# frequency table: number of participants in each sleep_hours category
table(d2$sleep_hours)

## 
##  1 < 5 hours  2 5-6 hours  3 7-8 hours 4 8-10 hours 5 > 10 hours 
##           83          320          455          276           48

# frequency table: number of participants in each mhealth category
table(d2$mhealth)

## 
##              anxiety disorder                       bipolar 
##                           123                             5 
##                    depression              eating disorders 
##                            28                            28 
##                    none or NA obsessive compulsive disorder 
##                           916                            27 
##                         other                          ptsd 
##                            35                            20

# histograms of the four continuous variables, to check the shape of each distribution
hist(d2$iou)

hist(d2$mfq_26)

hist(d2$pas_covid)

hist(d2$pss)

Univariate Normality

The cutoffs for both skew and kurtosis are -2 to +2. iou = skew (0.50) and kurtosis (-0.59) are okay. mfq_26 = skew (-0.32) and kurtosis (0.14) are okay. pas_covid = skew (-0.19) and kurtosis (-0.04) are okay. pss = skew (0.06) and kurtosis (-0.76) are okay.

# describe() prints n, mean, sd, median, skew, kurtosis, and more for every column
# rows marked with * in the output are the factor variables (sleep_hours, mhealth);
# their skew/kurtosis are presumably computed from the level codes, so only the four continuous variables are interpreted
describe(d2)

##              vars    n mean   sd median trimmed  mad min max range  skew
## sleep_hours*    1 1182 2.90 0.97   3.00    2.92 1.48 1.0   5   4.0  0.00
## mhealth*        2 1182 4.63 1.41   5.00    4.87 0.00 1.0   8   7.0 -1.45
## iou             3 1182 2.56 0.90   2.41    2.50 0.99 1.0   5   4.0  0.50
## mfq_26          4 1182 4.31 0.67   4.35    4.33 0.67 1.8   6   4.2 -0.32
## pas_covid       5 1182 3.23 0.68   3.22    3.24 0.66 1.0   5   4.0 -0.19
## pss             6 1182 2.94 0.95   3.00    2.93 1.11 1.0   5   4.0  0.06
##              kurtosis   se
## sleep_hours*    -0.48 0.03
## mhealth*         2.52 0.04
## iou             -0.59 0.03
## mfq_26           0.14 0.02
## pas_covid       -0.04 0.02
## pss             -0.76 0.03

Bivariate Plots

Crosstabs

# crosstab of counts: sleep_hours categories in the rows, mhealth categories in the columns
cross_cases(d2, sleep_hours, mhealth)

	mhealth
	anxiety disorder	bipolar	depression	eating disorders	none or NA	obsessive compulsive disorder	other	ptsd
sleep_hours
1 < 5 hours	19	2	1	3	45	2	5	6
2 5-6 hours	43	1	9	8	232	8	10	9
3 7-8 hours	36		10	10	369	12	15	3
4 8-10 hours	20	2	7	6	233	2	4	2
5 > 10 hours	5		1	1	37	3	1
#Total cases	123	5	28	28	916	27	35	20

Scatterplots

# scatterplot with iou on the x-axis and mfq_26 on the y-axis
plot(x = d2$iou, y = d2$mfq_26,
     xlab = "iou", ylab = "mfq_26",
     main = "Scatterplot of iou and mfq_26")

# scatterplot with pas_covid on the x-axis and pss on the y-axis
plot(x = d2$pas_covid, y = d2$pss,
     xlab = "pas_covid", ylab = "pss",
     main = "Scatterplot of pas_covid and pss")

Boxplots

# boxplot of iou (y-axis) for each sleep_hours category (x-axis)
boxplot(iou ~ sleep_hours, data = d2,
        xlab = "x sleep_hours", ylab = "Y iou",
        main = "Boxplot of sleep_hours and iou")

# boxplot of mfq_26 (y-axis) for each mhealth category (x-axis)
# in the formula y ~ x the grouping variable (mhealth) is drawn along the x-axis,
# so the x label names mhealth and the y label names mfq_26
boxplot(mfq_26 ~ mhealth, data = d2,
        main = "Boxplot of mhealth and mfq_26",
        xlab = "x mhealth",
        ylab = "Y mfq_26")

P421 Lab - Data Prep & Basic Statistics

Runtian Huang

2023-07-02

Data Prep

Load Libraries

Import Data

Viewing Data

Subsetting Data

Recoding Variables

Missing Data

Exporting Data

Basic Statistics

Univariate Plots: Histograms & Tables

Univariate Normality

Bivariate Plots

Crosstabs

Scatterplots

Boxplots