Data Prep

Load Libraries

# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right

library(tidyverse) # for the map() command

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(psych) # for the describe() command

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(naniar) # for the gg_miss-upset() command
library(expss) # for the cross_cases() command

## Loading required package: maditr
## 
## To aggregate several columns with one summary: take(mtcars, mpg, hp, fun = mean, by = am)
## 
## 
## Attaching package: 'maditr'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, coalesce, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following object is masked from 'package:readr':
## 
##     cols
## 
## 
## Use 'expss_output_viewer()' to display tables in the RStudio Viewer.
##  To return to the console output, use 'expss_output_default()'.
## 
## 
## Attaching package: 'expss'
## 
## The following object is masked from 'package:naniar':
## 
##     is_na
## 
## The following objects are masked from 'package:stringr':
## 
##     fixed, regex
## 
## The following objects are masked from 'package:dplyr':
## 
##     compute, contains, na_if, recode, vars, where
## 
## The following objects are masked from 'package:purrr':
## 
##     keep, modify, modify_if, when
## 
## The following objects are masked from 'package:tidyr':
## 
##     contains, nest
## 
## The following object is masked from 'package:ggplot2':
## 
##     vars

Import Data

df <- read.csv(file="Data/arc_data_final.csv", header=T)

Viewing Data

# these are commands useful for viewing a dataframe
# you can also click the object in the environment tab to view it in a new window
names(df)

##  [1] "X"                    "gender"               "trans"               
##  [4] "sexual_orientation"   "ethnicity"            "relationship_status" 
##  [7] "age"                  "urban_rural"          "income"              
## [10] "education"            "employment"           "treatment"           
## [13] "health"               "mhealth"              "sleep_hours"         
## [16] "exercise"             "pet"                  "covid_pos"           
## [19] "covid_neg"            "big5_open"            "big5_con"            
## [22] "big5_agr"             "big5_neu"             "big5_ext"            
## [25] "pswq"                 "iou"                  "mfq_26"              
## [28] "mfq_state"            "rse"                  "school_covid_support"
## [31] "school_att"           "pas_covid"            "pss"                 
## [34] "phq"                  "gad"                  "edeq12"              
## [37] "brs"                  "swemws"               "isolation_a"         
## [40] "isolation_c"          "support"

head(df)

##    X gender trans    sexual_orientation                     ethnicity
## 1  1 female    no Heterosexual/Straight White - British, Irish, other
## 2 20   male    no Heterosexual/Straight White - British, Irish, other
## 3 30 female    no Heterosexual/Straight White - British, Irish, other
## 4 31 female    no Heterosexual/Straight White - British, Irish, other
## 5 32   <NA>  <NA>                  <NA>                          <NA>
## 6 33 female    no Heterosexual/Straight White - British, Irish, other
##                        relationship_status                 age urban_rural
## 1 In a relationship/married and cohabiting                <NA>        city
## 2                        Prefer not to say          1 under 18        city
## 3                        Prefer not to say          1 under 18        city
## 4 In a relationship/married and cohabiting 4 between 36 and 45        town
## 5                                     <NA>                <NA>        <NA>
## 6 In a relationship/married and cohabiting 4 between 36 and 45        city
##     income                              education               employment
## 1   3 high            6 graduate degree or higher               3 employed
## 2     <NA>                      prefer not to say 1 high school equivalent
## 3     <NA> 2 equivalent to high school completion 1 high school equivalent
## 4 2 middle                 5 undergraduate degree               3 employed
## 5     <NA>                                   <NA>                     <NA>
## 6 2 middle            6 graduate degree or higher               3 employed
##                    treatment                           health          mhealth
## 1 no psychological disorders something else or not applicable       none or NA
## 2               in treatment something else or not applicable anxiety disorder
## 3           not in treatment something else or not applicable       none or NA
## 4 no psychological disorders                   two conditions       none or NA
## 5                       <NA>                             <NA>       none or NA
## 6           not in treatment something else or not applicable       none or NA
##   sleep_hours exercise                   pet covid_pos covid_neg big5_open
## 1 3 7-8 hours      0.0                   cat         0         0  5.333333
## 2 2 5-6 hours      2.0                   cat         0         0  5.333333
## 3 3 7-8 hours      3.0                   dog         0         0  5.000000
## 4 2 5-6 hours      1.5               no pets         0         0  6.000000
## 5        <NA>       NA                  <NA>         0         0        NA
## 6 3 7-8 hours      1.0 multiple types of pet         0         0  5.000000
##   big5_con big5_agr big5_neu big5_ext     pswq      iou mfq_26 mfq_state rse
## 1 6.000000 4.333333 6.000000 2.000000 4.937500 3.185185   4.20     3.625 2.3
## 2 3.333333 4.333333 6.666667 1.666667 3.357143 4.000000   3.35     3.000 1.6
## 3 5.333333 6.666667 4.000000 6.000000 1.857143 1.592593   4.65     5.875 3.9
## 4 5.666667 4.666667 4.000000 5.000000 3.937500 3.370370   4.65     4.000 1.7
## 5       NA       NA       NA       NA       NA       NA     NA        NA  NA
## 6 6.000000 6.333333 2.666667       NA 2.625000 1.703704   4.50     4.625 3.9
##   school_covid_support school_att pas_covid  pss      phq      gad   edeq12 brs
## 1                   NA         NA  3.222222 3.25 1.333333 1.857143 1.583333  NA
## 2                   NA         NA  4.555556 3.75 3.333333 3.857143 1.833333  NA
## 3                   NA         NA  3.333333 1.00 1.000000 1.142857 1.000000  NA
## 4                   NA         NA  4.222222 3.25 2.333333 2.000000 1.666667  NA
## 5                   NA         NA        NA   NA       NA       NA       NA  NA
## 6                   NA         NA  3.222222 2.00 1.111111 1.428571 1.416667  NA
##     swemws isolation_a isolation_c  support
## 1 2.857143        2.25          NA 2.500000
## 2 2.285714          NA         3.5 2.166667
## 3 4.285714          NA         1.0 5.000000
## 4 3.285714        2.50          NA 2.500000
## 5       NA          NA          NA       NA
## 6 4.000000        1.75          NA 3.666667

str(df)

## 'data.frame':    2073 obs. of  41 variables:
##  $ X                   : int  1 20 30 31 32 33 48 49 57 58 ...
##  $ gender              : chr  "female" "male" "female" "female" ...
##  $ trans               : chr  "no" "no" "no" "no" ...
##  $ sexual_orientation  : chr  "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" ...
##  $ ethnicity           : chr  "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" "White - British, Irish, other" ...
##  $ relationship_status : chr  "In a relationship/married and cohabiting" "Prefer not to say" "Prefer not to say" "In a relationship/married and cohabiting" ...
##  $ age                 : chr  NA "1 under 18" "1 under 18" "4 between 36 and 45" ...
##  $ urban_rural         : chr  "city" "city" "city" "town" ...
##  $ income              : chr  "3 high" NA NA "2 middle" ...
##  $ education           : chr  "6 graduate degree or higher" "prefer not to say" "2 equivalent to high school completion" "5 undergraduate degree" ...
##  $ employment          : chr  "3 employed" "1 high school equivalent" "1 high school equivalent" "3 employed" ...
##  $ treatment           : chr  "no psychological disorders" "in treatment" "not in treatment" "no psychological disorders" ...
##  $ health              : chr  "something else or not applicable" "something else or not applicable" "something else or not applicable" "two conditions" ...
##  $ mhealth             : chr  "none or NA" "anxiety disorder" "none or NA" "none or NA" ...
##  $ sleep_hours         : chr  "3 7-8 hours" "2 5-6 hours" "3 7-8 hours" "2 5-6 hours" ...
##  $ exercise            : num  0 2 3 1.5 NA 1 NA 2 2 1.7 ...
##  $ pet                 : chr  "cat" "cat" "dog" "no pets" ...
##  $ covid_pos           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ covid_neg           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ big5_open           : num  5.33 5.33 5 6 NA ...
##  $ big5_con            : num  6 3.33 5.33 5.67 NA ...
##  $ big5_agr            : num  4.33 4.33 6.67 4.67 NA ...
##  $ big5_neu            : num  6 6.67 4 4 NA ...
##  $ big5_ext            : num  2 1.67 6 5 NA ...
##  $ pswq                : num  4.94 3.36 1.86 3.94 NA ...
##  $ iou                 : num  3.19 4 1.59 3.37 NA ...
##  $ mfq_26              : num  4.2 3.35 4.65 4.65 NA 4.5 NA 4.3 5.25 4.45 ...
##  $ mfq_state           : num  3.62 3 5.88 4 NA ...
##  $ rse                 : num  2.3 1.6 3.9 1.7 NA 3.9 NA 2.4 1.8 NA ...
##  $ school_covid_support: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ school_att          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ pas_covid           : num  3.22 4.56 3.33 4.22 NA ...
##  $ pss                 : num  3.25 3.75 1 3.25 NA 2 NA 2 4 1.25 ...
##  $ phq                 : num  1.33 3.33 1 2.33 NA ...
##  $ gad                 : num  1.86 3.86 1.14 2 NA ...
##  $ edeq12              : num  1.58 1.83 1 1.67 NA ...
##  $ brs                 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ swemws              : num  2.86 2.29 4.29 3.29 NA ...
##  $ isolation_a         : num  2.25 NA NA 2.5 NA 1.75 NA 2 1.25 NA ...
##  $ isolation_c         : num  NA 3.5 1 NA NA NA NA NA NA 1 ...
##  $ support             : num  2.5 2.17 5 2.5 NA ...

Subsetting Data

# for the HW: use the codebook you created in the codebook activity to get the names of your variables (first column)
# enter this list of names in the select=c() argument to subset those columns from the dataframe
# variables for the lab: id, variable2, variable3, variable5, variable8, variable10, variable11
# this is a comment hi 
d <- subset(df, select=c(X, income, exercise, covid_pos,big5_ext,pswq, gad))

Recoding Variables

# categorical variables need to be recoded as factors
#the content of the variable will stay the same, but R will treat the variable differently at times
d$X <- as.factor(d$X)
d$income <- as.factor(d$income)
d$exercise <- as.factor(d$exercise)

str(d)

## 'data.frame':    2073 obs. of  7 variables:
##  $ X        : Factor w/ 2073 levels "1","20","30",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ income   : Factor w/ 4 levels "1 low","2 middle",..: 3 NA NA 2 NA 2 2 2 2 NA ...
##  $ exercise : Factor w/ 71 levels "0","0.1","0.15",..: 1 30 38 25 NA 19 NA 30 30 28 ...
##  $ covid_pos: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ big5_ext : num  2 1.67 6 5 NA ...
##  $ pswq     : num  4.94 3.36 1.86 3.94 NA ...
##  $ gad      : num  1.86 3.86 1.14 2 NA ...

We looked at our missing data and had a number of participants with skipped responses (n=1747). There was a large number of participants who did not decide to answer the income question. We decided to drop participants who did not respond to any of the items in our analyses. However, the decision to drop participants is not ideal and can lead to the erasure of vulnerable groups. This is a limitation of our dataset and our analysis that we will need to acknowledge in our manuscript.

## Missing Data
# use the gg_miss_upset() command for a visualization of your missing data
gg_miss_upset(d[-1], nsets = 6)

# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
d2 <- na.omit(d)

Exporting Data

# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments (labs will use the files I provide you)
# make sure you give it a name that is memorable!
# and make sure you save it to your Data folder!
write.csv(d2, file="Data/arc_data_clean_FINAL.csv", row.names = F)

# since we've created a cleaned dataframe in d2, we'll use that for the rest of the lab/HW

Basic Statistics

Univariate Plots: Histograms & Tables

table(d2$income)

## 
##             1 low          2 middle            3 high prefer not to say 
##                38               159                85                44

table(d2$exercise)

## 
##      0    0.1   0.15    0.2   0.25    0.3   0.35   0.45    0.5   0.57    0.6 
##     15      0      0      0      1      2      0      1     23      0      1 
##   0.66    0.7   0.72   0.75    0.8   0.83    0.9      1    1.1    1.2   1.25 
##      0      3      0      4      0      0      0     64      0      2      3 
##    1.3   1.45    1.5   1.55    1.6    1.7   1.75      2    2.1    2.2   2.25 
##      3      0     42      0      0      0      2     65      0      0      1 
##    2.3    2.5    2.6   2.75      3    3.1    3.2   3.25    3.5      4   4.25 
##      0     13      0      1     22      1      0      2      8     11      0 
##    4.5      5    5.5      6    6.3    6.5      7    7.3    7.5      8    8.5 
##      1      6      1      5      0      1      2      0      0      4      1 
##      9     10  10.25     11     12     13     14     15     16     17   17.5 
##      0      1      1      1      2      3      2      1      0      1      1 
##     18  31.75     32   40.3 333333 
##      1      0      0      1      0

table(d2$covid_pos)

## 
##   0   1   2   3   4   5   6   9  12 
## 305   4   7   2   2   1   3   1   1

hist(d2$covid_pos)

hist(d2$big5_ext)

hist(d2$pswq)

hist(d2$gad)

Univariate Normality

Cutoffs are -2 to +2

COVID Positivity = skew and kurtosis are extremely high and positively skewed, this is something we will need to acknowledge in our manuscript. Big 5 Extraversion = skew and kurtosis are okay. PSWQ Scores = skew and kurtosis are okay. GAD Scores = skew is high but okay, kurtosis is high and positively skewed.

describe(d2)

##           vars   n   mean     sd median trimmed    mad  min     max   range
## X*           1 326 671.51 462.67 602.50  624.93 453.68 1.00 2054.00 2053.00
## income*      2 326   2.41   0.86   2.00    2.39   1.48 1.00    4.00    3.00
## exercise*    3 326  27.80  13.98  25.00   27.19   8.90 1.00   70.00   69.00
## covid_pos    4 326   0.23   1.14   0.00    0.00   0.00 0.00   12.00   12.00
## big5_ext     5 326   4.64   1.33   4.67    4.72   1.48 1.33    7.00    5.67
## pswq         6 326   2.87   0.87   2.81    2.85   1.02 1.12    4.94    3.81
## gad          7 326   1.53   0.61   1.43    1.42   0.42 1.00    4.00    3.00
##            skew kurtosis    se
## X*         0.97     0.96 25.62
## income*    0.35    -0.56  0.05
## exercise*  0.44     0.26  0.77
## covid_pos  6.57    50.61  0.06
## big5_ext  -0.43    -0.58  0.07
## pswq       0.20    -0.79  0.05
## gad        1.73     3.22  0.03

Bivariate Plots

Crosstabs

cross_cases(d2, income, exercise)

	exercise
	0	0.1	0.15	0.2	0.25	0.3	0.35	0.45	0.5	0.57	0.6	0.66	0.7	0.72	0.75	0.8	0.83	0.9	1	1.1	1.2	1.25	1.3	1.45	1.5	1.55	1.6	1.7	1.75	2	2.1	2.2	2.25	2.3	2.5	2.6	2.75	3	3.1	3.2	3.25	3.5	4	4.25	4.5	5	5.5	6	6.3	6.5	7	7.3	7.5	8	8.5	9	10	10.25	11	12	13	14	15	16	17	17.5	18	31.75	32	40.3	333333
income
1 low	4					1			2										9						4					7					2			3				1	2		1	1																1
2 middle	7				1			1	11				2		2				31		1	1	1		19				1	35			1		4			12			2	4	5			3	1	2			2			2	1		1	1	1	1	1					1				1
3 high	4					1			7						1				14			2			12				1	15					6		1	5	1			2	3			1		1						2						1	2		1		1		1
prefer not to say									3		1		1		1				10		1		2		7					8					1			2				1	1			1		2		1												1
#Total cases	15				1	2		1	23		1		3		4				64		2	3	3		42				2	65			1		13		1	22	1		2	8	11		1	6	1	5		1	2			4	1		1	1	1	2	3	2	1		1	1	1			1

Scatterplots

Two scatterplots.

plot(d2$pswq, d2$big5_ext,
     main="Scatterplot of PSWQ Scores and Big 5 Extraversion",
     xlab = "PSWQ Scores",
     ylab = "Big 5 Extraversion")

plot(d2$gad, d2$exercise,
     main="Scatterplot of Hours of Exercise and GAD scores",
     xlab = "GAD Scores",
     ylab = "Hours of Exercise")

Boxplots

Two boxplots

boxplot(data=d2, gad~income,
        main="Boxplot of GAD scores and Income",
        xlab = "Income",
        ylab = "GAD Scores")

boxplot(data=d2, big5_ext~gad,
        main="Boxplot of Big 5 Extraversion and GAD Scores",
        xlab = "GAD Scores",
        ylab = "Big 5 Extraversion")

P421 HW - Data Prep & Basic Statistics

Jenna Kirkpatrick

2023-07-21