# if you haven't used a given package before, you'll need to download it first
# delete the "#" before the install function and run it to download
# then run the library function calling that package
#install.packages("naniar")
library(naniar) # for the gg_miss_upset() command
Import the full project data into a dataframe and call it “df”. Replace ‘DOWNLOADED FILE NAME’ with the actual file name of your dataset (either for the ARC or EAMMi2).
Note: If you named your folder something else, you will also need to replace ‘Data’ with the name of the folder where you saved the dataset.
# read the project CSV into df; header = TRUE treats the first row as column names
# (TRUE is spelled out because the shorthand T is an ordinary variable that can be reassigned)
df <- read.csv(file = "Data/arc_data_final_SP25.csv", header = TRUE)
These are commands useful for viewing a data frame.
# you can also click the object (the little table picture) in the environment tab to view it in a new window
names(df) # all the variable names in the data frame
## [1] "X" "gender" "trans"
## [4] "sexual_orientation" "ethnicity" "relationship_status"
## [7] "age" "urban_rural" "income"
## [10] "education" "employment" "treatment"
## [13] "health" "mhealth" "sleep_hours"
## [16] "exercise" "pet" "covid_pos"
## [19] "covid_neg" "big5_open" "big5_con"
## [22] "big5_agr" "big5_neu" "big5_ext"
## [25] "pswq" "iou" "mfq_26"
## [28] "mfq_state" "rse" "school_covid_support"
## [31] "school_att" "pas_covid" "pss"
## [34] "phq" "gad" "edeq12"
## [37] "brs" "swemws" "isolation_a"
## [40] "isolation_c" "support"
head(df) # first 6 rows of data in the data frame
## X gender trans sexual_orientation
## 1 1 female no Heterosexual/Straight
## 2 321 male no Heterosexual/Straight
## 3 401 female no Heterosexual/Straight
## 4 469 female no Heterosexual/Straight
## 5 520 female no Prefer not to say
## 6 1390 male no Heterosexual/Straight
## ethnicity
## 1 White - British, Irish, other
## 2 White - British, Irish, other
## 3 Middle Eastern/Middle Eastern British - Arab, Turkish, other
## 4 White - British, Irish, other
## 5 Prefer not to say
## 6 White - British, Irish, other
## relationship_status age urban_rural
## 1 In a relationship/married and cohabiting <NA> city
## 2 Single, never married 1 under 18 town
## 3 Single, divorced or widowed 4 between 36 and 45 village
## 4 In a relationship/married and cohabiting <NA> town
## 5 Single, never married 1 under 18 town
## 6 In a relationship/married and cohabiting 5 over 45 village
## income education employment
## 1 3 high 6 graduate degree or higher 3 employed
## 2 <NA> 1 equivalent to not completing high school 1 high school equivalent
## 3 3 high 6 graduate degree or higher 3 employed
## 4 2 middle 5 undergraduate degree 3 employed
## 5 <NA> 1 equivalent to not completing high school 1 high school equivalent
## 6 3 high 5 undergraduate degree 3 employed
## treatment health
## 1 no psychological disorders something else or not applicable
## 2 not in treatment something else or not applicable
## 3 not in treatment high blood pressure
## 4 in treatment diabetes
## 5 <NA> something else or not applicable
## 6 not in treatment something else or not applicable
## mhealth sleep_hours exercise pet
## 1 none or NA 3 7-8 hours 1 less than 1 hour cat
## 2 none or NA 4 8-10 hours 1 less than 1 hour cat and dog
## 3 obsessive compulsive disorder 2 5-6 hours 1 less than 1 hour no pets
## 4 depression 3 7-8 hours 1 less than 1 hour no pets
## 5 none or NA 2 5-6 hours 1 less than 1 hour cat
## 6 none or NA 2 5-6 hours 1 less than 1 hour cat and dog
## covid_pos covid_neg big5_open big5_con big5_agr big5_neu big5_ext pswq
## 1 0 0 5.333333 6.000000 4.333333 6.000000 2.000000 4.937500
## 2 0 0 4.000000 4.333333 3.000000 3.666667 4.666667 1.714286
## 3 0 0 6.000000 5.000000 6.666667 3.000000 4.666667 2.437500
## 4 0 0 5.000000 5.666667 4.000000 4.333333 6.000000 2.500000
## 5 0 0 3.666667 3.000000 4.333333 5.333333 2.000000 2.714286
## 6 0 0 3.000000 3.333333 5.333333 3.333333 5.333333 2.250000
## iou mfq_26 mfq_state rse school_covid_support school_att pas_covid pss
## 1 3.185185 4.20 3.625 2.3 NA NA 3.222222 3.25
## 2 2.481481 4.30 6.000 3.8 NA NA 2.333333 2.25
## 3 2.814815 4.70 5.000 3.1 NA NA 4.000000 2.25
## 4 2.592593 4.60 5.125 3.0 NA NA NA 2.25
## 5 2.222222 2.70 3.000 2.6 NA NA 3.000000 2.75
## 6 1.481481 2.95 3.500 3.0 NA NA 2.888889 2.75
## phq gad edeq12 brs swemws isolation_a isolation_c support
## 1 1.333333 1.857143 1.583333 NA 2.857143 2.25 NA 2.500000
## 2 1.888889 1.000000 1.000000 NA 3.857143 NA 1.25 2.500000
## 3 2.444444 2.142857 3.083333 NA 3.714286 2.50 NA 3.833333
## 4 1.222222 1.714286 1.833333 NA 3.428571 1.00 NA 4.666667
## 5 1.555556 1.142857 1.333333 NA 3.000000 NA 1.00 2.833333
## 6 1.222222 1.000000 1.500000 NA 2.571429 2.25 NA 2.833333
str(df) # shows all the variables in the data frame and their type (e.g., integer, numeric, character)
## 'data.frame': 2073 obs. of 41 variables:
## $ X : int 1 321 401 469 520 1390 1422 1849 2183 2247 ...
## $ gender : chr "female" "male" "female" "female" ...
## $ trans : chr "no" "no" "no" "no" ...
## $ sexual_orientation : chr "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" "Heterosexual/Straight" ...
## $ ethnicity : chr "White - British, Irish, other" "White - British, Irish, other" "Middle Eastern/Middle Eastern British - Arab, Turkish, other" "White - British, Irish, other" ...
## $ relationship_status : chr "In a relationship/married and cohabiting" "Single, never married" "Single, divorced or widowed" "In a relationship/married and cohabiting" ...
## $ age : chr NA "1 under 18" "4 between 36 and 45" NA ...
## $ urban_rural : chr "city" "town" "village" "town" ...
## $ income : chr "3 high" NA "3 high" "2 middle" ...
## $ education : chr "6 graduate degree or higher" "1 equivalent to not completing high school" "6 graduate degree or higher" "5 undergraduate degree" ...
## $ employment : chr "3 employed" "1 high school equivalent" "3 employed" "3 employed" ...
## $ treatment : chr "no psychological disorders" "not in treatment" "not in treatment" "in treatment" ...
## $ health : chr "something else or not applicable" "something else or not applicable" "high blood pressure" "diabetes" ...
## $ mhealth : chr "none or NA" "none or NA" "obsessive compulsive disorder" "depression" ...
## $ sleep_hours : chr "3 7-8 hours" "4 8-10 hours" "2 5-6 hours" "3 7-8 hours" ...
## $ exercise : chr "1 less than 1 hour" "1 less than 1 hour" "1 less than 1 hour" "1 less than 1 hour" ...
## $ pet : chr "cat" "cat and dog" "no pets" "no pets" ...
## $ covid_pos : int 0 0 0 0 0 0 0 0 0 0 ...
## $ covid_neg : int 0 0 0 0 0 0 0 0 0 0 ...
## $ big5_open : num 5.33 4 6 5 3.67 ...
## $ big5_con : num 6 4.33 5 5.67 3 ...
## $ big5_agr : num 4.33 3 6.67 4 4.33 ...
## $ big5_neu : num 6 3.67 3 4.33 5.33 ...
## $ big5_ext : num 2 4.67 4.67 6 2 ...
## $ pswq : num 4.94 1.71 2.44 2.5 2.71 ...
## $ iou : num 3.19 2.48 2.81 2.59 2.22 ...
## $ mfq_26 : num 4.2 4.3 4.7 4.6 2.7 2.95 2.95 2.4 3.65 4.7 ...
## $ mfq_state : num 3.62 6 5 5.12 3 ...
## $ rse : num 2.3 3.8 3.1 3 2.6 3 1.3 2.1 3 3.2 ...
## $ school_covid_support: num NA NA NA NA NA NA NA NA NA NA ...
## $ school_att : num NA NA NA NA NA NA NA NA NA NA ...
## $ pas_covid : num 3.22 2.33 4 NA 3 ...
## $ pss : num 3.25 2.25 2.25 2.25 2.75 2.75 4.75 3.25 3.5 2.25 ...
## $ phq : num 1.33 1.89 2.44 1.22 1.56 ...
## $ gad : num 1.86 1 2.14 1.71 1.14 ...
## $ edeq12 : num 1.58 1 3.08 1.83 1.33 ...
## $ brs : num NA NA NA NA NA NA NA NA NA NA ...
## $ swemws : num 2.86 3.86 3.71 3.43 3 ...
## $ isolation_a : num 2.25 NA 2.5 1 NA 2.25 NA NA NA NA ...
## $ isolation_c : num NA 1.25 NA NA 1 NA 3.5 3.5 2.25 2 ...
## $ support : num 2.5 2.5 3.83 4.67 2.83 ...
Open your mini codebook and get the names of your variables (first column). Then enter this list of names within the “select=c()” argument to subset those columns from the dataframe “df” into a new one, “d”.
Replace “variable1, variable2,…” with your variable names.
# Keep the ID column ("ResponseID") as the first entry of the "select" argument.
# NOTE: with the ARC data the ID column is named "X", so use "X" in place of "ResponseID".
d <- subset(
  df,
  select = c(X, age, mhealth, pas_covid, pss, support, phq)
)
# Your new data frame should contain 7 variables (ResponseID, + your 2 categorical, + your 4 continuous)
names(d) # all the variable names in the data frame
## [1] "X" "age" "mhealth" "pas_covid" "pss" "support"
## [7] "phq"
head(d) # first 6 rows of data in the data frame
## X age mhealth pas_covid pss
## 1 1 <NA> none or NA 3.222222 3.25
## 2 321 1 under 18 none or NA 2.333333 2.25
## 3 401 4 between 36 and 45 obsessive compulsive disorder 4.000000 2.25
## 4 469 <NA> depression NA 2.25
## 5 520 1 under 18 none or NA 3.000000 2.75
## 6 1390 5 over 45 none or NA 2.888889 2.75
## support phq
## 1 2.500000 1.333333
## 2 2.500000 1.888889
## 3 3.833333 2.444444
## 4 4.666667 1.222222
## 5 2.833333 1.555556
## 6 2.833333 1.222222
str(d) # shows each variable in d and its type
## 'data.frame': 2073 obs. of 7 variables:
## $ X : int 1 321 401 469 520 1390 1422 1849 2183 2247 ...
## $ age : chr NA "1 under 18" "4 between 36 and 45" NA ...
## $ mhealth : chr "none or NA" "none or NA" "obsessive compulsive disorder" "depression" ...
## $ pas_covid: num 3.22 2.33 4 NA 3 ...
## $ pss : num 3.25 2.25 2.25 2.25 2.75 2.75 4.75 3.25 3.5 2.25 ...
## $ support : num 2.5 2.5 3.83 4.67 2.83 ...
## $ phq : num 1.33 1.89 2.44 1.22 1.56 ...
# visualize the pattern of missing data with the gg_miss_upset() command
# the first column is excluded because it is just the ID variable -- everyone was assigned a random ID, so there is no need to check it for missingness
gg_miss_upset(d[, -1, drop = FALSE], nsets = 6)
# use the na.omit() command to create a new data frame in which any participants
# (rows) with missing data on at least one variable are dropped
d2 <- na.omit(d)
# Next, calculate the total number of participants dropped, then convert it to a %
# and insert both the number and the % in the text below.
# Insert the total number of participants in your d2 in the text where it says N = #.
# The counts are computed from the data frames (rather than typed in by hand) so they
# stay correct if the dataset or the selected variables change.
nrow(d) - nrow(d2) # number of participants dropped
## [1] 1111
(nrow(d) - nrow(d2)) / nrow(d) # proportion dropped; multiply by 100 for the %
## [1] 0.5359383
# Then fill in the paragraph below with your numbers
We looked at the missing data in our dataset, and found that 1111, or about 53.59%, of the participants in our sample skipped at least one item. We dropped these participants from our analysis, which is not advisable and runs the risk of dropping vulnerable groups or skewing results. However, we will proceed for the sake of this class using the reduced dataset, N = 962.
Our last step is to export the data frame after we’ve dropped NAs so that it can be used in future HWs.
# use the "write.csv" function to export the cleaned data
# please keep the file name as 'projectdata'
# note: you only need to change 'Data' before the slash if you named your folder something else
# row.names = FALSE keeps R's row numbers from being written out as an extra column
# (FALSE is spelled out because the shorthand F is an ordinary variable that can be reassigned)
write.csv(d2, file = "Data/projectdata.csv", row.names = FALSE)