Data Prep

Load Libraries

# if you haven't run this code before, you'll need to download the below packages first
# instructions on how to do this are included in the video
# but as a reminder, you use the packages tab to the right

library(psych) # for the describe() command
library(naniar) # for the gg_miss_upset() command
library(expss) # for the cross_cases() command

## Loading required package: maditr

## 
## To select rows from data: rows(mtcars, am==0)

## 
## Attaching package: 'expss'

## The following object is masked from 'package:naniar':
## 
##     is_na

Import Data

# for the lab, you'll import the CSV file you downloaded along with the current file we're working in (an RMD file)
# for the homework, you'll download the CSV file from your chosen README page (should be titled arc_data_final.csv or eammi2_data_final.csv)
# the path is relative to the current working directory
# header = TRUE is spelled out because T is an ordinary variable that can be reassigned
df <- read.csv(file = "Data/eammi2_data_final.csv", header = TRUE)

Viewing Data

# a few quick ways to inspect a dataframe from the console
# (clicking the object in the environment tab opens it in a separate viewer instead)
# list the column names
colnames(df)

##  [1] "ResponseId"       "gender"           "race_rc"          "age"             
##  [5] "income"           "edu"              "sibling"          "party_rc"        
##  [9] "disability"       "marriage5"        "phys_sym"         "pipwd"           
## [13] "moa_independence" "moa_role"         "moa_safety"       "moa_maturity"    
## [17] "idea"             "swb"              "mindful"          "belong"          
## [21] "efficacy"         "support"          "socmeduse"        "usdream"         
## [25] "npi"              "exploit"          "stress"

# show the first six rows (the default number, written out explicitly)
head(df, n = 6L)

##          ResponseId gender race_rc                 age         income
## 1 R_BJN3bQqi1zUMid3      f   white 1 between 18 and 25          1 low
## 2 R_2TGbiBXmAtxywsD      m   white 1 between 18 and 25          1 low
## 3 R_12G7bIqN2wB2N65      m   white 1 between 18 and 25 rather not say
## 4 R_39pldNoon8CePfP      f   other 1 between 18 and 25 rather not say
## 5 R_1QiKb2LdJo1Bhvv      m   white 1 between 18 and 25       2 middle
## 6 R_pmwDTZyCyCycXwB      f   white 1 between 18 and 25 rather not say
##                            edu              sibling    party_rc  disability
## 1       2 Currently in college at least one sibling    democrat        <NA>
## 2 5 Completed Bachelors Degree at least one sibling independent        <NA>
## 3       2 Currently in college at least one sibling  apolitical psychiatric
## 4       2 Currently in college at least one sibling  apolitical        <NA>
## 5       2 Currently in college at least one sibling  apolitical        <NA>
## 6       2 Currently in college at least one sibling  apolitical        <NA>
##                                 marriage5                phys_sym    pipwd
## 1 are currently divorced from one another high number of symptoms       NA
## 2    are currently married to one another high number of symptoms       NA
## 3    are currently married to one another high number of symptoms 2.333333
## 4    are currently married to one another high number of symptoms       NA
## 5    are currently married to one another  low number of symptoms       NA
## 6    are currently married to one another high number of symptoms       NA
##   moa_independence moa_role moa_safety moa_maturity  idea      swb mindful
## 1         3.666667 3.000000       2.75     3.666667 3.750 4.333333     2.4
## 2         3.666667 2.666667       3.25     3.333333 3.875 4.166667     1.8
## 3         3.500000 2.500000       3.00     3.666667 3.750 1.833333     2.2
## 4         3.000000 2.000000       1.25     3.000000 3.750 5.166667     2.2
## 5         3.833333 2.666667       2.25     3.666667 3.500 3.666667     3.2
## 6         3.500000 3.333333       2.50     4.000000 3.250 4.000000     3.4
##   belong efficacy  support socmeduse
## 1    2.8      3.4 6.000000        47
## 2    4.2      3.4 6.750000        23
## 3    3.6      2.2 5.166667        34
## 4    4.0      2.8 5.583333        35
## 5    3.4      3.0 6.000000        37
## 6    4.2      2.4 4.500000        13
##                                                           usdream        npi
## 1               american dream is important and achievable for me 0.69230769
## 2               american dream is important and achievable for me 0.15384615
## 3 american dream is not important and maybe not achievable for me 0.07692308
## 4 american dream is not important and maybe not achievable for me 0.07692308
## 5                            not sure if american dream important 0.76923077
## 6 american dream is not important and maybe not achievable for me 0.23076923
##    exploit stress
## 1 2.000000    3.3
## 2 3.666667    3.3
## 3 4.333333    4.0
## 4 1.666667    3.2
## 5 4.000000    3.1
## 6 1.333333    3.5

# compact overview: number of rows/columns plus each column's type and first values
str(object = df)

## 'data.frame':    3182 obs. of  27 variables:
##  $ ResponseId      : chr  "R_BJN3bQqi1zUMid3" "R_2TGbiBXmAtxywsD" "R_12G7bIqN2wB2N65" "R_39pldNoon8CePfP" ...
##  $ gender          : chr  "f" "m" "m" "f" ...
##  $ race_rc         : chr  "white" "white" "white" "other" ...
##  $ age             : chr  "1 between 18 and 25" "1 between 18 and 25" "1 between 18 and 25" "1 between 18 and 25" ...
##  $ income          : chr  "1 low" "1 low" "rather not say" "rather not say" ...
##  $ edu             : chr  "2 Currently in college" "5 Completed Bachelors Degree" "2 Currently in college" "2 Currently in college" ...
##  $ sibling         : chr  "at least one sibling" "at least one sibling" "at least one sibling" "at least one sibling" ...
##  $ party_rc        : chr  "democrat" "independent" "apolitical" "apolitical" ...
##  $ disability      : chr  NA NA "psychiatric" NA ...
##  $ marriage5       : chr  "are currently divorced from one another" "are currently married to one another" "are currently married to one another" "are currently married to one another" ...
##  $ phys_sym        : chr  "high number of symptoms" "high number of symptoms" "high number of symptoms" "high number of symptoms" ...
##  $ pipwd           : num  NA NA 2.33 NA NA ...
##  $ moa_independence: num  3.67 3.67 3.5 3 3.83 ...
##  $ moa_role        : num  3 2.67 2.5 2 2.67 ...
##  $ moa_safety      : num  2.75 3.25 3 1.25 2.25 2.5 4 3.25 2.75 3.5 ...
##  $ moa_maturity    : num  3.67 3.33 3.67 3 3.67 ...
##  $ idea            : num  3.75 3.88 3.75 3.75 3.5 ...
##  $ swb             : num  4.33 4.17 1.83 5.17 3.67 ...
##  $ mindful         : num  2.4 1.8 2.2 2.2 3.2 ...
##  $ belong          : num  2.8 4.2 3.6 4 3.4 4.2 3.9 3.6 2.9 2.5 ...
##  $ efficacy        : num  3.4 3.4 2.2 2.8 3 2.4 2.3 3 3 3.7 ...
##  $ support         : num  6 6.75 5.17 5.58 6 ...
##  $ socmeduse       : int  47 23 34 35 37 13 37 43 37 29 ...
##  $ usdream         : chr  "american dream is important and achievable for me" "american dream is important and achievable for me" "american dream is not important and maybe not achievable for me" "american dream is not important and maybe not achievable for me" ...
##  $ npi             : num  0.6923 0.1538 0.0769 0.0769 0.7692 ...
##  $ exploit         : num  2 3.67 4.33 1.67 4 ...
##  $ stress          : num  3.3 3.3 4 3.2 3.1 3.5 3.3 2.4 2.9 2.7 ...

Subsetting Data

# for the HW: take the variable names from the first column of the codebook you created
# and list them below to keep only those columns of the dataframe
# kept here: gender, age, inventory of the dimensions of emerging adulthood (idea),
# satisfaction with life scale (swb), mindful attention awareness scale (mindful),
# and social media use (socmeduse)
d <- df[, c("gender", "age", "idea", "swb", "mindful", "socmeduse"), drop = FALSE]

Recoding Variables

# categorical variables have to be stored as factors
# the values themselves are unchanged, but some commands handle factors differently
d[c("gender", "age")] <- lapply(d[c("gender", "age")], as.factor)

# confirm the new column types
str(d)

## 'data.frame':    3182 obs. of  6 variables:
##  $ gender   : Factor w/ 3 levels "f","m","nb": 1 2 2 1 2 1 1 1 1 1 ...
##  $ age      : Factor w/ 4 levels "1 between 18 and 25",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ idea     : num  3.75 3.88 3.75 3.75 3.5 ...
##  $ swb      : num  4.33 4.17 1.83 5.17 3.67 ...
##  $ mindful  : num  2.4 1.8 2.2 2.2 3.2 ...
##  $ socmeduse: int  47 23 34 35 37 13 37 43 37 29 ...

Missing Data

I examined the missing data in our dataset and found that about 32% of the participants in my sample (1,026 of 3,182) skipped at least one item. I dropped these participants from my analysis, which is not advisable because it runs the risk of dropping vulnerable groups or skewing results. However, for the sake of this class, I will proceed using the reduced dataset.

# use the gg_miss_upset() command for a visualization of your missing data
# pass every column of d: this subset has no id column, so dropping the first
# column (d[-1]) would leave gender out of the missingness plot
gg_miss_upset(d, nsets = ncol(d))

# use the na.omit() command to create a new dataframe in which any participants with missing data are dropped from the dataframe
d2 <- na.omit(d)
# proportion of participants retained, computed from the data rather than typed-in counts
nrow(d2) / nrow(d)

## [1] 0.6775613

Exporting Data

# last step is to export the data after you've dropped NAs
# for the HW, the file you're exporting here is what you'll use for all future HW assignments (labs will use the files I provide you)
# make sure you give it a name that is memorable!
# and make sure you save it to your Data folder!
# row.names = FALSE keeps R's row numbers out of the file; FALSE is spelled out
# because F is an ordinary variable that can be reassigned
write.csv(d2, file = "Data/eammi2_new_final.csv", row.names = FALSE)

# since we've created a cleaned dataframe in d2, we'll use that for the rest of the lab/HW

Basic Statistics

Univariate Plots: Histograms & Tables

# frequency count for each gender category
table(d2[["gender"]])

## 
##    f    m   nb 
## 1583  542   31

# frequency count for each age bracket
table(d2[["age"]])

## 
## 1 between 18 and 25 2 between 26 and 35 3 between 36 and 45           4 over 45 
##                1986                 115                  38                  17

# histograms of the four numeric variables in the cleaned dataframe
# NOTE: no main/xlab is supplied, so hist() builds the title and x-axis label
# from the expression passed in (e.g. "d2$idea")
hist(d2$idea)

hist(d2$swb)

hist(d2$mindful)

hist(d2$socmeduse)

Univariate Normality

I analyzed the skew and kurtosis of my variables, and most were within the accepted range (-2 to +2). However, age (skew = 4.43, kurtosis = 21.24) and dimensions of emerging adulthood (kurtosis = 3.96) were outside of the accepted range. For this analysis, we will use them anyway, but outside of this class this is bad practice.

# descriptive statistics (including skew and kurtosis) for every column
psych::describe(d2)

##           vars    n  mean   sd median trimmed  mad   min max range  skew
## gender*      1 2156  1.28 0.48   1.00    1.21 0.00  1.00   3  2.00  1.36
## age*         2 2156  1.11 0.43   1.00    1.00 0.00  1.00   4  3.00  4.43
## idea         3 2156  3.57 0.39   3.62    3.61 0.37  1.00   4  3.00 -1.49
## swb          4 2156  4.43 1.33   4.50    4.49 1.48  1.00   7  6.00 -0.35
## mindful      5 2156  3.72 0.84   3.73    3.72 0.79  1.13   6  4.87 -0.04
## socmeduse    6 2156 34.27 8.58  35.00   34.53 7.41 11.00  55 44.00 -0.30
##           kurtosis   se
## gender*       0.72 0.01
## age*         21.24 0.01
## idea          3.96 0.01
## swb          -0.49 0.03
## mindful      -0.15 0.02
## socmeduse     0.19 0.18

Bivariate Plots

Crosstabs

# cross-tabulation of case counts: gender in rows, age bracket in columns
expss::cross_cases(d2, gender, age)

	age
	1 between 18 and 25	2 between 26 and 35	3 between 36 and 45	4 over 45
gender
f	1474	69	28	12
m	482	46	9	5
nb	30		1
#Total cases	1986	115	38	17

Scatterplots

# scatterplots for pairs of numeric variables; the first variable goes on the
# x-axis and the second on the y-axis, with axis labels set explicitly
with(d2, plot(idea, swb,
  main = "Scatterplot of Dimensions of Emerging Adulthood and Satisfaction with Life",
  xlab = "idea", ylab = "swb"
))

with(d2, plot(mindful, socmeduse,
  main = "Scatterplot of Mindfulness and Social Media Use",
  xlab = "mindful", ylab = "socmeduse"
))

with(d2, plot(idea, socmeduse,
  main = "Scatterplot of Dimensions of Emerging Adulthood and Social Media Use",
  xlab = "idea", ylab = "socmeduse"
))

with(d2, plot(swb, socmeduse,
  main = "Scatterplot of Satisfaction with Life and Social Media Use",
  xlab = "swb", ylab = "socmeduse"
))

with(d2, plot(idea, mindful,
  main = "Scatterplot of Dimensions of Emerging Adulthood and Mindfulness",
  xlab = "idea", ylab = "mindful"
))

Boxplots

# boxplots compare a numeric variable across the levels of a categorical one
# the formula is written as numeric ~ categorical: the left-hand side is the
# variable drawn on the y-axis and the right-hand side defines the groups on
# the x-axis, so xlab names the grouping factor and ylab the numeric variable
boxplot(socmeduse ~ gender, data = d2,
        main = "Boxplot of Gender and Social Media Use",
        xlab = "gender",
        ylab = "socmeduse")

boxplot(mindful ~ age, data = d2,
        main = "Boxplot of Age and Mindfulness",
        xlab = "age",
        ylab = "mindful")

boxplot(mindful ~ gender, data = d2,
        main = "Boxplot of Gender and Mindfulness",
        xlab = "gender",
        ylab = "mindful")

boxplot(swb ~ age, data = d2,
        main = "Boxplot of Age and Satisfaction with Life",
        xlab = "age",
        ylab = "swb")

boxplot(socmeduse ~ age, data = d2,
        main = "Boxplot of Age and Social Media Use",
        xlab = "age",
        ylab = "socmeduse")

boxplot(idea ~ gender, data = d2,
        main = "Boxplot of Gender and Dimensions of Emerging Adulthood",
        xlab = "gender",
        ylab = "idea")

P421 HW - Data Prep & Basic Statistics

Anna Geller

2024-02-25