#load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(forcats)
library(ggplot2)
library(lubridate)
library(purrr)
library(stringr)
library(tibble)
library(tidyr)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(sjstats)
##
## Attaching package: 'sjstats'
##
## The following object is masked from 'package:psych':
##
## phi
library(formattable)
options(scipen = 999)
#import data
# Set the working directory (absolute path; adjust to your own machine) and read the raw export
setwd("/Users/melissalagunas/Desktop/Lab/DISSERTATION")
Dissertation_Main_Study_August_30 <- read_csv("Dissertation_Main_Study_August 30, 2024_08.47.csv")
## New names:
## Rows: 323 Columns: 90
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (90): StartDate, EndDate, Status, IPAddress, Progress, Duration (in seco...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `Q29` -> `Q29...18`
## • `Q1` -> `Q1...22`
## • `Q2` -> `Q2...23`
## • `Q1` -> `Q1...74`
## • `Q2` -> `Q2...75`
## • `Q29` -> `Q29...90`
# rename dataset
dat <- Dissertation_Main_Study_August_30
#rename consent
dat <- dplyr::rename(dat, consent = Q29...18)
# keep only respondents who granted consent (consent == 4); see the sketch below for an opened-vs-consented tally
dat <- dplyr::filter(dat, consent == "4")
nrow(dat)
## [1] 319
Of the 323 rows in the raw export, 319 respondents granted consent (consent = 4) and were retained.
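Because the filter above retains only consenting cases, a separate tally from the raw import is needed to compare opened versus consented. A minimal sketch (note that a Qualtrics CSV export often includes extra label rows, so the raw row count can slightly overstate the number of respondents):
raw <- Dissertation_Main_Study_August_30
c(opened = nrow(raw),
  consented = sum(raw$Q29...18 == "4", na.rm = TRUE))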
# Identify duplicate responses based on key columns (here: Q1...22 and IPAddress)
duplicates <- dat[duplicated(dat[, c("Q1...22", "IPAddress")]), ]
# Print the number of duplicate rows
cat("Number of duplicate rows based on key columns: ", nrow(duplicates), "\n")
## Number of duplicate rows based on key columns: 7
# Remove duplicates based on the key columns, keeping only the first occurrence
dat <- dat[!duplicated(dat[, c("Q1...22", "IPAddress")]), ]
# Check the row counts (note: duplicates were already dropped above, so both counts below reflect the post-removal data; the pre-removal count was 319, see the reordered sketch below)
cat("Number of rows before removing duplicates: ", nrow(dat), "\n")
## Number of rows before removing duplicates: 312
cat("Number of rows after removing duplicates: ", nrow(dat), "\n")
## Number of rows after removing duplicates: 312
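A minimal sketch of how this chunk could be ordered so that the two messages report different stages (the pre-removal count here was 319):
n_before <- nrow(dat)  # capture the count before dropping duplicates
dat <- dat[!duplicated(dat[, c("Q1...22", "IPAddress")]), ]
cat("Number of rows before removing duplicates: ", n_before, "\n")
cat("Number of rows after removing duplicates: ", nrow(dat), "\n")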
# Analyze the survey completion time after removing duplicates
# (Duration is still stored as character here, so summary() below only reports length/class/mode;
#  a quick numeric check is sketched below and the full type conversion happens later)
summary(dat$`Duration (in seconds)`)
## Length Class Mode
## 312 character character
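Because the column is still character, a quick numeric check can be had by coercing on the fly (a sketch; the full conversion with hablar::convert() happens further below):
summary(as.numeric(dat$`Duration (in seconds)`))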
# delete variables
dat2 = select(dat,-c(StartDate, EndDate, Status, IPAddress, Finished, RecordedDate, ResponseId, RecipientLastName, RecipientFirstName, RecipientEmail, ExternalReference, LocationLatitude, LocationLongitude, DistributionChannel, UserLanguage))
# change variable names
dat2 <- dat2 %>%
rename(
US_located_age = Q35,
PD = Q1...22,
OC_1 = Q2...23,
OC_2 = Q31_1,
SS_1 = Q30_1,
SS_2 = Q30_2,
SS_3 = Q30_3,
SS_4 = Q30_4,
SS_5 = Q30_5,
SS_6 = Q30_6,
SS_7 = Q30_7,
SS_8 = Q30_8,
SS_9 = Q30_9,
SS_10 = Q30_10,
SS_11 = Q30_11,
SS_12 = Q30_12,
SS_13_TEXT = Q32,
PSNQ_1 = Q27_1,
PSNQ_2 = Q27_2,
PSNQ_3 = Q27_3,
PSNQ_4 = Q27_4,
PSNQ_5 = Q27_5,
PSNQ_6 = Q27_6,
PSNQ_7 = Q27_7,
PSNQ_8 = Q27_8,
PSNQ_9 = Q27_9,
PSNQ_10 = Q27_10,
SBS_1 = Q4_1,
SBS_2 = Q5_1,
SBS_3 = Q6_1,
SBS_4 = Q7_1,
SBS_5 = Q8_1,
SBS_6 = Q9_1,
SBS_7 = Q10_1,
PF_1 = Q11_1,
PF_2 = Q11_2,
PF_3 = Q11_3,
PF_4 = Q11_4,
PF_5 = Q11_5,
PF_6 = Q11_6,
PF_7 = Q11_7,
PF_8 = Q11_8,
CS_1 = Q26_1,
CS_2 = Q26_2,
CS_3 = Q26_3,
CS_4 = Q26_4,
CS_5 = Q26_5,
BRS_1 = Q12_1,
BRS_2 = Q12_2,
BRS_3 = Q12_3,
BRS_4 = Q12_4,
BRS_5 = Q12_5,
BRS_6 = Q12_6,
age = Q1...74,
gender = Q2...75,
gender_8_TEXT = Q2_8_TEXT,
sexual_orientation = Q3,
sexual_orientation_9_TEXT = Q3_9_TEXT,
employment_status = Q13,
employment_status_TEXT = Q13_3_TEXT,
US_born = Q4,
race = Q5,
race_10_TEXT = Q5_10_TEXT,
income = Q6,
fam_income = Q7,
religion = Q8,
religion_15_TEXT = Q8_15_TEXT,
education = Q11,
education_9_TEXT = Q11_9_TEXT,
year_education = Q12)
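For a rename block this long, an alternative is to keep the mapping in a named lookup vector (new_name = "old_name") and splice it into rename(). A sketch for illustration only, since the rename() call above has already been applied; rename_map shows just the first few pairs:
rename_map <- c(SS_1 = "Q30_1", SS_2 = "Q30_2", PSNQ_1 = "Q27_1")  # extend to the full mapping
# dat2 <- dplyr::rename(dat2, !!!rename_map)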
# check structure
str(dat2)
## tibble [312 × 75] (S3: tbl_df/tbl/data.frame)
## $ Progress : chr [1:312] "100" "100" "100" "100" ...
## $ Duration (in seconds) : chr [1:312] "803" "1034" "533" "1547" ...
## $ consent : chr [1:312] "4" "4" "4" "4" ...
## $ US_located_age : chr [1:312] NA NA NA NA ...
## $ employment_status : chr [1:312] "1" "2" "1" "2" ...
## $ employment_status_TEXT : chr [1:312] NA NA NA NA ...
## $ PD : chr [1:312] "Associate Attorney at large international law firm." "The Information Technology (IT) profession involves designing, developing, managing, and maintaining computer s"| __truncated__ "Health education" "Education - I work as a college advisor for high school students and support them throughout the college application process." ...
## $ OC_1 : chr [1:312] "3" "4" "3" "1" ...
## $ OC_2 : chr [1:312] "2" "3" "5" "2" ...
## $ SS_1 : chr [1:312] "6" "7" "1" "5" ...
## $ SS_2 : chr [1:312] "7" "7" "1" "5" ...
## $ SS_3 : chr [1:312] "6" "6" "1" "4" ...
## $ SS_4 : chr [1:312] "6" "5" "1" "4" ...
## $ SS_5 : chr [1:312] "7" "7" "1" "5" ...
## $ SS_6 : chr [1:312] "6" "5" "1" "7" ...
## $ SS_7 : chr [1:312] "6" "5" "1" "7" ...
## $ SS_8 : chr [1:312] "6" "6" "5" "4" ...
## $ SS_9 : chr [1:312] "6" "5" "5" "7" ...
## $ SS_10 : chr [1:312] "7" "7" "3" "6" ...
## $ SS_11 : chr [1:312] "7" "6" "5" "4" ...
## $ SS_12 : chr [1:312] "7" "5" "5" "7" ...
## $ SS_13_TEXT : chr [1:312] "Mother and Friend" "Uncle(Family Member)" "Friend" "A friend who also is a mother-figure to me." ...
## $ PSNQ_1 : chr [1:312] "6" "6" "6" "4" ...
## $ PSNQ_2 : chr [1:312] "5" "6" "6" "6" ...
## $ PSNQ_3 : chr [1:312] "5" "6" "7" "7" ...
## $ PSNQ_4 : chr [1:312] "5" "6" "7" "6" ...
## $ PSNQ_5 : chr [1:312] "4" "7" "7" "7" ...
## $ PSNQ_6 : chr [1:312] "3" "7" "7" "7" ...
## $ PSNQ_7 : chr [1:312] "4" "7" "6" "7" ...
## $ PSNQ_8 : chr [1:312] "5" "7" "6" "7" ...
## $ PSNQ_9 : chr [1:312] "3" "7" "6" "5" ...
## $ PSNQ_10 : chr [1:312] "3" "7" "6" "6" ...
## $ SBS_1 : chr [1:312] "2" "5" "5" "2" ...
## $ SBS_2 : chr [1:312] "2" "4" "5" "2" ...
## $ SBS_3 : chr [1:312] "3" "4" "5" "5" ...
## $ SBS_4 : chr [1:312] "4" "5" "3" "5" ...
## $ SBS_5 : chr [1:312] "3" "5" "3" "5" ...
## $ SBS_6 : chr [1:312] "3" "4" "5" "3" ...
## $ SBS_7 : chr [1:312] "3" "5" "5" "5" ...
## $ PF_1 : chr [1:312] "5" "7" "6" "7" ...
## $ PF_2 : chr [1:312] "4" "7" "7" "4" ...
## $ PF_3 : chr [1:312] "5" "7" "7" "6" ...
## $ PF_4 : chr [1:312] "6" "7" "7" "4" ...
## $ PF_5 : chr [1:312] "6" "7" "7" "7" ...
## $ PF_6 : chr [1:312] "7" "7" "7" "5" ...
## $ PF_7 : chr [1:312] "6" "7" "7" "6" ...
## $ PF_8 : chr [1:312] "5" "7" "7" "6" ...
## $ CS_1 : chr [1:312] "4" "4" "5" "5" ...
## $ CS_2 : chr [1:312] "4" "5" "5" "5" ...
## $ CS_3 : chr [1:312] "5" "5" "4" "5" ...
## $ CS_4 : chr [1:312] "4" "5" "5" "5" ...
## $ CS_5 : chr [1:312] "5" "5" "5" "5" ...
## $ BRS_1 : chr [1:312] "3" "5" "5" "5" ...
## $ BRS_2 : chr [1:312] "1" "4" "5" "1" ...
## $ BRS_3 : chr [1:312] "4" "2" "5" "4" ...
## $ BRS_4 : chr [1:312] "4" "3" "5" "1" ...
## $ BRS_5 : chr [1:312] "3" "4" "5" "4" ...
## $ BRS_6 : chr [1:312] "1" "5" "5" "1" ...
## $ age : chr [1:312] NA "21" "25" "29" ...
## $ gender : chr [1:312] "1" "2" "2" "1" ...
## $ gender_8_TEXT : chr [1:312] NA NA NA NA ...
## $ sexual_orientation : chr [1:312] "5" "5" "5" "2" ...
## $ sexual_orientation_9_TEXT: chr [1:312] NA NA NA NA ...
## $ US_born : chr [1:312] "1" "1" "1" "1" ...
## $ race : chr [1:312] "1" "2" "2" "1" ...
## $ race_10_TEXT : chr [1:312] NA NA NA NA ...
## $ income : chr [1:312] "9" "1" "1" "2" ...
## $ fam_income : chr [1:312] "7" "2" "2" "2" ...
## $ religion : chr [1:312] "4" "5" "4,5" "4" ...
## $ religion_15_TEXT : chr [1:312] NA NA NA NA ...
## $ education : chr [1:312] "6" "4" "4" "4" ...
## $ education_9_TEXT : chr [1:312] NA NA NA NA ...
## $ year_education : chr [1:312] "2017" "2022" "2022" "2022" ...
## $ Q28 : chr [1:312] NA NA NA NA ...
## $ Q29...90 : chr [1:312] NA NA NA NA ...
# Convert character columns that store numeric response codes to integers
library(hablar)
##
## Attaching package: 'hablar'
## The following object is masked from 'package:mice':
##
## squeeze
## The following object is masked from 'package:forcats':
##
## fct
## The following object is masked from 'package:tibble':
##
## num
## The following object is masked from 'package:dplyr':
##
## na_if
dat2 <- dat2 %>% # convert the selected columns to integer
convert(int(Progress:employment_status, OC_1:SS_12, PSNQ_1:gender, sexual_orientation, US_born, race, income:religion, education))
## Warning: There were 3 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `age = (structure(function (..., .x = ..1, .y = ..2, . = ..1)
## ...`.
## Caused by warning in `as_reliable_int()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 2 remaining warnings.
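If it is unclear which values triggered the coercion warning for age, a sketch like the following lists the raw entries that cannot be parsed as integers (dat still holds the pre-conversion character columns under their Qualtrics names):
dat %>%
  dplyr::filter(!is.na(Q1...74) & is.na(suppressWarnings(as.integer(Q1...74)))) %>%
  dplyr::pull(Q1...74)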
str(dat2)
## tibble [312 × 75] (S3: tbl_df/tbl/data.frame)
## $ Progress : int [1:312] 100 100 100 100 100 100 100 100 100 100 ...
## $ Duration (in seconds) : int [1:312] 803 1034 533 1547 1352 800 281 866 495 617 ...
## $ consent : int [1:312] 4 4 4 4 4 4 4 4 4 4 ...
## $ US_located_age : int [1:312] NA NA NA NA NA NA NA NA NA NA ...
## $ employment_status : int [1:312] 1 2 1 2 1 1 1 1 1 1 ...
## $ employment_status_TEXT : chr [1:312] NA NA NA NA ...
## $ PD : chr [1:312] "Associate Attorney at large international law firm." "The Information Technology (IT) profession involves designing, developing, managing, and maintaining computer s"| __truncated__ "Health education" "Education - I work as a college advisor for high school students and support them throughout the college application process." ...
## $ OC_1 : int [1:312] 3 4 3 1 5 6 5 5 3 4 ...
## $ OC_2 : int [1:312] 2 3 5 2 1 4 4 3 3 4 ...
## $ SS_1 : int [1:312] 6 7 1 5 7 6 6 5 6 6 ...
## $ SS_2 : int [1:312] 7 7 1 5 7 6 6 6 7 6 ...
## $ SS_3 : int [1:312] 6 6 1 4 6 6 6 5 6 6 ...
## $ SS_4 : int [1:312] 6 5 1 4 7 6 6 4 6 6 ...
## $ SS_5 : int [1:312] 7 7 1 5 7 7 6 4 5 7 ...
## $ SS_6 : int [1:312] 6 5 1 7 5 6 6 5 6 6 ...
## $ SS_7 : int [1:312] 6 5 1 7 6 5 6 4 6 6 ...
## $ SS_8 : int [1:312] 6 6 5 4 5 6 6 5 6 6 ...
## $ SS_9 : int [1:312] 6 5 5 7 5 6 6 5 6 6 ...
## $ SS_10 : int [1:312] 7 7 3 6 7 6 6 5 6 6 ...
## $ SS_11 : int [1:312] 7 6 5 4 6 5 6 5 6 6 ...
## $ SS_12 : int [1:312] 7 5 5 7 5 6 6 5 6 6 ...
## $ SS_13_TEXT : chr [1:312] "Mother and Friend" "Uncle(Family Member)" "Friend" "A friend who also is a mother-figure to me." ...
## $ PSNQ_1 : int [1:312] 6 6 6 4 6 6 6 6 2 6 ...
## $ PSNQ_2 : int [1:312] 5 6 6 6 6 6 6 6 3 7 ...
## $ PSNQ_3 : int [1:312] 5 6 7 7 6 6 6 6 4 6 ...
## $ PSNQ_4 : int [1:312] 5 6 7 6 NA 6 6 6 3 6 ...
## $ PSNQ_5 : int [1:312] 4 7 7 7 6 6 6 5 2 6 ...
## $ PSNQ_6 : int [1:312] 3 7 7 7 6 6 6 5 1 7 ...
## $ PSNQ_7 : int [1:312] 4 7 6 7 6 5 6 6 2 6 ...
## $ PSNQ_8 : int [1:312] 5 7 6 7 5 5 6 6 5 6 ...
## $ PSNQ_9 : int [1:312] 3 7 6 5 3 6 6 6 2 7 ...
## $ PSNQ_10 : int [1:312] 3 7 6 6 3 5 6 6 2 6 ...
## $ SBS_1 : int [1:312] 2 5 5 2 4 4 4 4 2 4 ...
## $ SBS_2 : int [1:312] 2 4 5 2 3 3 4 4 2 4 ...
## $ SBS_3 : int [1:312] 3 4 5 5 4 4 4 3 2 4 ...
## $ SBS_4 : int [1:312] 4 5 3 5 4 4 4 4 2 4 ...
## $ SBS_5 : int [1:312] 3 5 3 5 4 4 4 3 2 5 ...
## $ SBS_6 : int [1:312] 3 4 5 3 4 4 4 4 2 4 ...
## $ SBS_7 : int [1:312] 3 5 5 5 4 5 4 4 1 4 ...
## $ PF_1 : int [1:312] 5 7 6 7 6 6 6 6 2 6 ...
## $ PF_2 : int [1:312] 4 7 7 4 6 6 6 6 2 6 ...
## $ PF_3 : int [1:312] 5 7 7 6 6 6 6 5 5 6 ...
## $ PF_4 : int [1:312] 6 7 7 4 4 6 6 6 4 7 ...
## $ PF_5 : int [1:312] 6 7 7 7 6 6 6 5 6 6 ...
## $ PF_6 : int [1:312] 7 7 7 5 6 6 6 6 6 6 ...
## $ PF_7 : int [1:312] 6 7 7 6 5 6 6 6 2 6 ...
## $ PF_8 : int [1:312] 5 7 7 6 6 6 6 6 4 6 ...
## $ CS_1 : int [1:312] 4 4 5 5 5 5 4 4 1 4 ...
## $ CS_2 : int [1:312] 4 5 5 5 5 5 5 3 3 4 ...
## $ CS_3 : int [1:312] 5 5 4 5 2 5 5 3 1 4 ...
## $ CS_4 : int [1:312] 4 5 5 5 5 5 5 4 2 4 ...
## $ CS_5 : int [1:312] 5 5 5 5 4 4 5 4 2 4 ...
## $ BRS_1 : int [1:312] 3 5 5 5 4 5 5 4 4 4 ...
## $ BRS_2 : int [1:312] 1 4 5 1 4 1 1 2 2 4 ...
## $ BRS_3 : int [1:312] 4 2 5 4 4 5 5 4 4 4 ...
## $ BRS_4 : int [1:312] 4 3 5 1 4 1 1 2 2 4 ...
## $ BRS_5 : int [1:312] 3 4 5 4 3 5 5 4 4 4 ...
## $ BRS_6 : int [1:312] 1 5 5 1 2 1 1 2 2 4 ...
## $ age : int [1:312] NA 21 25 29 NA NA 40 27 60 40 ...
## $ gender : int [1:312] 1 2 2 1 1 2 1 2 1 5 ...
## $ gender_8_TEXT : chr [1:312] NA NA NA NA ...
## $ sexual_orientation : int [1:312] 5 5 5 2 5 9 5 5 5 5 ...
## $ sexual_orientation_9_TEXT: chr [1:312] NA NA NA NA ...
## $ US_born : int [1:312] 1 1 1 1 1 1 1 1 1 1 ...
## $ race : int [1:312] 1 2 2 1 1 6 6 9 1 5 ...
## $ race_10_TEXT : chr [1:312] NA NA NA NA ...
## $ income : int [1:312] 9 1 1 2 8 5 5 9 8 2 ...
## $ fam_income : int [1:312] 7 2 2 2 8 3 3 6 3 2 ...
## $ religion : int [1:312] 4 5 NA 4 10 5 5 10 4 4 ...
## $ religion_15_TEXT : chr [1:312] NA NA NA NA ...
## $ education : int [1:312] 6 4 4 4 6 5 1 3 6 3 ...
## $ education_9_TEXT : chr [1:312] NA NA NA NA ...
## $ year_education : chr [1:312] "2017" "2022" "2022" "2022" ...
## $ Q28 : chr [1:312] NA NA NA NA ...
## $ Q29...90 : chr [1:312] NA NA NA NA ...
# Add a sequential ID variable to dat2
dat2$ID <- 1:nrow(dat2)
# View the dataset with the new ID variable
# (the original chunk called head(df); no object named df exists, so that call printed
#  the body of the stats::df density function instead of the data; head(dat2) is the intended call)
head(dat2)
# duration
#*calculate mean, SD, and min/max of Duration (in seconds)
duration_stats <- psych::describe(dat2$`Duration (in seconds)`, na.rm = TRUE, ranges = TRUE)
# View the results
print(duration_stats)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 312 924.68 1397.29 543 690.7 467.76 6 17152 17146 6.66 63.75
## se
## X1 79.11
Duration: M = 924.68 seconds (roughly 15.4 minutes), SD = 1397.29, range = 6 to 17,152 seconds.
dat2_missingness <- (dplyr::select(dat2, employment_status, OC_1:SS_12, PSNQ_1:gender, sexual_orientation, US_born, race, income:religion, education))
# NOTE: excluded demographics textboxes
# analyze item level data missingness and patterns
## typically want >80% of data
# Calculate the number and proportion of item-level missingness
dat2_missingness$nmiss <- dat2_missingness %>%
dplyr::select(employment_status:education) %>% # Select all variables between employment_status and education
is.na %>%
rowSums
# Dynamically calculate the number of selected variables
num_vars <- dat2_missingness %>%
dplyr::select(employment_status:education) %>%
ncol()
# Calculate the proportion of missingness
dat2_missingness <- dat2_missingness %>%
dplyr::mutate(prop_miss = (nmiss/num_vars)*100)
# Describe the missingness proportion
psych::describe(dat2_missingness$prop_miss)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 312 16.76 33.97 1.67 8.67 2.47 0 100 100 1.82 1.47 1.92
Across cases deemed eligible under the inclusion/exclusion criteria, item-level missingness ranged from 0% to 100%.
dat2_missingness <- dplyr::filter(dat2_missingness, prop_miss <= 90)
print(dat2_missingness)
## # A tibble: 271 × 62
## employment_status OC_1 OC_2 SS_1 SS_2 SS_3 SS_4 SS_5 SS_6 SS_7 SS_8
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1 3 2 6 7 6 6 7 6 6 6
## 2 2 4 3 7 7 6 5 7 5 5 6
## 3 1 3 5 1 1 1 1 1 1 1 5
## 4 2 1 2 5 5 4 4 5 7 7 4
## 5 1 5 1 7 7 6 7 7 5 6 5
## 6 1 6 4 6 6 6 6 7 6 5 6
## 7 1 5 4 6 6 6 6 6 6 6 6
## 8 1 5 3 5 6 5 4 4 5 4 5
## 9 1 3 3 6 7 6 6 5 6 6 6
## 10 1 4 4 6 6 6 6 7 6 6 6
## # ℹ 261 more rows
## # ℹ 51 more variables: SS_9 <int>, SS_10 <int>, SS_11 <int>, SS_12 <int>,
## # PSNQ_1 <int>, PSNQ_2 <int>, PSNQ_3 <int>, PSNQ_4 <int>, PSNQ_5 <int>,
## # PSNQ_6 <int>, PSNQ_7 <int>, PSNQ_8 <int>, PSNQ_9 <int>, PSNQ_10 <int>,
## # SBS_1 <int>, SBS_2 <int>, SBS_3 <int>, SBS_4 <int>, SBS_5 <int>,
## # SBS_6 <int>, SBS_7 <int>, PF_1 <int>, PF_2 <int>, PF_3 <int>, PF_4 <int>,
## # PF_5 <int>, PF_6 <int>, PF_7 <int>, PF_8 <int>, CS_1 <int>, CS_2 <int>, …
The dataframe is filtered to retain rows with no more than 90% missing data (prop_miss <= 90), leaving 271 valid cases.
#further update to exclude the n_miss and prop_miss variables since these will mess with our analyses
dat2_missingness <- dat2_missingness %>%
dplyr::select (-c(nmiss, prop_miss))
#what proportion of cells missing across entire dataset
formattable::percent(mean(is.na(dat2_missingness)))
## [1] 4.50%
#what proportion of cases (rows) are complete (nonmissing)
formattable::percent(mean(complete.cases(dat2_missingness)))
## [1] 47.60%
Among the retained cases, 4.50% of cells were missing overall, and 47.60% of cases had complete data.
missing_data_patterns <- mice::md.pattern(dat2_missingness, plot = TRUE, rotate.names = TRUE)
missing_data_patterns
## SS_2 SS_4 SS_9 SS_12 OC_1 SS_1 SS_3 SS_6 SS_5 SS_7 SS_10 employment_status
## 129 1 1 1 1 1 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 1 0 1 1
## 2 1 1 1 1 1 1 1 1 0 1 1 1
## 1 1 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 1 0 1 1 0 1 1
## 1 1 1 1 1 1 0 1 1 1 1 1 1
## 1 1 1 1 1 0 1 1 1 1 1 1 1
## 0 0 0 0 1 1 1 1 2 2 2 3
## OC_2 SS_8 SS_11 PSNQ_1 PSNQ_2 PSNQ_5 PSNQ_6 PSNQ_10 PSNQ_3 PSNQ_4 PSNQ_7
## 129 1 1 1 1 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 0 1 1
## 1 1 1 1 0 0 0 0 0 0 0 0
## 3 1 1 1 0 0 0 0 0 0 0 0
## 2 1 1 0 1 1 1 1 1 1 1 1
## 1 1 1 0 1 1 1 1 1 1 1 1
## 1 1 0 1 1 1 1 1 1 1 1 1
## 1 1 0 1 1 1 1 1 1 1 1 1
## 1 0 1 1 1 1 1 1 1 1 1 1
## 1 0 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 0 0 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 3 3 3 5 5 5 5 5 6 6 6
## PSNQ_9 PSNQ_8 SBS_2 SBS_6 SBS_7 SBS_3 SBS_4 SBS_1 SBS_5 PF_1 PF_4 PF_5 PF_6
## 129 1 1 1 1 1 1 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 0 0 0 0
## 1 1 1 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 1 1 1 1
## 2 1 1 1 1 1 1 1 0 1 1 1 1 1
## 1 1 1 1 1 1 1 0 1 1 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1 1 1 1
## 6 1 1 0 0 0 0 0 0 0 0 0 0 0
## 2 1 0 1 1 1 1 1 1 1 1 1 1 1
## 1 0 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 1 1 1 1
## 1 0 0 1 1 1 1 1 1 1 1 1 1 1
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 0 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 0 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1 0 1
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 7 8 10 10 10 11 11 12 14 14 14 15 15
## PF_8 CS_2 BRS_1 gender income fam_income PF_3 CS_1 BRS_2 BRS_5 BRS_6
## 129 1 1 1 1 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 0 1 1 1
## 1 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 0 0 0 0 1 1 0 0 0
## 1 1 0 1 1 1 1 1 0 1 1 1
## 1 0 1 1 1 1 1 1 1 1 1 1
## 4 0 0 0 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 0 1 1 1 1
## 6 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 3 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 0 0 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 0 0 0 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1 1
## 15 15 15 15 15 15 16 16 16 16 16
## sexual_orientation PF_7 CS_4 CS_5 BRS_3 PF_2 BRS_4 education CS_3 US_born
## 129 1 1 1 1 1 1 1 1 1 1
## 42 1 1 1 1 1 1 1 1 1 1
## 11 1 1 1 1 1 1 1 1 1 1
## 9 1 1 1 1 1 1 1 1 1 1
## 4 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 6 1 1 1 1 1 1 1 1 1 1
## 5 1 1 1 1 1 1 1 1 1 1
## 3 1 1 1 1 1 1 1 1 1 0
## 1 1 1 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 0 1 1
## 1 1 1 1 1 1 1 1 0 1 1
## 1 1 1 1 1 1 1 1 0 1 0
## 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 0 1 1 1 0 0
## 1 1 1 1 0 1 1 1 1 1 1
## 1 1 1 0 1 1 1 0 1 1 1
## 1 1 0 1 1 1 1 1 1 1 1
## 1 1 0 1 1 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 0 1 1 1 0 1 0 0 1 0
## 1 1 1 0 0 1 1 1 1 0 1
## 1 1 1 1 1 1 1 1 1 1 1
## 4 0 0 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 0 1 1 1 1 1 1
## 1 0 1 1 1 1 1 0 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1
## 6 0 0 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 0 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 0 1 1 1 1 1 1 1
## 3 0 0 0 0 0 0 0 0 0 0
## 2 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 0 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 0 1 1 1 1 0 1 0 1
## 1 0 0 0 0 0 0 0 0 0 0
## 1 1 1 1 1 1 1 1 1 1 1
## 2 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 1 1 1 1 1 1 1 1 1 1 1
## 16 17 17 17 17 18 18 18 19 20
## race religion age
## 129 1 1 1 0
## 42 1 1 0 1
## 11 1 0 1 1
## 9 1 0 0 2
## 4 0 1 1 1
## 1 0 1 0 2
## 6 0 0 1 2
## 5 0 0 0 3
## 3 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 1
## 1 1 1 0 2
## 1 1 1 0 3
## 1 1 1 1 1
## 1 1 1 0 2
## 1 1 1 0 4
## 1 1 1 1 1
## 1 1 1 0 3
## 1 1 1 0 2
## 1 1 1 0 3
## 1 1 1 1 1
## 1 1 1 1 1
## 1 0 0 0 15
## 1 1 1 1 5
## 1 1 1 0 2
## 4 0 0 0 28
## 1 1 1 0 2
## 1 0 1 0 4
## 1 1 1 1 3
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 1 1 3
## 6 0 0 0 35
## 2 1 1 1 1
## 1 1 1 1 2
## 1 1 1 0 2
## 1 1 1 1 2
## 1 1 1 1 11
## 3 0 0 0 45
## 2 1 1 1 1
## 1 1 1 0 4
## 1 1 1 1 1
## 1 0 1 1 2
## 1 1 1 1 1
## 1 1 1 1 2
## 1 1 1 1 1
## 1 1 1 1 3
## 1 1 0 0 7
## 1 1 0 0 7
## 1 0 0 0 46
## 1 1 1 1 1
## 2 1 1 1 1
## 1 1 1 1 1
## 1 1 0 1 4
## 1 1 1 1 1
## 1 1 0 1 2
## 33 50 86 732
#BRS (5-point scale, range 1-5): reverse score items 2, 4, and 6
dat2 <- dat2 %>%
dplyr::mutate(BRS_2r = 6 - BRS_2) %>%
dplyr::mutate(BRS_4r = 6 - BRS_4) %>%
dplyr::mutate(BRS_6r = 6 - BRS_6) # corrected: the original code computed 6 - BRS_5 here; note that the BRS reliability output below was generated with that (incorrect) coding
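A quick sanity check on the reverse scoring (a sketch): each reversed item should correlate exactly -1 with its raw counterpart.
cor(dat2$BRS_2, dat2$BRS_2r, use = "pairwise.complete.obs")
cor(dat2$BRS_4, dat2$BRS_4r, use = "pairwise.complete.obs")
cor(dat2$BRS_6, dat2$BRS_6r, use = "pairwise.complete.obs")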
# Aggregate items into average subscales (STANDARDIZED)
#social support
SS <- c('SS_1','SS_2','SS_3','SS_4','SS_5','SS_6','SS_7','SS_8','SS_9','SS_10','SS_11','SS_12')
#perceived support network quality
PSNQ <- c('PSNQ_1', 'PSNQ_2', 'PSNQ_3', 'PSNQ_4', 'PSNQ_5', 'PSNQ_6', 'PSNQ_7', 'PSNQ_8', 'PSNQ_9', 'PSNQ_10')
#resilience
BRS <- c('BRS_1', 'BRS_2r', 'BRS_3', 'BRS_4r', 'BRS_5', 'BRS_6r')
#sense of belonging
SBS <- c('SBS_1', 'SBS_2', 'SBS_3', 'SBS_4', 'SBS_5', 'SBS_6', 'SBS_7')
#professional flourishing
PF <- c('PF_1','PF_2','PF_3','PF_4','PF_5','PF_6','PF_7','PF_8')
#career satisfaction
CS <- c('CS_1', 'CS_2','CS_3','CS_4','CS_5')
#organizational climate
OC <- c('OC_1', 'OC_2')
#create new variables #80% of items should be present to get the mean
dat2$SS_AVG <- mean_n(dat2[,SS], .80)
dat2$PSNQ_AVG <- mean_n(dat2[,PSNQ], .80)
dat2$BRS_AVG <- mean_n(dat2[,BRS], .80)
dat2$SBS_AVG <- mean_n(dat2[,SBS], .80)
dat2$PF_AVG <- mean_n(dat2[,PF], .80)
dat2$CS_AVG <- mean_n(dat2[,CS], .80)
dat2$OC_AVG <- mean_n(dat2[,OC], .80)
# Aggregate items into total sum scores (UNSTANDARDIZED)
# Note: rowSums() has no minimum-coverage argument; the .80 passed below is silently treated
# as na.rm = TRUE, so a total is returned even when items are missing (this is why the
# total-score missingness check further down reports 0% missing). A coverage-respecting
# alternative is sketched after this block.
dat2$SS_total <- rowSums(dat2[,SS], .80)
dat2$PSNQ_total <- rowSums(dat2[,PSNQ], .80)
dat2$BRS_total <- rowSums(dat2[,BRS], .80)
dat2$SBS_total <- rowSums(dat2[,SBS], .80)
dat2$PF_total <- rowSums(dat2[,PF], .80)
dat2$CS_total <- rowSums(dat2[,CS], .80)
dat2$OC_total <- rowSums(dat2[,OC], .80)
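As a hedged alternative, the sketch below defines a hypothetical helper, sum_with_coverage(), that applies the same 80% rule used for the mean scores to the sum scores: it sums the available items, prorates to the full item count, and returns NA when fewer than 80% of items were answered. It is an illustration, not the scoring rule used in this document.
sum_with_coverage <- function(items, min_prop = 0.80) {
  answered <- rowSums(!is.na(items))          # items answered per respondent
  totals   <- rowSums(items, na.rm = TRUE)    # sum of the answered items
  ifelse(answered / ncol(items) >= min_prop,
         totals * ncol(items) / answered,     # prorate for the few skipped items
         NA_real_)                            # too much missing: no total
}
# e.g., dat2$SS_total <- sum_with_coverage(dat2[, SS])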
#social support
psych::alpha(dat2[c('SS_1','SS_2','SS_3','SS_4','SS_5','SS_6','SS_7','SS_8','SS_9','SS_10','SS_11','SS_12')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("SS_1", "SS_2", "SS_3", "SS_4", "SS_5",
## "SS_6", "SS_7", "SS_8", "SS_9", "SS_10", "SS_11", "SS_12")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.92 0.92 0.94 0.49 11 0.0071 5.3 0.84 0.47
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.9 0.92 0.93
## Duhachek 0.9 0.92 0.93
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## SS_1 0.91 0.91 0.94 0.49 11 0.0075 0.018 0.48
## SS_2 0.91 0.91 0.94 0.48 10 0.0078 0.018 0.48
## SS_3 0.91 0.91 0.94 0.49 10 0.0078 0.018 0.48
## SS_4 0.91 0.91 0.94 0.49 11 0.0076 0.016 0.48
## SS_5 0.91 0.91 0.94 0.48 10 0.0080 0.019 0.46
## SS_6 0.91 0.91 0.94 0.48 10 0.0079 0.020 0.46
## SS_7 0.91 0.91 0.94 0.48 10 0.0079 0.019 0.46
## SS_8 0.91 0.92 0.94 0.50 11 0.0073 0.015 0.48
## SS_9 0.91 0.91 0.94 0.48 10 0.0078 0.018 0.46
## SS_10 0.91 0.91 0.94 0.48 10 0.0078 0.018 0.46
## SS_11 0.91 0.91 0.94 0.48 10 0.0080 0.020 0.45
## SS_12 0.91 0.91 0.94 0.49 10 0.0077 0.017 0.46
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## SS_1 271 0.70 0.69 0.66 0.62 5.2 1.36
## SS_2 271 0.74 0.74 0.72 0.68 5.4 1.14
## SS_3 270 0.74 0.72 0.70 0.67 5.3 1.33
## SS_4 272 0.71 0.69 0.67 0.63 5.3 1.27
## SS_5 269 0.78 0.78 0.76 0.73 5.5 1.08
## SS_6 270 0.75 0.76 0.74 0.70 5.3 1.05
## SS_7 269 0.75 0.76 0.74 0.69 5.1 1.14
## SS_8 268 0.63 0.61 0.58 0.54 5.2 1.20
## SS_9 271 0.72 0.74 0.72 0.67 5.3 1.02
## SS_10 269 0.73 0.74 0.71 0.67 5.7 1.09
## SS_11 268 0.77 0.77 0.75 0.72 5.2 1.20
## SS_12 271 0.71 0.73 0.71 0.65 5.4 0.99
##
## Non missing response frequency for each item
## 1 2 3 4 5 6 7 miss
## SS_1 0.01 0.05 0.04 0.13 0.39 0.20 0.19 0.13
## SS_2 0.01 0.00 0.05 0.08 0.38 0.29 0.19 0.13
## SS_3 0.01 0.03 0.06 0.14 0.26 0.34 0.17 0.13
## SS_4 0.01 0.02 0.06 0.12 0.32 0.32 0.15 0.13
## SS_5 0.00 0.00 0.04 0.09 0.37 0.28 0.22 0.14
## SS_6 0.00 0.00 0.05 0.11 0.38 0.33 0.11 0.13
## SS_7 0.01 0.02 0.04 0.15 0.45 0.20 0.13 0.14
## SS_8 0.01 0.03 0.05 0.10 0.40 0.27 0.14 0.14
## SS_9 0.00 0.01 0.02 0.15 0.42 0.24 0.15 0.13
## SS_10 0.00 0.01 0.04 0.04 0.34 0.32 0.25 0.14
## SS_11 0.00 0.03 0.05 0.14 0.40 0.22 0.16 0.14
## SS_12 0.00 0.01 0.01 0.12 0.44 0.26 0.16 0.13
#perceived social network quality
psych::alpha(dat2[c('PSNQ_1', 'PSNQ_2', 'PSNQ_3', 'PSNQ_4', 'PSNQ_5', 'PSNQ_6', 'PSNQ_7', 'PSNQ_8', 'PSNQ_9', 'PSNQ_10')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("PSNQ_1", "PSNQ_2", "PSNQ_3", "PSNQ_4",
## "PSNQ_5", "PSNQ_6", "PSNQ_7", "PSNQ_8", "PSNQ_9", "PSNQ_10")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.92 0.92 0.93 0.54 12 0.007 5.6 0.84 0.55
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.9 0.92 0.93
## Duhachek 0.9 0.92 0.93
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## PSNQ_1 0.91 0.92 0.92 0.55 10.9 0.0073 0.0120 0.56
## PSNQ_2 0.92 0.92 0.92 0.56 11.3 0.0070 0.0082 0.56
## PSNQ_3 0.91 0.91 0.93 0.53 10.3 0.0077 0.0151 0.56
## PSNQ_4 0.91 0.91 0.93 0.54 10.4 0.0077 0.0147 0.54
## PSNQ_5 0.91 0.91 0.92 0.52 9.8 0.0081 0.0131 0.54
## PSNQ_6 0.91 0.91 0.92 0.53 10.3 0.0078 0.0122 0.55
## PSNQ_7 0.91 0.91 0.92 0.53 10.0 0.0079 0.0130 0.55
## PSNQ_8 0.91 0.91 0.93 0.54 10.5 0.0076 0.0127 0.56
## PSNQ_9 0.91 0.91 0.92 0.53 10.2 0.0079 0.0131 0.55
## PSNQ_10 0.91 0.91 0.92 0.53 10.0 0.0080 0.0127 0.55
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## PSNQ_1 266 0.71 0.71 0.68 0.63 5.5 1.15
## PSNQ_2 266 0.66 0.66 0.63 0.57 5.7 1.09
## PSNQ_3 265 0.77 0.77 0.74 0.71 5.6 1.04
## PSNQ_4 265 0.76 0.76 0.71 0.69 5.6 1.23
## PSNQ_5 266 0.83 0.83 0.81 0.78 5.5 1.04
## PSNQ_6 266 0.78 0.77 0.75 0.71 5.4 1.21
## PSNQ_7 265 0.80 0.81 0.79 0.75 5.7 0.97
## PSNQ_8 263 0.74 0.74 0.71 0.68 5.7 0.98
## PSNQ_9 264 0.79 0.78 0.76 0.72 5.4 1.26
## PSNQ_10 266 0.80 0.80 0.78 0.75 5.6 1.04
##
## Non missing response frequency for each item
## 1 2 3 4 5 6 7 miss
## PSNQ_1 0.00 0.03 0.03 0.13 0.16 0.52 0.13 0.15
## PSNQ_2 0.00 0.02 0.03 0.05 0.27 0.41 0.22 0.15
## PSNQ_3 0.00 0.02 0.03 0.09 0.22 0.49 0.16 0.15
## PSNQ_4 0.00 0.06 0.01 0.06 0.19 0.48 0.20 0.15
## PSNQ_5 0.00 0.02 0.02 0.11 0.28 0.42 0.15 0.15
## PSNQ_6 0.01 0.02 0.06 0.07 0.26 0.43 0.15 0.15
## PSNQ_7 0.00 0.02 0.00 0.09 0.23 0.49 0.17 0.15
## PSNQ_8 0.01 0.00 0.02 0.07 0.22 0.50 0.18 0.16
## PSNQ_9 0.00 0.02 0.08 0.11 0.19 0.42 0.18 0.15
## PSNQ_10 0.00 0.01 0.04 0.09 0.18 0.53 0.15 0.15
#resilience
psych::alpha(dat2[c('BRS_1', 'BRS_2r', 'BRS_3', 'BRS_4r', 'BRS_5', 'BRS_6r')])
## Warning in cor.smooth(r): Matrix was not positive definite, smoothing was done
## Warning in psych::alpha(dat2[c("BRS_1", "BRS_2r", "BRS_3", "BRS_4r", "BRS_5", : Some items were negatively correlated with the first principal component and probably
## should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( BRS_2r BRS_4r BRS_6r ) were negatively correlated with the first principal component and
## probably should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("BRS_1", "BRS_2r", "BRS_3", "BRS_4r",
## "BRS_5", "BRS_6r")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.19 0.031 0.28 0.0053 0.032 0.064 3.2 0.45 0.03
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.04 0.19 0.32
## Duhachek 0.06 0.19 0.31
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## BRS_1 0.0026 -0.318 0.026 -0.051 -0.24 0.075 0.208 -0.014
## BRS_2r -0.2448 -0.394 -0.125 -0.060 -0.28 0.098 0.241 0.000
## BRS_3 0.1498 -0.115 0.154 -0.021 -0.10 0.062 0.216 0.035
## BRS_4r -0.1810 -0.330 -0.115 -0.052 -0.25 0.091 0.237 0.020
## BRS_5 0.2306 0.095 0.434 0.021 0.11 0.062 0.125 0.035
## BRS_6r 0.5282 0.548 0.656 0.195 1.21 0.044 0.083 0.075
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## BRS_1 256 0.57 0.64 0.711 0.267 3.9 0.90
## BRS_2r 255 0.74 0.68 0.840 0.412 2.7 1.15
## BRS_3 254 0.45 0.52 0.471 0.089 3.6 0.98
## BRS_4r 253 0.72 0.65 0.813 0.346 2.9 1.22
## BRS_5 255 0.30 0.35 -0.096 -0.021 3.7 0.85
## BRS_6r 255 -0.30 -0.35 -1.416 -0.541 2.3 0.85
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## BRS_1 0.00 0.09 0.13 0.50 0.27 0.18
## BRS_2r 0.11 0.44 0.17 0.20 0.08 0.18
## BRS_3 0.02 0.13 0.26 0.41 0.18 0.19
## BRS_4r 0.14 0.29 0.21 0.26 0.10 0.19
## BRS_5 0.01 0.08 0.24 0.52 0.15 0.18
## BRS_6r 0.15 0.52 0.24 0.08 0.01 0.18
#sense of belonging
psych::alpha(dat2[c('SBS_1', 'SBS_2', 'SBS_3', 'SBS_4', 'SBS_5', 'SBS_6', 'SBS_7')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("SBS_1", "SBS_2", "SBS_3", "SBS_4", "SBS_5",
## "SBS_6", "SBS_7")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.88 0.88 0.88 0.51 7.2 0.011 3.7 0.68 0.49
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.85 0.88 0.9
## Duhachek 0.86 0.88 0.9
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## SBS_1 0.86 0.86 0.85 0.50 6.0 0.013 0.0054 0.49
## SBS_2 0.85 0.86 0.85 0.50 5.9 0.013 0.0061 0.51
## SBS_3 0.86 0.86 0.86 0.51 6.3 0.012 0.0092 0.49
## SBS_4 0.86 0.86 0.85 0.50 6.1 0.012 0.0087 0.47
## SBS_5 0.86 0.86 0.86 0.51 6.3 0.012 0.0073 0.49
## SBS_6 0.86 0.86 0.86 0.51 6.2 0.012 0.0075 0.51
## SBS_7 0.86 0.86 0.86 0.51 6.2 0.012 0.0100 0.47
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## SBS_1 259 0.78 0.77 0.74 0.69 3.5 0.97
## SBS_2 261 0.79 0.78 0.75 0.70 3.4 0.94
## SBS_3 260 0.74 0.74 0.67 0.63 3.8 0.92
## SBS_4 260 0.76 0.77 0.72 0.67 3.9 0.81
## SBS_5 257 0.73 0.74 0.69 0.63 3.8 0.86
## SBS_6 261 0.76 0.75 0.70 0.65 3.6 0.92
## SBS_7 261 0.75 0.76 0.70 0.66 3.9 0.86
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## SBS_1 0.01 0.16 0.31 0.38 0.14 0.17
## SBS_2 0.01 0.16 0.34 0.36 0.12 0.16
## SBS_3 0.00 0.07 0.30 0.37 0.26 0.17
## SBS_4 0.01 0.02 0.29 0.45 0.23 0.17
## SBS_5 0.00 0.07 0.30 0.42 0.21 0.18
## SBS_6 0.01 0.08 0.35 0.36 0.19 0.16
## SBS_7 0.00 0.08 0.15 0.54 0.22 0.16
#professional flourishing
psych::alpha(dat2[c('PF_1','PF_2','PF_3','PF_4','PF_5','PF_6','PF_7','PF_8')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("PF_1", "PF_2", "PF_3", "PF_4", "PF_5",
## "PF_6", "PF_7", "PF_8")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.94 0.94 0.93 0.65 15 0.0052 5.8 0.9 0.65
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.93 0.94 0.95
## Duhachek 0.93 0.94 0.95
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## PF_1 0.92 0.92 0.92 0.63 12 0.0066 0.0037 0.62
## PF_2 0.93 0.93 0.92 0.64 13 0.0062 0.0044 0.64
## PF_3 0.93 0.93 0.93 0.67 14 0.0056 0.0032 0.67
## PF_4 0.93 0.93 0.92 0.65 13 0.0060 0.0053 0.64
## PF_5 0.93 0.93 0.93 0.66 14 0.0057 0.0041 0.66
## PF_6 0.93 0.93 0.93 0.66 13 0.0058 0.0041 0.64
## PF_7 0.92 0.92 0.92 0.64 12 0.0063 0.0040 0.64
## PF_8 0.93 0.93 0.93 0.66 14 0.0057 0.0049 0.67
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## PF_1 257 0.90 0.89 0.89 0.86 5.6 1.24
## PF_2 253 0.87 0.86 0.84 0.82 5.6 1.17
## PF_3 255 0.79 0.78 0.74 0.72 5.8 1.04
## PF_4 257 0.84 0.84 0.81 0.78 5.8 1.09
## PF_5 256 0.79 0.79 0.76 0.73 5.9 0.94
## PF_6 256 0.81 0.81 0.78 0.75 5.9 1.01
## PF_7 254 0.87 0.88 0.86 0.83 5.8 1.14
## PF_8 256 0.80 0.81 0.77 0.74 5.8 0.93
##
## Non missing response frequency for each item
## 1 2 3 4 5 6 7 miss
## PF_1 0 0.03 0.05 0.10 0.18 0.41 0.23 0.18
## PF_2 0 0.02 0.05 0.09 0.21 0.42 0.21 0.19
## PF_3 0 0.02 0.01 0.08 0.21 0.42 0.26 0.18
## PF_4 0 0.01 0.04 0.09 0.15 0.46 0.25 0.18
## PF_5 0 0.00 0.01 0.08 0.20 0.43 0.28 0.18
## PF_6 0 0.01 0.02 0.09 0.16 0.46 0.27 0.18
## PF_7 0 0.01 0.04 0.08 0.18 0.41 0.27 0.19
## PF_8 0 0.00 0.02 0.05 0.22 0.48 0.23 0.18
#career satisfaction
psych::alpha(dat2[c('CS_1', 'CS_2','CS_3','CS_4','CS_5')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("CS_1", "CS_2", "CS_3", "CS_4", "CS_5")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.85 0.85 0.83 0.54 5.8 0.013 4 0.72 0.54
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.83 0.85 0.88
## Duhachek 0.83 0.85 0.88
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## CS_1 0.85 0.85 0.81 0.58 5.5 0.014 0.0020 0.56
## CS_2 0.81 0.81 0.77 0.52 4.3 0.017 0.0058 0.52
## CS_3 0.81 0.81 0.77 0.52 4.3 0.017 0.0022 0.54
## CS_4 0.82 0.82 0.78 0.53 4.5 0.017 0.0055 0.54
## CS_5 0.83 0.83 0.79 0.55 4.8 0.016 0.0065 0.54
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## CS_1 255 0.71 0.73 0.62 0.57 4.0 0.81
## CS_2 256 0.83 0.82 0.77 0.71 4.0 0.93
## CS_3 252 0.84 0.82 0.78 0.71 4.0 1.07
## CS_4 254 0.81 0.81 0.75 0.70 4.0 0.89
## CS_5 254 0.78 0.78 0.70 0.65 4.1 0.83
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## CS_1 0.01 0.05 0.08 0.60 0.26 0.18
## CS_2 0.00 0.09 0.13 0.42 0.36 0.18
## CS_3 0.03 0.10 0.09 0.41 0.37 0.19
## CS_4 0.02 0.04 0.17 0.47 0.31 0.19
## CS_5 0.00 0.06 0.13 0.50 0.31 0.19
#organizational climate
psych::alpha(dat2[c('OC_1', 'OC_2')])
##
## Reliability analysis
## Call: psych::alpha(x = dat2[c("OC_1", "OC_2")])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.54 0.54 0.37 0.37 1.2 0.051 3.6 0.96 0.37
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.42 0.54 0.63
## Duhachek 0.44 0.54 0.64
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## OC_1 0.45 0.37 0.14 0.37 0.6 NA 0 0.37
## OC_2 0.31 0.37 0.14 0.37 0.6 NA 0 0.37
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## OC_1 280 0.86 0.83 0.51 0.37 3.9 1.3
## OC_2 278 0.79 0.83 0.51 0.37 3.3 1.0
##
## Non missing response frequency for each item
## 1 2 3 4 5 6 miss
## OC_1 0.07 0.00 0.26 0.40 0.12 0.14 0.10
## OC_2 0.06 0.15 0.28 0.41 0.10 0.00 0.11
dat2_missingness2 <-(dplyr::select (dat2, OC_AVG, SS_AVG:CS_AVG))
#In the script below we create a variable that counts the number of missing variables and then creates a proportion by dividing it by the number of total variables.
#Create a variable (n_miss) that counts the number missing
dat2_missingness2$n_miss <- dat2_missingness2%>%
dplyr::select(OC_AVG, SS_AVG:CS_AVG) %>%
is.na %>%
rowSums
#Create a proportion missing by dividing n_miss by the number of scale scores (the code divides by 6, although seven scale scores are selected; see the note and sketch after this chunk)
#Pipe to sort in order of descending frequency to get a sense of the missingness
dat2_missingness2<- dat2_missingness2%>%
dplyr::mutate(prop_miss = (n_miss/6)*100)%>%
arrange(desc(n_miss))
psych::describe(dat2_missingness2$prop_miss)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 312 18.22 39.49 0 8.27 0 0 116.67 116.67 1.85 1.61 2.24
Across the 312 cases scored, scale-level missingness ranged from 0% to 116.67%; values above 100% are an artifact of dividing by 6 when seven scale scores are counted (7/6 = 116.67%). A corrected denominator is sketched below.
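A sketch with the denominator derived from the data (seven scale scores), which keeps prop_miss within 0-100%; the reassignment is commented out because the summary above was produced with the hard-coded 6:
n_scales <- ncol(dplyr::select(dat2_missingness2, OC_AVG:CS_AVG))  # 7
# dat2_missingness2 <- dat2_missingness2 %>%
#   dplyr::mutate(prop_miss = (n_miss / n_scales) * 100)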
# Retain cases with no more than 20% missing scale scores (with either denominator, this keeps cases missing at most one of the seven scales)
fgpdata_scored <- dplyr::filter(dat2_missingness2, prop_miss <= 20)
# Select only the relevant columns (if necessary)
fgpdata_scored <- dplyr::select(fgpdata_scored, OC_AVG, SS_AVG:CS_AVG)
# Save the cleaned data to a CSV file
#write.csv(fgpdata_scored, file = "fgpdata_average_scores.csv", row.names = FALSE)
# Optionally, check the number of cases retained
num_cases_retained <- nrow(fgpdata_scored)
print(num_cases_retained)
## [1] 257
# Add a sequential ID variable to fgpdata_scored (used as the merge key later)
fgpdata_scored$ID <- 1:nrow(fgpdata_scored)
# View the dataset with the new ID variable
head(fgpdata_scored)
## # A tibble: 6 × 8
## OC_AVG SS_AVG PSNQ_AVG BRS_AVG SBS_AVG PF_AVG CS_AVG ID
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 3.5 4 NA 3 3.14 3.88 3.25 1
## 2 4 5.5 5.9 3.17 4.43 6 NA 2
## 3 NA 5.75 5.33 3 3.57 5.12 4.2 3
## 4 3 3.08 2.9 NA 2.57 3 2.4 4
## 5 NA 4 4.8 2.33 3.14 3.62 3.2 5
## 6 NA 4.82 5.6 3.33 3.86 6 4 6
After eliminating cases with greater than 20% missing, the dataset analyzed included 257 cases.
#percent missing across df
formattable::percent(mean(is.na(fgpdata_scored)))
## [1] 0.39%
#percent of rows with nonmissing data
formattable::percent(mean(complete.cases(fgpdata_scored)))
## [1] 96.89%
In this dataset, missing data at the scale level were minimal, with only 0.39% of values missing overall. Additionally, 96.89% of the rows had complete data at the scale level.
mice_ScaleLvl_fgp <- mice::md.pattern(fgpdata_scored, plot = TRUE, rotate.names=TRUE)
mice_ScaleLvl_fgp
## SS_AVG SBS_AVG PF_AVG ID PSNQ_AVG CS_AVG BRS_AVG OC_AVG
## 249 1 1 1 1 1 1 1 1 0
## 4 1 1 1 1 1 1 1 0 1
## 2 1 1 1 1 1 1 0 1 1
## 1 1 1 1 1 1 0 1 1 1
## 1 1 1 1 1 0 1 1 1 1
## 0 0 0 0 1 1 2 4 8
dat2_missingness3 <-(dplyr::select (dat2, OC_total, SS_total:CS_total))
#In the script below we create a variable that counts the number of missing variables and then creates a proportion by dividing it by the number of total variables.
#Create a variable (n_miss) that counts the number missing
dat2_missingness3$n_miss <- dat2_missingness3%>%
dplyr::select(OC_total, SS_total:CS_total) %>%
is.na %>%
rowSums
#Create a proportion missing by dividing n_miss by the number of total scores (again divided by 6, although seven totals are selected)
#Pipe to sort in order of descending frequency to get a sense of the missingness
dat2_missingness3<- dat2_missingness3%>%
dplyr::mutate(prop_miss = (n_miss/6)*100)%>%
arrange(desc(n_miss))
psych::describe(dat2_missingness3$prop_miss)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 312 0 0 0 0 0 0 0 0 NaN NaN 0
Across the 312 cases scored, the total scores show no missingness at all; as noted earlier, this is because rowSums() summed over whatever items were available (na.rm), returning a value even when items are missing.
# Retain cases with no more than 20% missing (all 312 cases pass, since no missingness was detected in the total scores)
fgpdata_scored2 <- dplyr::filter(dat2_missingness3, prop_miss <= 20)
# Select only the relevant columns (if necessary)
fgpdata_scored2 <- dplyr::select(fgpdata_scored2, OC_total, SS_total:CS_total)
# Save the cleaned data to a CSV file
#write.csv(fgpdata_scored2, file = "fgpdata_total_scores.csv", row.names = FALSE)
# Optionally, check the number of cases retained
num_cases_retained2 <- nrow(fgpdata_scored2)
print(num_cases_retained2)
## [1] 312
After applying the 20% missingness screen, the dataset was unchanged at 312 cases.
#percent missing across df
formattable::percent(mean(is.na(fgpdata_scored2)))
## [1] 0.00%
#percent of rows with nonmissing data
formattable::percent(mean(complete.cases(fgpdata_scored2)))
## [1] 100.00%
mice_ScaleLvl_fgp2 <- mice::md.pattern(fgpdata_scored2, plot = TRUE, rotate.names=TRUE)
## /\ /\
## { `---' }
## { O O }
## ==> V <== No need for mice. This data set is completely observed.
## \ \|/ /
## `-----'
mice_ScaleLvl_fgp2
## OC_total SS_total PSNQ_total BRS_total SBS_total PF_total CS_total
## 312 1 1 1 1 1 1 1 0
## 0 0 0 0 0 0 0 0
fgpdata_Subscales <-(dplyr::select (fgpdata_scored, OC_AVG, SS_AVG:CS_AVG))
#skew and kurtosis
descriptives <- psych::describe(fgpdata_Subscales, type = 1)
descriptives
## vars n mean sd median trimmed mad min max range skew kurtosis
## OC_AVG 1 253 3.67 0.94 4.00 3.72 0.74 1.00 5.50 4.50 -0.59 0.41
## SS_AVG 2 257 5.36 0.84 5.27 5.36 0.83 2.50 7.00 4.50 -0.16 0.07
## PSNQ_AVG 3 256 5.60 0.84 5.70 5.64 0.65 2.50 7.00 4.50 -0.76 0.99
## BRS_AVG 4 255 3.19 0.44 3.17 3.17 0.25 1.67 4.33 2.66 0.45 0.82
## SBS_AVG 5 257 3.71 0.67 3.57 3.69 0.64 1.86 5.00 3.14 0.14 -0.55
## PF_AVG 6 257 5.76 0.90 6.00 5.83 0.74 2.88 7.00 4.12 -0.79 0.32
## CS_AVG 7 256 4.02 0.72 4.20 4.08 0.59 1.40 5.00 3.60 -0.81 0.32
## se
## OC_AVG 0.06
## SS_AVG 0.05
## PSNQ_AVG 0.05
## BRS_AVG 0.03
## SBS_AVG 0.04
## PF_AVG 0.06
## CS_AVG 0.05
All skew values fall below |3.0| and all kurtosis values fall below 10.0, within the thresholds recommended by Kline (2016).
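A quick programmatic check of those cutoffs against the describe() output above (a sketch using the descriptives object created above):
any(abs(descriptives$skew) >= 3, na.rm = TRUE)      # expect FALSE
any(abs(descriptives$kurtosis) >= 10, na.rm = TRUE) # expect FALSE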
#normality
shapiro.test(fgpdata_Subscales$SS_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$SS_AVG
## W = 0.98729, p-value = 0.0226
shapiro.test(fgpdata_Subscales$PSNQ_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$PSNQ_AVG
## W = 0.95562, p-value = 0.0000004639
shapiro.test(fgpdata_Subscales$BRS_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$BRS_AVG
## W = 0.93818, p-value = 0.000000007157
shapiro.test(fgpdata_Subscales$SBS_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$SBS_AVG
## W = 0.97696, p-value = 0.0003519
shapiro.test(fgpdata_Subscales$PF_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$PF_AVG
## W = 0.93998, p-value = 0.000000009607
shapiro.test(fgpdata_Subscales$CS_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$CS_AVG
## W = 0.93674, p-value = 0.000000004962
shapiro.test(fgpdata_Subscales$OC_AVG)
##
## Shapiro-Wilk normality test
##
## data: fgpdata_Subscales$OC_AVG
## W = 0.9488, p-value = 0.00000009337
A Shapiro-Wilk p-value below .05 indicates that a variable's distribution deviates significantly from a normal distribution. By that criterion, every scale score here departs from normality.
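The same tests can be run in one pass with purrr (already loaded with the tidyverse); a compact sketch that returns just the p-values:
fgpdata_Subscales %>%
  purrr::map(shapiro.test) %>%
  purrr::map_dbl("p.value")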
#histograms and scatterplot matrix (pairs.panels)
psych::pairs.panels(fgpdata_Subscales[c("OC_AVG", "SS_AVG", "PSNQ_AVG", "BRS_AVG", "SBS_AVG", "PF_AVG", "CS_AVG" )], stars = TRUE, lm = TRUE)
#Mahalanobis distance test
fgpdata_Subscales$Mahal <- psych::outlier(fgpdata_Subscales[c("OC_AVG", "SS_AVG", "PSNQ_AVG", "BRS_AVG", "SBS_AVG", "PF_AVG", "CS_AVG" )])
psych::describe(fgpdata_Subscales$Mahal)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 257 6.96 6.74 4.82 5.67 3.44 0.69 51.33 50.64 3 11.83 0.42
# create a variable indicating TRUE or FALSE for whether a case is a
# multivariate outlier (Mahalanobis distance more than 3 SD above the median)
fgpdata_scored$MOutlier <- dplyr::if_else(fgpdata_Subscales$Mahal > (median(fgpdata_Subscales$Mahal) +
(3 * sd(fgpdata_Subscales$Mahal))), TRUE, FALSE)
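For comparison, a common alternative cutoff (not the criterion used here) flags cases whose squared Mahalanobis distance exceeds the .999 quantile of a chi-square distribution with degrees of freedom equal to the number of variables; a sketch:
chisq_cutoff <- qchisq(0.999, df = 7)  # 7 scale scores
sum(fgpdata_Subscales$Mahal > chisq_cutoff, na.rm = TRUE)  # how many cases would be flagged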
# show the first 6 rows of the data so we can see the new
# MOutlier variable (the Mahal distances themselves live in fgpdata_Subscales)
head(fgpdata_scored)
## # A tibble: 6 × 9
## OC_AVG SS_AVG PSNQ_AVG BRS_AVG SBS_AVG PF_AVG CS_AVG ID MOutlier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <lgl>
## 1 3.5 4 NA 3 3.14 3.88 3.25 1 FALSE
## 2 4 5.5 5.9 3.17 4.43 6 NA 2 FALSE
## 3 NA 5.75 5.33 3 3.57 5.12 4.2 3 FALSE
## 4 3 3.08 2.9 NA 2.57 3 2.4 4 FALSE
## 5 NA 4 4.8 2.33 3.14 3.62 3.2 5 FALSE
## 6 NA 4.82 5.6 3.33 3.86 6 4 6 FALSE
# Count the number of outliers and non-outliers
OutlierCount <- fgpdata_scored %>%
dplyr::count(MOutlier)
# Number of outliers
num_outliers <- OutlierCount %>% filter(MOutlier == TRUE) %>% pull(n)
# Number of non-outliers
num_non_outliers <- OutlierCount %>% filter(MOutlier == FALSE) %>% pull(n)
# Alternatively, calculate directly
num_outliers <- sum(fgpdata_scored$MOutlier == TRUE)
num_non_outliers <- sum(fgpdata_scored$MOutlier == FALSE)
# Print the results
cat("Number of outliers:", num_outliers, "\n")
## Number of outliers: 8
cat("Number of non-outliers:", num_non_outliers, "\n")
## Number of non-outliers: 249
At this stage a visual inspection of the data was also conducted. Removing the 8 cases flagged as multivariate outliers leaves 249 observations, as shown below.
# Create a new dataframe with only the non-outliers (assuming non-outliers are marked as FALSE in the MOutlier column)
non_outliers_df <- fgpdata_scored %>%
dplyr::filter(MOutlier == FALSE)
# View the first few rows of the non-outliers dataframe
head(non_outliers_df)
## # A tibble: 6 × 9
## OC_AVG SS_AVG PSNQ_AVG BRS_AVG SBS_AVG PF_AVG CS_AVG ID MOutlier
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <lgl>
## 1 3.5 4 NA 3 3.14 3.88 3.25 1 FALSE
## 2 4 5.5 5.9 3.17 4.43 6 NA 2 FALSE
## 3 NA 5.75 5.33 3 3.57 5.12 4.2 3 FALSE
## 4 3 3.08 2.9 NA 2.57 3 2.4 4 FALSE
## 5 NA 4 4.8 2.33 3.14 3.62 3.2 5 FALSE
## 6 NA 4.82 5.6 3.33 3.86 6 4 6 FALSE
# Optionally, check the number of rows to confirm it has 249 observations
cat("Number of non-outliers:", nrow(non_outliers_df), "\n")
## Number of non-outliers: 249
# Count the number of outliers and non-outliers
OutlierCount <- non_outliers_df %>%
dplyr::count(MOutlier)
# Number of outliers
num_outliers <- OutlierCount %>% filter(MOutlier == TRUE) %>% pull(n)
# Number of non-outliers
num_non_outliers <- OutlierCount %>% filter(MOutlier == FALSE) %>% pull(n)
# Alternatively, calculate directly
num_outliers <- sum(non_outliers_df$MOutlier == TRUE)
num_non_outliers <- sum(non_outliers_df$MOutlier == FALSE)
# Print the results
cat("Number of outliers:", num_outliers, "\n")
## Number of outliers: 0
cat("Number of non-outliers:", num_non_outliers, "\n")
## Number of non-outliers: 249
# Assuming both data frames have a common key, such as "ID"
merged_df <- merge(non_outliers_df, dat2, by = "ID")
# Select only the desired columns
fgpdata_dem_avg <- merged_df[, c(
"OC_AVG.x", "SBS_AVG.x", "SS_AVG.x", "BRS_AVG.x", "PSNQ_AVG.x", "PF_AVG.x", "CS_AVG.x",
"employment_status", "employment_status_TEXT", "PD", "SS_13_TEXT", "age", "year_education",
"education", "education_9_TEXT", "religion", "religion_15_TEXT", "income",
"fam_income", "race", "race_10_TEXT", "US_born", "sexual_orientation",
"sexual_orientation_9_TEXT", "gender", "gender_8_TEXT"
)]
names(fgpdata_dem_avg) <- c(
"OC_AVG", "SBS_AVG", "SS_AVG", "BRS_AVG", "PSNQ_AVG", "PF_AVG", "CS_AVG",
"employment_status", "employment_status_TEXT", "PD", "SS_13_TEXT", "age",
"year_education", "education", "education_9_TEXT", "religion",
"religion_15_TEXT", "income", "fam_income", "race", "race_10_TEXT",
"US_born", "sexual_orientation", "sexual_orientation_9_TEXT",
"gender", "gender_8_TEXT"
)
write.csv(fgpdata_dem_avg, "fgpdata_dem_avg.csv")