Missing Data Analysis for RAPD Student Data

# Make the project folder the working directory so that the relative data-file
# path used later in this script resolves.
# NOTE(review): this is a hard-coded absolute path from one machine; the script
# will stop here on any computer without this exact folder. Consider an RStudio
# project or project-relative paths instead of setwd().
setwd("C:/Work Files/Collaboration/Andi/RAPD Student Data")


library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(mice)

## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(naniar)
library(Amelia)

## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.2, built: 2024-04-10)
## ## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

library(missForest)
library(haven)
library(misty)

## |-------------------------------------|
## | misty 0.6.5 (2024-06-29)            |
## | Miscellaneous Functions T. Yanagida |
## |-------------------------------------|

library(ggplot2)

# Import the merged Stata (.dta) file; the path is relative to the working
# directory set at the top of the script.
RAPD_Student_Data <- read_dta(file = "Eisman_merged_appended_7.10_N346.dta")

This block of code cleans the file and splits the variables into blocks so that the missing data visualizations are easier to read. Multiple variables were removed to create the RAPD_Student_Data_Clean dataset. Some variables, such as ID, are not relevant for missing data analysis. Others appeared to be placeholders in the code (e.g., xxxxx_group_demos_xxxxx). Finally, several variables had no variance (age_t1, age_t2, fuzzy, profhelp1_t2_0, profhelp1_t2_1).

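Eleven blocks of variables were created for the visualizations so that the variable names in the graphs are easier to read. Most blocks contain 20 variables each; Block 8 contains 40 variables and Block 11 contains the remaining 17.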

# Remove unwanted columns by position, in two passes.
# Pass 1 drops columns 1, 3, 4, 15, 16, 138 and 249 through 264 of the raw data.
RAPD_Student_Data_Clean <- RAPD_Student_Data[
  ,
  -c(1, 3, 4, 15, 16, 138, 249:264)
]
# Pass 2 indexes into the result of pass 1, so these positions refer to the
# already-reduced data set, not to the raw file.
# NOTE(review): positional indices silently select different variables if the
# column order of the input file ever changes; verify against the variable names.
RAPD_Student_Data_Clean <- RAPD_Student_Data_Clean[, -c(3, 4, 11, 171, 172)]

# Slice the cleaned data into consecutive column blocks so each plot shows a
# readable number of variable names. Blocks are 20 columns wide except
# Block 8 (columns 141-180, i.e. 40 columns) and Block 11 (columns 221-237,
# i.e. 17 columns).
RAPD_Block_1  <- RAPD_Student_Data_Clean[, 1:20]
RAPD_Block_2  <- RAPD_Student_Data_Clean[, 21:40]
RAPD_Block_3  <- RAPD_Student_Data_Clean[, 41:60]
RAPD_Block_4  <- RAPD_Student_Data_Clean[, 61:80]
RAPD_Block_5  <- RAPD_Student_Data_Clean[, 81:100]
RAPD_Block_6  <- RAPD_Student_Data_Clean[, 101:120]
RAPD_Block_7  <- RAPD_Student_Data_Clean[, 121:140]
RAPD_Block_8  <- RAPD_Student_Data_Clean[, 141:180]
RAPD_Block_9  <- RAPD_Student_Data_Clean[, 181:200]
RAPD_Block_10 <- RAPD_Student_Data_Clean[, 201:220]
RAPD_Block_11 <- RAPD_Student_Data_Clean[, 221:237]

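This block of code runs the missing data visualizations (vis_miss, gg_miss_var, and gg_miss_upset) for each block of variables. Little's MCAR test is run in the section that follows.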

# Draw one vis_miss() plot per block, in block order. print() is explicit
# because plots are not auto-printed from inside lapply(); invisible() hides
# the list that lapply() returns.
invisible(lapply(
  list(
    RAPD_Block_1, RAPD_Block_2, RAPD_Block_3, RAPD_Block_4,
    RAPD_Block_5, RAPD_Block_6, RAPD_Block_7, RAPD_Block_8,
    RAPD_Block_9, RAPD_Block_10, RAPD_Block_11
  ),
  function(block) print(vis_miss(block))
))

# Draw one gg_miss_var() plot per block, in block order. print() is explicit
# because plots are not auto-printed from inside lapply(); invisible() hides
# the list that lapply() returns.
invisible(lapply(
  list(
    RAPD_Block_1, RAPD_Block_2, RAPD_Block_3, RAPD_Block_4,
    RAPD_Block_5, RAPD_Block_6, RAPD_Block_7, RAPD_Block_8,
    RAPD_Block_9, RAPD_Block_10, RAPD_Block_11
  ),
  function(block) print(gg_miss_var(block))
))

# Draw one gg_miss_upset() plot per block, in block order. print() is explicit
# because plots are not auto-printed from inside lapply(); invisible() hides
# the list that lapply() returns.
invisible(lapply(
  list(
    RAPD_Block_1, RAPD_Block_2, RAPD_Block_3, RAPD_Block_4,
    RAPD_Block_5, RAPD_Block_6, RAPD_Block_7, RAPD_Block_8,
    RAPD_Block_9, RAPD_Block_10, RAPD_Block_11
  ),
  function(block) print(gg_miss_upset(block))
))

This block of code runs Little's MCAR test separately for each block. The test was significant for every block tested except Blocks 3 and 7. Block 8 was not tested (its call is commented out) because it had numerical integration issues caused by perfect collinearity among some of its variables.

# Little's MCAR test, run one block at a time with na.test() (misty package).
# The lines starting with "##" are the console output recorded when the script
# was run; pval is the p-value of the chi-square test for that block.
na.test(RAPD_Block_1)

##  Little's MCAR Test
## 
##     n nIncomp nPattern   chi2  df  pval 
##   346      27       12 292.43 185 0.000

na.test(RAPD_Block_2)

##  Little's MCAR Test
## 
##     n nIncomp nPattern    chi2  df  pval 
##   346     342       34 1972.47 478 0.000

# Block 3: recorded p-value is 0.865, i.e. not significant.
na.test(RAPD_Block_3)

##  Little's MCAR Test
## 
##     n nIncomp nPattern   chi2  df  pval 
##   346      42       21 256.96 283 0.865

na.test(RAPD_Block_4)

##  Little's MCAR Test
## 
##     n nIncomp nPattern   chi2  df  pval 
##   346     335       25 392.16 315 0.002

na.test(RAPD_Block_5)

##  Little's MCAR Test
## 
##     n nIncomp nPattern   chi2  df  pval 
##   346     328       17 245.82 189 0.003

na.test(RAPD_Block_6)

##  Little's MCAR Test
## 
##     n nIncomp nPattern   chi2  df  pval 
##   346     188       17 555.93 264 0.000

# Block 7: recorded p-value is 0.959, i.e. not significant.
na.test(RAPD_Block_7)

##  Little's MCAR Test
## 
##     n nIncomp nPattern   chi2  df  pval 
##   346     330       43 545.72 605 0.959

# Block 8 is intentionally skipped: per the author's notes the test failed on
# this block because some of its variables are perfectly collinear.
#na.test(RAPD_Block_8)
na.test(RAPD_Block_9)

##  Little's MCAR Test
## 
##     n nIncomp nPattern   chi2  df  pval 
##   346     266       15 253.35 180 0.000

na.test(RAPD_Block_10)

##  Little's MCAR Test
## 
##     n nIncomp nPattern   chi2  df  pval 
##   346     284       14 415.59 190 0.000

na.test(RAPD_Block_11)

##  Little's MCAR Test
## 
##     n nIncomp nPattern   chi2  df  pval 
##   346     324       42 696.86 429 0.000

Missing Data Analysis for RAPD Student Data

Ty

2024-08-03