library(rmarkdown); library(knitr)
library(dplyr)
library(psych); library(corrplot); library(GPArotation)
library(lavaan); #library(semPlot); library(moments)
HUNGERDATA <- read.csv("https://www.dropbox.com/s/znpumq7o1fghlfb/?dl=1")
attach(HUNGERDATA)
set.seed(37)
Find a 95% confidence interval for the mean difference in BOXES between cases with missing values and those without missing values
set.seed(37)
#Create arrays of BOXES where data is missing and BOXES where data is not missing
missCases <- BOXES[is.na(HUNGERDATA$KIDSFED)]
compCases <- BOXES[complete.cases(KIDSFED)]
#Bootstrap the CI for the difference between the mean of BOXES w/ and w/out missing data
meanDiffs <- replicate(1000, {
mean((slice_sample(data.frame(compCases), n = length(compCases), replace = T))[,1]) - mean((slice_sample(data.frame(missCases), n = length(missCases), replace = T))[,1])
})
quantile(meanDiffs, c(0.025, 0.975))
## 2.5% 97.5%
## -34.34444 42.45556
Explain if the interval above implied that MCAR (missing completely at random) is likely
The confidence interval contains 0, so MCAR seems likely.
Impute the missing values of KIDSFED using mean substitution
meanSubKidsFed <- KIDSFED
#Replace NAs with the mean of the data which is not missing
meanSubKidsFed[is.na(KIDSFED)] <- mean(KIDSFED, na.rm = T)
meanSubKidsFed
## [1] 15.00000 34.33333 54.00000 34.33333 15.00000 12.00000 25.00000 27.00000
## [9] 34.33333 18.00000 93.00000 50.00000
Compute the missing values of KIDSFED using group mean substitution based on DAY
days <- 15:21
#Calculate the mean of KIDSFED for each day
dayMeans <- c()
for (day in days) {
dayMeans <- c(dayMeans, mean(subset(KIDSFED, subset = (DAY == day)), na.rm = T))
}
#Substitute the missing data with the corresponding daily mean of KIDSFED
meanSubKidsFedByDay <- KIDSFED
pos <- 0
for (kid in KIDSFED) {
pos <- pos + 1
if(is.na(kid)) {
currDay <- DAY[pos]
meanSubKidsFedByDay[pos] <- dayMeans[currDay-14]
}
}
meanSubKidsFedByDay
## [1] 15.0 73.5 54.0 25.0 15.0 12.0 25.0 27.0 50.0 18.0 93.0 50.0
Compute the missing values of KIDSFED using hot deck imputation
hotDeckImp <- c()
set.seed(37)
#Replace NAs with a random value from the data that exists
for (kid in KIDSFED) {
if(is.na(kid)) {
randKid <- compCases[runif(1, 1, length(compCases))]
hotDeckImp <- c(hotDeckImp, randKid)
} else {
hotDeckImp <- c(hotDeckImp, kid)
}
}
hotDeckImp
## [1] 15 42 54 25 15 12 25 27 45 18 93 50
Compute the missing values of KIDSFED using cold deck imputation based on the previous months’ values
coldDeckImp <- c()
set.seed(37)
#Replace NAs with the value from the previous month
pos <- 0
for (kid in KIDSFED) {
pos <- pos + 1
if(is.na(kid)) {
coldDeckImp <- c(coldDeckImp, KIDSFED[pos - 1])
} else {
coldDeckImp <- c(coldDeckImp, kid)
}
}
coldDeckImp
## [1] 15 15 54 54 15 12 25 27 27 18 93 50
Compute the missing values of KIDSFED using regression substitution based on BOXES
subModel <- lm(KIDSFED ~ BOXES)
#coef(subModel)
B0 <- unname(coef(subModel))[1]
B1 <- unname(coef(subModel))[2]
regressedKids <- c()
pos <- 0
#Replace NAs with a value projected from a linear regression on BOXES
for (kid in KIDSFED) {
pos <- pos + 1
if(is.na(kid)) {
boxedKid <- B0 + BOXES[pos]*B1
regressedKids <- c(regressedKids, boxedKid)
} else {
regressedKids <- c(regressedKids, kid)
}
}
regressedKids
## [1] 15.00000 27.26277 54.00000 48.66217 15.00000 12.00000 25.00000 27.00000
## [9] 19.37878 18.00000 93.00000 50.00000
Use cbind() along with cor() to compute the missing data (pairwise observation) correlation matrix, R, for the interaction between VOLUNTEERS, BOXES, MEALSPERVOL, KIDSFED
x <- cbind(VOLUNTEERS, BOXES, MEALSPERVOL, KIDSFED)
cor(x, use = "pairwise.complete.obs")
## VOLUNTEERS BOXES MEALSPERVOL KIDSFED
## VOLUNTEERS 1.0000000 0.9192350 -0.4664163 0.9212326
## BOXES 0.9192350 1.0000000 -0.1732300 0.9994508
## MEALSPERVOL -0.4664163 -0.1732300 1.0000000 -0.1537829
## KIDSFED 0.9212326 0.9994508 -0.1537829 1.0000000
Use cbind() along with cor() to compute the complete observation correlation matrix, R, for the interaction between VOLUNTEERS, BOXES, MEALSPERVOL, KIDSFED
x <- cbind(VOLUNTEERS, BOXES, MEALSPERVOL, KIDSFED)
cor(x, use = "complete.obs")
## VOLUNTEERS BOXES MEALSPERVOL KIDSFED
## VOLUNTEERS 1.0000000 0.9254271 -0.4704469 0.9212326
## BOXES 0.9254271 1.0000000 -0.1548780 0.9994508
## MEALSPERVOL -0.4704469 -0.1548780 1.0000000 -0.1537829
## KIDSFED 0.9212326 0.9994508 -0.1537829 1.0000000
Describe how the complete observation correlation matrix above compares to the missing data correlation matrix for the interaction between VOLUNTEERS, BOXES, MEALSPERVOL, KIDSFED
The correlations with KIDSFED remains the same because they account for all available data of the KIDSFED variable. The rest of the correlations change slightly because they lose some data when pairwise correlation is not used.
Compute the z-score for each value in VOLUNTEERS
volVScores <- (VOLUNTEERS-mean(VOLUNTEERS))/sd(VOLUNTEERS)
volVScores
## [1] -0.930447516 0.056966174 0.006329575 0.284830872 -0.727901118
## [6] -1.031720715 -0.424081521 -0.246853423 -0.727901118 0.234194273
## [11] 2.614114449 0.892470066
Explain what the z-score above mean in terms of the presence or absence of univariate outliers among these cases
There is one value, 154, which is 2.6 standard deviations above the mean. The rest of the data roughly within 1 standard deviation.
Use boxplot() to generate a box plot of the distribution of VOLUNTEERS
boxplot(VOLUNTEERS, horizontal = T)
Explain if the box plot above supports your earlier conclusion regarding the presence or absence of univariate outliers
It supports the idea that there is an outlier because there is a point beyond the whiskers.
Use plot() to generate a scatter plot of the association between VOLUNTEERS and MEALSPERVOL
plot(VOLUNTEERS, MEALSPERVOL)
Explain what the scatter plot above means in terms of the presence or absence of multivariate outliers among these cases
There seems to be an outlier in an otherwise negatively correlated, weak, linear relationship.
Use mahalanobis() to compute the mahalanobis distance for the most outlying case (November) based on the interaction between VOLUNTEERS and MEALSPERVOL
x <- c(subset(HUNGERDATA, subset = MONTH == "November")$VOLUNTEERS, subset(HUNGERDATA, subset = MONTH == "November")$MEALSPERVOL)
mdist <- mahalanobis(x, center = c(mean(VOLUNTEERS), mean(MEALSPERVOL)), cov = cov(cbind(VOLUNTEERS, MEALSPERVOL)))
mdist
## [1] 7.296471
Use qchisq() to compute the critical threshold at a 95% confidence level for the mahalanobis distance based on the interaction between VOLUNTEERS and MEALSPERVOL
mdist > qchisq(0.95, df = 1, ) #If true, then significant (outlier)
## [1] TRUE
Explain what the mahalanobis distance and corresponding critical threshold above mean in terms of whether or not this case is a multivariate outlier
The chi square test of the mahalanobis distance shows that the distance is statistically significant and therefore November is a multivariate outlier.
Explain if the conclusion above supports your earlier conclusion regarding the presence of absence of multivariate outliers
Yes, it supports my earlier conclusion.