Question 1

Create a list “my_list” that contains two elements, A = c(1:5, 7:3) and B = matrix(1:6, nrow = 2), then use the lapply() function to answer the following questions:

# Build the two elements, then bundle them into a named list.
A <- c(1:5, 7:3)
B <- matrix(1:6, nrow = 2, ncol = 3)
my_list <- list(A = A, B = B)

a. Find the length for each of my_list’s elements

# Number of values in each element (for the matrix B this counts every cell).
lapply(X = my_list, FUN = length)
## $A
## [1] 10
## 
## $B
## [1] 6


b. Find the sum for each of my_list’s elements

# Total of all values in each element.
lapply(X = my_list, FUN = sum)
## $A
## [1] 40
## 
## $B
## [1] 21

Question 2

Answer the following questions:
a. Use the apply() function to create boxplots for the first four columns of the “iris” dataset.

# Lay the four plots out on a 2 x 2 grid.
par(mfrow = c(2, 2))
# MARGIN = 2 applies boxplot() column by column; the returned boxplot
# statistics are printed as a list with one entry per column.
apply(X = iris[, 1:4], MARGIN = 2, FUN = boxplot)

## $Sepal.Length
## $Sepal.Length$stats
##      [,1]
## [1,]  4.3
## [2,]  5.1
## [3,]  5.8
## [4,]  6.4
## [5,]  7.9
## 
## $Sepal.Length$n
## [1] 150
## 
## $Sepal.Length$conf
##          [,1]
## [1,] 5.632292
## [2,] 5.967708
## 
## $Sepal.Length$out
## numeric(0)
## 
## $Sepal.Length$group
## numeric(0)
## 
## $Sepal.Length$names
## [1] "1"
## 
## 
## $Sepal.Width
## $Sepal.Width$stats
##      [,1]
## [1,]  2.2
## [2,]  2.8
## [3,]  3.0
## [4,]  3.3
## [5,]  4.0
## 
## $Sepal.Width$n
## [1] 150
## 
## $Sepal.Width$conf
##          [,1]
## [1,] 2.935497
## [2,] 3.064503
## 
## $Sepal.Width$out
## [1] 4.4 4.1 4.2 2.0
## 
## $Sepal.Width$group
## [1] 1 1 1 1
## 
## $Sepal.Width$names
## [1] "1"
## 
## 
## $Petal.Length
## $Petal.Length$stats
##      [,1]
## [1,] 1.00
## [2,] 1.60
## [3,] 4.35
## [4,] 5.10
## [5,] 6.90
## 
## $Petal.Length$n
## [1] 150
## 
## $Petal.Length$conf
##          [,1]
## [1,] 3.898477
## [2,] 4.801523
## 
## $Petal.Length$out
## numeric(0)
## 
## $Petal.Length$group
## numeric(0)
## 
## $Petal.Length$names
## [1] "1"
## 
## 
## $Petal.Width
## $Petal.Width$stats
##      [,1]
## [1,]  0.1
## [2,]  0.3
## [3,]  1.3
## [4,]  1.8
## [5,]  2.5
## 
## $Petal.Width$n
## [1] 150
## 
## $Petal.Width$conf
##         [,1]
## [1,] 1.10649
## [2,] 1.49351
## 
## $Petal.Width$out
## numeric(0)
## 
## $Petal.Width$group
## numeric(0)
## 
## $Petal.Width$names
## [1] "1"

b. Do you observe any outliers for each variable? If so, remove the outliers and present boxplots again.

# For each of the four measurement columns, flag the values that
# boxplot.stats() reports as outliers; keep only the rows that are not
# flagged in any column.
clean <- iris[
  !Reduce(
    `|`,
    lapply(iris[1:4], function(col) col %in% boxplot.stats(col)$out)
  ),
]
# Redraw the four boxplots on a 2 x 2 grid using the filtered rows.
par(mfrow = c(2, 2))
lapply(clean[1:4], boxplot)

## $Sepal.Length
## $Sepal.Length$stats
##      [,1]
## [1,]  4.3
## [2,]  5.1
## [3,]  5.8
## [4,]  6.4
## [5,]  7.9
## 
## $Sepal.Length$n
## [1] 146
## 
## $Sepal.Length$conf
##         [,1]
## [1,] 5.63001
## [2,] 5.96999
## 
## $Sepal.Length$out
## numeric(0)
## 
## $Sepal.Length$group
## numeric(0)
## 
## $Sepal.Length$names
## [1] "1"
## 
## 
## $Sepal.Width
## $Sepal.Width$stats
##      [,1]
## [1,]  2.2
## [2,]  2.8
## [3,]  3.0
## [4,]  3.3
## [5,]  4.0
## 
## $Sepal.Width$n
## [1] 146
## 
## $Sepal.Width$conf
##          [,1]
## [1,] 2.934619
## [2,] 3.065381
## 
## $Sepal.Width$out
## numeric(0)
## 
## $Sepal.Width$group
## numeric(0)
## 
## $Sepal.Width$names
## [1] "1"
## 
## 
## $Petal.Length
## $Petal.Length$stats
##      [,1]
## [1,]  1.0
## [2,]  1.6
## [3,]  4.4
## [4,]  5.1
## [5,]  6.9
## 
## $Petal.Length$n
## [1] 146
## 
## $Petal.Length$conf
##          [,1]
## [1,] 3.942334
## [2,] 4.857666
## 
## $Petal.Length$out
## numeric(0)
## 
## $Petal.Length$group
## numeric(0)
## 
## $Petal.Length$names
## [1] "1"
## 
## 
## $Petal.Width
## $Petal.Width$stats
##      [,1]
## [1,]  0.1
## [2,]  0.3
## [3,]  1.3
## [4,]  1.8
## [5,]  2.5
## 
## $Petal.Width$n
## [1] 146
## 
## $Petal.Width$conf
##          [,1]
## [1,] 1.103857
## [2,] 1.496143
## 
## $Petal.Width$out
## numeric(0)
## 
## $Petal.Width$group
## numeric(0)
## 
## $Petal.Width$names
## [1] "1"

c. Apply the shapiro.test() function using the apply() function to the first four columns of the “iris” dataset. According to the test results, which variables violate the normality assumption?

# Shapiro-Wilk p-value for each of the four measurement columns.
pvals <- vapply(
  iris[, 1:4],
  function(col) shapiro.test(col)$p.value,
  numeric(1)
)

# Variables whose p-value falls below 0.05, i.e. normality is rejected.
names(pvals)[pvals < 0.05]
## [1] "Sepal.Length" "Petal.Length" "Petal.Width"

d. For the non-normal variables identified in part c, use the plot_normality() function in the dlookr package to assess whether log and square-root transformations resolve the non-normality issue.

# Variables flagged as non-normal by the Shapiro-Wilk tests in part c.
vars <- c("Sepal.Length", "Petal.Length", "Petal.Width")

# TRUE when the Shapiro-Wilk test rejects normality of `x` at the 5% level.
still_non_normal <- function(x) {
  shapiro.test(x)$p.value < 0.05
}

# Names of the variables that remain non-normal after each transformation.
log_violations <- Filter(function(v) still_non_normal(log(iris[[v]])), vars)
sqrt_violations <- Filter(function(v) still_non_normal(sqrt(iris[[v]])), vars)

log_violations
## [1] "Petal.Length" "Petal.Width"
sqrt_violations
## [1] "Sepal.Length" "Petal.Length" "Petal.Width"

Question 3

Use the “nhanes” data in the mice R package to answer the following questions:
a. Use the aggr() function in the VIM package to assess the rate and pattern of missing data.

# Summarise the missing values in nhanes with aggr().
aggr(x = nhanes)

b. Use the mice() function to impute missing data for 10 items (set the imputation method to pmm and the seed to 122).

# Fix the random seed so the imputations can be reproduced.
set.seed(122)
# Run mice() with m = 10 and the "pmm" method, as the question asks.
imputed_nhanes <- mice(
  data = nhanes,
  method = "pmm",
  m = 10,
  seed = 122
)
## 
##  iter imp variable
##   1   1  bmi  hyp  chl
##   1   2  bmi  hyp  chl
##   1   3  bmi  hyp  chl
##   1   4  bmi  hyp  chl
##   1   5  bmi  hyp  chl
##   1   6  bmi  hyp  chl
##   1   7  bmi  hyp  chl
##   1   8  bmi  hyp  chl
##   1   9  bmi  hyp  chl
##   1   10  bmi  hyp  chl
##   2   1  bmi  hyp  chl
##   2   2  bmi  hyp  chl
##   2   3  bmi  hyp  chl
##   2   4  bmi  hyp  chl
##   2   5  bmi  hyp  chl
##   2   6  bmi  hyp  chl
##   2   7  bmi  hyp  chl
##   2   8  bmi  hyp  chl
##   2   9  bmi  hyp  chl
##   2   10  bmi  hyp  chl
##   3   1  bmi  hyp  chl
##   3   2  bmi  hyp  chl
##   3   3  bmi  hyp  chl
##   3   4  bmi  hyp  chl
##   3   5  bmi  hyp  chl
##   3   6  bmi  hyp  chl
##   3   7  bmi  hyp  chl
##   3   8  bmi  hyp  chl
##   3   9  bmi  hyp  chl
##   3   10  bmi  hyp  chl
##   4   1  bmi  hyp  chl
##   4   2  bmi  hyp  chl
##   4   3  bmi  hyp  chl
##   4   4  bmi  hyp  chl
##   4   5  bmi  hyp  chl
##   4   6  bmi  hyp  chl
##   4   7  bmi  hyp  chl
##   4   8  bmi  hyp  chl
##   4   9  bmi  hyp  chl
##   4   10  bmi  hyp  chl
##   5   1  bmi  hyp  chl
##   5   2  bmi  hyp  chl
##   5   3  bmi  hyp  chl
##   5   4  bmi  hyp  chl
##   5   5  bmi  hyp  chl
##   5   6  bmi  hyp  chl
##   5   7  bmi  hyp  chl
##   5   8  bmi  hyp  chl
##   5   9  bmi  hyp  chl
##   5   10  bmi  hyp  chl
# Overview of the imputation object (methods and predictor matrix).
summary(object = imputed_nhanes)
## Class: mids
## Number of multiple imputations:  10 
## Imputation methods:
##   age   bmi   hyp   chl 
##    "" "pmm" "pmm" "pmm" 
## PredictorMatrix:
##     age bmi hyp chl
## age   0   1   1   1
## bmi   1   0   1   1
## hyp   1   1   0   1
## chl   1   1   1   0

c. Extract and print out the 10th imputed data set

# Pull completed data set number 10 out of the imputation object.
nhanes_imputed10 <- complete(data = imputed_nhanes, action = 10)

# Show every row of that data set.
print(nhanes_imputed10)
##    age  bmi hyp chl
## 1    1 22.7   1 113
## 2    2 22.7   1 187
## 3    1 30.1   1 187
## 4    3 22.7   1 186
## 5    1 20.4   1 113
## 6    3 21.7   1 184
## 7    1 22.5   1 118
## 8    1 30.1   1 187
## 9    2 22.0   1 238
## 10   2 20.4   1 187
## 11   1 22.0   1 113
## 12   2 27.2   1 229
## 13   3 21.7   1 206
## 14   2 28.7   2 204
## 15   1 29.6   1 187
## 16   1 22.5   1 187
## 17   3 27.2   2 284
## 18   2 26.3   2 199
## 19   1 35.3   1 218
## 20   3 25.5   2 206
## 21   1 30.1   1 187
## 22   1 33.2   1 229
## 23   1 27.5   1 131
## 24   3 24.9   1 218
## 25   2 27.4   1 186