knitr::opts_chunk$set(echo = TRUE)

library(dlookr)
library(mice)
library(VIM)

Question 1

Create my_list

my_list <- list(
  A = c(1:5, 7:3),
  B = matrix(1:6, nrow = 2)
)

my_list
## $A
##  [1] 1 2 3 4 5 7 6 5 4 3
## 
## $B
##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6

1a. Length of each element

lapply(my_list, length)
## $A
## [1] 10
## 
## $B
## [1] 6

1b. Sum of each element

lapply(my_list, sum)
## $A
## [1] 40
## 
## $B
## [1] 21

Question 2

2a. Boxplots of First Four Columns of iris

apply(iris[, 1:4], 2, boxplot)

## $Sepal.Length
## $Sepal.Length$stats
##      [,1]
## [1,]  4.3
## [2,]  5.1
## [3,]  5.8
## [4,]  6.4
## [5,]  7.9
## 
## $Sepal.Length$n
## [1] 150
## 
## $Sepal.Length$conf
##          [,1]
## [1,] 5.632292
## [2,] 5.967708
## 
## $Sepal.Length$out
## numeric(0)
## 
## $Sepal.Length$group
## numeric(0)
## 
## $Sepal.Length$names
## [1] "1"
## 
## 
## $Sepal.Width
## $Sepal.Width$stats
##      [,1]
## [1,]  2.2
## [2,]  2.8
## [3,]  3.0
## [4,]  3.3
## [5,]  4.0
## 
## $Sepal.Width$n
## [1] 150
## 
## $Sepal.Width$conf
##          [,1]
## [1,] 2.935497
## [2,] 3.064503
## 
## $Sepal.Width$out
## [1] 4.4 4.1 4.2 2.0
## 
## $Sepal.Width$group
## [1] 1 1 1 1
## 
## $Sepal.Width$names
## [1] "1"
## 
## 
## $Petal.Length
## $Petal.Length$stats
##      [,1]
## [1,] 1.00
## [2,] 1.60
## [3,] 4.35
## [4,] 5.10
## [5,] 6.90
## 
## $Petal.Length$n
## [1] 150
## 
## $Petal.Length$conf
##          [,1]
## [1,] 3.898477
## [2,] 4.801523
## 
## $Petal.Length$out
## numeric(0)
## 
## $Petal.Length$group
## numeric(0)
## 
## $Petal.Length$names
## [1] "1"
## 
## 
## $Petal.Width
## $Petal.Width$stats
##      [,1]
## [1,]  0.1
## [2,]  0.3
## [3,]  1.3
## [4,]  1.8
## [5,]  2.5
## 
## $Petal.Width$n
## [1] 150
## 
## $Petal.Width$conf
##         [,1]
## [1,] 1.10649
## [2,] 1.49351
## 
## $Petal.Width$out
## numeric(0)
## 
## $Petal.Width$group
## numeric(0)
## 
## $Petal.Width$names
## [1] "1"

2b. Remove Outliers and Replot

remove_outliers_rows <- function(df) {
  Q1 <- apply(df, 2, quantile, 0.25)
  Q3 <- apply(df, 2, quantile, 0.75)
  IQR_val <- Q3 - Q1
  
  lower_bound <- Q1 - 1.5 * IQR_val
  upper_bound <- Q3 + 1.5 * IQR_val
  
  df_clean <- df[
    apply(df, 1, function(row)
      all(row >= lower_bound & row <= upper_bound)
    ),
  ]
  
  return(df_clean)
}

iris_clean <- remove_outliers_rows(iris[, 1:4])

apply(iris_clean, 2, boxplot)

## $Sepal.Length
## $Sepal.Length$stats
##      [,1]
## [1,]  4.3
## [2,]  5.1
## [3,]  5.8
## [4,]  6.4
## [5,]  7.9
## 
## $Sepal.Length$n
## [1] 146
## 
## $Sepal.Length$conf
##         [,1]
## [1,] 5.63001
## [2,] 5.96999
## 
## $Sepal.Length$out
## numeric(0)
## 
## $Sepal.Length$group
## numeric(0)
## 
## $Sepal.Length$names
## [1] "1"
## 
## 
## $Sepal.Width
## $Sepal.Width$stats
##      [,1]
## [1,]  2.2
## [2,]  2.8
## [3,]  3.0
## [4,]  3.3
## [5,]  4.0
## 
## $Sepal.Width$n
## [1] 146
## 
## $Sepal.Width$conf
##          [,1]
## [1,] 2.934619
## [2,] 3.065381
## 
## $Sepal.Width$out
## numeric(0)
## 
## $Sepal.Width$group
## numeric(0)
## 
## $Sepal.Width$names
## [1] "1"
## 
## 
## $Petal.Length
## $Petal.Length$stats
##      [,1]
## [1,]  1.0
## [2,]  1.6
## [3,]  4.4
## [4,]  5.1
## [5,]  6.9
## 
## $Petal.Length$n
## [1] 146
## 
## $Petal.Length$conf
##          [,1]
## [1,] 3.942334
## [2,] 4.857666
## 
## $Petal.Length$out
## numeric(0)
## 
## $Petal.Length$group
## numeric(0)
## 
## $Petal.Length$names
## [1] "1"
## 
## 
## $Petal.Width
## $Petal.Width$stats
##      [,1]
## [1,]  0.1
## [2,]  0.3
## [3,]  1.3
## [4,]  1.8
## [5,]  2.5
## 
## $Petal.Width$n
## [1] 146
## 
## $Petal.Width$conf
##          [,1]
## [1,] 1.103857
## [2,] 1.496143
## 
## $Petal.Width$out
## numeric(0)
## 
## $Petal.Width$group
## numeric(0)
## 
## $Petal.Width$names
## [1] "1"

Written Answer

Outliers are identified using the IQR rule. After removal, the boxplots show reduced extreme values and more symmetric distributions.

2c. Shapiro-Wilk Normality Tests

shapiro_results <- apply(iris[, 1:4], 2, function(x) shapiro.test(x)$p.value)
shapiro_results
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
## 1.018116e-02 1.011543e-01 7.412263e-10 1.680465e-08

Written Answer

Any variable with a p-value less than 0.05 violates the normality assumption.

2d. Log and Square Root Transformations

non_normal_vars <- names(shapiro_results[shapiro_results < 0.05])
non_normal_vars
## [1] "Sepal.Length" "Petal.Length" "Petal.Width"
for (var in non_normal_vars) {
  
  cat("\nOriginal:", var, "\n")
  print(plot_normality(as.data.frame(iris[var])))
  
  cat("\nLog Transformation:", var, "\n")
  print(plot_normality(as.data.frame(log(iris[var]))))
  
  cat("\nSquare Root Transformation:", var, "\n")
  print(plot_normality(as.data.frame(sqrt(iris[var]))))
}
## 
## Original: Sepal.Length
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the dlookr package.
##   Please report the issue at <https://github.com/choonghyunryu/dlookr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name               grob
## 1 1 (2-2,1-1) arrange     gtable[layout]
## 2 2 (2-2,2-2) arrange     gtable[layout]
## 3 3 (3-3,1-1) arrange     gtable[layout]
## 4 4 (3-3,2-2) arrange     gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.11]
## 
## 
## Log Transformation: Sepal.Length

## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.156]
## 
## 
## Square Root Transformation: Sepal.Length

## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.298]
## 
## 
## Original: Petal.Length

## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.440]
## 
## 
## Log Transformation: Petal.Length

## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.582]
## 
## 
## Square Root Transformation: Petal.Length

## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.724]
## 
## 
## Original: Petal.Width

## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                grob
## 1 1 (2-2,1-1) arrange      gtable[layout]
## 2 2 (2-2,2-2) arrange      gtable[layout]
## 3 3 (3-3,1-1) arrange      gtable[layout]
## 4 4 (3-3,2-2) arrange      gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.866]
## 
## 
## Log Transformation: Petal.Width

## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                 grob
## 1 1 (2-2,1-1) arrange       gtable[layout]
## 2 2 (2-2,2-2) arrange       gtable[layout]
## 3 3 (3-3,1-1) arrange       gtable[layout]
## 4 4 (3-3,2-2) arrange       gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1008]
## 
## 
## Square Root Transformation: Petal.Width

## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
##   z     cells    name                 grob
## 1 1 (2-2,1-1) arrange       gtable[layout]
## 2 2 (2-2,2-2) arrange       gtable[layout]
## 3 3 (3-3,1-1) arrange       gtable[layout]
## 4 4 (3-3,2-2) arrange       gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1150]

Written Answer

Based on the normality plots, log or square-root transformations may improve normality depending on the variable.

Question 3

3a. Assess Missing Data Pattern

data(nhanes)

aggr(nhanes,
     numbers = TRUE,
     sortVars = TRUE,
     labels = names(nhanes),
     cex.axis = .7,
     gap = 3,
     ylab = c("Missing Data", "Pattern"))

## 
##  Variables sorted by number of missings: 
##  Variable Count
##       chl  0.40
##       bmi  0.36
##       hyp  0.32
##       age  0.00

Written Answer

The aggr() output shows both the percentage of missing data and the missing data pattern across variables.

3b. Impute Missing Data

set.seed(123)

imputed_data <- mice(nhanes,
                     m = 10,
                     method = "pmm",
                     seed = 123)
## 
##  iter imp variable
##   1   1  bmi  hyp  chl
##   1   2  bmi  hyp  chl
##   1   3  bmi  hyp  chl
##   1   4  bmi  hyp  chl
##   1   5  bmi  hyp  chl
##   1   6  bmi  hyp  chl
##   1   7  bmi  hyp  chl
##   1   8  bmi  hyp  chl
##   1   9  bmi  hyp  chl
##   1   10  bmi  hyp  chl
##   2   1  bmi  hyp  chl
##   2   2  bmi  hyp  chl
##   2   3  bmi  hyp  chl
##   2   4  bmi  hyp  chl
##   2   5  bmi  hyp  chl
##   2   6  bmi  hyp  chl
##   2   7  bmi  hyp  chl
##   2   8  bmi  hyp  chl
##   2   9  bmi  hyp  chl
##   2   10  bmi  hyp  chl
##   3   1  bmi  hyp  chl
##   3   2  bmi  hyp  chl
##   3   3  bmi  hyp  chl
##   3   4  bmi  hyp  chl
##   3   5  bmi  hyp  chl
##   3   6  bmi  hyp  chl
##   3   7  bmi  hyp  chl
##   3   8  bmi  hyp  chl
##   3   9  bmi  hyp  chl
##   3   10  bmi  hyp  chl
##   4   1  bmi  hyp  chl
##   4   2  bmi  hyp  chl
##   4   3  bmi  hyp  chl
##   4   4  bmi  hyp  chl
##   4   5  bmi  hyp  chl
##   4   6  bmi  hyp  chl
##   4   7  bmi  hyp  chl
##   4   8  bmi  hyp  chl
##   4   9  bmi  hyp  chl
##   4   10  bmi  hyp  chl
##   5   1  bmi  hyp  chl
##   5   2  bmi  hyp  chl
##   5   3  bmi  hyp  chl
##   5   4  bmi  hyp  chl
##   5   5  bmi  hyp  chl
##   5   6  bmi  hyp  chl
##   5   7  bmi  hyp  chl
##   5   8  bmi  hyp  chl
##   5   9  bmi  hyp  chl
##   5   10  bmi  hyp  chl
imputed_data
## Class: mids
## Number of multiple imputations:  10 
## Imputation methods:
##   age   bmi   hyp   chl 
##    "" "pmm" "pmm" "pmm" 
## PredictorMatrix:
##     age bmi hyp chl
## age   0   1   1   1
## bmi   1   0   1   1
## hyp   1   1   0   1
## chl   1   1   1   0

3c. Extract the 10th Imputed Dataset

completed_data_10 <- complete(imputed_data, 10)
completed_data_10
##    age  bmi hyp chl
## 1    1 29.6   1 229
## 2    2 22.7   1 187
## 3    1 27.2   1 187
## 4    3 27.4   1 186
## 5    1 20.4   1 113
## 6    3 21.7   2 184
## 7    1 22.5   1 118
## 8    1 30.1   1 187
## 9    2 22.0   1 238
## 10   2 25.5   1 204
## 11   1 29.6   1 184
## 12   2 27.5   2 131
## 13   3 21.7   1 206
## 14   2 28.7   2 204
## 15   1 29.6   1 204
## 16   1 25.5   1 118
## 17   3 27.2   2 284
## 18   2 26.3   2 199
## 19   1 35.3   1 218
## 20   3 25.5   2 218
## 21   1 22.0   1 187
## 22   1 33.2   1 229
## 23   1 27.5   1 131
## 24   3 24.9   1 218
## 25   2 27.4   1 186