knitr::opts_chunk$set(echo = TRUE)
library(dlookr)
library(mice)
library(VIM)
my_list <- list(
A = c(1:5, 7:3),
B = matrix(1:6, nrow = 2)
)
my_list
## $A
## [1] 1 2 3 4 5 7 6 5 4 3
##
## $B
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
lapply(my_list, length)
## $A
## [1] 10
##
## $B
## [1] 6
lapply(my_list, sum)
## $A
## [1] 40
##
## $B
## [1] 21
apply(iris[, 1:4], 2, boxplot)
## $Sepal.Length
## $Sepal.Length$stats
## [,1]
## [1,] 4.3
## [2,] 5.1
## [3,] 5.8
## [4,] 6.4
## [5,] 7.9
##
## $Sepal.Length$n
## [1] 150
##
## $Sepal.Length$conf
## [,1]
## [1,] 5.632292
## [2,] 5.967708
##
## $Sepal.Length$out
## numeric(0)
##
## $Sepal.Length$group
## numeric(0)
##
## $Sepal.Length$names
## [1] "1"
##
##
## $Sepal.Width
## $Sepal.Width$stats
## [,1]
## [1,] 2.2
## [2,] 2.8
## [3,] 3.0
## [4,] 3.3
## [5,] 4.0
##
## $Sepal.Width$n
## [1] 150
##
## $Sepal.Width$conf
## [,1]
## [1,] 2.935497
## [2,] 3.064503
##
## $Sepal.Width$out
## [1] 4.4 4.1 4.2 2.0
##
## $Sepal.Width$group
## [1] 1 1 1 1
##
## $Sepal.Width$names
## [1] "1"
##
##
## $Petal.Length
## $Petal.Length$stats
## [,1]
## [1,] 1.00
## [2,] 1.60
## [3,] 4.35
## [4,] 5.10
## [5,] 6.90
##
## $Petal.Length$n
## [1] 150
##
## $Petal.Length$conf
## [,1]
## [1,] 3.898477
## [2,] 4.801523
##
## $Petal.Length$out
## numeric(0)
##
## $Petal.Length$group
## numeric(0)
##
## $Petal.Length$names
## [1] "1"
##
##
## $Petal.Width
## $Petal.Width$stats
## [,1]
## [1,] 0.1
## [2,] 0.3
## [3,] 1.3
## [4,] 1.8
## [5,] 2.5
##
## $Petal.Width$n
## [1] 150
##
## $Petal.Width$conf
## [,1]
## [1,] 1.10649
## [2,] 1.49351
##
## $Petal.Width$out
## numeric(0)
##
## $Petal.Width$group
## numeric(0)
##
## $Petal.Width$names
## [1] "1"
remove_outliers_rows <- function(df) {
Q1 <- apply(df, 2, quantile, 0.25)
Q3 <- apply(df, 2, quantile, 0.75)
IQR_val <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_val
upper_bound <- Q3 + 1.5 * IQR_val
df_clean <- df[
apply(df, 1, function(row)
all(row >= lower_bound & row <= upper_bound)
),
]
return(df_clean)
}
iris_clean <- remove_outliers_rows(iris[, 1:4])
apply(iris_clean, 2, boxplot)
## $Sepal.Length
## $Sepal.Length$stats
## [,1]
## [1,] 4.3
## [2,] 5.1
## [3,] 5.8
## [4,] 6.4
## [5,] 7.9
##
## $Sepal.Length$n
## [1] 146
##
## $Sepal.Length$conf
## [,1]
## [1,] 5.63001
## [2,] 5.96999
##
## $Sepal.Length$out
## numeric(0)
##
## $Sepal.Length$group
## numeric(0)
##
## $Sepal.Length$names
## [1] "1"
##
##
## $Sepal.Width
## $Sepal.Width$stats
## [,1]
## [1,] 2.2
## [2,] 2.8
## [3,] 3.0
## [4,] 3.3
## [5,] 4.0
##
## $Sepal.Width$n
## [1] 146
##
## $Sepal.Width$conf
## [,1]
## [1,] 2.934619
## [2,] 3.065381
##
## $Sepal.Width$out
## numeric(0)
##
## $Sepal.Width$group
## numeric(0)
##
## $Sepal.Width$names
## [1] "1"
##
##
## $Petal.Length
## $Petal.Length$stats
## [,1]
## [1,] 1.0
## [2,] 1.6
## [3,] 4.4
## [4,] 5.1
## [5,] 6.9
##
## $Petal.Length$n
## [1] 146
##
## $Petal.Length$conf
## [,1]
## [1,] 3.942334
## [2,] 4.857666
##
## $Petal.Length$out
## numeric(0)
##
## $Petal.Length$group
## numeric(0)
##
## $Petal.Length$names
## [1] "1"
##
##
## $Petal.Width
## $Petal.Width$stats
## [,1]
## [1,] 0.1
## [2,] 0.3
## [3,] 1.3
## [4,] 1.8
## [5,] 2.5
##
## $Petal.Width$n
## [1] 146
##
## $Petal.Width$conf
## [,1]
## [1,] 1.103857
## [2,] 1.496143
##
## $Petal.Width$out
## numeric(0)
##
## $Petal.Width$group
## numeric(0)
##
## $Petal.Width$names
## [1] "1"
Outliers are identified using the IQR rule. After removal, the boxplots show reduced extreme values and more symmetric distributions.
shapiro_results <- apply(iris[, 1:4], 2, function(x) shapiro.test(x)$p.value)
shapiro_results
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1.018116e-02 1.011543e-01 7.412263e-10 1.680465e-08
Any variable with a p-value less than 0.05 violates the normality assumption.
non_normal_vars <- names(shapiro_results[shapiro_results < 0.05])
non_normal_vars
## [1] "Sepal.Length" "Petal.Length" "Petal.Width"
for (var in non_normal_vars) {
cat("\nOriginal:", var, "\n")
print(plot_normality(as.data.frame(iris[var])))
cat("\nLog Transformation:", var, "\n")
print(plot_normality(as.data.frame(log(iris[var]))))
cat("\nSquare Root Transformation:", var, "\n")
print(plot_normality(as.data.frame(sqrt(iris[var]))))
}
##
## Original: Sepal.Length
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## ℹ The deprecated feature was likely used in the dlookr package.
## Please report the issue at <https://github.com/choonghyunryu/dlookr/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.11]
##
##
## Log Transformation: Sepal.Length
## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.156]
##
##
## Square Root Transformation: Sepal.Length
## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.298]
##
##
## Original: Petal.Length
## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.440]
##
##
## Log Transformation: Petal.Length
## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.582]
##
##
## Square Root Transformation: Petal.Length
## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.724]
##
##
## Original: Petal.Width
## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.866]
##
##
## Log Transformation: Petal.Width
## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1008]
##
##
## Square Root Transformation: Petal.Width
## [[1]]
## TableGrob (3 x 2) "arrange": 5 grobs
## z cells name grob
## 1 1 (2-2,1-1) arrange gtable[layout]
## 2 2 (2-2,2-2) arrange gtable[layout]
## 3 3 (3-3,1-1) arrange gtable[layout]
## 4 4 (3-3,2-2) arrange gtable[layout]
## 5 5 (1-1,1-2) arrange text[GRID.text.1150]
Based on the normality plots, log or square-root transformations may improve normality depending on the variable.
data(nhanes)
aggr(nhanes,
numbers = TRUE,
sortVars = TRUE,
labels = names(nhanes),
cex.axis = .7,
gap = 3,
ylab = c("Missing Data", "Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## chl 0.40
## bmi 0.36
## hyp 0.32
## age 0.00
The aggr() output shows both the percentage of missing data and the missing data pattern across variables.
set.seed(123)
imputed_data <- mice(nhanes,
m = 10,
method = "pmm",
seed = 123)
##
## iter imp variable
## 1 1 bmi hyp chl
## 1 2 bmi hyp chl
## 1 3 bmi hyp chl
## 1 4 bmi hyp chl
## 1 5 bmi hyp chl
## 1 6 bmi hyp chl
## 1 7 bmi hyp chl
## 1 8 bmi hyp chl
## 1 9 bmi hyp chl
## 1 10 bmi hyp chl
## 2 1 bmi hyp chl
## 2 2 bmi hyp chl
## 2 3 bmi hyp chl
## 2 4 bmi hyp chl
## 2 5 bmi hyp chl
## 2 6 bmi hyp chl
## 2 7 bmi hyp chl
## 2 8 bmi hyp chl
## 2 9 bmi hyp chl
## 2 10 bmi hyp chl
## 3 1 bmi hyp chl
## 3 2 bmi hyp chl
## 3 3 bmi hyp chl
## 3 4 bmi hyp chl
## 3 5 bmi hyp chl
## 3 6 bmi hyp chl
## 3 7 bmi hyp chl
## 3 8 bmi hyp chl
## 3 9 bmi hyp chl
## 3 10 bmi hyp chl
## 4 1 bmi hyp chl
## 4 2 bmi hyp chl
## 4 3 bmi hyp chl
## 4 4 bmi hyp chl
## 4 5 bmi hyp chl
## 4 6 bmi hyp chl
## 4 7 bmi hyp chl
## 4 8 bmi hyp chl
## 4 9 bmi hyp chl
## 4 10 bmi hyp chl
## 5 1 bmi hyp chl
## 5 2 bmi hyp chl
## 5 3 bmi hyp chl
## 5 4 bmi hyp chl
## 5 5 bmi hyp chl
## 5 6 bmi hyp chl
## 5 7 bmi hyp chl
## 5 8 bmi hyp chl
## 5 9 bmi hyp chl
## 5 10 bmi hyp chl
imputed_data
## Class: mids
## Number of multiple imputations: 10
## Imputation methods:
## age bmi hyp chl
## "" "pmm" "pmm" "pmm"
## PredictorMatrix:
## age bmi hyp chl
## age 0 1 1 1
## bmi 1 0 1 1
## hyp 1 1 0 1
## chl 1 1 1 0
completed_data_10 <- complete(imputed_data, 10)
completed_data_10
## age bmi hyp chl
## 1 1 29.6 1 229
## 2 2 22.7 1 187
## 3 1 27.2 1 187
## 4 3 27.4 1 186
## 5 1 20.4 1 113
## 6 3 21.7 2 184
## 7 1 22.5 1 118
## 8 1 30.1 1 187
## 9 2 22.0 1 238
## 10 2 25.5 1 204
## 11 1 29.6 1 184
## 12 2 27.5 2 131
## 13 3 21.7 1 206
## 14 2 28.7 2 204
## 15 1 29.6 1 204
## 16 1 25.5 1 118
## 17 3 27.2 2 284
## 18 2 26.3 2 199
## 19 1 35.3 1 218
## 20 3 25.5 2 218
## 21 1 22.0 1 187
## 22 1 33.2 1 229
## 23 1 27.5 1 131
## 24 3 24.9 1 218
## 25 2 27.4 1 186