library(mlbench)
library(tidyverse)
library(skimr)
library(e1071)
library(PerformanceAnalytics)
library(MASS)
library(caret)
# Load the Glass data set into the workspace and print a per-column summary
# (missingness, distribution statistics, factor level counts).
data("Glass")
skimr::skim(Glass)
Name | Glass |
Number of rows | 214 |
Number of columns | 10 |
_______________________ | |
Column type frequency: | |
factor | 1 |
numeric | 9 |
________________________ | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
Type | 0 | 1 | FALSE | 6 | 2: 76, 1: 70, 7: 29, 3: 17 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
RI | 0 | 1 | 1.52 | 0.00 | 1.51 | 1.52 | 1.52 | 1.52 | 1.53 | ▁▇▂▁▁ |
Na | 0 | 1 | 13.41 | 0.82 | 10.73 | 12.91 | 13.30 | 13.83 | 17.38 | ▁▇▆▁▁ |
Mg | 0 | 1 | 2.68 | 1.44 | 0.00 | 2.11 | 3.48 | 3.60 | 4.49 | ▃▁▁▇▅ |
Al | 0 | 1 | 1.44 | 0.50 | 0.29 | 1.19 | 1.36 | 1.63 | 3.50 | ▂▇▃▁▁ |
Si | 0 | 1 | 72.65 | 0.77 | 69.81 | 72.28 | 72.79 | 73.09 | 75.41 | ▁▂▇▂▁ |
K | 0 | 1 | 0.50 | 0.65 | 0.00 | 0.12 | 0.56 | 0.61 | 6.21 | ▇▁▁▁▁ |
Ca | 0 | 1 | 8.96 | 1.42 | 5.43 | 8.24 | 8.60 | 9.17 | 16.19 | ▁▇▁▁▁ |
Ba | 0 | 1 | 0.18 | 0.50 | 0.00 | 0.00 | 0.00 | 0.00 | 3.15 | ▇▁▁▁▁ |
Fe | 0 | 1 | 0.06 | 0.10 | 0.00 | 0.00 | 0.00 | 0.10 | 0.51 | ▇▁▁▁▁ |
# Sample skewness of every predictor (all columns except the class label
# `Type`). vapply() walks the data-frame columns directly instead of going
# through apply(), which would first coerce the data frame to a matrix; it
# also guarantees one numeric value per column. The result is a named numeric
# vector keyed by column name.
vapply(
  Glass %>% dplyr::select(-Type),
  e1071::skewness,
  numeric(1)
)
## RI Na Mg Al Si K
## 1.6027151 0.4478343 -1.1364523 0.8946104 -0.7202392 6.4600889
## Ca Ba Fe
## 2.0184463 3.3686800 1.7298107
# Estimate a Box-Cox transformation for every predictor (all columns except
# the class label `Type`). lapply() iterates over the data-frame columns
# directly rather than coercing the data frame to a matrix via apply(); the
# result is a list of fits named by column.
lambda <- lapply(Glass %>% dplyr::select(-Type), caret::BoxCoxTrans)

# Show only the first component of each fit, i.e. the estimated lambda.
# NOTE(review): an NA lambda means no transformation could be estimated.
# Box-Cox presumably requires strictly positive values, which would explain
# the NA for columns whose minimum is 0 -- confirm against the caret docs.
lapply(lambda, "[", 1)
## $RI
## $RI$lambda
## [1] -2
##
##
## $Na
## $Na$lambda
## [1] -0.1
##
##
## $Mg
## $Mg$lambda
## [1] NA
##
##
## $Al
## $Al$lambda
## [1] 0.5
##
##
## $Si
## $Si$lambda
## [1] 2
##
##
## $K
## $K$lambda
## [1] NA
##
##
## $Ca
## $Ca$lambda
## [1] -1.1
##
##
## $Ba
## $Ba$lambda
## [1] NA
##
##
## $Fe
## $Fe$lambda
## [1] NA
# Distribution of the class label.
plot(Glass[["Type"]])

# Correlation chart of the predictors (class label `Type` excluded).
Glass %>%
  dplyr::select(-Type) %>%
  chart.Correlation()
data("Soybean")
s <- skim(Soybean)

# Turn one "<level>: <count>" entry of skim's `factor.top_counts` string into
# the numeric count. The anchored pattern strips any level label (not just
# "0" or "1") together with surrounding whitespace, so the leading space left
# behind by splitting on "," is handled as well.
count_from_label <- function(x) {
  as.numeric(str_remove(x, "^\\s*[^:]+:\\s*"))
}

# For every two-level factor, split the top-counts string into the counts of
# its two levels. `var1` is the count listed first and `var2` the count
# listed second in `factor.top_counts`.
s1 <- s %>%
  dplyr::select(skim_variable, factor.n_unique, factor.top_counts) %>%
  filter(factor.n_unique == 2) %>%
  as.data.frame() %>%
  separate(col = factor.top_counts, sep = ",", into = c("var1", "var2")) %>%
  mutate(
    var1 = count_from_label(var1),
    var2 = count_from_label(var2)
  )

# Flag strongly imbalanced binary predictors: keep those whose first-listed
# level is more than 10 times as frequent as the second, largest ratio first.
s1 %>%
  mutate(ratio = var1 / var2) %>%
  arrange(desc(ratio)) %>%
  filter(ratio > 10)
## skim_variable factor.n_unique var1 var2 ratio
## 1 mycelium 2 639 6 106.50000
## 2 sclerotia 2 625 20 31.25000
## 3 shriveling 2 539 38 14.18421
## 4 lodging 2 520 42 12.38095
## 5 leaf.malf 2 554 45 12.31111
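These five variables have near-zero variance: in each one, the most common level occurs more than ten times as often as the other level, so the predictor is almost constant.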
# Distribution of per-variable completeness (share of non-missing values).
s2 <- dplyr::select(s, skim_variable, complete_rate)
hist(s2$complete_rate)

# For each class: total number of rows and number of rows with no missing
# value in any column.
Soybean %>%
  mutate(cc = complete.cases(.)) %>%
  group_by(Class) %>%
  summarise(n(), sum(cc))
## # A tibble: 19 x 3
## Class `n()` `sum(cc)`
## <fct> <int> <int>
## 1 2-4-d-injury 16 0
## 2 alternarialeaf-spot 91 91
## 3 anthracnose 44 44
## 4 bacterial-blight 20 20
## 5 bacterial-pustule 20 20
## 6 brown-spot 92 92
## 7 brown-stem-rot 44 44
## 8 charcoal-rot 20 20
## 9 cyst-nematode 14 0
## 10 diaporthe-pod-&-stem-blight 15 0
## 11 diaporthe-stem-canker 20 20
## 12 downy-mildew 20 20
## 13 frog-eye-leaf-spot 91 91
## 14 herbicide-injury 8 0
## 15 phyllosticta-leaf-spot 20 20
## 16 phytophthora-rot 88 20
## 17 powdery-mildew 20 20
## 18 purple-seed-stain 20 20
## 19 rhizoctonia-root-rot 20 20
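The missing values are concentrated in a few classes rather than spread evenly: 2-4-d-injury, cyst-nematode, diaporthe-pod-&-stem-blight and herbicide-injury have no complete cases at all, and phytophthora-rot has only 20 complete cases out of 88. Every other class is fully observed, so missingness is strongly related to the class.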
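To handle the missing values, you could impute a simple summary value (the mean for numeric predictors, or the most frequent level for categorical ones) or use a more robust algorithm such as MICE. Testing each imputation approach to see how it affects your model is important when making your final decision.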