pacman::p_load(
skimr,
psych,
summarytools,
janitor,
graphics,
lattice,
ggplot2,
mlbench,
Amelia,
corrplot
)
# Summary and Visualization of Iris and Other Data ----
# Load Packages ----
# Load package
# Import data ----
data("iris")
# Summary of dataset ----
# check the number of rows and columns
dim(iris)
#> [1] 150 5
# check column names
names(iris)
#> [1] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width" "Species"
summary(iris)
#> Sepal.Length Sepal.Width Petal.Length Petal.Width
#> Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
#> 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
#> Median :5.800 Median :3.000 Median :4.350 Median :1.300
#> Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
#> 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
#> Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
#> Species
#> setosa :50
#> versicolor:50
#> virginica :50
# use skimr
skim(iris)
#> | Name | iris |
#> | Number of rows | 150 |
#> | Number of columns | 5 |
#> | _______________________ | |
#> | Column type frequency: | |
#> | factor | 1 |
#> | numeric | 4 |
#> | ________________________ | |
#> | Group variables | None |
#> Variable type: factor
#> | skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
#> |---|---|---|---|---|---|
#> | Species | 0 | 1 | FALSE | 3 | set: 50, ver: 50, vir: 50 |
#> Variable type: numeric
#> | skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
#> |---|---|---|---|---|---|---|---|---|---|---|
#> | Sepal.Length | 0 | 1 | 5.84 | 0.83 | 4.3 | 5.1 | 5.80 | 6.4 | 7.9 | ▆▇▇▅▂ |
#> | Sepal.Width | 0 | 1 | 3.06 | 0.44 | 2.0 | 2.8 | 3.00 | 3.3 | 4.4 | ▁▆▇▂▁ |
#> | Petal.Length | 0 | 1 | 3.76 | 1.77 | 1.0 | 1.6 | 4.35 | 5.1 | 6.9 | ▇▁▆▇▂ |
#> | Petal.Width | 0 | 1 | 1.20 | 0.76 | 0.1 | 0.3 | 1.30 | 1.8 | 2.5 | ▇▁▇▅▃ |
# psych package
describe(iris)
#> vars n mean sd median trimmed mad min max range skew
#> Sepal.Length 1 150 5.84 0.83 5.80 5.81 1.04 4.3 7.9 3.6 0.31
#> Sepal.Width 2 150 3.06 0.44 3.00 3.04 0.44 2.0 4.4 2.4 0.31
#> Petal.Length 3 150 3.76 1.77 4.35 3.76 1.85 1.0 6.9 5.9 -0.27
#> Petal.Width 4 150 1.20 0.76 1.30 1.18 1.04 0.1 2.5 2.4 -0.10
#> Species* 5 150 2.00 0.82 2.00 2.00 1.48 1.0 3.0 2.0 0.00
#> kurtosis se
#> Sepal.Length -0.61 0.07
#> Sepal.Width 0.14 0.04
#> Petal.Length -1.42 0.14
#> Petal.Width -1.36 0.06
#> Species* -1.52 0.07
str(iris) # check data structure
#> 'data.frame': 150 obs. of 5 variables:
#> $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
#> $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
#> $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
tabyl(iris, Species, Petal.Width) # use janitor package (cross tabulate)
#> Species 0.1 0.2 0.3 0.4 0.5 0.6 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1
#> setosa 5 29 7 7 1 1 0 0 0 0 0 0 0 0 0 0 0 0
#> versicolor 0 0 0 0 0 0 7 3 5 13 7 10 3 1 1 0 0 0
#> virginica 0 0 0 0 0 0 0 0 0 0 1 2 1 1 11 5 6 6
#> 2.2 2.3 2.4 2.5
#> 0 0 0 0
#> 0 0 0 0
#> 3 8 3 3
dfSummary(iris) # summarytools package
#> Data Frame Summary
#> iris
#> Dimensions: 150 x 5
#> Duplicates: 1
#> -----------------------------------------------------------------------------------------------------------
#> No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
#> ---- -------------- ----------------------- -------------------- --------------------- ---------- ---------
#> 1 Sepal.Length Mean (sd) : 5.8 (0.8) 35 distinct values . . : : 150 0
#> [numeric] min < med < max: : : : : (100.0%) (0.0%)
#> 4.3 < 5.8 < 7.9 : : : : :
#> IQR (CV) : 1.3 (0.1) : : : : :
#> : : : : : : : :
#> 2 Sepal.Width Mean (sd) : 3.1 (0.4) 23 distinct values : 150 0
#> [numeric] min < med < max: : (100.0%) (0.0%)
#> 2 < 3 < 4.4 . :
#> IQR (CV) : 0.5 (0.1) : : : :
#> . . : : : : : :
#> 3 Petal.Length Mean (sd) : 3.8 (1.8) 43 distinct values : 150 0
#> [numeric] min < med < max: : . : (100.0%) (0.0%)
#> 1 < 4.3 < 6.9 : : : .
#> IQR (CV) : 3.5 (0.5) : : : : : .
#> : : . : : : : : .
#> 4 Petal.Width Mean (sd) : 1.2 (0.8) 22 distinct values : 150 0
#> [numeric] min < med < max: : (100.0%) (0.0%)
#> 0.1 < 1.3 < 2.5 : . . :
#> IQR (CV) : 1.5 (0.6) : : : : .
#> : : : : : . : : :
#> 5 Species 1. setosa 50 (33.3%) IIIIII 150 0
#> [factor] 2. versicolor 50 (33.3%) IIIIII (100.0%) (0.0%)
#> 3. virginica 50 (33.3%) IIIIII
#> -----------------------------------------------------------------------------------------------------------
# Univariate Visualization ----
# Histograms ----
# Histograms of every numeric column of iris, drawn side by side in one row.
# The numeric columns are detected instead of hard-coding columns 1 to 4.
numeric_cols <- names(iris)[vapply(iris, is.numeric, logical(1))]

# par() returns the settings it replaces; keep them so the one-row layout can
# be undone afterwards instead of leaking into later plots on the same device.
old_par <- par(mfrow = c(1, length(numeric_cols)))
for (col_name in numeric_cols) {
  # Set xlab explicitly: the default x-axis label is the deparsed expression
  # passed to hist(), which is not a meaningful label.
  hist(iris[[col_name]], main = col_name, xlab = col_name)
}
par(old_par)
# Lattice Density Plots ----
# Kernel density estimate of every numeric column of iris, one panel per
# column in a single row (base graphics plot() applied to density()).
numeric_cols <- names(iris)[vapply(iris, is.numeric, logical(1))]

# Keep the previous graphics settings so the one-row layout is restored once
# the panels are drawn and does not affect later plots on the same device.
old_par <- par(mfrow = c(1, length(numeric_cols)))
for (col_name in numeric_cols) {
  plot(density(iris[[col_name]]), main = col_name)
}
par(old_par)
# Box and Whisker Plots ----
# Box-and-whisker plot of every numeric column of iris, one panel per column
# in a single row.
numeric_cols <- names(iris)[vapply(iris, is.numeric, logical(1))]

# Keep the previous graphics settings so the one-row layout is restored once
# the panels are drawn and does not affect later plots on the same device.
old_par <- par(mfrow = c(1, length(numeric_cols)))
for (col_name in numeric_cols) {
  boxplot(iris[[col_name]], main = col_name)
}
par(old_par)
# Bar Plots for Breast Cancer Dataset ----
# Bar charts of the value counts of BreastCancer columns 2 through 9, laid out
# on a 2 x 4 grid (eight panels, one per column).
data(BreastCancer)

# par() returns the settings it replaces. Restoring them after the loop stops
# the 2 x 4 grid from squeezing the plots drawn later on the same device.
old_par <- par(mfrow = c(2, 4))
for (i in 2:9) {
  counts <- table(BreastCancer[[i]])
  name <- names(BreastCancer)[i]
  barplot(counts, main = name)
}
par(old_par)
# Missing Plot for Soybean Dataset ----
# Map of missing versus observed cells in the Soybean data set, drawn with
# missmap() using black and grey as the two colours and no legend.
data("Soybean")
missmap(
  Soybean,
  col = c("black", "grey"),
  legend = FALSE
)
# Multivariate Visualization ----
# Correlation matrix of the first four iris columns, shown as a circle plot.
correlations <- cor(iris[, 1:4])
corrplot(
  correlations,
  method = "circle"
)
# Pair-wise Scatterplots ----
# Scatterplot matrix of every pair of iris columns.
pairs(iris)