Summary and Visualization of Iris and Other Data

Load Packages


Load package

pacman::p_load(
skimr,
psych,
summarytools,
janitor,
graphics,
lattice,
ggplot2,
mlbench,
Amelia,
corrplot
)

Import data

# Load the built-in iris data set into the workspace
data(iris)

Summary of dataset

# Report the number of rows and columns
c(nrow(iris), ncol(iris))
[1] 150   5
# List the column names
colnames(iris)
[1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"     
# Quartiles, mean and range for each numeric column; level counts for Species
summary(object = iris)
  Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
 Median :5.800   Median :3.000   Median :4.350   Median :1.300  
 Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
       Species  
 setosa    :50  
 versicolor:50  
 virginica :50  
                
                
                
# Overview via skimr: missingness, summary statistics and inline histograms
skimr::skim(iris)
Data summary
Name iris
Number of rows 150
Number of columns 5
_______________________
Column type frequency:
factor 1
numeric 4
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Species 0 1 FALSE 3 set: 50, ver: 50, vir: 50

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Sepal.Length 0 1 5.84 0.83 4.3 5.1 5.80 6.4 7.9 ▆▇▇▅▂
Sepal.Width 0 1 3.06 0.44 2.0 2.8 3.00 3.3 4.4 ▁▆▇▂▁
Petal.Length 0 1 3.76 1.77 1.0 1.6 4.35 5.1 6.9 ▇▁▆▇▂
Petal.Width 0 1 1.20 0.76 0.1 0.3 1.30 1.8 2.5 ▇▁▇▅▃
# Descriptive statistics from the psych package (includes skew, kurtosis, se)
psych::describe(iris)
             vars   n mean   sd median trimmed  mad min max range  skew
Sepal.Length    1 150 5.84 0.83   5.80    5.81 1.04 4.3 7.9   3.6  0.31
Sepal.Width     2 150 3.06 0.44   3.00    3.04 0.44 2.0 4.4   2.4  0.31
Petal.Length    3 150 3.76 1.77   4.35    3.76 1.85 1.0 6.9   5.9 -0.27
Petal.Width     4 150 1.20 0.76   1.30    1.18 1.04 0.1 2.5   2.4 -0.10
Species*        5 150 2.00 0.82   2.00    2.00 1.48 1.0 3.0   2.0  0.00
             kurtosis   se
Sepal.Length    -0.61 0.07
Sepal.Width      0.14 0.04
Petal.Length    -1.42 0.14
Petal.Width     -1.36 0.06
Species*        -1.52 0.07
# Inspect the data structure: column types and the first few values
utils::str(iris)
'data.frame':   150 obs. of  5 variables:
 $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Cross-tabulate Species against Petal.Width with the janitor package
janitor::tabyl(iris, Species, Petal.Width)
    Species 0.1 0.2 0.3 0.4 0.5 0.6 1 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2 2.1
     setosa   5  29   7   7   1   1 0   0   0   0   0   0   0   0   0   0 0   0
 versicolor   0   0   0   0   0   0 7   3   5  13   7  10   3   1   1   0 0   0
  virginica   0   0   0   0   0   0 0   0   0   0   1   2   1   1  11   5 6   6
 2.2 2.3 2.4 2.5
   0   0   0   0
   0   0   0   0
   3   8   3   3
# summarytools package: one row per variable with statistics or values,
# frequencies, a text graph, and valid/missing counts
dfSummary(iris)
Data Frame Summary  
iris  
Dimensions: 150 x 5  
Duplicates: 1  

-----------------------------------------------------------------------------------------------------------
No   Variable       Stats / Values          Freqs (% of Valid)   Graph                 Valid      Missing  
---- -------------- ----------------------- -------------------- --------------------- ---------- ---------
1    Sepal.Length   Mean (sd) : 5.8 (0.8)   35 distinct values     . . : :             150        0        
     [numeric]      min < med < max:                               : : : :             (100.0%)   (0.0%)   
                    4.3 < 5.8 < 7.9                                : : : : :                               
                    IQR (CV) : 1.3 (0.1)                           : : : : :                               
                                                                 : : : : : : : :                           

2    Sepal.Width    Mean (sd) : 3.1 (0.4)   23 distinct values           :             150        0        
     [numeric]      min < med < max:                                     :             (100.0%)   (0.0%)   
                    2 < 3 < 4.4                                        . :                                 
                    IQR (CV) : 0.5 (0.1)                             : : : :                               
                                                                 . . : : : : : :                           

3    Petal.Length   Mean (sd) : 3.8 (1.8)   43 distinct values   :                     150        0        
     [numeric]      min < med < max:                             :         . :         (100.0%)   (0.0%)   
                    1 < 4.3 < 6.9                                :         : : .                           
                    IQR (CV) : 3.5 (0.5)                         : :       : : : .                         
                                                                 : :   . : : : : : .                       

4    Petal.Width    Mean (sd) : 1.2 (0.8)   22 distinct values   :                     150        0        
     [numeric]      min < med < max:                             :                     (100.0%)   (0.0%)   
                    0.1 < 1.3 < 2.5                              :       . .   :                           
                    IQR (CV) : 1.5 (0.6)                         :       : :   :   .                       
                                                                 : :   : : : . : : :                       

5    Species        1. setosa               50 (33.3%)           IIIIII                150        0        
     [factor]       2. versicolor           50 (33.3%)           IIIIII                (100.0%)   (0.0%)   
                    3. virginica            50 (33.3%)           IIIIII                                    
-----------------------------------------------------------------------------------------------------------

Univariate Visualization

Histograms

# Histograms of the four numeric iris columns, side by side.
# par() returns the settings it replaces; they are restored after the loop so
# the 1 x 4 layout does not leak into later plots on the same device.
old_par <- par(mfrow = c(1, 4)) # multi-frame, filled row-wise (1 row x 4 cols)
for (i in seq_len(4)) {
  # Label the x-axis with the column name; the default would be the deparsed
  # expression "iris[, i]", which says nothing about the variable shown.
  hist(iris[, i], main = names(iris)[i], xlab = names(iris)[i])
}
par(old_par)

Density Plots

# Kernel density estimate for each of the four numeric iris columns, drawn
# side by side. The previous par() settings are saved and restored so the
# 1 x 4 layout does not carry over to later plots on the same device.
old_par <- par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  plot(density(iris[, i]), main = names(iris)[i])
}
par(old_par)

Box and Whisker Plots

# Box-and-whisker plot for each of the four numeric iris columns, drawn side
# by side. The previous par() settings are saved and restored so the 1 x 4
# layout does not carry over to later plots on the same device.
old_par <- par(mfrow = c(1, 4))
for (i in seq_len(4)) {
  boxplot(iris[, i], main = names(iris)[i])
}
par(old_par)

Bar Plots for Breast Cancer Dataset

# Bar plots of the value counts in columns 2 through 9 of BreastCancer, one
# panel per column in a 2 x 4 grid.
data(BreastCancer, package = "mlbench")

# Save the previous layout and restore it afterwards; otherwise the figures
# that follow would be squeezed into cells of this 2 x 4 grid.
old_par <- par(mfrow = c(2, 4))
for (i in 2:9) {
  counts <- table(BreastCancer[, i])
  name <- names(BreastCancer)[i]
  barplot(counts, main = name)
}
par(old_par)

Missing Plot for Soybean Dataset

# Map of missing versus observed cells in the Soybean data set, drawn in a
# two-colour scheme with the legend suppressed.
# NOTE(review): Soybean presumably comes from mlbench, loaded earlier -- confirm.
data("Soybean")
Amelia::missmap(Soybean, legend = FALSE, col = c("black", "grey"))

Multivariate Visualization

# Correlation matrix of the four numeric iris columns, shown as a circle plot
correlations <- cor(x = iris[, 1:4])
corrplot::corrplot(correlations, method = "circle")

Pair-wise Scatterplots

# Scatterplot matrix of every pair of iris columns (Species included)
graphics::pairs(iris)