simulation & linear regression for employee dataset

Author

kirit ved

Published

November 23, 2023

Lecture-5

author’s details

author’s website

https://kiritved.com

setting R environment

Code
rm(list=ls(all.names = T))
set.seed(1234)
setwd("d:/met/met_lect6")
if(! require("pacman")) install.packages("pacman")
Loading required package: pacman
Code
library("pacman")
p_load(tidyverse,
       janitor,
       randomForest,
       rpart,
       rpart.plot,
       readxl,
       writexl,
       Boruta,
       factoextra,
       cluster,
       NbClust,
       neuralnet,
       e1071,
       class,
       caTools,
       gamlss)
p_loaded()
 [1] "gamlss"       "nlme"         "gamlss.dist"  "gamlss.data"  "caTools"     
 [6] "class"        "e1071"        "neuralnet"    "NbClust"      "cluster"     
[11] "factoextra"   "Boruta"       "writexl"      "readxl"       "rpart.plot"  
[16] "rpart"        "randomForest" "janitor"      "lubridate"    "forcats"     
[21] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
[26] "tibble"       "ggplot2"      "tidyverse"    "pacman"      
Code
myage=function(dt){
  # age in years from a birth date: elapsed days / 365.25, rounded up, plus one
  t=Sys.Date()-dt
  t=as.numeric(t)
  t=t/365.25
  t=ceiling(t)+1
  return(t)
}
close_open_devices=function(){
  # close any graphics devices that are still open
  for (i in dev.list()) {
  dev.off(i)
  }
}
close_open_devices()

simulation

two machine simulation

two machines run in series on the assembly line to produce the final product

the time taken by machine 1 is normally distributed with mean 80 & sd of 15 mins, while the second machine has mean 60 & sd of 10 mins. The manager wants to know:

What are the chances that the product will be available in

  1. less than 125 mins

  2. above 150 mins.

  3. between 120 and 140 mins.

Code
n=100000
t1=rnorm(n,80,15);t1=round(t1,2)
t2=rnorm(n,60,10);t2=round(t2,2)
ttlt=t1+t2
fl1=fl2=fl3=c()
for(i in 1:n){
if(ttlt[i]<=125){
  fl1[i]=1
} else{
  fl1[i]=0
}
if(ttlt[i]>=150){
  fl2[i]=1
} else{
  fl2[i]=0
}
 if(ttlt[i]<=140 & ttlt[i]>120){
  fl3[i]=1
} else{
  fl3[i]=0
} 
}
s1=sum(fl1)
s2=sum(fl2)
s3=sum(fl3)
p1=s1/n
p2=s2/n
p3=s3/n
s1;s2;s3;p1;p2;p3
d=data.frame(t1,t2,ttlt,fl1,fl2,fl3)
head(d)
[1] 20199
[1] 29065
[1] 36394
[1] 0.20199
[1] 0.29065
[1] 0.36394
t1 t2 ttlt fl1 fl2 fl3
61.89 45.84 107.73 1 0 0
84.16 63.17 147.33 0 0 0
96.27 67.28 163.55 0 1 0
44.81 41.00 85.81 1 0 0
86.44 48.55 134.99 0 0 1
87.59 65.77 153.36 0 1 0
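the total time is the sum of two independent normals, so it is itself normally distributed with mean 80 + 60 = 140 and sd sqrt(15^2 + 10^2) ≈ 18.03 mins. The simulated proportions can therefore be cross-checked analytically; a minimal sketch (exact probabilities, no simulation):

Code
mu=80+60                              # mean of the total time
sdev=sqrt(15^2+10^2)                  # sd of the sum of independent normals
pnorm(125,mu,sdev)                    # P(total < 125), close to the simulated 0.202
1-pnorm(150,mu,sdev)                  # P(total > 150), close to the simulated 0.291
pnorm(140,mu,sdev)-pnorm(120,mu,sdev) # P(120 < total < 140), close to the simulated 0.364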

linear regression with employee dataset

load employee dataset & correct date format

Code
d=read.csv("d:/met/met_lect6/emp.csv") |>
  select(-c(6),c(6))   # move column 6 (salary) to the last position
d$bdate=mdy(d$bdate)
Warning: 1 failed to parse.
Code
d$bdate=d$bdate+years(30)
head(d)
id gender bdate educ jobcat salbegin jobtime prevexp minority salary
1 m 1982-02-03 15 3 27000 98 144 0 57000
2 m 1988-05-23 16 1 18750 98 36 0 40200
3 f 1959-07-26 12 1 12000 98 381 0 21450
4 f 1977-04-15 8 1 13200 98 190 0 21900
5 m 1985-02-09 15 1 21000 98 138 0 45000
6 m 1988-08-22 15 1 13500 98 67 0 32100
Code
tail(d)
id gender bdate educ jobcat salbegin jobtime prevexp minority salary
469 469 f 1994-06-01 15 1 13950 64 57 0 25200
470 470 m 1994-01-22 12 1 15750 64 69 1 26250
471 471 m 1996-08-03 15 1 15750 64 32 1 26400
472 472 m 1996-02-21 15 1 15750 63 46 0 39150
473 473 f 1967-11-25 12 1 12750 63 139 0 21450
474 474 f 1998-11-05 12 1 14250 63 9 0 29400
Code
summary(d)
       id           gender              bdate                 educ      
 Min.   :  1.0   Length:474         Min.   :1959-02-10   Min.   : 8.00  
 1st Qu.:119.2   Class :character   1st Qu.:1978-01-03   1st Qu.:12.00  
 Median :237.5   Mode  :character   Median :1992-01-23   Median :12.00  
 Mean   :237.5                      Mean   :1986-10-08   Mean   :13.49  
 3rd Qu.:355.8                      3rd Qu.:1995-07-06   3rd Qu.:15.00  
 Max.   :474.0                      Max.   :2001-02-10   Max.   :21.00  
                                    NA's   :1                           
     jobcat         salbegin        jobtime         prevexp      
 Min.   :1.000   Min.   : 9000   Min.   :63.00   Min.   :  0.00  
 1st Qu.:1.000   1st Qu.:12488   1st Qu.:72.00   1st Qu.: 19.25  
 Median :1.000   Median :15000   Median :81.00   Median : 55.00  
 Mean   :1.411   Mean   :17016   Mean   :81.11   Mean   : 95.86  
 3rd Qu.:1.000   3rd Qu.:17490   3rd Qu.:90.00   3rd Qu.:138.75  
 Max.   :3.000   Max.   :79980   Max.   :98.00   Max.   :476.00  
                                                                 
    minority          salary      
 Min.   :0.0000   Min.   : 15750  
 1st Qu.:0.0000   1st Qu.: 24000  
 Median :0.0000   Median : 28875  
 Mean   :0.2194   Mean   : 34420  
 3rd Qu.:0.0000   3rd Qu.: 36938  
 Max.   :1.0000   Max.   :135000  
                                  
Code
str(d)
'data.frame':   474 obs. of  10 variables:
 $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ gender  : chr  "m" "m" "f" "f" ...
 $ bdate   : Date, format: "1982-02-03" "1988-05-23" ...
 $ educ    : int  15 16 12 8 15 15 15 12 15 12 ...
 $ jobcat  : int  3 1 1 1 1 1 1 1 1 1 ...
 $ salbegin: int  27000 18750 12000 13200 21000 13500 18750 9750 12750 13500 ...
 $ jobtime : int  98 98 98 98 98 98 98 98 98 98 ...
 $ prevexp : int  144 36 381 190 138 67 114 0 115 244 ...
 $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
 $ salary  : int  57000 40200 21450 21900 45000 32100 36000 21900 27900 24000 ...

remove rows with NA values

Code
tmp=!complete.cases(d)
sum(tmp)
[1] 1
Code
d=d[tmp==F,]
head(d)
id gender bdate educ jobcat salbegin jobtime prevexp minority salary
1 m 1982-02-03 15 3 27000 98 144 0 57000
2 m 1988-05-23 16 1 18750 98 36 0 40200
3 f 1959-07-26 12 1 12000 98 381 0 21450
4 f 1977-04-15 8 1 13200 98 190 0 21900
5 m 1985-02-09 15 1 21000 98 138 0 45000
6 m 1988-08-22 15 1 13500 98 67 0 32100
Code
tail(d)
id gender bdate educ jobcat salbegin jobtime prevexp minority salary
469 469 f 1994-06-01 15 1 13950 64 57 0 25200
470 470 m 1994-01-22 12 1 15750 64 69 1 26250
471 471 m 1996-08-03 15 1 15750 64 32 1 26400
472 472 m 1996-02-21 15 1 15750 63 46 0 39150
473 473 f 1967-11-25 12 1 12750 63 139 0 21450
474 474 f 1998-11-05 12 1 14250 63 9 0 29400
Code
str(d)
'data.frame':   473 obs. of  10 variables:
 $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ gender  : chr  "m" "m" "f" "f" ...
 $ bdate   : Date, format: "1982-02-03" "1988-05-23" ...
 $ educ    : int  15 16 12 8 15 15 15 12 15 12 ...
 $ jobcat  : int  3 1 1 1 1 1 1 1 1 1 ...
 $ salbegin: int  27000 18750 12000 13200 21000 13500 18750 9750 12750 13500 ...
 $ jobtime : int  98 98 98 98 98 98 98 98 98 98 ...
 $ prevexp : int  144 36 381 190 138 67 114 0 115 244 ...
 $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
 $ salary  : int  57000 40200 21450 21900 45000 32100 36000 21900 27900 24000 ...
Code
summary(d)
       id           gender              bdate                 educ      
 Min.   :  1.0   Length:473         Min.   :1959-02-10   Min.   : 8.00  
 1st Qu.:119.0   Class :character   1st Qu.:1978-01-03   1st Qu.:12.00  
 Median :237.0   Mode  :character   Median :1992-01-23   Median :12.00  
 Mean   :237.1                      Mean   :1986-10-08   Mean   :13.49  
 3rd Qu.:355.0                      3rd Qu.:1995-07-06   3rd Qu.:15.00  
 Max.   :474.0                      Max.   :2001-02-10   Max.   :21.00  
     jobcat         salbegin        jobtime         prevexp      
 Min.   :1.000   Min.   : 9000   Min.   :63.00   Min.   :  0.00  
 1st Qu.:1.000   1st Qu.:12450   1st Qu.:72.00   1st Qu.: 19.00  
 Median :1.000   Median :15000   Median :81.00   Median : 55.00  
 Mean   :1.412   Mean   :17009   Mean   :81.14   Mean   : 95.95  
 3rd Qu.:1.000   3rd Qu.:17490   3rd Qu.:90.00   3rd Qu.:139.00  
 Max.   :3.000   Max.   :79980   Max.   :98.00   Max.   :476.00  
    minority          salary      
 Min.   :0.0000   Min.   : 15750  
 1st Qu.:0.0000   1st Qu.: 24000  
 Median :0.0000   Median : 28800  
 Mean   :0.2199   Mean   : 34418  
 3rd Qu.:0.0000   3rd Qu.: 37050  
 Max.   :1.0000   Max.   :135000  
Code
writexl::write_xlsx(d,"cleanedemp.xlsx")
do=d

selecting features

for employee data

Code
br=Boruta::Boruta(salary~.,d)
br
Boruta performed 18 iterations in 3.58509 secs.
 9 attributes confirmed important: bdate, educ, gender, id, jobcat and
4 more;
 No attributes deemed unimportant.
Code
tmp=br$finalDecision[!br$finalDecision=="Rejected"]
tmp
       id    gender     bdate      educ    jobcat  salbegin   jobtime   prevexp 
Confirmed Confirmed Confirmed Confirmed Confirmed Confirmed Confirmed Confirmed 
 minority 
Confirmed 
Levels: Tentative Confirmed Rejected

the above analysis shows that 9 attributes were selected
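to list every selected attribute by name (the printout above truncates after five), Boruta provides getSelectedAttributes(); a minimal sketch:

Code
Boruta::getSelectedAttributes(br, withTentative = FALSE)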

for iris data

Code
diris=iris |>janitor::clean_names()
br=Boruta(species~.,diris)
br
Boruta performed 9 iterations in 0.2725961 secs.
 4 attributes confirmed important: petal_length, petal_width,
sepal_length, sepal_width;
 No attributes deemed unimportant.
Code
tmp=br$finalDecision[!br$finalDecision=="Rejected"]
tmp
sepal_length  sepal_width petal_length  petal_width 
   Confirmed    Confirmed    Confirmed    Confirmed 
Levels: Tentative Confirmed Rejected

the above analysis shows that 4 attributes were selected

data visualization

Code
mybarplot=function(df,cn,ttl="bar plot",xlbl,ylbl){
  colnm=colnames(df)[cn]
  v=df|>select(c(cn))
  t=table(v)
  ttl=paste(ttl,"for",colnm,sep=" ")
  b=barplot(t,col="lightgreen",main=ttl,xlab=colnm)
  text(b,min(t)*1.1,labels=as.character(t),col="red")
  #return (T)
}
mypie=function(df,cn,ttl="pie chart"){
  colnm=colnames(df)[cn]
  v=df|>select(c(cn))
  
  t=table(v)

  revnm=paste(names(t),t,sep="--")
  ttl=paste(ttl,"for",colnm,sep=" ")
    pie(t,labels=revnm,main=ttl)
}
dtmp=d|>group_by(gender)|>summarise(n=n())|>mutate(nper=100*n/sum(n))
dtmp
gender n nper
f 216 45.66596
m 257 54.33404
Code
mybarplot(d,2)
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(cn)

  # Now:
  data %>% select(all_of(cn))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
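
the warning comes from passing the numeric column index stored in cn straight to select(). Following the hint in the warning text, the helpers could be updated to wrap the index in all_of(); a minimal sketch of the changed line inside mybarplot() and mypie() (the plots themselves are unaffected):

Code
v=df|>select(all_of(cn))   # replaces v=df|>select(c(cn))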

Code
mybarplot(d,4)

Code
mybarplot(d,5)

Code
mybarplot(d,9)

Code
mypie(d,2)

Code
mypie(d,4)

Code
mypie(d,5)

Code
mypie(d,9)

Code
plot(d$salbegin,d$salary,col="lightgreen")

Code
boxplot(salary~gender,d,col="lightgreen",main="boxplot of salary vs gender")

Code
boxplot(salary~jobcat,d,col="lightgreen",main="boxplot of salary vs jobcategory")

Code
boxplot(salary~educ,d,col="lightgreen",main="boxplot of salary vs educ")

Code
boxplot(salary~minority,d,col="lightgreen",main="boxplot of salary vs minority")

Code
hist(d$salary,
     main="histogram of salary",
     xlab="salary",
     ylab="frequency",col="lightgreen")

Code
hist(d$salbegin,
     main="histogram of beginning salary",
     xlab="beginning salary",
     ylab="frequency",col="lightgreen")

Code
hist(d$educ,
     main="histogram of education",
     xlab="education",
     ylab="frequency",col="lightgreen")

structure of the cleaned employee data (do)

Code
str(do)
'data.frame':   473 obs. of  10 variables:
 $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ gender  : chr  "m" "m" "f" "f" ...
 $ bdate   : Date, format: "1982-02-03" "1988-05-23" ...
 $ educ    : int  15 16 12 8 15 15 15 12 15 12 ...
 $ jobcat  : int  3 1 1 1 1 1 1 1 1 1 ...
 $ salbegin: int  27000 18750 12000 13200 21000 13500 18750 9750 12750 13500 ...
 $ jobtime : int  98 98 98 98 98 98 98 98 98 98 ...
 $ prevexp : int  144 36 381 190 138 67 114 0 115 244 ...
 $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
 $ salary  : int  57000 40200 21450 21900 45000 32100 36000 21900 27900 24000 ...

calculating employee age & replacing gender with a numerical value using the pipe operator

Code
do1=d|>mutate(age=myage(bdate),
gn=ifelse(gender=="f",0,1))|> select(-c(2,3)) |>select(-c(8),c(8))
head(do1)
id educ jobcat salbegin jobtime prevexp minority age gn salary
1 15 3 27000 98 144 0 43 1 57000
2 16 1 18750 98 36 0 37 1 40200
3 12 1 12000 98 381 0 66 0 21450
4 8 1 13200 98 190 0 48 0 21900
5 15 1 21000 98 138 0 40 1 45000
6 15 1 13500 98 67 0 37 1 32100
Code
str(do1)
'data.frame':   473 obs. of  10 variables:
 $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ educ    : int  15 16 12 8 15 15 15 12 15 12 ...
 $ jobcat  : int  3 1 1 1 1 1 1 1 1 1 ...
 $ salbegin: int  27000 18750 12000 13200 21000 13500 18750 9750 12750 13500 ...
 $ jobtime : int  98 98 98 98 98 98 98 98 98 98 ...
 $ prevexp : int  144 36 381 190 138 67 114 0 115 244 ...
 $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
 $ age     : num  43 37 66 48 40 37 39 29 49 49 ...
 $ gn      : num  1 1 0 0 1 1 1 0 0 0 ...
 $ salary  : int  57000 40200 21450 21900 45000 32100 36000 21900 27900 24000 ...

performing linear regression

split the data

Code
split_ratio=2/3
tmp=sample(1:nrow(do1),nrow(do1)*split_ratio)
 # tmp
  do1train=do1[tmp,]
  do1test=do1[-tmp,]
  nrow(do1train);nrow(do1test)
[1] 315
[1] 158
Code
  mylm=lm(salary~.,do1train)
  mylm

Call:
lm(formula = salary ~ ., data = do1train)

Coefficients:
(Intercept)           id         educ       jobcat     salbegin      jobtime  
  35427.183      -33.270      453.599     4299.745        1.394     -312.294  
    prevexp     minority          age           gn  
    -14.229    -1091.049      -75.782     1524.412  
Code
  summary(mylm)

Call:
lm(formula = salary ~ ., data = do1train)

Residuals:
   Min     1Q Median     3Q    Max 
-19682  -3329   -528   2487  45676 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35427.1833 62297.5212   0.569   0.5700    
id            -33.2696    46.2515  -0.719   0.4725    
educ          453.5995   179.2839   2.530   0.0119 *  
jobcat       4299.7455   698.7595   6.153 2.38e-09 ***
salbegin        1.3944     0.0743  18.769  < 2e-16 ***
jobtime      -312.2943   629.8997  -0.496   0.6204    
prevexp       -14.2288     6.0757  -2.342   0.0198 *  
minority    -1091.0494   962.6457  -1.133   0.2579    
age           -75.7820    53.3130  -1.421   0.1562    
gn           1524.4123   917.1525   1.662   0.0975 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6375 on 305 degrees of freedom
Multiple R-squared:  0.8641,    Adjusted R-squared:  0.8601 
F-statistic: 215.5 on 9 and 305 DF,  p-value: < 2.2e-16
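several predictors (id, jobtime, minority, age) are not significant at the 5% level in this fit. An optional refinement, sketched here but not run, is to refit a reduced model with update() and compare the two fits with an F-test:

Code
mylm2=update(mylm, . ~ . - id - jobtime - minority - age)
summary(mylm2)
anova(mylm2,mylm)   # F-test comparing the reduced and full models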
Code
  pred=predict(mylm,do1test)
  pred=round(pred,0)
  #paste(pred,do1test$salary,sep=" - ")
  plot(do1test$salary,pred)

Code
  tmpcor=cor(pred,do1test$salary)
  tmpcor
[1] 0.8972594
Code
  RMSE = sqrt(mean((pred - do1test$salary)^2))
  RMSE
[1] 7723.815
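besides the correlation and RMSE, the hold-out R-squared and mean absolute error can be computed from the same predictions; a minimal sketch:

Code
ss_res=sum((do1test$salary-pred)^2)
ss_tot=sum((do1test$salary-mean(do1test$salary))^2)
r2_test=1-ss_res/ss_tot               # R-squared on the test set
mae=mean(abs(pred-do1test$salary))    # mean absolute error
r2_test;mae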

create dummy data of employees for prediction

Code
str(do1)
'data.frame':   473 obs. of  10 variables:
 $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ educ    : int  15 16 12 8 15 15 15 12 15 12 ...
 $ jobcat  : int  3 1 1 1 1 1 1 1 1 1 ...
 $ salbegin: int  27000 18750 12000 13200 21000 13500 18750 9750 12750 13500 ...
 $ jobtime : int  98 98 98 98 98 98 98 98 98 98 ...
 $ prevexp : int  144 36 381 190 138 67 114 0 115 244 ...
 $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
 $ age     : num  43 37 66 48 40 37 39 29 49 49 ...
 $ gn      : num  1 1 0 0 1 1 1 0 0 0 ...
 $ salary  : int  57000 40200 21450 21900 45000 32100 36000 21900 27900 24000 ...
Code
n=20
id=round(runif(n,474,550))
gender=(sample(c("m","f"),n,replace=T))
bdate=runif(n,min(do$bdate),max(d$bdate))
bdate=as.Date(bdate)
educ=round(runif(n,min(do1$educ),max(do1$educ)))
jobcat=round(runif(n,min(do1$jobcat),max(do1$jobcat)))
salbegin=round(runif(n,min(do1$salbegin),max(do1$salbegin)))
jobtime=round(runif(n,min(do1$jobtime),max(do1$jobtime)))
prevexp=round(runif(n,min(do1$prevexp),max(do1$prevexp)))
minority=round(runif(n,min(do1$minority),max(do1$minority)))
age=round(runif(n,min(do1$age),max(do1$age)))
gendern=(sample(c("m","f"),n,replace=T))
dpred=data.frame(id,gender,bdate,educ,jobcat,salbegin,jobtime,prevexp,minority)
dpred1=dpred|>mutate(age=myage(bdate),
gn=ifelse(gender=="f",0,1))|> select(-c(2,3)) |>select(-c(8),c(8))
dpred1
id educ jobcat salbegin jobtime prevexp minority gn age
525 14 3 39198 73 399 11 1 33
534 19 1 28098 86 108 21 0 30
483 14 2 57865 82 3 9 1 66
483 8 3 9757 67 386 16 1 39
487 13 1 67078 76 310 20 1 58
500 8 2 45325 89 261 19 0 31
478 9 1 45498 97 263 12 1 51
545 17 2 58439 90 317 9 1 50
505 11 1 73943 68 248 9 1 56
528 14 2 56822 83 147 12 1 32
527 15 2 41003 81 298 13 0 31
475 18 2 18584 92 429 9 1 40
500 19 2 70025 97 392 17 0 43
547 20 3 76414 78 354 19 1 43
480 13 2 42040 73 370 12 0 28
488 10 3 36818 83 17 13 0 47
494 13 1 32092 97 251 12 0 50
484 9 1 9790 85 304 17 0 27
546 19 1 49217 93 171 20 1 66
542 13 2 62887 84 265 16 1 44

predicting salary for new employee

Code
finpred=predict(mylm,dpred1)
finpred
         1          2          3          4          5          6          7 
 50417.149  16181.048  76050.184   4187.694  70121.044  39636.967  41883.299 
         8          9         10         11         12         13         14 
 70394.978  93721.538  70040.830  44405.257  16141.271  75979.843  93895.254 
        15         16         17         18         19         20 
 49300.048  44059.956  23193.510 -10106.061  42036.022  70313.911 
Code
dpred1$salary=round(finpred)
dpred1
id educ jobcat salbegin jobtime prevexp minority gn age salary
525 14 3 39198 73 399 11 1 33 50417
534 19 1 28098 86 108 21 0 30 16181
483 14 2 57865 82 3 9 1 66 76050
483 8 3 9757 67 386 16 1 39 4188
487 13 1 67078 76 310 20 1 58 70121
500 8 2 45325 89 261 19 0 31 39637
478 9 1 45498 97 263 12 1 51 41883
545 17 2 58439 90 317 9 1 50 70395
505 11 1 73943 68 248 9 1 56 93722
528 14 2 56822 83 147 12 1 32 70041
527 15 2 41003 81 298 13 0 31 44405
475 18 2 18584 92 429 9 1 40 16141
500 19 2 70025 97 392 17 0 43 75980
547 20 3 76414 78 354 19 1 43 93895
480 13 2 42040 73 370 12 0 28 49300
488 10 3 36818 83 17 13 0 47 44060
494 13 1 32092 97 251 12 0 50 23194
484 9 1 9790 85 304 17 0 27 -10106
546 19 1 49217 93 171 20 1 66 42036
542 13 2 62887 84 265 16 1 44 70314
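note that one simulated employee gets a negative predicted salary: an unconstrained linear model can extrapolate outside the plausible range for extreme inputs. One common remedy, sketched here as an illustration (not part of the fitted model above), is to model log(salary) instead so that back-transformed predictions are always positive:

Code
do1train_log=do1train|>mutate(lsalary=log(salary))|>select(-salary)
mylm_log=lm(lsalary~.,do1train_log)
round(exp(predict(mylm_log,dpred1)))   # predictions back on the salary scale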

analysis & prediction of species using iris dataset

loading iris dataset

Code
d=iris|>janitor::clean_names()
head(d)
sepal_length sepal_width petal_length petal_width species
5.1 3.5 1.4 0.2 setosa
4.9 3.0 1.4 0.2 setosa
4.7 3.2 1.3 0.2 setosa
4.6 3.1 1.5 0.2 setosa
5.0 3.6 1.4 0.2 setosa
5.4 3.9 1.7 0.4 setosa
Code
tail(d)
sepal_length sepal_width petal_length petal_width species
145 6.7 3.3 5.7 2.5 virginica
146 6.7 3.0 5.2 2.3 virginica
147 6.3 2.5 5.0 1.9 virginica
148 6.5 3.0 5.2 2.0 virginica
149 6.2 3.4 5.4 2.3 virginica
150 5.9 3.0 5.1 1.8 virginica
Code
str(d)
'data.frame':   150 obs. of  5 variables:
 $ sepal_length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ sepal_width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ petal_length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ petal_width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
Code
summary(d)
  sepal_length    sepal_width     petal_length    petal_width   
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
 Median :5.800   Median :3.000   Median :4.350   Median :1.300  
 Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
       species  
 setosa    :50  
 versicolor:50  
 virginica :50  
                
                
                

partitioning the iris dataset

Code
sr=0.7
tmp=sample(1:nrow(d),nrow(d)*sr)
dtrain=d[tmp,]
dtest=d[-tmp,]
nrow(dtrain)
[1] 105
Code
nrow(dtest)
[1] 45

creating dataset for prediction

Code
n=10
sl=runif(n,min(d$sepal_length),max(d$sepal_length))
sw=runif(n,min(d$sepal_width),max(d$sepal_width))
pl=runif(n,min(d$petal_length),max(d$petal_length))
pw=runif(n,min(d$petal_width),max(d$petal_width))
dpred=data.frame(sepal_length=sl,
                 sepal_width=sw,
                 petal_length=pl,
                 petal_width=pw
                 )
dpred[1,3]=1.2
dpred[1,4]=1
dpred[2,3]=1.2
dpred[2,4]=2

dpred
sepal_length sepal_width petal_length petal_width
5.915432 2.184280 1.200000 1.000000
5.146724 2.414594 1.200000 2.000000
4.443416 3.435550 4.443416 3.435550
4.717700 3.626872 4.717700 3.626872
6.786433 4.355298 6.786433 4.355298
6.338248 3.883521 6.338248 3.883521
6.691690 4.287332 6.691690 4.287332
6.487944 2.246687 6.487944 2.246687
5.153071 3.903833 5.153071 3.903833
5.774870 2.430127 5.774870 2.430127

creating model using rpart

Code
rp=rpart::rpart(species~.,dtrain)
rp
n= 105 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 105 66 virginica (0.34285714 0.28571429 0.37142857)  
  2) petal_length< 2.45 36  0 setosa (1.00000000 0.00000000 0.00000000) *
  3) petal_length>=2.45 69 30 virginica (0.00000000 0.43478261 0.56521739)  
    6) petal_width< 1.75 33  3 versicolor (0.00000000 0.90909091 0.09090909) *
    7) petal_width>=1.75 36  0 virginica (0.00000000 0.00000000 1.00000000) *
Code
# plot(rp)
# text(rp)
rpart.plot::rpart.plot(rp)

Code
pred=predict(rp,dtest,type="class")
pred
         8         12         17         19         22         24         30 
    setosa     setosa     setosa     setosa     setosa     setosa     setosa 
        32         33         39         45         47         48         49 
    setosa     setosa     setosa     setosa     setosa     setosa     setosa 
        51         57         60         61         64         66         68 
versicolor versicolor versicolor versicolor versicolor versicolor versicolor 
        69         70         71         74         77         80         81 
versicolor versicolor  virginica versicolor versicolor versicolor versicolor 
        82         84         89         90         95         98        102 
versicolor versicolor versicolor versicolor versicolor versicolor  virginica 
       105        114        116        122        125        126        132 
 virginica  virginica  virginica  virginica  virginica  virginica  virginica 
       134        135        149 
versicolor versicolor  virginica 
Levels: setosa versicolor virginica
Code
t=table(dtest$species,pred)
acc=sum(diag(t))/sum(t)
acc
[1] 0.9333333
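the complexity-parameter table of the fitted tree can be inspected to judge whether pruning would help; a minimal sketch using rpart's own helpers:

Code
printcp(rp)   # cross-validated error at each complexity parameter
plotcp(rp)    # visual aid for choosing a cp value to prune at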

predicting with the rpart model

Code
pred=predict(rp,dpred,type="class")
pred
        1         2         3         4         5         6         7         8 
   setosa    setosa virginica virginica virginica virginica virginica virginica 
        9        10 
virginica virginica 
Levels: setosa versicolor virginica

creating model using randomforest

Code
rf=randomForest::randomForest(species~.,dtrain,mtry=3,ntree=1000,proximity=T,importance=T)
rf

Call:
 randomForest(formula = species ~ ., data = dtrain, mtry = 3,      ntree = 1000, proximity = T, importance = T) 
               Type of random forest: classification
                     Number of trees: 1000
No. of variables tried at each split: 3

        OOB estimate of  error rate: 3.81%
Confusion matrix:
           setosa versicolor virginica class.error
setosa         36          0         0  0.00000000
versicolor      0         29         1  0.03333333
virginica       0          3        36  0.07692308
Code
summary(rf)
                Length Class  Mode     
call                7  -none- call     
type                1  -none- character
predicted         105  factor numeric  
err.rate         4000  -none- numeric  
confusion          12  -none- numeric  
votes             315  matrix numeric  
oob.times         105  -none- numeric  
classes             3  -none- character
importance         20  -none- numeric  
importanceSD       16  -none- numeric  
localImportance     0  -none- NULL     
proximity       11025  -none- numeric  
ntree               1  -none- numeric  
mtry                1  -none- numeric  
forest             14  -none- list     
y                 105  factor numeric  
test                0  -none- NULL     
inbag               0  -none- NULL     
terms               3  terms  call     
Code
pred=predict(rf,dtest)
pred
         8         12         17         19         22         24         30 
    setosa     setosa     setosa     setosa     setosa     setosa     setosa 
        32         33         39         45         47         48         49 
    setosa     setosa     setosa     setosa     setosa     setosa     setosa 
        51         57         60         61         64         66         68 
versicolor versicolor versicolor versicolor versicolor versicolor versicolor 
        69         70         71         74         77         80         81 
versicolor versicolor  virginica versicolor versicolor versicolor versicolor 
        82         84         89         90         95         98        102 
versicolor  virginica versicolor versicolor versicolor versicolor  virginica 
       105        114        116        122        125        126        132 
 virginica  virginica  virginica  virginica  virginica  virginica  virginica 
       134        135        149 
versicolor  virginica  virginica 
Levels: setosa versicolor virginica
Code
t=table(pred,dtest$species)
t
            
pred         setosa versicolor virginica
  setosa         14          0         0
  versicolor      0         18         1
  virginica       0          2        10
Code
acc=sum(diag(t))/sum(t)
acc
[1] 0.9333333
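since the forest was grown with importance=T, the variable importance scores can be examined directly; a minimal sketch:

Code
randomForest::importance(rf)   # importance scores per predictor
randomForest::varImpPlot(rf)   # dot chart of the same scores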

predicting output using rf model

Code
pred=predict(rf,dpred)
pred
        1         2         3         4         5         6         7         8 
   setosa virginica virginica virginica virginica virginica virginica virginica 
        9        10 
virginica virginica 
Levels: setosa versicolor virginica
Code
dpred$species=pred
dpred
sepal_length sepal_width petal_length petal_width species
5.915432 2.184280 1.200000 1.000000 setosa
5.146724 2.414594 1.200000 2.000000 virginica
4.443416 3.435550 4.443416 3.435550 virginica
4.717700 3.626872 4.717700 3.626872 virginica
6.786433 4.355298 6.786433 4.355298 virginica
6.338248 3.883521 6.338248 3.883521 virginica
6.691690 4.287332 6.691690 4.287332 virginica
6.487944 2.246687 6.487944 2.246687 virginica
5.153071 3.903833 5.153071 3.903833 virginica
5.774870 2.430127 5.774870 2.430127 virginica

clustering

Code
df <- iris[,c(1:4)]

# Omitting any NA values
df <- na.omit(df)

# Scaling dataset
df <- scale(df)

# save the output as a PNG file
png(file = "KMeansExample.png")

km <- kmeans(df, centers = 3,iter.max = 30, nstart = 25)
#str(km)
km
K-means clustering with 3 clusters of sizes 47, 50, 53

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   1.13217737  0.08812645    0.9928284   1.0141287
2  -1.01119138  0.85041372   -1.3006301  -1.2507035
3  -0.05005221 -0.88042696    0.3465767   0.2805873

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  2   2   2   2   2   2   2   2   2   2   1   1   1   3   3   3   1   3   3   3 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  3   3   3   3   3   1   3   3   3   3   1   3   3   3   3   1   1   1   3   3 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  3   3   3   3   3   1   1   3   3   3   3   3   3   3   3   3   3   3   3   3 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   3   1   1   1   1   3   1   1   1   1   1   1   3   3   1   1   1   1   3 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   3   1   3   1   1   3   1   1   1   1   1   1   3   3   1   1   1   3   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   3   1   1   1   3   1   1   3 

Within cluster sum of squares by cluster:
[1] 47.45019 47.35062 44.08754
 (between_SS / total_SS =  76.7 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
Code
# Visualize the clusters
fviz_cluster(km, data = df)

# saving the file 
dev.off()
png 
  2 
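because the true species labels are known for iris, the 3-cluster solution can be cross-tabulated against them to see how well the clusters recover the species; a minimal sketch (km is still the 3-centre fit at this point):

Code
table(iris$Species,km$cluster)
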
Code
# save the output as a PNG file
png(file = "KMeansExample2.png")

km <- kmeans(df, centers = 4,iter.max = 30, nstart = 25)
# Visualize the clusters
fviz_cluster(km, data = df)
km
K-means clustering with 4 clusters of sizes 47, 25, 53, 25

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   1.13217737  0.08812645    0.9928284   1.0141287
2  -0.71894419  1.50198969   -1.2972312  -1.2165934
3  -0.05005221 -0.88042696    0.3465767   0.2805873
4  -1.30343857  0.19883774   -1.3040289  -1.2848136

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  2   4   4   4   2   2   4   4   4   4   2   4   4   4   2   2   2   2   2   2 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  2   2   2   4   4   4   4   2   2   4   4   2   2   2   4   4   2   2   4   4 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  2   4   4   2   2   4   2   4   2   4   1   1   1   3   3   3   1   3   3   3 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  3   3   3   3   3   1   3   3   3   3   1   3   3   3   3   1   1   1   3   3 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  3   3   3   3   3   1   1   3   3   3   3   3   3   3   3   3   3   3   3   3 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   3   1   1   1   1   3   1   1   1   1   1   1   3   3   1   1   1   1   3 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   3   1   3   1   1   3   1   1   1   1   1   1   3   3   1   1   1   3   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   3   1   1   1   3   1   1   3 

Within cluster sum of squares by cluster:
[1] 47.450194 12.147537 44.087545  9.646348
 (between_SS / total_SS =  81.0 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
Code
# saving the file 
dev.off()
png 
  2 
Code
#str(km)
km
K-means clustering with 4 clusters of sizes 47, 25, 53, 25

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   1.13217737  0.08812645    0.9928284   1.0141287
2  -0.71894419  1.50198969   -1.2972312  -1.2165934
3  -0.05005221 -0.88042696    0.3465767   0.2805873
4  -1.30343857  0.19883774   -1.3040289  -1.2848136

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  2   4   4   4   2   2   4   4   4   4   2   4   4   4   2   2   2   2   2   2 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  2   2   2   4   4   4   4   2   2   4   4   2   2   2   4   4   2   2   4   4 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  2   4   4   2   2   4   2   4   2   4   1   1   1   3   3   3   1   3   3   3 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  3   3   3   3   3   1   3   3   3   3   1   3   3   3   3   1   1   1   3   3 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  3   3   3   3   3   1   1   3   3   3   3   3   3   3   3   3   3   3   3   3 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   3   1   1   1   1   3   1   1   1   1   1   1   3   3   1   1   1   1   3 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   3   1   3   1   1   3   1   1   1   1   1   1   3   3   1   1   1   3   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   3   1   1   1   3   1   1   3 

Within cluster sum of squares by cluster:
[1] 47.450194 12.147537 44.087545  9.646348
 (between_SS / total_SS =  81.0 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
Code
df
    Sepal.Length Sepal.Width Petal.Length   Petal.Width
1    -0.89767388  1.01560199  -1.33575163 -1.3110521482
2    -1.13920048 -0.13153881  -1.33575163 -1.3110521482
3    -1.38072709  0.32731751  -1.39239929 -1.3110521482
4    -1.50149039  0.09788935  -1.27910398 -1.3110521482
5    -1.01843718  1.24503015  -1.33575163 -1.3110521482
6    -0.53538397  1.93331463  -1.16580868 -1.0486667950
7    -1.50149039  0.78617383  -1.33575163 -1.1798594716
8    -1.01843718  0.78617383  -1.27910398 -1.3110521482
9    -1.74301699 -0.36096697  -1.33575163 -1.3110521482
10   -1.13920048  0.09788935  -1.27910398 -1.4422448248
11   -0.53538397  1.47445831  -1.27910398 -1.3110521482
12   -1.25996379  0.78617383  -1.22245633 -1.3110521482
13   -1.25996379 -0.13153881  -1.33575163 -1.4422448248
14   -1.86378030 -0.13153881  -1.50569459 -1.4422448248
15   -0.05233076  2.16274279  -1.44904694 -1.3110521482
16   -0.17309407  3.08045544  -1.27910398 -1.0486667950
17   -0.53538397  1.93331463  -1.39239929 -1.0486667950
18   -0.89767388  1.01560199  -1.33575163 -1.1798594716
19   -0.17309407  1.70388647  -1.16580868 -1.1798594716
20   -0.89767388  1.70388647  -1.27910398 -1.1798594716
21   -0.53538397  0.78617383  -1.16580868 -1.3110521482
22   -0.89767388  1.47445831  -1.27910398 -1.0486667950
23   -1.50149039  1.24503015  -1.56234224 -1.3110521482
24   -0.89767388  0.55674567  -1.16580868 -0.9174741184
25   -1.25996379  0.78617383  -1.05251337 -1.3110521482
26   -1.01843718 -0.13153881  -1.22245633 -1.3110521482
27   -1.01843718  0.78617383  -1.22245633 -1.0486667950
28   -0.77691058  1.01560199  -1.27910398 -1.3110521482
29   -0.77691058  0.78617383  -1.33575163 -1.3110521482
30   -1.38072709  0.32731751  -1.22245633 -1.3110521482
31   -1.25996379  0.09788935  -1.22245633 -1.3110521482
32   -0.53538397  0.78617383  -1.27910398 -1.0486667950
33   -0.77691058  2.39217095  -1.27910398 -1.4422448248
34   -0.41462067  2.62159911  -1.33575163 -1.3110521482
35   -1.13920048  0.09788935  -1.27910398 -1.3110521482
36   -1.01843718  0.32731751  -1.44904694 -1.3110521482
37   -0.41462067  1.01560199  -1.39239929 -1.3110521482
38   -1.13920048  1.24503015  -1.33575163 -1.4422448248
39   -1.74301699 -0.13153881  -1.39239929 -1.3110521482
40   -0.89767388  0.78617383  -1.27910398 -1.3110521482
41   -1.01843718  1.01560199  -1.39239929 -1.1798594716
42   -1.62225369 -1.73753594  -1.39239929 -1.1798594716
43   -1.74301699  0.32731751  -1.39239929 -1.3110521482
44   -1.01843718  1.01560199  -1.22245633 -0.7862814418
45   -0.89767388  1.70388647  -1.05251337 -1.0486667950
46   -1.25996379 -0.13153881  -1.33575163 -1.1798594716
47   -0.89767388  1.70388647  -1.22245633 -1.3110521482
48   -1.50149039  0.32731751  -1.33575163 -1.3110521482
49   -0.65614727  1.47445831  -1.27910398 -1.3110521482
50   -1.01843718  0.55674567  -1.33575163 -1.3110521482
51    1.39682886  0.32731751   0.53362088  0.2632599711
52    0.67224905  0.32731751   0.42032558  0.3944526477
53    1.27606556  0.09788935   0.64691619  0.3944526477
54   -0.41462067 -1.73753594   0.13708732  0.1320672944
55    0.79301235 -0.59039513   0.47697323  0.3944526477
56   -0.17309407 -0.59039513   0.42032558  0.1320672944
57    0.55148575  0.55674567   0.53362088  0.5256453243
58   -1.13920048 -1.50810778  -0.25944625 -0.2615107354
59    0.91377565 -0.36096697   0.47697323  0.1320672944
60   -0.77691058 -0.81982329   0.08043967  0.2632599711
61   -1.01843718 -2.42582042  -0.14615094 -0.2615107354
62    0.06843254 -0.13153881   0.25038262  0.3944526477
63    0.18919584 -1.96696410   0.13708732 -0.2615107354
64    0.30995914 -0.36096697   0.53362088  0.2632599711
65   -0.29385737 -0.36096697  -0.08950329  0.1320672944
66    1.03453895  0.09788935   0.36367793  0.2632599711
67   -0.29385737 -0.13153881   0.42032558  0.3944526477
68   -0.05233076 -0.81982329   0.19373497 -0.2615107354
69    0.43072244 -1.96696410   0.42032558  0.3944526477
70   -0.29385737 -1.27867961   0.08043967 -0.1303180588
71    0.06843254  0.32731751   0.59026853  0.7880306775
72    0.30995914 -0.59039513   0.13708732  0.1320672944
73    0.55148575 -1.27867961   0.64691619  0.3944526477
74    0.30995914 -0.59039513   0.53362088  0.0008746178
75    0.67224905 -0.36096697   0.30703027  0.1320672944
76    0.91377565 -0.13153881   0.36367793  0.2632599711
77    1.15530226 -0.59039513   0.59026853  0.2632599711
78    1.03453895 -0.13153881   0.70356384  0.6568380009
79    0.18919584 -0.36096697   0.42032558  0.3944526477
80   -0.17309407 -1.04925145  -0.14615094 -0.2615107354
81   -0.41462067 -1.50810778   0.02379201 -0.1303180588
82   -0.41462067 -1.50810778  -0.03285564 -0.2615107354
83   -0.05233076 -0.81982329   0.08043967  0.0008746178
84    0.18919584 -0.81982329   0.76021149  0.5256453243
85   -0.53538397 -0.13153881   0.42032558  0.3944526477
86    0.18919584  0.78617383   0.42032558  0.5256453243
87    1.03453895  0.09788935   0.53362088  0.3944526477
88    0.55148575 -1.73753594   0.36367793  0.1320672944
89   -0.29385737 -0.13153881   0.19373497  0.1320672944
90   -0.41462067 -1.27867961   0.13708732  0.1320672944
91   -0.41462067 -1.04925145   0.36367793  0.0008746178
92    0.30995914 -0.13153881   0.47697323  0.2632599711
93   -0.05233076 -1.04925145   0.13708732  0.0008746178
94   -1.01843718 -1.73753594  -0.25944625 -0.2615107354
95   -0.29385737 -0.81982329   0.25038262  0.1320672944
96   -0.17309407 -0.13153881   0.25038262  0.0008746178
97   -0.17309407 -0.36096697   0.25038262  0.1320672944
98    0.43072244 -0.36096697   0.30703027  0.1320672944
99   -0.89767388 -1.27867961  -0.42938920 -0.1303180588
100  -0.17309407 -0.59039513   0.19373497  0.1320672944
101   0.55148575  0.55674567   1.27004036  1.7063794137
102  -0.05233076 -0.81982329   0.76021149  0.9192233541
103   1.51759216 -0.13153881   1.21339271  1.1816087073
104   0.55148575 -0.36096697   1.04344975  0.7880306775
105   0.79301235 -0.13153881   1.15674505  1.3128013839
106   2.12140867 -0.13153881   1.60992627  1.1816087073
107  -1.13920048 -1.27867961   0.42032558  0.6568380009
108   1.75911877 -0.36096697   1.43998331  0.7880306775
109   1.03453895 -1.27867961   1.15674505  0.7880306775
110   1.63835547  1.24503015   1.32668801  1.7063794137
111   0.79301235  0.32731751   0.76021149  1.0504160307
112   0.67224905 -0.81982329   0.87350679  0.9192233541
113   1.15530226 -0.13153881   0.98680210  1.1816087073
114  -0.17309407 -1.27867961   0.70356384  1.0504160307
115  -0.05233076 -0.59039513   0.76021149  1.5751867371
116   0.67224905  0.32731751   0.87350679  1.4439940605
117   0.79301235 -0.13153881   0.98680210  0.7880306775
118   2.24217198  1.70388647   1.66657392  1.3128013839
119   2.24217198 -1.04925145   1.77986923  1.4439940605
120   0.18919584 -1.96696410   0.70356384  0.3944526477
121   1.27606556  0.32731751   1.10009740  1.4439940605
122  -0.29385737 -0.59039513   0.64691619  1.0504160307
123   2.24217198 -0.59039513   1.66657392  1.0504160307
124   0.55148575 -0.81982329   0.64691619  0.7880306775
125   1.03453895  0.55674567   1.10009740  1.1816087073
126   1.63835547  0.32731751   1.27004036  0.7880306775
127   0.43072244 -0.59039513   0.59026853  0.7880306775
128   0.30995914 -0.13153881   0.64691619  0.7880306775
129   0.67224905 -0.59039513   1.04344975  1.1816087073
130   1.63835547 -0.13153881   1.15674505  0.5256453243
131   1.87988207 -0.59039513   1.32668801  0.9192233541
132   2.48369858  1.70388647   1.49663097  1.0504160307
133   0.67224905 -0.59039513   1.04344975  1.3128013839
134   0.55148575 -0.59039513   0.76021149  0.3944526477
135   0.30995914 -1.04925145   1.04344975  0.2632599711
136   2.24217198 -0.13153881   1.32668801  1.4439940605
137   0.55148575  0.78617383   1.04344975  1.5751867371
138   0.67224905  0.09788935   0.98680210  0.7880306775
139   0.18919584 -0.13153881   0.59026853  0.7880306775
140   1.27606556  0.09788935   0.93015445  1.1816087073
141   1.03453895  0.09788935   1.04344975  1.5751867371
142   1.27606556  0.09788935   0.76021149  1.4439940605
143  -0.05233076 -0.81982329   0.76021149  0.9192233541
144   1.15530226  0.32731751   1.21339271  1.4439940605
145   1.03453895  0.55674567   1.10009740  1.7063794137
146   1.03453895 -0.13153881   0.81685914  1.4439940605
147   0.55148575 -1.27867961   0.70356384  0.9192233541
148   0.79301235 -0.13153881   0.81685914  1.0504160307
149   0.43072244  0.78617383   0.93015445  1.4439940605
150   0.06843254 -0.13153881   0.76021149  0.7880306775
attr(,"scaled:center")
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    5.843333     3.057333     3.758000     1.199333 
attr(,"scaled:scale")
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
   0.8280661    0.4358663    1.7652982    0.7622377 
Code
km=list()
for(i in 2:10){
  p=paste("k",i,".png",sep="")
  png(file=p)
  km[[i]]=kmeans(df,centers=i,iter.max = 30,nstart = 25,trace = F)
  # inside a loop the plot must be printed explicitly to reach the png device
  print(fviz_cluster(km[[i]], data = df,ellipse.type="norm"))
  dev.off()
}

km
[[1]]
NULL

[[2]]
K-means clustering with 2 clusters of sizes 100, 50

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1    0.5055957  -0.4252069     0.650315   0.6253518
2   -1.0111914   0.8504137    -1.300630  -1.2507035

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  2   2   2   2   2   2   2   2   2   2   1   1   1   1   1   1   1   1   1   1 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   1   1   1   1   1   1   1   1 

Within cluster sum of squares by cluster:
[1] 173.52867  47.35062
 (between_SS / total_SS =  62.9 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[3]]
K-means clustering with 3 clusters of sizes 47, 53, 50

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   1.13217737  0.08812645    0.9928284   1.0141287
2  -0.05005221 -0.88042696    0.3465767   0.2805873
3  -1.01119138  0.85041372   -1.3006301  -1.2507035

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  3   3   3   3   3   3   3   3   3   3   1   1   1   2   2   2   1   2   2   2 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  2   2   2   2   2   1   2   2   2   2   1   2   2   2   2   1   1   1   2   2 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  2   2   2   2   2   1   1   2   2   2   2   2   2   2   2   2   2   2   2   2 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   2   1   1   1   1   2   1   1   1   1   1   1   2   2   1   1   1   1   2 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   2   1   2   1   1   2   1   1   1   1   1   1   2   2   1   1   1   2   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   2   1   1   1   2   1   1   2 

Within cluster sum of squares by cluster:
[1] 47.45019 44.08754 47.35062
 (between_SS / total_SS =  76.7 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[4]]
K-means clustering with 4 clusters of sizes 53, 25, 47, 25

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1  -0.05005221 -0.88042696    0.3465767   0.2805873
2  -1.30343857  0.19883774   -1.3040289  -1.2848136
3   1.13217737  0.08812645    0.9928284   1.0141287
4  -0.71894419  1.50198969   -1.2972312  -1.2165934

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  4   2   2   2   4   4   2   2   2   2   4   2   2   2   4   4   4   4   4   4 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  4   4   4   2   2   2   2   4   4   2   2   4   4   4   2   2   4   4   2   2 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  4   2   2   4   4   2   4   2   4   2   3   3   3   1   1   1   3   1   1   1 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  1   1   1   1   1   3   1   1   1   1   3   1   1   1   1   3   3   3   1   1 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  1   1   1   1   1   3   3   1   1   1   1   1   1   1   1   1   1   1   1   1 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  3   1   3   3   3   3   1   3   3   3   3   3   3   1   1   3   3   3   3   1 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  3   1   3   1   3   3   1   3   3   3   3   3   3   1   1   3   3   3   1   3 
141 142 143 144 145 146 147 148 149 150 
  3   3   1   3   3   3   1   3   3   1 

Within cluster sum of squares by cluster:
[1] 44.087545  9.646348 47.450194 12.147537
 (between_SS / total_SS =  81.0 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[5]]
K-means clustering with 5 clusters of sizes 29, 48, 25, 23, 25

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1    1.3926646   0.2323817    1.1567451  1.21327591
2    0.3804044  -0.3896455    0.6067908  0.56390985
3   -1.3034386   0.1988377   -1.3040289 -1.28481361
4   -0.3516137  -1.3285553    0.1026061  0.01228268
5   -0.7189442   1.5019897   -1.2972312 -1.21659342

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  5   3   3   3   5   5   3   3   3   3   5   3   3   3   5   5   5   5   5   5 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  5   5   5   3   3   3   3   5   5   3   3   5   5   5   3   3   5   5   3   3 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  5   3   3   5   5   3   5   3   5   3   1   2   1   4   2   2   2   4   2   4 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  4   2   4   2   4   2   2   4   4   4   2   2   2   2   2   2   2   2   2   4 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  4   4   4   2   2   2   2   4   2   4   4   2   4   4   4   2   2   2   4   4 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   2   1   2   1   1   4   1   2   1   1   2   1   2   2   1   2   1   1   4 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   2   1   2   1   1   2   2   2   1   1   1   2   2   2   1   1   2   2   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   2   1   1   1   2   2   1   2 

Within cluster sum of squares by cluster:
[1] 26.891293 27.830133  9.646348 13.686590 12.147537
 (between_SS / total_SS =  84.9 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[6]]
K-means clustering with 6 clusters of sizes 29, 21, 38, 25, 25, 12

Cluster means:
  Sepal.Length Sepal.Width Petal.Length   Petal.Width
1    0.8596404   0.1928251    0.8520198  1.0504160307
2   -0.3628650  -1.4097814    0.1074147  0.0008746178
3    0.2527555  -0.5360569    0.5470374  0.4911209357
4   -1.3034386   0.1988377   -1.3040289 -1.2848136129
5   -0.7189442   1.5019897   -1.2972312 -1.2165934210
6    1.9704545   0.1552464    1.4399833  1.1160123690

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  5   4   4   4   5   5   4   4   4   4   5   4   4   4   5   5   5   5   5   5 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  5   5   5   4   4   4   4   5   5   4   4   5   5   5   4   4   5   5   4   4 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  5   4   4   5   5   4   5   4   5   4   1   1   1   2   3   3   1   2   3   2 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  2   3   2   3   3   1   3   2   2   2   1   3   3   3   3   3   3   1   3   2 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  2   2   2   3   3   1   1   2   3   2   2   3   2   2   2   3   3   3   2   3 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   3   6   3   1   6   2   6   3   6   1   3   1   3   3   1   1   6   6   2 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   3   6   3   1   6   3   3   1   6   6   6   1   3   3   6   1   1   3   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   3   1   1   1   3   1   1   3 

Within cluster sum of squares by cluster:
[1] 14.596105 11.951942 19.109637  9.646348 12.147537 12.013666
 (between_SS / total_SS =  86.7 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[7]]
K-means clustering with 7 clusters of sizes 12, 38, 12, 21, 21, 29, 17

Cluster means:
  Sepal.Length Sepal.Width Petal.Length   Petal.Width
1    1.9704545  0.15524639    1.4399833  1.1160123690
2    0.2527555 -0.53605688    0.5470374  0.4911209357
3   -0.5454476  1.99067167   -1.2649421 -1.2126576408
4   -0.3628650 -1.40978142    0.1074147  0.0008746178
5   -0.9666815  0.92820079   -1.2925915 -1.2173430935
6    0.8596404  0.19282514    0.8520198  1.0504160307
7   -1.3949345 -0.05056417   -1.3357516 -1.3187693645

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  5   7   7   7   5   3   5   5   7   7   3   5   7   7   3   3   3   5   3   3 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  5   5   5   5   5   7   5   5   5   7   7   5   3   3   7   7   5   5   7   5 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  5   7   7   5   3   7   3   7   3   5   6   6   6   4   2   2   6   4   2   4 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  4   2   4   2   2   6   2   4   4   4   6   2   2   2   2   2   2   6   2   4 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  4   4   4   2   2   6   6   4   2   4   4   2   4   4   4   2   2   2   4   2 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  6   2   1   2   6   1   4   1   2   1   6   2   6   2   2   6   6   1   1   4 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  6   2   1   2   6   1   2   2   6   1   1   1   6   2   2   1   6   6   2   6 
141 142 143 144 145 146 147 148 149 150 
  6   6   2   6   6   6   2   6   6   2 

Within cluster sum of squares by cluster:
[1] 12.013666 19.109637  3.954505 11.951942  3.397867 14.596105  5.163861
 (between_SS / total_SS =  88.2 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[8]]
K-means clustering with 8 clusters of sizes 21, 27, 19, 21, 12, 17, 21, 12

Cluster means:
  Sepal.Length Sepal.Width Petal.Length   Petal.Width
1    0.9540301  0.21806600    0.9409445  1.1941032479
2    0.2383957 -0.14853349    0.3972469  0.3264268153
3    0.4243665 -0.80774813    0.7900260  0.8018404329
4   -0.9666815  0.92820079   -1.2925915 -1.2173430935
5    1.9704545  0.15524639    1.4399833  1.1160123690
6   -1.3949345 -0.05056417   -1.3357516 -1.3187693645
7   -0.3628650 -1.40978142    0.1074147  0.0008746178
8   -0.5454476  1.99067167   -1.2649421 -1.2126576408

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  4   6   6   6   4   8   4   4   6   6   8   4   6   6   8   8   8   4   8   8 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  4   4   4   4   4   6   4   4   4   6   6   4   8   8   6   6   4   4   6   4 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  4   6   6   4   8   6   8   6   8   4   1   2   1   7   3   2   2   7   2   7 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  7   2   7   2   2   2   2   7   7   7   2   2   3   2   2   2   3   1   2   7 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  7   7   7   3   2   2   2   7   2   7   7   2   7   7   7   2   2   2   7   2 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   3   5   3   1   5   7   5   3   5   1   3   1   3   3   1   1   5   5   7 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   3   5   3   1   5   3   2   3   5   5   5   3   3   3   5   1   1   2   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   3   1   1   1   3   1   1   2 

Within cluster sum of squares by cluster:
[1]  7.551820 10.338980  7.448417  3.397867 12.013666  5.163861 11.951942
[8]  3.954505
 (between_SS / total_SS =  89.6 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[9]]
K-means clustering with 9 clusters of sizes 16, 23, 11, 18, 14, 21, 18, 17, 12

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   0.98170501  0.35599603   1.02574736  1.42759498
2   0.69325136 -0.05173771   0.60997207  0.51423727
3   2.00064537  0.05617514   1.45028289  1.06234264
4   0.34350450 -1.04925145   0.75706440  0.79531916
5  -0.59576562 -1.45894460   0.00760697 -0.05535081
6  -0.96668148  0.92820079  -1.29259152 -1.21734309
7  -0.05233076 -0.46293504   0.27241226  0.14664426
8  -1.39493454 -0.05056417  -1.33575163 -1.31876936
9  -0.54544758  1.99067167  -1.26494207 -1.21265764

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  6   8   8   8   6   9   6   6   8   8   9   6   8   8   9   9   9   6   9   9 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  6   6   6   6   6   8   6   6   6   8   8   6   9   9   8   8   6   6   8   6 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  6   8   8   6   9   8   9   8   9   6   2   2   2   5   2   7   2   5   2   5 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  5   7   5   7   7   2   7   7   4   5   2   7   4   7   2   2   2   2   7   5 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  5   5   7   4   7   2   2   4   7   5   5   2   7   5   7   7   7   7   5   7 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   4   3   2   1   3   5   3   4   1   1   4   1   4   4   1   2   3   3   4 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   4   3   4   1   3   4   2   4   3   3   3   4   2   4   3   1   2   2   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   4   1   1   1   4   2   1   2 

Within cluster sum of squares by cluster:
[1]  4.863671  8.410043 10.203534  9.121563  6.046325  3.397867  3.579667
[8]  5.163861  3.954505
 (between_SS / total_SS =  90.8 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[10]]
K-means clustering with 10 clusters of sizes 9, 12, 12, 18, 18, 16, 3, 24, 17, 21

Cluster means:
   Sepal.Length Sepal.Width Petal.Length Petal.Width
1     1.9201365 -0.30998294    1.4211008  1.03583907
2    -0.5454476  1.99067167   -1.2649421 -1.21265764
3    -0.5957656 -1.54634580   -0.0281350 -0.08658717
4    -0.1596759 -0.52666509    0.2503826  0.13935578
5     0.3435045 -1.04925145    0.7570644  0.79531916
6     0.9288711  0.26996047    0.9938831  1.38659726
7     2.1214087  1.55093437    1.4966310  1.35653228
8     0.6621854 -0.07418177    0.5855479  0.46551535
9    -1.3949345 -0.05056417   -1.3357516 -1.31876936
10   -0.9666815  0.92820079   -1.2925915 -1.21734309

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
 10   9   9   9  10   2  10  10   9   9   2  10   9   9   2   2   2  10   2   2 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
 10  10  10  10  10   9  10  10  10   9   9  10   2   2   9   9  10  10   9  10 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
 10   9   9  10   2   9   2   9   2  10   8   8   8   3   8   4   8   3   8   4 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  3   4   3   8   4   8   4   4   5   3   8   4   5   4   8   8   8   8   4   3 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  3   3   4   5   4   8   8   5   4   3   4   8   4   3   4   4   4   8   3   4 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  6   5   1   8   6   1   3   1   5   7   6   5   6   5   5   6   8   7   1   5 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  6   5   1   5   6   1   5   8   5   1   1   7   5   8   5   1   6   8   8   6 
141 142 143 144 145 146 147 148 149 150 
  6   6   5   6   6   6   5   6   6   8 

Within cluster sum of squares by cluster:
 [1] 3.091184 3.954505 5.048902 4.028262 9.121563 3.726772 0.795318 8.670618
 [9] 5.163861 3.397867
 (between_SS / total_SS =  92.1 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
Code
png(file="tmp1.png")
fviz_nbclust(df, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")
dev.off()
png 
  2 
Code
png(file="tmp2.png")
fviz_nbclust(df, kmeans, nstart = 25,  method = "gap_stat", nboot = 50)+
  labs(subtitle = "Gap statistic method")
dev.off()
png 
  2 
Code
png(file="tmp3.png")
NbClust(data = df, diss = NULL, distance = "euclidean",
        min.nc = 2, max.nc = 15, method = "kmeans")
*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 
*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 10 proposed 2 as the best number of clusters 
* 6 proposed 3 as the best number of clusters 
* 1 proposed 4 as the best number of clusters 
* 1 proposed 5 as the best number of clusters 
* 3 proposed 12 as the best number of clusters 
* 1 proposed 14 as the best number of clusters 
* 2 proposed 15 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 
$All.index
       KL       CH Hartigan    CCC     Scott   Marriot    TrCovW   TraceW
2  3.9498 251.3493  87.3699 3.3595  357.8871 1471010.8 1643.9577 220.8793
3  5.1669 241.9044  33.1486 5.1886  489.5281 1376126.9 1225.4423 138.8884
4  0.5567 207.2659  37.4374 3.6814  555.6392 1574434.9  705.5542 113.3319
5  3.5421 203.2674  19.5911 3.5789  652.9526 1285860.3  667.4659  90.2022
6  0.7874 187.2031  19.0351 3.3533  720.9245 1176948.3  510.6882  79.4655
7  1.1988 178.5481  16.2779 3.4533  763.2771 1207890.6  394.3442  70.1876
8  0.5699 171.5792  20.5630 3.5166  846.4971  905868.3  351.9925  63.0146
9  2.1882 173.2143  13.2186 4.2469  921.2836  696371.3  222.4485  55.0437
10 1.1910 168.6666   2.3371 4.3223  945.2575  732730.5  185.9404  50.3257
11 0.5545 153.4637  14.6665 3.2722  946.7235  877980.9  182.2093  49.4994
12 3.4040 154.4471   3.2103 3.7498 1020.6039  638493.7  153.0732  44.7750
13 1.5176 144.0858   6.7708 3.0052 1032.0196  694431.0  143.4864  43.7571
14 0.0701 139.0737  31.6603 2.7569 1054.2586  694400.3  120.0184  41.6964
15 4.5221 160.2774  10.5759 5.2266 1121.4592  509296.4   75.4837  33.8226
   Friedman   Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
2   50.5461  2.6983 0.2709 0.6828     0.5818 1.9311 -48.6978 -1.1403    0.5535
3   58.5837  4.2912 0.2428 0.9141     0.4599 0.4603  56.2860  2.7732    0.5028
4   61.9721  5.2589 0.3474 0.9814     0.3869 0.9204   4.3246  0.2048    0.4491
5   67.5363  6.6074 0.3598 1.0526     0.3455 2.2695 -31.8842 -1.2891    0.4114
6   77.5691  7.5001 0.3307 1.1560     0.3266 0.5343  27.0234  2.0266    0.3797
7   78.2758  8.4915 0.3177 1.1076     0.3254 0.6799  14.1232  1.0960    0.3548
8   86.3269  9.4581 0.2989 1.1326     0.3227 1.8821 -14.9977 -1.0719    0.3341
9   95.0688 10.8278 0.2730 1.0595     0.3388 0.9118   3.4826  0.2219    0.3174
10  94.7241 11.8429 0.2580 1.0584     0.3377 1.4544  -8.1230 -0.7071    0.3025
11  91.0161 12.0406 0.2545 1.1341     0.3075 2.1682 -13.4695 -1.2007    0.2886
12 105.5354 13.3110 0.2442 0.9875     0.3299 1.5579  -6.8042 -0.8213    0.2775
13 106.5486 13.6207 0.2469 1.0763     0.2958 1.8617 -11.5714 -1.0158    0.2669
14 108.5813 14.2938 0.2412 1.0789     0.2918 1.1559  -1.4840 -0.2931    0.2577
15 110.4875 17.6214 0.3555 0.9910     0.3154 2.1520  -8.5650 -1.1749    0.2507
       Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
2  110.4396     0.7815  1.4732  0.3492 0.2674 0.0028  3.0377 1.0566 0.4276
3   46.2961     0.6797  2.0078  0.7938 0.0265 0.0030  2.8248 0.8573 0.5612
4   28.3330     0.6245  0.6583  1.0201 0.0399 0.0032  2.5023 0.7738 0.4574
5   18.0404     0.5905  0.7997  1.2718 0.0808 0.0034  2.4689 0.6936 0.2555
6   13.2443     0.5556  0.9092  1.5025 0.0842 0.0036  2.9665 0.6558 0.2588
7   10.0268     0.5340  0.9015  1.6553 0.0912 0.0037  2.7896 0.6120 0.1772
8    7.8768     0.4992  0.6392  1.9329 0.0861 0.0037  3.0573 0.5747 0.1510
9    6.1160     0.4624  0.2551  2.2801 0.0861 0.0038  3.0630 0.5336 0.1361
10   5.0326     0.4561  1.8980  2.3325 0.0861 0.0038  2.9536 0.5155 0.1257
11   4.4999     0.4359  0.2507  2.5654 0.0475 0.0038  3.7685 0.5093 0.1325
12   3.7312     0.4307 -2.8078  2.6103 0.0912 0.0038  3.4704 0.4861 0.0956
13   3.3659     0.4028  0.7216  3.0160 0.0475 0.0039  5.5228 0.4790 0.1024
14   2.9783     0.3893  0.0340  3.2251 0.0475 0.0039  5.6579 0.4664 0.0844
15   2.2548     0.3919  0.1034  3.1206 0.0750 0.0040  6.0749 0.4380 0.0638

$All.CriticalValues
   CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
2          0.5551            80.9487       1.0000
3          0.5551            38.4707       0.0284
4          0.5633            38.7617       0.9355
5          0.4195            78.8634       1.0000
6          0.4590            36.5375       0.0961
7          0.4656            34.4267       0.3624
8          0.3890            50.2528       1.0000
9          0.3999            54.0151       0.9254
10         0.3508            48.1166       1.0000
11         0.3008            58.1006       1.0000
12         0.3999            28.5079       1.0000
13         0.2576            72.0597       1.0000
14         0.2316            36.4880       1.0000
15         0.2576            46.1182       1.0000

$Best.nc
                    KL       CH Hartigan     CCC    Scott  Marriot   TrCovW
Number_clusters 3.0000   2.0000   3.0000 15.0000   3.0000     12.0   4.0000
Value_Index     5.1669 251.3493  54.2213  5.2266 131.6411 295424.5 519.8881
                 TraceW Friedman   Rubin  Cindex     DB Silhouette   Duda
Number_clusters  3.0000  12.0000 12.0000 14.0000 2.0000     2.0000 2.0000
Value_Index     56.4345  14.5193 -0.9608  0.2412 0.6828     0.5818 1.9311
                PseudoT2   Beale Ratkowsky    Ball PtBiserial   Frey McClain
Number_clusters   2.0000  2.0000    2.0000  3.0000     2.0000 3.0000  2.0000
Value_Index     -48.6978 -1.1403    0.5535 64.1435     0.7815 2.0078  0.3492
                  Dunn Hubert SDindex Dindex    SDbw
Number_clusters 2.0000      0  5.0000      0 15.0000
Value_Index     0.2674      0  2.4689      0  0.0638

$Best.partition
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  1   1   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
141 142 143 144 145 146 147 148 149 150 
  2   2   2   2   2   2   2   2   2   2 
Code
dev.off()
png 
  2 
Code
NbClust(data = df, diss = NULL, distance = "euclidean",
        min.nc = 2, max.nc = 15, method = "kmeans")

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 10 proposed 2 as the best number of clusters 
* 6 proposed 3 as the best number of clusters 
* 1 proposed 4 as the best number of clusters 
* 1 proposed 5 as the best number of clusters 
* 3 proposed 12 as the best number of clusters 
* 1 proposed 14 as the best number of clusters 
* 2 proposed 15 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 
$All.index
       KL       CH Hartigan    CCC     Scott   Marriot    TrCovW   TraceW
2  3.9498 251.3493  87.3699 3.3595  357.8871 1471010.8 1643.9577 220.8793
3  5.1669 241.9044  33.1486 5.1886  489.5281 1376126.9 1225.4423 138.8884
4  0.5567 207.2659  37.4374 3.6814  555.6392 1574434.9  705.5542 113.3319
5  3.5421 203.2674  19.5911 3.5789  652.9526 1285860.3  667.4659  90.2022
6  0.7874 187.2031  19.0351 3.3533  720.9245 1176948.3  510.6882  79.4655
7  1.1988 178.5481  16.2779 3.4533  763.2771 1207890.6  394.3442  70.1876
8  0.5699 171.5792  20.5630 3.5166  846.4971  905868.3  351.9925  63.0146
9  2.1882 173.2143  13.2186 4.2469  921.2836  696371.3  222.4485  55.0437
10 1.1910 168.6666   2.3371 4.3223  945.2575  732730.5  185.9404  50.3257
11 0.5545 153.4637  14.6665 3.2722  946.7235  877980.9  182.2093  49.4994
12 3.4040 154.4471   3.2103 3.7498 1020.6039  638493.7  153.0732  44.7750
13 1.5176 144.0858   6.7708 3.0052 1032.0196  694431.0  143.4864  43.7571
14 0.0701 139.0737  31.6603 2.7569 1054.2586  694400.3  120.0184  41.6964
15 4.5221 160.2774  10.5759 5.2266 1121.4592  509296.4   75.4837  33.8226
   Friedman   Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
2   50.5461  2.6983 0.2709 0.6828     0.5818 1.9311 -48.6978 -1.1403    0.5535
3   58.5837  4.2912 0.2428 0.9141     0.4599 0.4603  56.2860  2.7732    0.5028
4   61.9721  5.2589 0.3474 0.9814     0.3869 0.9204   4.3246  0.2048    0.4491
5   67.5363  6.6074 0.3598 1.0526     0.3455 2.2695 -31.8842 -1.2891    0.4114
6   77.5691  7.5001 0.3307 1.1560     0.3266 0.5343  27.0234  2.0266    0.3797
7   78.2758  8.4915 0.3177 1.1076     0.3254 0.6799  14.1232  1.0960    0.3548
8   86.3269  9.4581 0.2989 1.1326     0.3227 1.8821 -14.9977 -1.0719    0.3341
9   95.0688 10.8278 0.2730 1.0595     0.3388 0.9118   3.4826  0.2219    0.3174
10  94.7241 11.8429 0.2580 1.0584     0.3377 1.4544  -8.1230 -0.7071    0.3025
11  91.0161 12.0406 0.2545 1.1341     0.3075 2.1682 -13.4695 -1.2007    0.2886
12 105.5354 13.3110 0.2442 0.9875     0.3299 1.5579  -6.8042 -0.8213    0.2775
13 106.5486 13.6207 0.2469 1.0763     0.2958 1.8617 -11.5714 -1.0158    0.2669
14 108.5813 14.2938 0.2412 1.0789     0.2918 1.1559  -1.4840 -0.2931    0.2577
15 110.4875 17.6214 0.3555 0.9910     0.3154 2.1520  -8.5650 -1.1749    0.2507
       Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
2  110.4396     0.7815  1.4732  0.3492 0.2674 0.0028  3.0377 1.0566 0.4276
3   46.2961     0.6797  2.0078  0.7938 0.0265 0.0030  2.8248 0.8573 0.5612
4   28.3330     0.6245  0.6583  1.0201 0.0399 0.0032  2.5023 0.7738 0.4574
5   18.0404     0.5905  0.7997  1.2718 0.0808 0.0034  2.4689 0.6936 0.2555
6   13.2443     0.5556  0.9092  1.5025 0.0842 0.0036  2.9665 0.6558 0.2588
7   10.0268     0.5340  0.9015  1.6553 0.0912 0.0037  2.7896 0.6120 0.1772
8    7.8768     0.4992  0.6392  1.9329 0.0861 0.0037  3.0573 0.5747 0.1510
9    6.1160     0.4624  0.2551  2.2801 0.0861 0.0038  3.0630 0.5336 0.1361
10   5.0326     0.4561  1.8980  2.3325 0.0861 0.0038  2.9536 0.5155 0.1257
11   4.4999     0.4359  0.2507  2.5654 0.0475 0.0038  3.7685 0.5093 0.1325
12   3.7312     0.4307 -2.8078  2.6103 0.0912 0.0038  3.4704 0.4861 0.0956
13   3.3659     0.4028  0.7216  3.0160 0.0475 0.0039  5.5228 0.4790 0.1024
14   2.9783     0.3893  0.0340  3.2251 0.0475 0.0039  5.6579 0.4664 0.0844
15   2.2548     0.3919  0.1034  3.1206 0.0750 0.0040  6.0749 0.4380 0.0638

$All.CriticalValues
   CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
2          0.5551            80.9487       1.0000
3          0.5551            38.4707       0.0284
4          0.5633            38.7617       0.9355
5          0.4195            78.8634       1.0000
6          0.4590            36.5375       0.0961
7          0.4656            34.4267       0.3624
8          0.3890            50.2528       1.0000
9          0.3999            54.0151       0.9254
10         0.3508            48.1166       1.0000
11         0.3008            58.1006       1.0000
12         0.3999            28.5079       1.0000
13         0.2576            72.0597       1.0000
14         0.2316            36.4880       1.0000
15         0.2576            46.1182       1.0000

$Best.nc
                    KL       CH Hartigan     CCC    Scott  Marriot   TrCovW
Number_clusters 3.0000   2.0000   3.0000 15.0000   3.0000     12.0   4.0000
Value_Index     5.1669 251.3493  54.2213  5.2266 131.6411 295424.5 519.8881
                 TraceW Friedman   Rubin  Cindex     DB Silhouette   Duda
Number_clusters  3.0000  12.0000 12.0000 14.0000 2.0000     2.0000 2.0000
Value_Index     56.4345  14.5193 -0.9608  0.2412 0.6828     0.5818 1.9311
                PseudoT2   Beale Ratkowsky    Ball PtBiserial   Frey McClain
Number_clusters   2.0000  2.0000    2.0000  3.0000     2.0000 3.0000  2.0000
Value_Index     -48.6978 -1.1403    0.5535 64.1435     0.7815 2.0078  0.3492
                  Dunn Hubert SDindex Dindex    SDbw
Number_clusters 2.0000      0  5.0000      0 15.0000
Value_Index     0.2674      0  2.4689      0  0.0638

$Best.partition
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  1   1   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
141 142 143 144 145 146 147 148 149 150 
  2   2   2   2   2   2   2   2   2   2 
Code
close_open_devices()
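
Since the majority rule above points to 2 clusters, a minimal follow-up sketch (assuming df still holds the standardised iris measurements passed to NbClust) is to refit kmeans at that k with several random starts and see how the partition lines up with the species labels:

Code
set.seed(1234)
km2=kmeans(df, centers = 2, nstart = 25)   # k suggested by the majority of indices
km2$size                                   # cluster sizes
km2$betweenss/km2$totss                    # proportion of variance between clusters
table(km2$cluster, iris$Species)           # alignment with the known species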

using a neural network on the iris data

Code
# iris data with snake_case column names
d=iris|>janitor::clean_names()
# 70/30 train-test split
rs=.7
tmp=sample(1:nrow(d),nrow(d)*rs)
d1=d[tmp,]   # training split
d2=d[-tmp,]  # test split

# one hidden layer with 5 neurons; linear.output = FALSE for classification
nn <- neuralnet(species~., 
                d1, 
                hidden = c(5),
                linear.output = FALSE)

plot(nn,rep = "best")

Code
# class scores on the test split; the predicted class is the column with
# the highest score for each row
pred=predict(nn,d2)
#pred
labels <- c("setosa", "versicolor", "virginica")
prediction_label <- data.frame(max.col(pred)) %>%     
  mutate(pred=labels[max.col.pred.]) %>%   # map winning column index to a label
  select(2) %>%
  unlist()
#prediction_label 
# confusion matrix and overall accuracy on the test split
t=table(d2$species, prediction_label)
acc=sum(diag(t))/sum(t)
acc
[1] 0.9111111
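
The 0.91 accuracy above comes from a single random 70/30 split. As a rough stability check (an illustrative sketch, not part of the original analysis), the split and fit can be repeated a few times and the accuracies compared:

Code
accs=replicate(5, {
  idx=sample(1:nrow(d), nrow(d)*rs)
  # refit on a fresh split; neuralnet may occasionally fail to converge, re-run if so
  fit=neuralnet(species~., d[idx,], hidden = c(5), linear.output = FALSE)
  p=predict(fit, d[-idx,])
  mean(levels(d$species)[max.col(p)] == d[-idx,]$species)   # accuracy on the held-out rows
})
round(accs, 3)
mean(accs)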
Code
# simulate 15 hypothetical flowers drawn uniformly over each feature's observed range
n=15
sl=runif(n,
         min(d$sepal_length),
         max(d$sepal_length)
         )
sw=runif(n,
         min(d$sepal_width),
         max(d$sepal_width)
)
pl=runif(n,
         min(d$petal_length),
         max(d$petal_length)
)
pw=runif(n,
         min(d$petal_width),
         max(d$petal_width)
)
newdata=data.frame(sepal_length=sl,
                   sepal_width=sw,
                   petal_length=pl,
                   petal_width=pw
                   )
# Make predictions on new data

pred=predict(nn, newdata)
#pred
labels <- c("setosa", "versicolor", "virginica")
prediction_label <- data.frame(max.col(pred)) %>%     
  mutate(pred=labels[max.col.pred.]) %>%
  select(2) #%>%
  #unlist()
#prediction_label 
newdata$species=prediction_label 
newdata 
sepal_length sepal_width petal_length petal_width species
7.694645 3.685910 5.547349 1.3166548 versicolor
7.043106 2.396066 1.163944 1.7354620 setosa
7.658475 2.154698 4.111134 0.3380058 setosa
5.994443 3.811294 6.193882 0.3853661 versicolor
6.472917 3.488984 3.201074 0.2210552 setosa
6.045963 2.406984 1.282959 2.3302094 versicolor
4.691703 2.149314 1.817907 1.7169094 setosa
5.191817 2.261670 2.896803 0.3276589 setosa
6.094652 2.916119 1.913506 1.2822307 setosa
5.642320 2.406346 1.780146 1.2077244 setosa
7.664889 2.716766 2.305705 1.0005197 setosa
6.186350 2.461303 2.335647 2.4786381 versicolor
5.441721 2.617208 1.775357 0.5232417 setosa
5.300678 2.434956 6.791224 2.0522445 virginica
7.135146 3.145553 2.929381 0.2642719 setosa

support vector m/c

Code
### support vector machine (svm) model

mysvm=svm(species~.,d1)   # radial kernel with default cost = 1
mysvm

Call:
svm(formula = species ~ ., data = d1)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  1 

Number of Support Vectors:  45
Code
#str(mysvm)
summary(mysvm)

Call:
svm(formula = species ~ ., data = d1)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  1 

Number of Support Vectors:  45

 ( 9 18 18 )


Number of Classes:  3 

Levels: 
 setosa versicolor virginica
Code
pred=predict(mysvm,d2)
#pred
t=table(pred,d2$species)
t
            
pred         setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         17         1
  virginica       0          0        12
Code
acc=sum(diag(t))/sum(t)
acc
[1] 0.9777778
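
The model above keeps e1071's defaults (radial kernel, cost = 1). A hedged sketch of searching over cost and gamma with e1071::tune, which cross-validates on the training split; the grid values below are illustrative only:

Code
set.seed(1234)
tuned=tune(svm, species~., data = d1,
           ranges = list(cost = c(0.1, 1, 10), gamma = c(0.05, 0.1, 0.5)))
tuned$best.parameters    # (cost, gamma) pair with the lowest CV error
tuned$best.performance   # cross-validated error of that pair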
Code
pred=predict(mysvm,newdata)
#pred
newdata$species=pred
newdata
sepal_length sepal_width petal_length petal_width species
7.694645 3.685910 5.547349 1.3166548 virginica
7.043106 2.396066 1.163944 1.7354620 versicolor
7.658475 2.154698 4.111134 0.3380058 versicolor
5.994443 3.811294 6.193882 0.3853661 versicolor
6.472917 3.488984 3.201074 0.2210552 setosa
6.045963 2.406984 1.282959 2.3302094 versicolor
4.691703 2.149314 1.817907 1.7169094 versicolor
5.191817 2.261670 2.896803 0.3276589 versicolor
6.094652 2.916119 1.913506 1.2822307 versicolor
5.642320 2.406346 1.780146 1.2077244 versicolor
7.664889 2.716766 2.305705 1.0005197 versicolor
6.186350 2.461303 2.335647 2.4786381 virginica
5.441721 2.617208 1.775357 0.5232417 setosa
5.300678 2.434956 6.791224 2.0522445 virginica
7.135146 3.145553 2.929381 0.2642719 versicolor

knn (k nearest neighbour) model

Code
## knn (k nearest neighbour) model
# standardise the four numeric features; note that the test split is scaled
# with its own centre and spread here, whereas ideally it would reuse the
# training split's centre and scale
dorg1=scale(d1[,1:4])
dorg2=scale(d2[,1:4])

knn_model <- knn(train = dorg1, test = dorg2, cl = d1$species, k = 5)
#knn_model
t=table(knn_model,d2$species)
t
            
knn_model    setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         15         0
  virginica       0          2        13
Code
acc=sum(diag(t))/sum(t)
acc
[1] 0.9555556
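
k = 5 above was not tuned. A small illustrative sketch compares a few odd values of k on the same standardised train/test split:

Code
ks=c(1, 3, 5, 7, 9, 11)
accs=sapply(ks, function(k){
  p=knn(train = dorg1, test = dorg2, cl = d1$species, k = k)
  mean(p == d2$species)   # accuracy for this k
})
data.frame(k = ks, accuracy = round(accs, 3))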
Code
# note: newdata is on the raw measurement scale while dorg1 is standardised,
# so distances are dominated by the raw magnitudes (see the sketch after this block)
knn_model <- knn(train = dorg1, test = newdata[,1:4], cl = d1$species, k = 5)
#knn_model

newdata$species=knn_model
newdata
sepal_length sepal_width petal_length petal_width species
7.694645 3.685910 5.547349 1.3166548 virginica
7.043106 2.396066 1.163944 1.7354620 virginica
7.658475 2.154698 4.111134 0.3380058 virginica
5.994443 3.811294 6.193882 0.3853661 virginica
6.472917 3.488984 3.201074 0.2210552 virginica
6.045963 2.406984 1.282959 2.3302094 virginica
4.691703 2.149314 1.817907 1.7169094 virginica
5.191817 2.261670 2.896803 0.3276589 virginica
6.094652 2.916119 1.913506 1.2822307 virginica
5.642320 2.406346 1.780146 1.2077244 virginica
7.664889 2.716766 2.305705 1.0005197 virginica
6.186350 2.461303 2.335647 2.4786381 virginica
5.441721 2.617208 1.775357 0.5232417 virginica
5.300678 2.434956 6.791224 2.0522445 virginica
7.135146 3.145553 2.929381 0.2642719 virginica
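
Every simulated row above is labelled virginica, most likely because newdata was passed to knn() on the raw measurement scale while the training features were standardised, so the distances are dominated by the raw magnitudes. A minimal corrective sketch (not part of the original code) rescales newdata with the training split's centre and scale before predicting:

Code
ctr=attr(dorg1, "scaled:center")   # centring values stored by scale() on the training split
scl=attr(dorg1, "scaled:scale")    # scaling values stored by scale() on the training split
new_scaled=scale(newdata[,1:4], center = ctr, scale = scl)
knn(train = dorg1, test = new_scaled, cl = d1$species, k = 5)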