simulation & linear regression for employee dataset

Author

kirit ved

Published

November 23, 2023

Lecture-5

author’s details

author’s website

https://kiritved.com

setting R environment

Code
rm(list=ls(all.names = T))
set.seed(1234)
setwd("d:/met/met_lect6")
if(! require("pacman")) install.packages("pacman")
Loading required package: pacman
Code
library("pacman")
p_load(tidyverse,
       janitor,
       randomForest,
       rpart,
       rpart.plot,
       readxl,
       writexl,
       Boruta,
       factoextra,
       cluster,
       NbClust,
       neuralnet,
       e1071,
       class,
       caTools,
       gamlss)
p_loaded()
 [1] "gamlss"       "nlme"         "gamlss.dist"  "gamlss.data"  "caTools"     
 [6] "class"        "e1071"        "neuralnet"    "NbClust"      "cluster"     
[11] "factoextra"   "Boruta"       "writexl"      "readxl"       "rpart.plot"  
[16] "rpart"        "randomForest" "janitor"      "lubridate"    "forcats"     
[21] "stringr"      "dplyr"        "purrr"        "readr"        "tidyr"       
[26] "tibble"       "ggplot2"      "tidyverse"    "pacman"      
Code
myage=function(dt){
  # age in years from a birth date: elapsed days / 365.25, rounded up, plus one
  t=Sys.Date()-dt
  t=as.numeric(t)
  t=t/365.25
  t=ceiling(t)+1
  return(t)
}
close_open_devices=function(){
  # close any graphics devices that are still open
  for (i in dev.list()) {
  dev.off(i)
  }
}
close_open_devices()

simulation

two machine simulation

two machines run in series on the assembly line to produce the final product

the time taken by machine 1 is normally distributed with mean 80 & sd of 15 mins, while the second machine has mean 60 & sd of 10 mins. The manager wants to know:

What are the chances that the product will be available in

  1. less than 125 mins

  2. above 150 mins.

  3. between 120 and 140 mins.

Code
n=100000
t1=rnorm(n,80,15);t1=round(t1,2)
t2=rnorm(n,60,10);t2=round(t2,2)
ttlt=t1+t2
fl1=fl2=fl3=c()
for(i in 1:n){
if(ttlt[i]<=125){
  fl1[i]=1
} else{
  fl1[i]=0
}
if(ttlt[i]>=150){
  fl2[i]=1
} else{
  fl2[i]=0
}
 if(ttlt[i]<=140 & ttlt[i]>120){
  fl3[i]=1
} else{
  fl3[i]=0
} 
}
s1=sum(fl1)
s2=sum(fl2)
s3=sum(fl3)
p1=s1/n
p2=s2/n
p3=s3/n
s1;s2;s3;p1;p2;p3
d=data.frame(t1,t2,ttlt,fl1,fl2,fl3)
head(d)
[1] 20199
[1] 29065
[1] 36394
[1] 0.20199
[1] 0.29065
[1] 0.36394
t1 t2 ttlt fl1 fl2 fl3
61.89 45.84 107.73 1 0 0
84.16 63.17 147.33 0 0 0
96.27 67.28 163.55 0 1 0
44.81 41.00 85.81 1 0 0
86.44 48.55 134.99 0 0 1
87.59 65.77 153.36 0 1 0
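the total time is the sum of two independent normals, so it is itself normally distributed with mean 80 + 60 = 140 and sd sqrt(15^2 + 10^2) ≈ 18.03 mins. The simulated proportions can therefore be cross-checked analytically; a minimal sketch (exact probabilities, no simulation):

Code
mu=80+60                              # mean of the total time
sdev=sqrt(15^2+10^2)                  # sd of the sum of independent normals
pnorm(125,mu,sdev)                    # P(total < 125), close to the simulated 0.202
1-pnorm(150,mu,sdev)                  # P(total > 150), close to the simulated 0.291
pnorm(140,mu,sdev)-pnorm(120,mu,sdev) # P(120 < total < 140), close to the simulated 0.364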

linear regression with employee dataset

load employee dataset & correct date format

Code
d=read.csv("d:/met/met_lect6/emp.csv") |>
  select(-c(6),c(6))   # move column 6 (salary) to the last position
d$bdate=mdy(d$bdate)
Warning: 1 failed to parse.
Code
d$bdate=d$bdate+years(30)
head(d)
id gender bdate educ jobcat salbegin jobtime prevexp minority salary
1 m 1982-02-03 15 3 27000 98 144 0 57000
2 m 1988-05-23 16 1 18750 98 36 0 40200
3 f 1959-07-26 12 1 12000 98 381 0 21450
4 f 1977-04-15 8 1 13200 98 190 0 21900
5 m 1985-02-09 15 1 21000 98 138 0 45000
6 m 1988-08-22 15 1 13500 98 67 0 32100
Code
tail(d)
id gender bdate educ jobcat salbegin jobtime prevexp minority salary
469 469 f 1994-06-01 15 1 13950 64 57 0 25200
470 470 m 1994-01-22 12 1 15750 64 69 1 26250
471 471 m 1996-08-03 15 1 15750 64 32 1 26400
472 472 m 1996-02-21 15 1 15750 63 46 0 39150
473 473 f 1967-11-25 12 1 12750 63 139 0 21450
474 474 f 1998-11-05 12 1 14250 63 9 0 29400
Code
summary(d)
       id           gender              bdate                 educ      
 Min.   :  1.0   Length:474         Min.   :1959-02-10   Min.   : 8.00  
 1st Qu.:119.2   Class :character   1st Qu.:1978-01-03   1st Qu.:12.00  
 Median :237.5   Mode  :character   Median :1992-01-23   Median :12.00  
 Mean   :237.5                      Mean   :1986-10-08   Mean   :13.49  
 3rd Qu.:355.8                      3rd Qu.:1995-07-06   3rd Qu.:15.00  
 Max.   :474.0                      Max.   :2001-02-10   Max.   :21.00  
                                    NA's   :1                           
     jobcat         salbegin        jobtime         prevexp      
 Min.   :1.000   Min.   : 9000   Min.   :63.00   Min.   :  0.00  
 1st Qu.:1.000   1st Qu.:12488   1st Qu.:72.00   1st Qu.: 19.25  
 Median :1.000   Median :15000   Median :81.00   Median : 55.00  
 Mean   :1.411   Mean   :17016   Mean   :81.11   Mean   : 95.86  
 3rd Qu.:1.000   3rd Qu.:17490   3rd Qu.:90.00   3rd Qu.:138.75  
 Max.   :3.000   Max.   :79980   Max.   :98.00   Max.   :476.00  
                                                                 
    minority          salary      
 Min.   :0.0000   Min.   : 15750  
 1st Qu.:0.0000   1st Qu.: 24000  
 Median :0.0000   Median : 28875  
 Mean   :0.2194   Mean   : 34420  
 3rd Qu.:0.0000   3rd Qu.: 36938  
 Max.   :1.0000   Max.   :135000  
                                  
Code
str(d)
'data.frame':   474 obs. of  10 variables:
 $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ gender  : chr  "m" "m" "f" "f" ...
 $ bdate   : Date, format: "1982-02-03" "1988-05-23" ...
 $ educ    : int  15 16 12 8 15 15 15 12 15 12 ...
 $ jobcat  : int  3 1 1 1 1 1 1 1 1 1 ...
 $ salbegin: int  27000 18750 12000 13200 21000 13500 18750 9750 12750 13500 ...
 $ jobtime : int  98 98 98 98 98 98 98 98 98 98 ...
 $ prevexp : int  144 36 381 190 138 67 114 0 115 244 ...
 $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
 $ salary  : int  57000 40200 21450 21900 45000 32100 36000 21900 27900 24000 ...

remove rows with NA values

Code
tmp=!complete.cases(d)
sum(tmp)
[1] 1
Code
d=d[tmp==F,]
head(d)
id gender bdate educ jobcat salbegin jobtime prevexp minority salary
1 m 1982-02-03 15 3 27000 98 144 0 57000
2 m 1988-05-23 16 1 18750 98 36 0 40200
3 f 1959-07-26 12 1 12000 98 381 0 21450
4 f 1977-04-15 8 1 13200 98 190 0 21900
5 m 1985-02-09 15 1 21000 98 138 0 45000
6 m 1988-08-22 15 1 13500 98 67 0 32100
Code
tail(d)
id gender bdate educ jobcat salbegin jobtime prevexp minority salary
469 469 f 1994-06-01 15 1 13950 64 57 0 25200
470 470 m 1994-01-22 12 1 15750 64 69 1 26250
471 471 m 1996-08-03 15 1 15750 64 32 1 26400
472 472 m 1996-02-21 15 1 15750 63 46 0 39150
473 473 f 1967-11-25 12 1 12750 63 139 0 21450
474 474 f 1998-11-05 12 1 14250 63 9 0 29400
Code
str(d)
'data.frame':   473 obs. of  10 variables:
 $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ gender  : chr  "m" "m" "f" "f" ...
 $ bdate   : Date, format: "1982-02-03" "1988-05-23" ...
 $ educ    : int  15 16 12 8 15 15 15 12 15 12 ...
 $ jobcat  : int  3 1 1 1 1 1 1 1 1 1 ...
 $ salbegin: int  27000 18750 12000 13200 21000 13500 18750 9750 12750 13500 ...
 $ jobtime : int  98 98 98 98 98 98 98 98 98 98 ...
 $ prevexp : int  144 36 381 190 138 67 114 0 115 244 ...
 $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
 $ salary  : int  57000 40200 21450 21900 45000 32100 36000 21900 27900 24000 ...
Code
summary(d)
       id           gender              bdate                 educ      
 Min.   :  1.0   Length:473         Min.   :1959-02-10   Min.   : 8.00  
 1st Qu.:119.0   Class :character   1st Qu.:1978-01-03   1st Qu.:12.00  
 Median :237.0   Mode  :character   Median :1992-01-23   Median :12.00  
 Mean   :237.1                      Mean   :1986-10-08   Mean   :13.49  
 3rd Qu.:355.0                      3rd Qu.:1995-07-06   3rd Qu.:15.00  
 Max.   :474.0                      Max.   :2001-02-10   Max.   :21.00  
     jobcat         salbegin        jobtime         prevexp      
 Min.   :1.000   Min.   : 9000   Min.   :63.00   Min.   :  0.00  
 1st Qu.:1.000   1st Qu.:12450   1st Qu.:72.00   1st Qu.: 19.00  
 Median :1.000   Median :15000   Median :81.00   Median : 55.00  
 Mean   :1.412   Mean   :17009   Mean   :81.14   Mean   : 95.95  
 3rd Qu.:1.000   3rd Qu.:17490   3rd Qu.:90.00   3rd Qu.:139.00  
 Max.   :3.000   Max.   :79980   Max.   :98.00   Max.   :476.00  
    minority          salary      
 Min.   :0.0000   Min.   : 15750  
 1st Qu.:0.0000   1st Qu.: 24000  
 Median :0.0000   Median : 28800  
 Mean   :0.2199   Mean   : 34418  
 3rd Qu.:0.0000   3rd Qu.: 37050  
 Max.   :1.0000   Max.   :135000  
Code
writexl::write_xlsx(d,"cleanedemp.xlsx")
do=d

selecting features

for employee data

Code
br=Boruta::Boruta(salary~.,d)
br
Boruta performed 18 iterations in 3.58509 secs.
 9 attributes confirmed important: bdate, educ, gender, id, jobcat and
4 more;
 No attributes deemed unimportant.
Code
tmp=br$finalDecision[!br$finalDecision=="Rejected"]
tmp
       id    gender     bdate      educ    jobcat  salbegin   jobtime   prevexp 
Confirmed Confirmed Confirmed Confirmed Confirmed Confirmed Confirmed Confirmed 
 minority 
Confirmed 
Levels: Tentative Confirmed Rejected

the above analysis shows that 9 attributes were selected
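to list every selected attribute by name (the printout above truncates after five), Boruta provides getSelectedAttributes(); a minimal sketch:

Code
Boruta::getSelectedAttributes(br, withTentative = FALSE)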

for iris data

Code
diris=iris |>janitor::clean_names()
br=Boruta(species~.,diris)
br
Boruta performed 9 iterations in 0.2725961 secs.
 4 attributes confirmed important: petal_length, petal_width,
sepal_length, sepal_width;
 No attributes deemed unimportant.
Code
tmp=br$finalDecision[!br$finalDecision=="Rejected"]
tmp
sepal_length  sepal_width petal_length  petal_width 
   Confirmed    Confirmed    Confirmed    Confirmed 
Levels: Tentative Confirmed Rejected

the above analysis shows that 4 attributes were selected

data visualization

Code
mybarplot=function(df,cn,ttl="bar plot",xlbl,ylbl){
  colnm=colnames(df)[cn]
  v=df|>select(c(cn))
  t=table(v)
  ttl=paste(ttl,"for",colnm,sep=" ")
  b=barplot(t,col="lightgreen",main=ttl,xlab=colnm)
  text(b,min(t)*1.1,labels=as.character(t),col="red")
  #return (T)
}
mypie=function(df,cn,ttl="pie chart"){
  colnm=colnames(df)[cn]
  v=df|>select(c(cn))
  
  t=table(v)

  revnm=paste(names(t),t,sep="--")
  ttl=paste(ttl,"for",colnm,sep=" ")
    pie(t,labels=revnm,main=ttl)
}
dtmp=d|>group_by(gender)|>summarise(n=n())|>mutate(nper=100*n/sum(n))
dtmp
gender n nper
f 216 45.66596
m 257 54.33404
Code
mybarplot(d,2)
Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(cn)

  # Now:
  data %>% select(all_of(cn))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
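
the warning comes from passing the numeric column index stored in cn straight to select(). Following the hint in the warning text, the helpers could be updated to wrap the index in all_of(); a minimal sketch of the changed line inside mybarplot() and mypie() (the plots themselves are unaffected):

Code
v=df|>select(all_of(cn))   # replaces v=df|>select(c(cn))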

Code
mybarplot(d,4)

Code
mybarplot(d,5)

Code
mybarplot(d,9)

Code
mypie(d,2)

Code
mypie(d,4)

Code
mypie(d,5)

Code
mypie(d,9)

Code
plot(d$salbegin,d$salary,col="lightgreen")

Code
boxplot(salary~gender,d,col="lightgreen",main="boxplot of salary vs gender")

Code
boxplot(salary~jobcat,d,col="lightgreen",main="boxplot of salary vs jobcategory")

Code
boxplot(salary~educ,d,col="lightgreen",main="boxplot of salary vs educ")

Code
boxplot(salary~minority,d,col="lightgreen",main="boxplot of salary vs minority")

Code
hist(d$salary,
     main="histogram of salary",
     xlab="salary",
     ylab="frequency",col="lightgreen")

Code
hist(d$salbegin,
     main="histogram of beginning salary",
     xlab="beginning salary",
     ylab="frequency",col="lightgreen")

Code
hist(d$educ,
     main="histogram of education",
     xlab="education",
     ylab="frequency",col="lightgreen")

structure of the cleaned employee data (do)

Code
str(do)
'data.frame':   473 obs. of  10 variables:
 $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ gender  : chr  "m" "m" "f" "f" ...
 $ bdate   : Date, format: "1982-02-03" "1988-05-23" ...
 $ educ    : int  15 16 12 8 15 15 15 12 15 12 ...
 $ jobcat  : int  3 1 1 1 1 1 1 1 1 1 ...
 $ salbegin: int  27000 18750 12000 13200 21000 13500 18750 9750 12750 13500 ...
 $ jobtime : int  98 98 98 98 98 98 98 98 98 98 ...
 $ prevexp : int  144 36 381 190 138 67 114 0 115 244 ...
 $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
 $ salary  : int  57000 40200 21450 21900 45000 32100 36000 21900 27900 24000 ...

calculating employee age & replacing gender with a numerical value using the pipe operator

Code
do1=d|>mutate(age=myage(bdate),
gn=ifelse(gender=="f",0,1))|> select(-c(2,3)) |>select(-c(8),c(8))
head(do1)
id educ jobcat salbegin jobtime prevexp minority age gn salary
1 15 3 27000 98 144 0 43 1 57000
2 16 1 18750 98 36 0 37 1 40200
3 12 1 12000 98 381 0 66 0 21450
4 8 1 13200 98 190 0 48 0 21900
5 15 1 21000 98 138 0 40 1 45000
6 15 1 13500 98 67 0 37 1 32100
Code
str(do1)
'data.frame':   473 obs. of  10 variables:
 $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ educ    : int  15 16 12 8 15 15 15 12 15 12 ...
 $ jobcat  : int  3 1 1 1 1 1 1 1 1 1 ...
 $ salbegin: int  27000 18750 12000 13200 21000 13500 18750 9750 12750 13500 ...
 $ jobtime : int  98 98 98 98 98 98 98 98 98 98 ...
 $ prevexp : int  144 36 381 190 138 67 114 0 115 244 ...
 $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
 $ age     : num  43 37 66 48 40 37 39 29 49 49 ...
 $ gn      : num  1 1 0 0 1 1 1 0 0 0 ...
 $ salary  : int  57000 40200 21450 21900 45000 32100 36000 21900 27900 24000 ...

performing linear regression

split the data

Code
split_ratio=2/3
tmp=sample(1:nrow(do1),nrow(do1)*split_ratio)
 # tmp
  do1train=do1[tmp,]
  do1test=do1[-tmp,]
  nrow(do1train);nrow(do1test)
[1] 315
[1] 158
Code
  mylm=lm(salary~.,do1train)
  mylm

Call:
lm(formula = salary ~ ., data = do1train)

Coefficients:
(Intercept)           id         educ       jobcat     salbegin      jobtime  
  35427.183      -33.270      453.599     4299.745        1.394     -312.294  
    prevexp     minority          age           gn  
    -14.229    -1091.049      -75.782     1524.412  
Code
  summary(mylm)

Call:
lm(formula = salary ~ ., data = do1train)

Residuals:
   Min     1Q Median     3Q    Max 
-19682  -3329   -528   2487  45676 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 35427.1833 62297.5212   0.569   0.5700    
id            -33.2696    46.2515  -0.719   0.4725    
educ          453.5995   179.2839   2.530   0.0119 *  
jobcat       4299.7455   698.7595   6.153 2.38e-09 ***
salbegin        1.3944     0.0743  18.769  < 2e-16 ***
jobtime      -312.2943   629.8997  -0.496   0.6204    
prevexp       -14.2288     6.0757  -2.342   0.0198 *  
minority    -1091.0494   962.6457  -1.133   0.2579    
age           -75.7820    53.3130  -1.421   0.1562    
gn           1524.4123   917.1525   1.662   0.0975 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6375 on 305 degrees of freedom
Multiple R-squared:  0.8641,    Adjusted R-squared:  0.8601 
F-statistic: 215.5 on 9 and 305 DF,  p-value: < 2.2e-16
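several predictors (id, jobtime, minority, age) are not significant at the 5% level in this fit. An optional refinement, sketched here but not run, is to refit a reduced model with update() and compare the two fits with an F-test:

Code
mylm2=update(mylm, . ~ . - id - jobtime - minority - age)
summary(mylm2)
anova(mylm2,mylm)   # F-test comparing the reduced and full models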
Code
  pred=predict(mylm,do1test)
  pred=round(pred,0)
  #paste(pred,do1test$salary,sep=" - ")
  plot(do1test$salary,pred)

Code
  tmpcor=cor(pred,do1test$salary)
  tmpcor
[1] 0.8972594
Code
  RMSE = sqrt(mean((pred - do1test$salary)^2))
  RMSE
[1] 7723.815
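besides the correlation and RMSE, the hold-out R-squared and mean absolute error can be computed from the same predictions; a minimal sketch:

Code
ss_res=sum((do1test$salary-pred)^2)
ss_tot=sum((do1test$salary-mean(do1test$salary))^2)
r2_test=1-ss_res/ss_tot               # R-squared on the test set
mae=mean(abs(pred-do1test$salary))    # mean absolute error
r2_test;mae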

create dummy data of employees for prediction

Code
str(do1)
'data.frame':   473 obs. of  10 variables:
 $ id      : int  1 2 3 4 5 6 7 8 9 10 ...
 $ educ    : int  15 16 12 8 15 15 15 12 15 12 ...
 $ jobcat  : int  3 1 1 1 1 1 1 1 1 1 ...
 $ salbegin: int  27000 18750 12000 13200 21000 13500 18750 9750 12750 13500 ...
 $ jobtime : int  98 98 98 98 98 98 98 98 98 98 ...
 $ prevexp : int  144 36 381 190 138 67 114 0 115 244 ...
 $ minority: int  0 0 0 0 0 0 0 0 0 0 ...
 $ age     : num  43 37 66 48 40 37 39 29 49 49 ...
 $ gn      : num  1 1 0 0 1 1 1 0 0 0 ...
 $ salary  : int  57000 40200 21450 21900 45000 32100 36000 21900 27900 24000 ...
Code
n=20
id=round(runif(n,474,550))
gender=(sample(c("m","f"),n,replace=T))
bdate=runif(n,min(do$bdate),max(d$bdate))
bdate=as.Date(bdate)
educ=round(runif(n,min(do1$educ),max(do1$educ)))
jobcat=round(runif(n,min(do1$jobcat),max(do1$jobcat)))
salbegin=round(runif(n,min(do1$salbegin),max(do1$salbegin)))
jobtime=round(runif(n,min(do1$jobtime),max(do1$jobtime)))
prevexp=round(runif(n,min(do1$prevexp),max(do1$prevexp)))
minority=round(runif(n,min(do1$minority),max(do1$minority)))
age=round(runif(n,min(do1$age),max(do1$age)))
gendern=(sample(c("m","f"),n,replace=T))
dpred=data.frame(id,gender,bdate,educ,jobcat,salbegin,jobtime,prevexp,minority)
dpred1=dpred|>mutate(age=myage(bdate),
gn=ifelse(gender=="f",0,1))|> select(-c(2,3)) |>select(-c(8),c(8))
dpred1
id educ jobcat salbegin jobtime prevexp minority gn age
525 14 3 39198 73 399 11 1 33
534 19 1 28098 86 108 21 0 30
483 14 2 57865 82 3 9 1 66
483 8 3 9757 67 386 16 1 39
487 13 1 67078 76 310 20 1 58
500 8 2 45325 89 261 19 0 31
478 9 1 45498 97 263 12 1 51
545 17 2 58439 90 317 9 1 50
505 11 1 73943 68 248 9 1 56
528 14 2 56822 83 147 12 1 32
527 15 2 41003 81 298 13 0 31
475 18 2 18584 92 429 9 1 40
500 19 2 70025 97 392 17 0 43
547 20 3 76414 78 354 19 1 43
480 13 2 42040 73 370 12 0 28
488 10 3 36818 83 17 13 0 47
494 13 1 32092 97 251 12 0 50
484 9 1 9790 85 304 17 0 27
546 19 1 49217 93 171 20 1 66
542 13 2 62887 84 265 16 1 44

predicting salary for new employee

Code
finpred=predict(mylm,dpred1)
finpred
         1          2          3          4          5          6          7 
 50417.149  16181.048  76050.184   4187.694  70121.044  39636.967  41883.299 
         8          9         10         11         12         13         14 
 70394.978  93721.538  70040.830  44405.257  16141.271  75979.843  93895.254 
        15         16         17         18         19         20 
 49300.048  44059.956  23193.510 -10106.061  42036.022  70313.911 
Code
dpred1$salary=round(finpred)
dpred1
id educ jobcat salbegin jobtime prevexp minority gn age salary
525 14 3 39198 73 399 11 1 33 50417
534 19 1 28098 86 108 21 0 30 16181
483 14 2 57865 82 3 9 1 66 76050
483 8 3 9757 67 386 16 1 39 4188
487 13 1 67078 76 310 20 1 58 70121
500 8 2 45325 89 261 19 0 31 39637
478 9 1 45498 97 263 12 1 51 41883
545 17 2 58439 90 317 9 1 50 70395
505 11 1 73943 68 248 9 1 56 93722
528 14 2 56822 83 147 12 1 32 70041
527 15 2 41003 81 298 13 0 31 44405
475 18 2 18584 92 429 9 1 40 16141
500 19 2 70025 97 392 17 0 43 75980
547 20 3 76414 78 354 19 1 43 93895
480 13 2 42040 73 370 12 0 28 49300
488 10 3 36818 83 17 13 0 47 44060
494 13 1 32092 97 251 12 0 50 23194
484 9 1 9790 85 304 17 0 27 -10106
546 19 1 49217 93 171 20 1 66 42036
542 13 2 62887 84 265 16 1 44 70314
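note that one simulated employee gets a negative predicted salary: an unconstrained linear model can extrapolate outside the plausible range for extreme inputs. One common remedy, sketched here as an illustration (not part of the fitted model above), is to model log(salary) instead so that back-transformed predictions are always positive:

Code
do1train_log=do1train|>mutate(lsalary=log(salary))|>select(-salary)
mylm_log=lm(lsalary~.,do1train_log)
round(exp(predict(mylm_log,dpred1)))   # predictions back on the salary scale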

analysis & prediction of species using iris dataset

loading iris dataset

Code
d=iris|>janitor::clean_names()
head(d)
sepal_length sepal_width petal_length petal_width species
5.1 3.5 1.4 0.2 setosa
4.9 3.0 1.4 0.2 setosa
4.7 3.2 1.3 0.2 setosa
4.6 3.1 1.5 0.2 setosa
5.0 3.6 1.4 0.2 setosa
5.4 3.9 1.7 0.4 setosa
Code
tail(d)
sepal_length sepal_width petal_length petal_width species
145 6.7 3.3 5.7 2.5 virginica
146 6.7 3.0 5.2 2.3 virginica
147 6.3 2.5 5.0 1.9 virginica
148 6.5 3.0 5.2 2.0 virginica
149 6.2 3.4 5.4 2.3 virginica
150 5.9 3.0 5.1 1.8 virginica
Code
str(d)
'data.frame':   150 obs. of  5 variables:
 $ sepal_length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
 $ sepal_width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
 $ petal_length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
 $ petal_width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
 $ species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
Code
summary(d)
  sepal_length    sepal_width     petal_length    petal_width   
 Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
 1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
 Median :5.800   Median :3.000   Median :4.350   Median :1.300  
 Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
 3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
 Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
       species  
 setosa    :50  
 versicolor:50  
 virginica :50  
                
                
                

partitioning the iris dataset

Code
sr=0.7
tmp=sample(1:nrow(d),nrow(d)*sr)
dtrain=d[tmp,]
dtest=d[-tmp,]
nrow(dtrain)
[1] 105
Code
nrow(dtest)
[1] 45

creating dataset for prediction

Code
n=10
sl=runif(n,min(d$sepal_length),max(d$sepal_length))
sw=runif(n,min(d$sepal_width),max(d$sepal_width))
pl=runif(n,min(d$petal_length),max(d$petal_length))
pw=runif(n,min(d$petal_width),max(d$petal_width))
dpred=data.frame(sepal_length=sl,
                 sepal_width=sw,
                 petal_length=pl,
                 petal_width=pw
                 )
dpred[1,3]=1.2
dpred[1,4]=1
dpred[2,3]=1.2
dpred[2,4]=2

dpred
sepal_length sepal_width petal_length petal_width
5.915432 2.184280 1.200000 1.000000
5.146724 2.414594 1.200000 2.000000
4.443416 3.435550 4.443416 3.435550
4.717700 3.626872 4.717700 3.626872
6.786433 4.355298 6.786433 4.355298
6.338248 3.883521 6.338248 3.883521
6.691690 4.287332 6.691690 4.287332
6.487944 2.246687 6.487944 2.246687
5.153071 3.903833 5.153071 3.903833
5.774870 2.430127 5.774870 2.430127

creating model using rpart

Code
rp=rpart::rpart(species~.,dtrain)
rp
n= 105 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 105 66 virginica (0.34285714 0.28571429 0.37142857)  
  2) petal_length< 2.45 36  0 setosa (1.00000000 0.00000000 0.00000000) *
  3) petal_length>=2.45 69 30 virginica (0.00000000 0.43478261 0.56521739)  
    6) petal_width< 1.75 33  3 versicolor (0.00000000 0.90909091 0.09090909) *
    7) petal_width>=1.75 36  0 virginica (0.00000000 0.00000000 1.00000000) *
Code
# plot(rp)
# text(rp)
rpart.plot::rpart.plot(rp)

Code
pred=predict(rp,dtest,type="class")
pred
         8         12         17         19         22         24         30 
    setosa     setosa     setosa     setosa     setosa     setosa     setosa 
        32         33         39         45         47         48         49 
    setosa     setosa     setosa     setosa     setosa     setosa     setosa 
        51         57         60         61         64         66         68 
versicolor versicolor versicolor versicolor versicolor versicolor versicolor 
        69         70         71         74         77         80         81 
versicolor versicolor  virginica versicolor versicolor versicolor versicolor 
        82         84         89         90         95         98        102 
versicolor versicolor versicolor versicolor versicolor versicolor  virginica 
       105        114        116        122        125        126        132 
 virginica  virginica  virginica  virginica  virginica  virginica  virginica 
       134        135        149 
versicolor versicolor  virginica 
Levels: setosa versicolor virginica
Code
t=table(dtest$species,pred)
acc=sum(diag(t))/sum(t)
acc
[1] 0.9333333
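the complexity-parameter table of the fitted tree can be inspected to judge whether pruning would help; a minimal sketch using rpart's own helpers:

Code
printcp(rp)   # cross-validated error at each complexity parameter
plotcp(rp)    # visual aid for choosing a cp value to prune at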

predicting with the rpart model

Code
pred=predict(rp,dpred,type="class")
pred
        1         2         3         4         5         6         7         8 
   setosa    setosa virginica virginica virginica virginica virginica virginica 
        9        10 
virginica virginica 
Levels: setosa versicolor virginica

creating model using randomforest

Code
rf=randomForest::randomForest(species~.,dtrain,mtry=3,ntree=1000,proximity=T,importance=T)
rf

Call:
 randomForest(formula = species ~ ., data = dtrain, mtry = 3,      ntree = 1000, proximity = T, importance = T) 
               Type of random forest: classification
                     Number of trees: 1000
No. of variables tried at each split: 3

        OOB estimate of  error rate: 3.81%
Confusion matrix:
           setosa versicolor virginica class.error
setosa         36          0         0  0.00000000
versicolor      0         29         1  0.03333333
virginica       0          3        36  0.07692308
Code
summary(rf)
                Length Class  Mode     
call                7  -none- call     
type                1  -none- character
predicted         105  factor numeric  
err.rate         4000  -none- numeric  
confusion          12  -none- numeric  
votes             315  matrix numeric  
oob.times         105  -none- numeric  
classes             3  -none- character
importance         20  -none- numeric  
importanceSD       16  -none- numeric  
localImportance     0  -none- NULL     
proximity       11025  -none- numeric  
ntree               1  -none- numeric  
mtry                1  -none- numeric  
forest             14  -none- list     
y                 105  factor numeric  
test                0  -none- NULL     
inbag               0  -none- NULL     
terms               3  terms  call     
Code
pred=predict(rf,dtest)
pred
         8         12         17         19         22         24         30 
    setosa     setosa     setosa     setosa     setosa     setosa     setosa 
        32         33         39         45         47         48         49 
    setosa     setosa     setosa     setosa     setosa     setosa     setosa 
        51         57         60         61         64         66         68 
versicolor versicolor versicolor versicolor versicolor versicolor versicolor 
        69         70         71         74         77         80         81 
versicolor versicolor  virginica versicolor versicolor versicolor versicolor 
        82         84         89         90         95         98        102 
versicolor  virginica versicolor versicolor versicolor versicolor  virginica 
       105        114        116        122        125        126        132 
 virginica  virginica  virginica  virginica  virginica  virginica  virginica 
       134        135        149 
versicolor  virginica  virginica 
Levels: setosa versicolor virginica
Code
t=table(pred,dtest$species)
t
            
pred         setosa versicolor virginica
  setosa         14          0         0
  versicolor      0         18         1
  virginica       0          2        10
Code
acc=sum(diag(t))/sum(t)
acc
[1] 0.9333333
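since the forest was grown with importance=T, the variable importance scores can be examined directly; a minimal sketch:

Code
randomForest::importance(rf)   # importance scores per predictor
randomForest::varImpPlot(rf)   # dot chart of the same scores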

predicting output using rf model

Code
pred=predict(rf,dpred)
pred
        1         2         3         4         5         6         7         8 
   setosa virginica virginica virginica virginica virginica virginica virginica 
        9        10 
virginica virginica 
Levels: setosa versicolor virginica
Code
dpred$species=pred
dpred
sepal_length sepal_width petal_length petal_width species
5.915432 2.184280 1.200000 1.000000 setosa
5.146724 2.414594 1.200000 2.000000 virginica
4.443416 3.435550 4.443416 3.435550 virginica
4.717700 3.626872 4.717700 3.626872 virginica
6.786433 4.355298 6.786433 4.355298 virginica
6.338248 3.883521 6.338248 3.883521 virginica
6.691690 4.287332 6.691690 4.287332 virginica
6.487944 2.246687 6.487944 2.246687 virginica
5.153071 3.903833 5.153071 3.903833 virginica
5.774870 2.430127 5.774870 2.430127 virginica

clustering

Code
df <- iris[,c(1:4)]

# Omitting any NA values
df <- na.omit(df)

# Scaling dataset
df <- scale(df)

# save the output as a PNG file
png(file = "KMeansExample.png")

km <- kmeans(df, centers = 3,iter.max = 30, nstart = 25)
#str(km)
km
K-means clustering with 3 clusters of sizes 47, 50, 53

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   1.13217737  0.08812645    0.9928284   1.0141287
2  -1.01119138  0.85041372   -1.3006301  -1.2507035
3  -0.05005221 -0.88042696    0.3465767   0.2805873

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  2   2   2   2   2   2   2   2   2   2   1   1   1   3   3   3   1   3   3   3 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  3   3   3   3   3   1   3   3   3   3   1   3   3   3   3   1   1   1   3   3 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  3   3   3   3   3   1   1   3   3   3   3   3   3   3   3   3   3   3   3   3 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   3   1   1   1   1   3   1   1   1   1   1   1   3   3   1   1   1   1   3 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   3   1   3   1   1   3   1   1   1   1   1   1   3   3   1   1   1   3   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   3   1   1   1   3   1   1   3 

Within cluster sum of squares by cluster:
[1] 47.45019 47.35062 44.08754
 (between_SS / total_SS =  76.7 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
Code
# Visualize the clusters
fviz_cluster(km, data = df)

# saving the file 
dev.off()
png 
  2 
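because the true species labels are known for iris, the 3-cluster solution can be cross-tabulated against them to see how well the clusters recover the species; a minimal sketch (km is still the 3-centre fit at this point):

Code
table(iris$Species,km$cluster)
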
Code
# save the output as a PNG file
png(file = "KMeansExample2.png")

km <- kmeans(df, centers = 4,iter.max = 30, nstart = 25)
# Visualize the clusters
fviz_cluster(km, data = df)
km
K-means clustering with 4 clusters of sizes 47, 25, 53, 25

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   1.13217737  0.08812645    0.9928284   1.0141287
2  -0.71894419  1.50198969   -1.2972312  -1.2165934
3  -0.05005221 -0.88042696    0.3465767   0.2805873
4  -1.30343857  0.19883774   -1.3040289  -1.2848136

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  2   4   4   4   2   2   4   4   4   4   2   4   4   4   2   2   2   2   2   2 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  2   2   2   4   4   4   4   2   2   4   4   2   2   2   4   4   2   2   4   4 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  2   4   4   2   2   4   2   4   2   4   1   1   1   3   3   3   1   3   3   3 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  3   3   3   3   3   1   3   3   3   3   1   3   3   3   3   1   1   1   3   3 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  3   3   3   3   3   1   1   3   3   3   3   3   3   3   3   3   3   3   3   3 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   3   1   1   1   1   3   1   1   1   1   1   1   3   3   1   1   1   1   3 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   3   1   3   1   1   3   1   1   1   1   1   1   3   3   1   1   1   3   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   3   1   1   1   3   1   1   3 

Within cluster sum of squares by cluster:
[1] 47.450194 12.147537 44.087545  9.646348
 (between_SS / total_SS =  81.0 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
Code
# saving the file 
dev.off()
png 
  2 
Code
#str(km)
km
K-means clustering with 4 clusters of sizes 47, 25, 53, 25

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   1.13217737  0.08812645    0.9928284   1.0141287
2  -0.71894419  1.50198969   -1.2972312  -1.2165934
3  -0.05005221 -0.88042696    0.3465767   0.2805873
4  -1.30343857  0.19883774   -1.3040289  -1.2848136

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  2   4   4   4   2   2   4   4   4   4   2   4   4   4   2   2   2   2   2   2 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  2   2   2   4   4   4   4   2   2   4   4   2   2   2   4   4   2   2   4   4 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  2   4   4   2   2   4   2   4   2   4   1   1   1   3   3   3   1   3   3   3 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  3   3   3   3   3   1   3   3   3   3   1   3   3   3   3   1   1   1   3   3 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  3   3   3   3   3   1   1   3   3   3   3   3   3   3   3   3   3   3   3   3 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   3   1   1   1   1   3   1   1   1   1   1   1   3   3   1   1   1   1   3 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   3   1   3   1   1   3   1   1   1   1   1   1   3   3   1   1   1   3   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   3   1   1   1   3   1   1   3 

Within cluster sum of squares by cluster:
[1] 47.450194 12.147537 44.087545  9.646348
 (between_SS / total_SS =  81.0 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
Code
df
    Sepal.Length Sepal.Width Petal.Length   Petal.Width
1    -0.89767388  1.01560199  -1.33575163 -1.3110521482
2    -1.13920048 -0.13153881  -1.33575163 -1.3110521482
3    -1.38072709  0.32731751  -1.39239929 -1.3110521482
4    -1.50149039  0.09788935  -1.27910398 -1.3110521482
5    -1.01843718  1.24503015  -1.33575163 -1.3110521482
6    -0.53538397  1.93331463  -1.16580868 -1.0486667950
7    -1.50149039  0.78617383  -1.33575163 -1.1798594716
8    -1.01843718  0.78617383  -1.27910398 -1.3110521482
9    -1.74301699 -0.36096697  -1.33575163 -1.3110521482
10   -1.13920048  0.09788935  -1.27910398 -1.4422448248
11   -0.53538397  1.47445831  -1.27910398 -1.3110521482
12   -1.25996379  0.78617383  -1.22245633 -1.3110521482
13   -1.25996379 -0.13153881  -1.33575163 -1.4422448248
14   -1.86378030 -0.13153881  -1.50569459 -1.4422448248
15   -0.05233076  2.16274279  -1.44904694 -1.3110521482
16   -0.17309407  3.08045544  -1.27910398 -1.0486667950
17   -0.53538397  1.93331463  -1.39239929 -1.0486667950
18   -0.89767388  1.01560199  -1.33575163 -1.1798594716
19   -0.17309407  1.70388647  -1.16580868 -1.1798594716
20   -0.89767388  1.70388647  -1.27910398 -1.1798594716
21   -0.53538397  0.78617383  -1.16580868 -1.3110521482
22   -0.89767388  1.47445831  -1.27910398 -1.0486667950
23   -1.50149039  1.24503015  -1.56234224 -1.3110521482
24   -0.89767388  0.55674567  -1.16580868 -0.9174741184
25   -1.25996379  0.78617383  -1.05251337 -1.3110521482
26   -1.01843718 -0.13153881  -1.22245633 -1.3110521482
27   -1.01843718  0.78617383  -1.22245633 -1.0486667950
28   -0.77691058  1.01560199  -1.27910398 -1.3110521482
29   -0.77691058  0.78617383  -1.33575163 -1.3110521482
30   -1.38072709  0.32731751  -1.22245633 -1.3110521482
31   -1.25996379  0.09788935  -1.22245633 -1.3110521482
32   -0.53538397  0.78617383  -1.27910398 -1.0486667950
33   -0.77691058  2.39217095  -1.27910398 -1.4422448248
34   -0.41462067  2.62159911  -1.33575163 -1.3110521482
35   -1.13920048  0.09788935  -1.27910398 -1.3110521482
36   -1.01843718  0.32731751  -1.44904694 -1.3110521482
37   -0.41462067  1.01560199  -1.39239929 -1.3110521482
38   -1.13920048  1.24503015  -1.33575163 -1.4422448248
39   -1.74301699 -0.13153881  -1.39239929 -1.3110521482
40   -0.89767388  0.78617383  -1.27910398 -1.3110521482
41   -1.01843718  1.01560199  -1.39239929 -1.1798594716
42   -1.62225369 -1.73753594  -1.39239929 -1.1798594716
43   -1.74301699  0.32731751  -1.39239929 -1.3110521482
44   -1.01843718  1.01560199  -1.22245633 -0.7862814418
45   -0.89767388  1.70388647  -1.05251337 -1.0486667950
46   -1.25996379 -0.13153881  -1.33575163 -1.1798594716
47   -0.89767388  1.70388647  -1.22245633 -1.3110521482
48   -1.50149039  0.32731751  -1.33575163 -1.3110521482
49   -0.65614727  1.47445831  -1.27910398 -1.3110521482
50   -1.01843718  0.55674567  -1.33575163 -1.3110521482
51    1.39682886  0.32731751   0.53362088  0.2632599711
52    0.67224905  0.32731751   0.42032558  0.3944526477
53    1.27606556  0.09788935   0.64691619  0.3944526477
54   -0.41462067 -1.73753594   0.13708732  0.1320672944
55    0.79301235 -0.59039513   0.47697323  0.3944526477
56   -0.17309407 -0.59039513   0.42032558  0.1320672944
57    0.55148575  0.55674567   0.53362088  0.5256453243
58   -1.13920048 -1.50810778  -0.25944625 -0.2615107354
59    0.91377565 -0.36096697   0.47697323  0.1320672944
60   -0.77691058 -0.81982329   0.08043967  0.2632599711
61   -1.01843718 -2.42582042  -0.14615094 -0.2615107354
62    0.06843254 -0.13153881   0.25038262  0.3944526477
63    0.18919584 -1.96696410   0.13708732 -0.2615107354
64    0.30995914 -0.36096697   0.53362088  0.2632599711
65   -0.29385737 -0.36096697  -0.08950329  0.1320672944
66    1.03453895  0.09788935   0.36367793  0.2632599711
67   -0.29385737 -0.13153881   0.42032558  0.3944526477
68   -0.05233076 -0.81982329   0.19373497 -0.2615107354
69    0.43072244 -1.96696410   0.42032558  0.3944526477
70   -0.29385737 -1.27867961   0.08043967 -0.1303180588
71    0.06843254  0.32731751   0.59026853  0.7880306775
72    0.30995914 -0.59039513   0.13708732  0.1320672944
73    0.55148575 -1.27867961   0.64691619  0.3944526477
74    0.30995914 -0.59039513   0.53362088  0.0008746178
75    0.67224905 -0.36096697   0.30703027  0.1320672944
76    0.91377565 -0.13153881   0.36367793  0.2632599711
77    1.15530226 -0.59039513   0.59026853  0.2632599711
78    1.03453895 -0.13153881   0.70356384  0.6568380009
79    0.18919584 -0.36096697   0.42032558  0.3944526477
80   -0.17309407 -1.04925145  -0.14615094 -0.2615107354
81   -0.41462067 -1.50810778   0.02379201 -0.1303180588
82   -0.41462067 -1.50810778  -0.03285564 -0.2615107354
83   -0.05233076 -0.81982329   0.08043967  0.0008746178
84    0.18919584 -0.81982329   0.76021149  0.5256453243
85   -0.53538397 -0.13153881   0.42032558  0.3944526477
86    0.18919584  0.78617383   0.42032558  0.5256453243
87    1.03453895  0.09788935   0.53362088  0.3944526477
88    0.55148575 -1.73753594   0.36367793  0.1320672944
89   -0.29385737 -0.13153881   0.19373497  0.1320672944
90   -0.41462067 -1.27867961   0.13708732  0.1320672944
91   -0.41462067 -1.04925145   0.36367793  0.0008746178
92    0.30995914 -0.13153881   0.47697323  0.2632599711
93   -0.05233076 -1.04925145   0.13708732  0.0008746178
94   -1.01843718 -1.73753594  -0.25944625 -0.2615107354
95   -0.29385737 -0.81982329   0.25038262  0.1320672944
96   -0.17309407 -0.13153881   0.25038262  0.0008746178
97   -0.17309407 -0.36096697   0.25038262  0.1320672944
98    0.43072244 -0.36096697   0.30703027  0.1320672944
99   -0.89767388 -1.27867961  -0.42938920 -0.1303180588
100  -0.17309407 -0.59039513   0.19373497  0.1320672944
101   0.55148575  0.55674567   1.27004036  1.7063794137
102  -0.05233076 -0.81982329   0.76021149  0.9192233541
103   1.51759216 -0.13153881   1.21339271  1.1816087073
104   0.55148575 -0.36096697   1.04344975  0.7880306775
105   0.79301235 -0.13153881   1.15674505  1.3128013839
106   2.12140867 -0.13153881   1.60992627  1.1816087073
107  -1.13920048 -1.27867961   0.42032558  0.6568380009
108   1.75911877 -0.36096697   1.43998331  0.7880306775
109   1.03453895 -1.27867961   1.15674505  0.7880306775
110   1.63835547  1.24503015   1.32668801  1.7063794137
111   0.79301235  0.32731751   0.76021149  1.0504160307
112   0.67224905 -0.81982329   0.87350679  0.9192233541
113   1.15530226 -0.13153881   0.98680210  1.1816087073
114  -0.17309407 -1.27867961   0.70356384  1.0504160307
115  -0.05233076 -0.59039513   0.76021149  1.5751867371
116   0.67224905  0.32731751   0.87350679  1.4439940605
117   0.79301235 -0.13153881   0.98680210  0.7880306775
118   2.24217198  1.70388647   1.66657392  1.3128013839
119   2.24217198 -1.04925145   1.77986923  1.4439940605
120   0.18919584 -1.96696410   0.70356384  0.3944526477
121   1.27606556  0.32731751   1.10009740  1.4439940605
122  -0.29385737 -0.59039513   0.64691619  1.0504160307
123   2.24217198 -0.59039513   1.66657392  1.0504160307
124   0.55148575 -0.81982329   0.64691619  0.7880306775
125   1.03453895  0.55674567   1.10009740  1.1816087073
126   1.63835547  0.32731751   1.27004036  0.7880306775
127   0.43072244 -0.59039513   0.59026853  0.7880306775
128   0.30995914 -0.13153881   0.64691619  0.7880306775
129   0.67224905 -0.59039513   1.04344975  1.1816087073
130   1.63835547 -0.13153881   1.15674505  0.5256453243
131   1.87988207 -0.59039513   1.32668801  0.9192233541
132   2.48369858  1.70388647   1.49663097  1.0504160307
133   0.67224905 -0.59039513   1.04344975  1.3128013839
134   0.55148575 -0.59039513   0.76021149  0.3944526477
135   0.30995914 -1.04925145   1.04344975  0.2632599711
136   2.24217198 -0.13153881   1.32668801  1.4439940605
137   0.55148575  0.78617383   1.04344975  1.5751867371
138   0.67224905  0.09788935   0.98680210  0.7880306775
139   0.18919584 -0.13153881   0.59026853  0.7880306775
140   1.27606556  0.09788935   0.93015445  1.1816087073
141   1.03453895  0.09788935   1.04344975  1.5751867371
142   1.27606556  0.09788935   0.76021149  1.4439940605
143  -0.05233076 -0.81982329   0.76021149  0.9192233541
144   1.15530226  0.32731751   1.21339271  1.4439940605
145   1.03453895  0.55674567   1.10009740  1.7063794137
146   1.03453895 -0.13153881   0.81685914  1.4439940605
147   0.55148575 -1.27867961   0.70356384  0.9192233541
148   0.79301235 -0.13153881   0.81685914  1.0504160307
149   0.43072244  0.78617383   0.93015445  1.4439940605
150   0.06843254 -0.13153881   0.76021149  0.7880306775
attr(,"scaled:center")
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
    5.843333     3.057333     3.758000     1.199333 
attr(,"scaled:scale")
Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
   0.8280661    0.4358663    1.7652982    0.7622377 
Code
km=list()
for(i in 2:10){
  p=paste("k",i,".png",sep="")
  png(file=p)
  km[[i]]=kmeans(df,centers=i,iter.max = 30,nstart = 25,trace = F)
  # inside a loop the plot must be printed explicitly to reach the png device
  print(fviz_cluster(km[[i]], data = df,ellipse.type="norm"))
  dev.off()
}

km
[[1]]
NULL

[[2]]
K-means clustering with 2 clusters of sizes 100, 50

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1    0.5055957  -0.4252069     0.650315   0.6253518
2   -1.0111914   0.8504137    -1.300630  -1.2507035

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  2   2   2   2   2   2   2   2   2   2   1   1   1   1   1   1   1   1   1   1 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   1   1   1   1   1   1   1   1 

Within cluster sum of squares by cluster:
[1] 173.52867  47.35062
 (between_SS / total_SS =  62.9 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[3]]
K-means clustering with 3 clusters of sizes 47, 53, 50

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   1.13217737  0.08812645    0.9928284   1.0141287
2  -0.05005221 -0.88042696    0.3465767   0.2805873
3  -1.01119138  0.85041372   -1.3006301  -1.2507035

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3   3 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  3   3   3   3   3   3   3   3   3   3   1   1   1   2   2   2   1   2   2   2 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  2   2   2   2   2   1   2   2   2   2   1   2   2   2   2   1   1   1   2   2 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  2   2   2   2   2   1   1   2   2   2   2   2   2   2   2   2   2   2   2   2 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   2   1   1   1   1   2   1   1   1   1   1   1   2   2   1   1   1   1   2 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   2   1   2   1   1   2   1   1   1   1   1   1   2   2   1   1   1   2   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   2   1   1   1   2   1   1   2 

Within cluster sum of squares by cluster:
[1] 47.45019 44.08754 47.35062
 (between_SS / total_SS =  76.7 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[4]]
K-means clustering with 4 clusters of sizes 53, 25, 47, 25

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1  -0.05005221 -0.88042696    0.3465767   0.2805873
2  -1.30343857  0.19883774   -1.3040289  -1.2848136
3   1.13217737  0.08812645    0.9928284   1.0141287
4  -0.71894419  1.50198969   -1.2972312  -1.2165934

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  4   2   2   2   4   4   2   2   2   2   4   2   2   2   4   4   4   4   4   4 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  4   4   4   2   2   2   2   4   4   2   2   4   4   4   2   2   4   4   2   2 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  4   2   2   4   4   2   4   2   4   2   3   3   3   1   1   1   3   1   1   1 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  1   1   1   1   1   3   1   1   1   1   3   1   1   1   1   3   3   3   1   1 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  1   1   1   1   1   3   3   1   1   1   1   1   1   1   1   1   1   1   1   1 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  3   1   3   3   3   3   1   3   3   3   3   3   3   1   1   3   3   3   3   1 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  3   1   3   1   3   3   1   3   3   3   3   3   3   1   1   3   3   3   1   3 
141 142 143 144 145 146 147 148 149 150 
  3   3   1   3   3   3   1   3   3   1 

Within cluster sum of squares by cluster:
[1] 44.087545  9.646348 47.450194 12.147537
 (between_SS / total_SS =  81.0 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[5]]
K-means clustering with 5 clusters of sizes 29, 48, 25, 23, 25

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1    1.3926646   0.2323817    1.1567451  1.21327591
2    0.3804044  -0.3896455    0.6067908  0.56390985
3   -1.3034386   0.1988377   -1.3040289 -1.28481361
4   -0.3516137  -1.3285553    0.1026061  0.01228268
5   -0.7189442   1.5019897   -1.2972312 -1.21659342

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  5   3   3   3   5   5   3   3   3   3   5   3   3   3   5   5   5   5   5   5 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  5   5   5   3   3   3   3   5   5   3   3   5   5   5   3   3   5   5   3   3 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  5   3   3   5   5   3   5   3   5   3   1   2   1   4   2   2   2   4   2   4 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  4   2   4   2   4   2   2   4   4   4   2   2   2   2   2   2   2   2   2   4 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  4   4   4   2   2   2   2   4   2   4   4   2   4   4   4   2   2   2   4   4 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   2   1   2   1   1   4   1   2   1   1   2   1   2   2   1   2   1   1   4 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   2   1   2   1   1   2   2   2   1   1   1   2   2   2   1   1   2   2   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   2   1   1   1   2   2   1   2 

Within cluster sum of squares by cluster:
[1] 26.891293 27.830133  9.646348 13.686590 12.147537
 (between_SS / total_SS =  84.9 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[6]]
K-means clustering with 6 clusters of sizes 29, 21, 38, 25, 25, 12

Cluster means:
  Sepal.Length Sepal.Width Petal.Length   Petal.Width
1    0.8596404   0.1928251    0.8520198  1.0504160307
2   -0.3628650  -1.4097814    0.1074147  0.0008746178
3    0.2527555  -0.5360569    0.5470374  0.4911209357
4   -1.3034386   0.1988377   -1.3040289 -1.2848136129
5   -0.7189442   1.5019897   -1.2972312 -1.2165934210
6    1.9704545   0.1552464    1.4399833  1.1160123690

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  5   4   4   4   5   5   4   4   4   4   5   4   4   4   5   5   5   5   5   5 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  5   5   5   4   4   4   4   5   5   4   4   5   5   5   4   4   5   5   4   4 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  5   4   4   5   5   4   5   4   5   4   1   1   1   2   3   3   1   2   3   2 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  2   3   2   3   3   1   3   2   2   2   1   3   3   3   3   3   3   1   3   2 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  2   2   2   3   3   1   1   2   3   2   2   3   2   2   2   3   3   3   2   3 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   3   6   3   1   6   2   6   3   6   1   3   1   3   3   1   1   6   6   2 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   3   6   3   1   6   3   3   1   6   6   6   1   3   3   6   1   1   3   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   3   1   1   1   3   1   1   3 

Within cluster sum of squares by cluster:
[1] 14.596105 11.951942 19.109637  9.646348 12.147537 12.013666
 (between_SS / total_SS =  86.7 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[7]]
K-means clustering with 7 clusters of sizes 12, 38, 12, 21, 21, 29, 17

Cluster means:
  Sepal.Length Sepal.Width Petal.Length   Petal.Width
1    1.9704545  0.15524639    1.4399833  1.1160123690
2    0.2527555 -0.53605688    0.5470374  0.4911209357
3   -0.5454476  1.99067167   -1.2649421 -1.2126576408
4   -0.3628650 -1.40978142    0.1074147  0.0008746178
5   -0.9666815  0.92820079   -1.2925915 -1.2173430935
6    0.8596404  0.19282514    0.8520198  1.0504160307
7   -1.3949345 -0.05056417   -1.3357516 -1.3187693645

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  5   7   7   7   5   3   5   5   7   7   3   5   7   7   3   3   3   5   3   3 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  5   5   5   5   5   7   5   5   5   7   7   5   3   3   7   7   5   5   7   5 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  5   7   7   5   3   7   3   7   3   5   6   6   6   4   2   2   6   4   2   4 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  4   2   4   2   2   6   2   4   4   4   6   2   2   2   2   2   2   6   2   4 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  4   4   4   2   2   6   6   4   2   4   4   2   4   4   4   2   2   2   4   2 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  6   2   1   2   6   1   4   1   2   1   6   2   6   2   2   6   6   1   1   4 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  6   2   1   2   6   1   2   2   6   1   1   1   6   2   2   1   6   6   2   6 
141 142 143 144 145 146 147 148 149 150 
  6   6   2   6   6   6   2   6   6   2 

Within cluster sum of squares by cluster:
[1] 12.013666 19.109637  3.954505 11.951942  3.397867 14.596105  5.163861
 (between_SS / total_SS =  88.2 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[8]]
K-means clustering with 8 clusters of sizes 21, 27, 19, 21, 12, 17, 21, 12

Cluster means:
  Sepal.Length Sepal.Width Petal.Length   Petal.Width
1    0.9540301  0.21806600    0.9409445  1.1941032479
2    0.2383957 -0.14853349    0.3972469  0.3264268153
3    0.4243665 -0.80774813    0.7900260  0.8018404329
4   -0.9666815  0.92820079   -1.2925915 -1.2173430935
5    1.9704545  0.15524639    1.4399833  1.1160123690
6   -1.3949345 -0.05056417   -1.3357516 -1.3187693645
7   -0.3628650 -1.40978142    0.1074147  0.0008746178
8   -0.5454476  1.99067167   -1.2649421 -1.2126576408

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  4   6   6   6   4   8   4   4   6   6   8   4   6   6   8   8   8   4   8   8 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  4   4   4   4   4   6   4   4   4   6   6   4   8   8   6   6   4   4   6   4 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  4   6   6   4   8   6   8   6   8   4   1   2   1   7   3   2   2   7   2   7 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  7   2   7   2   2   2   2   7   7   7   2   2   3   2   2   2   3   1   2   7 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  7   7   7   3   2   2   2   7   2   7   7   2   7   7   7   2   2   2   7   2 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   3   5   3   1   5   7   5   3   5   1   3   1   3   3   1   1   5   5   7 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   3   5   3   1   5   3   2   3   5   5   5   3   3   3   5   1   1   2   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   3   1   1   1   3   1   1   2 

Within cluster sum of squares by cluster:
[1]  7.551820 10.338980  7.448417  3.397867 12.013666  5.163861 11.951942
[8]  3.954505
 (between_SS / total_SS =  89.6 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[9]]
K-means clustering with 9 clusters of sizes 16, 23, 11, 18, 14, 21, 18, 17, 12

Cluster means:
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   0.98170501  0.35599603   1.02574736  1.42759498
2   0.69325136 -0.05173771   0.60997207  0.51423727
3   2.00064537  0.05617514   1.45028289  1.06234264
4   0.34350450 -1.04925145   0.75706440  0.79531916
5  -0.59576562 -1.45894460   0.00760697 -0.05535081
6  -0.96668148  0.92820079  -1.29259152 -1.21734309
7  -0.05233076 -0.46293504   0.27241226  0.14664426
8  -1.39493454 -0.05056417  -1.33575163 -1.31876936
9  -0.54544758  1.99067167  -1.26494207 -1.21265764

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  6   8   8   8   6   9   6   6   8   8   9   6   8   8   9   9   9   6   9   9 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  6   6   6   6   6   8   6   6   6   8   8   6   9   9   8   8   6   6   8   6 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  6   8   8   6   9   8   9   8   9   6   2   2   2   5   2   7   2   5   2   5 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  5   7   5   7   7   2   7   7   4   5   2   7   4   7   2   2   2   2   7   5 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  5   5   7   4   7   2   2   4   7   5   5   2   7   5   7   7   7   7   5   7 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  1   4   3   2   1   3   5   3   4   1   1   4   1   4   4   1   2   3   3   4 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  1   4   3   4   1   3   4   2   4   3   3   3   4   2   4   3   1   2   2   1 
141 142 143 144 145 146 147 148 149 150 
  1   1   4   1   1   1   4   2   1   2 

Within cluster sum of squares by cluster:
[1]  4.863671  8.410043 10.203534  9.121563  6.046325  3.397867  3.579667
[8]  5.163861  3.954505
 (between_SS / total_SS =  90.8 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      

[[10]]
K-means clustering with 10 clusters of sizes 9, 12, 12, 18, 18, 16, 3, 24, 17, 21

Cluster means:
   Sepal.Length Sepal.Width Petal.Length Petal.Width
1     1.9201365 -0.30998294    1.4211008  1.03583907
2    -0.5454476  1.99067167   -1.2649421 -1.21265764
3    -0.5957656 -1.54634580   -0.0281350 -0.08658717
4    -0.1596759 -0.52666509    0.2503826  0.13935578
5     0.3435045 -1.04925145    0.7570644  0.79531916
6     0.9288711  0.26996047    0.9938831  1.38659726
7     2.1214087  1.55093437    1.4966310  1.35653228
8     0.6621854 -0.07418177    0.5855479  0.46551535
9    -1.3949345 -0.05056417   -1.3357516 -1.31876936
10   -0.9666815  0.92820079   -1.2925915 -1.21734309

Clustering vector:
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
 10   9   9   9  10   2  10  10   9   9   2  10   9   9   2   2   2  10   2   2 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
 10  10  10  10  10   9  10  10  10   9   9  10   2   2   9   9  10  10   9  10 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
 10   9   9  10   2   9   2   9   2  10   8   8   8   3   8   4   8   3   8   4 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  3   4   3   8   4   8   4   4   5   3   8   4   5   4   8   8   8   8   4   3 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  3   3   4   5   4   8   8   5   4   3   4   8   4   3   4   4   4   8   3   4 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  6   5   1   8   6   1   3   1   5   7   6   5   6   5   5   6   8   7   1   5 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  6   5   1   5   6   1   5   8   5   1   1   7   5   8   5   1   6   8   8   6 
141 142 143 144 145 146 147 148 149 150 
  6   6   5   6   6   6   5   6   6   8 

Within cluster sum of squares by cluster:
 [1] 3.091184 3.954505 5.048902 4.028262 9.121563 3.726772 0.795318 8.670618
 [9] 5.163861 3.397867
 (between_SS / total_SS =  92.1 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
[6] "betweenss"    "size"         "iter"         "ifault"      
Code
png(file="tmp1.png")
fviz_nbclust(df, kmeans, method = "silhouette")+
  labs(subtitle = "Silhouette method")
dev.off()
png 
  2 
Code
png(file="tmp2.png")
fviz_nbclust(df, kmeans, nstart = 25,  method = "gap_stat", nboot = 50)+
  labs(subtitle = "Gap statistic method")
dev.off()
png 
  2 
Code
png(file="tmp3.png")
NbClust(data = df, diss = NULL, distance = "euclidean",
        min.nc = 2, max.nc = 15, method = "kmeans")
*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 
*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 10 proposed 2 as the best number of clusters 
* 6 proposed 3 as the best number of clusters 
* 1 proposed 4 as the best number of clusters 
* 1 proposed 5 as the best number of clusters 
* 3 proposed 12 as the best number of clusters 
* 1 proposed 14 as the best number of clusters 
* 2 proposed 15 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 
$All.index
       KL       CH Hartigan    CCC     Scott   Marriot    TrCovW   TraceW
2  3.9498 251.3493  87.3699 3.3595  357.8871 1471010.8 1643.9577 220.8793
3  5.1669 241.9044  33.1486 5.1886  489.5281 1376126.9 1225.4423 138.8884
4  0.5567 207.2659  37.4374 3.6814  555.6392 1574434.9  705.5542 113.3319
5  3.5421 203.2674  19.5911 3.5789  652.9526 1285860.3  667.4659  90.2022
6  0.7874 187.2031  19.0351 3.3533  720.9245 1176948.3  510.6882  79.4655
7  1.1988 178.5481  16.2779 3.4533  763.2771 1207890.6  394.3442  70.1876
8  0.5699 171.5792  20.5630 3.5166  846.4971  905868.3  351.9925  63.0146
9  2.1882 173.2143  13.2186 4.2469  921.2836  696371.3  222.4485  55.0437
10 1.1910 168.6666   2.3371 4.3223  945.2575  732730.5  185.9404  50.3257
11 0.5545 153.4637  14.6665 3.2722  946.7235  877980.9  182.2093  49.4994
12 3.4040 154.4471   3.2103 3.7498 1020.6039  638493.7  153.0732  44.7750
13 1.5176 144.0858   6.7708 3.0052 1032.0196  694431.0  143.4864  43.7571
14 0.0701 139.0737  31.6603 2.7569 1054.2586  694400.3  120.0184  41.6964
15 4.5221 160.2774  10.5759 5.2266 1121.4592  509296.4   75.4837  33.8226
   Friedman   Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
2   50.5461  2.6983 0.2709 0.6828     0.5818 1.9311 -48.6978 -1.1403    0.5535
3   58.5837  4.2912 0.2428 0.9141     0.4599 0.4603  56.2860  2.7732    0.5028
4   61.9721  5.2589 0.3474 0.9814     0.3869 0.9204   4.3246  0.2048    0.4491
5   67.5363  6.6074 0.3598 1.0526     0.3455 2.2695 -31.8842 -1.2891    0.4114
6   77.5691  7.5001 0.3307 1.1560     0.3266 0.5343  27.0234  2.0266    0.3797
7   78.2758  8.4915 0.3177 1.1076     0.3254 0.6799  14.1232  1.0960    0.3548
8   86.3269  9.4581 0.2989 1.1326     0.3227 1.8821 -14.9977 -1.0719    0.3341
9   95.0688 10.8278 0.2730 1.0595     0.3388 0.9118   3.4826  0.2219    0.3174
10  94.7241 11.8429 0.2580 1.0584     0.3377 1.4544  -8.1230 -0.7071    0.3025
11  91.0161 12.0406 0.2545 1.1341     0.3075 2.1682 -13.4695 -1.2007    0.2886
12 105.5354 13.3110 0.2442 0.9875     0.3299 1.5579  -6.8042 -0.8213    0.2775
13 106.5486 13.6207 0.2469 1.0763     0.2958 1.8617 -11.5714 -1.0158    0.2669
14 108.5813 14.2938 0.2412 1.0789     0.2918 1.1559  -1.4840 -0.2931    0.2577
15 110.4875 17.6214 0.3555 0.9910     0.3154 2.1520  -8.5650 -1.1749    0.2507
       Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
2  110.4396     0.7815  1.4732  0.3492 0.2674 0.0028  3.0377 1.0566 0.4276
3   46.2961     0.6797  2.0078  0.7938 0.0265 0.0030  2.8248 0.8573 0.5612
4   28.3330     0.6245  0.6583  1.0201 0.0399 0.0032  2.5023 0.7738 0.4574
5   18.0404     0.5905  0.7997  1.2718 0.0808 0.0034  2.4689 0.6936 0.2555
6   13.2443     0.5556  0.9092  1.5025 0.0842 0.0036  2.9665 0.6558 0.2588
7   10.0268     0.5340  0.9015  1.6553 0.0912 0.0037  2.7896 0.6120 0.1772
8    7.8768     0.4992  0.6392  1.9329 0.0861 0.0037  3.0573 0.5747 0.1510
9    6.1160     0.4624  0.2551  2.2801 0.0861 0.0038  3.0630 0.5336 0.1361
10   5.0326     0.4561  1.8980  2.3325 0.0861 0.0038  2.9536 0.5155 0.1257
11   4.4999     0.4359  0.2507  2.5654 0.0475 0.0038  3.7685 0.5093 0.1325
12   3.7312     0.4307 -2.8078  2.6103 0.0912 0.0038  3.4704 0.4861 0.0956
13   3.3659     0.4028  0.7216  3.0160 0.0475 0.0039  5.5228 0.4790 0.1024
14   2.9783     0.3893  0.0340  3.2251 0.0475 0.0039  5.6579 0.4664 0.0844
15   2.2548     0.3919  0.1034  3.1206 0.0750 0.0040  6.0749 0.4380 0.0638

$All.CriticalValues
   CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
2          0.5551            80.9487       1.0000
3          0.5551            38.4707       0.0284
4          0.5633            38.7617       0.9355
5          0.4195            78.8634       1.0000
6          0.4590            36.5375       0.0961
7          0.4656            34.4267       0.3624
8          0.3890            50.2528       1.0000
9          0.3999            54.0151       0.9254
10         0.3508            48.1166       1.0000
11         0.3008            58.1006       1.0000
12         0.3999            28.5079       1.0000
13         0.2576            72.0597       1.0000
14         0.2316            36.4880       1.0000
15         0.2576            46.1182       1.0000

$Best.nc
                    KL       CH Hartigan     CCC    Scott  Marriot   TrCovW
Number_clusters 3.0000   2.0000   3.0000 15.0000   3.0000     12.0   4.0000
Value_Index     5.1669 251.3493  54.2213  5.2266 131.6411 295424.5 519.8881
                 TraceW Friedman   Rubin  Cindex     DB Silhouette   Duda
Number_clusters  3.0000  12.0000 12.0000 14.0000 2.0000     2.0000 2.0000
Value_Index     56.4345  14.5193 -0.9608  0.2412 0.6828     0.5818 1.9311
                PseudoT2   Beale Ratkowsky    Ball PtBiserial   Frey McClain
Number_clusters   2.0000  2.0000    2.0000  3.0000     2.0000 3.0000  2.0000
Value_Index     -48.6978 -1.1403    0.5535 64.1435     0.7815 2.0078  0.3492
                  Dunn Hubert SDindex Dindex    SDbw
Number_clusters 2.0000      0  5.0000      0 15.0000
Value_Index     0.2674      0  2.4689      0  0.0638

$Best.partition
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  1   1   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
141 142 143 144 145 146 147 148 149 150 
  2   2   2   2   2   2   2   2   2   2 
Code
dev.off()
png 
  2 
Code
NbClust(data = df, diss = NULL, distance = "euclidean",
        min.nc = 2, max.nc = 15, method = "kmeans")

*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 

*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 10 proposed 2 as the best number of clusters 
* 6 proposed 3 as the best number of clusters 
* 1 proposed 4 as the best number of clusters 
* 1 proposed 5 as the best number of clusters 
* 3 proposed 12 as the best number of clusters 
* 1 proposed 14 as the best number of clusters 
* 2 proposed 15 as the best number of clusters 

                   ***** Conclusion *****                            
 
* According to the majority rule, the best number of clusters is  2 
 
 
******************************************************************* 
$All.index
       KL       CH Hartigan    CCC     Scott   Marriot    TrCovW   TraceW
2  3.9498 251.3493  87.3699 3.3595  357.8871 1471010.8 1643.9577 220.8793
3  5.1669 241.9044  33.1486 5.1886  489.5281 1376126.9 1225.4423 138.8884
4  0.5567 207.2659  37.4374 3.6814  555.6392 1574434.9  705.5542 113.3319
5  3.5421 203.2674  19.5911 3.5789  652.9526 1285860.3  667.4659  90.2022
6  0.7874 187.2031  19.0351 3.3533  720.9245 1176948.3  510.6882  79.4655
7  1.1988 178.5481  16.2779 3.4533  763.2771 1207890.6  394.3442  70.1876
8  0.5699 171.5792  20.5630 3.5166  846.4971  905868.3  351.9925  63.0146
9  2.1882 173.2143  13.2186 4.2469  921.2836  696371.3  222.4485  55.0437
10 1.1910 168.6666   2.3371 4.3223  945.2575  732730.5  185.9404  50.3257
11 0.5545 153.4637  14.6665 3.2722  946.7235  877980.9  182.2093  49.4994
12 3.4040 154.4471   3.2103 3.7498 1020.6039  638493.7  153.0732  44.7750
13 1.5176 144.0858   6.7708 3.0052 1032.0196  694431.0  143.4864  43.7571
14 0.0701 139.0737  31.6603 2.7569 1054.2586  694400.3  120.0184  41.6964
15 4.5221 160.2774  10.5759 5.2266 1121.4592  509296.4   75.4837  33.8226
   Friedman   Rubin Cindex     DB Silhouette   Duda Pseudot2   Beale Ratkowsky
2   50.5461  2.6983 0.2709 0.6828     0.5818 1.9311 -48.6978 -1.1403    0.5535
3   58.5837  4.2912 0.2428 0.9141     0.4599 0.4603  56.2860  2.7732    0.5028
4   61.9721  5.2589 0.3474 0.9814     0.3869 0.9204   4.3246  0.2048    0.4491
5   67.5363  6.6074 0.3598 1.0526     0.3455 2.2695 -31.8842 -1.2891    0.4114
6   77.5691  7.5001 0.3307 1.1560     0.3266 0.5343  27.0234  2.0266    0.3797
7   78.2758  8.4915 0.3177 1.1076     0.3254 0.6799  14.1232  1.0960    0.3548
8   86.3269  9.4581 0.2989 1.1326     0.3227 1.8821 -14.9977 -1.0719    0.3341
9   95.0688 10.8278 0.2730 1.0595     0.3388 0.9118   3.4826  0.2219    0.3174
10  94.7241 11.8429 0.2580 1.0584     0.3377 1.4544  -8.1230 -0.7071    0.3025
11  91.0161 12.0406 0.2545 1.1341     0.3075 2.1682 -13.4695 -1.2007    0.2886
12 105.5354 13.3110 0.2442 0.9875     0.3299 1.5579  -6.8042 -0.8213    0.2775
13 106.5486 13.6207 0.2469 1.0763     0.2958 1.8617 -11.5714 -1.0158    0.2669
14 108.5813 14.2938 0.2412 1.0789     0.2918 1.1559  -1.4840 -0.2931    0.2577
15 110.4875 17.6214 0.3555 0.9910     0.3154 2.1520  -8.5650 -1.1749    0.2507
       Ball Ptbiserial    Frey McClain   Dunn Hubert SDindex Dindex   SDbw
2  110.4396     0.7815  1.4732  0.3492 0.2674 0.0028  3.0377 1.0566 0.4276
3   46.2961     0.6797  2.0078  0.7938 0.0265 0.0030  2.8248 0.8573 0.5612
4   28.3330     0.6245  0.6583  1.0201 0.0399 0.0032  2.5023 0.7738 0.4574
5   18.0404     0.5905  0.7997  1.2718 0.0808 0.0034  2.4689 0.6936 0.2555
6   13.2443     0.5556  0.9092  1.5025 0.0842 0.0036  2.9665 0.6558 0.2588
7   10.0268     0.5340  0.9015  1.6553 0.0912 0.0037  2.7896 0.6120 0.1772
8    7.8768     0.4992  0.6392  1.9329 0.0861 0.0037  3.0573 0.5747 0.1510
9    6.1160     0.4624  0.2551  2.2801 0.0861 0.0038  3.0630 0.5336 0.1361
10   5.0326     0.4561  1.8980  2.3325 0.0861 0.0038  2.9536 0.5155 0.1257
11   4.4999     0.4359  0.2507  2.5654 0.0475 0.0038  3.7685 0.5093 0.1325
12   3.7312     0.4307 -2.8078  2.6103 0.0912 0.0038  3.4704 0.4861 0.0956
13   3.3659     0.4028  0.7216  3.0160 0.0475 0.0039  5.5228 0.4790 0.1024
14   2.9783     0.3893  0.0340  3.2251 0.0475 0.0039  5.6579 0.4664 0.0844
15   2.2548     0.3919  0.1034  3.1206 0.0750 0.0040  6.0749 0.4380 0.0638

$All.CriticalValues
   CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
2          0.5551            80.9487       1.0000
3          0.5551            38.4707       0.0284
4          0.5633            38.7617       0.9355
5          0.4195            78.8634       1.0000
6          0.4590            36.5375       0.0961
7          0.4656            34.4267       0.3624
8          0.3890            50.2528       1.0000
9          0.3999            54.0151       0.9254
10         0.3508            48.1166       1.0000
11         0.3008            58.1006       1.0000
12         0.3999            28.5079       1.0000
13         0.2576            72.0597       1.0000
14         0.2316            36.4880       1.0000
15         0.2576            46.1182       1.0000

$Best.nc
                    KL       CH Hartigan     CCC    Scott  Marriot   TrCovW
Number_clusters 3.0000   2.0000   3.0000 15.0000   3.0000     12.0   4.0000
Value_Index     5.1669 251.3493  54.2213  5.2266 131.6411 295424.5 519.8881
                 TraceW Friedman   Rubin  Cindex     DB Silhouette   Duda
Number_clusters  3.0000  12.0000 12.0000 14.0000 2.0000     2.0000 2.0000
Value_Index     56.4345  14.5193 -0.9608  0.2412 0.6828     0.5818 1.9311
                PseudoT2   Beale Ratkowsky    Ball PtBiserial   Frey McClain
Number_clusters   2.0000  2.0000    2.0000  3.0000     2.0000 3.0000  2.0000
Value_Index     -48.6978 -1.1403    0.5535 64.1435     0.7815 2.0078  0.3492
                  Dunn Hubert SDindex Dindex    SDbw
Number_clusters 2.0000      0  5.0000      0 15.0000
Value_Index     0.2674      0  2.4689      0  0.0638

$Best.partition
  1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
 21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40 
  1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
 41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60 
  1   1   1   1   1   1   1   1   1   1   2   2   2   2   2   2   2   2   2   2 
 61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
 81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 
  2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2   2 
141 142 143 144 145 146 147 148 149 150 
  2   2   2   2   2   2   2   2   2   2 
Code
close_open_devices()
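
Since the majority rule above points to 2 clusters, a minimal follow-up sketch (assuming df still holds the standardised iris measurements passed to NbClust) is to refit kmeans at that k with several random starts and see how the partition lines up with the species labels:

Code
set.seed(1234)
km2=kmeans(df, centers = 2, nstart = 25)   # k suggested by the majority of indices
km2$size                                   # cluster sizes
km2$betweenss/km2$totss                    # proportion of variance between clusters
table(km2$cluster, iris$Species)           # alignment with the known species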

using a neural network on the iris data

Code
# iris data with snake_case column names
d=iris|>janitor::clean_names()
# 70/30 train-test split
rs=.7
tmp=sample(1:nrow(d),nrow(d)*rs)
d1=d[tmp,]   # training split
d2=d[-tmp,]  # test split

# one hidden layer with 5 neurons; linear.output = FALSE for classification
nn <- neuralnet(species~., 
                d1, 
                hidden = c(5),
                linear.output = FALSE)

plot(nn,rep = "best")

Code
# class scores on the test split; the predicted class is the column with
# the highest score for each row
pred=predict(nn,d2)
#pred
labels <- c("setosa", "versicolor", "virginica")
prediction_label <- data.frame(max.col(pred)) %>%     
  mutate(pred=labels[max.col.pred.]) %>%   # map winning column index to a label
  select(2) %>%
  unlist()
#prediction_label 
# confusion matrix and overall accuracy on the test split
t=table(d2$species, prediction_label)
acc=sum(diag(t))/sum(t)
acc
[1] 0.9111111
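
The 0.91 accuracy above comes from a single random 70/30 split. As a rough stability check (an illustrative sketch, not part of the original analysis), the split and fit can be repeated a few times and the accuracies compared:

Code
accs=replicate(5, {
  idx=sample(1:nrow(d), nrow(d)*rs)
  # refit on a fresh split; neuralnet may occasionally fail to converge, re-run if so
  fit=neuralnet(species~., d[idx,], hidden = c(5), linear.output = FALSE)
  p=predict(fit, d[-idx,])
  mean(levels(d$species)[max.col(p)] == d[-idx,]$species)   # accuracy on the held-out rows
})
round(accs, 3)
mean(accs)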
Code
# simulate 15 hypothetical flowers drawn uniformly over each feature's observed range
n=15
sl=runif(n,
         min(d$sepal_length),
         max(d$sepal_length)
         )
sw=runif(n,
         min(d$sepal_width),
         max(d$sepal_width)
)
pl=runif(n,
         min(d$petal_length),
         max(d$petal_length)
)
pw=runif(n,
         min(d$petal_width),
         max(d$petal_width)
)
newdata=data.frame(sepal_length=sl,
                   sepal_width=sw,
                   petal_length=pl,
                   petal_width=pw
                   )
# Make predictions on new data

pred=predict(nn, newdata)
#pred
labels <- c("setosa", "versicolor", "virginica")
prediction_label <- data.frame(max.col(pred)) %>%     
  mutate(pred=labels[max.col.pred.]) %>%
  select(2) #%>%
  #unlist()
#prediction_label 
newdata$species=prediction_label 
newdata 
sepal_length sepal_width petal_length petal_width species
7.694645 3.685910 5.547349 1.3166548 versicolor
7.043106 2.396066 1.163944 1.7354620 setosa
7.658475 2.154698 4.111134 0.3380058 setosa
5.994443 3.811294 6.193882 0.3853661 versicolor
6.472917 3.488984 3.201074 0.2210552 setosa
6.045963 2.406984 1.282959 2.3302094 versicolor
4.691703 2.149314 1.817907 1.7169094 setosa
5.191817 2.261670 2.896803 0.3276589 setosa
6.094652 2.916119 1.913506 1.2822307 setosa
5.642320 2.406346 1.780146 1.2077244 setosa
7.664889 2.716766 2.305705 1.0005197 setosa
6.186350 2.461303 2.335647 2.4786381 versicolor
5.441721 2.617208 1.775357 0.5232417 setosa
5.300678 2.434956 6.791224 2.0522445 virginica
7.135146 3.145553 2.929381 0.2642719 setosa

support vector m/c

Code
### support vector machine (svm) model

mysvm=svm(species~.,d1)   # radial kernel with default cost = 1
mysvm

Call:
svm(formula = species ~ ., data = d1)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  1 

Number of Support Vectors:  45
Code
#str(mysvm)
summary(mysvm)

Call:
svm(formula = species ~ ., data = d1)


Parameters:
   SVM-Type:  C-classification 
 SVM-Kernel:  radial 
       cost:  1 

Number of Support Vectors:  45

 ( 9 18 18 )


Number of Classes:  3 

Levels: 
 setosa versicolor virginica
Code
pred=predict(mysvm,d2)
#pred
t=table(pred,d2$species)
t
            
pred         setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         17         1
  virginica       0          0        12
Code
acc=sum(diag(t))/sum(t)
acc
[1] 0.9777778
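
The model above keeps e1071's defaults (radial kernel, cost = 1). A hedged sketch of searching over cost and gamma with e1071::tune, which cross-validates on the training split; the grid values below are illustrative only:

Code
set.seed(1234)
tuned=tune(svm, species~., data = d1,
           ranges = list(cost = c(0.1, 1, 10), gamma = c(0.05, 0.1, 0.5)))
tuned$best.parameters    # (cost, gamma) pair with the lowest CV error
tuned$best.performance   # cross-validated error of that pair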
Code
pred=predict(mysvm,newdata)
#pred
newdata$species=pred
newdata
sepal_length sepal_width petal_length petal_width species
7.694645 3.685910 5.547349 1.3166548 virginica
7.043106 2.396066 1.163944 1.7354620 versicolor
7.658475 2.154698 4.111134 0.3380058 versicolor
5.994443 3.811294 6.193882 0.3853661 versicolor
6.472917 3.488984 3.201074 0.2210552 setosa
6.045963 2.406984 1.282959 2.3302094 versicolor
4.691703 2.149314 1.817907 1.7169094 versicolor
5.191817 2.261670 2.896803 0.3276589 versicolor
6.094652 2.916119 1.913506 1.2822307 versicolor
5.642320 2.406346 1.780146 1.2077244 versicolor
7.664889 2.716766 2.305705 1.0005197 versicolor
6.186350 2.461303 2.335647 2.4786381 virginica
5.441721 2.617208 1.775357 0.5232417 setosa
5.300678 2.434956 6.791224 2.0522445 virginica
7.135146 3.145553 2.929381 0.2642719 versicolor

knn (k nearest neighbour) model

Code
## knn (k nearest neighbour) model
# standardise the four numeric features; note that the test split is scaled
# with its own centre and spread here, whereas ideally it would reuse the
# training split's centre and scale
dorg1=scale(d1[,1:4])
dorg2=scale(d2[,1:4])

knn_model <- knn(train = dorg1, test = dorg2, cl = d1$species, k = 5)
#knn_model
t=table(knn_model,d2$species)
t
            
knn_model    setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         15         0
  virginica       0          2        13
Code
acc=sum(diag(t))/sum(t)
acc
[1] 0.9555556
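
k = 5 above was not tuned. A small illustrative sketch compares a few odd values of k on the same standardised train/test split:

Code
ks=c(1, 3, 5, 7, 9, 11)
accs=sapply(ks, function(k){
  p=knn(train = dorg1, test = dorg2, cl = d1$species, k = k)
  mean(p == d2$species)   # accuracy for this k
})
data.frame(k = ks, accuracy = round(accs, 3))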
Code
# note: newdata is on the raw measurement scale while dorg1 is standardised,
# so distances are dominated by the raw magnitudes (see the sketch after this block)
knn_model <- knn(train = dorg1, test = newdata[,1:4], cl = d1$species, k = 5)
#knn_model

newdata$species=knn_model
newdata
sepal_length sepal_width petal_length petal_width species
7.694645 3.685910 5.547349 1.3166548 virginica
7.043106 2.396066 1.163944 1.7354620 virginica
7.658475 2.154698 4.111134 0.3380058 virginica
5.994443 3.811294 6.193882 0.3853661 virginica
6.472917 3.488984 3.201074 0.2210552 virginica
6.045963 2.406984 1.282959 2.3302094 virginica
4.691703 2.149314 1.817907 1.7169094 virginica
5.191817 2.261670 2.896803 0.3276589 virginica
6.094652 2.916119 1.913506 1.2822307 virginica
5.642320 2.406346 1.780146 1.2077244 virginica
7.664889 2.716766 2.305705 1.0005197 virginica
6.186350 2.461303 2.335647 2.4786381 virginica
5.441721 2.617208 1.775357 0.5232417 virginica
5.300678 2.434956 6.791224 2.0522445 virginica
7.135146 3.145553 2.929381 0.2642719 virginica
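
Every simulated row above is labelled virginica, most likely because newdata was passed to knn() on the raw measurement scale while the training features were standardised, so the distances are dominated by the raw magnitudes. A minimal corrective sketch (not part of the original code) rescales newdata with the training split's centre and scale before predicting:

Code
ctr=attr(dorg1, "scaled:center")   # centring values stored by scale() on the training split
scl=attr(dorg1, "scaled:scale")    # scaling values stored by scale() on the training split
new_scaled=scale(newdata[,1:4], center = ctr, scale = scl)
knn(train = dorg1, test = new_scaled, cl = d1$species, k = 5)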