library(readr)
schoolExpenditure <- read_csv("C:/Users/dnred/Downloads/schoolExpenditure.csv")
## Rows: 220 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (14): expreg, expspecial, expbil, expocc, exptot, scratio, special, lunc...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(schoolExpenditure)
library(mlbench)
## Warning: package 'mlbench' was built under R version 4.3.3
library(e1071)
## Warning: package 'e1071' was built under R version 4.3.3
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: lattice
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.3.3
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
library(cluster)
library(fpc)
## Warning: package 'fpc' was built under R version 4.3.3
library(clValid)
## Warning: package 'clValid' was built under R version 4.3.3
se <- schoolExpenditure
str(se)
## spc_tbl_ [220 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ expreg : num [1:220] 4201 4129 3627 4015 4273 ...
## $ expspecial: num [1:220] 7376 8574 8082 8181 7037 ...
## $ expbil : num [1:220] 0 0 0 0 0 ...
## $ expocc : num [1:220] 0 0 0 0 0 ...
## $ exptot : num [1:220] 4646 4930 4281 4826 4824 ...
## $ scratio : num [1:220] 16.6 5.7 7.5 8.6 6.1 ...
## $ special : num [1:220] 14.6 17.4 12.1 21.1 16.8 ...
## $ lunch : num [1:220] 11.8 2.5 14.1 12.1 17.4 ...
## $ stratio : num [1:220] 19 22.6 19.3 17.9 17.5 ...
## $ income : num [1:220] 16.4 25.8 14 16.1 15.4 ...
## $ score4 : num [1:220] 714 731 704 704 701 714 725 717 702 701 ...
## $ score8 : num [1:220] 691 NA 693 691 699 NA 728 715 705 688 ...
## $ salary : num [1:220] 34.4 38.1 32.5 33.1 34.4 ...
## $ english : num [1:220] 0 1.246 0 0.323 0 ...
## - attr(*, "spec")=
## .. cols(
## .. expreg = col_double(),
## .. expspecial = col_double(),
## .. expbil = col_double(),
## .. expocc = col_double(),
## .. exptot = col_double(),
## .. scratio = col_double(),
## .. special = col_double(),
## .. lunch = col_double(),
## .. stratio = col_double(),
## .. income = col_double(),
## .. score4 = col_double(),
## .. score8 = col_double(),
## .. salary = col_double(),
## .. english = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
View(se)
#summarize the dataset
summary(se)
## expreg expspecial expbil expocc exptot
## Min. :2905 Min. : 3832 Min. : 0 Min. : 0 Min. :3465
## 1st Qu.:4065 1st Qu.: 7442 1st Qu.: 0 1st Qu.: 0 1st Qu.:4730
## Median :4488 Median : 8354 Median : 0 Median : 0 Median :5155
## Mean :4605 Mean : 8901 Mean : 3037 Mean : 1104 Mean :5370
## 3rd Qu.:4972 3rd Qu.: 9722 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.:5789
## Max. :8759 Max. :53569 Max. :295140 Max. :15088 Max. :9868
##
## scratio special lunch stratio
## Min. : 2.300 Min. : 8.10 Min. : 0.40 Min. :11.40
## 1st Qu.: 6.100 1st Qu.:13.38 1st Qu.: 5.30 1st Qu.:15.80
## Median : 7.800 Median :15.45 Median :10.55 Median :17.10
## Mean : 8.107 Mean :15.97 Mean :15.32 Mean :17.34
## 3rd Qu.: 9.800 3rd Qu.:17.93 3rd Qu.:20.02 3rd Qu.:19.02
## Max. :18.400 Max. :34.30 Max. :76.20 Max. :27.00
## NA's :9
## income score4 score8 salary
## Min. : 9.686 Min. :658.0 Min. :641.0 Min. :24.96
## 1st Qu.:15.223 1st Qu.:701.0 1st Qu.:685.0 1st Qu.:33.80
## Median :17.128 Median :711.0 Median :698.0 Median :35.88
## Mean :18.747 Mean :709.8 Mean :698.4 Mean :35.99
## 3rd Qu.:20.376 3rd Qu.:720.0 3rd Qu.:712.0 3rd Qu.:37.96
## Max. :46.855 Max. :740.0 Max. :747.0 Max. :44.49
## NA's :40 NA's :25
## english
## Min. : 0.0000
## 1st Qu.: 0.0000
## Median : 0.0000
## Mean : 1.1177
## 3rd Qu.: 0.8859
## Max. :24.4939
##
#display first 20 rows of data
head(se, n=20)
## # A tibble: 20 × 14
## expreg expspecial expbil expocc exptot scratio special lunch stratio income
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4201 7376. 0 0 4646 16.6 14.6 11.8 19 16.4
## 2 4129 8574. 0 0 4930 5.70 17.4 2.5 22.6 25.8
## 3 3627 8082. 0 0 4281 7.5 12.1 14.1 19.3 14.0
## 4 4015 8181. 0 0 4826 8.60 21.1 12.1 17.9 16.1
## 5 4273 7037. 0 0 4824 6.10 16.8 17.4 17.5 15.4
## 6 5183 10596. 6235 0 6454 7.70 17.2 26.8 15.7 11.1
## 7 4685 12280. 0 0 5537 5.40 11.3 3.30 17.1 26.3
## 8 5518 10055. 0 0 6405 7.10 20.4 11.2 16.8 21.4
## 9 5009 8841. 0 0 5649 10.6 13.9 8.60 17.3 21.9
## 10 3823 9547. 12943 11519 4814 6.70 13.2 20.7 20.5 15.0
## 11 4625 8213. 0 0 5210 12.5 11.1 10.7 15.6 17.5
## 12 4777 11875. 0 0 5615 7.60 13.3 11.9 15.5 16.2
## 13 6554 12105. 0 0 7389 4.20 16.8 23.9 11.4 14.6
## 14 4484 9321. 1741 0 5323 8.80 14.9 18.7 17.6 17.4
## 15 6240 10993. 0 0 7234 4.80 17.2 2.90 14.8 24.6
## 16 4254 7516. 5281 1470 5048 9.40 20 8.10 19.6 15.9
## 17 5323 8725. 0 0 6065 15 15.1 6.5 17.3 26.8
## 18 3079 7755. 0 0 3930 13.8 19.6 5.60 20.1 14.6
## 19 4836 10220. 0 0 6121 6 16.6 7.5 14.9 19.1
## 20 4205 8288. 0 0 4961 9.80 17.3 15.3 17 18.4
## # ℹ 4 more variables: score4 <dbl>, score8 <dbl>, salary <dbl>, english <dbl>
#display the dimensions of the dataset
dim(se)
## [1] 220 14
#list types for each attribute
sapply(se, class)
## expreg expspecial expbil expocc exptot scratio special
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## lunch stratio income score4 score8 salary english
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
# distribution of class variable
y <- se$expreg
cbind(freq=table(y), percentage=prop.table(table(y))*100)
## freq percentage
## 2905 1 0.4545455
## 2956 1 0.4545455
## 3023 1 0.4545455
## 3079 1 0.4545455
## 3149 1 0.4545455
## 3287 1 0.4545455
## 3326 1 0.4545455
## 3333 1 0.4545455
## 3337 1 0.4545455
## 3400 1 0.4545455
## 3407 1 0.4545455
## 3465 1 0.4545455
## 3522 1 0.4545455
## 3533 1 0.4545455
## 3555 1 0.4545455
## 3580 1 0.4545455
## 3614 1 0.4545455
## 3627 1 0.4545455
## 3647 1 0.4545455
## 3679 2 0.9090909
## 3687 1 0.4545455
## 3693 1 0.4545455
## 3703 1 0.4545455
## 3704 1 0.4545455
## 3744 1 0.4545455
## 3747 1 0.4545455
## 3748 1 0.4545455
## 3795 1 0.4545455
## 3802 1 0.4545455
## 3823 2 0.9090909
## 3839 1 0.4545455
## 3845 1 0.4545455
## 3850 1 0.4545455
## 3858 1 0.4545455
## 3862 1 0.4545455
## 3867 1 0.4545455
## 3903 1 0.4545455
## 3905 1 0.4545455
## 3906 1 0.4545455
## 3908 1 0.4545455
## 3913 2 0.9090909
## 3928 1 0.4545455
## 3935 1 0.4545455
## 3969 1 0.4545455
## 3979 1 0.4545455
## 3986 1 0.4545455
## 4009 1 0.4545455
## 4013 1 0.4545455
## 4015 1 0.4545455
## 4016 1 0.4545455
## 4034 1 0.4545455
## 4062 1 0.4545455
## 4066 1 0.4545455
## 4071 1 0.4545455
## 4078 1 0.4545455
## 4079 1 0.4545455
## 4100 2 0.9090909
## 4105 1 0.4545455
## 4106 1 0.4545455
## 4122 1 0.4545455
## 4129 1 0.4545455
## 4134 1 0.4545455
## 4138 1 0.4545455
## 4150 1 0.4545455
## 4166 1 0.4545455
## 4177 1 0.4545455
## 4189 1 0.4545455
## 4201 1 0.4545455
## 4204 1 0.4545455
## 4205 2 0.9090909
## 4221 2 0.9090909
## 4234 1 0.4545455
## 4239 1 0.4545455
## 4240 1 0.4545455
## 4242 1 0.4545455
## 4247 1 0.4545455
## 4254 1 0.4545455
## 4255 1 0.4545455
## 4263 1 0.4545455
## 4267 1 0.4545455
## 4271 1 0.4545455
## 4273 1 0.4545455
## 4283 1 0.4545455
## 4287 1 0.4545455
## 4297 1 0.4545455
## 4312 2 0.9090909
## 4328 1 0.4545455
## 4344 1 0.4545455
## 4389 1 0.4545455
## 4398 1 0.4545455
## 4411 1 0.4545455
## 4413 1 0.4545455
## 4415 1 0.4545455
## 4436 1 0.4545455
## 4439 1 0.4545455
## 4440 1 0.4545455
## 4442 2 0.9090909
## 4447 1 0.4545455
## 4464 1 0.4545455
## 4465 1 0.4545455
## 4472 1 0.4545455
## 4484 1 0.4545455
## 4493 1 0.4545455
## 4494 1 0.4545455
## 4500 1 0.4545455
## 4506 1 0.4545455
## 4511 1 0.4545455
## 4512 2 0.9090909
## 4521 1 0.4545455
## 4529 1 0.4545455
## 4537 1 0.4545455
## 4539 1 0.4545455
## 4545 1 0.4545455
## 4557 1 0.4545455
## 4558 1 0.4545455
## 4562 1 0.4545455
## 4564 1 0.4545455
## 4566 1 0.4545455
## 4578 1 0.4545455
## 4579 1 0.4545455
## 4586 1 0.4545455
## 4598 1 0.4545455
## 4606 1 0.4545455
## 4609 1 0.4545455
## 4611 1 0.4545455
## 4617 1 0.4545455
## 4623 2 0.9090909
## 4625 1 0.4545455
## 4638 1 0.4545455
## 4640 1 0.4545455
## 4643 1 0.4545455
## 4660 1 0.4545455
## 4682 1 0.4545455
## 4685 1 0.4545455
## 4686 1 0.4545455
## 4707 1 0.4545455
## 4716 1 0.4545455
## 4754 1 0.4545455
## 4756 1 0.4545455
## 4764 1 0.4545455
## 4777 1 0.4545455
## 4793 1 0.4545455
## 4810 1 0.4545455
## 4821 1 0.4545455
## 4825 1 0.4545455
## 4832 1 0.4545455
## 4836 1 0.4545455
## 4859 2 0.9090909
## 4899 1 0.4545455
## 4922 1 0.4545455
## 4932 1 0.4545455
## 4945 1 0.4545455
## 4961 2 0.9090909
## 5004 1 0.4545455
## 5006 1 0.4545455
## 5009 1 0.4545455
## 5014 1 0.4545455
## 5048 1 0.4545455
## 5066 1 0.4545455
## 5087 1 0.4545455
## 5107 1 0.4545455
## 5146 1 0.4545455
## 5152 1 0.4545455
## 5183 1 0.4545455
## 5196 1 0.4545455
## 5242 1 0.4545455
## 5254 1 0.4545455
## 5273 1 0.4545455
## 5304 1 0.4545455
## 5323 1 0.4545455
## 5338 1 0.4545455
## 5345 1 0.4545455
## 5372 1 0.4545455
## 5384 1 0.4545455
## 5387 1 0.4545455
## 5396 1 0.4545455
## 5455 1 0.4545455
## 5457 1 0.4545455
## 5518 1 0.4545455
## 5548 1 0.4545455
## 5556 1 0.4545455
## 5558 1 0.4545455
## 5608 1 0.4545455
## 5613 1 0.4545455
## 5631 1 0.4545455
## 5690 1 0.4545455
## 5764 1 0.4545455
## 5770 1 0.4545455
## 5859 1 0.4545455
## 5896 1 0.4545455
## 5937 1 0.4545455
## 5978 1 0.4545455
## 6049 1 0.4545455
## 6152 1 0.4545455
## 6158 1 0.4545455
## 6240 1 0.4545455
## 6277 1 0.4545455
## 6337 1 0.4545455
## 6370 1 0.4545455
## 6415 1 0.4545455
## 6554 1 0.4545455
## 6672 2 0.9090909
## 6881 1 0.4545455
## 6902 1 0.4545455
## 7763 1 0.4545455
## 7944 1 0.4545455
## 8759 1 0.4545455
#calculate standard deviation for all attributes
sapply(se[,1:14], sd)
## expreg expspecial expbil expocc exptot scratio
## 880.252436 3511.696142 20259.258149 2732.448792 977.040427 NA
## special lunch stratio income score4 score8
## 3.538000 15.060068 2.276666 5.807637 15.126474 NA
## salary english
## NA 2.900940
#calculate skewness for each variable
skew <- apply(se[,1:14], 2, skewness)
# display skewness, larger/smaller deviations from 0 show more skew
print(skew)
## expreg expspecial expbil expocc exptot scratio special
## 1.2853907 9.4422682 13.6557024 2.6068895 1.3033470 NA 0.9859238
## lunch stratio income score4 score8 salary english
## 1.9386044 0.3256516 1.7373340 -0.5278722 NA NA 4.5268155
#calculate a correlation matrix for numeric variables
correlations <- cor(se[,1:8])
#display the correlation matrix
print(correlations)
## expreg expspecial expbil expocc exptot
## expreg 1.0000000000 0.40147653 -0.0001108855 0.06408180 0.965985748
## expspecial 0.4014765288 1.00000000 -0.0422537622 0.03031629 0.442009487
## expbil -0.0001108855 -0.04225376 1.0000000000 0.05660895 0.001748274
## expocc 0.0640818035 0.03031629 0.0566089499 1.00000000 0.142213495
## exptot 0.9659857477 0.44200949 0.0017482739 0.14221350 1.000000000
## scratio NA NA NA NA NA
## special 0.0071764034 -0.07084139 0.1278220698 0.09571385 0.055256770
## lunch -0.0210092797 0.03905312 0.0628743969 0.52063374 0.070473676
## scratio special lunch
## expreg NA 0.007176403 -0.02100928
## expspecial NA -0.070841393 0.03905312
## expbil NA 0.127822070 0.06287440
## expocc NA 0.095713851 0.52063374
## exptot NA 0.055256770 0.07047368
## scratio 1 NA NA
## special NA 1.000000000 0.19377989
## lunch NA 0.193779894 1.00000000
any(is.na(se))
## [1] TRUE
se<-na.omit(se)
str(se)
## tibble [155 × 14] (S3: tbl_df/tbl/data.frame)
## $ expreg : num [1:155] 4201 3627 4015 4273 4685 ...
## $ expspecial: num [1:155] 7376 8082 8181 7037 12280 ...
## $ expbil : num [1:155] 0 0 0 0 0 ...
## $ expocc : num [1:155] 0 0 0 0 0 ...
## $ exptot : num [1:155] 4646 4281 4826 4824 5537 ...
## $ scratio : num [1:155] 16.6 7.5 8.6 6.1 5.4 ...
## $ special : num [1:155] 14.6 12.1 21.1 16.8 11.3 ...
## $ lunch : num [1:155] 11.8 14.1 12.1 17.4 3.3 ...
## $ stratio : num [1:155] 19 19.3 17.9 17.5 17.1 ...
## $ income : num [1:155] 16.4 14 16.1 15.4 26.3 ...
## $ score4 : num [1:155] 714 704 704 701 725 717 702 701 713 703 ...
## $ score8 : num [1:155] 691 693 691 699 728 715 705 688 703 680 ...
## $ salary : num [1:155] 34.4 32.5 33.1 34.4 41.6 ...
## $ english : num [1:155] 0 0 0.323 0 0 ...
## - attr(*, "na.action")= 'omit' Named int [1:65] 2 6 12 19 21 24 25 26 28 29 ...
## ..- attr(*, "names")= chr [1:65] "2" "6" "12" "19" ...
View(se)
#summarize data
summary(se[,1:4])
## expreg expspecial expbil expocc
## Min. :3023 Min. : 3832 Min. : 0 Min. : 0
## 1st Qu.:4144 1st Qu.: 7421 1st Qu.: 0 1st Qu.: 0
## Median :4512 Median : 8288 Median : 0 Median : 0
## Mean :4637 Mean : 8592 Mean : 4119 Mean : 1318
## 3rd Qu.:5008 3rd Qu.: 9540 3rd Qu.: 3628 3rd Qu.: 0
## Max. :7944 Max. :15741 Max. :295140 Max. :15088
#calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(se[,1:4], method=c("scale"))
#summarize transform parameters
print(preprocessParams)
## Created from 155 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - scaled (4)
#transform the dataset using the parameters
transformed <- predict(preprocessParams, se[,1:4])
#summarize the transformed dataset
summary(transformed)
## expreg expspecial expbil expocc
## Min. : 3.931 Min. :2.242 Min. : 0.0000 Min. :0.0000
## 1st Qu.: 5.389 1st Qu.:4.341 1st Qu.: 0.0000 1st Qu.:0.0000
## Median : 5.867 Median :4.849 Median : 0.0000 Median :0.0000
## Mean : 6.030 Mean :5.027 Mean : 0.1712 Mean :0.4537
## 3rd Qu.: 6.512 3rd Qu.:5.581 3rd Qu.: 0.1508 3rd Qu.:0.0000
## Max. :10.330 Max. :9.208 Max. :12.2714 Max. :5.1929
#calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(se[,1:4], method=c("center"))
#summarize transform parameters
print(preprocessParams)
## Created from 155 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
#transform the dataset using the parameters
transformed <- predict(preprocessParams, se[,1:4])
#summarize the transformed dataset
summary(transformed)
## expreg expspecial expbil expocc
## Min. :-1614.4 Min. :-4760.0 Min. : -4118.6 Min. :-1318
## 1st Qu.: -493.4 1st Qu.:-1171.6 1st Qu.: -4118.6 1st Qu.:-1318
## Median : -125.4 Median : -303.8 Median : -4118.6 Median :-1318
## Mean : 0.0 Mean : 0.0 Mean : 0.0 Mean : 0
## 3rd Qu.: 370.1 3rd Qu.: 947.3 3rd Qu.: -490.6 3rd Qu.:-1318
## Max. : 3306.6 Max. : 7148.4 Max. :291021.4 Max. :13770
#calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(se[,1:4], method=c("center", "scale"))
#summarize transform parameters
print(preprocessParams)
## Created from 155 samples and 4 variables
##
## Pre-processing:
## - centered (4)
## - ignored (0)
## - scaled (4)
#transform the dataset using the parameters
transformed <- predict(preprocessParams, se[,1:4])
#summarize the transformed dataset
summary(transformed)
## expreg expspecial expbil expocc
## Min. :-2.0993 Min. :-2.7847 Min. :-0.1712 Min. :-0.4537
## 1st Qu.:-0.6416 1st Qu.:-0.6854 1st Qu.:-0.1712 1st Qu.:-0.4537
## Median :-0.1631 Median :-0.1777 Median :-0.1712 Median :-0.4537
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.4813 3rd Qu.: 0.5542 3rd Qu.:-0.0204 3rd Qu.:-0.4537
## Max. : 4.2999 Max. : 4.1819 Max. :12.1002 Max. : 4.7391
#calculate the pre-process parameters from the dataset
preprocessParams <- preProcess(se[,1:4], method=c("range"))
#summarize transform parameters
print(preprocessParams)
## Created from 155 samples and 4 variables
##
## Pre-processing:
## - ignored (0)
## - re-scaling to [0, 1] (4)
#transform the dataset using the parameters
transformed <- predict(preprocessParams, se[,1:4])
#summarize the transformed dataset
summary(transformed)
## expreg expspecial expbil expocc
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.2278 1st Qu.:0.3013 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.3026 Median :0.3742 Median :0.00000 Median :0.00000
## Mean :0.3281 Mean :0.3997 Mean :0.01395 Mean :0.08738
## 3rd Qu.:0.4033 3rd Qu.:0.4793 3rd Qu.:0.01229 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.00000
df_se<-as.data.frame(scale(se))
str(df_se)
## 'data.frame': 155 obs. of 14 variables:
## $ expreg : num -0.5675 -1.3139 -0.8094 -0.4739 0.0619 ...
## $ expspecial: num -0.712 -0.299 -0.24 -0.91 2.157 ...
## $ expbil : num -0.171 -0.171 -0.171 -0.171 -0.171 ...
## $ expocc : num -0.454 -0.454 -0.454 -0.454 -0.454 ...
## $ exptot : num -0.864 -1.288 -0.655 -0.657 0.171 ...
## $ scratio : num 3.173 -0.254 0.161 -0.781 -1.044 ...
## $ special : num -0.415 -1.181 1.575 0.258 -1.426 ...
## $ lunch : num -0.2955 -0.157 -0.2774 0.0418 -0.8074 ...
## $ stratio : num 0.7484 0.8861 0.2434 0.0598 -0.1238 ...
## $ income : num -0.366 -0.801 -0.416 -0.544 1.483 ...
## $ score4 : num 0.352 -0.27 -0.27 -0.456 1.036 ...
## $ score8 : num -0.309 -0.215 -0.309 0.064 1.415 ...
## $ salary : num -0.561 -1.14 -0.949 -0.537 1.688 ...
## $ english : num -0.413 -0.413 -0.314 -0.413 -0.413 ...
dist_mat<-dist(df_se, method='euclidean')
#View(dist_mat)
hclust_avg<-hclust(dist_mat, method='average')
plot(hclust_avg, hang=- 0.1, cex=0.3)
cut_avg<-cutree(hclust_avg, k=6)
plot(hclust_avg)
rect.hclust(hclust_avg , k = 6, border = 2:6)
abline(h = 6, col = "red")
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 6)
plot(avg_col_dend)
#setting a seed for the random number generator
set.seed (1234)
se3 <- kmeans(se, center=6, iter.max=200)
se3
## K-means clustering with 6 clusters of sizes 10, 16, 57, 49, 1, 22
##
## Cluster means:
## expreg expspecial expbil expocc exptot scratio special
## 1 4688.700 8486.307 15791.10000 548.50000 5606.200 8.470000 17.46000
## 2 6035.875 11212.531 1442.93750 43.87500 6983.688 6.143750 15.08750
## 3 4138.702 7219.053 486.54386 77.52632 4742.246 8.303509 16.32105
## 4 4790.531 9078.824 35.53061 582.87755 5518.265 8.628571 15.01837
## 5 4239.000 6270.070 295140.00000 0.00000 4757.000 9.900000 20.70000
## 6 4566.091 9314.210 6035.18182 7508.18182 5551.545 8.086364 16.83636
## lunch stratio income score4 score8 salary english
## 1 18.17000 17.16000 18.37410 706.3000 699.5000 37.63980 1.6255800
## 2 11.53750 15.39375 24.94112 719.3125 717.0000 39.36941 1.8215147
## 3 13.56316 17.71579 16.96451 709.1404 697.4035 35.36532 0.4777922
## 4 10.23469 16.97551 19.76882 713.2653 704.4490 36.08104 0.3375868
## 5 13.90000 17.60000 14.12200 699.0000 684.0000 35.82300 0.0000000
## 6 42.48636 18.87273 14.16468 688.6364 668.6818 35.46720 5.4283942
##
## Clustering vector:
## [1] 3 3 3 3 2 4 4 6 4 2 4 2 3 4 3 3 6 4 3 6 2 4 4 5 2 3 2 6 4 3 4 2 4 3 4 3 3
## [38] 3 3 3 3 4 6 4 6 3 3 3 2 3 3 4 4 3 3 3 4 3 2 4 4 4 4 6 4 1 4 3 6 6 3 2 2 4
## [75] 4 6 1 3 6 4 2 3 4 3 4 3 3 6 3 3 6 4 1 3 2 4 1 4 1 4 4 3 3 4 1 3 3 3 4 4 1
## [112] 1 6 4 3 6 4 4 1 4 3 3 6 4 3 3 6 4 6 4 3 4 3 6 3 3 4 4 3 3 1 4 2 3 3 4 6 3
## [149] 2 3 2 6 4 3 6
##
## Within cluster sum of squares by cluster:
## [1] 882141509 166488539 192843723 167726291 0 456768965
## (between_SS / total_SS = 98.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
se3$betweenss
## [1] 89170633404
table(se3$cluster, se$expreg)
##
## 3023 3079 3287 3400 3627 3647 3679 3693 3703 3704 3744 3747 3802 3823 3839
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0
##
## 3850 3858 3862 3867 3903 3913 3928 3935 3969 3979 4009 4015 4034 4062 4071
## 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0
## 4 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
##
## 4079 4100 4105 4122 4134 4138 4150 4166 4177 4189 4201 4204 4205 4221 4234
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 1 1 1 1 1 0 0 1 1 1 1 1 1 0
## 4 0 1 0 0 0 0 1 1 0 0 0 0 0 1 1
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
##
## 4239 4240 4242 4247 4254 4255 4267 4273 4283 4287 4297 4312 4344 4389 4411
## 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 3 0 0 1 0 1 0 0 1 0 1 0 1 0 1 0
## 4 0 0 0 1 0 1 1 0 0 0 1 0 1 0 0
## 5 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
##
## 4413 4415 4439 4440 4442 4447 4465 4472 4484 4494 4500 4512 4521 4529 4539
## 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0
## 4 0 0 0 0 1 1 0 0 1 0 1 1 0 1 1
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0
##
## 4545 4557 4562 4564 4579 4586 4598 4606 4609 4611 4617 4623 4625 4638 4643
## 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 1 0 1 1 0 1 0 1 0 1 0 0 1
## 4 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 1 0 0 0 1 0 0 1 0
##
## 4682 4685 4686 4707 4716 4754 4756 4764 4793 4810 4821 4859 4899 4922 4932
## 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0
## 4 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 1 1 1 0 0 0 0 0 1 0 0 1
##
## 4961 5004 5006 5009 5014 5048 5066 5107 5146 5152 5196 5242 5254 5273 5304
## 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 3 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## 4 2 1 1 1 0 1 1 0 0 1 0 1 1 1 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1
##
## 5323 5338 5345 5372 5387 5396 5457 5518 5548 5558 5608 5613 5690 5764 5770
## 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 1 1 1 1 1 1 1 1 1 0 0 0 1 1 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## 5896 5937 5978 6049 6158 6240 6337 6415 6554 6881 6902 7944
## 1 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 1 1 0 1 1 1 1 1 1 1 1
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 1 0 0 0 0 0 0 0 0
cm<- table(se3$cluster, se$expreg)
1-sum(diag(cm))/sum(cm)
## [1] 0.9935484
s <- silhouette(se3$cluster, dist(se))
plot(s)
sol <- pamk(se, krange=2:10, criterion="asw", usepam=TRUE)
sol
## $pamobject
## Medoids:
## ID expreg expspecial expbil expocc exptot scratio special lunch stratio
## [1,] 18 4512 8280.89 0 0 5203 15.5 12.2 21.5 18.7
## [2,] 24 4239 6270.07 295140 0 4757 9.9 20.7 13.9 17.6
## income score4 score8 salary english
## [1,] 14.962 702 685 36.670 0
## [2,] 14.122 699 684 35.823 0
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1
## Objective function:
## build swap
## 4181.385 4181.385
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
##
## $nc
## [1] 2
##
## $crit
## [1] 0.0000000 0.9717478 0.5807691 0.6034074 0.5934414 0.3885390 0.3211006
## [8] 0.3412241 0.3506747 0.3479333
set.seed(1234)
d <- dist(scale(se))
methds <- c('complete', 'single', 'average')
avgS <- matrix(NA, ncol=3, nrow=5,
dimnames=list(2:6, methds))
for(k in 2:6)
for(m in seq_along(methds)) {
h <- hclust(d, meth=methds[m])
c <- cutree(h, k)
s <- silhouette(c, d)
avgS[k-1, m] <- mean(s[ , 3])
}
avgS
## complete single average
## 2 0.6321872 0.6321872 0.6321872
## 3 0.2442047 0.4808063 0.5030745
## 4 0.2536492 0.4806255 0.4188783
## 5 0.2309225 0.4044959 0.4044959
## 6 0.2060003 0.2295902 0.2797655
dunn(dist_mat,cut_avg)
## [1] 0.2443944
df_se<-as.data.frame(scale(se))
str(df_se)
## 'data.frame': 155 obs. of 14 variables:
## $ expreg : num -0.5675 -1.3139 -0.8094 -0.4739 0.0619 ...
## $ expspecial: num -0.712 -0.299 -0.24 -0.91 2.157 ...
## $ expbil : num -0.171 -0.171 -0.171 -0.171 -0.171 ...
## $ expocc : num -0.454 -0.454 -0.454 -0.454 -0.454 ...
## $ exptot : num -0.864 -1.288 -0.655 -0.657 0.171 ...
## $ scratio : num 3.173 -0.254 0.161 -0.781 -1.044 ...
## $ special : num -0.415 -1.181 1.575 0.258 -1.426 ...
## $ lunch : num -0.2955 -0.157 -0.2774 0.0418 -0.8074 ...
## $ stratio : num 0.7484 0.8861 0.2434 0.0598 -0.1238 ...
## $ income : num -0.366 -0.801 -0.416 -0.544 1.483 ...
## $ score4 : num 0.352 -0.27 -0.27 -0.456 1.036 ...
## $ score8 : num -0.309 -0.215 -0.309 0.064 1.415 ...
## $ salary : num -0.561 -1.14 -0.949 -0.537 1.688 ...
## $ english : num -0.413 -0.413 -0.314 -0.413 -0.413 ...
dist_mat<-dist(df_se, method='euclidean')
#View(dist_mat)
hclust_avg<-hclust(dist_mat, method='average')
plot(hclust_avg, hang=- 0.1, cex=0.3)
cut_avg<-cutree(hclust_avg, k=4)
plot(hclust_avg)
rect.hclust(hclust_avg , k = 4, border = 2:6)
abline(h = 4, col = "red")
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 4)
plot(avg_col_dend)
#setting a seed for the random number generator
set.seed (1234)
se3 <- kmeans(se, center=4, iter.max=200)
se3
## K-means clustering with 4 clusters of sizes 1, 33, 77, 44
##
## Cluster means:
## expreg expspecial expbil expocc exptot scratio special
## 1 4239.000 6270.070 295140.0000 0.00000 4757.000 9.900000 20.70000
## 2 4686.606 9167.305 9481.6667 4692.48485 5650.818 8.003030 17.04848
## 3 4257.701 7512.618 264.0130 594.41558 4889.636 8.429870 16.16883
## 4 5274.000 10102.999 227.7727 84.65909 6083.159 7.813636 14.65909
## lunch stratio income score4 score8 salary english
## 1 13.90000 17.60000 14.12200 699.0000 684.0000 35.82300 0.0000000
## 2 34.86364 18.31818 15.95339 694.9091 679.9091 36.38385 4.4629281
## 3 13.45065 17.49481 17.04842 709.1429 696.8312 35.24503 0.3890154
## 4 8.85000 16.43409 22.52002 717.2045 712.6136 37.63288 0.7129501
##
## Clustering vector:
## [1] 3 3 3 3 4 4 4 2 3 4 4 4 3 4 3 3 2 3 3 2 2 4 3 1 4 3 4 2 3 2 4 4 4 3 3 3 3
## [38] 3 3 3 3 4 2 4 2 3 3 3 2 3 3 3 3 3 3 3 4 3 4 4 4 4 4 2 3 2 3 3 2 3 3 4 4 4
## [75] 4 2 2 3 2 4 4 3 4 3 3 3 3 2 3 3 2 3 2 3 4 4 2 4 2 3 4 3 3 4 2 3 3 3 4 4 2
## [112] 2 2 3 3 2 4 4 2 3 3 3 2 4 3 3 2 4 2 4 3 3 3 2 3 3 3 3 3 3 2 3 4 3 3 4 2 3
## [149] 4 3 4 3 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 0 2242470845 399972756 177871627
## (between_SS / total_SS = 96.9 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
se3$betweenss
## [1] 88216287203
table(se3$cluster, se$expreg)
##
## 3023 3079 3287 3400 3627 3647 3679 3693 3703 3704 3744 3747 3802 3823 3839
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0
## 3 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## 3850 3858 3862 3867 3903 3913 3928 3935 3969 3979 4009 4015 4034 4062 4071
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0
## 3 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0
## 4 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
##
## 4079 4100 4105 4122 4134 4138 4150 4166 4177 4189 4201 4204 4205 4221 4234
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 3 0 2 1 1 1 1 1 0 1 1 1 1 1 1 1
## 4 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
##
## 4239 4240 4242 4247 4254 4255 4267 4273 4283 4287 4297 4312 4344 4389 4411
## 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
## 3 0 0 1 0 1 1 0 1 0 1 0 1 0 1 0
## 4 0 0 0 1 0 0 1 0 0 0 1 0 1 0 1
##
## 4413 4415 4439 4440 4442 4447 4465 4472 4484 4494 4500 4512 4521 4529 4539
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0
## 3 0 0 1 1 1 1 1 1 0 0 1 1 0 1 1
## 4 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
##
## 4545 4557 4562 4564 4579 4586 4598 4606 4609 4611 4617 4623 4625 4638 4643
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0
## 3 0 1 1 0 1 1 0 1 0 1 0 1 1 0 1
## 4 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
##
## 4682 4685 4686 4707 4716 4754 4756 4764 4793 4810 4821 4859 4899 4922 4932
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 1 1 0 0 1 0 0 0 1 0 0 1
## 3 1 0 1 0 0 1 1 0 1 1 1 0 1 1 0
## 4 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## 4961 5004 5006 5009 5014 5048 5066 5107 5146 5152 5196 5242 5254 5273 5304
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1
## 3 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
## 4 2 1 0 1 0 1 1 0 0 1 1 1 1 1 0
##
## 5323 5338 5345 5372 5387 5396 5457 5518 5548 5558 5608 5613 5690 5764 5770
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 3 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0
## 4 1 0 1 1 1 1 0 1 1 1 1 0 1 1 1
##
## 5896 5937 5978 6049 6158 6240 6337 6415 6554 6881 6902 7944
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 0 1 1 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 1 0 0 1 1 1 1 1 1 1
cm<- table(se3$cluster, se$expreg)
1-sum(diag(cm))/sum(cm)
## [1] 0.9935484
s <- silhouette(se3$cluster, dist(se))
plot(s)
sol <- pamk(se, krange=2:10, criterion="asw", usepam=TRUE)
sol
## $pamobject
## Medoids:
## ID expreg expspecial expbil expocc exptot scratio special lunch stratio
## [1,] 18 4512 8280.89 0 0 5203 15.5 12.2 21.5 18.7
## [2,] 24 4239 6270.07 295140 0 4757 9.9 20.7 13.9 17.6
## income score4 score8 salary english
## [1,] 14.962 702 685 36.670 0
## [2,] 14.122 699 684 35.823 0
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1
## Objective function:
## build swap
## 4181.385 4181.385
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
##
## $nc
## [1] 2
##
## $crit
## [1] 0.0000000 0.9717478 0.5807691 0.6034074 0.5934414 0.3885390 0.3211006
## [8] 0.3412241 0.3506747 0.3479333
set.seed(1234)
d <- dist(scale(se))
methds <- c('complete', 'single', 'average')
avgS <- matrix(NA, ncol=3, nrow=5,
dimnames=list(2:6, methds))
for(k in 2:6)
for(m in seq_along(methds)) {
h <- hclust(d, meth=methds[m])
c <- cutree(h, k)
s <- silhouette(c, d)
avgS[k-1, m] <- mean(s[ , 3])
}
avgS
## complete single average
## 2 0.6321872 0.6321872 0.6321872
## 3 0.2442047 0.4808063 0.5030745
## 4 0.2536492 0.4806255 0.4188783
## 5 0.2309225 0.4044959 0.4044959
## 6 0.2060003 0.2295902 0.2797655
dunn(dist_mat,cut_avg)
## [1] 0.377325
df_se<-as.data.frame(scale(se))
str(df_se)
## 'data.frame': 155 obs. of 14 variables:
## $ expreg : num -0.5675 -1.3139 -0.8094 -0.4739 0.0619 ...
## $ expspecial: num -0.712 -0.299 -0.24 -0.91 2.157 ...
## $ expbil : num -0.171 -0.171 -0.171 -0.171 -0.171 ...
## $ expocc : num -0.454 -0.454 -0.454 -0.454 -0.454 ...
## $ exptot : num -0.864 -1.288 -0.655 -0.657 0.171 ...
## $ scratio : num 3.173 -0.254 0.161 -0.781 -1.044 ...
## $ special : num -0.415 -1.181 1.575 0.258 -1.426 ...
## $ lunch : num -0.2955 -0.157 -0.2774 0.0418 -0.8074 ...
## $ stratio : num 0.7484 0.8861 0.2434 0.0598 -0.1238 ...
## $ income : num -0.366 -0.801 -0.416 -0.544 1.483 ...
## $ score4 : num 0.352 -0.27 -0.27 -0.456 1.036 ...
## $ score8 : num -0.309 -0.215 -0.309 0.064 1.415 ...
## $ salary : num -0.561 -1.14 -0.949 -0.537 1.688 ...
## $ english : num -0.413 -0.413 -0.314 -0.413 -0.413 ...
dist_mat<-dist(df_se, method='euclidean')
#View(dist_mat)
hclust_avg<-hclust(dist_mat, method='average')
plot(hclust_avg, hang=- 0.1, cex=0.3)
cut_avg<-cutree(hclust_avg, k=3)
plot(hclust_avg)
rect.hclust(hclust_avg , k = 3, border = 2:6)
abline(h = 3, col = "red")
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 3)
plot(avg_col_dend)
#setting a seed for the random number generator
set.seed (1234)
se3 <- kmeans(se, center=3, iter.max=200)
se3
## K-means clustering with 3 clusters of sizes 1, 33, 121
##
## Cluster means:
## expreg expspecial expbil expocc exptot scratio special lunch
## 1 4239.000 6270.070 295140.0000 0.0000 4757.000 9.900000 20.70000 13.90000
## 2 4686.606 9167.305 9481.6667 4692.4848 5650.818 8.003030 17.04848 34.86364
## 3 4627.264 8454.575 250.8347 409.0496 5323.645 8.205785 15.61983 11.77769
## stratio income score4 score8 salary english
## 1 17.60000 14.12200 699.0000 684.0000 35.82300 0.0000000
## 2 18.31818 15.95339 694.9091 679.9091 36.38385 4.4629281
## 3 17.10909 19.03809 712.0744 702.5702 36.11334 0.5068098
##
## Clustering vector:
## [1] 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 2 3 3 2 2 3 3 1 3 3 3 2 3 2 3 3 3 3 3 3 3
## [38] 3 3 3 3 3 2 3 2 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 3 3 2 3 3 3 3 3
## [75] 3 2 2 3 2 3 3 3 3 3 3 3 3 2 3 3 2 3 2 3 3 3 2 3 2 3 3 3 3 3 2 3 3 3 3 3 2
## [112] 2 2 3 3 2 3 3 2 3 3 3 2 3 3 3 2 3 2 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 2 3
## [149] 3 3 3 3 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 0 2242470845 841855740
## (between_SS / total_SS = 96.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
se3$betweenss
## [1] 87952275845
table(se3$cluster, se$expreg)
##
## 3023 3079 3287 3400 3627 3647 3679 3693 3703 3704 3744 3747 3802 3823 3839
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0
## 3 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
##
## 3850 3858 3862 3867 3903 3913 3928 3935 3969 3979 4009 4015 4034 4062 4071
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 1 1 0 0 0 1 0 0 0 0
## 3 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1
##
## 4079 4100 4105 4122 4134 4138 4150 4166 4177 4189 4201 4204 4205 4221 4234
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 3 0 2 1 1 1 1 1 1 1 1 1 1 1 2 1
##
## 4239 4240 4242 4247 4254 4255 4267 4273 4283 4287 4297 4312 4344 4389 4411
## 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0
## 3 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1
##
## 4413 4415 4439 4440 4442 4447 4465 4472 4484 4494 4500 4512 4521 4529 4539
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 0 0 0 0 0 0 0 1 0 1 1 0 0
## 3 0 0 1 1 1 1 1 1 1 0 1 1 0 1 1
##
## 4545 4557 4562 4564 4579 4586 4598 4606 4609 4611 4617 4623 4625 4638 4643
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0
## 3 0 1 1 0 1 1 0 1 1 1 0 2 1 0 1
##
## 4682 4685 4686 4707 4716 4754 4756 4764 4793 4810 4821 4859 4899 4922 4932
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 1 1 0 0 1 0 0 0 1 0 0 1
## 3 1 1 1 0 0 1 1 0 1 1 1 0 1 1 0
##
## 4961 5004 5006 5009 5014 5048 5066 5107 5146 5152 5196 5242 5254 5273 5304
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1
## 3 2 1 1 1 1 1 1 0 0 1 1 1 1 1 0
##
## 5323 5338 5345 5372 5387 5396 5457 5518 5548 5558 5608 5613 5690 5764 5770
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 3 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1
##
## 5896 5937 5978 6049 6158 6240 6337 6415 6554 6881 6902 7944
## 1 0 0 0 0 0 0 0 0 0 0 0 0
## 2 1 1 0 1 1 0 0 0 0 0 0 0
## 3 0 0 1 0 0 1 1 1 1 1 1 1
cm<- table(se3$cluster, se$expreg)
1-sum(diag(cm))/sum(cm)
## [1] 0.9935484
s <- silhouette(se3$cluster, dist(se))
plot(s)
sol <- pamk(se, krange=2:10, criterion="asw", usepam=TRUE)
sol
## $pamobject
## Medoids:
## ID expreg expspecial expbil expocc exptot scratio special lunch stratio
## [1,] 18 4512 8280.89 0 0 5203 15.5 12.2 21.5 18.7
## [2,] 24 4239 6270.07 295140 0 4757 9.9 20.7 13.9 17.6
## income score4 score8 salary english
## [1,] 14.962 702 685 36.670 0
## [2,] 14.122 699 684 35.823 0
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1
## Objective function:
## build swap
## 4181.385 4181.385
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
##
## $nc
## [1] 2
##
## $crit
## [1] 0.0000000 0.9717478 0.5807691 0.6034074 0.5934414 0.3885390 0.3211006
## [8] 0.3412241 0.3506747 0.3479333
set.seed(1234)
d <- dist(scale(se))
methds <- c('complete', 'single', 'average')
avgS <- matrix(NA, ncol=3, nrow=5,
dimnames=list(2:6, methds))
for(k in 2:6)
for(m in seq_along(methds)) {
h <- hclust(d, meth=methds[m])
c <- cutree(h, k)
s <- silhouette(c, d)
avgS[k-1, m] <- mean(s[ , 3])
}
avgS
## complete single average
## 2 0.6321872 0.6321872 0.6321872
## 3 0.2442047 0.4808063 0.5030745
## 4 0.2536492 0.4806255 0.4188783
## 5 0.2309225 0.4044959 0.4044959
## 6 0.2060003 0.2295902 0.2797655
#Dunn’s coefficient
dunn(dist_mat,cut_avg)
## [1] 0.3876777
df_se<-as.data.frame(scale(se))
str(df_se)
## 'data.frame': 155 obs. of 14 variables:
## $ expreg : num -0.5675 -1.3139 -0.8094 -0.4739 0.0619 ...
## $ expspecial: num -0.712 -0.299 -0.24 -0.91 2.157 ...
## $ expbil : num -0.171 -0.171 -0.171 -0.171 -0.171 ...
## $ expocc : num -0.454 -0.454 -0.454 -0.454 -0.454 ...
## $ exptot : num -0.864 -1.288 -0.655 -0.657 0.171 ...
## $ scratio : num 3.173 -0.254 0.161 -0.781 -1.044 ...
## $ special : num -0.415 -1.181 1.575 0.258 -1.426 ...
## $ lunch : num -0.2955 -0.157 -0.2774 0.0418 -0.8074 ...
## $ stratio : num 0.7484 0.8861 0.2434 0.0598 -0.1238 ...
## $ income : num -0.366 -0.801 -0.416 -0.544 1.483 ...
## $ score4 : num 0.352 -0.27 -0.27 -0.456 1.036 ...
## $ score8 : num -0.309 -0.215 -0.309 0.064 1.415 ...
## $ salary : num -0.561 -1.14 -0.949 -0.537 1.688 ...
## $ english : num -0.413 -0.413 -0.314 -0.413 -0.413 ...
dist_mat<-dist(df_se, method='euclidean')
#View(dist_mat)
hclust_avg<-hclust(dist_mat, method='average')
plot(hclust_avg, hang=- 0.1, cex=0.3)
cut_avg<-cutree(hclust_avg, k=8)
plot(hclust_avg)
rect.hclust(hclust_avg , k = 8, border = 2:6)
abline(h = 8, col = "red")
avg_dend_obj <- as.dendrogram(hclust_avg)
avg_col_dend <- color_branches(avg_dend_obj, h = 8)
plot(avg_col_dend)
#setting a seed for the random number generator
set.seed (1234)
se3 <- kmeans(se, center=8, iter.max=200)
se3
## K-means clustering with 8 clusters of sizes 3, 13, 8, 41, 1, 19, 14, 56
##
## Cluster means:
## expreg expspecial expbil expocc exptot scratio special
## 1 5307.333 9249.610 29109.33333 1828.33333 6305.333 6.266667 17.93333
## 2 6159.077 11309.758 267.53846 0.00000 7119.000 5.876923 14.86154
## 3 4689.625 8294.774 0.00000 5353.75000 5360.000 7.375000 16.57500
## 4 4809.683 9223.730 42.46341 0.00000 5545.902 8.736585 14.70488
## 5 4239.000 6270.070 295140.00000 0.00000 4757.000 9.900000 20.70000
## 6 4564.000 9447.155 6643.84211 7669.15789 5564.579 8.078947 16.56316
## 7 4633.857 8724.561 8208.42857 365.92857 5520.429 8.642857 17.22857
## 8 4147.554 7224.607 170.50000 92.76786 4748.214 8.394643 16.32500
## lunch stratio income score4 score8 salary english
## 1 9.133333 16.63333 23.43200 717.6667 717.6667 40.39317 1.2555644
## 2 6.253846 14.96923 26.00369 723.3077 723.3846 39.60015 0.6455729
## 3 15.200000 16.73750 17.44413 710.6250 694.3750 34.29319 0.1849637
## 4 9.329268 16.99268 20.26588 714.1951 706.6341 36.37010 0.3415171
## 5 13.900000 17.60000 14.12200 699.0000 684.0000 35.82300 0.0000000
## 6 44.921053 19.32105 13.92363 686.0526 666.7368 35.48597 6.1129774
## 7 28.114286 17.66429 16.51321 700.0714 687.0000 36.40754 3.3531593
## 8 12.780357 17.59286 17.06320 709.5357 697.8214 35.44538 0.3190851
##
## Clustering vector:
## [1] 8 8 8 8 2 4 4 6 4 2 4 2 7 4 8 8 6 4 8 6 7 3 4 5 2 8 7 6 4 7 4 2 4 8 4 8 8
## [38] 8 8 8 8 4 6 4 6 8 7 8 7 8 8 4 3 8 8 8 4 8 2 4 4 4 4 6 4 7 3 8 6 3 8 2 2 4
## [75] 4 6 7 8 6 4 2 8 4 8 3 8 8 6 8 8 6 3 7 8 2 4 7 4 1 4 4 8 8 4 7 8 8 8 4 4 7
## [112] 1 6 4 8 6 4 4 7 4 8 8 6 4 8 8 6 4 6 4 8 4 8 6 8 8 3 4 8 8 1 8 2 8 8 4 6 8
## [149] 2 8 2 3 8 8 7
##
## Within cluster sum of squares by cluster:
## [1] 72137725 46418519 32474772 49023226 0 348893454 145712315
## [8] 95398513
## (between_SS / total_SS = 99.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
se3$betweenss
## [1] 90246543907
table(se3$cluster, se$expreg)
##
## 3023 3079 3287 3400 3627 3647 3679 3693 3703 3704 3744 3747 3802 3823 3839
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0
## 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 8 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
##
## 3850 3858 3862 3867 3903 3913 3928 3935 3969 3979 4009 4015 4034 4062 4071
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
## 7 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 8 0 1 1 1 1 1 0 0 1 1 0 1 1 0 0
##
## 4079 4100 4105 4122 4134 4138 4150 4166 4177 4189 4201 4204 4205 4221 4234
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 7 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## 8 0 1 1 1 1 1 0 0 1 0 1 1 1 1 0
##
## 4239 4240 4242 4247 4254 4255 4267 4273 4283 4287 4297 4312 4344 4389 4411
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 1 0 1 1 0 0 0 1 0 1 0 0
## 5 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## 7 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1
## 8 0 0 1 0 0 0 0 1 0 1 0 1 0 1 0
##
## 4413 4415 4439 4440 4442 4447 4465 4472 4484 4494 4500 4512 4521 4529 4539
## 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 1 1 0 0 1 0 1 1 0 0 1
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0
## 7 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0
## 8 0 0 0 1 0 0 1 1 0 0 0 0 0 1 0
##
## 4545 4557 4562 4564 4579 4586 4598 4606 4609 4611 4617 4623 4625 4638 4643
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0
## 7 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## 8 0 0 1 0 1 1 0 1 0 1 0 1 0 0 1
##
## 4682 4685 4686 4707 4716 4754 4756 4764 4793 4810 4821 4859 4899 4922 4932
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 1 1 0 0 0 0 0 0 1 0 0 1
## 7 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 8 1 0 1 0 0 0 1 0 1 1 0 0 0 1 0
##
## 4961 5004 5006 5009 5014 5048 5066 5107 5146 5152 5196 5242 5254 5273 5304
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 3 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## 4 2 1 0 1 0 1 1 0 0 1 0 1 1 1 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1
## 7 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## 8 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
##
## 5323 5338 5345 5372 5387 5396 5457 5518 5548 5558 5608 5613 5690 5764 5770
## 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## 2 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1
## 3 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 4 1 1 1 1 1 1 1 1 1 0 0 0 0 1 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## 5896 5937 5978 6049 6158 6240 6337 6415 6554 6881 6902 7944
## 1 1 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 1 0 0 1 1 1 1 1 1 1
## 3 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 1 0 0 0 0 0 0 0 0
## 7 0 1 0 0 1 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0 0 0
cm<- table(se3$cluster, se$expreg)
1-sum(diag(cm))/sum(cm)
## [1] 0.9935484
s <- silhouette(se3$cluster, dist(se))
plot(s)
sol <- pamk(se, krange=2:10, criterion="asw", usepam=TRUE)
sol
## $pamobject
## Medoids:
## ID expreg expspecial expbil expocc exptot scratio special lunch stratio
## [1,] 18 4512 8280.89 0 0 5203 15.5 12.2 21.5 18.7
## [2,] 24 4239 6270.07 295140 0 4757 9.9 20.7 13.9 17.6
## income score4 score8 salary english
## [1,] 14.962 702 685 36.670 0
## [2,] 14.122 699 684 35.823 0
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [149] 1 1 1 1 1 1 1
## Objective function:
## build swap
## 4181.385 4181.385
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
##
## $nc
## [1] 2
##
## $crit
## [1] 0.0000000 0.9717478 0.5807691 0.6034074 0.5934414 0.3885390 0.3211006
## [8] 0.3412241 0.3506747 0.3479333
set.seed(1234)
d <- dist(scale(se))
methds <- c('complete', 'single', 'average')
avgS <- matrix(NA, ncol=3, nrow=5,
dimnames=list(2:6, methds))
for(k in 2:6)
for(m in seq_along(methds)) {
h <- hclust(d, meth=methds[m])
c <- cutree(h, k)
s <- silhouette(c, d)
avgS[k-1, m] <- mean(s[ , 3])
}
avgS
## complete single average
## 2 0.6321872 0.6321872 0.6321872
## 3 0.2442047 0.4808063 0.5030745
## 4 0.2536492 0.4806255 0.4188783
## 5 0.2309225 0.4044959 0.4044959
## 6 0.2060003 0.2295902 0.2797655
dunn(dist_mat,cut_avg)
## [1] 0.2428229