This is an analysis of Peruvian schools that clusters them according to their variables.

Let’s load some sample data.

The data have 14 variables and one ID.

# Read the schools data set into `a`.
# The file is addressed by its full path rather than via setwd(), so the
# session's working directory and existing workspace objects are untouched.
data_dir <- "D:/1. 2019/CLUSTER"
a <- read.csv(file.path(data_dir, "BaseCop.csv"), header = TRUE, sep = ",")

# Column names: one ID column followed by V1..V14 (see recorded output).
names(a)
##  [1] "ID"  "V1"  "V2"  "V3"  "V4"  "V5"  "V6"  "V7"  "V8"  "V9"  "V10"
## [12] "V11" "V12" "V13" "V14"

Next, we examine the scales of the variables and look for outliers.

# Box plots of the raw variables, to compare their scales and spot outliers.
# First all of columns 2 to 15 of `a` in a single, uncoloured panel.
boxplot(a[, 2:15], main = "Box Plot")

# The ID column, as a factor, supplies the box colours used from here on.
colores <- factor(a$ID)

# Then columns 3 to 10 and columns 11 to 15 in two separate coloured panels.
for (column_range in list(3:10, 11:15)) {
  boxplot(a[, column_range], main = "Box Plot", col = colores)
}

The variables are on different scales, so we standardize them.

# Standardise columns 2 to 15 of `a` with scale() (default centring and
# scaling of every column) and keep the result as a data frame.
scaled_matrix <- scale(a[, 2:15])
a.scale <- as.data.frame(scaled_matrix)

# Box plots of the 14 standardised variables.
boxplot(a.scale[, 1:14], main = "Box Plot", col = colores)

We cluster the cases using the scaled variables and compare the results of different linkage methods.

Method: Single linkage

library(cluster)

# Hierarchical clustering with single linkage on the pairwise distances
# between the rows of a.scale (dist() with its default distance measure).
hc.single <- hclust(dist(a.scale), method = "single")

# Merge history, one row per step: a negative entry is an original case,
# a positive entry refers to the cluster formed at that earlier step.
hc.single$merge
##       [,1] [,2]
##  [1,]  -14  -15
##  [2,]   -2  -12
##  [3,]   -6   -8
##  [4,]   -7  -13
##  [5,]   -3   -4
##  [6,]  -11    1
##  [7,]   -5    3
##  [8,]  -18    6
##  [9,]    4    8
## [10,]    7    9
## [11,]    5   10
## [12,]  -16   11
## [13,]    2   12
## [14,]  -10   13
## [15,]  -17   14
## [16,]   -1   15
## [17,]   -9   16

# Dendrogram of the single-linkage solution.
plot(
  hc.single,
  cex = 0.9,
  main = "Dendograma",
  sub = "Agrupamiento: Método Simple",
  xlab = "Institución Educativa",
  ylab = ""
)

Method: Complete linkage

# Hierarchical clustering with complete linkage on the pairwise distances
# between the rows of a.scale (dist() with its default distance measure).
hc.complet <- hclust(dist(a.scale), method = "complete")

# Merge history, one row per step: a negative entry is an original case,
# a positive entry refers to the cluster formed at that earlier step.
hc.complet$merge
##       [,1] [,2]
##  [1,]  -14  -15
##  [2,]   -2  -12
##  [3,]   -6   -8
##  [4,]   -7  -13
##  [5,]   -3   -4
##  [6,]  -11  -18
##  [7,]   -5    3
##  [8,]    1    4
##  [9,]  -17    2
## [10,]  -10    6
## [11,]    5    7
## [12,]  -16    8
## [13,]   10   12
## [14,]   11   13
## [15,]   -1    9
## [16,]   14   15
## [17,]   -9   16

# Dendrogram of the complete-linkage solution.
plot(
  hc.complet,
  cex = 0.9,
  main = "Dendograma",
  sub = "Agrupamiento: Método Vecino más lejano",
  xlab = "Institución Educativa",
  ylab = ""
)

Method: Centroid

# Hierarchical clustering with centroid linkage on the pairwise distances
# between the rows of a.scale (dist() with its default distance measure).
# NOTE(review): the ?hclust examples apply centroid linkage to squared
# distances (dist(x)^2); confirm that unsquared distances are intended here.
hc.centroid <- hclust(dist(a.scale), method = "centroid")

# Merge history, one row per step: a negative entry is an original case,
# a positive entry refers to the cluster formed at that earlier step.
hc.centroid$merge
##       [,1] [,2]
##  [1,]  -14  -15
##  [2,]   -2  -12
##  [3,]   -6   -8
##  [4,]   -5    3
##  [5,]   -7  -13
##  [6,]    1    5
##  [7,]  -11    4
##  [8,]  -18    7
##  [9,]    2    8
## [10,]    6    9
## [11,]   -4   10
## [12,]   -3   11
## [13,]  -17   12
## [14,]  -10   13
## [15,]  -16   14
## [16,]   -1   15
## [17,]   -9   16

# Dendrogram of the centroid-linkage solution.
plot(
  hc.centroid,
  cex = 0.9,
  main = "Dendograma",
  sub = "Agrupamiento: Método de Centroide",
  xlab = "Institución Educativa",
  ylab = ""
)

Method: Median

# Hierarchical clustering with median linkage on the pairwise distances
# between the rows of a.scale (dist() with its default distance measure).
# NOTE(review): like centroid linkage, median linkage is commonly paired with
# squared Euclidean distances; confirm that unsquared distances are intended.
hc.median <- hclust(dist(a.scale), method = "median")

# Merge history, one row per step: a negative entry is an original case,
# a positive entry refers to the cluster formed at that earlier step.
hc.median$merge
##       [,1] [,2]
##  [1,]  -14  -15
##  [2,]   -2  -12
##  [3,]   -6   -8
##  [4,]   -5    3
##  [5,]   -7  -13
##  [6,]    1    5
##  [7,]  -11    4
##  [8,]    6    7
##  [9,]   -4    8
## [10,]   -3    9
## [11,]  -18   10
## [12,]    2   11
## [13,]  -17   12
## [14,]   -1   13
## [15,]  -10  -16
## [16,]   14   15
## [17,]   -9   16

# Dendrogram of the median-linkage solution.
plot(
  hc.median,
  cex = 0.9,
  main = "Dendograma",
  sub = "Agrupamiento: Método de la Mediana",
  xlab = "Institución Educativa",
  ylab = ""
)

Method: Ward

# Hierarchical clustering with method "ward.D" on the pairwise distances
# between the rows of a.scale (dist() with its default distance measure).
# NOTE(review): per ?hclust, "ward.D" only implements Ward's (1963) criterion
# when given squared distances; "ward.D2" does so on unsquared ones. Switching
# would change the merge table and dendrogram heights, so confirm first.
hc.ward <- hclust(dist(a.scale), method = "ward.D")

# Merge history, one row per step: a negative entry is an original case,
# a positive entry refers to the cluster formed at that earlier step.
hc.ward$merge
##       [,1] [,2]
##  [1,]  -14  -15
##  [2,]   -2  -12
##  [3,]   -6   -8
##  [4,]   -7  -13
##  [5,]   -3   -4
##  [6,]  -11  -18
##  [7,]   -5    3
##  [8,]  -16    4
##  [9,]    6    7
## [10,]  -17    2
## [11,]  -10    5
## [12,]    1    8
## [13,]    9   10
## [14,]   11   12
## [15,]   -1   13
## [16,]   -9   14
## [17,]   15   16

# Dendrogram of the Ward solution.
plot(
  hc.ward,
  cex = 0.9,
  main = "Dendograma",
  sub = "Agrupamiento: Método de Ward",
  xlab = "Institución Educativa",
  ylab = ""
)

We chose Ward's method because it is the method that groups the cases best.

# Cut the Ward tree into five groups; `cluster` holds the group of each case.
cluster <- cutree(hc.ward, k = 5)

# Redraw the Ward dendrogram so the annotations below have a plot to draw on.
plot(
  hc.ward,
  cex = 0.9,
  main = "Dendograma",
  sub = "Agrupamiento: Método de Ward",
  xlab = "Institución Educativa",
  ylab = ""
)

# Outline the five groups in red on the dendrogram.
rect.hclust(hc.ward, k = 5, border = "red")

# Dashed horizontal reference line at height 7.4.
abline(h = 7.4, lty = 2, col = 4)

Another graph

library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.10.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree

# Turn the Ward tree into a dendrogram object, colour its branches by the
# five-cluster cut, colour the labels, and draw the result with ggplot.
dend <- color_labels(color_branches(as.dendrogram(hc.ward), k = 5))
ggplot(dend)
## Warning: Removed 35 rows containing missing values (geom_point).

Scatter 2D

# Two-dimensional plot of the scaled cases, grouped by the `cluster`
# assignment, with coloured and shaded cluster ellipses.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change these flags.
# NOTE(review): per ?clusplot.default, labels = 5 labels the ellipses and
# lets points be identified interactively; in a non-interactive (knitted)
# run, labels = 4 or labels = 2 may be what is wanted. Confirm.
clusplot(
  a.scale,
  cluster,
  main = "Representacion 2D de la solucion",
  color = TRUE,
  shade = TRUE,
  labels = 5,
  lines = 0
)