This is a cluster analysis of Peruvian schools based on their variables.
Let’s load some sample data.
The data have 14 variables and one ID.
# Load the data set (one ID column plus the variables to be clustered).
# The file is addressed by its full path instead of calling setwd(), and the
# workspace is not wiped with rm(list = ls()), so running this chunk leaves
# the user's session and working directory untouched.
data_dir <- "D:/1. 2019/CLUSTER"
a <- read.csv(file.path(data_dir, "BaseCop.csv"), header = TRUE, sep = ",")
# Show the column names to confirm the layout of the file.
names(a)
## [1] "ID" "V1" "V2" "V3" "V4" "V5" "V6" "V7" "V8" "V9" "V10"
## [12] "V11" "V12" "V13" "V14"
Next, we examine the scales of the variables and look for outliers.
# Box plots of the raw variables in columns 2 to 15 (column 1 is the ID),
# used to compare their scales and to spot outliers.
boxplot(a[, 2:15], main = "Box Plot")

# Factor built from the ID column; it is passed as the fill colour of the
# boxes here and in later plots.
colores <- factor(a$ID)

# The same view split into two panels: columns 3-10, then columns 11-15.
# NOTE(review): column 2 (V1) appears in neither panel - confirm this is intended.
boxplot(a[, 3:10], main = "Box Plot", col = colores)
boxplot(a[, 11:15], main = "Box Plot", col = colores)
The variables are measured on different scales, so we standardize them.
# Centre and scale columns 2 to 15 with scale() and store the result as a
# data frame for the clustering steps that follow.
a.scale <- as.data.frame(
  scale(a[, 2:15])
)
# Box plots of the 14 scaled variables, to check they are now comparable.
boxplot(a.scale[, 1:14], main = "Box Plot", col = colores)
We cluster the cases using the scaled variables and compare the different linkage methods.
Method: Single linkage
library(cluster)
# Hierarchical clustering with single linkage on the distance matrix of the
# scaled cases (dist() is called with its default metric).
hc.single <- hclust(d = dist(a.scale), method = "single")
# Print the merge history of the agglomeration, one row per step.
hc.single$merge
## [,1] [,2]
## [1,] -14 -15
## [2,] -2 -12
## [3,] -6 -8
## [4,] -7 -13
## [5,] -3 -4
## [6,] -11 1
## [7,] -5 3
## [8,] -18 6
## [9,] 4 8
## [10,] 7 9
## [11,] 5 10
## [12,] -16 11
## [13,] 2 12
## [14,] -10 13
## [15,] -17 14
## [16,] -1 15
## [17,] -9 16
# Dendrogram of the single-linkage solution.
plot(
  hc.single,
  main = "Dendograma",
  sub = "Agrupamiento: Método Simple",
  xlab = "Institución Educativa",
  ylab = "",
  cex = 0.9
)
Method: Complete linkage
# Hierarchical clustering with complete linkage on the distance matrix of the
# scaled cases (dist() is called with its default metric).
hc.complet <- hclust(d = dist(a.scale), method = "complete")
# Print the merge history of the agglomeration, one row per step.
hc.complet$merge
## [,1] [,2]
## [1,] -14 -15
## [2,] -2 -12
## [3,] -6 -8
## [4,] -7 -13
## [5,] -3 -4
## [6,] -11 -18
## [7,] -5 3
## [8,] 1 4
## [9,] -17 2
## [10,] -10 6
## [11,] 5 7
## [12,] -16 8
## [13,] 10 12
## [14,] 11 13
## [15,] -1 9
## [16,] 14 15
## [17,] -9 16
# Dendrogram of the complete-linkage solution.
plot(
  hc.complet,
  main = "Dendograma",
  sub = "Agrupamiento: Método Vecino más lejano",
  xlab = "Institución Educativa",
  ylab = "",
  cex = 0.9
)
Method: Centroid
# Hierarchical clustering with the centroid method on the distance matrix of
# the scaled cases.
# NOTE(review): the hclust() documentation pairs the centroid method with
# squared Euclidean distances (dist(x)^2); here unsquared distances are
# used - confirm this is intended.
hc.centroid <- hclust(d = dist(a.scale), method = "centroid")
# Print the merge history of the agglomeration, one row per step.
hc.centroid$merge
## [,1] [,2]
## [1,] -14 -15
## [2,] -2 -12
## [3,] -6 -8
## [4,] -5 3
## [5,] -7 -13
## [6,] 1 5
## [7,] -11 4
## [8,] -18 7
## [9,] 2 8
## [10,] 6 9
## [11,] -4 10
## [12,] -3 11
## [13,] -17 12
## [14,] -10 13
## [15,] -16 14
## [16,] -1 15
## [17,] -9 16
# Dendrogram of the centroid-method solution.
plot(
  hc.centroid,
  main = "Dendograma",
  sub = "Agrupamiento: Método de Centroide",
  xlab = "Institución Educativa",
  ylab = "",
  cex = 0.9
)
Method: Median
# Hierarchical clustering with the median method on the distance matrix of
# the scaled cases.
# NOTE(review): as with the centroid method, the hclust() documentation
# suggests squared Euclidean distances for "median" - confirm that
# unsquared distances are intended here.
hc.median <- hclust(d = dist(a.scale), method = "median")
# Print the merge history of the agglomeration, one row per step.
hc.median$merge
## [,1] [,2]
## [1,] -14 -15
## [2,] -2 -12
## [3,] -6 -8
## [4,] -5 3
## [5,] -7 -13
## [6,] 1 5
## [7,] -11 4
## [8,] 6 7
## [9,] -4 8
## [10,] -3 9
## [11,] -18 10
## [12,] 2 11
## [13,] -17 12
## [14,] -1 13
## [15,] -10 -16
## [16,] 14 15
## [17,] -9 16
# Dendrogram of the median-method solution.
plot(
  hc.median,
  main = "Dendograma",
  sub = "Agrupamiento: Método de la Mediana",
  xlab = "Institución Educativa",
  ylab = "",
  cex = 0.9
)
Method: Ward
# Hierarchical clustering with the "ward.D" method on the distance matrix of
# the scaled cases.
# NOTE(review): according to the hclust() documentation, "ward.D" applied to
# unsquared distances does not implement Ward's (1963) criterion, whereas
# "ward.D2" does. Switching would change the merges and heights below, so
# confirm which variant is wanted before changing it.
hc.ward <- hclust(d = dist(a.scale), method = "ward.D")
# Print the merge history of the agglomeration, one row per step.
hc.ward$merge
## [,1] [,2]
## [1,] -14 -15
## [2,] -2 -12
## [3,] -6 -8
## [4,] -7 -13
## [5,] -3 -4
## [6,] -11 -18
## [7,] -5 3
## [8,] -16 4
## [9,] 6 7
## [10,] -17 2
## [11,] -10 5
## [12,] 1 8
## [13,] 9 10
## [14,] 11 12
## [15,] -1 13
## [16,] -9 14
## [17,] 15 16
# Dendrogram of the Ward-method solution.
plot(
  hc.ward,
  main = "Dendograma",
  sub = "Agrupamiento: Método de Ward",
  xlab = "Institución Educativa",
  ylab = "",
  cex = 0.9
)
We chose the Ward method because it is the method that groups the cases best.
# Redraw the Ward dendrogram so the chosen partition can be marked on it.
plot(
  hc.ward,
  main = "Dendograma",
  sub = "Agrupamiento: Método de Ward",
  xlab = "Institución Educativa",
  ylab = "",
  cex = 0.9
)
# Cluster membership of every case when the tree is cut into 5 groups.
cluster <- cutree(hc.ward, k = 5)
# Outline those 5 groups with red rectangles on the current dendrogram.
rect.hclust(hc.ward, k = 5, border = "red")
# Dashed horizontal reference line at height 7.4, drawn in colour index 4.
abline(h = 7.4, lty = 2, col = 4)
Another graph
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.10.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
# Convert the Ward tree to a dendrogram object, colour its branches for a
# 5-cluster cut, colour its labels, and draw the result with ggplot().
dend <- as.dendrogram(hc.ward) %>%
  color_branches(k = 5) %>%
  color_labels()
ggplot(dend)
## Warning: Removed 35 rows containing missing values (geom_point).
2D scatter plot
# Two-dimensional plot of the scaled cases, grouped by the cluster
# memberships stored in `cluster`.
# TRUE is spelled out instead of T: T is an ordinary variable that can be
# reassigned, which would silently change these options.
# Per the clusplot() documentation, labels = 5 labels the ellipses and
# lines = 0 draws no distance lines between them.
clusplot(
  a.scale, cluster,
  main = "Representacion 2D de la solucion",
  color = TRUE, shade = TRUE, labels = 5, lines = 0
)