Análisis de Cluster (conglomerados)

Factor: Especie (Setosa, Virginica, Versicolor)

Respuesta: largo y ancho de Pétalos y Sépalos (4 en total)

ANOVA para largo de pétalo (Alpha=0.0125)

# One-way ANOVA: petal length as a function of species.
A1 <- aov(iris$Petal.Length ~ iris$Species)
anova(A1)
## Analysis of Variance Table
## 
## Response: iris$Petal.Length
##               Df Sum Sq Mean Sq F value    Pr(>F)    
## iris$Species   2 437.10 218.551  1180.2 < 2.2e-16 ***
## Residuals    147  27.22   0.185                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD pairwise comparisons between species.
# NOTE(review): the output below reports a 95% family-wise confidence level
# (no conf.level is passed), while the section heading states Alpha=0.0125;
# confirm which level is intended.
TukeyHSD(A1, which = "iris$Species")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = iris$Petal.Length ~ iris$Species)
## 
## $`iris$Species`
##                       diff     lwr     upr p adj
## versicolor-setosa    2.798 2.59422 3.00178     0
## virginica-setosa     4.090 3.88622 4.29378     0
## virginica-versicolor 1.292 1.08822 1.49578     0
# Shapiro-Wilk normality test on the model residuals. It is run once, directly
# on A1$residuals; no intermediate copy of the residuals is needed.
shapiro.test(A1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  A1$residuals
## W = 0.98108, p-value = 0.03676
# Bartlett test of equal residual variance across species.
bartlett.test(x = A1$residuals, g = iris$Species)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  A1$residuals and iris$Species
## Bartlett's K-squared = 55.423, df = 2, p-value = 9.229e-13

ANOVA para ancho de pétalo (Alpha=0.0125)

# One-way ANOVA: petal width as a function of species.
A2 <- aov(iris$Petal.Width ~ iris$Species)
anova(A2)
## Analysis of Variance Table
## 
## Response: iris$Petal.Width
##               Df Sum Sq Mean Sq F value    Pr(>F)    
## iris$Species   2 80.413  40.207  960.01 < 2.2e-16 ***
## Residuals    147  6.157   0.042                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD pairwise comparisons between species.
# NOTE(review): the output below reports a 95% family-wise confidence level
# (no conf.level is passed), while the section heading states Alpha=0.0125;
# confirm which level is intended.
TukeyHSD(A2, which = "iris$Species")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = iris$Petal.Width ~ iris$Species)
## 
## $`iris$Species`
##                      diff       lwr       upr p adj
## versicolor-setosa    1.08 0.9830903 1.1769097     0
## virginica-setosa     1.78 1.6830903 1.8769097     0
## virginica-versicolor 0.70 0.6030903 0.7969097     0
# Shapiro-Wilk normality test on the model residuals.
shapiro.test(A2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  A2$residuals
## W = 0.97217, p-value = 0.003866
# Bartlett test of equal residual variance across species.
bartlett.test(x = A2$residuals, g = iris$Species)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  A2$residuals and iris$Species
## Bartlett's K-squared = 39.213, df = 2, p-value = 3.055e-09

ANOVA para largo de sépalo

# One-way ANOVA: sepal length as a function of species.
A3 <- aov(iris$Sepal.Length ~ iris$Species)
anova(A3)
## Analysis of Variance Table
## 
## Response: iris$Sepal.Length
##               Df Sum Sq Mean Sq F value    Pr(>F)    
## iris$Species   2 63.212  31.606  119.26 < 2.2e-16 ***
## Residuals    147 38.956   0.265                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD pairwise comparisons between species (95% family-wise level,
# as reported in the output below).
TukeyHSD(A3, which = "iris$Species")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = iris$Sepal.Length ~ iris$Species)
## 
## $`iris$Species`
##                       diff       lwr       upr p adj
## versicolor-setosa    0.930 0.6862273 1.1737727     0
## virginica-setosa     1.582 1.3382273 1.8257727     0
## virginica-versicolor 0.652 0.4082273 0.8957727     0
# Shapiro-Wilk normality test on the model residuals.
shapiro.test(A3$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  A3$residuals
## W = 0.9879, p-value = 0.2189
# Bartlett test of equal residual variance across species.
bartlett.test(x = A3$residuals, g = iris$Species)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  A3$residuals and iris$Species
## Bartlett's K-squared = 16.006, df = 2, p-value = 0.0003345

ANOVA para ancho de sépalo

# One-way ANOVA: sepal width as a function of species.
A4 <- aov(iris$Sepal.Width ~ iris$Species)
anova(A4)
## Analysis of Variance Table
## 
## Response: iris$Sepal.Width
##               Df Sum Sq Mean Sq F value    Pr(>F)    
## iris$Species   2 11.345  5.6725   49.16 < 2.2e-16 ***
## Residuals    147 16.962  0.1154                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tukey HSD pairwise comparisons between species (95% family-wise level,
# as reported in the output below).
TukeyHSD(A4, which = "iris$Species")
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = iris$Sepal.Width ~ iris$Species)
## 
## $`iris$Species`
##                        diff         lwr        upr     p adj
## versicolor-setosa    -0.658 -0.81885528 -0.4971447 0.0000000
## virginica-setosa     -0.454 -0.61485528 -0.2931447 0.0000000
## virginica-versicolor  0.204  0.04314472  0.3648553 0.0087802
# Shapiro-Wilk normality test on the model residuals.
shapiro.test(A4$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  A4$residuals
## W = 0.98948, p-value = 0.323
# Bartlett test of equal residual variance across species.
bartlett.test(x = A4$residuals, g = iris$Species)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  A4$residuals and iris$Species
## Bartlett's K-squared = 2.0911, df = 2, p-value = 0.3515

MANOVA para las 4 variables conjuntas

# Response matrix for the MANOVA: the four measurements as a plain numeric
# matrix without dimnames (sepal length, sepal width, petal length, petal width).
Y1 <- unname(as.matrix(
  iris[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width")]
))
# One-way MANOVA of the four responses jointly against species.
mod_moav <- manova(Y1 ~ iris$Species)
summary(mod_moav)
##               Df Pillai approx F num Df den Df    Pr(>F)    
## iris$Species   2 1.1919   53.466      8    290 < 2.2e-16 ***
## Residuals    147                                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual matrix of the fitted multivariate model.
resid <- mod_moav$residuals

Matriz de correlaciones

# Correlation matrix of the four numeric columns of iris, rounded to three
# decimals and printed.
R <- round(cor(iris[, 1:4]), digits = 3)
print(R)
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length        1.000      -0.118        0.872       0.818
## Sepal.Width        -0.118       1.000       -0.428      -0.366
## Petal.Length        0.872      -0.428        1.000       0.963
## Petal.Width         0.818      -0.366        0.963       1.000
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend

Largo de pétalos con ancho de pétalos (hacer este mismo gráfico con todos los pares posibles)

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:xts':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Scatter plots for the six pairs of measurements, coloured by species.
# ggplot2 draws through grid graphics, so the base-graphics layout calls
# split.screen()/screen() do not control where these plots are placed; they are
# not used here. Each plot is printed as its own figure.
# Each entry is c(x variable, y variable).
pares <- list(
  c("Petal.Length", "Petal.Width"),
  c("Petal.Length", "Sepal.Length"),
  c("Petal.Length", "Sepal.Width"),
  c("Sepal.Length", "Sepal.Width"),
  c("Sepal.Length", "Petal.Width"),
  c("Sepal.Width", "Petal.Width")
)
for (par_xy in pares) {
  # Explicit print() is required for ggplot objects created inside a loop.
  print(
    ggplot(iris, aes(x = .data[[par_xy[1]]], y = .data[[par_xy[2]]],
                     col = Species)) +
      geom_point() +
      labs(x = par_xy[1], y = par_xy[2])
  )
}

Matriz de distancia

# Pairwise distances between observations on the four numeric columns
# (dist() is called with its default method).
d <- dist(iris[, 1:4])
# Dimensions of the equivalent full square distance matrix, taken from the
# number of observations stored in the dist object.
rep(attr(d, "Size"), 2L)
## [1] 150 150

Método de Ward (basado en suma de cuadrados para agrupar)

# Hierarchical clustering of the distance matrix with the "ward.D2" method.
clust <- hclust(d, method = "ward.D2")
# Cluster membership of each observation when the tree is cut into 3 groups.
grupos <- cutree(clust, k = 3)
# Dendrogram with a dashed red reference line at height 10 and rectangles
# around the 3 clusters.
plot(clust)
abline(h = 10, col = "red", lty = 2)
rect.hclust(clust, k = 3)

Matriz de confusión: Virginica tiene mayor confusión

# Number of observations in each cluster.
table(grupos)
## grupos
##  1  2  3 
## 50 64 36
# Cross-tabulation of species (rows) against cluster label (columns).
tbl <- table(iris$Species, grupos)
tbl
##             grupos
##               1  2  3
##   setosa     50  0  0
##   versicolor  0 49  1
##   virginica   0 15 35
# Percentage of observations on the diagonal, i.e. treating cluster k as the
# k-th species level.
aciertos <- sum(diag(tbl))
100 * aciertos / sum(tbl)
## [1] 89.33333