# NOTE(review): setwd() ties this script to one machine, and the accented
# character in the path looks mis-encoded (probably "Minería") -- confirm the
# real folder name, or run the script from the folder that holds the CSV.
setwd ("C:/Users/Felipe/Desktop/PROMIDAT. MinerÃa de datos")
# Read the semicolon-separated file (header row, "." as decimal mark).
# stringsAsFactors = TRUE is spelled out because the default became FALSE in
# R 4.0.0; the per-level counts of chd computed further down need a factor.
Datos <- read.table(
  "SAheart.csv",
  header = TRUE,
  sep = ";",
  dec = ".",
  stringsAsFactors = TRUE
)
# Preview the first rows of the data.
head(Datos)
## sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
## 1 160 12.00 5.73 23.11 Present 49 25.30 97.20 52 Si
## 2 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 Si
## 3 118 0.08 3.48 32.28 Present 52 29.14 3.81 46 No
## 4 170 7.50 6.41 38.03 Present 51 31.99 24.26 58 Si
## 5 134 13.60 3.50 27.78 Present 60 25.99 57.34 49 Si
## 6 132 6.20 6.47 36.21 Present 62 30.77 14.14 45 No
Mínimo, máximo, media y mediana de las variables cuantitativas
# Per-column summary of the quantitative variables: columns 5 (famhist) and
# 10 (chd) are left out.
summary(Datos[, -c(5, 10)])
## sbp tobacco ldl adiposity
## Min. :101.0 Min. : 0.0000 Min. : 0.980 Min. : 6.74
## 1st Qu.:124.0 1st Qu.: 0.0525 1st Qu.: 3.283 1st Qu.:19.77
## Median :134.0 Median : 2.0000 Median : 4.340 Median :26.11
## Mean :138.3 Mean : 3.6356 Mean : 4.740 Mean :25.41
## 3rd Qu.:148.0 3rd Qu.: 5.5000 3rd Qu.: 5.790 3rd Qu.:31.23
## Max. :218.0 Max. :31.2000 Max. :15.330 Max. :42.49
## typea obesity alcohol age
## Min. :13.0 Min. :14.70 Min. : 0.00 Min. :15.00
## 1st Qu.:47.0 1st Qu.:22.98 1st Qu.: 0.51 1st Qu.:31.00
## Median :53.0 Median :25.80 Median : 7.51 Median :45.00
## Mean :53.1 Mean :26.04 Mean : 17.04 Mean :42.82
## 3rd Qu.:60.0 3rd Qu.:28.50 3rd Qu.: 23.89 3rd Qu.:55.00
## Max. :78.0 Max. :46.58 Max. :147.19 Max. :64.00
Frecuencias de chd
# Summary of the tenth column (chd), selected by name instead of position.
summary(Datos[["chd"]])
## No Si
## 302 160
Distribuciones de variables cuantitativas.
# One histogram per quantitative variable. The title and x label are built
# explicitly so every plot is still labelled "Datos$<column>", exactly as a
# direct hist(Datos$<column>) call labels it.
invisible(lapply(
  c("sbp", "tobacco", "ldl", "adiposity", "typea", "obesity", "alcohol", "age"),
  function(columna) {
    etiqueta <- paste0("Datos$", columna)
    hist(
      Datos[[columna]],
      main = paste("Histogram of", etiqueta),
      xlab = etiqueta
    )
  }
))
Matriz de correlaciones.
# Correlation matrix of the quantitative variables: columns 5 (famhist) and
# 10 (chd) are left out.
cor(Datos[, -c(5, 10)])
## sbp tobacco ldl adiposity typea
## sbp 1.00000000 0.21224652 0.15829633 0.35650008 -0.05745431
## tobacco 0.21224652 1.00000000 0.15890546 0.28664037 -0.01460788
## ldl 0.15829633 0.15890546 1.00000000 0.44043175 0.04404758
## adiposity 0.35650008 0.28664037 0.44043175 1.00000000 -0.04314364
## typea -0.05745431 -0.01460788 0.04404758 -0.04314364 1.00000000
## obesity 0.23806661 0.12452941 0.33050586 0.71655625 0.07400610
## alcohol 0.14009559 0.20081339 -0.03340340 0.10033013 0.03949794
## age 0.38877060 0.45033016 0.31179923 0.62595442 -0.10260632
## obesity alcohol age
## sbp 0.23806661 0.14009559 0.3887706
## tobacco 0.12452941 0.20081339 0.4503302
## ldl 0.33050586 -0.03340340 0.3117992
## adiposity 0.71655625 0.10033013 0.6259544
## typea 0.07400610 0.03949794 -0.1026063
## obesity 1.00000000 0.05161957 0.2917771
## alcohol 0.05161957 1.00000000 0.1011246
## age 0.29177713 0.10112465 1.0000000
# Print the whole stackloss data set.
print(stackloss)
## Air.Flow Water.Temp Acid.Conc. stack.loss
## 1 80 27 89 42
## 2 80 27 88 37
## 3 75 25 90 37
## 4 62 24 87 28
## 5 62 22 87 18
## 6 62 23 87 18
## 7 62 24 93 19
## 8 62 24 93 20
## 9 58 23 87 15
## 10 58 18 80 14
## 11 58 18 89 14
## 12 58 17 88 13
## 13 58 18 82 11
## 14 58 19 93 12
## 15 50 18 89 8
## 16 50 18 86 7
## 17 50 19 72 8
## 18 50 19 79 8
## 19 50 20 80 9
## 20 56 20 82 15
## 21 70 20 91 15
Correlación entre variables
# Panel function for pairs(): writes the correlation between x and y in the
# middle of the panel, with a text size proportional to its absolute value.
#
# x, y : numeric vectors holding the two variables of the current panel.
# ...  : accepted so extra arguments can be passed without error; unused.
#
# Returns whatever text() returns; called for its drawing side effect.
panel.cor <- function(x, y, ...) {
  # Use unit coordinates so (0.5, 0.5) is the panel centre, and put the
  # previous coordinate system back when the function exits so the change
  # does not leak into later drawing.
  usr_previo <- par(usr = c(0, 1, 0, 1))
  on.exit(par(usr_previo), add = TRUE)
  # Compute the correlation once; it drives both the label and its size.
  r <- cor(x, y)
  txt <- as.character(format(r, digits = 2))
  text(0.5, 0.5, txt, cex = 6 * abs(r))
}
# Scatterplot matrix of stackloss; the panels above the diagonal are drawn
# by panel.cor.
pairs(
  stackloss,
  upper.panel = panel.cor
)
Las variables stack.loss y Air.Flow presentan una relación lineal de valor 0.92. Igualmente las variables stack.loss y Water.Temp, pero con una relación lineal de 0.88. Las variables Air.Flow y Water.Temp también tienen una relación lineal; en este caso su valor es de 0.78.
Por otra parte, Air.Flow y Acid.Conc. no presentan una relación lineal (0.5), al igual que Water.Temp y Acid.Conc. (0.39), como es también el caso de Acid.Conc. y stack.loss (0.4).
Análisis en Componentes Principales. Plano Principal.
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 3.1.1
# Fit the PCA on stackloss with unit-scaled variables, asking for up to five
# components and suppressing the automatic graphs.
res <- PCA(
  stackloss,
  scale.unit = TRUE,
  ncp = 5,
  graph = FALSE
)
# Plot the individuals on the first two axes, in red.
plot(
  res,
  axes = c(1, 2),
  choix = "ind",
  col.ind = "red",
  new.plot = TRUE
)
Círculo de correlaciones.
# Plot the variables on the first two axes, in blue.
plot(
  res,
  axes = c(1, 2),
  choix = "var",
  col.var = "blue",
  new.plot = TRUE
)
Clustering Jerárquico. Salto Máximo.
# Hierarchical clustering of the stackloss rows with method = "complete",
# computed on the distance matrix returned by dist(stackloss).
modelo <- hclust(dist(stackloss),method = "complete")
# Plot the fitted tree.
plot(modelo)
Salto Mínimo.
# Hierarchical clustering of the stackloss rows with method = "single",
# computed on the distance matrix returned by dist(stackloss).
modelo <- hclust(dist(stackloss),method = "single")
# Plot the fitted tree.
plot(modelo)
Promedio.
# Hierarchical clustering of the stackloss rows with method = "average",
# computed on the distance matrix returned by dist(stackloss).
modelo <- hclust(dist(stackloss),method = "average")
# Plot the fitted tree.
plot(modelo)
Ward.
# Ward clustering of the stackloss rows. hclust() has renamed its "ward"
# method to "ward.D" and prints a renaming message when the old name is used;
# asking for "ward.D" directly fits the same tree without that message.
modelo <- hclust(dist(stackloss), method = "ward.D")
# Plot the fitted tree.
plot(modelo)