Documento autorreproducible

Autor: Felipe Hernández Baeza

Tarea 4. Programación en R. PROMIDAT

  1. SAheart
# Load the SAheart data set from a ';'-separated file with '.' decimals.
# NOTE(review): this hard-coded working directory only exists on the author's
# machine; adjust it before running the document elsewhere.
setwd("C:/Users/Felipe/Desktop/PROMIDAT. Minería de datos")
# stringsAsFactors = TRUE keeps the text columns (famhist, chd) as factors on
# R >= 4.0, matching the R 3.1 behaviour this document was produced with.
Datos <- read.table("SAheart.csv", header = TRUE, sep = ";", dec = ".",
                    stringsAsFactors = TRUE)
# Preview the first six rows.
head(Datos)
##   sbp tobacco  ldl adiposity famhist typea obesity alcohol age chd
## 1 160   12.00 5.73     23.11 Present    49   25.30   97.20  52  Si
## 2 144    0.01 4.41     28.61  Absent    55   28.87    2.06  63  Si
## 3 118    0.08 3.48     32.28 Present    52   29.14    3.81  46  No
## 4 170    7.50 6.41     38.03 Present    51   31.99   24.26  58  Si
## 5 134   13.60 3.50     27.78 Present    60   25.99   57.34  49  Si
## 6 132    6.20 6.47     36.21 Present    62   30.77   14.14  45  No

Mínimo, máximo, media y mediana de variables cuantitativas

# Descriptive statistics for every column except 5 (famhist) and 10 (chd),
# i.e. the quantitative variables.
summary(Datos[, -c(5, 10)])
##       sbp           tobacco             ldl           adiposity    
##  Min.   :101.0   Min.   : 0.0000   Min.   : 0.980   Min.   : 6.74  
##  1st Qu.:124.0   1st Qu.: 0.0525   1st Qu.: 3.283   1st Qu.:19.77  
##  Median :134.0   Median : 2.0000   Median : 4.340   Median :26.11  
##  Mean   :138.3   Mean   : 3.6356   Mean   : 4.740   Mean   :25.41  
##  3rd Qu.:148.0   3rd Qu.: 5.5000   3rd Qu.: 5.790   3rd Qu.:31.23  
##  Max.   :218.0   Max.   :31.2000   Max.   :15.330   Max.   :42.49  
##      typea         obesity         alcohol            age       
##  Min.   :13.0   Min.   :14.70   Min.   :  0.00   Min.   :15.00  
##  1st Qu.:47.0   1st Qu.:22.98   1st Qu.:  0.51   1st Qu.:31.00  
##  Median :53.0   Median :25.80   Median :  7.51   Median :45.00  
##  Mean   :53.1   Mean   :26.04   Mean   : 17.04   Mean   :42.82  
##  3rd Qu.:60.0   3rd Qu.:28.50   3rd Qu.: 23.89   3rd Qu.:55.00  
##  Max.   :78.0   Max.   :46.58   Max.   :147.19   Max.   :64.00

Frecuencias de chd

# Frequency of each chd value. table() counts values whether the column is a
# factor or a character vector, whereas summary() only counts factor levels.
table(Datos$chd)
##  No  Si 
## 302 160

Distribuciones de variables cuantitativas.

# One histogram per quantitative variable, in the same order as the columns.
vars_cuantitativas <- c(
  "sbp", "tobacco", "ldl", "adiposity",
  "typea", "obesity", "alcohol", "age"
)
for (variable in vars_cuantitativas) {
  # Spell out the label hist() would derive from a `Datos$<name>` argument so
  # the title and x-axis label are the same as with one call per column.
  etiqueta <- paste0("Datos$", variable)
  hist(
    Datos[[variable]],
    main = paste("Histogram of", etiqueta),
    xlab = etiqueta
  )
}

Matriz de correlaciones.

# Correlation matrix of the quantitative variables (columns 5 and 10, famhist
# and chd, are dropped because they are categorical).
cor(Datos[, -c(5, 10)])
##                   sbp     tobacco         ldl   adiposity       typea
## sbp        1.00000000  0.21224652  0.15829633  0.35650008 -0.05745431
## tobacco    0.21224652  1.00000000  0.15890546  0.28664037 -0.01460788
## ldl        0.15829633  0.15890546  1.00000000  0.44043175  0.04404758
## adiposity  0.35650008  0.28664037  0.44043175  1.00000000 -0.04314364
## typea     -0.05745431 -0.01460788  0.04404758 -0.04314364  1.00000000
## obesity    0.23806661  0.12452941  0.33050586  0.71655625  0.07400610
## alcohol    0.14009559  0.20081339 -0.03340340  0.10033013  0.03949794
## age        0.38877060  0.45033016  0.31179923  0.62595442 -0.10260632
##              obesity     alcohol        age
## sbp       0.23806661  0.14009559  0.3887706
## tobacco   0.12452941  0.20081339  0.4503302
## ldl       0.33050586 -0.03340340  0.3117992
## adiposity 0.71655625  0.10033013  0.6259544
## typea     0.07400610  0.03949794 -0.1026063
## obesity   1.00000000  0.05161957  0.2917771
## alcohol   0.05161957  1.00000000  0.1011246
## age       0.29177713  0.10112465  1.0000000
  2. stackloss
# Show the complete stackloss data set (21 rows, 4 variables).
print(stackloss)
##    Air.Flow Water.Temp Acid.Conc. stack.loss
## 1        80         27         89         42
## 2        80         27         88         37
## 3        75         25         90         37
## 4        62         24         87         28
## 5        62         22         87         18
## 6        62         23         87         18
## 7        62         24         93         19
## 8        62         24         93         20
## 9        58         23         87         15
## 10       58         18         80         14
## 11       58         18         89         14
## 12       58         17         88         13
## 13       58         18         82         11
## 14       58         19         93         12
## 15       50         18         89          8
## 16       50         18         86          7
## 17       50         19         72          8
## 18       50         19         79          8
## 19       50         20         80          9
## 20       56         20         82         15
## 21       70         20         91         15

Correlación entre variables

# Panel function for pairs(): writes the correlation of x and y in the centre
# of the panel, with a text size proportional to the absolute correlation.
#
# x, y: numeric vectors supplied by pairs() for the current panel.
# ...:  further arguments passed by pairs(); accepted but not used.
panel.cor <- function(x, y, ...) {
  # Switch the panel to unit coordinates so (0.5, 0.5) is its centre, and
  # restore the previous coordinate system when the function exits.
  old_par <- par(usr = c(0, 1, 0, 1))
  on.exit(par(old_par), add = TRUE)
  r <- cor(x, y)  # computed once, reused for the label and for its size
  txt <- as.character(format(r, digits = 2))
  text(0.5, 0.5, txt, cex = 6 * abs(r))
}

# Scatter-plot matrix of stackloss; the panels above the diagonal are drawn
# by panel.cor instead of showing scatter plots.
pairs(x = stackloss, upper.panel = panel.cor)

Las variables stack.loss y Air.Flow presentan una relación lineal de valor 0.92. Igualmente las variables stack.loss y Water.Temp, pero con una relación lineal de 0.88. Las variables Air.Flow y Water.Temp también tienen una relación lineal; en este caso su valor es de 0.78.

Por otra parte, Air.Flow y Acid.Conc. no presentan una relación lineal clara (0.5), al igual que Water.Temp y Acid.Conc. (0.39) y que Acid.Conc. y stack.loss (0.4).

Análisis en Componentes Principales. Plano Principal.

library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 3.1.1
# Principal component analysis of stackloss; graph = FALSE so that the plots
# are only produced by the explicit plot() calls.
res <- PCA(
  stackloss,
  scale.unit = TRUE,
  ncp = 5,
  graph = FALSE
)
# Principal plane: individuals on components 1 and 2, drawn in red.
plot(
  res,
  axes = c(1, 2),
  choix = "ind",
  col.ind = "red",
  new.plot = TRUE
)

Círculo de correlaciones.

# Correlation circle: variables on components 1 and 2, drawn in blue.
plot(
  res,
  axes = c(1, 2),
  choix = "var",
  col.var = "blue",
  new.plot = TRUE
)

Clustering Jerárquico. Salto Máximo.

# Hierarchical clustering with complete linkage on the dist() distances of
# stackloss, followed by its dendrogram. dist(stackloss) stays inline because
# the dendrogram's x-axis label is taken from that expression.
modelo <- hclust(
  d = dist(stackloss),
  method = "complete"
)
plot(x = modelo)

Salto Mínimo.

# Hierarchical clustering with single linkage on the dist() distances of
# stackloss, followed by its dendrogram. dist(stackloss) stays inline because
# the dendrogram's x-axis label is taken from that expression.
modelo <- hclust(
  d = dist(stackloss),
  method = "single"
)
plot(x = modelo)

Promedio.

# Hierarchical clustering with average linkage on the dist() distances of
# stackloss, followed by its dendrogram. dist(stackloss) stays inline because
# the dendrogram's x-axis label is taken from that expression.
modelo <- hclust(
  d = dist(stackloss),
  method = "average"
)
plot(x = modelo)

Ward.

# Ward clustering on the dist() distances of stackloss, followed by its
# dendrogram. "ward.D" is the current name of the method formerly called
# "ward": it gives the same result without the renaming message.
modelo <- hclust(dist(stackloss), method = "ward.D")
plot(modelo)