Proyecto Final Curso R

Abstract

Sara Elisa Cruz Castro

Trabajo en la Escuela Politecnica Nacional en la Direccion de Gestion de la Informacion y Procesos (DGIP), actualmente me encuentro muy interesada en el analisis de datos y poder contar historias a partir de estos.

Soy mama de dos hermosas niñas Salome 11 meses (foto) y Sofy 3 años, son el amor de mi vida.

Analisis de Pacientes

Se analizara la informacion de pacientes para conocer la tension arterial diastolica ajustando por colesterol e indice de masa corporal y edad.

library(readxl)
pacientes <- read_excel("pacientes.xlsx", skip = 2)

View(pacientes)

Adecuando la base de datos

#Adecuando la base de datos
pacientes$PACIENTE = as.character(pacientes$PACIENTE)

pacientes$EDAD = as.numeric(pacientes$EDAD)

pacientes$COLESTEROL = as.numeric(pacientes$COLESTEROL)

pacientes$IMC = as.numeric(pacientes$IMC)

## Warning: NAs introducidos por coerción

pacientes$TAD = as.numeric(pacientes$TAD)

## Warning: NAs introducidos por coerción

pacientes$GENERO=factor(pacientes$GENERO, levels=c("Hombre","Mujer"),
                  labels=c("MASCULINO","FEMENINO"))


#Fijar rango de edad
range(pacientes$EDAD,na.rm=TRUE)

## [1] 42 68

# N?mero de intervalos
nclass.Sturges(pacientes$EDAD)

## [1] 8

round(seq(42,68,length=nclass.Sturges(pacientes$EDAD)))

## [1] 42 46 49 53 57 61 64 68

pacientes$INTERVALOS_EDAD=cut(pacientes$EDAD,breaks=round(seq(42,68,length=nclass.Sturges(pacientes$EDAD))),include.lowest=TRUE)


#IMC

#Funcion para poner media en valores NA
pacientes$IMC[is.na(pacientes$IMC)]<-mean(pacientes$IMC,na.rm=T)
round(pacientes$IMC, digits =  2)

##  [1] 31.64 30.80 25.61 26.17 31.96 23.18 21.19 26.95 24.26 21.87 25.61
## [12] 27.92 27.73 22.49 25.47 30.04 23.88 21.99 26.93 25.47 24.09 28.65
## [23] 25.71 25.33 25.42 23.99 25.20 25.81 26.93 27.77 30.85 21.61 26.30
## [34] 31.44 25.00 23.31 27.15 21.09 22.53 19.83 26.17 25.95 26.30 24.89
## [45] 26.89 21.09 31.11 21.60 19.78 22.99 32.32 31.14 28.89 20.55 22.49
## [56] 22.53 20.82 32.32 33.91 20.76 31.35 31.14 20.76 19.59 20.08 31.60
## [67] 25.34 22.86 19.53 19.10

#TAD
pacientes$TAD[is.na(pacientes$TAD)]<-mean(pacientes$TAD,na.rm=T)
round(pacientes$TAD,digits =  2)

##  [1]  97.00  90.00  80.00  75.00 100.00  67.00  81.65  70.00 105.00  70.00
## [11]  70.00  75.00  90.00  95.00  95.00  90.00  85.00  75.00  75.00  85.00
## [21]  70.00 105.00  85.00  80.00  90.00  85.00  70.00  85.00  80.00  80.00
## [31]  70.00  75.00  95.00 100.00  75.00  80.00  65.00  80.00  95.00  75.00
## [41]  75.00  90.00  75.00  70.00 100.00  65.00  95.00  80.00  65.00  75.00
## [51]  95.00 100.00  80.00  75.00  70.00  70.00  65.00  95.00  90.00  75.00
## [61]  80.00  95.00  80.00  90.00  75.00 100.00  70.00  75.00  70.00  65.00

pacientes = data.frame(pacientes)
pacientes

##    PACIENTE EDAD COLESTEROL      IMC       TAD    GENERO INTERVALOS_EDAD
## 1         1   42        292 31.64000  97.00000 MASCULINO         [42,46]
## 2         2   64        235 30.80000  90.00000 MASCULINO         (61,64]
## 3         3   47        200 25.61000  80.00000 MASCULINO         (46,49]
## 4         4   56        200 26.17000  75.00000  FEMENINO         (53,57]
## 5         5   54        300 31.96000 100.00000 MASCULINO         (53,57]
## 6         6   48        215 23.18000  67.00000 MASCULINO         (46,49]
## 7         7   57        216 21.19000  81.65217  FEMENINO         (53,57]
## 8         8   52        254 26.95000  70.00000 MASCULINO         (49,53]
## 9         9   67        310 24.26000 105.00000 MASCULINO         (64,68]
## 10       10   46        237 21.87000  70.00000  FEMENINO         [42,46]
## 11       11   58        220 25.61000  70.00000 MASCULINO         (57,61]
## 12       12   62        233 27.92000  75.00000  FEMENINO         (61,64]
## 13       13   49        240 27.73000  90.00000 MASCULINO         (46,49]
## 14       14   56        295 22.49000  95.00000 MASCULINO         (53,57]
## 15       15   63        310 25.47176  95.00000 MASCULINO         (61,64]
## 16       16   64        268 30.04000  90.00000  FEMENINO         (61,64]
## 17       17   67        243 23.88000  85.00000  FEMENINO         (64,68]
## 18       18   49        239 21.99000  75.00000  FEMENINO         (46,49]
## 19       19   53        198 26.93000  75.00000  FEMENINO         (49,53]
## 20       20   59        218 25.47176  85.00000 MASCULINO         (57,61]
## 21       21   65        215 24.09000  70.00000 MASCULINO         (64,68]
## 22       22   67        254 28.65000 105.00000 MASCULINO         (64,68]
## 23       23   49        218 25.71000  85.00000 MASCULINO         (46,49]
## 24       24   53        221 25.33000  80.00000  FEMENINO         (49,53]
## 25       25   57        237 25.42000  90.00000  FEMENINO         (53,57]
## 26       26   47        244 23.99000  85.00000  FEMENINO         (46,49]
## 27       27   58        223 25.20000  70.00000  FEMENINO         (57,61]
## 28       28   48        198 25.81000  85.00000  FEMENINO         (46,49]
## 29       29   51        234 26.93000  80.00000 MASCULINO         (49,53]
## 30       30   49        175 27.77000  80.00000 MASCULINO         (46,49]
## 31       31   68        230 30.85000  70.00000  FEMENINO         (64,68]
## 32       32   58        248 21.61000  75.00000  FEMENINO         (57,61]
## 33       33   54        218 26.30000  95.00000  FEMENINO         (53,57]
## 34       34   59        285 31.44000 100.00000 MASCULINO         (57,61]
## 35       35   45        253 25.00000  75.00000 MASCULINO         [42,46]
## 36       36   53        187 23.31000  80.00000 MASCULINO         (49,53]
## 37       37   43        208 27.15000  65.00000 MASCULINO         [42,46]
## 38       38   57        246 21.09000  80.00000 MASCULINO         (53,57]
## 39       39   64        275 22.53000  95.00000 MASCULINO         (61,64]
## 40       40   43        218 19.83000  75.00000 MASCULINO         [42,46]
## 41       41   47        231 26.17000  75.00000 MASCULINO         (46,49]
## 42       42   58        200 25.95000  90.00000 MASCULINO         (57,61]
## 43       43   58        214 26.30000  75.00000  FEMENINO         (57,61]
## 44       44   48        230 24.89000  70.00000  FEMENINO         (46,49]
## 45       45   62        280 26.89000 100.00000  FEMENINO         (61,64]
## 46       46   54        198 21.09000  65.00000 MASCULINO         (53,57]
## 47       47   67        285 31.11000  95.00000 MASCULINO         (64,68]
## 48       48   68        201 21.60000  80.00000 MASCULINO         (64,68]
## 49       49   55        206 19.78000  65.00000  FEMENINO         (53,57]
## 50       50   50        223 22.99000  75.00000  FEMENINO         (49,53]
## 51       51   53        290 32.32000  95.00000  FEMENINO         (49,53]
## 52       52   63        315 31.14000 100.00000  FEMENINO         (61,64]
## 53       53   60        220 28.89000  80.00000  FEMENINO         (57,61]
## 54       54   46        230 20.55000  75.00000 MASCULINO         [42,46]
## 55       55   45        175 22.49000  70.00000 MASCULINO         [42,46]
## 56       56   53        213 22.53000  70.00000  FEMENINO         (49,53]
## 57       57   59        220 20.82000  65.00000 MASCULINO         (57,61]
## 58       58   62        287 32.32000  95.00000 MASCULINO         (61,64]
## 59       59   60        290 33.91000  90.00000 MASCULINO         (57,61]
## 60       60   62        209 20.76000  75.00000 MASCULINO         (61,64]
## 61       61   58        290 31.35000  80.00000 MASCULINO         (57,61]
## 62       62   57        260 31.14000  95.00000 MASCULINO         (53,57]
## 63       63   49        202 20.76000  80.00000  FEMENINO         (46,49]
## 64       64   61        214 19.59000  90.00000  FEMENINO         (57,61]
## 65       65   52        231 20.08000  75.00000  FEMENINO         (49,53]
## 66       66   59        280 31.60000 100.00000 MASCULINO         (57,61]
## 67       67   50        220 25.34000  70.00000 MASCULINO         (49,53]
## 68       68   46        233 22.86000  75.00000 MASCULINO         [42,46]
## 69       69   44        215 19.53000  70.00000 MASCULINO         [42,46]
## 70       70   60        202 19.10000  65.00000  FEMENINO         (57,61]

*Verificar datos perdidos

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(wakefield)

## 
## Attaching package: 'wakefield'

## The following object is masked from 'package:dplyr':
## 
##     id

library(Amelia)

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

missmap(pacientes,col=c('pink','black'),y.at=1,y.labels='',legend=TRUE)

Se puede observar que no existen datos perdidos.

Analisis exploratorio de datos

Se muestra la estructura y el resumen de pacientes:

#Estructura
str(pacientes)

## 'data.frame':    70 obs. of  7 variables:
##  $ PACIENTE       : chr  "1" "2" "3" "4" ...
##  $ EDAD           : num  42 64 47 56 54 48 57 52 67 46 ...
##  $ COLESTEROL     : num  292 235 200 200 300 215 216 254 310 237 ...
##  $ IMC            : num  31.6 30.8 25.6 26.2 32 ...
##  $ TAD            : num  97 90 80 75 100 ...
##  $ GENERO         : Factor w/ 2 levels "MASCULINO","FEMENINO": 1 1 1 2 1 1 2 1 1 2 ...
##  $ INTERVALOS_EDAD: Factor w/ 7 levels "[42,46]","(46,49]",..: 1 6 2 4 4 2 4 3 7 1 ...

#Resumes
summary(pacientes)

##    PACIENTE              EDAD         COLESTEROL         IMC       
##  Length:70          Min.   :42.00   Min.   :175.0   Min.   :19.10  
##  Class :character   1st Qu.:49.00   1st Qu.:214.2   1st Qu.:22.49  
##  Mode  :character   Median :56.00   Median :230.0   Median :25.45  
##                     Mean   :55.24   Mean   :236.8   Mean   :25.47  
##                     3rd Qu.:60.00   3rd Qu.:254.0   3rd Qu.:27.76  
##                     Max.   :68.00   Max.   :315.0   Max.   :33.91  
##                                                                    
##       TAD               GENERO   INTERVALOS_EDAD
##  Min.   : 65.00   MASCULINO:41   [42,46]: 9     
##  1st Qu.: 75.00   FEMENINO :29   (46,49]:11     
##  Median : 80.00                  (49,53]:10     
##  Mean   : 81.65                  (53,57]:10     
##  3rd Qu.: 90.00                  (57,61]:14     
##  Max.   :105.00                  (61,64]: 9     
##                                  (64,68]: 7

DIAGRAMAS DE CAJA

IMC

#Diagramas de Caja - Variables continuas

boxplot(pacientes$IMC ~ pacientes$GENERO, ylab="IMC", col = c("purple","pink"))

TAD

boxplot(pacientes$TAD ~ pacientes$GENERO, ylab="TAD", col = c("aquamarine","chocolate1"))

COLESTEROL

boxplot(pacientes$COLESTEROL ~ pacientes$GENERO, ylab="COLESTEROL", col = c("brown1","deepskyblue1"))

DIAGRAMAS DE BARRA

#Diagramas de Barra - Variables discretas


library(ggplot2)
library(scales)



barplot(prop.table(table(pacientes$GENERO)) ,col=c("blue","pink"),legend.text=c("Hombre","Mujer"),ylim=c(0,0.8),main="Genero")

COLESTEROL - GENERO

ggplot(data=pacientes, aes(x=GENERO, y=COLESTEROL,fill=pacientes$INTERVALOS_EDAD)) + 
    geom_bar(stat="identity", position="dodge")

COLESTEROL - INTERVALOS DE EDAD - COLESTEROL

ggplot(data=pacientes, aes(x=reorder(pacientes$INTERVALOS_EDAD,COLESTEROL), y=COLESTEROL, fill=GENERO)) + 
    geom_bar(stat="identity", position="dodge")

Calcular la correlacion entre la variable dependiente y cada una de las variables explicativas (numericas).

Correlacion lineal de la edad, colesterol e indice de masa corporal con la tension arterial diastolica.

pairs(pacientes$TAD ~ pacientes$EDAD*pacientes$COLESTEROL*pacientes$IMC)

TAD - EDAD

library(PerformanceAnalytics)

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

library(quantmod)

## Loading required package: TTR

## Version 0.4-0 included new data defaults. See ?getSymbols.

dat1 <- data.frame(pacientes$TAD, pacientes$EDAD)
chart.Correlation(dat1)

TAD - COLESTEROL

dat2 <- data.frame(pacientes$TAD, pacientes$COLESTEROL)
chart.Correlation(dat2)

TAD - IMC

dat3 <- data.frame(pacientes$TAD, pacientes$IMC)
chart.Correlation(dat3)

Calculo de la correlaci?n

TAD - COLESTEROL

cor(pacientes$TAD,pacientes$COLESTEROL)

## [1] 0.6913791

cor.test(pacientes$TAD,pacientes$COLESTEROL)

## 
##  Pearson's product-moment correlation
## 
## data:  pacientes$TAD and pacientes$COLESTEROL
## t = 7.8911, df = 68, p-value = 3.455e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5449327 0.7968929
## sample estimates:
##       cor 
## 0.6913791

TAD - EDAD

cor(pacientes$TAD,pacientes$EDAD)

## [1] 0.3881265

cor.test(pacientes$TAD,pacientes$EDAD)

## 
##  Pearson's product-moment correlation
## 
## data:  pacientes$TAD and pacientes$EDAD
## t = 3.4728, df = 68, p-value = 0.0009001
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1685215 0.5710234
## sample estimates:
##       cor 
## 0.3881265

TAD - IMC

cor(pacientes$TAD,pacientes$IMC)

## [1] 0.5496267

cor.test(pacientes$TAD,pacientes$IMC)

## 
##  Pearson's product-moment correlation
## 
## data:  pacientes$TAD and pacientes$IMC
## t = 5.4253, df = 68, p-value = 8.315e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3613159 0.6948608
## sample estimates:
##       cor 
## 0.5496267

ANALISIS ANOVA

TAD = pacientes$TAD
GENERO = pacientes$GENERO
pac = aov( lm(TAD ~ GENERO) )

Resumen de la tabla del ANOVA

summary(pac)

##             Df Sum Sq Mean Sq F value Pr(>F)
## GENERO       1    155   154.7   1.228  0.272
## Residuals   68   8565   126.0

Elementos generados en el ANOVA:

names(pac)

##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "contrasts"     "xlevels"       "call"          "terms"        
## [13] "model"

qf(0.05,2-1, 70-2, lower.tail = F)

## [1] 3.981896

Los valores del estad?stico > 3.981896 estar?n incluidos en la regi?n de rechazo.En este caso el valor F que se obtuvo es de 1.228.

tapply(TAD, GENERO, mean)

## MASCULINO  FEMENINO 
##  82.90244  79.88456

Prueba T

Estimacion de la varianza comun de los datos

media <- mean(TAD[GENERO=="MASCULINO"]) 
valor_t <- pt(0.05/2, 70 - 2) 
sp <- sqrt(126)  #desviaci?n t?pica de la varianza muestral com?n
ee  <- valor_t * (sp/ sqrt(41))  #error de estimaci?n 
media

## [1] 82.90244

Intervalos de confianza para las medias de TAD de los pacientes

Limite Superior

media+ee

## [1] 83.79638

Limite Inferior

media-ee

## [1] 82.0085

Test HSD de Tukey

intervals =TukeyHSD(pac)
intervals

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = lm(TAD ~ GENERO))
## 
## $GENERO
##                         diff       lwr      upr     p adj
## FEMENINO-MASCULINO -3.017881 -8.451774 2.416012 0.2716603

plot(intervals)

Validaci?n del modelo ANOVA A partir de los residuos del modelo comprobaremos si el modelo ANOVA es adecuado. Los supuestos que se deben cumplir son tres: independencia, homocedasticidad y normalidad.

Independencia

plot(pac$residuals,xlab = "Pacientes", ylab = "Residuales")

###Normalidad

Los gr?ficos y descriptivos nos informan si se verifica la igualdad de varianzas en los grupos descritos:

summary(pac$residuals)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -17.902  -7.902  -2.902   0.000  10.115  22.098

boxplot(pac$residuals, col = "magenta2")

hist(pac$residuals, col = "pink4",main = "Histograma de Residuales", xlab = "Residuales")

qqnorm(pac$residuals)

El test de Shapiro-Wilk indica que no tenemos evidencia suficiente para rechazar la hipotesis nula (normalidad de los residuos)

shapiro.test(pac$residuals)

## 
##  Shapiro-Wilk normality test
## 
## data:  pac$residuals
## W = 0.95157, p-value = 0.008765

Se puede ver que los valores de p son menores a 0,05 entonces la distribuci?n no es normal

Homocedasticidad

Los graficos y descriptivos nos informan si se verifica la igualdad de varianzas en los grupos descritos:

boxplot(pac$residuals~pacientes$GENERO, col = c("hotpink", "mediumblue"))

desviaciones <- tapply(pac$residuals, GENERO, sd)

Comparando la desviaci?n m?xima con la m?nima obtenemos una orientaci?n sobre la falta de homocedasticidad (>2 aproximadamente)

max(desviaciones) / min(desviaciones)

## [1] 1.207769

Podemos ver que la muestra no es homocedastica.

El test de Bartlett indica que no tenemos evidencia suficiente para rechazar la hip?tesis nula (las varianzas son iguales)

bartlett.test(pac$residuals ~ GENERO)

## 
##  Bartlett test of homogeneity of variances
## 
## data:  pac$residuals by GENERO
## Bartlett's K-squared = 1.1247, df = 1, p-value = 0.2889

Con el test de Bartlett se puede observar que tenemos evidencia suficiente para rechazar la hip?tesis nula (las varianzas son iguales).

Kruskal-Wallis y pruebas post-hoc ?Qu? hip?tesis contrasta el test ANOVA?

Ho: la variable respuesta es la misma en todas las poblaciones valoradas.

Ha: la variable respuesta es mayor en alguna de las poblaciones.

kruskal.test(TAD,GENERO)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  TAD and GENERO
## Kruskal-Wallis chi-squared = 0.9036, df = 1, p-value = 0.3418

Bajo la Ho el estadistico de contraste H del test de Kruskal-Wallis se distribuye como una Chi-cuadrado de grados de libertad (2-1).

Cuartil buscado

qchisq(0.05, 2-1, lower.tail = F)

## [1] 3.841459

Valores del estadistico > 3.841459 estaran incluidos en la region de rechazo. En nuetro caso 0.9036 es mucho mayor que el valor critico obtenido.

kruskal.test(log(TAD), GENERO)

## 
##  Kruskal-Wallis rank sum test
## 
## data:  log(TAD) and GENERO
## Kruskal-Wallis chi-squared = 0.9036, df = 1, p-value = 0.3418

CONSTRUCCION DEL MODELO Y PREDICCION

library(caTools)
set.seed(105) 
split <- sample.split(pacientes,SplitRatio=0.75) 
train <- subset(pacientes,split==TRUE) #75%
test <- subset(pacientes,split==FALSE) #25%

test <- subset(pacientes,split==FALSE)

MODELO

model <- lm(pacientes$TAD ~ -1 + pacientes$COLESTEROL+ pacientes$EDAD+pacientes$IMC, data = train)
summary(model)

## 
## Call:
## lm(formula = pacientes$TAD ~ -1 + pacientes$COLESTEROL + pacientes$EDAD + 
##     pacientes$IMC, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.0853  -5.4363  -0.3766   5.6170  16.5940 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## pacientes$COLESTEROL  0.17708    0.03209   5.518 5.97e-07 ***
## pacientes$EDAD        0.35817    0.11378   3.148  0.00245 ** 
## pacientes$IMC         0.77798    0.27334   2.846  0.00587 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.791 on 67 degrees of freedom
## Multiple R-squared:  0.9914, Adjusted R-squared:  0.9911 
## F-statistic:  2589 on 3 and 67 DF,  p-value: < 2.2e-16

Modelo encontrado (Ecuación)

model$coefficients

## pacientes$COLESTEROL       pacientes$EDAD        pacientes$IMC 
##            0.1770818            0.3581735            0.7779782

paste("TAD", round(model$coefficients[1],4),
      round(model$coefficients[2],4),names(model$coefficients[2]),"+",
      round(model$coefficients[3],4),names(model$coefficients[3])
      )

## [1] "TAD 0.1771 0.3582 pacientes$EDAD + 0.778 pacientes$IMC"

res <- residuals(model)
res <- as.data.frame(res)
ggplot(res,aes(res)) +  geom_histogram(fill='violetred1',alpha=0.5)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

plot(model)

Proyecto Final Curso R - 2018

Sara Cruz Castro

7 de octubre de 2018