Métodos multivariados

library(readr)
library(tidyverse)

## -- Attaching packages --------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.0     v dplyr   0.8.4
## v tibble  2.1.3     v stringr 1.4.0
## v tidyr   1.0.2     v forcats 0.4.0
## v purrr   0.3.3

## -- Conflicts ------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(MVN)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

## sROC 0.1-2 loaded

library(biotools)

## Loading required package: rpanel

## Loading required package: tcltk

## Package `rpanel', version 1.1-4: type help(rpanel) for summary information

## 
## Attaching package: 'rpanel'

## The following object is masked from 'package:tidyr':
## 
##     population

## Loading required package: tkrplot

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

## Loading required package: lattice

## Loading required package: SpatialEpi

## Loading required package: sp

## ---
## biotools version 3.1

##

library(MASS)
library(dplyr)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

Realizar una LDA, QDA y Regresión logística sobre el conjunto de datos boston-housing-classification.csv, la variable respuesta es MEDV_CAT, en todos los casos hallar la matriz de confusión y en el caso de la regresión logística dibujar la curva ROC

data <- read.csv("boston-housing-classification.csv")

LDA

boston <- read.csv("boston-housing-classification.csv")
head(boston, 3)

##      CRIM ZN INDUS CHAS   NOX    RM  AGE    DIS RAD TAX PTRATIO      B LSTAT
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 3 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
##   MEDV_CAT
## 1     High
## 2     High
## 3     High

normalidad<-function(var){
  shapiro.test(var)$p
}

# Verificamos normalidad para cada grupo 
tb1 <- boston %>%
  group_by(MEDV_CAT) %>% 
  summarise_all(normalidad)

tb1

## # A tibble: 2 x 14
##   MEDV_CAT     CRIM       ZN    INDUS     CHAS      NOX      RM      AGE
##   <fct>       <dbl>    <dbl>    <dbl>    <dbl>    <dbl>   <dbl>    <dbl>
## 1 High     1.03e-24 1.72e-16 1.85e-14 6.64e-26 2.00e-11 2.96e-6 8.37e- 7
## 2 Low      7.00e-19 3.36e-26 5.19e-13 5.03e-27 1.14e- 4 3.94e-3 1.42e-17
## # ... with 6 more variables: DIS <dbl>, RAD <dbl>, TAX <dbl>, PTRATIO <dbl>,
## #   B <dbl>, LSTAT <dbl>

####Normalidad Multivariada
mvn(boston[-14], mvnTest = "mardia")

## $multivariateNormality
##              Test        Statistic p value Result
## 1 Mardia Skewness 6440.29351636337       0     NO
## 2 Mardia Kurtosis 38.8421257939282       0     NO
## 3             MVN             <NA>    <NA>     NO
## 
## $univariateNormality
##            Test  Variable Statistic   p value Normality
## 1  Shapiro-Wilk   CRIM       0.4933  <0.001      NO    
## 2  Shapiro-Wilk    ZN        0.5842  <0.001      NO    
## 3  Shapiro-Wilk   INDUS      0.8764  <0.001      NO    
## 4  Shapiro-Wilk   CHAS       0.2672  <0.001      NO    
## 5  Shapiro-Wilk    NOX       0.9362  <0.001      NO    
## 6  Shapiro-Wilk    RM        0.9847   7e-04      NO    
## 7  Shapiro-Wilk    AGE       0.8567  <0.001      NO    
## 8  Shapiro-Wilk    DIS       0.8855  <0.001      NO    
## 9  Shapiro-Wilk    RAD       0.6941  <0.001      NO    
## 10 Shapiro-Wilk    TAX       0.8141  <0.001      NO    
## 11 Shapiro-Wilk  PTRATIO     0.8935  <0.001      NO    
## 12 Shapiro-Wilk     B        0.5273  <0.001      NO    
## 13 Shapiro-Wilk   LSTAT      0.9269  <0.001      NO    
## 
## $Descriptives
##           n        Mean     Std.Dev    Median       Min      Max      25th
## CRIM    363   4.5177472   9.8702156   0.35233   0.00632  88.9762   0.08204
## ZN      363  12.8608815  24.8749205   0.00000   0.00000 100.0000   0.00000
## INDUS   363  11.1674931   7.1135931   8.56000   0.46000  27.7400   4.86000
## CHAS    363   0.0661157   0.2488272   0.00000   0.00000   1.0000   0.00000
## NOX     363   0.5632860   0.1202039   0.53800   0.39400   0.8710   0.44850
## RM      363   6.3659780   0.7804789   6.37600   3.56100   8.7250   5.89700
## AGE     363  70.2754821  29.4988384  82.90000   2.90000 100.0000  44.60000
## DIS     363   3.6847259   2.1810679   2.89440   1.12960  12.1265   1.91190
## RAD     363  10.2258953   8.9700971   5.00000   1.00000  24.0000   4.00000
## TAX     363 418.8787879 173.8518823 337.00000 187.00000 711.0000 282.50000
## PTRATIO 363  18.3438017   2.2754476  19.10000  12.60000  22.0000  16.95000
## B       363 346.4125344 103.4525766 390.30000   0.32000 396.9000 370.26500
## LSTAT   363  13.0043802   8.0823512  10.97000   1.73000  37.9700   6.24000
##              75th        Skew   Kurtosis
## CRIM      5.62372  4.51518869 27.1109607
## ZN       20.00000  2.02278436  3.0227525
## INDUS    18.10000  0.21379710 -1.4129701
## CHAS      0.00000  3.47782722 10.1231925
## NOX       0.66350  0.56948697 -0.4384579
## RM        6.79700  0.07052145  0.9775789
## AGE      95.60000 -0.72107872 -0.9107082
## DIS       5.11670  1.04564883  0.4564249
## RAD      24.00000  0.83148914 -1.2029782
## TAX     666.00000  0.53674383 -1.3626985
## PTRATIO  20.20000 -0.75537070 -0.5113094
## B       395.66000 -2.37271513  4.2311834
## LSTAT    18.13500  0.73967139 -0.2573823

# prueba de box
# Ho: matriz de covarianza es constante
boxM(data = boston[-14], grouping = boston[,14])

## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  boston[-14]
## Chi-Sq (approx.) = 1859.7, df = 91, p-value < 2.2e-16

# ser rechaza  la Ho

# LDA
modelo <- lda(MEDV_CAT~.,data=boston)
modelo

## Call:
## lda(MEDV_CAT ~ ., data = boston)
## 
## Prior probabilities of groups:
##     High      Low 
## 0.523416 0.476584 
## 
## Group means:
##           CRIM        ZN     INDUS       CHAS       NOX       RM      AGE
## High 0.8191541 22.431579  7.003316 0.09473684 0.4939153 6.781821 53.01211
## Low  8.5797860  2.349711 15.740867 0.03468208 0.6394734 5.909272 89.23526
##           DIS       RAD      TAX  PTRATIO        B     LSTAT
## High 4.536371  6.157895 322.2789 17.21789 383.3425  7.015947
## Low  2.749393 14.693642 524.9711 19.58035 305.8536 19.581272
## 
## Coefficients of linear discriminants:
##                  LD1
## CRIM     0.007800163
## ZN       0.003391075
## INDUS   -0.009807511
## CHAS    -0.203557463
## NOX      6.147883533
## RM      -0.064738427
## AGE      0.020859568
## DIS      0.308244930
## RAD     -0.089534694
## TAX      0.003331247
## PTRATIO  0.285438068
## B       -0.001896369
## LSTAT    0.120484172

prediccion <- predict(modelo,boston[-14])
m_confusion <- table(boston$MEDV_CAT, 
                     prediccion$class,
                     dnn=c("Real","Predicho"))
m_confusion

##       Predicho
## Real   High Low
##   High  178  12
##   Low    11 162

mosaicplot(m_confusion,col=2:4)

precision <- mean(boston$MEDV_CAT == prediccion$class)
precision

## [1] 0.9366391

error <- 1 - precision
error

## [1] 0.06336088

QDA

modelo2 <- qda(MEDV_CAT~.,data = boston)
prediccion2 <- predict(modelo2,boston[-14])
m_confusion2 <- table(boston$MEDV_CAT,
                      prediccion2$class,
                      dnn=c("Real","predicho"))
m_confusion2

##       predicho
## Real   High Low
##   High  179  11
##   Low    26 147

mosaicplot(m_confusion2,col=2:4)

precision2 <- mean(boston$MEDV_CAT == prediccion2$class)
precision2

## [1] 0.8980716

error2 <- 1 - precision2
error2

## [1] 0.1019284

Regresión logística

glimpse(boston)

## Observations: 363
## Variables: 14
## $ CRIM     <dbl> 0.00632, 0.02729, 0.03237, 0.06905, 0.02985, 0.14455, 0.21...
## $ ZN       <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 0....
## $ INDUS    <dbl> 2.31, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.87...
## $ CHAS     <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ NOX      <dbl> 0.538, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524, 0....
## $ RM       <dbl> 6.575, 7.185, 6.998, 7.147, 6.430, 6.172, 5.631, 6.004, 6....
## $ AGE      <dbl> 65.2, 61.1, 45.8, 54.2, 58.7, 96.1, 100.0, 85.9, 94.3, 82....
## $ DIS      <dbl> 4.0900, 4.9671, 6.0622, 6.0622, 6.0622, 5.9505, 6.0821, 6....
## $ RAD      <int> 1, 2, 3, 3, 3, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...
## $ TAX      <int> 296, 242, 222, 222, 222, 311, 311, 311, 311, 311, 307, 307...
## $ PTRATIO  <dbl> 15.3, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15.2...
## $ B        <dbl> 396.90, 392.83, 394.63, 396.90, 394.12, 396.90, 386.63, 38...
## $ LSTAT    <dbl> 4.98, 4.03, 2.94, 5.33, 5.21, 19.15, 29.93, 17.10, 20.45, ...
## $ MEDV_CAT <fct> High, High, High, High, High, High, Low, Low, Low, Low, Lo...

# explorando variable respuesta
prop.table(table(boston$MEDV_CAT))

## 
##     High      Low 
## 0.523416 0.476584

# fabricando el modelo
mod1 <- glm(MEDV_CAT~.,
            data = boston,
            family = binomial)
mod1

## 
## Call:  glm(formula = MEDV_CAT ~ ., family = binomial, data = boston)
## 
## Coefficients:
## (Intercept)         CRIM           ZN        INDUS         CHAS          NOX  
##  -32.334632     0.097510    -0.043489    -0.034755    -0.412807    13.969348  
##          RM          AGE          DIS          RAD          TAX      PTRATIO  
##   -0.983256     0.065805     1.131166    -0.469344     0.016460     0.893019  
##           B        LSTAT  
##   -0.005775     0.393894  
## 
## Degrees of Freedom: 362 Total (i.e. Null);  349 Residual
## Null Deviance:       502.4 
## Residual Deviance: 95.63     AIC: 123.6

#calculando las  variable respuesta 
pred_p <- predict(mod1,type="response")

# convertimos la probabilidad de clasificación
pred <- ifelse(pred_p>0.05,1,0)
sum(pred)

## [1] 208

# evaluamos la precisión del modelo
mean(boston$MEDV_CAT==pred)

## [1] 0

conf <- table(boston$MEDV_CAT, pred, dnn=c("actual","predicho"))

mosaicplot(conf,col=2:4)

ROC <- roc(boston$MEDV_CAT,pred)

## Setting levels: control = High, case = Low

## Setting direction: controls < cases

plot(ROC)

auc(ROC)

## Area under the curve: 0.9079

Métodos multivariados - Tarea 4

Elmer Calel

10/5/2020

LDA

QDA

Regresión logística