library(readr)
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v dplyr 0.8.4
## v tibble 2.1.3 v stringr 1.4.0
## v tidyr 1.0.2 v forcats 0.4.0
## v purrr 0.3.3
## -- Conflicts ------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(MVN)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## sROC 0.1-2 loaded
library(biotools)
## Loading required package: rpanel
## Loading required package: tcltk
## Package `rpanel', version 1.1-4: type help(rpanel) for summary information
##
## Attaching package: 'rpanel'
## The following object is masked from 'package:tidyr':
##
## population
## Loading required package: tkrplot
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: lattice
## Loading required package: SpatialEpi
## Loading required package: sp
## ---
## biotools version 3.1
##
library(MASS)
library(dplyr)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
Realizar una LDA, QDA y Regresión logística sobre el conjunto de datos boston-housing-classification.csv, la variable respuesta es MEDV_CAT, en todos los casos hallar la matriz de confusión y en el caso de la regresión logística dibujar la curva ROC
data <- read.csv("boston-housing-classification.csv")
boston <- read.csv("boston-housing-classification.csv")
head(boston, 3)
## CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 3 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## MEDV_CAT
## 1 High
## 2 High
## 3 High
normalidad<-function(var){
shapiro.test(var)$p
}
# Verificamos normalidad para cada grupo
tb1 <- boston %>%
group_by(MEDV_CAT) %>%
summarise_all(normalidad)
tb1
## # A tibble: 2 x 14
## MEDV_CAT CRIM ZN INDUS CHAS NOX RM AGE
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 High 1.03e-24 1.72e-16 1.85e-14 6.64e-26 2.00e-11 2.96e-6 8.37e- 7
## 2 Low 7.00e-19 3.36e-26 5.19e-13 5.03e-27 1.14e- 4 3.94e-3 1.42e-17
## # ... with 6 more variables: DIS <dbl>, RAD <dbl>, TAX <dbl>, PTRATIO <dbl>,
## # B <dbl>, LSTAT <dbl>
####Normalidad Multivariada
mvn(boston[-14], mvnTest = "mardia")
## $multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 6440.29351636337 0 NO
## 2 Mardia Kurtosis 38.8421257939282 0 NO
## 3 MVN <NA> <NA> NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk CRIM 0.4933 <0.001 NO
## 2 Shapiro-Wilk ZN 0.5842 <0.001 NO
## 3 Shapiro-Wilk INDUS 0.8764 <0.001 NO
## 4 Shapiro-Wilk CHAS 0.2672 <0.001 NO
## 5 Shapiro-Wilk NOX 0.9362 <0.001 NO
## 6 Shapiro-Wilk RM 0.9847 7e-04 NO
## 7 Shapiro-Wilk AGE 0.8567 <0.001 NO
## 8 Shapiro-Wilk DIS 0.8855 <0.001 NO
## 9 Shapiro-Wilk RAD 0.6941 <0.001 NO
## 10 Shapiro-Wilk TAX 0.8141 <0.001 NO
## 11 Shapiro-Wilk PTRATIO 0.8935 <0.001 NO
## 12 Shapiro-Wilk B 0.5273 <0.001 NO
## 13 Shapiro-Wilk LSTAT 0.9269 <0.001 NO
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th
## CRIM 363 4.5177472 9.8702156 0.35233 0.00632 88.9762 0.08204
## ZN 363 12.8608815 24.8749205 0.00000 0.00000 100.0000 0.00000
## INDUS 363 11.1674931 7.1135931 8.56000 0.46000 27.7400 4.86000
## CHAS 363 0.0661157 0.2488272 0.00000 0.00000 1.0000 0.00000
## NOX 363 0.5632860 0.1202039 0.53800 0.39400 0.8710 0.44850
## RM 363 6.3659780 0.7804789 6.37600 3.56100 8.7250 5.89700
## AGE 363 70.2754821 29.4988384 82.90000 2.90000 100.0000 44.60000
## DIS 363 3.6847259 2.1810679 2.89440 1.12960 12.1265 1.91190
## RAD 363 10.2258953 8.9700971 5.00000 1.00000 24.0000 4.00000
## TAX 363 418.8787879 173.8518823 337.00000 187.00000 711.0000 282.50000
## PTRATIO 363 18.3438017 2.2754476 19.10000 12.60000 22.0000 16.95000
## B 363 346.4125344 103.4525766 390.30000 0.32000 396.9000 370.26500
## LSTAT 363 13.0043802 8.0823512 10.97000 1.73000 37.9700 6.24000
## 75th Skew Kurtosis
## CRIM 5.62372 4.51518869 27.1109607
## ZN 20.00000 2.02278436 3.0227525
## INDUS 18.10000 0.21379710 -1.4129701
## CHAS 0.00000 3.47782722 10.1231925
## NOX 0.66350 0.56948697 -0.4384579
## RM 6.79700 0.07052145 0.9775789
## AGE 95.60000 -0.72107872 -0.9107082
## DIS 5.11670 1.04564883 0.4564249
## RAD 24.00000 0.83148914 -1.2029782
## TAX 666.00000 0.53674383 -1.3626985
## PTRATIO 20.20000 -0.75537070 -0.5113094
## B 395.66000 -2.37271513 4.2311834
## LSTAT 18.13500 0.73967139 -0.2573823
# prueba de box
# Ho: matriz de covarianza es constante
boxM(data = boston[-14], grouping = boston[,14])
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: boston[-14]
## Chi-Sq (approx.) = 1859.7, df = 91, p-value < 2.2e-16
# ser rechaza la Ho
# LDA
modelo <- lda(MEDV_CAT~.,data=boston)
modelo
## Call:
## lda(MEDV_CAT ~ ., data = boston)
##
## Prior probabilities of groups:
## High Low
## 0.523416 0.476584
##
## Group means:
## CRIM ZN INDUS CHAS NOX RM AGE
## High 0.8191541 22.431579 7.003316 0.09473684 0.4939153 6.781821 53.01211
## Low 8.5797860 2.349711 15.740867 0.03468208 0.6394734 5.909272 89.23526
## DIS RAD TAX PTRATIO B LSTAT
## High 4.536371 6.157895 322.2789 17.21789 383.3425 7.015947
## Low 2.749393 14.693642 524.9711 19.58035 305.8536 19.581272
##
## Coefficients of linear discriminants:
## LD1
## CRIM 0.007800163
## ZN 0.003391075
## INDUS -0.009807511
## CHAS -0.203557463
## NOX 6.147883533
## RM -0.064738427
## AGE 0.020859568
## DIS 0.308244930
## RAD -0.089534694
## TAX 0.003331247
## PTRATIO 0.285438068
## B -0.001896369
## LSTAT 0.120484172
prediccion <- predict(modelo,boston[-14])
m_confusion <- table(boston$MEDV_CAT,
prediccion$class,
dnn=c("Real","Predicho"))
m_confusion
## Predicho
## Real High Low
## High 178 12
## Low 11 162
mosaicplot(m_confusion,col=2:4)
precision <- mean(boston$MEDV_CAT == prediccion$class)
precision
## [1] 0.9366391
error <- 1 - precision
error
## [1] 0.06336088
modelo2 <- qda(MEDV_CAT~.,data = boston)
prediccion2 <- predict(modelo2,boston[-14])
m_confusion2 <- table(boston$MEDV_CAT,
prediccion2$class,
dnn=c("Real","predicho"))
m_confusion2
## predicho
## Real High Low
## High 179 11
## Low 26 147
mosaicplot(m_confusion2,col=2:4)
precision2 <- mean(boston$MEDV_CAT == prediccion2$class)
precision2
## [1] 0.8980716
error2 <- 1 - precision2
error2
## [1] 0.1019284
glimpse(boston)
## Observations: 363
## Variables: 14
## $ CRIM <dbl> 0.00632, 0.02729, 0.03237, 0.06905, 0.02985, 0.14455, 0.21...
## $ ZN <dbl> 18.0, 0.0, 0.0, 0.0, 0.0, 12.5, 12.5, 12.5, 12.5, 12.5, 0....
## $ INDUS <dbl> 2.31, 7.07, 2.18, 2.18, 2.18, 7.87, 7.87, 7.87, 7.87, 7.87...
## $ CHAS <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ NOX <dbl> 0.538, 0.469, 0.458, 0.458, 0.458, 0.524, 0.524, 0.524, 0....
## $ RM <dbl> 6.575, 7.185, 6.998, 7.147, 6.430, 6.172, 5.631, 6.004, 6....
## $ AGE <dbl> 65.2, 61.1, 45.8, 54.2, 58.7, 96.1, 100.0, 85.9, 94.3, 82....
## $ DIS <dbl> 4.0900, 4.9671, 6.0622, 6.0622, 6.0622, 5.9505, 6.0821, 6....
## $ RAD <int> 1, 2, 3, 3, 3, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4...
## $ TAX <int> 296, 242, 222, 222, 222, 311, 311, 311, 311, 311, 307, 307...
## $ PTRATIO <dbl> 15.3, 17.8, 18.7, 18.7, 18.7, 15.2, 15.2, 15.2, 15.2, 15.2...
## $ B <dbl> 396.90, 392.83, 394.63, 396.90, 394.12, 396.90, 386.63, 38...
## $ LSTAT <dbl> 4.98, 4.03, 2.94, 5.33, 5.21, 19.15, 29.93, 17.10, 20.45, ...
## $ MEDV_CAT <fct> High, High, High, High, High, High, Low, Low, Low, Low, Lo...
# explorando variable respuesta
prop.table(table(boston$MEDV_CAT))
##
## High Low
## 0.523416 0.476584
# fabricando el modelo
mod1 <- glm(MEDV_CAT~.,
data = boston,
family = binomial)
mod1
##
## Call: glm(formula = MEDV_CAT ~ ., family = binomial, data = boston)
##
## Coefficients:
## (Intercept) CRIM ZN INDUS CHAS NOX
## -32.334632 0.097510 -0.043489 -0.034755 -0.412807 13.969348
## RM AGE DIS RAD TAX PTRATIO
## -0.983256 0.065805 1.131166 -0.469344 0.016460 0.893019
## B LSTAT
## -0.005775 0.393894
##
## Degrees of Freedom: 362 Total (i.e. Null); 349 Residual
## Null Deviance: 502.4
## Residual Deviance: 95.63 AIC: 123.6
#calculando las variable respuesta
pred_p <- predict(mod1,type="response")
# convertimos la probabilidad de clasificación
pred <- ifelse(pred_p>0.05,1,0)
sum(pred)
## [1] 208
# evaluamos la precisión del modelo
mean(boston$MEDV_CAT==pred)
## [1] 0
conf <- table(boston$MEDV_CAT, pred, dnn=c("actual","predicho"))
mosaicplot(conf,col=2:4)
ROC <- roc(boston$MEDV_CAT,pred)
## Setting levels: control = High, case = Low
## Setting direction: controls < cases
plot(ROC)
auc(ROC)
## Area under the curve: 0.9079