ata.knit

El primer paso es cargar la base de datos, como la data es corta se puede copiar y pegar directamente usando la libreria paste table que nos permite realizar el copiado y ponerlo en diferentes objetos de R.

data <- data.frame(
          CM = c(2.66,2.89,3.28,2.92,4,2.86,2.76,2.87,
                 3.03,3.92,2.63,3.32,3.57,3.26,3.53,2.74,2.75,2.83,
                 3.12,3.16,2.06,3.62,2.89,3.51,3.54,2.83,3.39,2.67,3.65,
                 4,3.1,2.39),
          NP = c(20L,22L,24L,12L,21L,17L,17L,21L,25L,
                 29L,20L,23L,23L,25L,26L,19L,25L,19L,23L,25L,22L,
                 28L,14L,26L,24L,27L,17L,24L,21L,23L,21L,19L),
         PSI = c(0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,0L,
                 0L,0L,0L,0L,0L,0L,0L,1L,1L,1L,1L,1L,1L,1L,1L,1L,
                 1L,1L,1L,1L,1L),
      MEJORA = c(0L,0L,0L,0L,1L,0L,0L,0L,0L,1L,0L,
                 0L,0L,1L,0L,0L,0L,0L,0L,1L,0L,1L,0L,0L,1L,1L,1L,
                 0L,1L,1L,0L,1L)
)

Ahora realizaremos un analisis de las variables que tenemos una forma es usar str() o summary()

# Funcion str()
str(data)

'data.frame':   32 obs. of  4 variables:
 $ CM    : num  2.66 2.89 3.28 2.92 4 2.86 2.76 2.87 3.03 3.92 ...
 $ NP    : int  20 22 24 12 21 17 17 21 25 29 ...
 $ PSI   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ MEJORA: int  0 0 0 0 1 0 0 0 0 1 ...

# Funcion summary()
summary(data)

       CM              NP             PSI             MEJORA      
 Min.   :2.060   Min.   :12.00   Min.   :0.0000   Min.   :0.0000  
 1st Qu.:2.812   1st Qu.:19.75   1st Qu.:0.0000   1st Qu.:0.0000  
 Median :3.065   Median :22.50   Median :0.0000   Median :0.0000  
 Mean   :3.117   Mean   :21.94   Mean   :0.4375   Mean   :0.3438  
 3rd Qu.:3.515   3rd Qu.:25.00   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :4.000   Max.   :29.00   Max.   :1.0000   Max.   :1.0000

Como se puede ver todas las variables son de tipo numericas pero debemos de transformar en categorica tanto a psi y a mejora.

library(tidyverse)

data$PSI <- data$PSI %>% factor()
data$MEJORA <- data$MEJORA %>%  factor()

Con esta transformacion hemos puesto a estas variables en tipo fatcor que es necesario para el analisis del modelo.

EDA

Observamos si existe algunos valores ausentes o mising values

library(DataExplorer)

plot_missing(
  data    = data, 
  title   = "Porcentaje de valores ausentes",
  ggtheme = theme_bw(),
  theme_config = list(legend.position = "none")
)

Como se puede observar no existe ningun valor ausente por lo que no es necesario relaizar algun tipo de imputacion o de eliminacion de estos.

Ahora observamos como se relaciona las 2 variables nuemricas respecto a la variable indenpendiente

library(cowplot)

library(cowplot) 


data %>% 
  ggplot(aes(x =CM , y =NP , color =as.factor(MEJORA)))+
  geom_point(size=2.5)+
  ggpubr::color_palette("jco")+theme_minimal()-> pmain

xdens <- axis_canvas(pmain, axis = "x")+
  geom_density(data = data, aes(x = CM, fill = as.factor(MEJORA)),
               alpha = 0.7, size = 0.2)+
  ggpubr::fill_palette("jco")

ydens <- axis_canvas(pmain, axis = "y", coord_flip = TRUE)+
  geom_density(data = data, aes(x = NP, fill = as.factor(MEJORA)),
               alpha = 0.7, size = 0.2)+
  coord_flip()+
  ggpubr::fill_palette("jco")
p1 <- insert_xaxis_grob(pmain, xdens, grid::unit(.2, "null"), position = "top")
p2<- insert_yaxis_grob(p1, ydens, grid::unit(.2, "null"), position = "right")
ggdraw(p2)

Otra forma de hacer estos tipos de graficos es de la siguiente manera

library(GGally) 

data %>% select(CM,NP) %>% ggpairs(., legend = 1, 
                                        aes(color = factor(data$MEJORA, levels = 0:1, labels = c("V-Block", "Straight")),
                                            alpha=0.5)) + theme_minimal()+
  theme(legend.position = "bottom") + 
  labs(color = "Engine Type")

Como se puede observar las 2 variables presentan una correlacion positiva pero baja ya que su valor ronda en 0.38 aproximadamente.

Ahora toca ver el comportamiento de la variable categorica respecto la variable de interes

data %>% select(PSI,MEJORA) %>% 
  na.omit() %>% ggplot(aes(x = PSI, y = ..count.., fill = as.factor(MEJORA))) +
  geom_bar() +
  scale_fill_manual(values = c("gray50", "orangered2")) +
  labs(title = " ") +
  theme_bw() +
  theme(legend.position = "bottom")

Ahora vemos la variable mejora como esta proporcionada

data %>% select(MEJORA) %>% group_by(MEJORA) %>% summarise(cantidad=n()) %>% as.data.frame() %>% 
  ggplot(aes(MEJORA,cantidad))+geom_col()+theme_minimal()

como se puede obsrvar la clase 0 es la dominate ante la clase 1. Esto puede generar problemas al momento de generar nuevas predicciones

Modelamiento

Primero haremos un modelo de regresión lineal simple

modelo1 <- lm(as.numeric(MEJORA) ~.,data=data)
summary(modelo1)


Call:
lm(formula = as.numeric(MEJORA) ~ ., data = data)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.78153 -0.27731  0.00531  0.21089  0.81145 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -0.49802    0.52389  -0.951  0.34993   
CM           0.46385    0.16196   2.864  0.00784 **
NP           0.01050    0.01948   0.539  0.59436   
PSI1         0.37855    0.13917   2.720  0.01109 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.3881 on 28 degrees of freedom
Multiple R-squared:  0.4159,    Adjusted R-squared:  0.3533 
F-statistic: 6.646 on 3 and 28 DF,  p-value: 0.001571

Como se puede observar solo va variable np es no significativa

predichos <- modelo1$fitted.values

data %>% ggplot(aes(1:32,MEJORA))+geom_point(mapping=aes(color=MEJORA))+
  geom_line(mapping = aes(x=1:32,y=predichos))+theme_minimal()

como se puede observar existe valores que sobre pasan a cero estos es uno de las tipicas fallas al aplicar un modelo de mco para estos tipos de regresion

summary(predichos)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.9318  1.0502  1.3257  1.3438  1.5855  1.9773

pred1 <- ifelse(predichos>0.5,1,0)

library(caret)

table(data$MEJORA,pred1)

Como se puede observar solo una clase ha sido bien clasificada por lo que el modelo regresion lineal no sirve para este ejemplo.

modelo2 <-  glm(MEJORA ~.,data=data,family=binomial())
summary(modelo2)


Call:
glm(formula = MEJORA ~ ., family = binomial(), data = data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9551  -0.6453  -0.2570   0.5888   2.0966  

Coefficients:
             Estimate Std. Error z value Pr(>|z|)   
(Intercept) -13.02135    4.93127  -2.641  0.00828 **
CM            2.82611    1.26293   2.238  0.02524 * 
NP            0.09516    0.14155   0.672  0.50143   
PSI1          2.37869    1.06456   2.234  0.02545 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 41.183  on 31  degrees of freedom
Residual deviance: 25.779  on 28  degrees of freedom
AIC: 33.779

Number of Fisher Scoring iterations: 5

En este modelo se aplico el logit en donde se ve que al igual que el modelo de regresion lieneal solo la variable np es no significativa.

pred2 <- predict(modelo2,data)
pred2 <- ifelse(pred2>0.5,1,0)
table(data$MEJORA,pred2) %>% confusionMatrix()

Confusion Matrix and Statistics

   pred2
     0  1
  0 20  1
  1  4  7
                                          
               Accuracy : 0.8438          
                 95% CI : (0.6721, 0.9472)
    No Information Rate : 0.75            
    P-Value [Acc > NIR] : 0.1530          
                                          
                  Kappa : 0.6296          
                                          
 Mcnemar's Test P-Value : 0.3711          
                                          
            Sensitivity : 0.8333          
            Specificity : 0.8750          
         Pos Pred Value : 0.9524          
         Neg Pred Value : 0.6364          
             Prevalence : 0.7500          
         Detection Rate : 0.6250          
   Detection Prevalence : 0.6562          
      Balanced Accuracy : 0.8542          
                                          
       'Positive' Class : 0

Modelo probit

modelo3 <- glm(MEJORA ~. ,
               data = data , family = binomial(link = "probit"))
summary(modelo3)


Call:
glm(formula = MEJORA ~ ., family = binomial(link = "probit"), 
    data = data)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9392  -0.6508  -0.2229   0.5934   2.0451  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept) -7.45231    2.57152  -2.898  0.00376 **
CM           1.62581    0.68973   2.357  0.01841 * 
NP           0.05173    0.08119   0.637  0.52406   
PSI1         1.42633    0.58695   2.430  0.01510 * 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 41.183  on 31  degrees of freedom
Residual deviance: 25.638  on 28  degrees of freedom
AIC: 33.638

Number of Fisher Scoring iterations: 6

pred3 <- predict(modelo3,data)
pred3 <- ifelse(pred3>0.5,1,0)

table(data$MEJORA,pred3) %>% confusionMatrix()

Confusion Matrix and Statistics

   pred3
     0  1
  0 20  1
  1  7  4
                                         
               Accuracy : 0.75           
                 95% CI : (0.566, 0.8854)
    No Information Rate : 0.8438         
    P-Value [Acc > NIR] : 0.9483         
                                         
                  Kappa : 0.3632         
                                         
 Mcnemar's Test P-Value : 0.0771         
                                         
            Sensitivity : 0.7407         
            Specificity : 0.8000         
         Pos Pred Value : 0.9524         
         Neg Pred Value : 0.3636         
             Prevalence : 0.8438         
         Detection Rate : 0.6250         
   Detection Prevalence : 0.6562         
      Balanced Accuracy : 0.7704         
                                         
       'Positive' Class : 0

Dalex

Lo primero es crear los moelos pero mediate el paquete caret

probit <- train(MEJORA ~ ., data = data,
                         method = "glm",
                         metric = "Accuracy",
                         family = binomial(link = "probit"))

logit <- train(MEJORA ~ ., data = data,
               method = "glm",
               metric = "Accuracy",
               family = "binomial")

Ahora lo siguiente es realizar la creación de los objetos para realizar los graficos con dalex

library(DALEX)
explainer_logit <- DALEX::explain(logit,
                                data = data[,-4],
                                y = data$MEJORA, 
                                label = "Logistic regression")

Preparation of a new explainer is initiated
  -> model label       :  Logistic regression 
  -> data              :  32  rows  3  cols 
  -> target variable   :  32  values 
  -> predict function  :  yhat.train  will be used (  default  )
  -> predicted values  :  No value for predict function target column. (  default  )
  -> model_info        :  package caret , ver. 6.0.86 , task classification (  default  ) 
  -> model_info        :  Model info detected classification task but 'y' is a factor .  (  WARNING  )
  -> model_info        :  By deafult classification tasks supports only numercical 'y' parameter. 
  -> model_info        :  Consider changing to numerical vector with 0 and 1 values.
  -> model_info        :  Otherwise I will not be able to calculate residuals or loss function.
  -> predicted values  :  numerical, min =  0.02447037 , mean =  0.34375 , max =  0.9453403  
  -> residual function :  difference between y and yhat (  default  )
  -> residuals         :  numerical, min =  NA , mean =  NA , max =  NA  
  A new explainer has been created!

explainer_probit <- DALEX::explain(probit,
                                  data = data[,-4],
                                  y = data$MEJORA, 
                                  label = "Probit regression")

Preparation of a new explainer is initiated
  -> model label       :  Probit regression 
  -> data              :  32  rows  3  cols 
  -> target variable   :  32  values 
  -> predict function  :  yhat.train  will be used (  default  )
  -> predicted values  :  No value for predict function target column. (  default  )
  -> model_info        :  package caret , ver. 6.0.86 , task classification (  default  ) 
  -> model_info        :  Model info detected classification task but 'y' is a factor .  (  WARNING  )
  -> model_info        :  By deafult classification tasks supports only numercical 'y' parameter. 
  -> model_info        :  Consider changing to numerical vector with 0 and 1 values.
  -> model_info        :  Otherwise I will not be able to calculate residuals or loss function.
  -> predicted values  :  numerical, min =  0.01610246 , mean =  0.3427204 , max =  0.9522449  
  -> residual function :  difference between y and yhat (  default  )
  -> residuals         :  numerical, min =  NA , mean =  NA , max =  NA  
  A new explainer has been created!

model_profile_glm1 <- model_profile(explainer_logit, type = "partial", 
                                    variables = c("CM","NP"))

plot(model_profile_glm1)

model_profile_probit <- model_profile(explainer_probit, type = "partial", 
                                    variables = c("CM","NP"))
plot(model_profile_probit)

single_customer <- data[5,]
explainer_logit %>% predict_parts(new_observation = single_customer) %>% plot()

explainer_probit %>% predict_parts(new_observation = single_customer) %>% plot()