Importamos la base de datos y renombramos para trabajar en un análisis descriptivo

Conocemos los datos de nuestra base

## # A tibble: 6 × 21
##   battery_power bluetooth clock_speed int_memory n_cores   ram dual_sim three_g
##           <dbl>     <dbl>       <dbl>      <dbl>   <dbl> <dbl>    <dbl>   <dbl>
## 1           842         0         2.2          7       2  2549        0       0
## 2          1021         1         0.5         53       3  2631        1       1
## 3           563         1         0.5         41       5  2603        1       1
## 4           615         1         2.5         10       6  2769        0       1
## 5          1821         1         1.2         44       2  1411        0       1
## 6          1859         0         0.5         22       1  1067        1       1
## # ℹ 13 more variables: four_g <dbl>, fc <dbl>, pc <dbl>, px_height <dbl>,
## #   px_width <dbl>, m_dep <dbl>, mobile_wt <dbl>, sc_h <dbl>, sc_w <dbl>,
## #   talk_time <dbl>, touch_screen <dbl>, wifi <dbl>, price_range <dbl>

## # A tibble: 6 × 21
##   battery_power bluetooth clock_speed int_memory n_cores   ram dual_sim three_g
##           <dbl>     <dbl>       <dbl>      <dbl>   <dbl> <dbl>    <dbl>   <dbl>
## 1           858         0         2.2         50       1  3978        0       1
## 2           794         1         0.5          2       6   668        1       1
## 3          1965         1         2.6         39       4  2032        1       1
## 4          1911         0         0.9         36       8  3057        1       1
## 5          1512         0         0.9         46       5   869        0       1
## 6           510         1         2           45       6  3919        1       1
## # ℹ 13 more variables: four_g <dbl>, fc <dbl>, pc <dbl>, px_height <dbl>,
## #   px_width <dbl>, m_dep <dbl>, mobile_wt <dbl>, sc_h <dbl>, sc_w <dbl>,
## #   talk_time <dbl>, touch_screen <dbl>, wifi <dbl>, price_range <dbl>

## [1] 2000   21

## battery_power     bluetooth   clock_speed    int_memory       n_cores 
##             0             0             0             0             0 
##           ram      dual_sim       three_g        four_g            fc 
##             0             0             0             0             0 
##            pc     px_height      px_width         m_dep     mobile_wt 
##             0             0             0             0             0 
##          sc_h          sc_w     talk_time  touch_screen          wifi 
##             0             0             0             0             0 
##   price_range 
##             0

##  battery_power      bluetooth      clock_speed      int_memory   
##  Min.   : 501.0   Min.   :0.000   Min.   :0.500   Min.   : 2.00  
##  1st Qu.: 851.8   1st Qu.:0.000   1st Qu.:0.700   1st Qu.:16.00  
##  Median :1226.0   Median :0.000   Median :1.500   Median :32.00  
##  Mean   :1238.5   Mean   :0.495   Mean   :1.522   Mean   :32.05  
##  3rd Qu.:1615.2   3rd Qu.:1.000   3rd Qu.:2.200   3rd Qu.:48.00  
##  Max.   :1998.0   Max.   :1.000   Max.   :3.000   Max.   :64.00  
##     n_cores           ram          dual_sim         three_g      
##  Min.   :1.000   Min.   : 256   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:3.000   1st Qu.:1208   1st Qu.:0.0000   1st Qu.:1.0000  
##  Median :4.000   Median :2146   Median :1.0000   Median :1.0000  
##  Mean   :4.521   Mean   :2124   Mean   :0.5095   Mean   :0.7615  
##  3rd Qu.:7.000   3rd Qu.:3064   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :8.000   Max.   :3998   Max.   :1.0000   Max.   :1.0000  
##      four_g             fc               pc           px_height     
##  Min.   :0.0000   Min.   : 0.000   Min.   : 0.000   Min.   :   0.0  
##  1st Qu.:0.0000   1st Qu.: 1.000   1st Qu.: 5.000   1st Qu.: 282.8  
##  Median :1.0000   Median : 3.000   Median :10.000   Median : 564.0  
##  Mean   :0.5215   Mean   : 4.309   Mean   : 9.916   Mean   : 645.1  
##  3rd Qu.:1.0000   3rd Qu.: 7.000   3rd Qu.:15.000   3rd Qu.: 947.2  
##  Max.   :1.0000   Max.   :19.000   Max.   :20.000   Max.   :1960.0  
##     px_width          m_dep          mobile_wt          sc_h      
##  Min.   : 500.0   Min.   :0.1000   Min.   : 80.0   Min.   : 5.00  
##  1st Qu.: 874.8   1st Qu.:0.2000   1st Qu.:109.0   1st Qu.: 9.00  
##  Median :1247.0   Median :0.5000   Median :141.0   Median :12.00  
##  Mean   :1251.5   Mean   :0.5018   Mean   :140.2   Mean   :12.31  
##  3rd Qu.:1633.0   3rd Qu.:0.8000   3rd Qu.:170.0   3rd Qu.:16.00  
##  Max.   :1998.0   Max.   :1.0000   Max.   :200.0   Max.   :19.00  
##       sc_w          talk_time      touch_screen        wifi      
##  Min.   : 0.000   Min.   : 2.00   Min.   :0.000   Min.   :0.000  
##  1st Qu.: 2.000   1st Qu.: 6.00   1st Qu.:0.000   1st Qu.:0.000  
##  Median : 5.000   Median :11.00   Median :1.000   Median :1.000  
##  Mean   : 5.767   Mean   :11.01   Mean   :0.503   Mean   :0.507  
##  3rd Qu.: 9.000   3rd Qu.:16.00   3rd Qu.:1.000   3rd Qu.:1.000  
##  Max.   :18.000   Max.   :20.00   Max.   :1.000   Max.   :1.000  
##   price_range  
##  Min.   :0.00  
##  1st Qu.:0.75  
##  Median :1.50  
##  Mean   :1.50  
##  3rd Qu.:2.25  
##  Max.   :3.00

De acuerdo al análisis descriptivo encontramos que 6 variables son cualitativas nominales y cuantitativas continuas son 2 y las 13 variables restantes son cuantitativas discretas.

Análisis exploratorio - descriptivo univariado

Variables cualitativas: Nominal - Ordinal

Inicialmente vamos a convertir variables númericas a factores

Tercer corte

#ggplot(data=data,aes(x=fc,y=pc))+geom_point(color="red")+geom_smooth(method = "lm")
# Nuestra variable objetivo es la dirección, si va a subir o bajar

# Convertir a factor con dos niveles usando la media
data$battery_power <- factor(ifelse(data$battery_power <= 1200, "Bajo", "Alto"), levels = c("Bajo", "Alto"))
# Ver resultado

# Revisar para qué es
#Smarket_tidy <- melt(data[,-1], value.name = "valor")
#Smarket_tidy %>% group_by(wifi, variable) %>% summarise(p_value_Shapiro.test = shapiro.test(valor)$p.value)

Las variables no se distribuyen de forma normal.

Regresión logística

Se utiliza la función glm y se selecciona a la familia binomial.

attach(data)

## The following objects are masked from data (pos = 3):
## 
##     battery_power, bluetooth, clock_speed, dual_sim, fc, four_g,
##     int_memory, m_dep, mobile_wt, n_cores, pc, price_range, px_height,
##     px_width, ram, sc_h, sc_w, talk_time, three_g, touch_screen, wifi

names(data)

##  [1] "battery_power" "bluetooth"     "clock_speed"   "int_memory"   
##  [5] "n_cores"       "ram"           "dual_sim"      "three_g"      
##  [9] "four_g"        "fc"            "pc"            "px_height"    
## [13] "px_width"      "m_dep"         "mobile_wt"     "sc_h"         
## [17] "sc_w"          "talk_time"     "touch_screen"  "wifi"         
## [21] "price_range"

model_logis<-glm(battery_power~clock_speed+int_memory+ram+fc+pc+px_height+px_width+m_dep+mobile_wt+sc_h+sc_w+talk_time,family = "binomial",data=data)
summary(model_logis)

## 
## Call:
## glm(formula = battery_power ~ clock_speed + int_memory + ram + 
##     fc + pc + px_height + px_width + m_dep + mobile_wt + sc_h + 
##     sc_w + talk_time, family = "binomial", data = data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  1.889e-01  3.280e-01   0.576    0.565
## clock_speed  3.392e-02  5.500e-02   0.617    0.537
## int_memory  -1.271e-03  2.478e-03  -0.513    0.608
## ram          1.320e-05  4.142e-05   0.319    0.750
## fc           1.280e-02  1.353e-02   0.945    0.344
## pc          -3.382e-03  9.692e-03  -0.349    0.727
## px_height    1.352e-04  1.178e-04   1.147    0.251
## px_width    -1.600e-04  1.208e-04  -1.325    0.185
## m_dep        1.257e-01  1.558e-01   0.807    0.420
## mobile_wt   -8.586e-04  1.269e-03  -0.677    0.499
## sc_h        -9.659e-03  1.238e-02  -0.780    0.435
## sc_w        -9.409e-04  1.195e-02  -0.079    0.937
## talk_time    1.013e-02  8.218e-03   1.232    0.218
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2770.4  on 1999  degrees of freedom
## Residual deviance: 2763.2  on 1987  degrees of freedom
## AIC: 2789.2
## 
## Number of Fisher Scoring iterations: 3

model_logis_todas<-glm(battery_power~bluetooth+wifi+clock_speed+int_memory+n_cores+ram+dual_sim+three_g+four_g+fc+pc+px_height+px_width+m_dep+mobile_wt+sc_h+sc_w+talk_time+touch_screen+price_range,family = "binomial",data=data)
summary(model_logis_todas)

## 
## Call:
## glm(formula = battery_power ~ bluetooth + wifi + clock_speed + 
##     int_memory + n_cores + ram + dual_sim + three_g + four_g + 
##     fc + pc + px_height + px_width + m_dep + mobile_wt + sc_h + 
##     sc_w + talk_time + touch_screen + price_range, family = "binomial", 
##     data = data)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.7988289  0.4689847   8.100 5.49e-16 ***
## bluetooth1     0.0249050  0.1056065   0.236   0.8136    
## wifi1          0.1185449  0.1055194   1.123   0.2612    
## clock_speed    0.0714876  0.0645810   1.107   0.2683    
## int_memory    -0.0048447  0.0029219  -1.658   0.0973 .  
## n_cores2      -0.1547263  0.2152731  -0.719   0.4723    
## n_cores3      -0.2216021  0.2123719  -1.043   0.2967    
## n_cores4      -0.1250596  0.2091586  -0.598   0.5499    
## n_cores5      -0.2390201  0.2147604  -1.113   0.2657    
## n_cores6       0.0170645  0.2180753   0.078   0.9376    
## n_cores7      -0.1894964  0.2108842  -0.899   0.3689    
## n_cores8      -0.4962819  0.2140329  -2.319   0.0204 *  
## ram           -0.0032518  0.0001712 -18.993  < 2e-16 ***
## dual_sim1     -0.0296857  0.1056007  -0.281   0.7786    
## three_g1      -0.2204853  0.1526819  -1.444   0.1487    
## four_g1        0.0816328  0.1299990   0.628   0.5300    
## fc             0.0091256  0.0158220   0.577   0.5641    
## pc            -0.0078318  0.0112791  -0.694   0.4875    
## px_height     -0.0008338  0.0001469  -5.677 1.37e-08 ***
## px_width      -0.0011003  0.0001488  -7.394 1.43e-13 ***
## m_dep          0.1080251  0.1839806   0.587   0.5571    
## mobile_wt      0.0020293  0.0014948   1.358   0.1746    
## sc_h          -0.0143958  0.0145519  -0.989   0.3225    
## sc_w           0.0022379  0.0141430   0.158   0.8743    
## talk_time      0.0045323  0.0097207   0.466   0.6410    
## touch_screen1 -0.0119119  0.1053499  -0.113   0.9100    
## price_range1   3.7117182  0.2312772  16.049  < 2e-16 ***
## price_range2   6.6724014  0.3616313  18.451  < 2e-16 ***
## price_range3  10.5447096  0.5311028  19.854  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2770.4  on 1999  degrees of freedom
## Residual deviance: 2164.8  on 1971  degrees of freedom
## AIC: 2222.8
## 
## Number of Fisher Scoring iterations: 4

Procedemos a realizar predicciones a partir del modelo con todas las variables, el cual tiene menor AIC, para ello se utiliza la función predict que permitirá predecir la probabilidad de que la variable de resupesta pertenezca al nivel de referencía, será por defecto la categoría de menor cantidad categorizada en la varibale de interés. Eso se puede observar a través de:

contrasts(data$battery_power)

##      Alto
## Bajo    0
## Alto    1

Las predicciones se observan a partir de:

predicciones_modlogist<-predict(object=model_logis_todas,type = "response")
head(predicciones_modlogist,8)

##         1         2         3         4         5         6         7         8 
## 0.2445850 0.1785481 0.2240311 0.1913596 0.6202617 0.7767052 0.9029429 0.4371286

La 1,2, 3, 4 y 8 están en bajo, en cambio 5, 6 y 7 están en alto, ya que está por debajo de 0.5

predicciones_modlogis<-predict(object=model_logis,type = "response")
head(predicciones_modlogis,8)

##         1         2         3         4         5         6         7         8 
## 0.5470942 0.4581598 0.5152931 0.5207659 0.5763388 0.4902671 0.5529851 0.4758244

La 2,6 y 8 están clasificadas en bajo, en cambio las 1, 3,4, 5 y 7 en alto.

Para visualizar la clasificación se asigna una regla de decisión que toma como umbral la probabilidad de 0.5<0.5 (Bajo), =0.5 (Bajo), >0.5(Alto)

prediccion <- data.frame(probabilidad = predicciones_modlogist,
                         clase = rep(NA,length(predicciones_modlogist)))
prediccion[prediccion$probabilidad <= 0.5,"clase"] <- "Bajo"
prediccion[prediccion$probabilidad > 0.5,"clase"] <- "Alto"
head(prediccion,15)

##    probabilidad clase
## 1     0.2445850  Bajo
## 2     0.1785481  Bajo
## 3     0.2240311  Bajo
## 4     0.1913596  Bajo
## 5     0.6202617  Alto
## 6     0.7767052  Alto
## 7     0.9029429  Alto
## 8     0.4371286  Bajo
## 9     0.1891520  Bajo
## 10    0.3443430  Bajo
## 11    0.6456117  Alto
## 12    0.5889180  Alto
## 13    0.7463901  Alto
## 14    0.5393816  Alto
## 15    0.8457713  Alto

Resumen de los resultados

table(clase_predicha = prediccion$clase, clase_real = data$battery_power)

##               clase_real
## clase_predicha Bajo Alto
##           Alto  297  758
##           Bajo  670  275

Para el caso de regresión logística al tener en cuenta todas las variables, tenemos: 670, casos Bajos que fueron clasificados como Bajo es decir son Verdaderos Positivos (VP), 275 Casos Altos que fueron clasificados como Bajo, son Falsos Negativos (FN), 297 casos Bajos que fueron clasificados como Alto, son Falsos Positivos (FP), 758 Casos Altos que fueron clasificados como Alto, entonces son Verdaderos Negativos (VN).

paste( "% de acierto:", mean(prediccion$clase == data$battery_power))

## [1] "% de acierto: 0.714"

paste( "% de error:", mean(prediccion$clase != data$battery_power))

## [1] "% de error: 0.286"

#Cuando esta Encima de 0.85 se valora la técnica

El resultado es de 0.29

Linear Discriminant Analysis (LDA)

Está técnica multivariada de clasificación me permite trabajar con variables cualitativas con 2 o más niveles.

Se realiza una partición de la data teniendo en cuenta 8 núcleos, los datos de entrenamiento estarán compuestos por el comportamiento que hubo entre los núcleos 1 a 7, se válida la técnica con los registros de los dispositivos con 8 núcleos.

data$n_cores<-as.numeric(data$n_cores)
train<-data[data$n_cores<8,]
test<-data[!(data$n_cores<8),]
dim(train)

## [1] 1744   21

dim(test)

## [1] 256  21

mod_lda<-lda(battery_power~clock_speed+int_memory+ram+fc+pc+px_height+px_width+m_dep+mobile_wt+sc_h+sc_w+talk_time,data=train)
mod_lda

## Call:
## lda(battery_power ~ clock_speed + int_memory + ram + fc + pc + 
##     px_height + px_width + m_dep + mobile_wt + sc_h + sc_w + 
##     talk_time, data = train)
## 
## Prior probabilities of groups:
##      Bajo      Alto 
## 0.4736239 0.5263761 
## 
## Group means:
##      clock_speed int_memory      ram       fc      pc px_height px_width
## Bajo    1.511259   32.25182 2109.636 4.222760 9.90678  643.7857 1254.410
## Alto    1.534641   31.90523 2143.161 4.405229 9.91939  647.5131 1243.646
##          m_dep mobile_wt     sc_h     sc_w talk_time
## Bajo 0.4989104  140.9685 12.43826 5.823245  10.85593
## Alto 0.5037037  139.9401 12.24292 5.797386  11.11438
## 
## Coefficients of linear discriminants:
##                       LD1
## clock_speed  0.3287994360
## int_memory  -0.0101695387
## ram          0.0002714078
## fc           0.1470505213
## pc          -0.0670971686
## px_height    0.0006971676
## px_width    -0.0008874593
## m_dep        0.5301912701
## mobile_wt   -0.0082842544
## sc_h        -0.1224917252
## sc_w         0.0471524586
## talk_time    0.0810948710

Prior probabilities of groupos

La probabilidad priori estima observaciones de las proporciones que hay en cada clase. En este caso los valores de 0.47 corresponden a Bajo, y 0.53 corresponden a Alto, por lo que se puede decir que son muy parecidos, casi que son identicos.

Observamos predicciones:

predicciones_lda<-predict(mod_lda,test)
head(predicciones_lda$class,n=5)

## [1] Alto Alto Alto Bajo Alto
## Levels: Bajo Alto

head(predicciones_lda$posterior,n=5)

##        Bajo      Alto
## 1 0.4368773 0.5631227
## 2 0.4761190 0.5238810
## 3 0.4659770 0.5340230
## 4 0.5101650 0.4898350
## 5 0.4434003 0.5565997

Está clasificando la primera, segunda, tercera como Alto, la cuarta como bajo y la quinta como Alto.

Matriz de confusión

table(Clase_predicha=predicciones_lda$class,clase_real=test$battery_power)

##               clase_real
## Clase_predicha Bajo Alto
##           Bajo   34   22
##           Alto  107   93

Para el caso de Análisis Lineal Discriminante (LDA) al tener en cuenta solo las variables numéricas, tenemos: 34 casos Bajos que fueron clasificados como Bajo es decir son Verdaderos Positivos (VP), 22 Casos Altos que fueron como Bajo, son Falsos Negativos (FN), 107 casos Bajos que fueron clasificados como Alto, son Falsos Positivos (FP) y 93 casos Altos que fueron clasificados como Alto, entonces son Verdaderos Negativos (VN).

Regresión logística con train

Se hace el cambio de uno nuevo para probarlo con test

model_logis_train<-glm(battery_power~clock_speed+int_memory+ram+fc+pc+px_height+px_width+m_dep+mobile_wt+sc_h+sc_w+talk_time,family = "binomial",data=train)
summary(model_logis_train)

## 
## Call:
## glm(formula = battery_power ~ clock_speed + int_memory + ram + 
##     fc + pc + px_height + px_width + m_dep + mobile_wt + sc_h + 
##     sc_w + talk_time, family = "binomial", data = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept)  2.393e-01  3.544e-01   0.675    0.499
## clock_speed  3.723e-02  5.897e-02   0.631    0.528
## int_memory  -1.153e-03  2.650e-03  -0.435    0.664
## ram          3.074e-05  4.430e-05   0.694    0.488
## fc           1.667e-02  1.451e-02   1.149    0.250
## pc          -7.595e-03  1.045e-02  -0.727    0.467
## px_height    7.893e-05  1.270e-04   0.622    0.534
## px_width    -1.005e-04  1.301e-04  -0.773    0.440
## m_dep        6.009e-02  1.667e-01   0.361    0.718
## mobile_wt   -9.384e-04  1.361e-03  -0.689    0.491
## sc_h        -1.387e-02  1.333e-02  -1.040    0.298
## sc_w         5.337e-03  1.273e-02   0.419    0.675
## talk_time    9.187e-03  8.799e-03   1.044    0.296
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2412.8  on 1743  degrees of freedom
## Residual deviance: 2407.3  on 1731  degrees of freedom
## AIC: 2433.3
## 
## Number of Fisher Scoring iterations: 3

model_logis_todas_train<-glm(battery_power~wifi+bluetooth+clock_speed+int_memory+n_cores+ram+dual_sim+three_g+four_g+fc+pc+px_height+px_width+m_dep+mobile_wt+sc_h+sc_w+talk_time+touch_screen+price_range,family = "binomial",data=train)
summary(model_logis_todas_train)

## 
## Call:
## glm(formula = battery_power ~ wifi + bluetooth + clock_speed + 
##     int_memory + n_cores + ram + dual_sim + three_g + four_g + 
##     fc + pc + px_height + px_width + m_dep + mobile_wt + sc_h + 
##     sc_w + talk_time + touch_screen + price_range, family = "binomial", 
##     data = train)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    3.7114671  0.4905636   7.566 3.86e-14 ***
## wifi1          0.1378924  0.1122870   1.228   0.2194    
## bluetooth1    -0.0027333  0.1123230  -0.024   0.9806    
## clock_speed    0.0651206  0.0684030   0.952   0.3411    
## int_memory    -0.0056195  0.0031083  -1.808   0.0706 .  
## n_cores       -0.0115227  0.0282517  -0.408   0.6834    
## ram           -0.0031437  0.0001805 -17.416  < 2e-16 ***
## dual_sim1     -0.0804286  0.1123194  -0.716   0.4739    
## three_g1      -0.2081169  0.1616901  -1.287   0.1980    
## four_g1        0.0787268  0.1368448   0.575   0.5651    
## fc             0.0131715  0.0168022   0.784   0.4331    
## pc            -0.0117594  0.0120529  -0.976   0.3292    
## px_height     -0.0008664  0.0001562  -5.547 2.91e-08 ***
## px_width      -0.0010118  0.0001580  -6.406 1.50e-10 ***
## m_dep          0.0822849  0.1947011   0.423   0.6726    
## mobile_wt      0.0018110  0.0015828   1.144   0.2526    
## sc_h          -0.0204382  0.0154583  -1.322   0.1861    
## sc_w           0.0110005  0.0149421   0.736   0.4616    
## talk_time      0.0046901  0.0103091   0.455   0.6491    
## touch_screen1 -0.0879981  0.1118512  -0.787   0.4314    
## price_range1   3.6093819  0.2439091  14.798  < 2e-16 ***
## price_range2   6.4943066  0.3831026  16.952  < 2e-16 ***
## price_range3  10.2702566  0.5604947  18.324  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2412.8  on 1743  degrees of freedom
## Residual deviance: 1912.5  on 1721  degrees of freedom
## AIC: 1958.5
## 
## Number of Fisher Scoring iterations: 4

Predicciones

predicciones_modlogttrain<-predict(object=model_logis_todas_train,type = "response",test)
head(predicciones_modlogttrain)

##         1         2         3         4         5         6 
## 0.9327126 0.6636054 0.5679562 0.3224541 0.7221850 0.1787516

En este caso predijo la 7,12,17 y 32 como Alto, en cambio la 26 y la 35 como Bajo.

prediccion_test <- data.frame(probabilidad = predicciones_modlogttrain,
                         clase = rep(NA,length(predicciones_modlogttrain)))
prediccion_test[prediccion_test$probabilidad <= 0.5,"clase"] <- "Bajo"
prediccion_test[prediccion_test$probabilidad > 0.5,"clase"] <- "Alto"
head(prediccion_test,15)

##    probabilidad clase
## 1    0.93271258  Alto
## 2    0.66360544  Alto
## 3    0.56795623  Alto
## 4    0.32245406  Bajo
## 5    0.72218501  Alto
## 6    0.17875163  Bajo
## 7    0.62627327  Alto
## 8    0.56533020  Alto
## 9    0.04997839  Bajo
## 10   0.69765739  Alto
## 11   0.64302943  Alto
## 12   0.16311480  Bajo
## 13   0.67598324  Alto
## 14   0.90902138  Alto
## 15   0.82507427  Alto

Resumen de los resultados

table(clase_predicha = prediccion_test$clase, clase_real = test$battery_power)

##               clase_real
## clase_predicha Bajo Alto
##           Alto   41   92
##           Bajo  100   23

Para el caso de regresión logística con data entrenada y teniendo en cuenta todas las variables, tenemos: 100 casos en Bajo que fueron clasificadas como Bajo, es decir Verdaderos Positivos (VP), 23 Casos Altos que fueron clasificados como Bajo, son Falsos Negativos (FN), 41 casos Bajos que fueron clasificados como Alto, son Falsos Positivos (FP), y 92 casos Altos que fueron clasificados como Alto, entonces son Verdaderos Negativos (VN).

LDA todas train

Lo va a comparar con regresión logistica entonces se utilizan todas las covariables

mod_lda_todo<-lda(battery_power~wifi+bluetooth+clock_speed+int_memory+n_cores+ram+dual_sim+three_g+four_g+fc+pc+px_height+px_width+m_dep+mobile_wt+sc_h+sc_w+talk_time+touch_screen+price_range,data=train)
mod_lda_todo

## Call:
## lda(battery_power ~ wifi + bluetooth + clock_speed + int_memory + 
##     n_cores + ram + dual_sim + three_g + four_g + fc + pc + px_height + 
##     px_width + m_dep + mobile_wt + sc_h + sc_w + talk_time + 
##     touch_screen + price_range, data = train)
## 
## Prior probabilities of groups:
##      Bajo      Alto 
## 0.4736239 0.5263761 
## 
## Group means:
##          wifi1 bluetooth1 clock_speed int_memory  n_cores      ram dual_sim1
## Bajo 0.5024213  0.4866828    1.511259   32.25182 4.037530 2109.636 0.5302663
## Alto 0.5152505  0.4901961    1.534641   31.90523 3.984749 2143.161 0.4934641
##       three_g1   four_g1       fc      pc px_height px_width     m_dep
## Bajo 0.7663438 0.5145278 4.222760 9.90678  643.7857 1254.410 0.4989104
## Alto 0.7657952 0.5283224 4.405229 9.91939  647.5131 1243.646 0.5037037
##      mobile_wt     sc_h     sc_w talk_time touch_screen1 price_range1
## Bajo  140.9685 12.43826 5.823245  10.85593     0.5048426    0.2602906
## Alto  139.9401 12.24292 5.797386  11.11438     0.4912854    0.2450980
##      price_range2 price_range3
## Bajo     0.248184    0.1767554
## Alto     0.245098    0.3213508
## 
## Coefficients of linear discriminants:
##                         LD1
## wifi1          0.1371019842
## bluetooth1    -0.0100782969
## clock_speed    0.0599295855
## int_memory    -0.0050114295
## n_cores       -0.0120390005
## ram           -0.0027540154
## dual_sim1     -0.0625504195
## three_g1      -0.1678620327
## four_g1        0.0703118053
## fc             0.0114605637
## pc            -0.0107097038
## px_height     -0.0007814384
## px_width      -0.0009312696
## m_dep          0.0560532302
## mobile_wt      0.0017481166
## sc_h          -0.0169126068
## sc_w           0.0081952887
## talk_time      0.0040051377
## touch_screen1 -0.0813228221
## price_range1   3.1616008448
## price_range2   5.6962670055
## price_range3   8.9756348811

Prior probabilities of groupos

La probabilidad priori se estima de observaciones de las proporciones que hay en cada clase. En este caso los los valores Bajo son 0.47, y Alto 0.53, continúan siendo muy parecidos.

Observamos predicciones:

predicciones_ldat<-predict(mod_lda_todo,test)
head(predicciones_ldat$class,n=5)

## [1] Alto Alto Alto Bajo Alto
## Levels: Bajo Alto

head(predicciones_ldat$posterior,n=5)

##         Bajo      Alto
## 1 0.06506993 0.9349301
## 2 0.33578743 0.6642126
## 3 0.44366955 0.5563305
## 4 0.68012800 0.3198720
## 5 0.27294452 0.7270555

Tenemos el primero, segundo y tercero como alto, el cuarto es bajo y el quinto alto.

Matriz de confusión

table(Clase_predicha=predicciones_ldat$class,clase_real=test$battery_power)

##               clase_real
## Clase_predicha Bajo Alto
##           Bajo  103   23
##           Alto   38   92

Para el caso de análisis lineal discriminante al tener en cuenta todas las variables, tenemos: 103, casos Bajos que fueron clasificados como Bajo es decir son Verdaderos Positivos (VP), 23 Casos que son Altos pero los clasifico como Bajos, son Falsos Negativos (FN), 38 casos Bajos que los clasifico como Altos, son Falsos Positivos (FP) y 92 Casos Altos que fueron clasificados como Alto, entonces son Verdaderos Negativos (VN).

paste( "% de acierto:", mean(prediccion_test$clase == test$battery_power))

## [1] "% de acierto: 0.75"

El resultado de porcentaje de acierto es de 0.75.

paste( "% de error:", mean(prediccion_test$clase != test$battery_power))

## [1] "% de error: 0.25"

El resultado de porcentaje de error es de 0.25.

paste( "% de acierto:", mean(predicciones_ldat$class == test$battery_power))

## [1] "% de acierto: 0.76171875"

El resultado de porcentaje de acierto es de 0.76

paste( "% de error:", mean(predicciones_ldat$class != test$battery_power))

## [1] "% de error: 0.23828125"

El resultado de porcentaje de error es de 0.26

test$prediccion2 <- predicciones_ldat$class
test$acierto <- ifelse(test = test$battery_power == test$prediccion2,
                            yes = "Si", no = "No")
ggplot(data = test, aes(x = px_height, y = px_width, color = battery_power,
                             shape = acierto)) +
  geom_point() +
  scale_shape_manual(values = c("No" = 4, "Si" = 19 )) + 
  theme_bw()

No es tan bueno, porque el porcentaje de error es 0.23

Quadratic Discriminant Analysis (QDA)

mod_qda<-qda(battery_power~bluetooth+wifi+clock_speed+int_memory+n_cores+ram+dual_sim+three_g+four_g+fc+pc+px_height+px_width+m_dep+mobile_wt+sc_h+sc_w+talk_time+touch_screen+price_range,data=train)
mod_qda

## Call:
## qda(battery_power ~ bluetooth + wifi + clock_speed + int_memory + 
##     n_cores + ram + dual_sim + three_g + four_g + fc + pc + px_height + 
##     px_width + m_dep + mobile_wt + sc_h + sc_w + talk_time + 
##     touch_screen + price_range, data = train)
## 
## Prior probabilities of groups:
##      Bajo      Alto 
## 0.4736239 0.5263761 
## 
## Group means:
##      bluetooth1     wifi1 clock_speed int_memory  n_cores      ram dual_sim1
## Bajo  0.4866828 0.5024213    1.511259   32.25182 4.037530 2109.636 0.5302663
## Alto  0.4901961 0.5152505    1.534641   31.90523 3.984749 2143.161 0.4934641
##       three_g1   four_g1       fc      pc px_height px_width     m_dep
## Bajo 0.7663438 0.5145278 4.222760 9.90678  643.7857 1254.410 0.4989104
## Alto 0.7657952 0.5283224 4.405229 9.91939  647.5131 1243.646 0.5037037
##      mobile_wt     sc_h     sc_w talk_time touch_screen1 price_range1
## Bajo  140.9685 12.43826 5.823245  10.85593     0.5048426    0.2602906
## Alto  139.9401 12.24292 5.797386  11.11438     0.4912854    0.2450980
##      price_range2 price_range3
## Bajo     0.248184    0.1767554
## Alto     0.245098    0.3213508

REVISAR

Volume no tiene el mismo comportamiento que los otros, ya que es 1 y 1 es decir no me esta capturando nada

Predicciones de la técnica

predicciones_tec_qda<-predict(mod_qda,test)
head(predicciones_tec_qda$class)

## [1] Alto Alto Alto Bajo Alto Bajo
## Levels: Bajo Alto

head(predicciones_tec_qda$posterior)

##        Bajo      Alto
## 1 0.1163200 0.8836800
## 2 0.1905141 0.8094859
## 3 0.1300792 0.8699208
## 4 0.7565111 0.2434889
## 5 0.4369328 0.5630672
## 6 0.6500333 0.3499667

En este caso, nos indica que la probabilidad del primer dato es alto con una probabilidad de 0.88, el segundo en alto de 0.81, en alto de 0.87, en bajo de 0.76, en alto 0.57 y en bajo de 0.65.

table(Clase_predicha=predicciones_tec_qda$class,clase_real=test$battery_power)

##               clase_real
## Clase_predicha Bajo Alto
##           Bajo   97   34
##           Alto   44   81

Para el caso de regresión logística al tener en cuenta todas las variables, tenemos: 97, casos Bajos que fueron clasificados como Bajo es decir son Verdaderos Positivos (VP), 34 Casos Altos que fueron clasificados como Bajo, son Falsos Negativos (FN), 44 casos Bajos que fueron clasificados como Alto, son Falsos Positivos (FP),81 Casos Altos que fueron clasificados como Alto, entonces son Verdaderos Negativos (VN).

paste( "% de acierto:", mean(predicciones_tec_qda$class == test$battery_power))

## [1] "% de acierto: 0.6953125"

El resultado de porcentaje de acierto es de 0.70.

paste( "% de error:", mean(predicciones_tec_qda$class != test$battery_power))

## [1] "% de error: 0.3046875"

El resultado de porcentaje de error es de 0.30.

KNN

library(class)

#set.seed(123)
#cl = train[,"battery_power"]
#prediccion_knn <- knn(train = train[, c( "clock_speed","int_memory","ram","fc","pc","px_height","px_width","m_dep","mobile_wt","sc_h","sc_w","talk_time")],
#                      test = test[, c( "clock_speed","int_memory","ram","fc","pc","px_height","px_width","m_dep","mobile_wt","sc_h","sc_w","talk_time")],
#                      cl = train[,"battery_power"], k = 2 )
#head(prediccion_knn)
#View(train)
#View(cl)
#dim(train[,"battery_power"])

#View(train[,"battery_power"])

#dim(train)

#set.seed(1)
#prediccion_knn <- knn(train = train[, c("Lag1", "Lag2")],
#                      test = test[, c("Lag1", "Lag2")] ,
#                      cl = train[,"Direction"], k = 3 )

#head(prediccion_knn)

#table(clase_predicha = prediccion_knn, clase_real = test$battery_power)

#set.seed(1)
#prediccion_knn2 <- knn(train = train[, c("bluetooth+clock_speed+int_memory+n_cores+ram+dual_sim+three_g+four_g+fc+pc+px_height+px_width+m_dep+mobile_wt+sc_h+sc_w+talk_time+touch_screen+price_range")],
#                      test = test[,c("bluetooth+clock_speed+int_memory+n_cores+ram+dual_sim+three_g+four_g+fc+pc+px_height+px_width+m_dep+mobile_wt+sc_h+sc_w+talk_time+touch_screen+price_range")] ,
#                      cl = train[,"battery_power"], k = 3 )
#head(prediccion_knn2)

#table(clase_predicha = prediccion_knn2, clase_real = test$battery_power)

Tercer corte proyecto

Jenny

2025-05-22