Vamos a leer los datos y examinar las primeras líneas:

df <- read_sav("Variables.sav") %>% haven::as_factor() %>%
  mutate(caida=as.factor(numero_caidas>0))

df2 <- df %>% bin()
## Warning in bin(.): 2419 instance(s) removed due to missing values
df 
## # A tibble: 2,831 x 20
##    fecha      centro  edad sexo  estancia_en_cen~ tinetti resultado_tinet~
##    <date>      <dbl> <dbl> <fct>            <dbl>   <dbl> <chr>           
##  1 2017-10-11  82677    75 Homb~               12       1 Riesgo alto de ~
##  2 2017-10-11  82677    73 Homb~                4       1 Riesgo alto de ~
##  3 2017-10-11  42182    89 Homb~                4       1 Riesgo alto de ~
##  4 2017-10-11  34157    89 Homb~                4       1 Riesgo alto de ~
##  5 2017-10-11    835    84 Homb~                3       1 Riesgo alto de ~
##  6 2017-10-11  43166    88 Homb~                3       1 Riesgo alto de ~
##  7 2017-10-11  52822    86 Homb~                3       1 Riesgo alto de ~
##  8 2017-10-11 134344    85 Homb~                2       1 Riesgo alto de ~
##  9 2017-10-11 134347    83 Homb~                9       1 Riesgo alto de ~
## 10 2017-10-11 140716    98 Mujer                1       1 Riesgo alto de ~
## # ... with 2,821 more rows, and 13 more variables: mmse <dbl>,
## #   resultado_mmse <chr>, barthel <dbl>, resultado_barthel <chr>,
## #   tipo_terap_grupal <chr>, num_terap_grupal <dbl>, tipo_ter_indiv <chr>,
## #   num_terap_indiv <dbl>, numero_medicamentos <dbl>, numero_caidas <dbl>,
## #   fumat <dbl>, minutos_fisioterapia_semana <dbl>, caida <fct>

El análisis a hacer corresponde a lo siguiente:

estudié como la estancia en el centro afectaba en el número de caídas (siendo significativa) ajustando por sexo y edad, sigue siendo significativa, teniendo un efecto protector significativo. Lo mismo voy a valorar con las siguientes variables: - Tiempo de estancia en el centro es significativo p=0,016 - FUMAT (escala de calidad de vida) p=0,00 - MMSE (estado cognitivo) p=0,00 - Tinetti (medición riesgo de caídas) - Barthel (independencia para actividades de la vida diaria)

Van ahora los análisis modelo a modelo:

tabla<-df %>%
  finalfit("numero_caidas", c("sexo", "edad","estancia_en_centro"),metrics=TRUE)

  tabla[[1]] %>% knitr::kable(booktabs=T)
Dependent: numero_caidas Mean (sd) Coefficient (univariable) Coefficient (multivariable)
3 sexo Hombre 1.4 (2.4)
4 Mujer 1.4 (2.8) -0.05 (-0.25 to 0.15, p=0.642) -0.01 (-0.22 to 0.19, p=0.901)
1 edad [70,109] 1.4 (2.6) 0.01 (-0.00 to 0.03, p=0.131) 0.01 (-0.00 to 0.03, p=0.143)
2 estancia_en_centro [1,26] 1.4 (2.6) -0.04 (-0.07 to -0.01, p=0.016) -0.04 (-0.07 to -0.01, p=0.017)
x
Number in dataframe = 2831, Number in model = 2831, Missing = 0, Log-likelihood = -6662.8, R-squared = 0.0029, Adjusted r-squared = 0.0018
tabla <- df %>%
  finalfit("numero_caidas", c("sexo", "edad","fumat"),metrics=TRUE)
  tabla[[1]] %>% knitr::kable(booktabs=T)
Dependent: numero_caidas Mean (sd) Coefficient (univariable) Coefficient (multivariable)
3 sexo Hombre 1.4 (2.4)
4 Mujer 1.4 (2.8) -0.05 (-0.25 to 0.15, p=0.642) 0.15 (-0.19 to 0.50, p=0.384)
1 edad [70,109] 1.4 (2.6) 0.01 (-0.00 to 0.03, p=0.131) 0.02 (-0.01 to 0.04, p=0.195)
2 fumat [52,112] 1.6 (2.5) -0.04 (-0.06 to -0.03, p<0.001) -0.05 (-0.06 to -0.03, p<0.001)
x
Number in dataframe = 2831, Number in model = 932, Missing = 1899, Log-likelihood = -2141.89, R-squared = 0.048, Adjusted r-squared = 0.045
tabla <- df %>%
  finalfit("numero_caidas", c("sexo", "edad","mmse"),metrics=TRUE)
  tabla[[1]] %>% knitr::kable(booktabs=T)
Dependent: numero_caidas Mean (sd) Coefficient (univariable) Coefficient (multivariable)
3 sexo Hombre 1.4 (2.4)
4 Mujer 1.4 (2.8) -0.05 (-0.25 to 0.15, p=0.642) 0.07 (-0.14 to 0.29, p=0.505)
1 edad [70,109] 1.4 (2.6) 0.01 (-0.00 to 0.03, p=0.131) 0.01 (-0.00 to 0.03, p=0.123)
2 mmse [-1,30] 1.5 (2.6) -0.05 (-0.06 to -0.04, p<0.001) -0.05 (-0.06 to -0.04, p<0.001)
x
Number in dataframe = 2831, Number in model = 2631, Missing = 200, Log-likelihood = -6212.41, R-squared = 0.026, Adjusted r-squared = 0.025
tabla <- df %>%
  finalfit("numero_caidas", c("sexo", "edad","tinetti"),metrics=TRUE)
  tabla[[1]] %>% knitr::kable(booktabs=T)
Dependent: numero_caidas Mean (sd) Coefficient (univariable) Coefficient (multivariable)
2 sexo Hombre 1.4 (2.4)
3 Mujer 1.4 (2.8) -0.05 (-0.25 to 0.15, p=0.642) -0.00 (-0.21 to 0.20, p=0.980)
1 edad [70,109] 1.4 (2.6) 0.01 (-0.00 to 0.03, p=0.131) 0.01 (-0.01 to 0.02, p=0.292)
4 tinetti [1,28] 1.4 (2.6) -0.03 (-0.04 to -0.01, p=0.001) -0.03 (-0.04 to -0.01, p=0.001)
x
Number in dataframe = 2831, Number in model = 2831, Missing = 0, Log-likelihood = -6660.57, R-squared = 0.0044, Adjusted r-squared = 0.0034
tabla <-df %>%
  finalfit("numero_caidas", c("sexo", "edad","barthel"),metrics=TRUE)
  tabla[[1]] %>% knitr::kable(booktabs=T)
Dependent: numero_caidas Mean (sd) Coefficient (univariable) Coefficient (multivariable)
3 sexo Hombre 1.4 (2.4)
4 Mujer 1.4 (2.8) -0.05 (-0.25 to 0.15, p=0.642) 0.02 (-0.18 to 0.23, p=0.846)
2 edad [70,109] 1.4 (2.6) 0.01 (-0.00 to 0.03, p=0.131) 0.01 (-0.01 to 0.02, p=0.238)
1 barthel [0,100] 1.4 (2.6) -0.01 (-0.01 to -0.01, p<0.001) -0.01 (-0.01 to -0.00, p<0.001)
x
Number in dataframe = 2831, Number in model = 2829, Missing = 2, Log-likelihood = -6651.49, R-squared = 0.008, Adjusted r-squared = 0.007

Por último vamos a ver qué tal va un modelo con todo a la vez:

tabla <- df %>%
  finalfit("numero_caidas", c("sexo", "edad", "estancia_en_centro","fumat", "mmse", "tinetti", "barthel"), metrics=TRUE)
  tabla[[1]] %>% knitr::kable(booktabs=T)
Dependent: numero_caidas Mean (sd) Coefficient (univariable) Coefficient (multivariable)
6 sexo Hombre 1.4 (2.4)
7 Mujer 1.4 (2.8) -0.05 (-0.25 to 0.15, p=0.642) 0.18 (-0.17 to 0.52, p=0.311)
2 edad [70,109] 1.4 (2.6) 0.01 (-0.00 to 0.03, p=0.131) 0.01 (-0.01 to 0.04, p=0.220)
3 estancia_en_centro [1,26] 1.4 (2.6) -0.04 (-0.07 to -0.01, p=0.016) -0.02 (-0.08 to 0.04, p=0.508)
4 fumat [52,112] 1.6 (2.5) -0.04 (-0.06 to -0.03, p<0.001) -0.04 (-0.05 to -0.02, p<0.001)
5 mmse [-1,30] 1.5 (2.6) -0.05 (-0.06 to -0.04, p<0.001) -0.03 (-0.06 to -0.01, p=0.012)
8 tinetti [1,28] 1.4 (2.6) -0.03 (-0.04 to -0.01, p=0.001) -0.02 (-0.05 to 0.01, p=0.202)
1 barthel [0,100] 1.4 (2.6) -0.01 (-0.01 to -0.01, p<0.001) 0.00 (-0.01 to 0.01, p=0.381)
x
Number in dataframe = 2831, Number in model = 932, Missing = 1899, Log-likelihood = -2137.51, R-squared = 0.057, Adjusted r-squared = 0.05

Las estimaciones con todas a la vez no parecen muy buenas. Exploremos las correlaciones entre las explicativas:

df %>% generaTablaCorrelaciones(vNumericas = c("numero_caidas", "estancia_en_centro","edad","fumat", "mmse", "tinetti", "barthel")) %>%
  knitr::kable(booktabs=T)
Variable numero_caidas [01] [02] [03] [04] [05]
[01] estancia_en_centro -0.05*
[02] edad 0.03 -0.01
[03] fumat -0.21*** -0.04 0.05
[04] mmse -0.16*** 0.07*** 0.00 0.52***
[05] tinetti -0.06*** 0.05** -0.14*** 0.17*** 0.25***
[06] barthel -0.09*** 0.10*** -0.08*** 0.33*** 0.46*** 0.51***

Parece que barthel está muy asociada con mmse, tinetti. No deberán estudiarse todas a la vez pues contienen información similar. Probemos análisis alternativos entonces:

df %>%
  finalfit("numero_caidas", c("sexo", "edad", "estancia_en_centro","fumat", "mmse"), metrics=TRUE) %>%
  .[[1]] %>% knitr::kable(booktabs=T)
Dependent: numero_caidas Mean (sd) Coefficient (univariable) Coefficient (multivariable)
5 sexo Hombre 1.4 (2.4)
6 Mujer 1.4 (2.8) -0.05 (-0.25 to 0.15, p=0.642) 0.18 (-0.17 to 0.52, p=0.313)
1 edad [70,109] 1.4 (2.6) 0.01 (-0.00 to 0.03, p=0.131) 0.02 (-0.01 to 0.04, p=0.174)
2 estancia_en_centro [1,26] 1.4 (2.6) -0.04 (-0.07 to -0.01, p=0.016) -0.02 (-0.08 to 0.04, p=0.519)
3 fumat [52,112] 1.6 (2.5) -0.04 (-0.06 to -0.03, p<0.001) -0.04 (-0.05 to -0.02, p<0.001)
4 mmse [-1,30] 1.5 (2.6) -0.05 (-0.06 to -0.04, p<0.001) -0.03 (-0.05 to -0.01, p=0.014)
df %>%
  finalfit("numero_caidas", c("sexo", "edad", "estancia_en_centro", "barthel"), metrics=TRUE) %>%
  .[[1]] %>% knitr::kable(booktabs=T)
Dependent: numero_caidas Mean (sd) Coefficient (univariable) Coefficient (multivariable)
4 sexo Hombre 1.4 (2.4)
5 Mujer 1.4 (2.8) -0.05 (-0.25 to 0.15, p=0.642) 0.03 (-0.18 to 0.23, p=0.777)
2 edad [70,109] 1.4 (2.6) 0.01 (-0.00 to 0.03, p=0.131) 0.01 (-0.01 to 0.02, p=0.232)
3 estancia_en_centro [1,26] 1.4 (2.6) -0.04 (-0.07 to -0.01, p=0.016) -0.03 (-0.07 to -0.00, p=0.046)
1 barthel [0,100] 1.4 (2.6) -0.01 (-0.01 to -0.01, p<0.001) -0.01 (-0.01 to -0.00, p<0.001)

Podría ser útil estudiar la presencia/ausencia de caidas para estas variables, si acaso, agrupando un poco sus valores:

  ggplot(df2, aes(x = edad, fill = caida)) +  geom_bar(position = "fill") +ylab("")

  ggplot(df2, aes(x = sexo, fill = caida)) +  geom_bar(position = "fill") +ylab("")

  ggplot(df2, aes(x = estancia_en_centro, fill = caida)) +  geom_bar(position = "fill") +ylab("")

  ggplot(df2, aes(x = fumat, fill = caida)) +  geom_bar(position = "fill") +ylab("")

  ggplot(df2, aes(x = mmse, fill = caida)) +  geom_bar(position = "fill") +ylab("")

  ggplot(df2, aes(x = tinetti, fill = caida)) +  geom_bar(position = "fill") +ylab("")

  ggplot(df2, aes(x = barthel, fill = caida)) +  geom_bar(position = "fill") +ylab("")

Y ahora vamos a intentar algunos predictores simple de las caidas. Primero con las variables categorizadas:

data <- optbin(formula = caida ~., data = df2 %>% select (-numero_caidas), method = "infogain")
model <- OneR(formula = caida ~., data = data, verbose = TRUE)
## Warning in OneR.data.frame(x = data, ties.method = ties.method, verbose =
## verbose, : data contains unused factor levels
## 
##     Attribute                   Accuracy
## 1 * resultado_mmse              59.71%  
## 2   mmse                        57.52%  
## 3   tipo_ter_indiv              57.28%  
## 4   tinetti                     56.07%  
## 4   fumat                       56.07%  
## 6   edad                        54.13%  
## 7   centro                      53.64%  
## 7   numero_medicamentos         53.64%  
## 9   sexo                        53.16%  
## 9   tipo_terap_grupal           53.16%  
## 9   minutos_fisioterapia_semana 53.16%  
## 12  estancia_en_centro          52.91%  
## 13  num_terap_grupal            52.67%  
## 14  resultado_tinetti           52.43%  
## 14  num_terap_indiv             52.43%  
## 16  barthel                     52.18%  
## 16  resultado_barthel           52.18%  
## 18  fecha                       51.94%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'

Ahora con las variables tal cual vienen:

data <- optbin(formula = caida ~., data = df %>% select (-numero_caidas), method = "infogain")
## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 2419 instance(s) removed due to missing values
model <- OneR(formula = caida ~., data = data, verbose = TRUE)
## 
##     Attribute                   Accuracy
## 1 * mmse                        59.71%  
## 1   resultado_mmse              59.71%  
## 3   fumat                       57.77%  
## 4   tipo_ter_indiv              57.28%  
## 5   tinetti                     56.07%  
## 6   centro                      54.37%  
## 7   edad                        53.64%  
## 7   numero_medicamentos         53.64%  
## 9   sexo                        53.16%  
## 9   tipo_terap_grupal           53.16%  
## 11  num_terap_grupal            52.67%  
## 12  resultado_tinetti           52.43%  
## 13  resultado_barthel           52.18%  
## 14  fecha                       51.94%  
## 14  estancia_en_centro          51.94%  
## 14  barthel                     51.94%  
## 14  num_terap_indiv             51.94%  
## 14  minutos_fisioterapia_semana 51.94%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
summary(model)
## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If mmse = (-1.03,26] then caida = TRUE
## If mmse = (26,30]    then caida = FALSE
## 
## Accuracy:
## 246 of 412 instances classified correctly (60%)
## 
## Contingency table:
##        mmse
## caida   (-1.03,26] (26,30] Sum
##   FALSE        140    * 58 198
##   TRUE       * 188      26 214
##   Sum          328      84 412
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 20, df = 1, p-value = 3e-05
plot(model)

prediction <- predict(model, data)
eval_model(prediction, data)
## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE Sum
##      FALSE    58   26  84
##      TRUE    140  188 328
##      Sum     198  214 412
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.14 0.06 0.20
##      TRUE   0.34 0.46 0.80
##      Sum    0.48 0.52 1.00
## 
## Accuracy:
## 0.6 (246/412)
## 
## Error rate:
## 0.4 (166/412)
## 
## Error rate reduction (vs. base rate):
## 0.16 (p-value = 9e-04)

Vamos a hacer lo mismo,pero excluyendo mmse. Eso hará que la variable de interés sea fumat:

data <- optbin(formula = caida ~., data = df %>% select (-numero_caidas,-mmse,-resultado_mmse), method = "infogain")
## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 2419 instance(s) removed due to missing values
model <- OneR(formula = caida ~., data = data, verbose = TRUE)
## 
##     Attribute                   Accuracy
## 1 * fumat                       57.77%  
## 2   tipo_ter_indiv              57.28%  
## 3   tinetti                     56.07%  
## 4   centro                      54.37%  
## 5   edad                        53.64%  
## 5   numero_medicamentos         53.64%  
## 7   sexo                        53.16%  
## 7   tipo_terap_grupal           53.16%  
## 9   num_terap_grupal            52.67%  
## 10  resultado_tinetti           52.43%  
## 11  resultado_barthel           52.18%  
## 12  fecha                       51.94%  
## 12  estancia_en_centro          51.94%  
## 12  barthel                     51.94%  
## 12  num_terap_indiv             51.94%  
## 12  minutos_fisioterapia_semana 51.94%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
summary(model)
## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If fumat = (53.9,97] then caida = TRUE
## If fumat = (97,111]  then caida = FALSE
## 
## Accuracy:
## 238 of 412 instances classified correctly (58%)
## 
## Contingency table:
##        fumat
## caida   (53.9,97] (97,111] Sum
##   FALSE       145     * 53 198
##   TRUE      * 185       29 214
##   Sum         330       82 412
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 10, df = 1, p-value = 0.001
plot(model)

prediction <- predict(model, data)
eval_model(prediction, data)
## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE Sum
##      FALSE    53   29  82
##      TRUE    145  185 330
##      Sum     198  214 412
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.13 0.07 0.20
##      TRUE   0.35 0.45 0.80
##      Sum    0.48 0.52 1.00
## 
## Accuracy:
## 0.58 (238/412)
## 
## Error rate:
## 0.42 (174/412)
## 
## Error rate reduction (vs. base rate):
## 0.12 (p-value = 0.01)

Ahora excluimos también fumat:

data <- optbin(formula = caida ~., data = df %>% select (-numero_caidas,-mmse,-resultado_mmse, -fumat), method = "infogain")
## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 1540 instance(s) removed due to missing values
model <- OneR(formula = caida ~., data = data, verbose = TRUE)
## 
##     Attribute                   Accuracy
## 1 * barthel                     56.31%  
## 2   centro                      56.08%  
## 3   tinetti                     55.85%  
## 4   resultado_tinetti           55.62%  
## 5   tipo_ter_indiv              55.54%  
## 6   resultado_barthel           55.15%  
## 7   tipo_terap_grupal           53.6%   
## 7   num_terap_grupal            53.6%   
## 9   numero_medicamentos         52.67%  
## 10  num_terap_indiv             52.13%  
## 11  estancia_en_centro          51.98%  
## 12  sexo                        51.9%   
## 13  minutos_fisioterapia_semana 51.51%  
## 14  fecha                       51.12%  
## 14  edad                        51.12%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
summary(model)
## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If barthel = (-0.1,70] then caida = TRUE
## If barthel = (70,100]  then caida = FALSE
## 
## Accuracy:
## 727 of 1291 instances classified correctly (56%)
## 
## Contingency table:
##        barthel
## caida   (-0.1,70] (70,100]  Sum
##   FALSE       245    * 415  660
##   TRUE      * 312      319  631
##   Sum         557      734 1291
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 20, df = 1, p-value = 1e-05
plot(model)

prediction <- predict(model, data)
eval_model(prediction, data)
## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE   415  319  734
##      TRUE    245  312  557
##      Sum     660  631 1291
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.32 0.25 0.57
##      TRUE   0.19 0.24 0.43
##      Sum    0.51 0.49 1.00
## 
## Accuracy:
## 0.56 (727/1291)
## 
## Error rate:
## 0.44 (564/1291)
## 
## Error rate reduction (vs. base rate):
## 0.11 (p-value = 1e-04)

y tipo_ter_indiv también la excluímos:

data <- optbin(formula = caida ~., data = df %>% select (-numero_caidas,-mmse,-resultado_mmse, -fumat, -barthel, -centro), method = "infogain")
## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 1540 instance(s) removed due to missing values
model <- OneR(formula = caida ~., data = data, verbose = TRUE)
## 
##     Attribute                   Accuracy
## 1 * tinetti                     55.85%  
## 2   resultado_tinetti           55.62%  
## 3   tipo_ter_indiv              55.54%  
## 4   resultado_barthel           55.15%  
## 5   tipo_terap_grupal           53.6%   
## 5   num_terap_grupal            53.6%   
## 7   numero_medicamentos         52.67%  
## 8   num_terap_indiv             52.13%  
## 9   estancia_en_centro          51.98%  
## 10  sexo                        51.9%   
## 11  minutos_fisioterapia_semana 51.51%  
## 12  fecha                       51.12%  
## 12  edad                        51.12%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
summary(model)
## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If tinetti = (0.973,20] then caida = TRUE
## If tinetti = (20,28]    then caida = FALSE
## 
## Accuracy:
## 721 of 1291 instances classified correctly (56%)
## 
## Contingency table:
##        tinetti
## caida   (0.973,20] (20,28]  Sum
##   FALSE        232   * 428  660
##   TRUE       * 293     338  631
##   Sum          525     766 1291
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 20, df = 1, p-value = 5e-05
plot(model)

prediction <- predict(model, data)
eval_model(prediction, data)
## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE   428  338  766
##      TRUE    232  293  525
##      Sum     660  631 1291
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.33 0.26 0.59
##      TRUE   0.18 0.23 0.41
##      Sum    0.51 0.49 1.00
## 
## Accuracy:
## 0.56 (721/1291)
## 
## Error rate:
## 0.44 (570/1291)
## 
## Error rate reduction (vs. base rate):
## 0.097 (p-value = 4e-04)
data <- optbin(formula = caida ~., data = df %>% select (-numero_caidas,-mmse,-resultado_mmse, -fumat, -barthel, -centro, -tinetti, -resultado_tinetti, -tipo_ter_indiv, -resultado_barthel, -tipo_terap_grupal, -num_terap_grupal), method = "infogain")
## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 1540 instance(s) removed due to missing values
model <- OneR(formula = caida ~., data = data, verbose = TRUE)
## 
##     Attribute                   Accuracy
## 1 * numero_medicamentos         52.67%  
## 2   num_terap_indiv             52.13%  
## 3   estancia_en_centro          51.98%  
## 4   sexo                        51.9%   
## 5   minutos_fisioterapia_semana 51.51%  
## 6   fecha                       51.12%  
## 6   edad                        51.12%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
summary(model)
## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If numero_medicamentos = (-0.024,7] then caida = FALSE
## If numero_medicamentos = (7,24]     then caida = TRUE
## 
## Accuracy:
## 680 of 1291 instances classified correctly (53%)
## 
## Contingency table:
##        numero_medicamentos
## caida   (-0.024,7] (7,24]  Sum
##   FALSE      * 328    332  660
##   TRUE         279  * 352  631
##   Sum          607    684 1291
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 4, df = 1, p-value = 0.06
plot(model)

prediction <- predict(model, data)
eval_model(prediction, data)
## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE   328  279  607
##      TRUE    332  352  684
##      Sum     660  631 1291
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.25 0.22 0.47
##      TRUE   0.26 0.27 0.53
##      Sum    0.51 0.49 1.00
## 
## Accuracy:
## 0.53 (680/1291)
## 
## Error rate:
## 0.47 (611/1291)
## 
## Error rate reduction (vs. base rate):
## 0.032 (p-value = 0.1)
data <- optbin(formula = caida ~., data = df %>% select (-numero_caidas,-mmse,-resultado_mmse, -fumat, -barthel, -centro, -tinetti, -resultado_tinetti,-tipo_ter_indiv, -resultado_barthel, -tipo_terap_grupal), method = "infogain")
## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 1540 instance(s) removed due to missing values
model <- OneR(formula = caida ~., data = data, verbose = TRUE)
## 
##     Attribute                   Accuracy
## 1 * num_terap_grupal            53.6%   
## 2   numero_medicamentos         52.67%  
## 3   num_terap_indiv             52.13%  
## 4   estancia_en_centro          51.98%  
## 5   sexo                        51.9%   
## 6   minutos_fisioterapia_semana 51.51%  
## 7   fecha                       51.12%  
## 7   edad                        51.12%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'
summary(model)
## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If num_terap_grupal = (-0.006,0] then caida = TRUE
## If num_terap_grupal = (0,6.01]   then caida = FALSE
## 
## Accuracy:
## 692 of 1291 instances classified correctly (54%)
## 
## Contingency table:
##        num_terap_grupal
## caida   (-0.006,0] (0,6.01]  Sum
##   FALSE        155    * 505  660
##   TRUE       * 187      444  631
##   Sum          342      949 1291
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 6, df = 1, p-value = 0.01
plot(model)

prediction <- predict(model, data)
eval_model(prediction, data)
## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE   505  444  949
##      TRUE    155  187  342
##      Sum     660  631 1291
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.39 0.34 0.74
##      TRUE   0.12 0.14 0.26
##      Sum    0.51 0.49 1.00
## 
## Accuracy:
## 0.54 (692/1291)
## 
## Error rate:
## 0.46 (599/1291)
## 
## Error rate reduction (vs. base rate):
## 0.051 (p-value = 0.04)