regresion

Vamos a leer los datos y examinar las primeras líneas:

# Import the SPSS file, pass it through haven::as_factor(), and derive the
# binary outcome `caida` (TRUE when at least one fall was recorded).
df <- read_sav("Variables.sav") %>%
  haven::as_factor() %>%
  mutate(
    caida = as.factor(numero_caidas > 0)
  )

# Binned copy of df, used by the bar charts and the first OneR run.
# NOTE(review): bin() drops every row that has a missing value in any column
# (the rendered warning reports 2419 rows removed out of 2831), so df2 keeps
# only the complete cases -- confirm this is intended.
df2 <- df %>% bin()

## Warning in bin(.): 2419 instance(s) removed due to missing values

df

## # A tibble: 2,831 x 20
##    fecha      centro  edad sexo  estancia_en_cen~ tinetti resultado_tinet~
##    <date>      <dbl> <dbl> <fct>            <dbl>   <dbl> <chr>           
##  1 2017-10-11  82677    75 Homb~               12       1 Riesgo alto de ~
##  2 2017-10-11  82677    73 Homb~                4       1 Riesgo alto de ~
##  3 2017-10-11  42182    89 Homb~                4       1 Riesgo alto de ~
##  4 2017-10-11  34157    89 Homb~                4       1 Riesgo alto de ~
##  5 2017-10-11    835    84 Homb~                3       1 Riesgo alto de ~
##  6 2017-10-11  43166    88 Homb~                3       1 Riesgo alto de ~
##  7 2017-10-11  52822    86 Homb~                3       1 Riesgo alto de ~
##  8 2017-10-11 134344    85 Homb~                2       1 Riesgo alto de ~
##  9 2017-10-11 134347    83 Homb~                9       1 Riesgo alto de ~
## 10 2017-10-11 140716    98 Mujer                1       1 Riesgo alto de ~
## # ... with 2,821 more rows, and 13 more variables: mmse <dbl>,
## #   resultado_mmse <chr>, barthel <dbl>, resultado_barthel <chr>,
## #   tipo_terap_grupal <chr>, num_terap_grupal <dbl>, tipo_ter_indiv <chr>,
## #   num_terap_indiv <dbl>, numero_medicamentos <dbl>, numero_caidas <dbl>,
## #   fumat <dbl>, minutos_fisioterapia_semana <dbl>, caida <fct>

El análisis a hacer corresponde a lo siguiente:

Estudié cómo la estancia en el centro afectaba al número de caídas (siendo significativa); ajustando por sexo y edad sigue siendo significativa, con un efecto protector. Lo mismo voy a valorar con las siguientes variables:

- Tiempo de estancia en el centro (significativo, p=0,016)
- FUMAT (escala de calidad de vida), p<0,001
- MMSE (estado cognitivo), p<0,001
- Tinetti (medición del riesgo de caídas)
- Barthel (independencia para las actividades de la vida diaria)

Van ahora los análisis modelo a modelo:

# Model of the number of falls on length of stay, adjusted for sex and age.
# metrics = TRUE makes finalfit() return the fit metrics as a second element.
tabla <- df %>%
  finalfit(
    "numero_caidas",
    c("sexo", "edad", "estancia_en_centro"),
    metrics = TRUE
  )

# The first list element is the coefficient table. TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
tabla[[1]] %>% knitr::kable(booktabs = TRUE)

	Dependent: numero_caidas		Mean (sd)	Coefficient (univariable)	Coefficient (multivariable)
3	sexo	Hombre	1.4 (2.4)
4		Mujer	1.4 (2.8)	-0.05 (-0.25 to 0.15, p=0.642)	-0.01 (-0.22 to 0.19, p=0.901)
1	edad	[70,109]	1.4 (2.6)	0.01 (-0.00 to 0.03, p=0.131)	0.01 (-0.00 to 0.03, p=0.143)
2	estancia_en_centro	[1,26]	1.4 (2.6)	-0.04 (-0.07 to -0.01, p=0.016)	-0.04 (-0.07 to -0.01, p=0.017)

x
Number in dataframe = 2831, Number in model = 2831, Missing = 0, Log-likelihood = -6662.8, R-squared = 0.0029, Adjusted r-squared = 0.0018

# Model of the number of falls on the FUMAT score, adjusted for sex and age.
# metrics = TRUE makes finalfit() return the fit metrics as a second element.
tabla <- df %>%
  finalfit(
    "numero_caidas",
    c("sexo", "edad", "fumat"),
    metrics = TRUE
  )

# The first list element is the coefficient table. TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
tabla[[1]] %>% knitr::kable(booktabs = TRUE)

	Dependent: numero_caidas		Mean (sd)	Coefficient (univariable)	Coefficient (multivariable)
3	sexo	Hombre	1.4 (2.4)
4		Mujer	1.4 (2.8)	-0.05 (-0.25 to 0.15, p=0.642)	0.15 (-0.19 to 0.50, p=0.384)
1	edad	[70,109]	1.4 (2.6)	0.01 (-0.00 to 0.03, p=0.131)	0.02 (-0.01 to 0.04, p=0.195)
2	fumat	[52,112]	1.6 (2.5)	-0.04 (-0.06 to -0.03, p<0.001)	-0.05 (-0.06 to -0.03, p<0.001)

x
Number in dataframe = 2831, Number in model = 932, Missing = 1899, Log-likelihood = -2141.89, R-squared = 0.048, Adjusted r-squared = 0.045

# Model of the number of falls on the MMSE score, adjusted for sex and age.
# metrics = TRUE makes finalfit() return the fit metrics as a second element.
tabla <- df %>%
  finalfit(
    "numero_caidas",
    c("sexo", "edad", "mmse"),
    metrics = TRUE
  )

# The first list element is the coefficient table. TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
tabla[[1]] %>% knitr::kable(booktabs = TRUE)

	Dependent: numero_caidas		Mean (sd)	Coefficient (univariable)	Coefficient (multivariable)
3	sexo	Hombre	1.4 (2.4)
4		Mujer	1.4 (2.8)	-0.05 (-0.25 to 0.15, p=0.642)	0.07 (-0.14 to 0.29, p=0.505)
1	edad	[70,109]	1.4 (2.6)	0.01 (-0.00 to 0.03, p=0.131)	0.01 (-0.00 to 0.03, p=0.123)
2	mmse	[-1,30]	1.5 (2.6)	-0.05 (-0.06 to -0.04, p<0.001)	-0.05 (-0.06 to -0.04, p<0.001)

x
Number in dataframe = 2831, Number in model = 2631, Missing = 200, Log-likelihood = -6212.41, R-squared = 0.026, Adjusted r-squared = 0.025

# Model of the number of falls on the Tinetti score, adjusted for sex and age.
# metrics = TRUE makes finalfit() return the fit metrics as a second element.
tabla <- df %>%
  finalfit(
    "numero_caidas",
    c("sexo", "edad", "tinetti"),
    metrics = TRUE
  )

# The first list element is the coefficient table. TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
tabla[[1]] %>% knitr::kable(booktabs = TRUE)

	Dependent: numero_caidas		Mean (sd)	Coefficient (univariable)	Coefficient (multivariable)
2	sexo	Hombre	1.4 (2.4)
3		Mujer	1.4 (2.8)	-0.05 (-0.25 to 0.15, p=0.642)	-0.00 (-0.21 to 0.20, p=0.980)
1	edad	[70,109]	1.4 (2.6)	0.01 (-0.00 to 0.03, p=0.131)	0.01 (-0.01 to 0.02, p=0.292)
4	tinetti	[1,28]	1.4 (2.6)	-0.03 (-0.04 to -0.01, p=0.001)	-0.03 (-0.04 to -0.01, p=0.001)

x
Number in dataframe = 2831, Number in model = 2831, Missing = 0, Log-likelihood = -6660.57, R-squared = 0.0044, Adjusted r-squared = 0.0034

# Model of the number of falls on the Barthel index, adjusted for sex and age.
# metrics = TRUE makes finalfit() return the fit metrics as a second element.
tabla <- df %>%
  finalfit(
    "numero_caidas",
    c("sexo", "edad", "barthel"),
    metrics = TRUE
  )

# The first list element is the coefficient table. TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
tabla[[1]] %>% knitr::kable(booktabs = TRUE)

	Dependent: numero_caidas		Mean (sd)	Coefficient (univariable)	Coefficient (multivariable)
3	sexo	Hombre	1.4 (2.4)
4		Mujer	1.4 (2.8)	-0.05 (-0.25 to 0.15, p=0.642)	0.02 (-0.18 to 0.23, p=0.846)
2	edad	[70,109]	1.4 (2.6)	0.01 (-0.00 to 0.03, p=0.131)	0.01 (-0.01 to 0.02, p=0.238)
1	barthel	[0,100]	1.4 (2.6)	-0.01 (-0.01 to -0.01, p<0.001)	-0.01 (-0.01 to -0.00, p<0.001)

x
Number in dataframe = 2831, Number in model = 2829, Missing = 2, Log-likelihood = -6651.49, R-squared = 0.008, Adjusted r-squared = 0.007

Por último vamos a ver qué tal va un modelo con todo a la vez:

# Joint model: every candidate predictor entered at once, plus sex and age.
# metrics = TRUE makes finalfit() return the fit metrics as a second element.
tabla <- df %>%
  finalfit(
    "numero_caidas",
    c(
      "sexo", "edad", "estancia_en_centro",
      "fumat", "mmse", "tinetti", "barthel"
    ),
    metrics = TRUE
  )

# The first list element is the coefficient table. TRUE is spelled out
# because `T` is an ordinary variable that can be reassigned.
tabla[[1]] %>% knitr::kable(booktabs = TRUE)

	Dependent: numero_caidas		Mean (sd)	Coefficient (univariable)	Coefficient (multivariable)
6	sexo	Hombre	1.4 (2.4)
7		Mujer	1.4 (2.8)	-0.05 (-0.25 to 0.15, p=0.642)	0.18 (-0.17 to 0.52, p=0.311)
2	edad	[70,109]	1.4 (2.6)	0.01 (-0.00 to 0.03, p=0.131)	0.01 (-0.01 to 0.04, p=0.220)
3	estancia_en_centro	[1,26]	1.4 (2.6)	-0.04 (-0.07 to -0.01, p=0.016)	-0.02 (-0.08 to 0.04, p=0.508)
4	fumat	[52,112]	1.6 (2.5)	-0.04 (-0.06 to -0.03, p<0.001)	-0.04 (-0.05 to -0.02, p<0.001)
5	mmse	[-1,30]	1.5 (2.6)	-0.05 (-0.06 to -0.04, p<0.001)	-0.03 (-0.06 to -0.01, p=0.012)
8	tinetti	[1,28]	1.4 (2.6)	-0.03 (-0.04 to -0.01, p=0.001)	-0.02 (-0.05 to 0.01, p=0.202)
1	barthel	[0,100]	1.4 (2.6)	-0.01 (-0.01 to -0.01, p<0.001)	0.00 (-0.01 to 0.01, p=0.381)

x
Number in dataframe = 2831, Number in model = 932, Missing = 1899, Log-likelihood = -2137.51, R-squared = 0.057, Adjusted r-squared = 0.05

Las estimaciones con todas a la vez no parecen muy buenas. Exploremos las correlaciones entre las explicativas:

# Correlation table for the outcome and the numeric predictors.
# generaTablaCorrelaciones() is a helper defined outside this view.
# booktabs uses TRUE because `T` is a reassignable variable.
df %>%
  generaTablaCorrelaciones(
    vNumericas = c(
      "numero_caidas", "estancia_en_centro", "edad",
      "fumat", "mmse", "tinetti", "barthel"
    )
  ) %>%
  knitr::kable(booktabs = TRUE)

Variable	numero_caidas	[01]	[02]	[03]	[04]	[05]
[01] estancia_en_centro	-0.05*
[02] edad	0.03	-0.01
[03] fumat	-0.21***	-0.04	0.05
[04] mmse	-0.16***	0.07***	0.00	0.52***
[05] tinetti	-0.06***	0.05**	-0.14***	0.17***	0.25***
[06] barthel	-0.09***	0.10***	-0.08***	0.33***	0.46***	0.51***

Parece que barthel está bastante asociada con mmse (0.46) y tinetti (0.51), y fumat con mmse (0.52). No deberían estudiarse todas a la vez, pues contienen información similar. Probemos entonces análisis alternativos:

# Reduced model: length of stay, FUMAT and MMSE, adjusted for sex and age
# (tinetti and barthel are left out).
# .[[1]] picks the coefficient table out of the list returned by finalfit().
# booktabs uses TRUE because `T` is a reassignable variable.
df %>%
  finalfit(
    "numero_caidas",
    c("sexo", "edad", "estancia_en_centro", "fumat", "mmse"),
    metrics = TRUE
  ) %>%
  .[[1]] %>%
  knitr::kable(booktabs = TRUE)

	Dependent: numero_caidas		Mean (sd)	Coefficient (univariable)	Coefficient (multivariable)
5	sexo	Hombre	1.4 (2.4)
6		Mujer	1.4 (2.8)	-0.05 (-0.25 to 0.15, p=0.642)	0.18 (-0.17 to 0.52, p=0.313)
1	edad	[70,109]	1.4 (2.6)	0.01 (-0.00 to 0.03, p=0.131)	0.02 (-0.01 to 0.04, p=0.174)
2	estancia_en_centro	[1,26]	1.4 (2.6)	-0.04 (-0.07 to -0.01, p=0.016)	-0.02 (-0.08 to 0.04, p=0.519)
3	fumat	[52,112]	1.6 (2.5)	-0.04 (-0.06 to -0.03, p<0.001)	-0.04 (-0.05 to -0.02, p<0.001)
4	mmse	[-1,30]	1.5 (2.6)	-0.05 (-0.06 to -0.04, p<0.001)	-0.03 (-0.05 to -0.01, p=0.014)

# Reduced model: length of stay and Barthel index, adjusted for sex and age.
# .[[1]] picks the coefficient table out of the list returned by finalfit().
# booktabs uses TRUE because `T` is a reassignable variable.
df %>%
  finalfit(
    "numero_caidas",
    c("sexo", "edad", "estancia_en_centro", "barthel"),
    metrics = TRUE
  ) %>%
  .[[1]] %>%
  knitr::kable(booktabs = TRUE)

	Dependent: numero_caidas		Mean (sd)	Coefficient (univariable)	Coefficient (multivariable)
4	sexo	Hombre	1.4 (2.4)
5		Mujer	1.4 (2.8)	-0.05 (-0.25 to 0.15, p=0.642)	0.03 (-0.18 to 0.23, p=0.777)
2	edad	[70,109]	1.4 (2.6)	0.01 (-0.00 to 0.03, p=0.131)	0.01 (-0.01 to 0.02, p=0.232)
3	estancia_en_centro	[1,26]	1.4 (2.6)	-0.04 (-0.07 to -0.01, p=0.016)	-0.03 (-0.07 to -0.00, p=0.046)
1	barthel	[0,100]	1.4 (2.6)	-0.01 (-0.01 to -0.01, p<0.001)	-0.01 (-0.01 to -0.00, p<0.001)

Podría ser útil estudiar la presencia/ausencia de caídas para estas variables, si acaso, agrupando un poco sus valores:

  # One stacked bar chart per predictor in df2, filled by `caida`.
  # position = "fill" rescales every bar to the same height, so the bars show
  # proportions rather than counts; the y-axis label is blanked.
  ggplot(df2, aes(x = edad, fill = caida)) +
    geom_bar(position = "fill") +
    ylab("")

  ggplot(df2, aes(x = sexo, fill = caida)) +
    geom_bar(position = "fill") +
    ylab("")

  ggplot(df2, aes(x = estancia_en_centro, fill = caida)) +
    geom_bar(position = "fill") +
    ylab("")

  ggplot(df2, aes(x = fumat, fill = caida)) +
    geom_bar(position = "fill") +
    ylab("")

  ggplot(df2, aes(x = mmse, fill = caida)) +
    geom_bar(position = "fill") +
    ylab("")

  ggplot(df2, aes(x = tinetti, fill = caida)) +
    geom_bar(position = "fill") +
    ylab("")

  ggplot(df2, aes(x = barthel, fill = caida)) +
    geom_bar(position = "fill") +
    ylab("")

Y ahora vamos a intentar algunos predictores simples de las caídas. Primero con las variables categorizadas:

# Re-bin the already categorised data against the target `caida`.
# numero_caidas is dropped because `caida` is derived from it.
data <- optbin(
  formula = caida ~ .,
  data = df2 %>% select(-numero_caidas),
  method = "infogain"
)
# Fit the one-rule classifier; verbose = TRUE prints the attribute ranking.
model <- OneR(formula = caida ~ ., data = data, verbose = TRUE)

## Warning in OneR.data.frame(x = data, ties.method = ties.method, verbose =
## verbose, : data contains unused factor levels

## 
##     Attribute                   Accuracy
## 1 * resultado_mmse              59.71%  
## 2   mmse                        57.52%  
## 3   tipo_ter_indiv              57.28%  
## 4   tinetti                     56.07%  
## 4   fumat                       56.07%  
## 6   edad                        54.13%  
## 7   centro                      53.64%  
## 7   numero_medicamentos         53.64%  
## 9   sexo                        53.16%  
## 9   tipo_terap_grupal           53.16%  
## 9   minutos_fisioterapia_semana 53.16%  
## 12  estancia_en_centro          52.91%  
## 13  num_terap_grupal            52.67%  
## 14  resultado_tinetti           52.43%  
## 14  num_terap_indiv             52.43%  
## 16  barthel                     52.18%  
## 16  resultado_barthel           52.18%  
## 18  fecha                       51.94%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'

Ahora con las variables tal cual vienen:

# Bin the raw (unbinned) data against the target `caida`.
# numero_caidas is dropped because `caida` is derived from it.
data <- optbin(
  formula = caida ~ .,
  data = df %>% select(-numero_caidas),
  method = "infogain"
)

## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 2419 instance(s) removed due to missing values

# Fit the one-rule classifier; verbose = TRUE prints the attribute ranking.
model <- OneR(formula = caida ~ ., data = data, verbose = TRUE)

## 
##     Attribute                   Accuracy
## 1 * mmse                        59.71%  
## 1   resultado_mmse              59.71%  
## 3   fumat                       57.77%  
## 4   tipo_ter_indiv              57.28%  
## 5   tinetti                     56.07%  
## 6   centro                      54.37%  
## 7   edad                        53.64%  
## 7   numero_medicamentos         53.64%  
## 9   sexo                        53.16%  
## 9   tipo_terap_grupal           53.16%  
## 11  num_terap_grupal            52.67%  
## 12  resultado_tinetti           52.43%  
## 13  resultado_barthel           52.18%  
## 14  fecha                       51.94%  
## 14  estancia_en_centro          51.94%  
## 14  barthel                     51.94%  
## 14  num_terap_indiv             51.94%  
## 14  minutos_fisioterapia_semana 51.94%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'

summary(model)

## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If mmse = (-1.03,26] then caida = TRUE
## If mmse = (26,30]    then caida = FALSE
## 
## Accuracy:
## 246 of 412 instances classified correctly (60%)
## 
## Contingency table:
##        mmse
## caida   (-1.03,26] (26,30] Sum
##   FALSE        140    * 58 198
##   TRUE       * 188      26 214
##   Sum          328      84 412
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 20, df = 1, p-value = 3e-05

# Plot the fitted OneR model.
plot(model)

# Predict on the same rows the rule was fitted on, so the accuracy reported
# by eval_model() is in-sample.
prediction <- predict(model, newdata = data)
eval_model(prediction, actual = data)

## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE Sum
##      FALSE    58   26  84
##      TRUE    140  188 328
##      Sum     198  214 412
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.14 0.06 0.20
##      TRUE   0.34 0.46 0.80
##      Sum    0.48 0.52 1.00
## 
## Accuracy:
## 0.6 (246/412)
## 
## Error rate:
## 0.4 (166/412)
## 
## Error rate reduction (vs. base rate):
## 0.16 (p-value = 9e-04)

Vamos a hacer lo mismo, pero excluyendo mmse. Eso hará que la variable de interés sea fumat:

# Same binning as before, now also leaving out mmse and resultado_mmse.
data <- optbin(
  formula = caida ~ .,
  data = df %>%
    select(-numero_caidas, -mmse, -resultado_mmse),
  method = "infogain"
)

## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 2419 instance(s) removed due to missing values

# Refit the one-rule classifier on the reduced set of attributes.
model <- OneR(formula = caida ~ ., data = data, verbose = TRUE)

## 
##     Attribute                   Accuracy
## 1 * fumat                       57.77%  
## 2   tipo_ter_indiv              57.28%  
## 3   tinetti                     56.07%  
## 4   centro                      54.37%  
## 5   edad                        53.64%  
## 5   numero_medicamentos         53.64%  
## 7   sexo                        53.16%  
## 7   tipo_terap_grupal           53.16%  
## 9   num_terap_grupal            52.67%  
## 10  resultado_tinetti           52.43%  
## 11  resultado_barthel           52.18%  
## 12  fecha                       51.94%  
## 12  estancia_en_centro          51.94%  
## 12  barthel                     51.94%  
## 12  num_terap_indiv             51.94%  
## 12  minutos_fisioterapia_semana 51.94%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'

summary(model)

## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If fumat = (53.9,97] then caida = TRUE
## If fumat = (97,111]  then caida = FALSE
## 
## Accuracy:
## 238 of 412 instances classified correctly (58%)
## 
## Contingency table:
##        fumat
## caida   (53.9,97] (97,111] Sum
##   FALSE       145     * 53 198
##   TRUE      * 185       29 214
##   Sum         330       82 412
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 10, df = 1, p-value = 0.001

# Plot the fitted OneR model.
plot(model)

# Predict on the same rows the rule was fitted on, so the accuracy reported
# by eval_model() is in-sample.
prediction <- predict(model, newdata = data)
eval_model(prediction, actual = data)

## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE Sum
##      FALSE    53   29  82
##      TRUE    145  185 330
##      Sum     198  214 412
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.13 0.07 0.20
##      TRUE   0.35 0.45 0.80
##      Sum    0.48 0.52 1.00
## 
## Accuracy:
## 0.58 (238/412)
## 
## Error rate:
## 0.42 (174/412)
## 
## Error rate reduction (vs. base rate):
## 0.12 (p-value = 0.01)

Ahora excluimos también fumat:

# Leave out fumat in addition to mmse and resultado_mmse. The rendered
# warning reports fewer rows lost to missing values than in the runs that
# kept fumat (1540 vs 2419).
data <- optbin(
  formula = caida ~ .,
  data = df %>%
    select(-numero_caidas, -mmse, -resultado_mmse, -fumat),
  method = "infogain"
)

## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 1540 instance(s) removed due to missing values

# Refit the one-rule classifier on the reduced set of attributes.
model <- OneR(formula = caida ~ ., data = data, verbose = TRUE)

## 
##     Attribute                   Accuracy
## 1 * barthel                     56.31%  
## 2   centro                      56.08%  
## 3   tinetti                     55.85%  
## 4   resultado_tinetti           55.62%  
## 5   tipo_ter_indiv              55.54%  
## 6   resultado_barthel           55.15%  
## 7   tipo_terap_grupal           53.6%   
## 7   num_terap_grupal            53.6%   
## 9   numero_medicamentos         52.67%  
## 10  num_terap_indiv             52.13%  
## 11  estancia_en_centro          51.98%  
## 12  sexo                        51.9%   
## 13  minutos_fisioterapia_semana 51.51%  
## 14  fecha                       51.12%  
## 14  edad                        51.12%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'

summary(model)

## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If barthel = (-0.1,70] then caida = TRUE
## If barthel = (70,100]  then caida = FALSE
## 
## Accuracy:
## 727 of 1291 instances classified correctly (56%)
## 
## Contingency table:
##        barthel
## caida   (-0.1,70] (70,100]  Sum
##   FALSE       245    * 415  660
##   TRUE      * 312      319  631
##   Sum         557      734 1291
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 20, df = 1, p-value = 1e-05

# Plot the fitted OneR model.
plot(model)

# Predict on the same rows the rule was fitted on, so the accuracy reported
# by eval_model() is in-sample.
prediction <- predict(model, newdata = data)
eval_model(prediction, actual = data)

## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE   415  319  734
##      TRUE    245  312  557
##      Sum     660  631 1291
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.32 0.25 0.57
##      TRUE   0.19 0.24 0.43
##      Sum    0.51 0.49 1.00
## 
## Accuracy:
## 0.56 (727/1291)
## 
## Error rate:
## 0.44 (564/1291)
## 
## Error rate reduction (vs. base rate):
## 0.11 (p-value = 1e-04)

Ahora excluimos también barthel y centro:

# Leave out barthel and centro on top of mmse, resultado_mmse and fumat.
data <- optbin(
  formula = caida ~ .,
  data = df %>%
    select(
      -numero_caidas, -mmse, -resultado_mmse,
      -fumat, -barthel, -centro
    ),
  method = "infogain"
)

## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 1540 instance(s) removed due to missing values

# Refit the one-rule classifier on the reduced set of attributes.
model <- OneR(formula = caida ~ ., data = data, verbose = TRUE)

## 
##     Attribute                   Accuracy
## 1 * tinetti                     55.85%  
## 2   resultado_tinetti           55.62%  
## 3   tipo_ter_indiv              55.54%  
## 4   resultado_barthel           55.15%  
## 5   tipo_terap_grupal           53.6%   
## 5   num_terap_grupal            53.6%   
## 7   numero_medicamentos         52.67%  
## 8   num_terap_indiv             52.13%  
## 9   estancia_en_centro          51.98%  
## 10  sexo                        51.9%   
## 11  minutos_fisioterapia_semana 51.51%  
## 12  fecha                       51.12%  
## 12  edad                        51.12%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'

summary(model)

## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If tinetti = (0.973,20] then caida = TRUE
## If tinetti = (20,28]    then caida = FALSE
## 
## Accuracy:
## 721 of 1291 instances classified correctly (56%)
## 
## Contingency table:
##        tinetti
## caida   (0.973,20] (20,28]  Sum
##   FALSE        232   * 428  660
##   TRUE       * 293     338  631
##   Sum          525     766 1291
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 20, df = 1, p-value = 5e-05

# Plot the fitted OneR model.
plot(model)

# Predict on the same rows the rule was fitted on, so the accuracy reported
# by eval_model() is in-sample.
prediction <- predict(model, newdata = data)
eval_model(prediction, actual = data)

## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE   428  338  766
##      TRUE    232  293  525
##      Sum     660  631 1291
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.33 0.26 0.59
##      TRUE   0.18 0.23 0.41
##      Sum    0.51 0.49 1.00
## 
## Accuracy:
## 0.56 (721/1291)
## 
## Error rate:
## 0.44 (570/1291)
## 
## Error rate reduction (vs. base rate):
## 0.097 (p-value = 4e-04)

# Leave out the mmse, fumat, tinetti and barthel scores, the resultado_*
# columns, centro, both therapy-type columns and num_terap_grupal.
data <- optbin(
  formula = caida ~ .,
  data = df %>%
    select(
      -numero_caidas, -mmse, -resultado_mmse, -fumat,
      -barthel, -centro, -tinetti, -resultado_tinetti,
      -tipo_ter_indiv, -resultado_barthel,
      -tipo_terap_grupal, -num_terap_grupal
    ),
  method = "infogain"
)

## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 1540 instance(s) removed due to missing values

# Refit the one-rule classifier on the remaining attributes.
model <- OneR(formula = caida ~ ., data = data, verbose = TRUE)

## 
##     Attribute                   Accuracy
## 1 * numero_medicamentos         52.67%  
## 2   num_terap_indiv             52.13%  
## 3   estancia_en_centro          51.98%  
## 4   sexo                        51.9%   
## 5   minutos_fisioterapia_semana 51.51%  
## 6   fecha                       51.12%  
## 6   edad                        51.12%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'

summary(model)

## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If numero_medicamentos = (-0.024,7] then caida = FALSE
## If numero_medicamentos = (7,24]     then caida = TRUE
## 
## Accuracy:
## 680 of 1291 instances classified correctly (53%)
## 
## Contingency table:
##        numero_medicamentos
## caida   (-0.024,7] (7,24]  Sum
##   FALSE      * 328    332  660
##   TRUE         279  * 352  631
##   Sum          607    684 1291
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 4, df = 1, p-value = 0.06

# Plot the fitted OneR model.
plot(model)

# Predict on the same rows the rule was fitted on, so the accuracy reported
# by eval_model() is in-sample.
prediction <- predict(model, newdata = data)
eval_model(prediction, actual = data)

## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE   328  279  607
##      TRUE    332  352  684
##      Sum     660  631 1291
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.25 0.22 0.47
##      TRUE   0.26 0.27 0.53
##      Sum    0.51 0.49 1.00
## 
## Accuracy:
## 0.53 (680/1291)
## 
## Error rate:
## 0.47 (611/1291)
## 
## Error rate reduction (vs. base rate):
## 0.032 (p-value = 0.1)

# Leave out the mmse, fumat, tinetti and barthel scores, the resultado_*
# columns, centro and both therapy-type columns. num_terap_grupal is kept
# as a candidate attribute.
data <- optbin(
  formula = caida ~ .,
  data = df %>%
    select(
      -numero_caidas, -mmse, -resultado_mmse, -fumat,
      -barthel, -centro, -tinetti, -resultado_tinetti,
      -tipo_ter_indiv, -resultado_barthel, -tipo_terap_grupal
    ),
  method = "infogain"
)

## Warning in optbin.data.frame(x = data, method = method, na.omit = na.omit):
## 1540 instance(s) removed due to missing values

# Refit the one-rule classifier on the remaining attributes.
model <- OneR(formula = caida ~ ., data = data, verbose = TRUE)

## 
##     Attribute                   Accuracy
## 1 * num_terap_grupal            53.6%   
## 2   numero_medicamentos         52.67%  
## 3   num_terap_indiv             52.13%  
## 4   estancia_en_centro          51.98%  
## 5   sexo                        51.9%   
## 6   minutos_fisioterapia_semana 51.51%  
## 7   fecha                       51.12%  
## 7   edad                        51.12%  
## ---
## Chosen attribute due to accuracy
## and ties method (if applicable): '*'

summary(model)

## 
## Call:
## OneR.formula(formula = caida ~ ., data = data, verbose = TRUE)
## 
## Rules:
## If num_terap_grupal = (-0.006,0] then caida = TRUE
## If num_terap_grupal = (0,6.01]   then caida = FALSE
## 
## Accuracy:
## 692 of 1291 instances classified correctly (54%)
## 
## Contingency table:
##        num_terap_grupal
## caida   (-0.006,0] (0,6.01]  Sum
##   FALSE        155    * 505  660
##   TRUE       * 187      444  631
##   Sum          342      949 1291
## ---
## Maximum in each column: '*'
## 
## Pearson's Chi-squared test:
## X-squared = 6, df = 1, p-value = 0.01

# Plot the fitted OneR model.
plot(model)

# Predict on the same rows the rule was fitted on, so the accuracy reported
# by eval_model() is in-sample.
prediction <- predict(model, newdata = data)
eval_model(prediction, actual = data)

## 
## Confusion matrix (absolute):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE   505  444  949
##      TRUE    155  187  342
##      Sum     660  631 1291
## 
## Confusion matrix (relative):
##           Actual
## Prediction FALSE TRUE  Sum
##      FALSE  0.39 0.34 0.74
##      TRUE   0.12 0.14 0.26
##      Sum    0.51 0.49 1.00
## 
## Accuracy:
## 0.54 (692/1291)
## 
## Error rate:
## 0.46 (599/1291)
## 
## Error rate reduction (vs. base rate):
## 0.051 (p-value = 0.04)

regresion

Lourdes Bujalance

2 de diciembre de 2018