setwd("C:/Users/mabra/Downloads")
Life_exp <- read.csv(file = "Life Expectancy Data.csv", header = TRUE, fileEncoding = "ISO-8859-1")
df <- data.frame(Life_exp)

Este análisis se enfocará en cualquier país pero por separado, es decir se analizará un país en específico. En el código simplemente se cambia el nombre de la siguiente variable:

pais <- "Algeria"

Se parte la base de datos de manera que solo se usan los datos del pais elegido

df <- df[df$Country == pais,]
df
##    Country Year     Status Life.expectancy Adult.Mortality infant.deaths
## 33 Algeria 2015 Developing            75.6              19            21
## 34 Algeria 2014 Developing            75.4              11            21
## 35 Algeria 2013 Developing            75.3             112            21
## 36 Algeria 2012 Developing            75.1             113            21
## 37 Algeria 2011 Developing            74.9             116            21
## 38 Algeria 2010 Developing            74.7             119            21
## 39 Algeria 2009 Developing            74.4             123            20
## 40 Algeria 2008 Developing            74.1             126            20
## 41 Algeria 2007 Developing            73.8             129            20
## 42 Algeria 2006 Developing            73.4             132            20
## 43 Algeria 2005 Developing            72.9             136            19
## 44 Algeria 2004 Developing            72.3              14            19
## 45 Algeria 2003 Developing            71.7             146            20
## 46 Algeria 2002 Developing            71.6             145            20
## 47 Algeria 2001 Developing            71.4             145            20
## 48 Algeria 2000 Developing            71.3             145            21
##    Alcohol percentage.expenditure Hepatitis.B Measles  BMI under.five.deaths
## 33      NA               0.000000          95      63 59.5                24
## 34    0.01              54.237318          95       0 58.4                24
## 35    0.53             544.450743          95      25 57.2                24
## 36    0.66             555.926083          95      18 56.1                24
## 37    0.56             509.002041          95     112 55.0                24
## 38    0.45             430.717586          95     103 53.9                24
## 39    0.50             352.063642          94     107 52.8                23
## 40    0.46              43.087173          91     217 51.8                23
## 41    0.44             320.323924           9       0  5.8                23
## 42    0.36             270.240196           8     944 49.8                23
## 43    0.50               2.548923          83    2302 48.9                22
## 44    0.45             220.393699          81    3289 47.9                23
## 45    0.34              25.018523          NA   15374 47.0                23
## 46    0.36             148.511984          NA    5862 46.1                23
## 47    0.23             147.986071          NA    2686 45.3                24
## 48    0.25             154.455944          NA       0 44.4                25
##    Polio Total.expenditure Diphtheria HIV.AIDS        GDP Population
## 33    95                NA         95      0.1 4132.76292   39871528
## 34    95              7.21         95      0.1  547.85170   39113313
## 35    95              7.12         95      0.1 5471.86677   38338562
## 36    95              6.14         95      0.1 5564.82566   37565847
## 37    95              5.29         95      0.1 5432.25230   36819558
## 38    95              5.12         95      0.1 4463.39467   36117637
## 39    94              5.36         95      0.1 3868.83123    3546576
## 40    92              4.20         93      0.1  495.25487    3486715
## 41    95              3.82         95      0.1 3935.18334      34376
## 42    95              3.36         95      0.1 3464.61790   33777915
## 43    88              3.24         88      0.1   31.12238   33288437
## 44    86              3.54         86      0.1 2598.98230    3283196
## 45    87              3.60         87      0.1  294.33556    3243514
## 46    86              3.73         86      0.1 1774.33673    3199546
## 47    89              3.84         89      0.1 1732.85798   31592153
## 48    86              3.49         86      0.1 1757.17797    3118366
##    thinness..1.19.years thinness.5.9.years Income.composition.of.resources
## 33                  6.0                5.8                           0.743
## 34                  6.0                5.8                           0.741
## 35                  5.9                5.8                           0.737
## 36                  5.9                5.8                           0.732
## 37                  5.9                5.8                           0.724
## 38                  5.9                5.8                           0.714
## 39                  6.0                5.9                           0.705
## 40                  6.0                5.9                           0.697
## 41                  6.0                5.9                           0.690
## 42                  6.1                6.0                           0.686
## 43                  6.1                6.0                           0.680
## 44                  6.2                6.1                           0.673
## 45                  6.3                6.1                           0.663
## 46                  6.3                6.2                           0.653
## 47                  6.4                6.3                           0.644
## 48                  6.5                6.4                           0.636
##    Schooling
## 33      14.4
## 34      14.4
## 35      14.4
## 36      14.4
## 37      14.0
## 38      13.6
## 39      13.1
## 40      12.6
## 41      12.3
## 42      12.3
## 43      12.0
## 44      11.7
## 45      11.5
## 46      11.1
## 47      10.9
## 48      10.7

Una vez que tengamos nuestros datos, se llenan los valores faltantes con la media de las observaciones, de esta manera poner esos valores no hará tanta diferencia en el modelo.

df$Alcohol[is.na(df$Alcohol)] <- mean(df$Alcohol, na.rm = TRUE)
df$Life.expectancy[is.na(df$Life.expectancy)] <- mean(df$Life.expectancy, na.rm = TRUE)
df$Adult.Mortality[is.na(df$Adult.Mortality)] <- mean(df$Adult.Mortality, na.rm = TRUE)
df$infant.deaths[is.na(df$infant.deaths)] <- mean(df$infant.deaths, na.rm = TRUE)
df$percentage.expenditure[is.na(df$percentage.expenditure)] <- mean(df$percentage.expenditure, na.rm = TRUE)
df$Hepatitis.B[is.na(df$Hepatitis.B)] <- mean(df$Hepatitis.B, na.rm = TRUE)
df$Measles[is.na(df$Measles)] <- mean(df$Measles, na.rm = TRUE)
df$BMI[is.na(df$BMI)] <- mean(df$BMI, na.rm = TRUE)
df$under.five.deaths[is.na(df$under.five.deaths)] <- mean(df$under.five.deaths, na.rm = TRUE)
df$Polio[is.na(df$Polio)] <- mean(df$Polio, na.rm = TRUE)
df$Total.expenditure[is.na(df$Total.expenditure)] <- mean(df$Total.expenditure, na.rm = TRUE)
df$Diphtheria[is.na(df$Diphtheria)] <- mean(df$Diphtheria, na.rm = TRUE)
df$HIV.AIDS[is.na(df$HIV.AIDS)] <- mean(df$HIV.AIDS, na.rm = TRUE)
df$GDP[is.na(df$GDP)] <- mean(df$GDP, na.rm = TRUE)
df$Population[is.na(df$Population)] <- mean(df$Population, na.rm = TRUE)
df$thinness..1.19.years[is.na(df$thinness..1.19.years)] <- mean(df$thinness..1.19.years, na.rm = TRUE)
df$thinness.5.9.years[is.na(df$thinness.5.9.years)] <- mean(df$thinness.5.9.years, na.rm = TRUE)
df$Income.composition.of.resources[is.na(df$Income.composition.of.resources)] <- mean(df$Income.composition.of.resources, na.rm = TRUE)
df$Schooling[is.na(df$Schooling)] <- mean(df$Schooling, na.rm = TRUE)
# Matriz de correlación para ver las relaciones entre las variables
cor_matrix <- cor(df[,-c(1,2,3,16)]) # Excluir la variable categórica 'HVI.AIDS porque tiene NAs'
hm <- heatmap(cor_matrix, main = "Matriz de correlación", keep.dendro = TRUE)

Vemos que las variables que más se relacionan con la esperanza de vida es Schooling, polio, income of resources, diphtheria y total espenditure. Por eso haremos un modelo tomando en cuenta solo esas variables. Además en el siguiente modelo qu toma en cuenta todas las variables podemos ver como hay muchos coeficientes que son cero o inclusive NAs.

Modelo completo con todas las variables

(complete_model <- lm(Life.expectancy ~ ., data = df[,-c(1,2,3,16)]))
## 
## Call:
## lm(formula = Life.expectancy ~ ., data = df[, -c(1, 2, 3, 16)])
## 
## Coefficients:
##                     (Intercept)                  Adult.Mortality  
##                       1.288e+02                        1.635e-03  
##                   infant.deaths                          Alcohol  
##                       1.806e-01                       -2.086e-02  
##          percentage.expenditure                      Hepatitis.B  
##                      -3.617e-04                       -1.297e-02  
##                         Measles                              BMI  
##                      -1.202e-04                       -3.917e-03  
##               under.five.deaths                            Polio  
##                       1.655e-01                       -1.540e+00  
##               Total.expenditure                       Diphtheria  
##                       2.046e-01                        1.313e+00  
##                             GDP                       Population  
##                       1.962e-04                        2.279e-08  
##            thinness..1.19.years               thinness.5.9.years  
##                       1.009e+01                       -1.743e+01  
## Income.composition.of.resources                        Schooling  
##                              NA                               NA

El modelo con las variables que nos interesan es

df1<-(df[c("Polio","Life.expectancy", "Income.composition.of.resources", "Diphtheria", "Schooling", "Total.expenditure")])
(complete_model1 <- lm(Life.expectancy ~ ., data = df1))
## 
## Call:
## lm(formula = Life.expectancy ~ ., data = df1)
## 
## Coefficients:
##                     (Intercept)                            Polio  
##                        43.00301                         -0.29363  
## Income.composition.of.resources                       Diphtheria  
##                        30.21477                          0.37558  
##                       Schooling                Total.expenditure  
##                         0.18202                         -0.05629

Otra variante del modelo será transformar las variables: log(toal expenditure), schooling^2 además de multiplicar estas dos variables porque creo que tienen mucha relación y se nota en la mariz de correlaciones.

complete_model2 <- lm(Life.expectancy ~ .+ log(Total.expenditure) +  I(Schooling^2) + Total.expenditure:Schooling, data = df1)
complete_model2
## 
## Call:
## lm(formula = Life.expectancy ~ . + log(Total.expenditure) + I(Schooling^2) + 
##     Total.expenditure:Schooling, data = df1)
## 
## Coefficients:
##                     (Intercept)                            Polio  
##                       46.380568                        -0.358622  
## Income.composition.of.resources                       Diphtheria  
##                       33.744644                         0.448282  
##                       Schooling                Total.expenditure  
##                       -0.474480                        -1.718097  
##          log(Total.expenditure)                   I(Schooling^2)  
##                        1.881013                         0.006763  
##     Schooling:Total.expenditure  
##                        0.092440

Una vez que tengamos el modelo podemos hacer modelos. Primero haré dos conjutos uno de entrenamiento (80%) y otro de prueba(20%) para al final saber qué modelo es mejor.

set.seed(123)
train_prop<-.8 #80%
(n<-nrow(df1))
## [1] 16
n_train<-round(train_prop*n)
train_index<-sample(1:n,n_train)

train_dt<-df1[train_index,]
test_dt<-df1[-train_index,]
str(df1)
## 'data.frame':    16 obs. of  6 variables:
##  $ Polio                          : num  95 95 95 95 95 95 94 92 95 95 ...
##  $ Life.expectancy                : num  75.6 75.4 75.3 75.1 74.9 74.7 74.4 74.1 73.8 73.4 ...
##  $ Income.composition.of.resources: num  0.743 0.741 0.737 0.732 0.724 0.714 0.705 0.697 0.69 0.686 ...
##  $ Diphtheria                     : num  95 95 95 95 95 95 95 93 95 95 ...
##  $ Schooling                      : num  14.4 14.4 14.4 14.4 14 13.6 13.1 12.6 12.3 12.3 ...
##  $ Total.expenditure              : num  4.6 7.21 7.12 6.14 5.29 ...

Valido resultados con algoritmo stepwise

# 1. Modelo base vacío
modelo_base <- lm(Life.expectancy ~ 1, data = train_dt)

# 2. Scope: todo lo que puede entrar al modelo
scope <- lm(Life.expectancy ~ . + log(Total.expenditure) +  I(Schooling^2) + Total.expenditure:Schooling, data = train_dt)

### a) Forward Stepwise
modelo_forward <- step(modelo_base, scope = formula(scope), direction = "forward", test = "F", trace = FALSE)

### b) Backward Stepwise
modelo_backward <- step(scope, direction = "backward", test = "F", trace = FALSE)

### c) Método combinado (forward y backward)
modelo_combined <- step(scope, direction = "both", test = "F", trace = FALSE)

# Resumen y gráficos de los modelos
summary(modelo_forward)
## 
## Call:
## lm(formula = Life.expectancy ~ Income.composition.of.resources + 
##     Diphtheria + Polio, data = train_dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.16005 -0.08369 -0.03507  0.04712  0.22984 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      42.2638     0.9106  46.412 5.01e-12 ***
## Income.composition.of.resources  36.3016     2.0992  17.293 3.26e-08 ***
## Diphtheria                        0.2931     0.1467   1.998   0.0768 .  
## Polio                            -0.2266     0.1522  -1.489   0.1707    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1374 on 9 degrees of freedom
## Multiple R-squared:  0.9947, Adjusted R-squared:  0.9929 
## F-statistic: 563.8 on 3 and 9 DF,  p-value: 1.474e-10
summary(modelo_backward)
## 
## Call:
## lm(formula = Life.expectancy ~ Income.composition.of.resources + 
##     Schooling + Total.expenditure + log(Total.expenditure) + 
##     I(Schooling^2) + Schooling:Total.expenditure, data = train_dt)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.159700 -0.047238 -0.003186  0.051823  0.165763 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      54.41245    6.05256   8.990 0.000106 ***
## Income.composition.of.resources -18.39322   12.95792  -1.419 0.205570    
## Schooling                         1.44286    0.95190   1.516 0.180361    
## Total.expenditure                14.56445    3.63281   4.009 0.007045 ** 
## log(Total.expenditure)          -19.68431    5.25768  -3.744 0.009578 ** 
## I(Schooling^2)                    0.13992    0.06227   2.247 0.065706 .  
## Schooling:Total.expenditure      -0.78368    0.19193  -4.083 0.006478 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1119 on 6 degrees of freedom
## Multiple R-squared:  0.9977, Adjusted R-squared:  0.9953 
## F-statistic: 425.8 on 6 and 6 DF,  p-value: 1.281e-07
# Calcular MAPE (Mean Absolute Proportional Error)
library(Metrics)
actual <- test_dt$Life.expectancy
predicted_forward <- predict(modelo_forward, newdata = test_dt)
predicted_backward <- predict(modelo_backward, newdata = test_dt)
predicted_complete <- predict(scope, newdata = test_dt)

mape(actual, predicted_forward)
## [1] 0.003265612
mape(actual, predicted_backward)
## [1] 0.008092704
mape(actual, predicted_complete)
## [1] 0.007709235

Los resultados de MAPE en la regresión múltiple muestran que el modelo predicted_forward, con un error de 0.33%, es el más preciso, indicando que sus predicciones se acercan más a los valores reales en comparación con los otros modelos. El modelo predicted_backward presenta un error mayor, de 0.81%, mientras que el modelo predicted_complete tiene un error de 0.77%. En general, el modelo forward ofrece el mejor desempeño en términos de precisión en este conjunto de datos.