# Read the Life Expectancy dataset from a local copy of the CSV
setwd("C:/Users/mabra/Downloads")
Life_exp <- read.csv(file = "Life Expectancy Data.csv", header = TRUE, fileEncoding = "ISO-8859-1")
df <- data.frame(Life_exp)
This analysis focuses on a single country at a time. To analyze a different country, simply change the value of the following variable:
pais <- "Algeria"
The dataset is then subset so that only the observations for the chosen country are used:
df <- df[df$Country == pais,]
df
## Country Year Status Life.expectancy Adult.Mortality infant.deaths
## 33 Algeria 2015 Developing 75.6 19 21
## 34 Algeria 2014 Developing 75.4 11 21
## 35 Algeria 2013 Developing 75.3 112 21
## 36 Algeria 2012 Developing 75.1 113 21
## 37 Algeria 2011 Developing 74.9 116 21
## 38 Algeria 2010 Developing 74.7 119 21
## 39 Algeria 2009 Developing 74.4 123 20
## 40 Algeria 2008 Developing 74.1 126 20
## 41 Algeria 2007 Developing 73.8 129 20
## 42 Algeria 2006 Developing 73.4 132 20
## 43 Algeria 2005 Developing 72.9 136 19
## 44 Algeria 2004 Developing 72.3 14 19
## 45 Algeria 2003 Developing 71.7 146 20
## 46 Algeria 2002 Developing 71.6 145 20
## 47 Algeria 2001 Developing 71.4 145 20
## 48 Algeria 2000 Developing 71.3 145 21
## Alcohol percentage.expenditure Hepatitis.B Measles BMI under.five.deaths
## 33 NA 0.000000 95 63 59.5 24
## 34 0.01 54.237318 95 0 58.4 24
## 35 0.53 544.450743 95 25 57.2 24
## 36 0.66 555.926083 95 18 56.1 24
## 37 0.56 509.002041 95 112 55.0 24
## 38 0.45 430.717586 95 103 53.9 24
## 39 0.50 352.063642 94 107 52.8 23
## 40 0.46 43.087173 91 217 51.8 23
## 41 0.44 320.323924 9 0 5.8 23
## 42 0.36 270.240196 8 944 49.8 23
## 43 0.50 2.548923 83 2302 48.9 22
## 44 0.45 220.393699 81 3289 47.9 23
## 45 0.34 25.018523 NA 15374 47.0 23
## 46 0.36 148.511984 NA 5862 46.1 23
## 47 0.23 147.986071 NA 2686 45.3 24
## 48 0.25 154.455944 NA 0 44.4 25
## Polio Total.expenditure Diphtheria HIV.AIDS GDP Population
## 33 95 NA 95 0.1 4132.76292 39871528
## 34 95 7.21 95 0.1 547.85170 39113313
## 35 95 7.12 95 0.1 5471.86677 38338562
## 36 95 6.14 95 0.1 5564.82566 37565847
## 37 95 5.29 95 0.1 5432.25230 36819558
## 38 95 5.12 95 0.1 4463.39467 36117637
## 39 94 5.36 95 0.1 3868.83123 3546576
## 40 92 4.20 93 0.1 495.25487 3486715
## 41 95 3.82 95 0.1 3935.18334 34376
## 42 95 3.36 95 0.1 3464.61790 33777915
## 43 88 3.24 88 0.1 31.12238 33288437
## 44 86 3.54 86 0.1 2598.98230 3283196
## 45 87 3.60 87 0.1 294.33556 3243514
## 46 86 3.73 86 0.1 1774.33673 3199546
## 47 89 3.84 89 0.1 1732.85798 31592153
## 48 86 3.49 86 0.1 1757.17797 3118366
## thinness..1.19.years thinness.5.9.years Income.composition.of.resources
## 33 6.0 5.8 0.743
## 34 6.0 5.8 0.741
## 35 5.9 5.8 0.737
## 36 5.9 5.8 0.732
## 37 5.9 5.8 0.724
## 38 5.9 5.8 0.714
## 39 6.0 5.9 0.705
## 40 6.0 5.9 0.697
## 41 6.0 5.9 0.690
## 42 6.1 6.0 0.686
## 43 6.1 6.0 0.680
## 44 6.2 6.1 0.673
## 45 6.3 6.1 0.663
## 46 6.3 6.2 0.653
## 47 6.4 6.3 0.644
## 48 6.5 6.4 0.636
## Schooling
## 33 14.4
## 34 14.4
## 35 14.4
## 36 14.4
## 37 14.0
## 38 13.6
## 39 13.1
## 40 12.6
## 41 12.3
## 42 12.3
## 43 12.0
## 44 11.7
## 45 11.5
## 46 11.1
## 47 10.9
## 48 10.7
Once we have our data, missing values are filled with the mean of the available observations for each variable, so that the imputed values do not distort the model too much.
df$Alcohol[is.na(df$Alcohol)] <- mean(df$Alcohol, na.rm = TRUE)
df$Life.expectancy[is.na(df$Life.expectancy)] <- mean(df$Life.expectancy, na.rm = TRUE)
df$Adult.Mortality[is.na(df$Adult.Mortality)] <- mean(df$Adult.Mortality, na.rm = TRUE)
df$infant.deaths[is.na(df$infant.deaths)] <- mean(df$infant.deaths, na.rm = TRUE)
df$percentage.expenditure[is.na(df$percentage.expenditure)] <- mean(df$percentage.expenditure, na.rm = TRUE)
df$Hepatitis.B[is.na(df$Hepatitis.B)] <- mean(df$Hepatitis.B, na.rm = TRUE)
df$Measles[is.na(df$Measles)] <- mean(df$Measles, na.rm = TRUE)
df$BMI[is.na(df$BMI)] <- mean(df$BMI, na.rm = TRUE)
df$under.five.deaths[is.na(df$under.five.deaths)] <- mean(df$under.five.deaths, na.rm = TRUE)
df$Polio[is.na(df$Polio)] <- mean(df$Polio, na.rm = TRUE)
df$Total.expenditure[is.na(df$Total.expenditure)] <- mean(df$Total.expenditure, na.rm = TRUE)
df$Diphtheria[is.na(df$Diphtheria)] <- mean(df$Diphtheria, na.rm = TRUE)
df$HIV.AIDS[is.na(df$HIV.AIDS)] <- mean(df$HIV.AIDS, na.rm = TRUE)
df$GDP[is.na(df$GDP)] <- mean(df$GDP, na.rm = TRUE)
df$Population[is.na(df$Population)] <- mean(df$Population, na.rm = TRUE)
df$thinness..1.19.years[is.na(df$thinness..1.19.years)] <- mean(df$thinness..1.19.years, na.rm = TRUE)
df$thinness.5.9.years[is.na(df$thinness.5.9.years)] <- mean(df$thinness.5.9.years, na.rm = TRUE)
df$Income.composition.of.resources[is.na(df$Income.composition.of.resources)] <- mean(df$Income.composition.of.resources, na.rm = TRUE)
df$Schooling[is.na(df$Schooling)] <- mean(df$Schooling, na.rm = TRUE)
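The column-by-column imputation above can also be written more compactly as a loop over the numeric columns (a sketch with the same behavior as the explicit assignments, which remain the code actually run):
num_cols <- setdiff(names(df)[sapply(df, is.numeric)], "Year")  # all numeric columns except Year
for (col in num_cols) {
  df[[col]][is.na(df[[col]])] <- mean(df[[col]], na.rm = TRUE)  # replace NAs with the column mean
}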
# Correlation matrix to examine the relationships between the variables
cor_matrix <- cor(df[,-c(1,2,3,16)]) # Exclude Country, Year, Status and HIV.AIDS (the latter is constant for this country, so its correlations would be NA)
hm <- heatmap(cor_matrix, main = "Correlation matrix", keep.dendro = TRUE)
We see that the variables most strongly related to life expectancy are Schooling, Polio, Income.composition.of.resources, Diphtheria, and Total.expenditure, so we will build a model using only those variables.
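That reading of the heatmap can be checked numerically by ranking the variables by their absolute correlation with Life.expectancy (a quick sketch using the cor_matrix object computed above):
sort(abs(cor_matrix["Life.expectancy", ]), decreasing = TRUE)  # Life.expectancy itself ranks first with correlation 1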
Moreover, in the following model, which includes every variable, many coefficients are close to zero or even NA: with 17 predictors plus an intercept but only 16 observations, the design matrix is rank deficient and lm drops the aliased terms.
Full model with all variables:
(complete_model <- lm(Life.expectancy ~ ., data = df[,-c(1,2,3,16)]))
##
## Call:
## lm(formula = Life.expectancy ~ ., data = df[, -c(1, 2, 3, 16)])
##
## Coefficients:
## (Intercept) Adult.Mortality
## 1.288e+02 1.635e-03
## infant.deaths Alcohol
## 1.806e-01 -2.086e-02
## percentage.expenditure Hepatitis.B
## -3.617e-04 -1.297e-02
## Measles BMI
## -1.202e-04 -3.917e-03
## under.five.deaths Polio
## 1.655e-01 -1.540e+00
## Total.expenditure Diphtheria
## 2.046e-01 1.313e+00
## GDP Population
## 1.962e-04 2.279e-08
## thinness..1.19.years thinness.5.9.years
## 1.009e+01 -1.743e+01
## Income.composition.of.resources Schooling
## NA NA
The model with only the variables of interest is:
df1 <- df[c("Polio", "Life.expectancy", "Income.composition.of.resources", "Diphtheria", "Schooling", "Total.expenditure")]
(complete_model1 <- lm(Life.expectancy ~ ., data = df1))
##
## Call:
## lm(formula = Life.expectancy ~ ., data = df1)
##
## Coefficients:
## (Intercept) Polio
## 43.00301 -0.29363
## Income.composition.of.resources Diphtheria
## 30.21477 0.37558
## Schooling Total.expenditure
## 0.18202 -0.05629
Another variant of the model transforms the variables, adding log(Total.expenditure) and Schooling^2, plus an interaction between Total.expenditure and Schooling, because I believe these two variables are strongly related, as the correlation matrix suggests.
complete_model2 <- lm(Life.expectancy ~ .+ log(Total.expenditure) + I(Schooling^2) + Total.expenditure:Schooling, data = df1)
complete_model2
##
## Call:
## lm(formula = Life.expectancy ~ . + log(Total.expenditure) + I(Schooling^2) +
## Total.expenditure:Schooling, data = df1)
##
## Coefficients:
## (Intercept) Polio
## 46.380568 -0.358622
## Income.composition.of.resources Diphtheria
## 33.744644 0.448282
## Schooling Total.expenditure
## -0.474480 -1.718097
## log(Total.expenditure) I(Schooling^2)
## 1.881013 0.006763
## Schooling:Total.expenditure
## 0.092440
Once we have the candidate models, we can compare them. First I will split the data into a training set (80%) and a test set (20%) so that at the end we can tell which model performs best.
set.seed(123)
train_prop <- 0.8  # 80% of the data for training
(n <- nrow(df1))
## [1] 16
n_train <- round(train_prop * n)
train_index <- sample(1:n, n_train)
train_dt <- df1[train_index, ]
test_dt <- df1[-train_index, ]
str(df1)
## 'data.frame': 16 obs. of 6 variables:
## $ Polio : num 95 95 95 95 95 95 94 92 95 95 ...
## $ Life.expectancy : num 75.6 75.4 75.3 75.1 74.9 74.7 74.4 74.1 73.8 73.4 ...
## $ Income.composition.of.resources: num 0.743 0.741 0.737 0.732 0.724 0.714 0.705 0.697 0.69 0.686 ...
## $ Diphtheria : num 95 95 95 95 95 95 95 93 95 95 ...
## $ Schooling : num 14.4 14.4 14.4 14.4 14 13.6 13.1 12.6 12.3 12.3 ...
## $ Total.expenditure : num 4.6 7.21 7.12 6.14 5.29 ...
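With n = 16 and train_prop = 0.8, round(0.8 * 16) assigns 13 rows to training and leaves 3 for testing, which can be confirmed directly:
nrow(train_dt)  # expected: 13
nrow(test_dt)   # expected: 3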
I validate the results with a stepwise selection algorithm.
# 1. Empty base model (intercept only)
modelo_base <- lm(Life.expectancy ~ 1, data = train_dt)
# 2. Scope: everything that may enter the model
scope <- lm(Life.expectancy ~ . + log(Total.expenditure) + I(Schooling^2) + Total.expenditure:Schooling, data = train_dt)
### a) Forward Stepwise
modelo_forward <- step(modelo_base, scope = formula(scope), direction = "forward", test = "F", trace = FALSE)
### b) Backward Stepwise
modelo_backward <- step(scope, direction = "backward", test = "F", trace = FALSE)
### c) Combined method (forward and backward)
modelo_combined <- step(scope, direction = "both", test = "F", trace = FALSE)
# Summaries of the selected models
summary(modelo_forward)
##
## Call:
## lm(formula = Life.expectancy ~ Income.composition.of.resources +
## Diphtheria + Polio, data = train_dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.16005 -0.08369 -0.03507 0.04712 0.22984
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.2638 0.9106 46.412 5.01e-12 ***
## Income.composition.of.resources 36.3016 2.0992 17.293 3.26e-08 ***
## Diphtheria 0.2931 0.1467 1.998 0.0768 .
## Polio -0.2266 0.1522 -1.489 0.1707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1374 on 9 degrees of freedom
## Multiple R-squared: 0.9947, Adjusted R-squared: 0.9929
## F-statistic: 563.8 on 3 and 9 DF, p-value: 1.474e-10
summary(modelo_backward)
##
## Call:
## lm(formula = Life.expectancy ~ Income.composition.of.resources +
## Schooling + Total.expenditure + log(Total.expenditure) +
## I(Schooling^2) + Schooling:Total.expenditure, data = train_dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.159700 -0.047238 -0.003186 0.051823 0.165763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.41245 6.05256 8.990 0.000106 ***
## Income.composition.of.resources -18.39322 12.95792 -1.419 0.205570
## Schooling 1.44286 0.95190 1.516 0.180361
## Total.expenditure 14.56445 3.63281 4.009 0.007045 **
## log(Total.expenditure) -19.68431 5.25768 -3.744 0.009578 **
## I(Schooling^2) 0.13992 0.06227 2.247 0.065706 .
## Schooling:Total.expenditure -0.78368 0.19193 -4.083 0.006478 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1119 on 6 degrees of freedom
## Multiple R-squared: 0.9977, Adjusted R-squared: 0.9953
## F-statistic: 425.8 on 6 and 6 DF, p-value: 1.281e-07
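Since step() selects models by AIC, the three selected models can also be compared directly on that criterion (a quick check using the fitted objects above):
AIC(modelo_forward, modelo_backward, modelo_combined)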
# Compute MAPE (Mean Absolute Percentage Error) on the test set
library(Metrics)
actual <- test_dt$Life.expectancy
predicted_forward <- predict(modelo_forward, newdata = test_dt)
predicted_backward <- predict(modelo_backward, newdata = test_dt)
predicted_complete <- predict(scope, newdata = test_dt)
mape(actual, predicted_forward)
## [1] 0.003265612
mape(actual, predicted_backward)
## [1] 0.008092704
mape(actual, predicted_complete)
## [1] 0.007709235
The MAPE results for the multiple regression show that the forward model, with an error of 0.33%, is the most accurate: its predictions are closest to the actual values. The backward model has a larger error, 0.81%, while the complete (scope) model has an error of 0.77%. Overall, the forward model offers the best predictive accuracy on this test set.
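For reference, the MAPE reported by Metrics::mape is the mean absolute percentage error, which can be reproduced by hand (a small sketch using the objects defined above):
mean(abs((actual - predicted_forward) / actual))  # should match mape(actual, predicted_forward)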