Objetivos

Análisis exploratorio

# cargando las librerias
library(tidyverse)
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.0.8     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.4.0
## v ggplot2   3.4.2     v tibble    3.1.6
## v lubridate 1.9.2     v tidyr     1.2.0
## v purrr     0.3.4     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the [conflicted package](http://conflicted.r-lib.org/) to force all conflicts to become errors
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggfortify)
head(df)
## # A tibble: 6 x 9
##   country     region               happiness_score gdp_per_capita social_support
##   <chr>       <chr>                          <dbl>          <dbl>          <dbl>
## 1 Finland     Western Europe                  7.80           1.89           1.58
## 2 Denmark     Western Europe                  7.59           1.95           1.55
## 3 Iceland     Western Europe                  7.53           1.93           1.62
## 4 Israel      Middle East and Nor~            7.47           1.83           1.52
## 5 Netherlands Western Europe                  7.40           1.94           1.49
## 6 Sweden      Western Europe                  7.40           1.92           1.51
## # i 4 more variables: healthy_life_expectancy <dbl>,
## #   freedom_to_make_life_choices <dbl>, generosity <dbl>,
## #   perceptions_of_corruption <dbl>
cor(df[,3:9])
##                              happiness_score gdp_per_capita social_support
## happiness_score                   1.00000000      0.7843422     0.83460448
## gdp_per_capita                    0.78434218      1.0000000     0.73799278
## social_support                    0.83460448      0.7379928     1.00000000
## healthy_life_expectancy                   NA             NA             NA
## freedom_to_make_life_choices      0.66290852      0.4514050     0.54183170
## generosity                        0.04367966     -0.1567804     0.03573646
## perceptions_of_corruption         0.47191347      0.4369795     0.27253810
##                              healthy_life_expectancy
## happiness_score                                   NA
## gdp_per_capita                                    NA
## social_support                                    NA
## healthy_life_expectancy                            1
## freedom_to_make_life_choices                      NA
## generosity                                        NA
## perceptions_of_corruption                         NA
##                              freedom_to_make_life_choices  generosity
## happiness_score                                 0.6629085  0.04367966
## gdp_per_capita                                  0.4514050 -0.15678038
## social_support                                  0.5418317  0.03573646
## healthy_life_expectancy                                NA          NA
## freedom_to_make_life_choices                    1.0000000  0.16968528
## generosity                                      0.1696853  1.00000000
## perceptions_of_corruption                       0.3836725  0.12171201
##                              perceptions_of_corruption
## happiness_score                              0.4719135
## gdp_per_capita                               0.4369795
## social_support                               0.2725381
## healthy_life_expectancy                             NA
## freedom_to_make_life_choices                 0.3836725
## generosity                                   0.1217120
## perceptions_of_corruption                    1.0000000

Correlación entre la puntuación de felicidad y PIB per capita

cor(df$happiness_score, df$gdp_per_capita)
## [1] 0.7843422

Correlación entre la puntuación de felicidad y el apoyo social

cor(df$happiness_score, df$social_support)
## [1] 0.8346045

Correlación entre la puntuación de felicidad y esperanza de vida

# Suponiendo que df es tu dataframe
subset_df <- df[complete.cases(df$happiness_score, df$healthy_life_expectancy),]

# Calcular la correlación en el nuevo subset
cor(subset_df$happiness_score, subset_df$healthy_life_expectancy)
## [1] 0.7466993

Correlación entre la puntuación de felicidad y libertad para tomar elecciones en la vida

cor(df$happiness_score, df$freedom_to_make_life_choices)
## [1] 0.6629085

Correlación entre la puntuación de felicidad y la generosidad

cor(df$happiness_score, df$generosity)
## [1] 0.04367966

Correlación entre la puntuación de felicidad y la percepción de la corrupción

cor(df$happiness_score, df$perceptions_of_corruption)
## [1] 0.4719135
# Crea un boxplot en lugar de un gráfico de dispersión
ggplot(df, aes(x = happiness_score, y = region, fill = region)) +
  geom_boxplot() +
  labs(x = "Región", y = "Puntuación de Felicidad", title = "Boxplot de Puntuación de Felicidad por Región") +
  guides(fill = FALSE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Modelamiento estadístico

Regresión entre felicidad y el apoyo social

modelo1=lm(happiness_score~ social_support, data=df)

summary(modelo1)
## 
## Call:
## lm(formula = happiness_score ~ social_support, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.76890 -0.37187  0.00861  0.46558  1.49958 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.1689     0.1989    10.9   <2e-16 ***
## social_support   2.9155     0.1656    17.6   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6303 on 135 degrees of freedom
## Multiple R-squared:  0.6966, Adjusted R-squared:  0.6943 
## F-statistic: 309.9 on 1 and 135 DF,  p-value: < 2.2e-16

visualización del modelo ajustado

ggplot(df, aes(x=social_support, y=happiness_score))+
    geom_point()+
    geom_smooth(method=lm, se=TRUE)
## 
## `geom_smooth()` using formula = 'y ~ x'

Diagnostico del modelo

autoplot(modelo1, which = c(1,2), ncol = 1, label.size = 3)

autoplot(modelo1, which = 4, ncol = 1, label.size = 3)

shapiro.test(modelo1$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo1$residuals
## W = 0.99107, p-value = 0.5349
bptest(modelo1)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo1
## BP = 3.976, df = 1, p-value = 0.04615

Regresión entre felicidad y el PIB percapita

modelo2=lm(happiness_score~ gdp_per_capita, data=df)

summary(modelo2)
## 
## Call:
## lm(formula = happiness_score ~ gdp_per_capita, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.1685 -0.3177  0.0213  0.3951  2.5767 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      2.6343     0.2069   12.73   <2e-16 ***
## gdp_per_capita   2.0651     0.1406   14.69   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7097 on 135 degrees of freedom
## Multiple R-squared:  0.6152, Adjusted R-squared:  0.6123 
## F-statistic: 215.8 on 1 and 135 DF,  p-value: < 2.2e-16

visualización del modelo ajustado

ggplot(df, aes(x=gdp_per_capita, y=happiness_score))+
    geom_point()+
    geom_smooth(method=lm, se=TRUE)
## 
## `geom_smooth()` using formula = 'y ~ x'

### Diagnostico del modelo

autoplot(modelo2, which = c(1,2), ncol = 1, label.size = 3)

autoplot(modelo2, which = 4, ncol = 1, label.size = 3)

shapiro.test(modelo2$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo2$residuals
## W = 0.9389, p-value = 1.066e-05
bptest(modelo2)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo2
## BP = 4.9622, df = 1, p-value = 0.02591

Regresión entre felicidad y la esperanza de vida

modelo3=lm(happiness_score~ healthy_life_expectancy, data= df)

summary(modelo3)
## 
## Call:
## lm(formula = happiness_score ~ healthy_life_expectancy, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.3258 -0.4571  0.1196  0.5088  1.4038 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               3.5502     0.1668   21.28   <2e-16 ***
## healthy_life_expectancy   5.4461     0.4191   12.99   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.763 on 134 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.5576, Adjusted R-squared:  0.5543 
## F-statistic: 168.9 on 1 and 134 DF,  p-value: < 2.2e-16

visualización del modelo ajustado

ggplot(df, aes(x=healthy_life_expectancy, y=happiness_score))+
    geom_point()+
    geom_smooth(method=lm, se=TRUE)
## 
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).

Diagnostico del modelo

autoplot(modelo3, which = c(1,2), ncol = 1, label.size = 3)

autoplot(modelo3, which = 4, ncol = 1, label.size = 3)

shapiro.test(modelo3$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo3$residuals
## W = 0.94793, p-value = 5.346e-05
bptest(modelo3)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo3
## BP = 0.0040266, df = 1, p-value = 0.9494

Modelo seleccionado

modelo1
## 
## Call:
## lm(formula = happiness_score ~ social_support, data = df)
## 
## Coefficients:
##    (Intercept)  social_support  
##          2.169           2.915
intervalos_prediccion = predict(modelo1, interval = "prediction")
## Warning in predict.lm(modelo1, interval = "prediction"): predictions on current data refer to _future_ responses
df <- cbind(df,intervalos_prediccion)

# Supongamos que deseas agregar títulos a tu gráfico
titulo_principal <- "Relación entre Social Support y Happiness Score"
titulo_x <- "Social Support"
titulo_y <- "Happiness Score"
titulo_lineas <- "Intervalo de Predicción"

# Tu código existente
ggplot(data = df, aes(x = social_support, y = happiness_score)) + 
  geom_point() + 
  geom_smooth(method = 'lm', color = 'blue', se = TRUE) +
  geom_line(aes(y = lwr), linetype = "dashed") +
  geom_line(aes(y = upr), linetype = "dashed") +

  # Agregando títulos
  labs(
    title = titulo_principal,
    x = titulo_x,
    y = titulo_y,
    subtitle = titulo_lineas
  )
## 
## `geom_smooth()` using formula = 'y ~ x'

# seleccionado paises para obtener intervalos de predicción

paises= c("El Salvador", "Uganda", "Costa Rica", "Nigeria", "Argentina", "Mexico","Spain")
datos_social_support=df %>% filter(country %in% paises) %>% select(country, happiness_score, social_support) 
datos_social_support
##         country happiness_score social_support
## 23   Costa Rica           6.609          1.340
## 32        Spain           6.436          1.491
## 36       Mexico           6.330          1.169
## 50  El Salvador           6.122          1.044
## 52    Argentina           6.024          1.388
## 95      Nigeria           4.981          1.007
## 113      Uganda           4.432          1.144
new_data= data.frame(social_support =datos_social_support$social_support)

prediccion_media=predict(modelo1, newdata = new_data, interval = "confidence")

data.frame(cbind("country"=datos_social_support$country, round(prediccion_media,3)), "happiness_score"=datos_social_support$happiness_score)
##       country   fit   lwr   upr happiness_score
## 1  Costa Rica 6.076 5.953 6.198           6.609
## 2       Spain 6.516 6.363 6.669           6.436
## 3      Mexico 5.577 5.471 5.684           6.330
## 4 El Salvador 5.213   5.1 5.325           6.122
## 5   Argentina 6.216 6.085 6.346           6.024
## 6     Nigeria 5.105 4.988 5.222           4.981
## 7      Uganda 5.504 5.398 5.611           4.432
prediccion=predict(modelo1, newdata = new_data, interval = "prediction")

data.frame(cbind("country"=datos_social_support$country, round(prediccion,3)), "happiness_score"=datos_social_support$happiness_score)
##       country   fit   lwr   upr happiness_score
## 1  Costa Rica 6.076 4.823 7.328           6.609
## 2       Spain 6.516  5.26 7.772           6.436
## 3      Mexico 5.577 4.326 6.828           6.330
## 4 El Salvador 5.213 3.961 6.464           6.122
## 5   Argentina 6.216 4.962 7.469           6.024
## 6     Nigeria 5.105 3.853 6.357           4.981
## 7      Uganda 5.504 4.253 6.755           4.432
# Calcula las distancias Cook
distancias_cook <- cooks.distance(modelo1)

df[which(distancias_cook > 4/nrow(df)),] %>% select(country, happiness_score,social_support)  
##                 country happiness_score social_support
## 86  Congo (Brazzaville)           5.267          0.665
## 93          Ivory Coast           5.053          0.584
## 100             Morocco           4.903          0.535
## 116               Benin           4.374          0.242
## 132            Botswana           3.435          1.041
## 133    Congo (Kinshasa)           3.207          0.784
## 134            Zimbabwe           3.204          0.881
## 136             Lebanon           2.392          0.476