# cargando las librerias
library(tidyverse)
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'purrr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'stringr' was built under R version 4.0.5
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.0.8 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.4.0
## v ggplot2 3.4.2 v tibble 3.1.6
## v lubridate 1.9.2 v tidyr 1.2.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the [conflicted package](http://conflicted.r-lib.org/) to force all conflicts to become errors
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.5
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggfortify)
head(df)
## # A tibble: 6 x 9
## country region happiness_score gdp_per_capita social_support
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Finland Western Europe 7.80 1.89 1.58
## 2 Denmark Western Europe 7.59 1.95 1.55
## 3 Iceland Western Europe 7.53 1.93 1.62
## 4 Israel Middle East and Nor~ 7.47 1.83 1.52
## 5 Netherlands Western Europe 7.40 1.94 1.49
## 6 Sweden Western Europe 7.40 1.92 1.51
## # i 4 more variables: healthy_life_expectancy <dbl>,
## # freedom_to_make_life_choices <dbl>, generosity <dbl>,
## # perceptions_of_corruption <dbl>
cor(df[,3:9])
## happiness_score gdp_per_capita social_support
## happiness_score 1.00000000 0.7843422 0.83460448
## gdp_per_capita 0.78434218 1.0000000 0.73799278
## social_support 0.83460448 0.7379928 1.00000000
## healthy_life_expectancy NA NA NA
## freedom_to_make_life_choices 0.66290852 0.4514050 0.54183170
## generosity 0.04367966 -0.1567804 0.03573646
## perceptions_of_corruption 0.47191347 0.4369795 0.27253810
## healthy_life_expectancy
## happiness_score NA
## gdp_per_capita NA
## social_support NA
## healthy_life_expectancy 1
## freedom_to_make_life_choices NA
## generosity NA
## perceptions_of_corruption NA
## freedom_to_make_life_choices generosity
## happiness_score 0.6629085 0.04367966
## gdp_per_capita 0.4514050 -0.15678038
## social_support 0.5418317 0.03573646
## healthy_life_expectancy NA NA
## freedom_to_make_life_choices 1.0000000 0.16968528
## generosity 0.1696853 1.00000000
## perceptions_of_corruption 0.3836725 0.12171201
## perceptions_of_corruption
## happiness_score 0.4719135
## gdp_per_capita 0.4369795
## social_support 0.2725381
## healthy_life_expectancy NA
## freedom_to_make_life_choices 0.3836725
## generosity 0.1217120
## perceptions_of_corruption 1.0000000
cor(df$happiness_score, df$gdp_per_capita)
## [1] 0.7843422
# Suponiendo que df es tu dataframe
subset_df <- df[complete.cases(df$happiness_score, df$healthy_life_expectancy),]
# Calcular la correlación en el nuevo subset
cor(subset_df$happiness_score, subset_df$healthy_life_expectancy)
## [1] 0.7466993
cor(df$happiness_score, df$freedom_to_make_life_choices)
## [1] 0.6629085
cor(df$happiness_score, df$generosity)
## [1] 0.04367966
cor(df$happiness_score, df$perceptions_of_corruption)
## [1] 0.4719135
# Crea un boxplot en lugar de un gráfico de dispersión
ggplot(df, aes(x = happiness_score, y = region, fill = region)) +
geom_boxplot() +
labs(x = "Región", y = "Puntuación de Felicidad", title = "Boxplot de Puntuación de Felicidad por Región") +
guides(fill = FALSE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
modelo2=lm(happiness_score~ gdp_per_capita, data=df)
summary(modelo2)
##
## Call:
## lm(formula = happiness_score ~ gdp_per_capita, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.1685 -0.3177 0.0213 0.3951 2.5767
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.6343 0.2069 12.73 <2e-16 ***
## gdp_per_capita 2.0651 0.1406 14.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7097 on 135 degrees of freedom
## Multiple R-squared: 0.6152, Adjusted R-squared: 0.6123
## F-statistic: 215.8 on 1 and 135 DF, p-value: < 2.2e-16
ggplot(df, aes(x=gdp_per_capita, y=happiness_score))+
geom_point()+
geom_smooth(method=lm, se=TRUE)
##
## `geom_smooth()` using formula = 'y ~ x'
### Diagnostico del modelo
autoplot(modelo2, which = c(1,2), ncol = 1, label.size = 3)
autoplot(modelo2, which = 4, ncol = 1, label.size = 3)
shapiro.test(modelo2$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo2$residuals
## W = 0.9389, p-value = 1.066e-05
bptest(modelo2)
##
## studentized Breusch-Pagan test
##
## data: modelo2
## BP = 4.9622, df = 1, p-value = 0.02591
modelo3=lm(happiness_score~ healthy_life_expectancy, data= df)
summary(modelo3)
##
## Call:
## lm(formula = happiness_score ~ healthy_life_expectancy, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3258 -0.4571 0.1196 0.5088 1.4038
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.5502 0.1668 21.28 <2e-16 ***
## healthy_life_expectancy 5.4461 0.4191 12.99 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.763 on 134 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.5576, Adjusted R-squared: 0.5543
## F-statistic: 168.9 on 1 and 134 DF, p-value: < 2.2e-16
ggplot(df, aes(x=healthy_life_expectancy, y=happiness_score))+
geom_point()+
geom_smooth(method=lm, se=TRUE)
##
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 1 rows containing missing values (`geom_point()`).
autoplot(modelo3, which = c(1,2), ncol = 1, label.size = 3)
autoplot(modelo3, which = 4, ncol = 1, label.size = 3)
shapiro.test(modelo3$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo3$residuals
## W = 0.94793, p-value = 5.346e-05
bptest(modelo3)
##
## studentized Breusch-Pagan test
##
## data: modelo3
## BP = 0.0040266, df = 1, p-value = 0.9494
modelo1
##
## Call:
## lm(formula = happiness_score ~ social_support, data = df)
##
## Coefficients:
## (Intercept) social_support
## 2.169 2.915
intervalos_prediccion = predict(modelo1, interval = "prediction")
## Warning in predict.lm(modelo1, interval = "prediction"): predictions on current data refer to _future_ responses
df <- cbind(df,intervalos_prediccion)
# Supongamos que deseas agregar títulos a tu gráfico
titulo_principal <- "Relación entre Social Support y Happiness Score"
titulo_x <- "Social Support"
titulo_y <- "Happiness Score"
titulo_lineas <- "Intervalo de Predicción"
# Tu código existente
ggplot(data = df, aes(x = social_support, y = happiness_score)) +
geom_point() +
geom_smooth(method = 'lm', color = 'blue', se = TRUE) +
geom_line(aes(y = lwr), linetype = "dashed") +
geom_line(aes(y = upr), linetype = "dashed") +
# Agregando títulos
labs(
title = titulo_principal,
x = titulo_x,
y = titulo_y,
subtitle = titulo_lineas
)
##
## `geom_smooth()` using formula = 'y ~ x'
# seleccionado paises para obtener intervalos de predicción
paises= c("El Salvador", "Uganda", "Costa Rica", "Nigeria", "Argentina", "Mexico","Spain")
datos_social_support=df %>% filter(country %in% paises) %>% select(country, happiness_score, social_support)
datos_social_support
## country happiness_score social_support
## 23 Costa Rica 6.609 1.340
## 32 Spain 6.436 1.491
## 36 Mexico 6.330 1.169
## 50 El Salvador 6.122 1.044
## 52 Argentina 6.024 1.388
## 95 Nigeria 4.981 1.007
## 113 Uganda 4.432 1.144
new_data= data.frame(social_support =datos_social_support$social_support)
prediccion_media=predict(modelo1, newdata = new_data, interval = "confidence")
data.frame(cbind("country"=datos_social_support$country, round(prediccion_media,3)), "happiness_score"=datos_social_support$happiness_score)
## country fit lwr upr happiness_score
## 1 Costa Rica 6.076 5.953 6.198 6.609
## 2 Spain 6.516 6.363 6.669 6.436
## 3 Mexico 5.577 5.471 5.684 6.330
## 4 El Salvador 5.213 5.1 5.325 6.122
## 5 Argentina 6.216 6.085 6.346 6.024
## 6 Nigeria 5.105 4.988 5.222 4.981
## 7 Uganda 5.504 5.398 5.611 4.432
prediccion=predict(modelo1, newdata = new_data, interval = "prediction")
data.frame(cbind("country"=datos_social_support$country, round(prediccion,3)), "happiness_score"=datos_social_support$happiness_score)
## country fit lwr upr happiness_score
## 1 Costa Rica 6.076 4.823 7.328 6.609
## 2 Spain 6.516 5.26 7.772 6.436
## 3 Mexico 5.577 4.326 6.828 6.330
## 4 El Salvador 5.213 3.961 6.464 6.122
## 5 Argentina 6.216 4.962 7.469 6.024
## 6 Nigeria 5.105 3.853 6.357 4.981
## 7 Uganda 5.504 4.253 6.755 4.432
# Calcula las distancias Cook
distancias_cook <- cooks.distance(modelo1)
df[which(distancias_cook > 4/nrow(df)),] %>% select(country, happiness_score,social_support)
## country happiness_score social_support
## 86 Congo (Brazzaville) 5.267 0.665
## 93 Ivory Coast 5.053 0.584
## 100 Morocco 4.903 0.535
## 116 Benin 4.374 0.242
## 132 Botswana 3.435 1.041
## 133 Congo (Kinshasa) 3.207 0.784
## 134 Zimbabwe 3.204 0.881
## 136 Lebanon 2.392 0.476