# 1ª-Práctica-Aanálisis-Estadístico-Avanzado.R

## REGRESIÓN MÚLTIPLE, 1ª PRÁCTICA ANÁLISIS ESTADÍSTICO AVANZADO ## 

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(boot)
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:boot':
## 
##     logit
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

library(QuantPsyc)

## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Attaching package: 'QuantPsyc'
## 
## The following object is masked from 'package:base':
## 
##     norm

library(ggplot2)
library(lmtest)

## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(nortest)
library(corrr)
library(corrplot)

## corrplot 0.92 loaded

library(PerformanceAnalytics)

## Loading required package: xts
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend

library(psych)

## 
## Attaching package: 'psych'
## 
## The following object is masked from 'package:car':
## 
##     logit
## 
## The following object is masked from 'package:boot':
## 
##     logit
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(dplyr)
library(Hmisc)

## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:psych':
## 
##     describe
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# Cargar libreria. 

library(haven)

# Read the SPSS .sav data set used throughout the script.
# NOTE(review): the path is absolute and specific to one machine; it is kept
# unchanged here but must be adapted before running the script elsewhere.
BBDD_Mundo_95 <- read_sav("C:/Users/javie/OneDrive - COMPARTE MARKETING SOCIAL, S.L/Escritorio/máster MIAS/ANÁLISIS ESTADÍSTICO AVANZADO/BBDD_Mundo 95.sav")

# View() opens a data viewer, which is only meaningful in an interactive
# session; skip it when the script is run non-interactively (Rscript, knitr).
if (interactive()) {
  View(BBDD_Mundo_95)
}
names(BBDD_Mundo_95)

##  [1] "país"        "poblac"      "densidad"    "urbana"      "relig"      
##  [6] "espvidaf"    "espvidam"    "alfabet"     "inc_pob"     "mortinf"    
## [11] "pib_cap"     "región"      "calorías"    "sida"        "tasa_nat"   
## [16] "tasa_mor"    "tasasida"    "log_pib"     "logtsida"    "nac_def"    
## [21] "fertilid"    "log_pob"     "cregrano"    "alfabmas"    "alfabfem"   
## [26] "clima"       "espvidafrec"

# Correlation matrix of the outcome (fertilid) and the three selected
# predictors, using only the rows that are complete in all four columns.
cor_matrix <- cor(
  BBDD_Mundo_95[c("fertilid", "pib_cap", "alfabfem", "mortinf")],
  use = "complete.obs"
)
print("Matriz de Correlación:")

## [1] "Matriz de Correlación:"

# Print the matrix and keep a second copy under `matriz_correlacion`.
print(cor_matrix)
matriz_correlacion <- cor_matrix

##            fertilid    pib_cap   alfabfem    mortinf
## fertilid  1.0000000 -0.5041522 -0.8389623  0.8088380
## pib_cap  -0.5041522  1.0000000  0.4287245 -0.5849956
## alfabfem -0.8389623  0.4287245  1.0000000 -0.8434112
## mortinf   0.8088380 -0.5849956 -0.8434112  1.0000000

# Scatter-plot matrices that accompany the correlation matrix.
# `cor_matrix` is already computed earlier in the script, so it is not
# recomputed here.
# Columns are selected by name instead of by position (21, 11, 25, 10) so the
# selection cannot silently change if the column order of the data changes.
datos <- BBDD_Mundo_95[, c("fertilid", "pib_cap", "alfabfem", "mortinf")]
pairs(x = datos, lower.panel = NULL)

# "barDiag" is the current GGally name for the histogram diagonal; the old
# "bar" spelling is only accepted with a warning and then renamed.
ggpairs(
  datos,
  lower = list(continuous = "smooth"),
  diag = list(continuous = "barDiag"),
  axisLabels = "none"
)

## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'bar' to 'barDiag'

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 2 rows containing missing values

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 24 rows containing missing values

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 # Multiple linear regression of fertility on GDP per capita, female literacy
 # and infant mortality, followed by a summary of the fitted model.
 rg_multiple_1 <- lm(
   formula = fertilid ~ pib_cap + alfabfem + mortinf,
   data = BBDD_Mundo_95
 )
 summary(rg_multiple_1)

## 
## Call:
## lm(formula = fertilid ~ pib_cap + alfabfem + mortinf, data = BBDD_Mundo_95)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21830 -0.55838 -0.09492  0.47578  2.16284 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.916e+00  7.708e-01   7.674 3.28e-11 ***
## pib_cap     -4.127e-05  2.803e-05  -1.472   0.1448    
## alfabfem    -3.729e-02  6.940e-03  -5.373 7.25e-07 ***
## mortinf      1.335e-02  5.775e-03   2.311   0.0234 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9668 on 81 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.7462, Adjusted R-squared:  0.7368 
## F-statistic: 79.37 on 3 and 81 DF,  p-value: < 2.2e-16

# Multiple correlation coefficient R: the square root of the model's R-squared.
# The value is read from the fitted model instead of being typed in by hand
# from the printed summary, so it is not affected by rounding and stays
# correct if the model or the data change.
sqrt(summary(rg_multiple_1)$r.squared)

## [1] 0.8638287

## El coeficiente de correlación múltiple (R ≈ 0,86) es elevado, por lo que existe una relación fuerte entre la fertilidad y el conjunto de variables propuestas para el modelo. 

## Valor de p del contraste F: muy por debajo de 0,01, por lo que rechazamos la hipótesis nula y concluimos que existe una relación lineal entre la fertilidad y el conjunto de predictores. 
## R cuadrado: 0,7462 
## R cuadrado ajustado: 0,7368 (lo que nos viene a decir que las variables independientes, en conjunto, suman información útil al modelo; es prácticamente lo mismo que el R cuadrado)
## Los coeficientes estimados nos dicen que, manteniendo constantes las demás variables:
## al aumentar un punto pib_cap, baja la fertilidad (aunque este coeficiente no es estadísticamente significativo, p = 0,1448);
## al aumentar un punto la alfabetización de las mujeres, baja la fertilidad; 
## al aumentar un punto la mortalidad infantil, aumenta la fertilidad. 


## MODEL ASSUMPTIONS:

# Linearity: car's crPlots() diagnostic plots for the multiple model.
car::crPlots(rg_multiple_1)

## Normality and independence of the residuals:

# Histogram of the residuals, then the Lilliefors normality test on them.
graphics::hist(residuals(rg_multiple_1))

nortest::lillie.test(residuals(rg_multiple_1))

## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  residuals(rg_multiple_1)
## D = 0.10961, p-value = 0.01336

# Durbin-Watson test for autocorrelation of the residuals.
lmtest::dwtest(rg_multiple_1)

## 
##  Durbin-Watson test
## 
## data:  rg_multiple_1
## DW = 1.8085, p-value = 0.1816
## alternative hypothesis: true autocorrelation is greater than 0

# Additional diagnostic plots of the fitted model to support the analysis.
plot(x = rg_multiple_1)

# Multicollinearity check: variance inflation factor of each predictor.
car::vif(rg_multiple_1)

##  pib_cap alfabfem  mortinf 
## 1.554502 3.542331 4.395441

# Tolerance is the reciprocal of the VIF.
tolerancia <- 1 / car::vif(rg_multiple_1)
print(tolerancia)

##   pib_cap  alfabfem   mortinf 
## 0.6432929 0.2823000 0.2275085

# Outliers and influential observations: summary of the cases flagged by
# influence.measures() for the multiple model.
summary(influence.measures(model = rg_multiple_1))

## Potentially influential observations of
##   lm(formula = fertilid ~ pib_cap + alfabfem + mortinf, data = BBDD_Mundo_95) :
## 
##    dfb.1_ dfb.pb_c dfb.alfb dfb.mrtn dffit cov.r   cook.d hat    
## 2   0.22  -0.15    -0.15    -0.31    -0.37  1.22_*  0.03   0.17_*
## 4   0.25   0.15    -0.29    -0.16     0.43  0.84_*  0.04   0.03  
## 7  -0.04   0.11     0.03     0.04     0.12  1.18_*  0.00   0.12  
## 16 -0.36   0.10     0.35     0.31    -0.37  1.28_*  0.03   0.20_*
## 25 -0.11   0.23     0.01     0.09    -0.35  0.82_*  0.03   0.02  
## 38 -0.15   0.42     0.08     0.16     0.45  1.39_*  0.05   0.26_*
## 61  0.05  -0.15    -0.03    -0.04    -0.17  1.19_*  0.01   0.12  
## 94  0.44  -0.19    -0.39    -0.39     0.49  0.84_*  0.06   0.04

# Influence plot; it also prints the studentized residual, hat value and
# Cook's distance of the observations it singles out.
car::influencePlot(rg_multiple_1)

##       StudRes        Hat      CookD
## 16 -0.7304948 0.20117565 0.03379140
## 25 -2.3849620 0.02081163 0.02856982
## 38  0.7473453 0.26260476 0.04999866
## 52 -1.7853231 0.11504297 0.10086469
## 86 -1.9120941 0.07878879 0.07569220
## 94  2.3480413 0.04161345 0.05668861

## Simple regressions:

## Predictor: GDP per capita.

rg_pib_per_capita <- lm(
  formula = fertilid ~ pib_cap,
  data = BBDD_Mundo_95
)
summary(rg_pib_per_capita)

## 
## Call:
## lm(formula = fertilid ~ pib_cap, data = BBDD_Mundo_95)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6565 -1.3002 -0.0888  1.1350  3.6791 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.561e+00  2.023e-01  22.543  < 2e-16 ***
## pib_cap     -1.698e-04  2.308e-05  -7.355 4.38e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.553 on 105 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:   0.34,  Adjusted R-squared:  0.3338 
## F-statistic:  54.1 on 1 and 105 DF,  p-value: 4.379e-11

## Normality of the residuals (GDP per capita model)

graphics::hist(residuals(rg_pib_per_capita))

nortest::lillie.test(residuals(rg_pib_per_capita))

## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  residuals(rg_pib_per_capita)
## D = 0.086191, p-value = 0.04864

# Diagnostic plots of the fitted simple model.
plot(x = rg_pib_per_capita)

## Predictor: female literacy.

rg_afabetfem <- lm(
  formula = fertilid ~ alfabfem,
  data = BBDD_Mundo_95
)
summary(rg_afabetfem)

## 
## Call:
## lm(formula = fertilid ~ alfabfem, data = BBDD_Mundo_95)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7175 -0.5207 -0.1281  0.5326  2.6178 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.654157   0.287308   26.64   <2e-16 ***
## alfabfem    -0.055260   0.003934  -14.04   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.032 on 83 degrees of freedom
##   (24 observations deleted due to missingness)
## Multiple R-squared:  0.7039, Adjusted R-squared:  0.7003 
## F-statistic: 197.3 on 1 and 83 DF,  p-value: < 2.2e-16

# Normality of the residuals (female literacy model).

graphics::hist(residuals(rg_afabetfem))

nortest::lillie.test(residuals(rg_afabetfem))

## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  residuals(rg_afabetfem)
## D = 0.093114, p-value = 0.06587

# Diagnostic plots of the fitted simple model.
plot(x = rg_afabetfem)

## Predictor: infant mortality.

rg_mortinf <- lm(
  formula = fertilid ~ mortinf,
  data = BBDD_Mundo_95
)
summary(rg_mortinf)

## 
## Call:
## lm(formula = fertilid ~ mortinf, data = BBDD_Mundo_95)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0996 -0.5931 -0.2511  0.4450  3.2260 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.779399   0.154290   11.53   <2e-16 ***
## mortinf     0.041541   0.002692   15.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.057 on 105 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.694,  Adjusted R-squared:  0.6911 
## F-statistic: 238.2 on 1 and 105 DF,  p-value: < 2.2e-16

# Normality of the residuals (infant mortality model):
# histogram first, then the Lilliefors test.

graphics::hist(residuals(rg_mortinf))

nortest::lillie.test(residuals(rg_mortinf))

## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  residuals(rg_mortinf)
## D = 0.14555, p-value = 7.861e-06

# 1ª-Práctica-Aanálisis-Estadístico-Avanzado.R

# Jorge

# 2024-05-14