Tarea #5

Parte 1

Datos

# Read the rates data set from its absolute location on the author's machine.
ruta_tasas <- "C:/Users/Nieves M/Documents/ESTA55503/Datos-Tareas/Tasas.csv"
Tasas <- read.csv(file = ruta_tasas)

La base de datos consta de 5 variables: TN (tasa de natalidad), TM (tasa de mortalidad), EV (esperanza de vida), EVM (esperanza de vida en mujeres) y EVH (esperanza de vida en hombres), medidas en 194 países alrededor del mundo.

Estandarizamos las variables

# Select the five analysis columns and standardize them column by column
# (scale() centers and scales by default and returns a numeric matrix).
vars_tasas <- c("TN", "TM", "EV", "EVM", "EVH")
tasas_std <- scale(Tasas[, vars_tasas])

Visualizaciones para analizar la normalidad:

Boxplots para identificar la dispersión y la simetría de los datos:

# One box per column of the standardized matrix, drawn on a common axis.
boxplot(x = tasas_std)

La variable de tasa de mortalidad (TM) presenta bastantes outliers, aunque se puede observar simetría. En las demás variables también se observa simetría, lo que sugiere normalidad, aunque en la de natalidad (TN) varía un poco.

Histogramas para comparar la distribución de las variables con la normal:

# Histograms of the five standardized variables on a density scale, each with
# a kernel density estimate overlaid in red. Panels fill a 2 x 3 grid.
par(mfrow = c(2, 3))
etiquetas <- c("TN", "TM", "EV", "EVM", "EVH")
for (j in seq_along(etiquetas)) {
  # Columns are addressed by position, in the same order as the labels.
  valores <- tasas_std[, j]
  hist(valores, prob = TRUE, xlab = etiquetas[j],
       main = paste("Histograma", etiquetas[j]))
  lines(density(valores), col = "red", lwd = 2)
}

La variable TN muestra poca normalidad, con un sesgo a la derecha, al igual que la variable TM. Las variables EV y EVH son las que más reflejan normalidad; la variable EVM muestra un leve sesgo hacia la izquierda.

Q-Q plots para observar la normalidad de las variables:

# Normal Q-Q plot of each standardized variable with its reference line in
# red. Panels fill a 2 x 3 grid.
par(mfrow = c(2, 3))
nombres_qq <- c("TN", "TM", "EV", "EVM", "EVH")
for (k in seq_along(nombres_qq)) {
  # Columns are addressed by position, in the same order as the labels.
  columna <- tasas_std[, k]
  qqnorm(columna, main = paste("Q-Q plot", nombres_qq[k]))
  qqline(columna, col = "red")
}

Diagrama de correlación usando el paquete GGally para observar las relaciones entre las variables:

# Attach GGally; the "##" lines are the recorded console output of the call.
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.3
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Pairwise plot matrix of the standardized variables.
ggpairs(data = tasas_std)

Pruebas analíticas de normalidad: Usaremos la función mvn del paquete MVN para aplicar las pruebas de normalidad univariada y multivariada.

library(MVN)
## Warning: package 'MVN' was built under R version 4.3.3
# Univariate normality: Shapiro-Wilk test on every standardized variable.
# mvn() is given a data frame, so the scaled matrix is converted first.
tasas_df <- as.data.frame(tasas_std)
SW <- mvn(data = tasas_df, univariateTest = "SW")
SW[["univariateNormality"]]
##           Test  Variable Statistic   p value Normality
## 1 Shapiro-Wilk    TN        0.9233  <0.001      NO    
## 2 Shapiro-Wilk    TM        0.9589  <0.001      NO    
## 3 Shapiro-Wilk    EV        0.9776  0.0034      NO    
## 4 Shapiro-Wilk    EVM       0.9707   4e-04      NO    
## 5 Shapiro-Wilk    EVH       0.9804   0.008      NO

Ninguna variable pasa la prueba de normalidad, ya que en todos los casos el p-value es menor que 0.05.

# Multivariate normality: Mardia's skewness and kurtosis tests on the
# standardized variables.
datos_mardia <- as.data.frame(tasas_std)
mardia <- mvn(data = datos_mardia, mvnTest = "mardia")
mardia[["multivariateNormality"]]
##              Test        Statistic              p value Result
## 1 Mardia Skewness 201.323801536187 2.98182323461392e-25     NO
## 2 Mardia Kurtosis 4.20204709407531 2.64512086500801e-05     NO
## 3             MVN             <NA>                 <NA>     NO

Parte 2

# Part 2 setup: make the gapminder package available, then attach it.
#
# The GitHub install now runs only when gapminder is missing, and always
# BEFORE the package is attached. Previously install_github() ran on every
# execution, after library(gapminder): R refused the install ("package
# 'gapminder' is in use and will not be installed") and, on Windows, could not
# overwrite the DLLs of the already-loaded rlang, glue and cli
# ("Permission denied").
if (!requireNamespace("gapminder", quietly = TRUE)) {
  # Install gapminder from GitHub.
  # upgrade = "never": do not try to update dependencies that are in use.
  devtools::install_github("jennybc/gapminder", upgrade = "never")
}
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.3.3
library(devtools)
## Warning: package 'devtools' was built under R version 4.3.3
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.3.3
# Load the data set into the workspace.
data("gapminder")
# Keep only the rows observed in 2007.
gapminder_2007 <- subset(gapminder, subset = year == 2007)

# Plain data frame holding the three numeric variables analysed below.
data_2007 <- with(
  gapminder_2007,
  data.frame(pop = pop, lifeExp = lifeExp, gdpPercap = gdpPercap)
)

Visualizar normalidad y test de normalidad

# Check normality of the original 2007 variables with Shapiro-Wilk tests
# (population, life expectancy, GDP per capita).
# Lines starting with "##" are the recorded console output of each call; the
# "data:" field echoes the expression passed to shapiro.test().
shapiro.test(gapminder_2007$pop) 
## 
##  Shapiro-Wilk normality test
## 
## data:  gapminder_2007$pop
## W = 0.25267, p-value < 2.2e-16
shapiro.test(gapminder_2007$lifeExp) 
## 
##  Shapiro-Wilk normality test
## 
## data:  gapminder_2007$lifeExp
## W = 0.89467, p-value = 1.357e-08
shapiro.test(gapminder_2007$gdpPercap) 
## 
##  Shapiro-Wilk normality test
## 
## data:  gapminder_2007$gdpPercap
## W = 0.80644, p-value = 2.039e-12
# Histograms of the original (untransformed) variables, three panels in a row.
par(mfrow = c(1, 3))
with(gapminder_2007, {
  hist(pop, main = "Poblacion", xlab = "Poblacion")
  hist(lifeExp, main = "Expectativa de vida", xlab = "lifeExp")
  hist(gdpPercap, main = "Ingreso per capita", xlab = "gdpPercap")
})

# Histograms of the three data_2007 columns, three panels in a row.
# Each panel uses the same text for its title and its x-axis label.
par(mfrow = c(1, 3))
titulos <- c(
  pop = "Poblacion",
  lifeExp = "Expectativa de vida",
  gdpPercap = "Ingreso per capita"
)
for (columna in names(titulos)) {
  hist(data_2007[[columna]], main = titulos[[columna]],
       xlab = titulos[[columna]])
}

# Shapiro-Wilk normality tests on the data_2007 columns.
# data_2007 is built from the gapminder_2007 columns of the same names, so the
# W statistics and p-values recorded here equal those of the tests run on
# gapminder_2007.
# Lines starting with "##" are the recorded console output of each call.
shapiro.test(data_2007$pop)
## 
##  Shapiro-Wilk normality test
## 
## data:  data_2007$pop
## W = 0.25267, p-value < 2.2e-16
shapiro.test(data_2007$lifeExp)
## 
##  Shapiro-Wilk normality test
## 
## data:  data_2007$lifeExp
## W = 0.89467, p-value = 1.357e-08
shapiro.test(data_2007$gdpPercap)
## 
##  Shapiro-Wilk normality test
## 
## data:  data_2007$gdpPercap
## W = 0.80644, p-value = 2.039e-12
# Natural-log transforms of population and GDP per capita, stored as new
# columns of gapminder_2007.
gapminder_2007[["log_pop"]] <- log(gapminder_2007[["pop"]])
gapminder_2007[["log_gdpPercap"]] <- log(gapminder_2007[["gdpPercap"]])

# Shapiro-Wilk normality tests after the log transformation.
# Lines starting with "##" are the recorded console output of each call.
# log_pop and log_gdpPercap are the log-transformed columns.
shapiro.test(gapminder_2007$log_pop)  
## 
##  Shapiro-Wilk normality test
## 
## data:  gapminder_2007$log_pop
## W = 0.99249, p-value = 0.6593
# lifeExp was not transformed; this repeats the test on the original column.
shapiro.test(gapminder_2007$lifeExp) 
## 
##  Shapiro-Wilk normality test
## 
## data:  gapminder_2007$lifeExp
## W = 0.89467, p-value = 1.357e-08
shapiro.test(gapminder_2007$log_gdpPercap)
## 
##  Shapiro-Wilk normality test
## 
## data:  gapminder_2007$log_gdpPercap
## W = 0.95559, p-value = 0.000155
# Histograms after the transformation: log population, life expectancy
# (untransformed) and log GDP per capita, three panels in a row.
par(mfrow = c(1, 3))
with(gapminder_2007, {
  hist(log_pop, main = "Log(Poblacion)", xlab = "Log(Pop)")
  hist(lifeExp, main = "Expectativa de vida", xlab = "lifeExp")
  hist(log_gdpPercap, main = "Log(Ingreso per capita)",
       xlab = "Log(gdpPercap)")
})

library(bestNormalize)
## Warning: package 'bestNormalize' was built under R version 4.3.3
# Load the autotrader data set and draw a reproducible sample of 5000 rows.
data("autotrader")
set.seed(123)
filas <- sample(nrow(autotrader), 5000)
sampled_data <- autotrader[filas, ]

# Coerce both analysis columns to numeric, going through character first.
for (v in c("mileage", "price")) {
  sampled_data[[v]] <- as.numeric(as.character(sampled_data[[v]]))
}

# Normality of the original variables: one row per variable. The
# Shapiro_Wilk column holds the p-value of the Shapiro-Wilk test.
vars_originales <- c("mileage", "price")
shapiro_test_results <- data.frame(
  Variable = vars_originales,
  Shapiro_Wilk = vapply(
    vars_originales,
    function(v) shapiro.test(sampled_data[[v]])$p.value,
    numeric(1),
    USE.NAMES = FALSE  # keep default integer row names in the data frame
  )
)
shapiro_test_results
##   Variable Shapiro_Wilk
## 1  mileage 5.063667e-54
## 2    price 1.222240e-36
# Transformations: log(x + 1) of each variable, stored as "<name>_log".
for (v in c("mileage", "price")) {
  sampled_data[[paste0(v, "_log")]] <- log(sampled_data[[v]] + 1)
}

# Shapiro-Wilk test on the transformed variables: one row per variable. The
# Shapiro_Wilk column holds the p-value.
vars_transformadas <- c("mileage_log", "price_log")
shapiro_trans <- data.frame(
  Variable = vars_transformadas,
  Shapiro_Wilk = vapply(
    vars_transformadas,
    function(v) shapiro.test(sampled_data[[v]])$p.value,
    numeric(1),
    USE.NAMES = FALSE  # keep default integer row names in the data frame
  )
)
shapiro_trans
##      Variable Shapiro_Wilk
## 1 mileage_log 1.441335e-45
## 2   price_log 2.335904e-36