Tarea #5
Parte 1
Datos
# Read the rates data set from its CSV file into a data frame.
Tasas <- read.csv(
  file = "C:/Users/Nieves M/Documents/ESTA55503/Datos-Tareas/Tasas.csv"
)
La base de datos consta de 5 variables: TN (tasa de natalidad), TM (tasa de mortalidad), EV (esperanza de vida), EVM (esperanza de vida en mujeres) y EVH (esperanza de vida en hombres), medidas en 194 países alrededor del mundo.
Estandarizamos las variables
# Standardize the five rate columns: subtract each column's mean and divide
# by its standard deviation (the defaults of scale(), spelled out here).
tasas_std <- scale(
  Tasas[, c("TN", "TM", "EV", "EVM", "EVH")],
  center = TRUE,
  scale = TRUE
)
Visualizaciones para analizar la normalidad:
Boxplots para identificar la dispersión y la simetría de los datos:
# One box per standardized column, drawn side by side on a common scale.
boxplot(x = tasas_std)
La variable de tasa de mortalidad (TM) presenta bastantes valores atípicos
(outliers), aunque se puede observar simetría. En las demás variables también
se puede ver simetría, lo que sugiere normalidad, aunque en la de natalidad
(TN) varía un poco.
Histogramas para comparar la distribución de las variables con la normal:
# Density-scaled histogram of every standardized column with a kernel density
# estimate drawn on top in red; panels are laid out on a 2 x 3 grid.
par(mfrow = c(2, 3))
for (nombre in colnames(tasas_std)) {
  valores <- tasas_std[, nombre]
  # freq = FALSE puts the bars on the density scale so the curve is comparable
  hist(
    valores,
    freq = FALSE,
    xlab = nombre,
    main = paste("Histograma", nombre)
  )
  lines(density(valores), col = "red", lwd = 2)
}
La variable TN muestra poca normalidad, con un sesgo a la derecha, al
igual que la variable TM. Las variables EV y EVH son las que más
reflejan normalidad, mientras que la variable EVM muestra un leve sesgo
hacia la izquierda.
Q-Q plots para observar la normalidad de las variables:
# Normal Q-Q plot for every standardized column, each with its reference line
# in red; panels are laid out on a 2 x 3 grid.
par(mfrow = c(2, 3))
for (j in seq_len(ncol(tasas_std))) {
  columna <- tasas_std[, j]
  qqnorm(columna, main = paste("Q-Q plot", colnames(tasas_std)[j]))
  qqline(columna, col = "red")
}
Diagrama de correlación usando el paquete GGally para observar las relaciones entre las variables:
library(GGally)
## Warning: package 'GGally' was built under R version 4.3.3
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plot matrix of the standardized variables (GGally).
ggpairs(data = tasas_std)
Pruebas analíticas de normalidad: usaremos la función mvn del paquete MVN para aplicar las pruebas de normalidad univariada y multivariada.
library(MVN)
## Warning: package 'MVN' was built under R version 4.3.3
# Univariate normality: call mvn() requesting the Shapiro-Wilk test for each
# column, then display the per-variable results table.
SW <- mvn(
  data = as.data.frame(tasas_std),
  univariateTest = "SW"
)
SW[["univariateNormality"]]
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk TN 0.9233 <0.001 NO
## 2 Shapiro-Wilk TM 0.9589 <0.001 NO
## 3 Shapiro-Wilk EV 0.9776 0.0034 NO
## 4 Shapiro-Wilk EVM 0.9707 4e-04 NO
## 5 Shapiro-Wilk EVH 0.9804 0.008 NO
Ninguna variable pasa la prueba de normalidad de Shapiro-Wilk, ya que en todos los casos el p-value es menor que 0.05.
# Multivariate normality: call mvn() requesting Mardia's test, then display
# the skewness/kurtosis results table.
mardia <- mvn(
  data = as.data.frame(tasas_std),
  mvnTest = "mardia"
)
mardia[["multivariateNormality"]]
## Test Statistic p value Result
## 1 Mardia Skewness 201.323801536187 2.98182323461392e-25 NO
## 2 Mardia Kurtosis 4.20204709407531 2.64512086500801e-05 NO
## 3 MVN <NA> <NA> NO
Parte 2
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.3.3
library(devtools)
## Warning: package 'devtools' was built under R version 4.3.3
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.3.3
# Install gapminder from GitHub only when it is not already available.
# An unconditional install at this point cannot succeed once the package is
# attached (R reports "package 'gapminder' is in use and will not be
# installed"), and it would still hit the network and try to upgrade locked
# dependencies on every run.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  devtools::install_github("jennybc/gapminder")
}
## Downloading GitHub repo jennybc/gapminder@HEAD
## rlang (1.1.3 -> 1.1.4) [CRAN]
## glue (1.7.0 -> 1.8.0) [CRAN]
## cli (3.6.2 -> 3.6.3) [CRAN]
## Installing 3 packages: rlang, glue, cli
## Installing packages into 'C:/Users/Nieves M/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'rlang' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'rlang'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Nieves
## M\AppData\Local\R\win-library\4.3\00LOCK\rlang\libs\x64\rlang.dll to
## C:\Users\Nieves M\AppData\Local\R\win-library\4.3\rlang\libs\x64\rlang.dll:
## Permission denied
## Warning: restored 'rlang'
## package 'glue' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'glue'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Nieves M\AppData\Local\R\win-library\4.3\00LOCK\glue\libs\x64\glue.dll
## to C:\Users\Nieves M\AppData\Local\R\win-library\4.3\glue\libs\x64\glue.dll:
## Permission denied
## Warning: restored 'glue'
## package 'cli' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'cli'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Nieves M\AppData\Local\R\win-library\4.3\00LOCK\cli\libs\x64\cli.dll
## to C:\Users\Nieves M\AppData\Local\R\win-library\4.3\cli\libs\x64\cli.dll:
## Permission denied
## Warning: restored 'cli'
##
## The downloaded binary packages are in
## C:\Users\Nieves M\AppData\Local\Temp\RtmpacSuLd\downloaded_packages
## ── R CMD build ─────────────────────────────────────────────────────────────────
## checking for file 'C:\Users\Nieves M\AppData\Local\Temp\RtmpacSuLd\remotes3e7c2d6c7bf9\jennybc-gapminder-5325048/DESCRIPTION' ... checking for file 'C:\Users\Nieves M\AppData\Local\Temp\RtmpacSuLd\remotes3e7c2d6c7bf9\jennybc-gapminder-5325048/DESCRIPTION' ... ✔ checking for file 'C:\Users\Nieves M\AppData\Local\Temp\RtmpacSuLd\remotes3e7c2d6c7bf9\jennybc-gapminder-5325048/DESCRIPTION'
## ─ preparing 'gapminder': (1.1s)
## checking DESCRIPTION meta-information ... checking DESCRIPTION meta-information ... ✔ checking DESCRIPTION meta-information
## ─ checking for LF line-endings in source and make files and shell scripts
## ─ checking for empty or unneeded directories
## ─ building 'gapminder_1.0.0.9000.tar.gz'
##
##
## Warning: package 'gapminder' is in use and will not be installed
# Load the gapminder data set
data("gapminder")
# Keep only the 2007 rows, then gather the three variables under study
# (population, life expectancy, GDP per capita) in a plain data frame.
gapminder_2007 <- subset(gapminder, year == 2007)
data_2007 <- with(
  gapminder_2007,
  data.frame(pop = pop, lifeExp = lifeExp, gdpPercap = gdpPercap)
)
Visualizar normalidad y test de normalidad
# Check normality of the variables, starting with a Shapiro-Wilk test on the
# 2007 population column.
shapiro.test(gapminder_2007$pop)
##
## Shapiro-Wilk normality test
##
## data: gapminder_2007$pop
## W = 0.25267, p-value < 2.2e-16
# Shapiro-Wilk test on the 2007 life expectancy column.
shapiro.test(gapminder_2007$lifeExp)
##
## Shapiro-Wilk normality test
##
## data: gapminder_2007$lifeExp
## W = 0.89467, p-value = 1.357e-08
# Shapiro-Wilk test on the 2007 GDP per capita column.
shapiro.test(gapminder_2007$gdpPercap)
##
## Shapiro-Wilk normality test
##
## data: gapminder_2007$gdpPercap
## W = 0.80644, p-value = 2.039e-12
# Histograms of the untransformed variables, three panels in one row.
par(mfrow = c(1, 3))
titulos <- c(
  pop = "Poblacion",
  lifeExp = "Expectativa de vida",
  gdpPercap = "Ingreso per capita"
)
ejes_x <- c(pop = "Poblacion", lifeExp = "lifeExp", gdpPercap = "gdpPercap")
for (columna in names(titulos)) {
  hist(
    gapminder_2007[[columna]],
    main = titulos[[columna]],
    xlab = ejes_x[[columna]]
  )
}
# Histograms of the data_2007 columns, three panels in one row. Each panel
# uses the same text for its title and its x-axis label.
par(mfrow = c(1, 3))
etiquetas_2007 <- c(
  pop = "Poblacion",
  lifeExp = "Expectativa de vida",
  gdpPercap = "Ingreso per capita"
)
for (columna in names(etiquetas_2007)) {
  etiqueta <- etiquetas_2007[[columna]]
  hist(data_2007[[columna]], main = etiqueta, xlab = etiqueta)
}
# Shapiro-Wilk normality tests on data_2007, which holds copies of the
# gapminder_2007 columns; first the population column.
shapiro.test(data_2007$pop)
##
## Shapiro-Wilk normality test
##
## data: data_2007$pop
## W = 0.25267, p-value < 2.2e-16
# Shapiro-Wilk test on the life expectancy column of data_2007.
shapiro.test(data_2007$lifeExp)
##
## Shapiro-Wilk normality test
##
## data: data_2007$lifeExp
## W = 0.89467, p-value = 1.357e-08
# Shapiro-Wilk test on the GDP per capita column of data_2007.
shapiro.test(data_2007$gdpPercap)
##
## Shapiro-Wilk normality test
##
## data: data_2007$gdpPercap
## W = 0.80644, p-value = 2.039e-12
# Logarithmic transformations: add natural-log versions of population and
# GDP per capita as new columns of gapminder_2007.
gapminder_2007[["log_pop"]] <- log(gapminder_2007[["pop"]])
gapminder_2007[["log_gdpPercap"]] <- log(gapminder_2007[["gdpPercap"]])
# Normality tests after the transformation: Shapiro-Wilk on log(population).
shapiro.test(gapminder_2007$log_pop)
##
## Shapiro-Wilk normality test
##
## data: gapminder_2007$log_pop
## W = 0.99249, p-value = 0.6593
# lifeExp was not transformed, so this repeats the earlier Shapiro-Wilk test
# on the same column.
shapiro.test(gapminder_2007$lifeExp)
##
## Shapiro-Wilk normality test
##
## data: gapminder_2007$lifeExp
## W = 0.89467, p-value = 1.357e-08
# Shapiro-Wilk test on log(GDP per capita).
shapiro.test(gapminder_2007$log_gdpPercap)
##
## Shapiro-Wilk normality test
##
## data: gapminder_2007$log_gdpPercap
## W = 0.95559, p-value = 0.000155
# Histograms after the log transformation (life expectancy is shown
# untransformed), three panels in one row.
par(mfrow = c(1, 3))
paneles <- data.frame(
  columna = c("log_pop", "lifeExp", "log_gdpPercap"),
  titulo = c("Log(Poblacion)", "Expectativa de vida", "Log(Ingreso per capita)"),
  eje_x = c("Log(Pop)", "lifeExp", "Log(gdpPercap)"),
  stringsAsFactors = FALSE
)
for (i in seq_len(nrow(paneles))) {
  hist(
    gapminder_2007[[paneles$columna[i]]],
    main = paneles$titulo[i],
    xlab = paneles$eje_x[i]
  )
}
library(bestNormalize)
## Warning: package 'bestNormalize' was built under R version 4.3.3
# Load the autotrader data and draw a reproducible random sample of 5000 rows
# (shapiro.test() accepts at most 5000 observations).
data("autotrader")
set.seed(123)
filas_muestra <- sample(nrow(autotrader), 5000)
sampled_data <- autotrader[filas_muestra, ]
# Coerce both analysis columns to numeric, going through character first.
for (columna in c("mileage", "price")) {
  sampled_data[[columna]] <- as.numeric(as.character(sampled_data[[columna]]))
}
# Normality of the raw variables: one row per variable; the Shapiro_Wilk
# column stores the p-value of the Shapiro-Wilk test.
variables_originales <- c("mileage", "price")
shapiro_test_results <- data.frame(
  Variable = variables_originales,
  Shapiro_Wilk = vapply(
    variables_originales,
    function(nombre) shapiro.test(sampled_data[[nombre]])$p.value,
    numeric(1),
    USE.NAMES = FALSE # keep the default 1..n row names in the data frame
  )
)
shapiro_test_results
## Variable Shapiro_Wilk
## 1 mileage 5.063667e-54
## 2 price 1.222240e-36
# Transformations: natural log of (value + 1) for mileage and price, stored
# as new columns of sampled_data.
sampled_data[["mileage_log"]] <- log(sampled_data[["mileage"]] + 1)
sampled_data[["price_log"]] <- log(sampled_data[["price"]] + 1)
# Shapiro-Wilk test on the transformed variables; the Shapiro_Wilk column
# stores the p-value of each test.
p_mileage_log <- shapiro.test(sampled_data$mileage_log)$p.value
p_price_log <- shapiro.test(sampled_data$price_log)$p.value
shapiro_trans <- data.frame(
  Variable = c("mileage_log", "price_log"),
  Shapiro_Wilk = c(p_mileage_log, p_price_log)
)
shapiro_trans
## Variable Shapiro_Wilk
## 1 mileage_log 1.441335e-45
## 2 price_log 2.335904e-36