Analice la normalidad de la base de datos Tasas, la cual cuenta con
5 variables (TN: tasa de natalidad, TM: tasa de mortalidad, EV:
esperanza de vida, EVM: esperanza de vida en mujeres y EVH: esperanza de
vida en hombres), medidas en 194 países alrededor del mundo.
# Multivariate normality of the Tasas data, checked with five tests from the
# MVN package. The first column of `data` is dropped before every test
# (presumably the country identifier -- confirm against Tasas.csv).
library(MVN)
data <- read.csv(file = "Tasas.csv")
# Test 1 of 5: Mardia (multivariate skewness and kurtosis)
Mardia <- mvn(data = data[, -1], mvnTest = "mardia")
Mardia[["multivariateNormality"]]
## Test Statistic p value Result
## 1 Mardia Skewness 201.323801536089 2.98182323473711e-25 NO
## 2 Mardia Kurtosis 4.20204709406389 2.64512086514124e-05 NO
## 3 MVN <NA> <NA> NO
# Test 2 of 5: Henze-Zirkler
HZ <- mvn(data = data[, -1], mvnTest = "hz")
HZ[["multivariateNormality"]]
## Test HZ p value MVN
## 1 Henze-Zirkler 2.294973 0 NO
# Test 3 of 5: Royston
Royston <- mvn(data = data[, -1], mvnTest = "royston")
Royston[["multivariateNormality"]]
## Test H p value MVN
## 1 Royston 25.55369 1.527186e-06 NO
# Test 4 of 5: Doornik-Hansen
DH <- mvn(data = data[, -1], mvnTest = "dh")
DH[["multivariateNormality"]]
## Test E df p value MVN
## 1 Doornik-Hansen 239.7374 10 7.77862e-46 NO
# Test 5 of 5: Energy
Energy <- mvn(data = data[, -1], mvnTest = "energy")
Energy[["multivariateNormality"]]
## Test Statistic p value MVN
## 1 E-statistic 3.151856 0 NO
## Después de comprobar con diferentes pruebas, concluyo que los datos no siguen una distribución normal multivariada
# Scatterplot matrix of every column of `data` except the first one
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(data = data[, -1])

# Univariate normality tests, run through MVN::mvn() with the univariateTest
# argument. The first column of `data` is dropped before each call, and
# desc = TRUE additionally requests the descriptive statistics table.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
# 1. Shapiro-Wilk
SW <- mvn(data[, -1], univariateTest = "SW", desc = TRUE)
# Printing the whole object shows the multivariate result, the univariate
# tests and the descriptives
SW
## $multivariateNormality
## Test HZ p value MVN
## 1 Henze-Zirkler 2.294973 0 NO
##
## $univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk TN 0.9233 <0.001 NO
## 2 Shapiro-Wilk TM 0.9589 <0.001 NO
## 3 Shapiro-Wilk EV 0.9776 0.0034 NO
## 4 Shapiro-Wilk EVM 0.9707 4e-04 NO
## 5 Shapiro-Wilk EVH 0.9804 0.008 NO
##
## $Descriptives
## n Mean Std.Dev Median Min Max 25th 75th Skew
## TN 194 19.06469 9.872522 16.885 5.00 45.29 10.4250 27.2400 0.65598906
## TM 194 8.57701 3.074472 8.090 1.31 18.40 6.6150 9.9975 0.72290852
## EV 194 71.31052 7.764204 71.860 52.53 85.60 65.6700 76.8050 -0.23215863
## EVM 194 74.01974 7.876189 75.165 53.07 87.90 68.4375 79.5975 -0.39876259
## EVH 194 68.73345 7.767232 68.700 50.37 84.10 63.0850 73.6750 -0.07285581
## Kurtosis
## TN -0.6175703
## TM 0.6721286
## EV -0.6532162
## EVM -0.5952345
## EVH -0.6863484
# 2. Lilliefors (Kolmogorov-Smirnov correction)
L <- mvn(data[, -1], univariateTest = "Lillie", desc = TRUE)
L$univariateNormality
## Test Variable Statistic p value Normality
## 1 Lilliefors (Kolmogorov-Smirnov) TN 0.1290 <0.001 NO
## 2 Lilliefors (Kolmogorov-Smirnov) TM 0.0859 0.0014 NO
## 3 Lilliefors (Kolmogorov-Smirnov) EV 0.0576 0.1199 YES
## 4 Lilliefors (Kolmogorov-Smirnov) EVM 0.0745 0.0107 NO
## 5 Lilliefors (Kolmogorov-Smirnov) EVH 0.0613 0.0728 YES
# 3. Shapiro-Francia
SF <- mvn(data[, -1], univariateTest = "SF", desc = TRUE)
SF$univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Francia TN 0.9267 <0.001 NO
## 2 Shapiro-Francia TM 0.9588 1e-04 NO
## 3 Shapiro-Francia EV 0.9809 0.0112 NO
## 4 Shapiro-Francia EVM 0.9736 0.0015 NO
## 5 Shapiro-Francia EVH 0.9835 0.0237 NO
# 4. Anderson-Darling
AD <- mvn(data[, -1], univariateTest = "AD", desc = TRUE)
AD$univariateNormality
## Test Variable Statistic p value Normality
## 1 Anderson-Darling TN 4.9804 <0.001 NO
## 2 Anderson-Darling TM 2.6962 <0.001 NO
## 3 Anderson-Darling EV 0.9374 0.0172 NO
## 4 Anderson-Darling EVM 1.5290 6e-04 NO
## 5 Anderson-Darling EVH 0.8141 0.0347 NO
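## Después de varias pruebas, concluyo que las variables no son normales: Shapiro-Wilk, Shapiro-Francia y Anderson-Darling rechazan la normalidad de las cinco variables; solo Lilliefors no la rechaza para EV y EVH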
# Attach gapminder, installing the development version from GitHub only when
# the package is missing. Guarding the install keeps the script from making a
# network call on every run, and attaching the package after the guard lets
# the script work on a machine where gapminder is not installed yet.
library(devtools)
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.3.3
if (!requireNamespace("gapminder", quietly = TRUE)) {
  devtools::install_github("jennybc/gapminder")
}
library(gapminder)
# Load the data: take the gapminder data frame exported by the package
gapminder <- gapminder::gapminder
# Keep the 1982 rows only and store population, life expectancy and GDP per
# capita under short column names
Data <- with(
  gapminder[gapminder$year == 1982, ],
  data.frame(pob = pop, lf = lifeExp, gdp = gdpPercap)
)
# Mardia's multivariate normality test on the three variables
Mardia <- mvn(data = Data, mvnTest = "mardia")
Mardia[["multivariateNormality"]]
## Test Statistic p value Result
## 1 Mardia Skewness 1433.19089827176 6.7587006978414e-302 NO
## 2 Mardia Kurtosis 65.7769715987007 0 NO
## 3 MVN <NA> <NA> NO
# Univariate Shapiro-Wilk test on each untransformed column of Data.
# The call expressions are left exactly as written because shapiro.test()
# reports the expression it was given in the "data:" line of its output.
# NOTE(review): bestNormalize is attached here although the Shapiro-Wilk
# calls in this section do not need it.
library(bestNormalize)
# Population (pob)
SW1 <- shapiro.test(Data$pob)
SW1
##
## Shapiro-Wilk normality test
##
## data: Data$pob
## W = 0.24637, p-value < 2.2e-16
# Life expectancy (lf)
SW2 <- shapiro.test(Data$lf)
SW2
##
## Shapiro-Wilk normality test
##
## data: Data$lf
## W = 0.94091, p-value = 1.059e-05
# GDP per capita (gdp)
SW3 <- shapiro.test(Data$gdp)
SW3
##
## Shapiro-Wilk normality test
##
## data: Data$gdp
## W = 0.83081, p-value = 1.676e-11
# Let bestNormalize() pick a normalizing transformation for each column of
# Data and print its report. Each report lists the candidate transformations
# with their estimated normality statistic (lower means more normal) and the
# transformation that was chosen.
# NOTE(review): the reports say the statistics are estimated out-of-sample via
# cross-validation (10 folds, 5 repeats); the folds are presumably drawn at
# random, so the numbers may change between runs unless a seed is set -- confirm.
library(bestNormalize)
# Population (pob): Box-Cox is chosen in the report below
best_trans1 <- bestNormalize(Data$pob)
best_trans1
## Best Normalizing transformation with 142 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 0.8979
## - Box-Cox: 0.875
## - Center+scale: 7.6328
## - Double Reversed Log_b(x+a): 6.6672
## - Log_b(x+a): 0.8979
## - orderNorm (ORQ): 0.987
## - sqrt(x + a): 2.8541
## - Yeo-Johnson: 0.875
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## Standardized Box Cox Transformation with 142 nonmissing obs.:
## Estimated statistics:
## - lambda = -0.02564246
## - mean (before standardization) = 12.99403
## - sd (before standardization) = 1.038111
# Life expectancy (lf): orderNorm is chosen in the report below
best_trans2 <- bestNormalize(Data$lf)
best_trans2
## Best Normalizing transformation with 142 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 1.7448
## - Box-Cox: 1.5314
## - Center+scale: 1.4762
## - Double Reversed Log_b(x+a): 1.3813
## - Exp(x): 14.3223
## - Log_b(x+a): 1.7448
## - orderNorm (ORQ): 1.1364
## - sqrt(x + a): 1.6267
## - Yeo-Johnson: 1.5314
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 142 nonmissing obs and no ties
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 38.445 52.940 62.442 70.921 77.110
# GDP per capita (gdp): its report follows the printed object
best_trans3 <- bestNormalize(Data$gdp)
best_trans3
## Best Normalizing transformation with 142 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 1.2632
## - Box-Cox: 1.2343
## - Center+scale: 3.7349
## - Double Reversed Log_b(x+a): 4.6484
## - Log_b(x+a): 1.2632
## - orderNorm (ORQ): 1.2091
## - sqrt(x + a): 1.557
## - Yeo-Johnson: 1.2343
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 142 nonmissing obs and no ties
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 424.000 1363.339 4216.228 12347.954 33693.175
# Repeat the Shapiro-Wilk test on the transformed values, read from the x.t
# element of each bestNormalize result.
# The call expressions are left exactly as written because shapiro.test()
# reports the expression it was given in the "data:" line of its output.
# NOTE(review): MVN is attached here although the shapiro.test() calls in
# this section do not need it.
library(MVN)
# Transformed population
SW1_trans <- shapiro.test(best_trans1$x.t)
SW1_trans
##
## Shapiro-Wilk normality test
##
## data: best_trans1$x.t
## W = 0.99362, p-value = 0.7824
# Transformed life expectancy
SW2_trans <- shapiro.test(best_trans2$x.t)
SW2_trans
##
## Shapiro-Wilk normality test
##
## data: best_trans2$x.t
## W = 0.99968, p-value = 1
# Transformed GDP per capita
SW3_trans <- shapiro.test(best_trans3$x.t)
SW3_trans
##
## Shapiro-Wilk normality test
##
## data: best_trans3$x.t
## W = 0.99968, p-value = 1
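## Con lo que aprendí en clase, normalicé las variables. bestNormalize escogió la transformación Box-Cox para pob y orderNorm para lf y gdp; tras transformar, Shapiro-Wilk ya no rechaza la normalidad de ninguna de las tres variables (p = 0.78, 1 y 1)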
# Second data set: autotrader, loaded with data() after attaching bestNormalize
library(bestNormalize)
data(list = "autotrader")
names(autotrader)
## [1] "Car_Info" "Link" "Make" "Year" "Location" "Radius"
## [7] "price" "mileage" "status" "model"
# Keep mileage, year and price under short column names
Data3 <- with(
  autotrader,
  data.frame(kilom = mileage, ant = Year, precio = price)
)
# Anderson-Darling test on each untransformed variable; the results are
# stored but not printed
library(nortest)
AD_kilom <- ad.test(Data3$kilom)
AD_ant <- ad.test(Data3$ant)
AD_precio <- ad.test(Data3$precio)
# Let bestNormalize() pick a normalizing transformation for each column of
# Data3 and print its report.
# NOTE(review): the reports say the statistics are estimated out-of-sample via
# cross-validation (10 folds, 5 repeats); the folds are presumably drawn at
# random, so the numbers may change between runs unless a seed is set -- confirm.
library(bestNormalize)
# Mileage (kilom)
best_transk <- bestNormalize(Data3$kilom)
## Warning: `progress_estimated()` was deprecated in dplyr 1.0.0.
## ℹ The deprecated feature was likely used in the bestNormalize package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
best_transk
## Best Normalizing transformation with 6283 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 3.3649
## - Box-Cox: 3.0301
## - Center+scale: 14.7426
## - Double Reversed Log_b(x+a): 23.2558
## - Log_b(x+a): 3.3699
## - orderNorm (ORQ): 1.1326
## - sqrt(x + a): 5.0516
## - Yeo-Johnson: 3.0237
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
## - 6077 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 2 29099 44800 88950 325556
# Year (ant): the report below shows only 17 unique values and nearly
# identical statistics for every candidate transformation
best_transa <- bestNormalize(Data3$ant)
best_transa
## Best Normalizing transformation with 6283 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 83.562
## - Box-Cox: 83.562
## - Center+scale: 83.562
## - Double Reversed Log_b(x+a): 83.3492
## - Log_b(x+a): 83.562
## - orderNorm (ORQ): 81.3902
## - sqrt(x + a): 83.562
## - Yeo-Johnson: 83.5797
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
## - 17 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 2000 2010 2013 2014 2016
# Price (precio): its report follows the printed object
best_transp <- bestNormalize(Data3$precio)
best_transp
## Best Normalizing transformation with 6283 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 4.0973
## - Box-Cox: 2.2405
## - Center+scale: 3.5981
## - Double Reversed Log_b(x+a): 6.6707
## - Log_b(x+a): 4.0973
## - orderNorm (ORQ): 1.2773
## - sqrt(x + a): 2.2668
## - Yeo-Johnson: 2.2405
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
## - 2465 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 722 11499 15998 21497 64998
# Anderson-Darling test on the transformed values, read from the x.t element
# of each bestNormalize result.
# The call expressions are left exactly as written because ad.test() reports
# the expression it was given in the "data:" line of its output.
# Transformed mileage
trans_kilom <- ad.test(best_transk$x.t)
trans_kilom
##
## Anderson-Darling normality test
##
## data: best_transk$x.t
## A = 0.00032915, p-value = 1
# Transformed year: normality is still rejected in the output below
trans_ant <- ad.test(best_transa$x.t)
trans_ant
##
## Anderson-Darling normality test
##
## data: best_transa$x.t
## A = 98.365, p-value < 2.2e-16
# Transformed price
trans_precio <- ad.test(best_transp$x.t)
trans_precio
##
## Anderson-Darling normality test
##
## data: best_transp$x.t
## A = 0.01806, p-value = 1
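## bestNormalize escogió la transformación orderNorm para las tres variables. Tras transformar, kilom y precio ya son normales según Anderson-Darling (p = 1), pero ant sigue sin serlo (A = 98.365, p < 2.2e-16), probablemente porque solo tiene 17 valores únicos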