# Load the rates table and keep columns 2 to 6, the ones passed to every
# normality test below (printed as TN, TM, EV, EVM and EVH in the output).
Tasas <- read_csv("Tasas.csv")
numeric_data <- Tasas[, 2:6]

# Univariate normality tests ----
# Each call runs one univariate test on every column through mvn()
# (presumably from the MVN package, attached outside this excerpt).
# `desc = TRUE` is written out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved constant.

# 1. Shapiro-Wilk (per the original note, intended for samples with fewer
#    than 30 observations)
SW <- mvn(numeric_data, univariateTest = "SW", desc = TRUE)
SW$univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Wilk TN 0.9233 <0.001 NO
## 2 Shapiro-Wilk TM 0.9589 <0.001 NO
## 3 Shapiro-Wilk EV 0.9776 0.0034 NO
## 4 Shapiro-Wilk EVM 0.9707 4e-04 NO
## 5 Shapiro-Wilk EVH 0.9804 0.008 NO
# 2. Cramer-von Mises
CVM <- mvn(numeric_data, univariateTest = "CVM", desc = TRUE)
CVM$univariateNormality
## Test Variable Statistic p value Normality
## 1 Cramer-von Mises TN 0.8156 <0.001 NO
## 2 Cramer-von Mises TM 0.4518 <0.001 NO
## 3 Cramer-von Mises EV 0.1192 0.061 YES
## 4 Cramer-von Mises EVM 0.2454 0.0014 NO
## 5 Cramer-von Mises EVH 0.0812 0.1989 YES
# 3. Lilliefors (Kolmogorov correction)
L <- mvn(numeric_data, univariateTest = "Lillie", desc = TRUE)
L$univariateNormality
## Test Variable Statistic p value Normality
## 1 Lilliefors (Kolmogorov-Smirnov) TN 0.1290 <0.001 NO
## 2 Lilliefors (Kolmogorov-Smirnov) TM 0.0859 0.0014 NO
## 3 Lilliefors (Kolmogorov-Smirnov) EV 0.0576 0.1199 YES
## 4 Lilliefors (Kolmogorov-Smirnov) EVM 0.0745 0.0107 NO
## 5 Lilliefors (Kolmogorov-Smirnov) EVH 0.0613 0.0728 YES
# 4. Shapiro-Francia
SF <- mvn(numeric_data, univariateTest = "SF", desc = TRUE)
SF$univariateNormality
## Test Variable Statistic p value Normality
## 1 Shapiro-Francia TN 0.9267 <0.001 NO
## 2 Shapiro-Francia TM 0.9588 1e-04 NO
## 3 Shapiro-Francia EV 0.9809 0.0112 NO
## 4 Shapiro-Francia EVM 0.9736 0.0015 NO
## 5 Shapiro-Francia EVH 0.9835 0.0237 NO
# 5. Anderson-Darling
AD <- mvn(numeric_data, univariateTest = "AD", desc = TRUE)
AD$univariateNormality
## Test Variable Statistic p value Normality
## 1 Anderson-Darling TN 4.9804 <0.001 NO
## 2 Anderson-Darling TM 2.6962 <0.001 NO
## 3 Anderson-Darling EV 0.9374 0.0172 NO
## 4 Anderson-Darling EVM 1.5290 6e-04 NO
## 5 Anderson-Darling EVH 0.8141 0.0347 NO
# Multivariate normality tests ----
# Every call below tests the same five columns jointly; only the test
# selected through `mvnTest` changes.

# 1. Mardia (skewness and kurtosis)
Mardia <- mvn(data = numeric_data, mvnTest = "mardia")
Mardia[["multivariateNormality"]]
## Test Statistic p value Result
## 1 Mardia Skewness 201.323801536089 2.98182323473629e-25 NO
## 2 Mardia Kurtosis 4.20204709407368 2.64512086503021e-05 NO
## 3 MVN <NA> <NA> NO
# 2. Henze-Zirkler
HZ <- mvn(data = numeric_data, mvnTest = "hz")
HZ[["multivariateNormality"]]
## Test HZ p value MVN
## 1 Henze-Zirkler 2.294973 0 NO
# 3. Royston
Royston <- mvn(data = numeric_data, mvnTest = "royston")
Royston[["multivariateNormality"]]
## Test H p value MVN
## 1 Royston 25.55369 1.527186e-06 NO
# 4. Doornik-Hansen
# `mvnTest = "dh"` selects the Doornik-Hansen test; passing "royston" here
# would only repeat test 3 under a different object name.
DH <- mvn(numeric_data, mvnTest = "dh")
DH$multivariateNormality
## (The output recorded here belonged to the Royston test and was removed;
## re-run this chunk to print the Doornik-Hansen statistic and p-value.)
# 5. Energy test
Energy <- mvn(data = numeric_data, mvnTest = "energy")
Energy[["multivariateNormality"]]
## Test Statistic p value MVN
## 1 E-statistic 3.151856 0 NO

# Pairwise plot matrix of the numeric columns, as a visual check.
library(GGally)
ggpairs(data = numeric_data)
# Keep only columns 3 and 5 (presumably EV and EVH, going by the order in
# the printed test tables above; confirm against the CSV header).
numeric_data_sig <- numeric_data[, c(3, 5)]
Podemos notar que, bajo las pruebas de normalidad univariadas de
Cramér-von Mises y Lilliefors, las variables EV: esperanza de vida y EVH:
esperanza de vida en hombres son normales, mientras que en las pruebas
de normalidad multivariada no lo son. El que estas variables resulten
normales en las pruebas univariadas no nos debe sorprender, dado que si
revisamos los gráficos se puede apreciar que estas variables tienen una
distribución similar a la normal.
# Make the gapminder data set available.
# The GitHub install runs only when the package is missing: attaching the
# package before installing it fails on a fresh machine, and reinstalling on
# every run needs network access and rebuilds the package each time.
library(devtools)
if (!requireNamespace("gapminder", quietly = TRUE)) {
  devtools::install_github("jennybc/gapminder")
}
library(gapminder)
## Build log recorded when the installation step ran:
##
## ── R CMD build ─────────────────────────────────────────────────────────────────
## checking for file 'C:\Users\yelia\AppData\Local\Temp\RtmpqUPmGM\remotes29f87491278f\jennybc-gapminder-5325048/DESCRIPTION' ... ✔ checking for file 'C:\Users\yelia\AppData\Local\Temp\RtmpqUPmGM\remotes29f87491278f\jennybc-gapminder-5325048/DESCRIPTION' (359ms)
## ─ preparing 'gapminder': (1.6s)
## checking DESCRIPTION meta-information ... ✔ checking DESCRIPTION meta-information
## ─ checking for LF line-endings in source and make files and shell scripts (471ms)
## ─ checking for empty or unneeded directories
## ─ building 'gapminder_1.0.0.9000.tar.gz'
##
##
# Copy the packaged data set into a workspace object of the same name; the
# later chunks read it through this object.
gapminder <- gapminder::gapminder
Trabaja en la transformación de las variables población (pop), expectativa de vida (lifeExp) e ingreso per cápita (gdpPercap) para un año cualquiera en específico.
# Data: population, life expectancy and GDP per capita, restricted to the
# rows whose year is 1997. The year filter is applied once and the three
# columns are then picked from that subset.
data1 <- with(
  gapminder[gapminder$year == 1997, ],
  data.frame(pob = pop, lf = lifeExp, gdp = gdpPercap)
)
# Analytical tests: Shapiro-Wilk on each raw variable.
# Wrapping an assignment in parentheses stores the result and prints it in
# one step. The `data1$...` expressions are kept verbatim because they are
# echoed in the "data:" line of each printout.
(SW_pob <- shapiro.test(data1$pob))
##
## Shapiro-Wilk normality test
##
## data: data1$pob
## W = 0.24725, p-value < 2.2e-16
(SW_lf <- shapiro.test(data1$lf))
##
## Shapiro-Wilk normality test
##
## data: data1$lf
## W = 0.91387, p-value = 1.662e-07
(SW_gdp <- shapiro.test(data1$gdp))
##
## Shapiro-Wilk normality test
##
## data: data1$gdp
## W = 0.79607, p-value = 8.794e-13
# Graphical checks: normal Q-Q plots of the three raw variables, drawn side
# by side in a 1 x 3 layout, each with a blue reference line.
# par() returns the settings it replaces; they are restored at the end so
# the three-panel layout does not leak into plots drawn later.
old_par <- par(mfrow = c(1, 3))
qqnorm(data1$pob, main = " ", xlab = "Poblacion", ylab = " ")
qqline(data1$pob, col = "blue", lwd = 2)
qqnorm(data1$lf, main = " ", xlab = "Expectativa de Vida", ylab = "")
qqline(data1$lf, col = "blue", lwd = 2)
qqnorm(data1$gdp, main = " ", xlab = " Ingreso Per Cápita", ylab = " ")
qqline(data1$gdp, col = "blue", lwd = 2)
par(old_par)
library(bestNormalize)
# Pick the best normalizing transformation for each variable. Each result is
# stored and printed in one step.
# NOTE(review): the printouts report estimates obtained by cross-validation,
# so the figures may differ between runs unless a seed is fixed. Confirm
# before relying on exact values.
(best_trans_pob <- bestNormalize(data1[["pob"]]))
## Best Normalizing transformation with 142 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 1.181
## - Box-Cox: 1.1928
## - Center+scale: 8.0004
## - Double Reversed Log_b(x+a): 6.7919
## - Log_b(x+a): 1.181
## - orderNorm (ORQ): 1.3124
## - sqrt(x + a): 3.1078
## - Yeo-Johnson: 1.1928
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## Standardized asinh(x) Transformation with 142 nonmissing obs.:
## Relevant statistics:
## - mean (before standardization) = 16.8141
## - sd (before standardization) = 1.537222
(best_trans_lf <- bestNormalize(data1[["lf"]]))
## Best Normalizing transformation with 142 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 2.6971
## - Box-Cox: 1.8792
## - Center+scale: 2.1154
## - Double Reversed Log_b(x+a): 1.467
## - Exp(x): 14.5798
## - Log_b(x+a): 2.6971
## - orderNorm (ORQ): 1.3829
## - sqrt(x + a): 2.395
## - Yeo-Johnson: 1.8621
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 142 nonmissing obs and no ties
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 36.087 55.634 69.394 74.170 80.690
(best_trans_gdp <- bestNormalize(data1[["gdp"]]))
## Best Normalizing transformation with 142 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 1.451
## - Box-Cox: 1.4072
## - Center+scale: 4.3177
## - Double Reversed Log_b(x+a): 5.128
## - Log_b(x+a): 1.451
## - orderNorm (ORQ): 1.3451
## - sqrt(x + a): 2.0385
## - Yeo-Johnson: 1.4015
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 142 nonmissing obs and no ties
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 312.188 1366.838 4781.825 12022.867 41283.164
# Transformations: take the transformed values (`x.t`) out of each fitted
# object and repeat the Shapiro-Wilk test on them. The `x*_trans` names are
# kept because they are echoed in the "data:" line of each printout.
x1_trans <- best_trans_pob[["x.t"]]
(SW_pobt <- shapiro.test(x1_trans))
##
## Shapiro-Wilk normality test
##
## data: x1_trans
## W = 0.99251, p-value = 0.6621
x2_trans <- best_trans_lf[["x.t"]]
(SW_lft <- shapiro.test(x2_trans))
##
## Shapiro-Wilk normality test
##
## data: x2_trans
## W = 0.99968, p-value = 1
x3_trans <- best_trans_gdp[["x.t"]]
(SW_gpdt <- shapiro.test(x3_trans))
##
## Shapiro-Wilk normality test
##
## data: x3_trans
## W = 0.99968, p-value = 1
Luego de haber elegido la mejor opción para normalizar los
datos en cada variable, al realizar la prueba de Shapiro-Wilk podemos notar
que las tres variables (pob: población, lf: expectativa de vida y gdp:
ingreso per cápita, para 1997) se llegaron a normalizar, con p-valores de
0.6621, 1 y 1 respectivamente. Esto incluye a gdp, a pesar de que en las
pruebas gráficas esta variable muestra, antes de la transformación, una
distribución muy diferente de la normal.
library(bestNormalize)
# Load the `autotrader` data set and keep three of its columns under shorter
# names: mileage, model year and price.
data("autotrader")
data2 <- with(
  autotrader,
  data.frame(mil = mileage, year = Year, price = price)
)
# Analytical tests: Anderson-Darling on each raw variable.
# Each result is stored and printed in one step; the `data2$...` expressions
# are kept verbatim because they are echoed in the "data:" line.
library(nortest)
(AD_mil <- ad.test(data2$mil))
##
## Anderson-Darling normality test
##
## data: data2$mil
## A = 271.27, p-value < 2.2e-16
(AD_year <- ad.test(data2$year))
##
## Anderson-Darling normality test
##
## data: data2$year
## A = 399.77, p-value < 2.2e-16
(AD_price <- ad.test(data2$price))
##
## Anderson-Darling normality test
##
## data: data2$price
## A = 49.948, p-value < 2.2e-16
# Graphical checks: normal Q-Q plots of the three raw variables, drawn side
# by side in a 1 x 3 layout, each with a blue reference line.
# par() returns the settings it replaces; they are restored at the end so
# the three-panel layout does not leak into plots drawn later.
# The first axis label is spelled "Mileage" (it previously read "Milleage").
old_par <- par(mfrow = c(1, 3))
qqnorm(data2$mil, main = " ", xlab = "Mileage", ylab = " ")
qqline(data2$mil, col = "blue", lwd = 2)
qqnorm(data2$year, main = " ", xlab = "Year", ylab = "")
qqline(data2$year, col = "blue", lwd = 2)
qqnorm(data2$price, main = " ", xlab = "Price", ylab = " ")
qqline(data2$price, col = "blue", lwd = 2)
par(old_par)
# Find the best normalizing transformation for each variable. Each result is
# stored and printed in one step.
# NOTE(review): the printouts report estimates obtained by cross-validation,
# so the figures may differ between runs unless a seed is fixed. Confirm
# before relying on exact values.
(best_trans_mil <- bestNormalize(data2[["mil"]]))
## Best Normalizing transformation with 6283 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 3.3971
## - Box-Cox: 2.9883
## - Center+scale: 14.8266
## - Double Reversed Log_b(x+a): 23.1766
## - Log_b(x+a): 3.4049
## - orderNorm (ORQ): 1.1013
## - sqrt(x + a): 5.052
## - Yeo-Johnson: 2.9926
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
## - 6077 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 2 29099 44800 88950 325556
(best_trans_year <- bestNormalize(data2[["year"]]))
## Best Normalizing transformation with 6283 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 83.567
## - Box-Cox: 83.567
## - Center+scale: 83.567
## - Double Reversed Log_b(x+a): 83.3381
## - Log_b(x+a): 83.567
## - orderNorm (ORQ): 81.312
## - sqrt(x + a): 83.567
## - Yeo-Johnson: 83.5876
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
## - 17 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 2000 2010 2013 2014 2016
(best_trans_price <- bestNormalize(data2[["price"]]))
## Best Normalizing transformation with 6283 Observations
## Estimated Normality Statistics (Pearson P / df, lower => more normal):
## - arcsinh(x): 3.9536
## - Box-Cox: 2.2104
## - Center+scale: 3.5508
## - Double Reversed Log_b(x+a): 6.6272
## - Log_b(x+a): 3.9536
## - orderNorm (ORQ): 1.1401
## - sqrt(x + a): 2.2815
## - Yeo-Johnson: 2.2109
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
## - 2465 unique values
## - Original quantiles:
## 0% 25% 50% 75% 100%
## 722 11499 15998 21497 64998
# Transformations: take the transformed values (`x.t`) out of each fitted
# object and repeat the Anderson-Darling test on them. The `x*_trans` names
# are kept because they are echoed in the "data:" line of each printout.
x1_trans <- best_trans_mil[["x.t"]]
(AD_Tmil <- ad.test(x1_trans))
##
## Anderson-Darling normality test
##
## data: x1_trans
## A = 0.00032915, p-value = 1
x2_trans <- best_trans_year[["x.t"]]
(AD_Tyear <- ad.test(x2_trans))
##
## Anderson-Darling normality test
##
## data: x2_trans
## A = 98.365, p-value < 2.2e-16
x3_trans <- best_trans_price[["x.t"]]
(AD_Tprice <- ad.test(x3_trans))
##
## Anderson-Darling normality test
##
## data: x3_trans
## A = 0.01806, p-value = 1
Luego de haber elegido la mejor opción para normalizar los
datos en cada variable, al realizar la prueba de Anderson-Darling
podemos notar que las variables mil: mileage y price: precio se llegaron
a normalizar. Sin embargo, la variable year: antigüedad no se pudo
normalizar. Esto no nos debe sorprender ya que, como podemos notar en el
gráfico, esta variable tiene una dispersión muy extraña: solo toma 17
valores distintos, por lo que ninguna transformación puede volverla continua.