EJERCICIO 1

Pruebas de normalidad multivariada

library(MVN)
## Warning: package 'MVN' was built under R version 4.3.3
# Read the rates table from the working directory and preview its first rows.
data <- read.csv(file = "Tasas.csv")
head(data, n = 6L)
##               Paises    TN    TM    EV   EVM   EVH
## 1        Afganistan  35.84  7.34 61.98 65.28 58.92
## 2           Albania   8.90  8.60 75.50 77.70 73.60
## 3          Alemania   8.80 12.70 80.70 83.30 78.40
## 4           Andorra   6.20  4.60 83.70 86.00 81.30
## 5            Angola  38.81  8.01 61.64 64.31 59.03
## 6 Antigua y Barbuda  12.12  6.37 78.50 80.94 75.78
# Mardia's multivariate skewness and kurtosis test on every column except the
# first one (the country-name column).
Mardia <- mvn(data = data[, -1], mvnTest = "mardia")
Mardia[["multivariateNormality"]]
##              Test        Statistic              p value Result
## 1 Mardia Skewness 201.323801536089 2.98182323473629e-25     NO
## 2 Mardia Kurtosis 4.20204709407368 2.64512086503021e-05     NO
## 3             MVN             <NA>                 <NA>     NO
# Henze-Zirkler multivariate normality test on every column except the first.
HZ <- mvn(data = data[, -1], mvnTest = "hz")
HZ[["multivariateNormality"]]
##            Test       HZ p value MVN
## 1 Henze-Zirkler 2.294973       0  NO
# Royston multivariate normality test on every column except the first.
Royston <- mvn(data = data[, -1], mvnTest = "royston")
Royston[["multivariateNormality"]]
##      Test        H      p value MVN
## 1 Royston 25.55369 1.527186e-06  NO
# Doornik-Hansen multivariate normality test on every column except the first.
DH <- mvn(data = data[, -1], mvnTest = "dh")
DH[["multivariateNormality"]]
##             Test        E df     p value MVN
## 1 Doornik-Hansen 239.7374 10 7.77862e-46  NO

# Después de aplicar varias pruebas de normalidad multivariada (Mardia, Henze-Zirkler, Royston y Doornik-Hansen), todas rechazan la hipótesis nula (valores p < 0.05), por lo que concluimos que los datos NO siguen una distribución normal multivariada.

pruebas de normalidad univariada

# Shapiro-Wilk univariate normality test for every column except the first,
# with descriptive statistics. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned, while `TRUE` is a reserved word.
SW <- mvn(data[, -1], univariateTest = "SW", desc = TRUE)
# Printing the whole object also shows the default multivariate test and the
# descriptives table, not only the univariate results.
SW
## $multivariateNormality
##            Test       HZ p value MVN
## 1 Henze-Zirkler 2.294973       0  NO
## 
## $univariateNormality
##           Test  Variable Statistic   p value Normality
## 1 Shapiro-Wilk    TN        0.9233  <0.001      NO    
## 2 Shapiro-Wilk    TM        0.9589  <0.001      NO    
## 3 Shapiro-Wilk    EV        0.9776  0.0034      NO    
## 4 Shapiro-Wilk    EVM       0.9707   4e-04      NO    
## 5 Shapiro-Wilk    EVH       0.9804   0.008      NO    
## 
## $Descriptives
##       n     Mean  Std.Dev Median   Min   Max    25th    75th        Skew
## TN  194 19.06469 9.872522 16.885  5.00 45.29 10.4250 27.2400  0.65598906
## TM  194  8.57701 3.074472  8.090  1.31 18.40  6.6150  9.9975  0.72290852
## EV  194 71.31052 7.764204 71.860 52.53 85.60 65.6700 76.8050 -0.23215863
## EVM 194 74.01974 7.876189 75.165 53.07 87.90 68.4375 79.5975 -0.39876259
## EVH 194 68.73345 7.767232 68.700 50.37 84.10 63.0850 73.6750 -0.07285581
##       Kurtosis
## TN  -0.6175703
## TM   0.6721286
## EV  -0.6532162
## EVM -0.5952345
## EVH -0.6863484
# Shapiro-Francia univariate normality test for every column except the first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, while `TRUE` is a reserved word.
SF <- mvn(data[, -1], univariateTest = "SF", desc = TRUE)
SF$univariateNormality
##              Test  Variable Statistic   p value Normality
## 1 Shapiro-Francia    TN        0.9267  <0.001      NO    
## 2 Shapiro-Francia    TM        0.9588   1e-04      NO    
## 3 Shapiro-Francia    EV        0.9809  0.0112      NO    
## 4 Shapiro-Francia    EVM       0.9736  0.0015      NO    
## 5 Shapiro-Francia    EVH       0.9835  0.0237      NO
# Anderson-Darling univariate normality test for every column except the first.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, while `TRUE` is a reserved word.
AD <- mvn(data[, -1], univariateTest = "AD", desc = TRUE)
AD$univariateNormality
##               Test  Variable Statistic   p value Normality
## 1 Anderson-Darling    TN        4.9804  <0.001      NO    
## 2 Anderson-Darling    TM        2.6962  <0.001      NO    
## 3 Anderson-Darling    EV        0.9374  0.0172      NO    
## 4 Anderson-Darling    EVM       1.5290   6e-04      NO    
## 5 Anderson-Darling    EVH       0.8141  0.0347      NO

Al igual que las pruebas multivariadas, las pruebas univariadas (Shapiro-Wilk, Shapiro-Francia y Anderson-Darling) rechazan la normalidad en todas las variables (valores p < 0.05). Por lo tanto, concluimos que TN (tasa de natalidad), TM (tasa de mortalidad), EV (esperanza de vida), EVH (esperanza de vida de hombres) y EVM (esperanza de vida de mujeres) de los 194 países no siguen una distribución normal.

#EJERCICIO 2 #para este ejercicio utilizaré el año 2007

library(devtools)
## Warning: package 'devtools' was built under R version 4.3.3
## Loading required package: usethis
## Warning: package 'usethis' was built under R version 4.3.3
# Install gapminder from GitHub only when it is not already available.
# Installing unconditionally after attaching the package downloads and builds
# it on every run, only to be refused because the package is in use.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  devtools::install_github("jennybc/gapminder")
}
library(gapminder)
## Warning: package 'gapminder' was built under R version 4.3.3
# Keep a local copy of the gapminder data set.
gapminder <- gapminder::gapminder

# Build the working data frame from the 2007 rows only: population, life
# expectancy and GDP per capita. The year filter is applied once and the
# columns are picked from the filtered rows.
Data <- with(
  gapminder[gapminder$year == 2007, ],
  data.frame(pob = pop, lf = lifeExp, gdp = gdpPercap)
)
library(bestNormalize)
## Warning: package 'bestNormalize' was built under R version 4.3.3
# Shapiro-Wilk normality test on the population column of Data.
SW1 <- shapiro.test(Data$pob)
# Show the stored test result.
SW1
## 
##  Shapiro-Wilk normality test
## 
## data:  Data$pob
## W = 0.25267, p-value < 2.2e-16
# Shapiro-Wilk normality test on the life-expectancy column of Data.
SW2 <- shapiro.test(Data$lf)
# Show the stored test result.
SW2
## 
##  Shapiro-Wilk normality test
## 
## data:  Data$lf
## W = 0.89467, p-value = 1.357e-08
# Shapiro-Wilk normality test on the GDP-per-capita column of Data.
SW3 <- shapiro.test(Data$gdp)
# Show the stored test result.
SW3
## 
##  Shapiro-Wilk normality test
## 
## data:  Data$gdp
## W = 0.80644, p-value = 2.039e-12

Podemos observar que los datos no son normales, ya que los valores p de las tres pruebas de Shapiro-Wilk son menores que 0.05.

transformaciones

library(bestNormalize)
# bestNormalize ranks the candidate transformations with repeated
# cross-validation on random folds, so fix the RNG seed to make the estimated
# statistics and the chosen transformation reproducible between runs.
set.seed(123)
best_trans1 <- bestNormalize(Data$pob)
# Show the candidate statistics and the transformation that was chosen.
best_trans1
## Best Normalizing transformation with 142 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 0.9109
##  - Box-Cox: 0.9493
##  - Center+scale: 7.2895
##  - Double Reversed Log_b(x+a): 6.6665
##  - Log_b(x+a): 0.9109
##  - orderNorm (ORQ): 1.1204
##  - sqrt(x + a): 2.8248
##  - Yeo-Johnson: 0.9493
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## Standardized asinh(x) Transformation with 142 nonmissing obs.:
##  Relevant statistics:
##  - mean (before standardization) = 16.96914 
##  - sd (before standardization) = 1.525595
# Pick the best normalizing transformation for life expectancy, then show the
# candidate statistics and the transformation that was chosen.
best_trans2 <- bestNormalize(x = Data$lf)
print(best_trans2)
## Best Normalizing transformation with 142 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 3.4491
##  - Box-Cox: 1.9288
##  - Center+scale: 2.6217
##  - Double Reversed Log_b(x+a): 1.5337
##  - Exp(x): 14.2968
##  - Log_b(x+a): 3.4491
##  - orderNorm (ORQ): 1.3996
##  - sqrt(x + a): 3.0476
##  - Yeo-Johnson: 1.717
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 142 nonmissing obs and no ties 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
## 39.613 57.160 71.935 76.413 82.603
# Pick the best normalizing transformation for GDP per capita, then show the
# candidate statistics and the transformation that was chosen.
best_trans3 <- bestNormalize(x = Data$gdp)
print(best_trans3)
## Best Normalizing transformation with 142 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 1.3642
##  - Box-Cox: 1.3611
##  - Center+scale: 4.1695
##  - Double Reversed Log_b(x+a): 5.1676
##  - Log_b(x+a): 1.3642
##  - orderNorm (ORQ): 1.2587
##  - sqrt(x + a): 1.971
##  - Yeo-Johnson: 1.3611
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 142 nonmissing obs and no ties 
##  - Original quantiles:
##        0%       25%       50%       75%      100% 
##   277.552  1624.842  6124.371 18008.836 49357.190

bestNormalize nos indica que la mejor transformación para pob es arcsinh(x) estandarizada, mientras que para lf y gdp la mejor es orderNorm (ORQ).

library(MVN)

# Shapiro-Wilk normality test on the transformed values (x.t) stored in the
# population transformation object.
SW1_trans <- shapiro.test(best_trans1$x.t)
# Show the stored test result.
SW1_trans
## 
##  Shapiro-Wilk normality test
## 
## data:  best_trans1$x.t
## W = 0.99249, p-value = 0.6593
# Shapiro-Wilk normality test on the transformed values (x.t) stored in the
# life-expectancy transformation object.
SW2_trans <- shapiro.test(best_trans2$x.t)
# Show the stored test result.
SW2_trans
## 
##  Shapiro-Wilk normality test
## 
## data:  best_trans2$x.t
## W = 0.99968, p-value = 1
# Shapiro-Wilk normality test on the transformed values (x.t) stored in the
# GDP-per-capita transformation object.
SW3_trans <- shapiro.test(best_trans3$x.t)
# Show the stored test result.
SW3_trans
## 
##  Shapiro-Wilk normality test
## 
## data:  best_trans3$x.t
## W = 0.99968, p-value = 1

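Ahora podemos observar que, tras la transformación, las pruebas de Shapiro-Wilk ya no rechazan la normalidad (valores p > 0.05), por lo que los datos transformados pueden considerarse normales. Cabe notar que orderNorm asigna los rangos de los datos a cuantiles de la distribución normal, por lo que un valor p cercano a 1 es esperable por construcción.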

EJERCICIO 3

library(bestNormalize)
# Load the autotrader data set and list its column names.
data(autotrader)
colnames(autotrader)
##  [1] "Car_Info" "Link"     "Make"     "Year"     "Location" "Radius"  
##  [7] "price"    "mileage"  "status"   "model"
# Working data frame for exercise 3: mileage, model year and price, taken
# from the autotrader columns of the same meaning.
Data3 <- with(
  autotrader,
  data.frame(kilom = mileage, ant = Year, precio = price)
)
library(nortest)
# Anderson-Darling normality test on each untransformed variable. Each result
# is printed so that the conclusion drawn from these tests is backed by
# visible statistics and p-values instead of being stored silently.
AD_kilom <- ad.test(Data3$kilom)
AD_kilom
AD_ant <- ad.test(Data3$ant)
AD_ant
AD_precio <- ad.test(Data3$precio)
AD_precio

Según las pruebas de Anderson-Darling, los datos no son normales.

transformaciones

library(bestNormalize)

# bestNormalize ranks the candidate transformations with repeated
# cross-validation on random folds, so fix the RNG seed to make the estimated
# statistics and the chosen transformation reproducible between runs.
set.seed(123)
best_transk <- bestNormalize(Data3$kilom)
## Warning: `progress_estimated()` was deprecated in dplyr 1.0.0.
## ℹ The deprecated feature was likely used in the bestNormalize package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Show the candidate statistics and the transformation that was chosen.
best_transk
## Best Normalizing transformation with 6283 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 3.388
##  - Box-Cox: 3.0709
##  - Center+scale: 14.8621
##  - Double Reversed Log_b(x+a): 23.1172
##  - Log_b(x+a): 3.3852
##  - orderNorm (ORQ): 1.1028
##  - sqrt(x + a): 5.0991
##  - Yeo-Johnson: 3.0674
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
##  - 6077 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##      2  29099  44800  88950 325556
# Pick the best normalizing transformation for the model-year variable, then
# show the candidate statistics and the transformation that was chosen.
best_transa <- bestNormalize(x = Data3$ant)
print(best_transa)
## Best Normalizing transformation with 6283 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 83.5506
##  - Box-Cox: 83.5506
##  - Center+scale: 83.5506
##  - Double Reversed Log_b(x+a): 83.3981
##  - Log_b(x+a): 83.5506
##  - orderNorm (ORQ): 81.6037
##  - sqrt(x + a): 83.5506
##  - Yeo-Johnson: 83.5506
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
##  - 17 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 2000 2010 2013 2014 2016
# Pick the best normalizing transformation for price, then show the candidate
# statistics and the transformation that was chosen.
best_transp <- bestNormalize(x = Data3$precio)
print(best_transp)
## Best Normalizing transformation with 6283 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 4.1032
##  - Box-Cox: 2.2162
##  - Center+scale: 3.4596
##  - Double Reversed Log_b(x+a): 6.3819
##  - Log_b(x+a): 4.1032
##  - orderNorm (ORQ): 1.0972
##  - sqrt(x + a): 2.2029
##  - Yeo-Johnson: 2.2165
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
##  - 2465 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##   722 11499 15998 21497 64998
# Anderson-Darling normality test on the transformed values (x.t) stored in
# the mileage transformation object.
trans_kilom <- ad.test(best_transk$x.t)
# Show the stored test result.
trans_kilom
## 
##  Anderson-Darling normality test
## 
## data:  best_transk$x.t
## A = 0.00032915, p-value = 1
# Anderson-Darling normality test on the transformed values (x.t) stored in
# the model-year transformation object.
trans_ant <- ad.test(best_transa$x.t)
# Show the stored test result.
trans_ant
## 
##  Anderson-Darling normality test
## 
## data:  best_transa$x.t
## A = 98.365, p-value < 2.2e-16
# Anderson-Darling normality test on the transformed values (x.t) stored in
# the price transformation object.
trans_precio <- ad.test(best_transp$x.t)
# Show the stored test result.
trans_precio
## 
##  Anderson-Darling normality test
## 
## data:  best_transp$x.t
## A = 0.01806, p-value = 1

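Concluimos que, después de la transformación, dos de las tres variables (kilom y precio) son normales (valor p = 1), mientras que ant no pudo normalizarse (valor p < 2.2e-16), ya que solo tiene 17 valores únicos y, por lo tanto, muchos empates.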