tarea esta

Analice la normalidad de la base de datos Tasas, la cual cuenta con 5 variables (TN: tasa de natalidad, TM: tasa de mortalidad, EV: esperanza de vida, EVM: esperanza de vida en mujeres y EVH: esperanza de vida en hombres), medidas en 194 países alrededor del mundo.

library(MVN)
# Read Tasas.csv (path relative to the working directory) into a data frame.
data <- read.csv(file = "Tasas.csv")

# 1. Mardia: multivariate normality test on every column except the first.
Mardia <- mvn(data = data[, -1], mvnTest = "mardia")
Mardia[["multivariateNormality"]]

##              Test        Statistic              p value Result
## 1 Mardia Skewness 201.323801536089 2.98182323473711e-25     NO
## 2 Mardia Kurtosis 4.20204709406389 2.64512086514124e-05     NO
## 3             MVN             <NA>                 <NA>     NO

# 2. Henze-Zirkler: multivariate normality test on every column except the
# first.
HZ <- mvn(data = data[, -1], mvnTest = "hz")
HZ[["multivariateNormality"]]

##            Test       HZ p value MVN
## 1 Henze-Zirkler 2.294973       0  NO

# 3. Royston: multivariate normality test on every column except the first.
Royston <- mvn(data = data[, -1], mvnTest = "royston")
Royston[["multivariateNormality"]]

##      Test        H      p value MVN
## 1 Royston 25.55369 1.527186e-06  NO

# 4. Doornik-Hansen: multivariate normality test on every column except the
# first.
DH <- mvn(data = data[, -1], mvnTest = "dh")
DH[["multivariateNormality"]]

##             Test        E df     p value MVN
## 1 Doornik-Hansen 239.7374 10 7.77862e-46  NO

# 5. Energy (E-statistic): multivariate normality test on every column except
# the first.
Energy <- mvn(data = data[, -1], mvnTest = "energy")
Energy[["multivariateNormality"]]

##          Test Statistic p value MVN
## 1 E-statistic  3.151856       0  NO

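## Después de comprobar con diferentes pruebas (Mardia, Henze-Zirkler, Royston, Doornik-Hansen y Energy), concluyo que los datos no siguen una distribución normal multivariada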

library(GGally)

## Loading required package: ggplot2

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# Pairwise plot matrix of every column except the first.
ggpairs(data = data[, -1])

# Pruebas de normalidad univariada

# 1. Shapiro-Wilk: univariate normality test for every column except the first,
# with descriptive statistics requested via desc.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
SW <- mvn(data[, -1], univariateTest = "SW", desc = TRUE)
# Print the whole result: multivariate test, univariate tests and descriptives.
SW

## $multivariateNormality
##            Test       HZ p value MVN
## 1 Henze-Zirkler 2.294973       0  NO
## 
## $univariateNormality
##           Test  Variable Statistic   p value Normality
## 1 Shapiro-Wilk    TN        0.9233  <0.001      NO    
## 2 Shapiro-Wilk    TM        0.9589  <0.001      NO    
## 3 Shapiro-Wilk    EV        0.9776  0.0034      NO    
## 4 Shapiro-Wilk    EVM       0.9707   4e-04      NO    
## 5 Shapiro-Wilk    EVH       0.9804   0.008      NO    
## 
## $Descriptives
##       n     Mean  Std.Dev Median   Min   Max    25th    75th        Skew
## TN  194 19.06469 9.872522 16.885  5.00 45.29 10.4250 27.2400  0.65598906
## TM  194  8.57701 3.074472  8.090  1.31 18.40  6.6150  9.9975  0.72290852
## EV  194 71.31052 7.764204 71.860 52.53 85.60 65.6700 76.8050 -0.23215863
## EVM 194 74.01974 7.876189 75.165 53.07 87.90 68.4375 79.5975 -0.39876259
## EVH 194 68.73345 7.767232 68.700 50.37 84.10 63.0850 73.6750 -0.07285581
##       Kurtosis
## TN  -0.6175703
## TM   0.6721286
## EV  -0.6532162
## EVM -0.5952345
## EVH -0.6863484

# 3. Lilliefors (Kolmogorov-Smirnov correction): univariate normality test for
# every column except the first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
L <- mvn(data[, -1], univariateTest = "Lillie", desc = TRUE)
L$univariateNormality

##                              Test  Variable Statistic   p value Normality
## 1 Lilliefors (Kolmogorov-Smirnov)    TN        0.1290  <0.001      NO    
## 2 Lilliefors (Kolmogorov-Smirnov)    TM        0.0859  0.0014      NO    
## 3 Lilliefors (Kolmogorov-Smirnov)    EV        0.0576  0.1199      YES   
## 4 Lilliefors (Kolmogorov-Smirnov)    EVM       0.0745  0.0107      NO    
## 5 Lilliefors (Kolmogorov-Smirnov)    EVH       0.0613  0.0728      YES

# 4. Shapiro-Francia: univariate normality test for every column except the
# first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
SF <- mvn(data[, -1], univariateTest = "SF", desc = TRUE)
SF$univariateNormality

##              Test  Variable Statistic   p value Normality
## 1 Shapiro-Francia    TN        0.9267  <0.001      NO    
## 2 Shapiro-Francia    TM        0.9588   1e-04      NO    
## 3 Shapiro-Francia    EV        0.9809  0.0112      NO    
## 4 Shapiro-Francia    EVM       0.9736  0.0015      NO    
## 5 Shapiro-Francia    EVH       0.9835  0.0237      NO

# 5. Anderson-Darling: univariate normality test for every column except the
# first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
AD <- mvn(data[, -1], univariateTest = "AD", desc = TRUE)
AD$univariateNormality

##               Test  Variable Statistic   p value Normality
## 1 Anderson-Darling    TN        4.9804  <0.001      NO    
## 2 Anderson-Darling    TM        2.6962  <0.001      NO    
## 3 Anderson-Darling    EV        0.9374  0.0172      NO    
## 4 Anderson-Darling    EVM       1.5290   6e-04      NO    
## 5 Anderson-Darling    EVH       0.8141  0.0347      NO

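## Después de varias pruebas, concluyo que las variables no son normales: Shapiro-Wilk, Shapiro-Francia y Anderson-Darling rechazan la normalidad en las cinco variables; solo Lilliefors no la rechaza para EV y EVH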

# Make the gapminder package available before attaching it.
# The GitHub install runs only when the package is missing, so the script does
# not reach for the network on every run and library(gapminder) can no longer
# fail ahead of the install that was meant to provide it.
library(devtools)

## Loading required package: usethis

## Warning: package 'usethis' was built under R version 4.3.3

if (!requireNamespace("gapminder", quietly = TRUE)) {
  devtools::install_github("jennybc/gapminder")
}
library(gapminder)

# Copy the gapminder dataset into the workspace.
gapminder <- gapminder::gapminder

# Keep the 1982 rows and expose population, life expectancy and GDP per capita
# under the short names pob, lf and gdp.
Data <- with(
  gapminder[gapminder$year == 1982, ],
  data.frame(pob = pop, lf = lifeExp, gdp = gdpPercap)
)

# 1. Mardia: multivariate normality test on the three columns of Data.
# This assignment replaces whatever Mardia held before.
Mardia <- mvn(data = Data, mvnTest = "mardia")
Mardia[["multivariateNormality"]]

##              Test        Statistic              p value Result
## 1 Mardia Skewness 1433.19089827176 6.7587006978414e-302     NO
## 2 Mardia Kurtosis 65.7769715987007                    0     NO
## 3             MVN             <NA>                 <NA>     NO

library(bestNormalize)
# Shapiro-Wilk normality test on the raw pob column; the result is stored in
# SW1 and then printed.
SW1 <- shapiro.test(Data$pob)
SW1

## 
##  Shapiro-Wilk normality test
## 
## data:  Data$pob
## W = 0.24637, p-value < 2.2e-16

# Shapiro-Wilk normality test on the raw lf column; the result is stored in
# SW2 and then printed.
SW2 <- shapiro.test(Data$lf)
SW2

## 
##  Shapiro-Wilk normality test
## 
## data:  Data$lf
## W = 0.94091, p-value = 1.059e-05

# Shapiro-Wilk normality test on the raw gdp column; the result is stored in
# SW3 and then printed.
SW3 <- shapiro.test(Data$gdp)
SW3

## 
##  Shapiro-Wilk normality test
## 
## data:  Data$gdp
## W = 0.83081, p-value = 1.676e-11

library(bestNormalize)
# Let bestNormalize pick a normalizing transformation for pob, then print the
# comparison of candidates and the chosen one.
# NOTE(review): the printed estimation method is repeated cross-validation and
# no seed is set here, so the statistics may vary between runs.
best_trans1 <- bestNormalize(x = Data[["pob"]])
best_trans1

## Best Normalizing transformation with 142 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 0.8979
##  - Box-Cox: 0.875
##  - Center+scale: 7.6328
##  - Double Reversed Log_b(x+a): 6.6672
##  - Log_b(x+a): 0.8979
##  - orderNorm (ORQ): 0.987
##  - sqrt(x + a): 2.8541
##  - Yeo-Johnson: 0.875
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## Standardized Box Cox Transformation with 142 nonmissing obs.:
##  Estimated statistics:
##  - lambda = -0.02564246 
##  - mean (before standardization) = 12.99403 
##  - sd (before standardization) = 1.038111

# Let bestNormalize pick a normalizing transformation for lf, then print the
# comparison of candidates and the chosen one.
# NOTE(review): the printed estimation method is repeated cross-validation and
# no seed is set here, so the statistics may vary between runs.
best_trans2 <- bestNormalize(x = Data[["lf"]])
best_trans2

## Best Normalizing transformation with 142 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 1.7448
##  - Box-Cox: 1.5314
##  - Center+scale: 1.4762
##  - Double Reversed Log_b(x+a): 1.3813
##  - Exp(x): 14.3223
##  - Log_b(x+a): 1.7448
##  - orderNorm (ORQ): 1.1364
##  - sqrt(x + a): 1.6267
##  - Yeo-Johnson: 1.5314
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 142 nonmissing obs and no ties 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
## 38.445 52.940 62.442 70.921 77.110

# Let bestNormalize pick a normalizing transformation for gdp, then print the
# comparison of candidates and the chosen one.
# NOTE(review): the printed estimation method is repeated cross-validation and
# no seed is set here, so the statistics may vary between runs.
best_trans3 <- bestNormalize(x = Data[["gdp"]])
best_trans3

## Best Normalizing transformation with 142 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 1.2632
##  - Box-Cox: 1.2343
##  - Center+scale: 3.7349
##  - Double Reversed Log_b(x+a): 4.6484
##  - Log_b(x+a): 1.2632
##  - orderNorm (ORQ): 1.2091
##  - sqrt(x + a): 1.557
##  - Yeo-Johnson: 1.2343
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 142 nonmissing obs and no ties 
##  - Original quantiles:
##        0%       25%       50%       75%      100% 
##   424.000  1363.339  4216.228 12347.954 33693.175

library(MVN)
# Shapiro-Wilk normality test on the transformed values stored in
# best_trans1$x.t; the result is kept in SW1_trans and then printed.
SW1_trans <- shapiro.test(best_trans1$x.t)
SW1_trans

## 
##  Shapiro-Wilk normality test
## 
## data:  best_trans1$x.t
## W = 0.99362, p-value = 0.7824

# Shapiro-Wilk normality test on the transformed values stored in
# best_trans2$x.t; the result is kept in SW2_trans and then printed.
SW2_trans <- shapiro.test(best_trans2$x.t)
SW2_trans

## 
##  Shapiro-Wilk normality test
## 
## data:  best_trans2$x.t
## W = 0.99968, p-value = 1

# Shapiro-Wilk normality test on the transformed values stored in
# best_trans3$x.t; the result is kept in SW3_trans and then printed.
SW3_trans <- shapiro.test(best_trans3$x.t)
SW3_trans

## 
##  Shapiro-Wilk normality test
## 
## data:  best_trans3$x.t
## W = 0.99968, p-value = 1

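## Con lo que aprendí en clase, normalicé las variables. bestNormalize escogió la transformación Box-Cox para pob y orderNorm para lf y gdp; tras transformar, Shapiro-Wilk ya no rechaza la normalidad en ninguna de las tres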

library(bestNormalize)
# Load the autotrader dataset into the workspace and list its column names.
data(list = "autotrader")
colnames(autotrader)

##  [1] "Car_Info" "Link"     "Make"     "Year"     "Location" "Radius"  
##  [7] "price"    "mileage"  "status"   "model"

# Three autotrader columns under short names: kilom takes mileage, ant takes
# Year as-is, and precio takes price.
Data3 <- with(
  autotrader,
  data.frame(kilom = mileage, ant = Year, precio = price)
)

library(nortest)
# Anderson-Darling normality tests on each raw column of Data3.
# The three results are stored but not printed here.
AD_kilom <- ad.test(Data3$kilom)
AD_ant <- ad.test(Data3$ant)
AD_precio <- ad.test(Data3$precio)

library(bestNormalize)
# Let bestNormalize pick a normalizing transformation for kilom.
# NOTE(review): the printed estimation method is repeated cross-validation and
# no seed is set here, so the statistics may vary between runs.
best_transk <- bestNormalize(x = Data3[["kilom"]])

## Warning: `progress_estimated()` was deprecated in dplyr 1.0.0.
## ℹ The deprecated feature was likely used in the bestNormalize package.
##   Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Print the comparison of candidates and the chosen transformation.
best_transk

## Best Normalizing transformation with 6283 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 3.3649
##  - Box-Cox: 3.0301
##  - Center+scale: 14.7426
##  - Double Reversed Log_b(x+a): 23.2558
##  - Log_b(x+a): 3.3699
##  - orderNorm (ORQ): 1.1326
##  - sqrt(x + a): 5.0516
##  - Yeo-Johnson: 3.0237
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
##  - 6077 unique values 
##  - Original quantiles:
##     0%    25%    50%    75%   100% 
##      2  29099  44800  88950 325556

# Let bestNormalize pick a normalizing transformation for ant.
# NOTE(review): the printed estimation method is repeated cross-validation and
# no seed is set here, so the statistics may vary between runs.
best_transa <- bestNormalize(x = Data3[["ant"]])

# Print the comparison of candidates and the chosen transformation.
best_transa

## Best Normalizing transformation with 6283 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 83.562
##  - Box-Cox: 83.562
##  - Center+scale: 83.562
##  - Double Reversed Log_b(x+a): 83.3492
##  - Log_b(x+a): 83.562
##  - orderNorm (ORQ): 81.3902
##  - sqrt(x + a): 83.562
##  - Yeo-Johnson: 83.5797
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
##  - 17 unique values 
##  - Original quantiles:
##   0%  25%  50%  75% 100% 
## 2000 2010 2013 2014 2016

# Let bestNormalize pick a normalizing transformation for precio.
# NOTE(review): the printed estimation method is repeated cross-validation and
# no seed is set here, so the statistics may vary between runs.
best_transp <- bestNormalize(x = Data3[["precio"]])

# Print the comparison of candidates and the chosen transformation.
best_transp

## Best Normalizing transformation with 6283 Observations
##  Estimated Normality Statistics (Pearson P / df, lower => more normal):
##  - arcsinh(x): 4.0973
##  - Box-Cox: 2.2405
##  - Center+scale: 3.5981
##  - Double Reversed Log_b(x+a): 6.6707
##  - Log_b(x+a): 4.0973
##  - orderNorm (ORQ): 1.2773
##  - sqrt(x + a): 2.2668
##  - Yeo-Johnson: 2.2405
## Estimation method: Out-of-sample via CV with 10 folds and 5 repeats
##  
## Based off these, bestNormalize chose:
## orderNorm Transformation with 6283 nonmissing obs and ties
##  - 2465 unique values 
##  - Original quantiles:
##    0%   25%   50%   75%  100% 
##   722 11499 15998 21497 64998

# Anderson-Darling normality test on the transformed kilom values
# (best_transk$x.t); the result is stored and then printed.
trans_kilom <- ad.test(best_transk$x.t)
trans_kilom

## 
##  Anderson-Darling normality test
## 
## data:  best_transk$x.t
## A = 0.00032915, p-value = 1

# Anderson-Darling normality test on the transformed ant values
# (best_transa$x.t); the result is stored and then printed.
trans_ant <- ad.test(best_transa$x.t)
trans_ant

## 
##  Anderson-Darling normality test
## 
## data:  best_transa$x.t
## A = 98.365, p-value < 2.2e-16

# Anderson-Darling normality test on the transformed precio values
# (best_transp$x.t); the result is stored and then printed.
trans_precio <- ad.test(best_transp$x.t)
trans_precio

## 
##  Anderson-Darling normality test
## 
## data:  best_transp$x.t
## A = 0.01806, p-value = 1

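## bestNormalize escogió orderNorm para las tres variables. Tras la transformación, kilom y precio ya son normales (Anderson-Darling, p = 1), pero ant sigue sin serlo (p < 2.2e-16): solo tiene 17 valores distintos y, con tantos empates, la transformación no logra normalizarla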

tarea esta

2024-10-10

Analice la normalidad de la base de datos Tasas, la cual cuenta con 5 variables (TN: tasa de natalidad, TM: tasa de mortalidad, EV: esperanza de vida, EVM: esperanza de vida en mujeres y EVH: esperanza de vida en hombres), medidas en 194 países alrededor del mundo.