# Load the telephony case dataset from its GitHub URL.
# The file is semicolon-separated, input strings are assumed to be latin1,
# and text columns are converted to factors.
telco <- read.csv(
  "https://raw.githubusercontent.com/VictorGuevaraP/Estadistica-R/master/Caso_telefon%C3%ADa.csv",
  encoding = "latin1",
  sep = ";",
  # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned.
  stringsAsFactors = TRUE
)
# Preview the first rows to confirm the import.
head(telco)
\[ z=\frac{x_i-\mu}{\sigma} \]
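Donde: \(x_i\) representa cada dato dentro del caso; \(\mu\) es la media poblacional (promedio); \(\sigma\) es la desviación estándar poblacional. Nota: en R, la función sd() calcula la desviación estándar muestral (divide entre \(n-1\)), que es la que se usa en los cálculos siguientes.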
# Show the first six rows of the dataset again.
head(telco, n = 6L)
Vamos a aplicar la estandarización Z a la variable Monto de manera manual.
Metodo 1: Por partes
# Method 1, step 1: mean of the Monto column.
media_monto <- mean(telco[["Monto"]])
media_monto
## [1] 92.305
# Method 1, step 2: standard deviation of Monto, then the z-score of every
# value using the mean stored in media_monto.
desv_est <- sd(telco[["Monto"]])
monto_estandar <- (telco[["Monto"]] - media_monto) / desv_est
monto_estandar
## [1] -0.15863801 0.33556141 2.19375124 1.35361222 0.66173303 -0.20805796
## [7] 1.00767262 -0.01037819 1.75885575 0.41463332 -0.39585374 -0.66272143
## [13] -0.25747790 -0.34643380 -0.82086524 -0.80109726 -0.05979813 -1.79938010
## [19] -0.34643380 -0.06968212 -1.21622478 -0.38596975 -1.36448461 -0.39585374
## [25] -0.66272143 -0.36620177 -1.33483264 -1.05808096 -0.59353351 -0.84063322
## [31] -0.15863801 0.33556141 0.69138499 -0.89005316 0.49370522 -0.93947310
## [37] 0.19718557 0.41463332 -1.33483264 0.78034089 -0.96912507 -0.95924108
## [43] 0.21695355 -0.59353351 -0.39585374 -0.85051721 -0.84063322 -0.81098125
## [49] -0.89993715 -0.74179333 0.65184904 2.64841471 1.09662852 1.70943580
## [55] 1.35361222 0.81987684 1.05709257 1.79839170 1.98618748 -0.15863801
## [61] 1.99607147 1.58094395 1.35361222 0.66173303 -0.20805796 1.00767262
## [67] -0.01037819 1.75885575 0.41463332 -0.03014616 -0.80109726 -0.38596975
## [73] -1.36448461 -0.39585374 -0.66272143 -0.36620177 -1.33483264 -1.05808096
## [79] -0.59353351 -0.84063322
Metodo 2: Directo
# Method 2: the whole z-score in a single expression, evaluated with the
# columns of telco in scope.
monto_estandar2 <- with(telco, (Monto - mean(Monto)) / sd(Monto))
monto_estandar2
## [1] -0.15863801 0.33556141 2.19375124 1.35361222 0.66173303 -0.20805796
## [7] 1.00767262 -0.01037819 1.75885575 0.41463332 -0.39585374 -0.66272143
## [13] -0.25747790 -0.34643380 -0.82086524 -0.80109726 -0.05979813 -1.79938010
## [19] -0.34643380 -0.06968212 -1.21622478 -0.38596975 -1.36448461 -0.39585374
## [25] -0.66272143 -0.36620177 -1.33483264 -1.05808096 -0.59353351 -0.84063322
## [31] -0.15863801 0.33556141 0.69138499 -0.89005316 0.49370522 -0.93947310
## [37] 0.19718557 0.41463332 -1.33483264 0.78034089 -0.96912507 -0.95924108
## [43] 0.21695355 -0.59353351 -0.39585374 -0.85051721 -0.84063322 -0.81098125
## [49] -0.89993715 -0.74179333 0.65184904 2.64841471 1.09662852 1.70943580
## [55] 1.35361222 0.81987684 1.05709257 1.79839170 1.98618748 -0.15863801
## [61] 1.99607147 1.58094395 1.35361222 0.66173303 -0.20805796 1.00767262
## [67] -0.01037819 1.75885575 0.41463332 -0.03014616 -0.80109726 -0.38596975
## [73] -1.36448461 -0.39585374 -0.66272143 -0.36620177 -1.33483264 -1.05808096
## [79] -0.59353351 -0.84063322
Método 3: Apoyarse en las funciones de R. R tiene múltiples funciones para estandarizar; la clásica es la función scale.
# scale() centers and scales in one call. The result is a one-column matrix
# that carries the "scaled:center" and "scaled:scale" attributes.
monto_estandar3 <- scale(telco[["Monto"]], center = TRUE, scale = TRUE)
monto_estandar3
## [,1]
## [1,] -0.15863801
## [2,] 0.33556141
## [3,] 2.19375124
## [4,] 1.35361222
## [5,] 0.66173303
## [6,] -0.20805796
## [7,] 1.00767262
## [8,] -0.01037819
## [9,] 1.75885575
## [10,] 0.41463332
## [11,] -0.39585374
## [12,] -0.66272143
## [13,] -0.25747790
## [14,] -0.34643380
## [15,] -0.82086524
## [16,] -0.80109726
## [17,] -0.05979813
## [18,] -1.79938010
## [19,] -0.34643380
## [20,] -0.06968212
## [21,] -1.21622478
## [22,] -0.38596975
## [23,] -1.36448461
## [24,] -0.39585374
## [25,] -0.66272143
## [26,] -0.36620177
## [27,] -1.33483264
## [28,] -1.05808096
## [29,] -0.59353351
## [30,] -0.84063322
## [31,] -0.15863801
## [32,] 0.33556141
## [33,] 0.69138499
## [34,] -0.89005316
## [35,] 0.49370522
## [36,] -0.93947310
## [37,] 0.19718557
## [38,] 0.41463332
## [39,] -1.33483264
## [40,] 0.78034089
## [41,] -0.96912507
## [42,] -0.95924108
## [43,] 0.21695355
## [44,] -0.59353351
## [45,] -0.39585374
## [46,] -0.85051721
## [47,] -0.84063322
## [48,] -0.81098125
## [49,] -0.89993715
## [50,] -0.74179333
## [51,] 0.65184904
## [52,] 2.64841471
## [53,] 1.09662852
## [54,] 1.70943580
## [55,] 1.35361222
## [56,] 0.81987684
## [57,] 1.05709257
## [58,] 1.79839170
## [59,] 1.98618748
## [60,] -0.15863801
## [61,] 1.99607147
## [62,] 1.58094395
## [63,] 1.35361222
## [64,] 0.66173303
## [65,] -0.20805796
## [66,] 1.00767262
## [67,] -0.01037819
## [68,] 1.75885575
## [69,] 0.41463332
## [70,] -0.03014616
## [71,] -0.80109726
## [72,] -0.38596975
## [73,] -1.36448461
## [74,] -0.39585374
## [75,] -0.66272143
## [76,] -0.36620177
## [77,] -1.33483264
## [78,] -1.05808096
## [79,] -0.59353351
## [80,] -0.84063322
## attr(,"scaled:center")
## [1] 92.305
## attr(,"scaled:scale")
## [1] 10.11737
La ventaja de la función scale de R es que se le puede enviar todo el caso (todas las variables cuantitativas) a la vez.
# Standardize columns 4 to 9 of telco in one call; scale() returns a matrix
# with one standardized column per input column.
telco_cuanti_scale <- scale(telco[4:9])
head(telco_cuanti_scale, n = 6L)
## Reclamos Llamadas Edad Minutos Monto Tiempo
## [1,] 0.6462614 0.86099329 -0.5465360 -0.52591040 -0.1586380 1.5218742
## [2,] -0.7263292 -0.37636234 -0.3802893 -0.67800907 0.3355614 -0.6349750
## [3,] 0.6462614 0.86099329 -0.3802893 0.28138254 2.1937512 -0.2848372
## [4,] -0.2687990 0.03608954 -1.5440161 0.21118315 1.3536122 1.1857419
## [5,] -0.7263292 -0.78881422 -0.2140426 0.03568469 0.6617330 -0.4669088
## [6,] -0.2687990 -0.37636234 -0.7127827 0.72597865 -0.2080580 0.7655764
Recordar: En una sesión anterior se realizaron gráficos de cajas para verificar la presencia de outliers. No se recomienda trabajar con los valores originales para la siguiente gráfica, ya que las variables están en escalas distintas y eso dificulta compararlas.
# Box plots of columns 4 to 9 on their original scales.
boxplot(telco[4:9])
Lo más recomendable es realizar el gráfico con los valores de las variables transformadas.
# Box plots of the standardized columns, one box per column of the matrix.
boxplot(x = telco_cuanti_scale)
Estandarización min-max: consiste en restar a cada dato el valor mínimo y dividir el resultado entre la diferencia del valor máximo y el valor mínimo.
\[ X_{norm}=\frac{\left(x_i-x_{\min}\right)}{x_{\max}-x_{\min}} \]
Metodo 1:
# Min-max normalization by hand: subtract the minimum of Monto and divide by
# its range (maximum minus minimum).
monto_normal <- with(telco, (Monto - min(Monto)) / (max(Monto) - min(Monto)))
monto_normal
## [1] 0.36888889 0.48000000 0.89777778 0.70888889 0.55333333 0.35777778
## [7] 0.63111111 0.40222222 0.80000000 0.49777778 0.31555556 0.25555556
## [13] 0.34666667 0.32666667 0.22000000 0.22444444 0.39111111 0.00000000
## [19] 0.32666667 0.38888889 0.13111111 0.31777778 0.09777778 0.31555556
## [25] 0.25555556 0.32222222 0.10444444 0.16666667 0.27111111 0.21555556
## [31] 0.36888889 0.48000000 0.56000000 0.20444444 0.51555556 0.19333333
## [37] 0.44888889 0.49777778 0.10444444 0.58000000 0.18666667 0.18888889
## [43] 0.45333333 0.27111111 0.31555556 0.21333333 0.21555556 0.22222222
## [49] 0.20222222 0.23777778 0.55111111 1.00000000 0.65111111 0.78888889
## [55] 0.70888889 0.58888889 0.64222222 0.80888889 0.85111111 0.36888889
## [61] 0.85333333 0.76000000 0.70888889 0.55333333 0.35777778 0.63111111
## [67] 0.40222222 0.80000000 0.49777778 0.39777778 0.22444444 0.31777778
## [73] 0.09777778 0.31555556 0.25555556 0.32222222 0.10444444 0.16666667
## [79] 0.27111111 0.21555556
Metodo 2: Funcion
# Min-max normalization with rescale() from the scales package; the target
# interval c(0, 1) is the function's default, written out here.
library(scales)
rescale(telco[["Monto"]], to = c(0, 1))
## [1] 0.36888889 0.48000000 0.89777778 0.70888889 0.55333333 0.35777778
## [7] 0.63111111 0.40222222 0.80000000 0.49777778 0.31555556 0.25555556
## [13] 0.34666667 0.32666667 0.22000000 0.22444444 0.39111111 0.00000000
## [19] 0.32666667 0.38888889 0.13111111 0.31777778 0.09777778 0.31555556
## [25] 0.25555556 0.32222222 0.10444444 0.16666667 0.27111111 0.21555556
## [31] 0.36888889 0.48000000 0.56000000 0.20444444 0.51555556 0.19333333
## [37] 0.44888889 0.49777778 0.10444444 0.58000000 0.18666667 0.18888889
## [43] 0.45333333 0.27111111 0.31555556 0.21333333 0.21555556 0.22222222
## [49] 0.20222222 0.23777778 0.55111111 1.00000000 0.65111111 0.78888889
## [55] 0.70888889 0.58888889 0.64222222 0.80888889 0.85111111 0.36888889
## [61] 0.85333333 0.76000000 0.70888889 0.55333333 0.35777778 0.63111111
## [67] 0.40222222 0.80000000 0.49777778 0.39777778 0.22444444 0.31777778
## [73] 0.09777778 0.31555556 0.25555556 0.32222222 0.10444444 0.16666667
## [79] 0.27111111 0.21555556
Aplicando a todo el caso (variables cuantitativas): la función rescale solo puede aplicarse a vectores; no es posible aplicarla directamente al data frame. Para ello se utiliza la función preProcess del paquete caret.
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# preProcess() centers and scales by default (z-score standardization); the
# default method is written out here. predict() applies it to the columns.
pre_procesamiento <- preProcess(telco[, 4:9], method = c("center", "scale"))
predict(pre_procesamiento, newdata = telco[, 4:9])
library(caret)
# method = "range" requests min-max scaling of columns 4 to 9; predict()
# applies the fitted transformation to the same columns.
pre_procesamiento <- preProcess(x = telco[, 4:9], method = "range")
predict(object = pre_procesamiento, newdata = telco[, 4:9])