# Load the dataset: a semicolon-separated CSV fetched from GitHub.
# Character columns are converted to factors on read.

telco <- read.csv(
  "https://raw.githubusercontent.com/VictorGuevaraP/Estadistica-R/master/Caso_telefon%C3%ADa.csv",
  encoding = "latin1",
  sep = ";",
  # Spelled out as TRUE: `T` is an ordinary variable that can be reassigned.
  stringsAsFactors = TRUE
)

# Preview the first rows to confirm the import worked.
head(telco)

\[ z=\frac{x_i-\mu}{\sigma} \]

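Donde: \(x_i\) representa cada dato dentro del caso; \(\mu\) es la media poblacional (promedio); \(\sigma\) es la desviacion estandar poblacional. Nota: en la practica estos parametros se estiman con los datos; la funcion sd() de R calcula la desviacion estandar muestral (denominador \(n-1\)).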

# Show the first six rows again before transforming the data.
head(telco, n = 6L)

Vamos a aplicar estandarizacion Z a la variable Monto de manera manual

Metodo 1: Por partes

# Method 1, step by step: compute the centre and the spread of Monto
# separately, then standardize.
media_monto <- mean(telco[["Monto"]])
# Note: sd() divides by n - 1 (sample standard deviation).
desv_est <- sd(telco[["Monto"]])
media_monto
## [1] 92.305
monto_estandar <- (telco[["Monto"]] - media_monto) / desv_est
monto_estandar
##  [1] -0.15863801  0.33556141  2.19375124  1.35361222  0.66173303 -0.20805796
##  [7]  1.00767262 -0.01037819  1.75885575  0.41463332 -0.39585374 -0.66272143
## [13] -0.25747790 -0.34643380 -0.82086524 -0.80109726 -0.05979813 -1.79938010
## [19] -0.34643380 -0.06968212 -1.21622478 -0.38596975 -1.36448461 -0.39585374
## [25] -0.66272143 -0.36620177 -1.33483264 -1.05808096 -0.59353351 -0.84063322
## [31] -0.15863801  0.33556141  0.69138499 -0.89005316  0.49370522 -0.93947310
## [37]  0.19718557  0.41463332 -1.33483264  0.78034089 -0.96912507 -0.95924108
## [43]  0.21695355 -0.59353351 -0.39585374 -0.85051721 -0.84063322 -0.81098125
## [49] -0.89993715 -0.74179333  0.65184904  2.64841471  1.09662852  1.70943580
## [55]  1.35361222  0.81987684  1.05709257  1.79839170  1.98618748 -0.15863801
## [61]  1.99607147  1.58094395  1.35361222  0.66173303 -0.20805796  1.00767262
## [67] -0.01037819  1.75885575  0.41463332 -0.03014616 -0.80109726 -0.38596975
## [73] -1.36448461 -0.39585374 -0.66272143 -0.36620177 -1.33483264 -1.05808096
## [79] -0.59353351 -0.84063322

Metodo 2: Directo

# Method 2, single expression: z-score of Monto computed in one step.
monto_estandar2 <- with(telco, (Monto - mean(Monto)) / sd(Monto))
monto_estandar2
##  [1] -0.15863801  0.33556141  2.19375124  1.35361222  0.66173303 -0.20805796
##  [7]  1.00767262 -0.01037819  1.75885575  0.41463332 -0.39585374 -0.66272143
## [13] -0.25747790 -0.34643380 -0.82086524 -0.80109726 -0.05979813 -1.79938010
## [19] -0.34643380 -0.06968212 -1.21622478 -0.38596975 -1.36448461 -0.39585374
## [25] -0.66272143 -0.36620177 -1.33483264 -1.05808096 -0.59353351 -0.84063322
## [31] -0.15863801  0.33556141  0.69138499 -0.89005316  0.49370522 -0.93947310
## [37]  0.19718557  0.41463332 -1.33483264  0.78034089 -0.96912507 -0.95924108
## [43]  0.21695355 -0.59353351 -0.39585374 -0.85051721 -0.84063322 -0.81098125
## [49] -0.89993715 -0.74179333  0.65184904  2.64841471  1.09662852  1.70943580
## [55]  1.35361222  0.81987684  1.05709257  1.79839170  1.98618748 -0.15863801
## [61]  1.99607147  1.58094395  1.35361222  0.66173303 -0.20805796  1.00767262
## [67] -0.01037819  1.75885575  0.41463332 -0.03014616 -0.80109726 -0.38596975
## [73] -1.36448461 -0.39585374 -0.66272143 -0.36620177 -1.33483264 -1.05808096
## [79] -0.59353351 -0.84063322

Metodo 3: Apoyarse en las funciones de R. R tiene multiples funciones para estandarizar; la clasica es la funcion scale.

# scale() with its defaults made explicit: subtract the mean and divide by
# the standard deviation. The result is a one-column matrix carrying the
# "scaled:center" and "scaled:scale" attributes.
monto_estandar3 <- scale(telco[["Monto"]], center = TRUE, scale = TRUE)
monto_estandar3
##              [,1]
##  [1,] -0.15863801
##  [2,]  0.33556141
##  [3,]  2.19375124
##  [4,]  1.35361222
##  [5,]  0.66173303
##  [6,] -0.20805796
##  [7,]  1.00767262
##  [8,] -0.01037819
##  [9,]  1.75885575
## [10,]  0.41463332
## [11,] -0.39585374
## [12,] -0.66272143
## [13,] -0.25747790
## [14,] -0.34643380
## [15,] -0.82086524
## [16,] -0.80109726
## [17,] -0.05979813
## [18,] -1.79938010
## [19,] -0.34643380
## [20,] -0.06968212
## [21,] -1.21622478
## [22,] -0.38596975
## [23,] -1.36448461
## [24,] -0.39585374
## [25,] -0.66272143
## [26,] -0.36620177
## [27,] -1.33483264
## [28,] -1.05808096
## [29,] -0.59353351
## [30,] -0.84063322
## [31,] -0.15863801
## [32,]  0.33556141
## [33,]  0.69138499
## [34,] -0.89005316
## [35,]  0.49370522
## [36,] -0.93947310
## [37,]  0.19718557
## [38,]  0.41463332
## [39,] -1.33483264
## [40,]  0.78034089
## [41,] -0.96912507
## [42,] -0.95924108
## [43,]  0.21695355
## [44,] -0.59353351
## [45,] -0.39585374
## [46,] -0.85051721
## [47,] -0.84063322
## [48,] -0.81098125
## [49,] -0.89993715
## [50,] -0.74179333
## [51,]  0.65184904
## [52,]  2.64841471
## [53,]  1.09662852
## [54,]  1.70943580
## [55,]  1.35361222
## [56,]  0.81987684
## [57,]  1.05709257
## [58,]  1.79839170
## [59,]  1.98618748
## [60,] -0.15863801
## [61,]  1.99607147
## [62,]  1.58094395
## [63,]  1.35361222
## [64,]  0.66173303
## [65,] -0.20805796
## [66,]  1.00767262
## [67,] -0.01037819
## [68,]  1.75885575
## [69,]  0.41463332
## [70,] -0.03014616
## [71,] -0.80109726
## [72,] -0.38596975
## [73,] -1.36448461
## [74,] -0.39585374
## [75,] -0.66272143
## [76,] -0.36620177
## [77,] -1.33483264
## [78,] -1.05808096
## [79,] -0.59353351
## [80,] -0.84063322
## attr(,"scaled:center")
## [1] 92.305
## attr(,"scaled:scale")
## [1] 10.11737

La ventaja de la funcion scale de R es que se le puede enviar todo el caso (todas las variables cuantitativas a la vez).

# Standardize columns 4 to 9 of the data frame in a single call.
telco_cuanti_scale <- scale(telco[4:9])
head(telco_cuanti_scale, n = 6L)
##        Reclamos    Llamadas       Edad     Minutos      Monto     Tiempo
## [1,]  0.6462614  0.86099329 -0.5465360 -0.52591040 -0.1586380  1.5218742
## [2,] -0.7263292 -0.37636234 -0.3802893 -0.67800907  0.3355614 -0.6349750
## [3,]  0.6462614  0.86099329 -0.3802893  0.28138254  2.1937512 -0.2848372
## [4,] -0.2687990  0.03608954 -1.5440161  0.21118315  1.3536122  1.1857419
## [5,] -0.7263292 -0.78881422 -0.2140426  0.03568469  0.6617330 -0.4669088
## [6,] -0.2687990 -0.37636234 -0.7127827  0.72597865 -0.2080580  0.7655764

Recordar: en una sesion anterior se realizaron graficos de cajas para verificar la presencia de outliers. No se recomienda trabajar con los valores originales para la siguiente grafica.

# Box plots of columns 4 to 9 on their original scales.
boxplot(telco[4:9])

Lo mas recomendable es realizar el grafico con los valores de las variables transformadas

# Box plots of the standardized columns stored in telco_cuanti_scale.
boxplot(telco_cuanti_scale)

Normalizacion:

Normalizacion min-max: consiste en restar a cada dato el valor minimo y dividir el resultado entre la diferencia del valor maximo menos el valor minimo

\[ X_{norm}=\frac{\left(x_i-x_{\min}\right)}{x_{\max}-x_{\min}} \]

Metodo 1:

# Min-max normalization of Monto: (x - min) / (max - min).
monto_normal <- with(telco, (Monto - min(Monto)) / (max(Monto) - min(Monto)))
monto_normal
##  [1] 0.36888889 0.48000000 0.89777778 0.70888889 0.55333333 0.35777778
##  [7] 0.63111111 0.40222222 0.80000000 0.49777778 0.31555556 0.25555556
## [13] 0.34666667 0.32666667 0.22000000 0.22444444 0.39111111 0.00000000
## [19] 0.32666667 0.38888889 0.13111111 0.31777778 0.09777778 0.31555556
## [25] 0.25555556 0.32222222 0.10444444 0.16666667 0.27111111 0.21555556
## [31] 0.36888889 0.48000000 0.56000000 0.20444444 0.51555556 0.19333333
## [37] 0.44888889 0.49777778 0.10444444 0.58000000 0.18666667 0.18888889
## [43] 0.45333333 0.27111111 0.31555556 0.21333333 0.21555556 0.22222222
## [49] 0.20222222 0.23777778 0.55111111 1.00000000 0.65111111 0.78888889
## [55] 0.70888889 0.58888889 0.64222222 0.80888889 0.85111111 0.36888889
## [61] 0.85333333 0.76000000 0.70888889 0.55333333 0.35777778 0.63111111
## [67] 0.40222222 0.80000000 0.49777778 0.39777778 0.22444444 0.31777778
## [73] 0.09777778 0.31555556 0.25555556 0.32222222 0.10444444 0.16666667
## [79] 0.27111111 0.21555556

Metodo 2: Funcion

library(scales)
# rescale() maps a numeric vector onto `to`; c(0, 1) is its default target.
rescale(telco[["Monto"]], to = c(0, 1))
##  [1] 0.36888889 0.48000000 0.89777778 0.70888889 0.55333333 0.35777778
##  [7] 0.63111111 0.40222222 0.80000000 0.49777778 0.31555556 0.25555556
## [13] 0.34666667 0.32666667 0.22000000 0.22444444 0.39111111 0.00000000
## [19] 0.32666667 0.38888889 0.13111111 0.31777778 0.09777778 0.31555556
## [25] 0.25555556 0.32222222 0.10444444 0.16666667 0.27111111 0.21555556
## [31] 0.36888889 0.48000000 0.56000000 0.20444444 0.51555556 0.19333333
## [37] 0.44888889 0.49777778 0.10444444 0.58000000 0.18666667 0.18888889
## [43] 0.45333333 0.27111111 0.31555556 0.21333333 0.21555556 0.22222222
## [49] 0.20222222 0.23777778 0.55111111 1.00000000 0.65111111 0.78888889
## [55] 0.70888889 0.58888889 0.64222222 0.80888889 0.85111111 0.36888889
## [61] 0.85333333 0.76000000 0.70888889 0.55333333 0.35777778 0.63111111
## [67] 0.40222222 0.80000000 0.49777778 0.39777778 0.22444444 0.31777778
## [73] 0.09777778 0.31555556 0.25555556 0.32222222 0.10444444 0.16666667
## [79] 0.27111111 0.21555556

Aplicando a todo el caso (variables cuantitativas): la funcion rescale solo permite aplicarse a vectores; no es posible aplicarla directamente al data frame.

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

# Z-score standardization of columns 4 to 9. c("center", "scale") is the
# default `method` of preProcess(); it is spelled out here for clarity.
pre_procesamiento <- preProcess(telco[, 4:9], method = c("center", "scale"))
predict(pre_procesamiento, telco[, 4:9])

# Min-max normalization of the same columns with method = "range".
# caret is already attached above, so it is not loaded a second time.
pre_procesamiento <- preProcess(telco[, 4:9], method = "range")
predict(pre_procesamiento, telco[, 4:9])