# Hecho por: Emmanuel Goncalves, Javier Lyon y Samuel Vetrini
# Cargar librerías
library(VIM)
library(readxl)
library(dplyr)
library(tidyverse)
library(visdat)
library(mice)
library(naniar)
library(missForest)
library(cowplot)
library(ggstatsplot)
# Load the online-shop session data from the Excel workbook.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data <- readxl::read_excel(path = "C:/Users/samuv/Downloads/Online shop.xlsx")
##1. Elabore un análisis exploratorio de los datos para dar una panorámica general. Incluya estadísticas descriptivas y gráficos como elementos necesarios.
# Number of rows and columns in the dataset.
data %>% dim()
[1] 12330 18
#Se puede observar que hay 12330 observaciones y 18 variables en el dataset.
# Compact structure listing: type and first values of every column.
data %>% str()
tibble [12,330 × 18] (S3: tbl_df/tbl/data.frame)
$ Administrative : num [1:12330] 0 0 0 0 0 0 0 1 0 0 ...
$ Administrative_Duration: num [1:12330] 0 0 0 0 0 0 0 0 0 0 ...
$ Informational : num [1:12330] 0 0 0 0 0 0 0 0 0 0 ...
$ Informational_Duration : num [1:12330] 0 0 0 0 0 0 0 0 0 0 ...
$ ProductRelated : num [1:12330] 1 2 1 2 10 19 1 0 2 3 ...
$ ProductRelated_Duration: num [1:12330] 0 64 0 2.67 627.5 ...
$ BounceRates : num [1:12330] 0.2 0 0.2 0.05 0.02 ...
$ ExitRates : num [1:12330] 0.2 0.1 0.2 0.14 0.05 ...
$ PageValues : num [1:12330] 0 0 0 0 0 0 0 0 0 0 ...
$ SpecialDay : num [1:12330] 0 0 0 0 0 0 0.4 0 0.8 0.4 ...
$ Month : chr [1:12330] "Feb" "Feb" "Feb" "Feb" ...
$ OperatingSystems : num [1:12330] 1 2 4 3 3 2 2 1 2 2 ...
$ Browser : num [1:12330] 1 2 1 2 3 2 4 2 2 4 ...
$ Region : num [1:12330] 1 1 9 2 1 1 3 1 2 1 ...
$ TrafficType : num [1:12330] 1 2 3 4 4 3 3 5 3 2 ...
$ VisitorType : chr [1:12330] "Returning_Visitor" "Returning_Visitor" "Returning_Visitor" "Returning_Visitor" ...
$ Weekend : chr [1:12330] "FALSE" "FALSE" "FALSE" "FALSE" ...
$ Revenue : chr [1:12330] "FALSE" "FALSE" "FALSE" "FALSE" ...
#Podemos ver el detalle de cada variable, hay 4 variables de caracter y el resto son numéricas.
# Per-column summary: quartiles, mean and range for numeric columns,
# length/class/mode for character columns.
data %>% summary()
Administrative Administrative_Duration Informational Informational_Duration
Min. : 0.000 Min. : 0.00 Min. : 0.0000 Min. : 0.00
1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.: 0.00
Median : 1.000 Median : 7.50 Median : 0.0000 Median : 0.00
Mean : 2.315 Mean : 80.82 Mean : 0.5036 Mean : 34.47
3rd Qu.: 4.000 3rd Qu.: 93.26 3rd Qu.: 0.0000 3rd Qu.: 0.00
Max. :27.000 Max. :3398.75 Max. :24.0000 Max. :2549.38
ProductRelated ProductRelated_Duration BounceRates ExitRates
Min. : 0.00 Min. : 0.0 Min. :0.000000 Min. :0.00000
1st Qu.: 7.00 1st Qu.: 184.1 1st Qu.:0.000000 1st Qu.:0.01429
Median : 18.00 Median : 598.9 Median :0.003112 Median :0.02516
Mean : 31.73 Mean : 1194.8 Mean :0.022191 Mean :0.04307
3rd Qu.: 38.00 3rd Qu.: 1464.2 3rd Qu.:0.016813 3rd Qu.:0.05000
Max. :705.00 Max. :63973.5 Max. :0.200000 Max. :0.20000
PageValues SpecialDay Month OperatingSystems
Min. : 0.000 Min. :0.00000 Length:12330 Min. :1.000
1st Qu.: 0.000 1st Qu.:0.00000 Class :character 1st Qu.:2.000
Median : 0.000 Median :0.00000 Mode :character Median :2.000
Mean : 5.889 Mean :0.06143 Mean :2.124
3rd Qu.: 0.000 3rd Qu.:0.00000 3rd Qu.:3.000
Max. :361.764 Max. :1.00000 Max. :8.000
Browser Region TrafficType VisitorType
Min. : 1.000 Min. :1.000 Min. : 1.00 Length:12330
1st Qu.: 2.000 1st Qu.:1.000 1st Qu.: 2.00 Class :character
Median : 2.000 Median :3.000 Median : 2.00 Mode :character
Mean : 2.357 Mean :3.147 Mean : 4.07
3rd Qu.: 2.000 3rd Qu.:4.000 3rd Qu.: 4.00
Max. :13.000 Max. :9.000 Max. :20.00
Weekend Revenue
Length:12330 Length:12330
Class :character Class :character
Mode :character Mode :character
#Con esto vemos datos estadísticos sobre cada variable, como el promedio, la mediana,
#el valor mínimo y máximo y el 1er y 3er cuartil.
#Histograma
# Histogram (30 bins) of the BounceRates column.
data %>%
  ggplot(mapping = aes(x = BounceRates)) +
  geom_histogram(bins = 30, fill = "skyblue", colour = "black") +
  ggtitle("Distribucion de las Tasas de Rebote") +
  xlab("Tasas") +
  ylab("Frecuencia")
#Este histograma muestra que la mayoría de las muestras tienen tasas de rebote muy bajas,
#lo cual es una excelente señal. Una tasa de rebote baja indica que los visitantes encuentran
#el contenido del sitio web relevante e interesante, lo que los anima a explorar más páginas y
#a interactuar con el sitio.
#Gráfico de barras
# Bar chart with the number of observations recorded in each month.
# Month is stored as text, so ggplot would sort the bars alphabetically.
# Order the labels by calendar position instead, matching on the first three
# letters so that abbreviations such as "Jun" and "June" are both handled.
# Labels that match no month are kept and placed last.
month_levels <- unique(data$Month)
month_levels <- month_levels[order(match(substr(month_levels, 1, 3), month.abb))]
data %>%
  mutate(Month = factor(Month, levels = month_levels)) %>%
  ggplot(aes(x = Month, fill = Month)) +
  # The fill colour repeats the x axis, so the legend adds no information.
  geom_bar(show.legend = FALSE) +
  labs(
    title = "Meses con mayor cantidad de informacion",
    x = "Mes",
    y = "Cantidad de observaciones"
  )
#Podemos ver que Mayo es el mes con mayor información, con más de 3000 observaciones;
#le siguen Noviembre y Marzo.
#Proporción de visitantes que generaron ingresos
# Count the sessions for each Revenue value and express each count as a
# percentage of the total.
data %>%
  group_by(Revenue) %>%
  summarise(n = n(), .groups = "drop") %>%
  mutate(porc = 100 * n / sum(n))
#Se observa que el 84,5% de los visitantes no generaron ingresos para el sitio web,
#mientras que el 15,5% sí generaron ingresos.
#Análisis de la duración en las páginas de producto según el tipo de visitante
# Box plot of ProductRelated_Duration for each VisitorType.
ggplot(data) +
  geom_boxplot(
    mapping = aes(
      x = VisitorType,
      y = ProductRelated_Duration,
      fill = VisitorType
    )
  ) +
  ggtitle("Duracion de la Pagina de Productos por Tipo de Visitante") +
  xlab("Tipo de Visitante") +
  ylab("Duracion (segundos)")
#Este gráfico muestra que los visitantes recurrentes tienden a pasar más tiempo en la página
#de productos en comparación con los otros tipos de visitantes. Los nuevos visitantes tienden
#a pasar menos tiempo en la página de productos. Es importante investigar las posibles causas
#de estas diferencias y tomar medidas para mejorar la experiencia de usuario y aumentar el
#tiempo de permanencia en la página de productos para todos los tipos de visitantes.
#Histogramas para examinar la forma de la distribución de las variables Administrative e Informational
# Histogram (30 bins) of the Administrative column.
data %>%
  ggplot(mapping = aes(x = Administrative)) +
  geom_histogram(bins = 30, fill = "skyblue", colour = "black") +
  ggtitle("Distribucion de la variable Administrative")

# Histogram (30 bins) of the Informational column.
data %>%
  ggplot(mapping = aes(x = Informational)) +
  geom_histogram(bins = 30, fill = "skyblue", colour = "black") +
  ggtitle("Distribucion de la variable Informational")
#2. Seleccione al menos dos casos diferentes para elegir un porcentaje de datos faltantes
#para eliminar registros en la base de datos. Elabore un protocolo para tratar datos faltantes,
#donde describa la disposición de los datos, establezca algunas variables para discriminar el
#análisis y observar si existe algún patrón en las mismas. Pruebe las estrategias de imputación
#de datos faltantes. Analice si se observan diferencias en los casos tratados en comparación
#con los datos originales.
#Primer caso
# First scenario: blank out 7.5% of the entries of the dataset at random.
# prodNA() chooses the entries randomly, so fix the RNG seed first; without it
# every run yields a different missingness pattern and the conclusions written
# below cannot be reproduced.
set.seed(20240501)
data_na1 <- prodNA(data, noNA = 0.075)
# Share of missing values overall and per column, and where they fall.
vis_miss(data_na1)
# Column types together with the position of the missing entries.
vis_dat(data_na1)
#Se observa que los datos faltantes están dispersos de manera uniforme por todo
#el conjunto de datos, no se detecta que estén concentrados en ciertas variables u
#obedezcan a algún patrón.
#Además, si se observa en particular la variable ProductRelated_Duration, que es
#la variable de interés para este primer caso, tampoco se encuentran patrones en sus datos faltantes.
#Imputación
# Impute the first scenario with three mice methods and collect, side by side,
# the resulting ProductRelated_Duration columns.
#   original    - the column with the injected NAs (observed values only)
#   imput_pmm   - predictive mean matching
#   imput_media - unconditional mean imputation
#   imput_lasso - lasso linear regression ("lasso.norm")
# Changes with respect to a bare mice() call:
#   * seed = ...        makes each imputation reproducible
#   * printFlag = FALSE suppresses the per-iteration log (25 lines per call)
#   * mice::complete    avoids picking up tidyr::complete by accident, since
#                       both packages are attached
# complete() is called with its default action, so only the first of the
# imputed data sets is used.
mice_imput <- data.frame(
  original = data_na1$ProductRelated_Duration,
  imput_pmm = mice::complete(
    mice(data_na1, method = "pmm", seed = 101, printFlag = FALSE)
  )$ProductRelated_Duration,
  imput_media = mice::complete(
    mice(data_na1, method = "mean", seed = 101, printFlag = FALSE)
  )$ProductRelated_Duration,
  imput_lasso = mice::complete(
    mice(data_na1, method = "lasso.norm", seed = 101, printFlag = FALSE)
  )$ProductRelated_Duration
)
iter imp variable
1 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
iter imp variable
1 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
iter imp variable
1 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
#Analizar los resultados para ver cuál imputación fue mejor
# Compare the distribution of ProductRelated_Duration before and after each
# imputation method, one histogram per panel in a 2 x 2 grid.
# bins is set explicitly (30 is the value ggplot2 falls back to) so the plots
# no longer emit the "pick better value" message on every draw.

# Observed values only: the NAs are dropped silently instead of with a warning.
h1 <- ggplot(mice_imput, aes(x = original)) +
  geom_histogram(
    bins = 30, na.rm = TRUE,
    fill = "#ad1538", color = "#000000", position = "identity"
  ) +
  ggtitle("Distribución Original ") +
  theme_classic()

# Predictive mean matching.
h2 <- ggplot(mice_imput, aes(x = imput_pmm)) +
  geom_histogram(
    bins = 30,
    fill = "#15ad4f", color = "#000000", position = "identity"
  ) +
  ggtitle("Pmm-distribución") +
  theme_classic()

# Mean imputation.
h3 <- ggplot(mice_imput, aes(x = imput_media)) +
  geom_histogram(
    bins = 30,
    fill = "#1543ad", color = "#000000", position = "identity"
  ) +
  ggtitle("Media-distribución") +
  theme_classic()

# Lasso regression imputation.
h4 <- ggplot(mice_imput, aes(x = imput_lasso)) +
  geom_histogram(
    bins = 30,
    fill = "#ad8415", color = "#000000", position = "identity"
  ) +
  ggtitle("Lasso-distribución") +
  theme_classic()

plot_grid(h1, h2, h3, h4, nrow = 2, ncol = 2)
#El método de imputación que mejor preserva la forma de la distribución original es PMM. Tanto la imputación por la media como Lasso generan distribuciones que no se asemejan a la original, lo cual podría distorsionar el análisis de los datos
#Segundo caso
# Second scenario: blank out 25% of the entries of the dataset at random.
# prodNA() chooses the entries randomly, so fix the RNG seed first; without it
# every run yields a different missingness pattern and the conclusions written
# below cannot be reproduced.
set.seed(20240502)
data_na2 <- prodNA(data, noNA = 0.25)
# Share of missing values overall and per column, and where they fall.
vis_miss(data_na2)
# Column types together with the position of the missing entries.
vis_dat(data_na2)
#Se observa que los datos faltantes están dispersos de manera uniforme por todo
#el conjunto de datos, no se detecta que estén concentrados en ciertas variables u
#obedezcan a algún patrón.
#Además, si se observa en particular la variable ProductRelated_Duration, que es
#la variable de interés para este segundo caso, tampoco se encuentran patrones en sus datos faltantes.
#Imputación
# Impute the second scenario with three mice methods and collect, side by
# side, the resulting ProductRelated_Duration columns.
#   original    - the column with the injected NAs (observed values only)
#   imput_pmm   - predictive mean matching
#   imput_media - unconditional mean imputation
#   imput_lasso - lasso linear regression ("lasso.norm")
# Changes with respect to a bare mice() call:
#   * seed = ...        makes each imputation reproducible
#   * printFlag = FALSE suppresses the per-iteration log (25 lines per call)
#   * mice::complete    avoids picking up tidyr::complete by accident, since
#                       both packages are attached
# complete() is called with its default action, so only the first of the
# imputed data sets is used.
# NOTE(review): the run recorded in this file reports "Number of logged
# events: 4"; inspect the loggedEvents element of the mice result to confirm
# what was logged.
mice_imput2 <- data.frame(
  original = data_na2$ProductRelated_Duration,
  imput_pmm = mice::complete(
    mice(data_na2, method = "pmm", seed = 202, printFlag = FALSE)
  )$ProductRelated_Duration,
  imput_media = mice::complete(
    mice(data_na2, method = "mean", seed = 202, printFlag = FALSE)
  )$ProductRelated_Duration,
  imput_lasso = mice::complete(
    mice(data_na2, method = "lasso.norm", seed = 202, printFlag = FALSE)
  )$ProductRelated_Duration
)
iter imp variable
1 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
Warning: Number of logged events: 4
iter imp variable
1 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
Warning: Number of logged events: 4
iter imp variable
1 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
1 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
2 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
3 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
4 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 1 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 2 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 3 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 4 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
5 5 Administrative Administrative_Duration Informational Informational_Duration ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues SpecialDay OperatingSystems Browser Region TrafficType
Warning: Number of logged events: 4
# Compare the original distribution against each imputation method (PMM, mean,
# Lasso) to see which one preserves its shape best.

# Builds a classic-themed histogram for one column of `datos`.
#   datos   : data frame holding the column to plot
#   columna : name of the column to plot, given as a string
#   relleno : fill colour of the bars
#   titulo  : plot title
# Returns the ggplot object.
histograma_imputacion <- function(datos, columna, relleno, titulo) {
  ggplot(datos, aes(x = .data[[columna]])) +
    # bins = 30 is the stat_bin() default; stating it explicitly silences the
    # "Pick better value with `binwidth`" message without changing the plot.
    geom_histogram(
      bins = 30, fill = relleno, color = "#000000", position = "identity"
    ) +
    # The x label is set explicitly so it stays equal to the column name.
    labs(title = titulo, x = columna) +
    theme_classic()
}
g1 <- histograma_imputacion(mice_imput2, "original", "#ad1538", "Distribución Original ")
g2 <- histograma_imputacion(mice_imput2, "imput_pmm", "#15ad4f", "Pmm-distribución")
g3 <- histograma_imputacion(mice_imput2, "imput_media", "#1543ad", "Media-distribución")
g4 <- histograma_imputacion(mice_imput2, "imput_lasso", "#ad8415", "Lasso-distribución")
# Arrange the four histograms in a 2 x 2 grid for side-by-side comparison
plot_grid(g1, g2, g3, g4, nrow = 2, ncol = 2)
#Se puede observar que, al igual que en el caso anterior, la mejor imputación es la PMM,
#ya que es la que más se asemeja a la distribución original de los datos.
#Podemos ver que al aumentar la cantidad de datos faltantes del 7.5% al 25%, se
#incrementa el impacto de la imputación en la forma final de la distribución,
#igualmente PMM sigue siendo el método de imputación que mejor preserva la forma de
#la distribución original en comparación con la media y Lasso. Esta comparación revela
#las limitaciones de la imputación, especialmente cuando la cantidad de datos faltantes es alta.
#Ninguno de los métodos de imputación puede recrear perfectamente la distribución original,
#y la precisión de la imputación disminuye a medida que aumenta la cantidad de datos faltantes.
#De esta manera, la imputación es una herramienta útil para manejar datos faltantes,
#pero no puede reemplazar la necesidad de datos precisos y completos.
# 3. Elabore un protocolo para el tratamiento de valores atípicos.
# Al respecto, estudiaremos las variables "BounceRates", "ExitRates" y "PageValues".
# Se empleará un método de detección de valores atípicos donde todas las observaciones
# que se encuentren fuera del intervalo formado por los percentiles 2,5 y 97,5 se
# considerarán como posibles valores atípicos.
# Percentile-based lower (2.5%) and upper (97.5%) bounds for PageValues
lim_inferior <- quantile(data[["PageValues"]], probs = 0.025)
lim_superior <- quantile(data[["PageValues"]], probs = 0.975)
# Flag every observation lying outside [lim_inferior, lim_superior] and keep
# the row positions of those outlier candidates
fuera_de_rango <- data[["PageValues"]] < lim_inferior |
  data[["PageValues"]] > lim_superior
ind_atipicos <- which(fuera_de_rango)
ind_atipicos
[1] 199 200 201 296 401 458 638 696 864 904 956 1043 1051
[14] 1068 1220 1334 1363 1389 1414 1424 1433 1492 1572 1638 1673 1697
[27] 1790 1942 1998 2025 2071 2189 2260 2393 2413 2467 2575 2590 2605
[40] 2660 2756 2795 2874 2943 2957 3015 3111 3131 3165 3185 3191 3201
[53] 3272 3308 3313 3318 3339 3341 3356 3382 3436 3530 3629 3742 3779
[66] 3798 3816 3818 3826 3900 3948 4003 4013 4070 4113 4148 4184 4186
[79] 4195 4294 4340 4432 4437 4494 4530 4574 4642 4643 4794 4866 4923
[92] 4967 4999 5083 5131 5245 5288 5290 5465 5480 5495 5508 5525 5543
[105] 5636 5745 5762 5774 5822 5878 5967 5979 6028 6055 6078 6111 6164
[118] 6258 6277 6308 6313 6361 6399 6407 6419 6430 6432 6442 6506 6548
[131] 6584 6614 6618 6646 6680 6708 6729 6752 6791 6800 6813 6817 6841
[144] 6900 6956 6969 6975 7008 7027 7037 7058 7081 7083 7132 7159 7167
[157] 7180 7251 7284 7318 7367 7368 7412 7565 7620 7714 7728 7853 7939
[170] 7953 8027 8039 8085 8097 8123 8165 8252 8290 8300 8341 8346 8380
[183] 8417 8437 8497 8561 8574 8582 8617 8628 8636 8646 8649 8671 8706
[196] 8763 8795 8861 8880 8885 8895 8918 9012 9055 9101 9102 9115 9116
[209] 9182 9240 9250 9259 9279 9287 9375 9388 9443 9467 9519 9533 9537
[222] 9541 9542 9549 9573 9652 9686 9715 9728 9793 9794 9817 9884 9903
[235] 9953 9960 9989 10025 10026 10039 10054 10064 10068 10098 10106 10131 10178
[248] 10192 10248 10286 10299 10386 10388 10400 10421 10436 10440 10501 10515 10520
[261] 10534 10560 10641 10655 10813 10828 10835 10854 10878 10883 10884 11038 11039
[274] 11123 11157 11184 11196 11323 11386 11400 11433 11444 11475 11512 11520 11539
[287] 11599 11630 11636 11654 11680 11683 11709 11725 11742 11817 11859 11871 11980
[300] 12089 12109 12116 12118 12183 12224 12235 12248 12273 12314
# Pull the PageValues entries of the rows flagged in ind_atipicos
atip_PageValues <- data %>%
  slice(ind_atipicos) %>%
  select(PageValues)
atip_PageValues
# Se identifican 309 valores atípicos, lo que supone el 2,5% de las observaciones.
# Percentile-based lower (2.5%) and upper (97.5%) bounds for BounceRates
lim_inferior <- quantile(data[["BounceRates"]], probs = 0.025)
lim_superior <- quantile(data[["BounceRates"]], probs = 0.975)
# Flag every observation lying outside [lim_inferior, lim_superior] and keep
# the row positions of those outlier candidates
fuera_de_rango <- data[["BounceRates"]] < lim_inferior |
  data[["BounceRates"]] > lim_superior
ind_atipicos <- which(fuera_de_rango)
ind_atipicos
integer(0)
# Pull the BounceRates entries of the rows flagged in ind_atipicos
atip_BounceRate <- data %>%
  slice(ind_atipicos) %>%
  select(BounceRates)
atip_BounceRate
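# Se concluye que no hay valores atípicos en la variable BounceRates según el
# método utilizado. Esto ocurre porque ninguna observación queda por debajo del
# percentil 2,5 ni por encima del percentil 97,5; es decir, ambos percentiles
# coinciden con el mínimo y el máximo de la variable.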
# Percentile-based lower (2.5%) and upper (97.5%) bounds for ExitRates
lim_inferior <- quantile(data[["ExitRates"]], probs = 0.025)
lim_superior <- quantile(data[["ExitRates"]], probs = 0.975)
# Flag every observation lying outside [lim_inferior, lim_superior] and keep
# the row positions of those outlier candidates
fuera_de_rango <- data[["ExitRates"]] < lim_inferior |
  data[["ExitRates"]] > lim_superior
ind_atipicos <- which(fuera_de_rango)
ind_atipicos
[1] 41 199 200 301 325 332 413 444 536 546 585 607 621
[14] 673 686 696 739 745 850 867 920 1050 1051 1069 1116 1141
[27] 1222 1253 1303 1465 1528 1553 1578 1580 1621 1646 1648 1678 1685
[40] 1706 1725 1782 1893 1905 1922 1958 2046 2068 2072 2087 2093 2106
[53] 2112 2189 2213 2248 2449 2576 2599 2751 2830 2892 2943 3022 3037
[66] 3131 3163 3187 3198 3203 3394 3469 3493 3603 3627 3641 3666 3668
[79] 3812 3823 3835 3970 4063 4066 4070 4082 4155 4250 4385 4413 4594
[92] 4604 4646 4658 4748 4899 4929 4966 5035 5041 5140 5250 5263 5290
[105] 5323 5363 5371 5447 5458 5478 5495 5506 5528 5534 5553 5556 5557
[118] 5637 5649 5653 5669 5674 5729 5731 5745 5774 5776 5805 5815 5822
[131] 5931 5943 5971 6018 6022 6029 6059 6066 6074 6078 6103 6108 6118
[144] 6158 6164 6175 6182 6201 6217 6232 6244 6258 6265 6266 6268 6278
[157] 6287 6315 6334 6345 6359 6371 6382 6386 6387 6392 6395 6399 6410
[170] 6417 6438 6462 6468 6469 6480 6484 6502 6503 6548 6573 6643 6645
[183] 6669 6681 6695 6724 6729 6730 6776 6797 6800 6817 6818 6827 6829
[196] 6865 6876 6896 6908 6941 6954 6956 6969 7023 7050 7063 7088 7143
[209] 7144 7159 7195 7226 7231 7244 7248 7284 7300 7315 7318 7321 7355
[222] 7358 7365 7368 7412 7422 7443 7529 7536 7550 7552 7565 7579 7584
[235] 7639 7641 7653 7657 7675 7676 7704 7716 7736 7747 7760 7770 7774
[248] 7784 7818 7836 7849 7854 7871 7895 7924 7953 8007 8025 8038 8207
[261] 8340 8355 8357 8396 8400 8602 8653 8795 8880 8943 9022 9209 9344
[274] 9371 9422 9588 9653 9687 9772 9834 9844 9880 10109 10161 10189 10501
[287] 10543 10768 10774 10889 10901 11046 11084 11162 11184 11274 11368 11400 11449
[300] 11697 11860 11876 12019 12049 12089 12110 12129 12202 12282
# Pull the ExitRates entries of the rows flagged in ind_atipicos
atip_ExitRates <- data %>%
  slice(ind_atipicos) %>%
  select(ExitRates)
atip_ExitRates
#Se identifican 309 valores atípicos, lo que supone el 2,5% de las observaciones.
# 4. Describa los datos seleccionados como atípicos y determine si se consideran
#para el análisis.
# Para determinar si los valores atípicos se deben considerar en el análisis o no,
# compararemos cómo son los estadísticas descriptivas si se considera estos datos o no.
# Descriptive statistics computed on the full data set (outliers included).
# across() applies the five summaries to each variable; .names puts the
# statistic first so the columns are named e.g. Media_PageValues.
estadisticas_con_atipicos <- data %>%
  summarise(
    across(
      c(PageValues, ExitRates),
      list(
        Media = ~ mean(.x, na.rm = TRUE),
        Mediana = ~ median(.x, na.rm = TRUE),
        DesvEst = ~ sd(.x, na.rm = TRUE),
        Min = ~ min(.x, na.rm = TRUE),
        Max = ~ max(.x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  ) %>%
  # Label the row so it can be told apart once both summaries are stacked
  mutate(Tipo = "Con valores atípicos")
print(estadisticas_con_atipicos)
NA
# Keep only the rows that lie inside the 2.5%-97.5% percentile range of BOTH
# PageValues and ExitRates; a row flagged in either variable is dropped.
rango_pagevalues <- quantile(data$PageValues, probs = c(0.025, 0.975))
rango_exitrates <- quantile(data$ExitRates, probs = c(0.025, 0.975))
data_sin_atipicos <- data %>%
  filter(
    PageValues >= rango_pagevalues[[1]],
    PageValues <= rango_pagevalues[[2]],
    ExitRates >= rango_exitrates[[1]],
    ExitRates <= rango_exitrates[[2]]
  )
print(data_sin_atipicos)
# Descriptive statistics computed on the filtered data set (outliers removed).
# across() applies the five summaries to each variable; .names puts the
# statistic first so the columns are named e.g. Media_PageValues.
estadisticas_sin_atipicos <- data_sin_atipicos %>%
  summarise(
    across(
      c(PageValues, ExitRates),
      list(
        Media = ~ mean(.x, na.rm = TRUE),
        Mediana = ~ median(.x, na.rm = TRUE),
        DesvEst = ~ sd(.x, na.rm = TRUE),
        Min = ~ min(.x, na.rm = TRUE),
        Max = ~ max(.x, na.rm = TRUE)
      ),
      .names = "{.fn}_{.col}"
    )
  ) %>%
  # Label the row so it can be told apart once both summaries are stacked
  mutate(Tipo = "Sin valores atípicos")
print(estadisticas_sin_atipicos)
# Stack both summaries into a single data frame and move the "Tipo" label to
# the first column so each row is identified before its statistics
comparacion_estadisticas <- bind_rows(
  estadisticas_con_atipicos,
  estadisticas_sin_atipicos
) %>%
  relocate(Tipo)
# Show the side-by-side comparison
print(comparacion_estadisticas)
#Como se puede observar, para la variable PageValues, los valores atípicos incrementan
#de manera significativa su media, desviación estándar y máximo. Al respecto, incrementan
#estos valores en un 66,57%, 94,95% y 531,18% respectivamente.
#Los valores atípicos claramente influyen en la media y la desviación estándar,
#sugiriendo que podrían estar capturando transacciones de muy alto valor que son
#raras pero importantes para entender el comportamiento extremo del usuario.
#Teniendo en cuenta las estadísticas descriptivas y la naturaleza de los datos,
#elegimos tratar los valores atípicos como un grupo aparte.
#Esta decisión permite analizar en profundidad los casos extremos, lo que nos dará una
#mejor comprensión del comportamiento de los usuarios que generan estos valores atípicos.
#Respecto a la variable ExitRates, las estadísticas descriptivas son bastante similares
#tanto con valores atípicos como sin ellos. Esto sugiere que los valores atípicos no
#tienen un impacto significativo en la distribución general de esta variable.
#Al respecto, los valores atípicos disminuyen la media un 9,09%, la mediana un 3,85%
#y la desviación un 2,04%.
#Dado que las estadísticas descriptivas son bastante similares, aceptar los valores atípicos
# en ExitRates es una opción razonable. Esto evitaría la pérdida de datos y mantendría la integridad
#del análisis.
#5. Establezca las conclusiones y recomendaciones en base al análisis realizado al conjunto de datos.
#Se concluye que no hay patrones significativos en la distribución de datos faltantes
#por variable. Además, a la luz de nuestros resultados, el algoritmo pmm es una buena herramienta
#de imputación de datos dado que para las dos variables seleccionadas proporcionó
#el mejor ajuste. Se descarta la imputación con la media como un buen método de imputación.
#Respecto a los valores atípicos, se recomienda que los procesos de análisis de datos que
#trabajen con la variable PageValues traten los valores atípicos por separado,
#dado que distorsionan significativamente al alza la distribución de los datos.
#Además, este tratamiento particular ofrecerá una
#mejor comprensión del comportamiento de los usuarios que generan estos valores atípicos.
#Por último, respecto a la variable ExitRates, dado que las estadísticas descriptivas son bastante similares,
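#se recomienda aceptar los valores atípicos. Adicionalmente, el método de percentiles no
#detecta valores atípicos en la variable BounceRates. Sin embargo, esto se debe a que los
#percentiles 2,5 y 97,5 coinciden con el mínimo y el máximo de la variable, de modo que
#este resultado por sí solo no permite afirmar que el porcentaje de visitantes
#que ingresan al sitio desde esa página y luego lo abandonan sin activar ninguna otra solicitud
#al servidor sea homogéneo o provenga de una población consistente; conviene contrastarlo
#con otro criterio (p. ej., el rango intercuartílico) antes de hacer inferencia estadística.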