# Load the e-commerce data set with read.csv(); the first line of the file
# holds the column names.
datos_ecommerce <- read.csv(
  file = "C:/Users/joseb/OneDrive/Escritorio/A/R/Ecommerce.txt",
  header = TRUE
)
# Preview the first rows of the data.
head(datos_ecommerce)
## Email
## 1 mstephenson@fernandez.com
## 2 hduke@hotmail.com
## 3 pallen@yahoo.com
## 4 riverarebecca@gmail.com
## 5 mstephens@davidson-herman.com
## 6 alvareznancy@lucas.biz
## Address Avatar
## 1 835 Frank Tunnel\nWrightmouth, MI 82180-9605 Violet
## 2 4547 Archer Common\nDiazchester, CA 06566-8576 DarkGreen
## 3 24645 Valerie Unions Suite 582\nCobbborough, DC 99414-7564 Bisque
## 4 1414 David Throughway\nPort Jason, OH 22070-1220 SaddleBrown
## 5 14023 Rodriguez Passage\nPort Jacobville, PR 37242-1057 MediumAquaMarine
## 6 645 Martha Park Apt. 611\nJeffreychester, MN 67218-7250 FloralWhite
## Avg..Session.Length Time.on.App Time.on.Website Length.of.Membership
## 1 34.49727 12.65565 39.57767 4.082621
## 2 31.92627 11.10946 37.26896 2.664034
## 3 33.00091 11.33028 37.11060 4.104543
## 4 34.30556 13.71751 36.72128 3.120179
## 5 33.33067 12.79519 37.53665 4.446308
## 6 33.87104 12.02693 34.47688 5.493507
## Yearly.Amount.Spent
## 1 587.9511
## 2 392.2049
## 3 487.5475
## 4 581.8523
## 5 599.4061
## 6 637.1024
# List every row that contains at least one NA. Nothing is removed here: the
# incomplete rows are only displayed for inspection.
# NOTE(review): empty strings in the text columns are not NA, so they would
# not be reported by this check -- confirm whether that matters for this data.
datos_ecommerce[which(!complete.cases(datos_ecommerce)), ]
## [1] Email Address Avatar
## [4] Avg..Session.Length Time.on.App Time.on.Website
## [7] Length.of.Membership Yearly.Amount.Spent
## <0 rows> (or 0-length row.names)
# No empty rows or rows with missing information were found.
# Inspect the type of each variable in the data.
utils::str(datos_ecommerce)
## 'data.frame': 500 obs. of 8 variables:
## $ Email : chr "mstephenson@fernandez.com" "hduke@hotmail.com" "pallen@yahoo.com" "riverarebecca@gmail.com" ...
## $ Address : chr "835 Frank Tunnel\nWrightmouth, MI 82180-9605" "4547 Archer Common\nDiazchester, CA 06566-8576" "24645 Valerie Unions Suite 582\nCobbborough, DC 99414-7564" "1414 David Throughway\nPort Jason, OH 22070-1220" ...
## $ Avatar : chr "Violet" "DarkGreen" "Bisque" "SaddleBrown" ...
## $ Avg..Session.Length : num 34.5 31.9 33 34.3 33.3 ...
## $ Time.on.App : num 12.7 11.1 11.3 13.7 12.8 ...
## $ Time.on.Website : num 39.6 37.3 37.1 36.7 37.5 ...
## $ Length.of.Membership: num 4.08 2.66 4.1 3.12 4.45 ...
## $ Yearly.Amount.Spent : num 588 392 488 582 599 ...
# 3.- Produce descriptive statistics for the variables under study (excluding
# Email, Address and Avatar). For the quantitative variables use measures of
# location, dispersion and shape. Interpret the results.
# Load the required libraries.
library(moments) # provides skewness() and kurtosis()
# Select the columns of interest.
vars_of_interest <- datos_ecommerce[, c(
  "Avg..Session.Length",
  "Time.on.App",
  "Time.on.Website",
  "Length.of.Membership",
  "Yearly.Amount.Spent"
)]
# Compute location (mean, median), dispersion (standard deviation, variance,
# range) and shape (skewness, kurtosis) for every selected column.
# vapply(..., numeric(1)) is used instead of sapply() so that each statistic
# is guaranteed to be exactly one number per column; sapply() would silently
# change its return type on unexpected input.
# The named vectors returned by vapply() also supply the row names of the
# resulting data frame.
# NOTE(review): moments::kurtosis() is documented as Pearson's kurtosis (not
# excess kurtosis), so values near 3 correspond to a normal distribution.
measures <- data.frame(
  Variable = colnames(vars_of_interest),
  Mean = vapply(vars_of_interest, mean, numeric(1)),
  Median = vapply(vars_of_interest, median, numeric(1)),
  Std.Dev = vapply(vars_of_interest, sd, numeric(1)),
  Variance = vapply(vars_of_interest, var, numeric(1)),
  Range = vapply(vars_of_interest, function(x) max(x) - min(x), numeric(1)),
  Skewness = vapply(vars_of_interest, skewness, numeric(1)),
  Kurtosis = vapply(vars_of_interest, kurtosis, numeric(1))
)
measures
## Variable Mean Median Std.Dev
## Avg..Session.Length Avg..Session.Length 33.053194 33.082008 0.9925631
## Time.on.App Time.on.App 12.052488 11.983231 0.9942156
## Time.on.Website Time.on.Website 37.060445 37.069367 1.0104889
## Length.of.Membership Length.of.Membership 3.533462 3.533975 0.9992775
## Yearly.Amount.Spent Yearly.Amount.Spent 499.314038 498.887875 79.3147815
## Variance Range Skewness Kurtosis
## Avg..Session.Length 0.9851815 6.607234 -0.03207820 2.999767
## Time.on.App 0.9884647 6.618842 -0.08885351 3.111858
## Time.on.Website 1.0210878 6.091334 0.01210542 2.890717
## Length.of.Membership 0.9985555 6.652788 -0.10628796 3.333553
## Yearly.Amount.Spent 6290.8345723 508.847880 0.03468573 3.447373
# 4.- Build univariate plots for all quantitative variables. Then build
# bivariate (scatter) plots between "Yearly Amount Spent" and each of the
# remaining variables. Interpret the results.
# Univariate plots
# Lay the histograms out on a 2 x 3 grid. par() returns the previous setting,
# which is kept so the layout can be restored once all plots are drawn.
old_par <- par(mfrow = c(2, 3))
hist(
  datos_ecommerce$Avg..Session.Length,
  main = "Avg. Session Length", xlab = "Length",
  col = "lightblue", border = "black"
)
hist(
  datos_ecommerce$Time.on.App,
  main = "Time on App", xlab = "Time",
  col = "lightgreen", border = "black"
)
hist(
  datos_ecommerce$Time.on.Website,
  main = "Time on Website", xlab = "Time",
  col = "lightcoral", border = "black"
)
hist(
  datos_ecommerce$Length.of.Membership,
  main = "Length of Membership", xlab = "Length",
  col = "lightyellow", border = "black"
)
hist(
  datos_ecommerce$Yearly.Amount.Spent,
  main = "Yearly Amount Spent", xlab = "Amount",
  col = "lightpink", border = "black"
)

# Bivariate plots: yearly spend against each of the other four variables,
# drawn on a 2 x 2 grid.
par(mfrow = c(2, 2))
plot(
  datos_ecommerce$Avg..Session.Length, datos_ecommerce$Yearly.Amount.Spent,
  xlab = "Avg. Session Length", ylab = "Yearly Amount Spent",
  main = "Scatterplot of Avg. Session Length vs. Yearly Amount Spent"
)
plot(
  datos_ecommerce$Time.on.App, datos_ecommerce$Yearly.Amount.Spent,
  xlab = "Time on App", ylab = "Yearly Amount Spent",
  main = "Scatterplot of Time on App vs. Yearly Amount Spent"
)
plot(
  datos_ecommerce$Time.on.Website, datos_ecommerce$Yearly.Amount.Spent,
  xlab = "Time on Website", ylab = "Yearly Amount Spent",
  main = "Scatterplot of Time on Website vs. Yearly Amount Spent"
)
plot(
  datos_ecommerce$Length.of.Membership, datos_ecommerce$Yearly.Amount.Spent,
  xlab = "Length of Membership", ylab = "Yearly Amount Spent",
  main = "Scatterplot of Length of Membership vs. Yearly Amount Spent"
)
# Restore the original plot layout so that later plots in this script are not
# squeezed into one cell of the 2 x 2 grid.
par(old_par)

# 5.- Build the variable "AvgSessionGroup" from "Avg. Session Length", with
# ranges [29, 32), [32, 33), [33, 34) and [34, 36.2). Describe it as a
# grouping variable, draw a suitable plot, and for each new category compute
# the mean, median and standard deviation of "Yearly Amount Spent". Comment on
# your results.
# right = FALSE makes every interval closed on the left and open on the
# right, matching the labels. Values outside [29, 36.2) become NA.
datos_ecommerce[["AvgSessionGroup"]] <- cut(
  x = datos_ecommerce[["Avg..Session.Length"]],
  breaks = c(29, 32, 33, 34, 36.2),
  labels = c("[29, 32)", "[32, 33)", "[33, 34)", "[34, 36.2)"),
  right = FALSE
)
# Number of observations in each group.
table(datos_ecommerce[["AvgSessionGroup"]])
##
## [29, 32) [32, 33) [33, 34) [34, 36.2)
## 69 167 179 85
# Bar chart of the number of observations in each session-length group.
barplot(
  height = table(datos_ecommerce$AvgSessionGroup),
  main = "Distribución de Avg. Session Length por Grupo",
  xlab = "Grupos",
  ylab = "Frecuencia"
)

library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Mean, median and standard deviation of yearly spend within each
# session-length group (one output row per group).
summarise(
  group_by(datos_ecommerce, AvgSessionGroup),
  Mean_Yearly_Amount_Spent = mean(Yearly.Amount.Spent),
  Median_Yearly_Amount_Spent = median(Yearly.Amount.Spent),
  StdDev_Yearly_Amount_Spent = sd(Yearly.Amount.Spent)
)
## # A tibble: 4 × 4
## AvgSessionGroup Mean_Yearly_Amount_Spent Median_Yearly_Amount_Spent
## <fct> <dbl> <dbl>
## 1 [29, 32) 453. 446.
## 2 [32, 33) 484. 487.
## 3 [33, 34) 509. 506.
## 4 [34, 36.2) 545. 549.
## # ℹ 1 more variable: StdDev_Yearly_Amount_Spent <dbl>
#Estadísticas descriptivas para cada categoría:
# 6.- For each category of the variable "AvgSessionGroup", generate a boxplot
# of the variable "Yearly Amount Spent" and interpret each one.
library(ggplot2)
# One boxplot of yearly spend per session-length group; axis labels and the
# title are set together through labs().
ggplot(datos_ecommerce, aes(x = AvgSessionGroup, y = Yearly.Amount.Spent)) +
  geom_boxplot() +
  labs(
    x = "AvgSessionGroup",
    y = "Yearly Amount Spent",
    title = "Boxplot de Yearly Amount Spent por AvgSessionGroup"
  )

# Numeric summary behind each boxplot: for every group, the five values
# returned in boxplot.stats()$stats (lower whisker, lower hinge, median,
# upper hinge, upper whisker).
categories <- levels(datos_ecommerce$AvgSessionGroup)
# split() yields one vector of yearly spend per factor level, already named
# by level and in level order.
list_stats <- lapply(
  split(datos_ecommerce$Yearly.Amount.Spent, datos_ecommerce$AvgSessionGroup),
  function(spend) boxplot.stats(spend)$stats
)
list_stats
## $`[29, 32)`
## [1] 319.9289 416.3584 446.4187 494.6386 591.7811
##
## $`[32, 33)`
## [1] 314.4385 440.1572 486.8389 532.7211 662.9611
##
## $`[33, 34)`
## [1] 357.5914 460.5868 505.7711 558.2911 689.7876
##
## $`[34, 36.2)`
## [1] 402.1671 506.1323 548.5185 584.1059 700.9171
# 7.- For all quantitative variables, compute the correlation matrix.
# Interpret the magnitude of the correlations between the quantitative
# variables and "Yearly Amount Spent".
cor_matrix <- cor(
  x = datos_ecommerce[, c(
    "Avg..Session.Length",
    "Time.on.App",
    "Time.on.Website",
    "Length.of.Membership",
    "Yearly.Amount.Spent"
  )]
)
print(cor_matrix)
## Avg..Session.Length Time.on.App Time.on.Website
## Avg..Session.Length 1.00000000 -0.02782598 -0.034986900
## Time.on.App -0.02782598 1.00000000 0.082388273
## Time.on.Website -0.03498690 0.08238827 1.000000000
## Length.of.Membership 0.06024739 0.02914256 -0.047581819
## Yearly.Amount.Spent 0.35508829 0.49932777 -0.002640845
## Length.of.Membership Yearly.Amount.Spent
## Avg..Session.Length 0.06024739 0.355088295
## Time.on.App 0.02914256 0.499327770
## Time.on.Website -0.04758182 -0.002640845
## Length.of.Membership 1.00000000 0.809083568
## Yearly.Amount.Spent 0.80908357 1.000000000
# 8.- Suppose we want to fit a multiple linear regression model with
# "Yearly Amount Spent" as the response variable and the following as
# independent variables: Avg. Session Length, Time on App, Time on Website
# and Length of Membership.
# datos_ecommerce comes from read.csv(), which always returns a data.frame,
# so no conversion is needed before fitting.
# Fit the linear regression model.
lm_result <- lm(
  Yearly.Amount.Spent ~ Avg..Session.Length + Time.on.App +
    Time.on.Website + Length.of.Membership,
  data = datos_ecommerce
)
# Show the model summary (coefficients, residuals, R-squared, F-statistic).
summary(lm_result)
##
## Call:
## lm(formula = Yearly.Amount.Spent ~ Avg..Session.Length + Time.on.App +
## Time.on.Website + Length.of.Membership, data = datos_ecommerce)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.4059 -6.2191 -0.1364 6.6048 30.3085
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1051.5943 22.9925 -45.736 <2e-16 ***
## Avg..Session.Length 25.7343 0.4510 57.057 <2e-16 ***
## Time.on.App 38.7092 0.4510 85.828 <2e-16 ***
## Time.on.Website 0.4367 0.4441 0.983 0.326
## Length.of.Membership 61.5773 0.4483 137.346 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.973 on 495 degrees of freedom
## Multiple R-squared: 0.9843, Adjusted R-squared: 0.9842
## F-statistic: 7766 on 4 and 495 DF, p-value: < 2.2e-16
#9- Finalmente, para comunicar tus resultados elabora un reporte con la librería flexdashboard, en donde la respuesta completa de cada pregunta/inciso debe estar en una hoja aparte: por ejemplo, una hoja con la tabla de estadísticas y el texto de interpretación del inciso 3, otra con lo correspondiente del inciso 4, etc. La orientación y orden dentro de cada hoja queda a disposición tuya. Las restricciones son que los gráficos deben ser interactivos, las tablas deben poder verse completas (si son muy largas, debe ser posible arrastrar) y no debe tener problemas de exportación por errores en el código al crear el reporte.
#install.packages(c("flexdashboard", "plotly", "dplyr", "broom"))
library(flexdashboard)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
library(broom)
# Load the data.
# Make sure the file is available, or adjust the path as needed.
# Re-reading the file replaces datos_ecommerce with a fresh copy of the raw
# columns.
datos_ecommerce <- read.csv(
  file = "C:/Users/joseb/OneDrive/Escritorio/A/R/Ecommerce.txt"
)
# Fit the model.
model <- lm(
  formula = Yearly.Amount.Spent ~ Avg..Session.Length + Time.on.App +
    Time.on.Website + Length.of.Membership,
  data = datos_ecommerce
)
# Convert the model summary into a data frame (one row per coefficient) for
# easier display.
tidy_model <- broom::tidy(model)
# Render the coefficients as an interactive plotly table.
# The cell values are passed as an unnamed list with one element per column:
# a named list is serialised as a JSON object, whereas the table trace
# expects an array of column arrays.
tidy_model %>%
  plot_ly(
    type = "table",
    header = list(values = names(tidy_model)),
    cells = list(values = unname(as.list(tidy_model)))
  )
# Residual plot: interactive scatter of the model residuals against the
# fitted values.
residual_plot <- plotly::layout(
  plot_ly(
    datos_ecommerce,
    x = ~fitted(model),
    y = ~resid(model),
    type = "scatter",
    mode = "markers"
  ),
  title = "Residuos vs Valores Ajustados",
  xaxis = list(title = "Valores Ajustados"),
  yaxis = list(title = "Residuos")
)
residual_plot