PONTIFICIA UNIVERSIDAD CATÓLICA DE CHILE

# Read the e-commerce customer data set; the first line of the file holds the
# column names.
datos_ecommerce <- read.csv(
  file = "C:/Users/joseb/OneDrive/Escritorio/A/R/Ecommerce.txt",
  header = TRUE
)

# Preview the first six rows.
head(datos_ecommerce, n = 6L)

##                           Email
## 1     mstephenson@fernandez.com
## 2             hduke@hotmail.com
## 3              pallen@yahoo.com
## 4       riverarebecca@gmail.com
## 5 mstephens@davidson-herman.com
## 6        alvareznancy@lucas.biz
##                                                      Address           Avatar
## 1               835 Frank Tunnel\nWrightmouth, MI 82180-9605           Violet
## 2             4547 Archer Common\nDiazchester, CA 06566-8576        DarkGreen
## 3 24645 Valerie Unions Suite 582\nCobbborough, DC 99414-7564           Bisque
## 4           1414 David Throughway\nPort Jason, OH 22070-1220      SaddleBrown
## 5    14023 Rodriguez Passage\nPort Jacobville, PR 37242-1057 MediumAquaMarine
## 6    645 Martha Park Apt. 611\nJeffreychester, MN 67218-7250      FloralWhite
##   Avg..Session.Length Time.on.App Time.on.Website Length.of.Membership
## 1            34.49727    12.65565        39.57767             4.082621
## 2            31.92627    11.10946        37.26896             2.664034
## 3            33.00091    11.33028        37.11060             4.104543
## 4            34.30556    13.71751        36.72128             3.120179
## 5            33.33067    12.79519        37.53665             4.446308
## 6            33.87104    12.02693        34.47688             5.493507
##   Yearly.Amount.Spent
## 1            587.9511
## 2            392.2049
## 3            487.5475
## 4            581.8523
## 5            599.4061
## 6            637.1024

# Display every row that has at least one missing value. Rows are only
# listed here; nothing is removed from the data frame.
datos_ecommerce[which(!complete.cases(datos_ecommerce)), , drop = FALSE]

## [1] Email                Address              Avatar              
## [4] Avg..Session.Length  Time.on.App          Time.on.Website     
## [7] Length.of.Membership Yearly.Amount.Spent 
## <0 rows> (or 0-length row.names)

# No se observan filas vacías o con falta de información.

# Inspect the type of each column in the data set.

utils::str(datos_ecommerce)

## 'data.frame':    500 obs. of  8 variables:
##  $ Email               : chr  "mstephenson@fernandez.com" "hduke@hotmail.com" "pallen@yahoo.com" "riverarebecca@gmail.com" ...
##  $ Address             : chr  "835 Frank Tunnel\nWrightmouth, MI 82180-9605" "4547 Archer Common\nDiazchester, CA 06566-8576" "24645 Valerie Unions Suite 582\nCobbborough, DC 99414-7564" "1414 David Throughway\nPort Jason, OH 22070-1220" ...
##  $ Avatar              : chr  "Violet" "DarkGreen" "Bisque" "SaddleBrown" ...
##  $ Avg..Session.Length : num  34.5 31.9 33 34.3 33.3 ...
##  $ Time.on.App         : num  12.7 11.1 11.3 13.7 12.8 ...
##  $ Time.on.Website     : num  39.6 37.3 37.1 36.7 37.5 ...
##  $ Length.of.Membership: num  4.08 2.66 4.1 3.12 4.45 ...
##  $ Yearly.Amount.Spent : num  588 392 488 582 599 ...

# 3.- Realiza una estadística descriptiva de las variables en estudio (excluyendo email, Address y Avatar). Para las variables cuantitativas utiliza medidas de localización, dispersión y forma. Interpreta los resultados.

# Load the package that supplies the shape statistics.
library(moments)  # provides skewness() and kurtosis()

# Keep only the quantitative columns under study.
vars_of_interest <- datos_ecommerce[, c("Avg..Session.Length", "Time.on.App", "Time.on.Website", "Length.of.Membership", "Yearly.Amount.Spent")]

# Location (mean, median), dispersion (sd, variance, range) and shape
# (skewness, kurtosis) measures, one row per variable.
# vapply() is used instead of sapply() so that every statistic is guaranteed
# to be exactly one number per column; anything else raises an error instead
# of silently changing the shape of the table.
measures <- data.frame(
  Variable = colnames(vars_of_interest),
  Mean = vapply(vars_of_interest, mean, numeric(1)),
  Median = vapply(vars_of_interest, median, numeric(1)),
  Std.Dev = vapply(vars_of_interest, sd, numeric(1)),
  Variance = vapply(vars_of_interest, var, numeric(1)),
  Range = vapply(vars_of_interest, function(x) max(x) - min(x), numeric(1)),
  Skewness = vapply(vars_of_interest, skewness, numeric(1)),
  Kurtosis = vapply(vars_of_interest, kurtosis, numeric(1))
)

measures

##                                  Variable       Mean     Median    Std.Dev
## Avg..Session.Length   Avg..Session.Length  33.053194  33.082008  0.9925631
## Time.on.App                   Time.on.App  12.052488  11.983231  0.9942156
## Time.on.Website           Time.on.Website  37.060445  37.069367  1.0104889
## Length.of.Membership Length.of.Membership   3.533462   3.533975  0.9992775
## Yearly.Amount.Spent   Yearly.Amount.Spent 499.314038 498.887875 79.3147815
##                          Variance      Range    Skewness Kurtosis
## Avg..Session.Length     0.9851815   6.607234 -0.03207820 2.999767
## Time.on.App             0.9884647   6.618842 -0.08885351 3.111858
## Time.on.Website         1.0210878   6.091334  0.01210542 2.890717
## Length.of.Membership    0.9985555   6.652788 -0.10628796 3.333553
## Yearly.Amount.Spent  6290.8345723 508.847880  0.03468573 3.447373

# 4.- Construye gráficos univariados para todas las variables cuantitativas. Luego, construye gráficos bivariados (dispersión) entre la variable “Yearly Amount Spent” y cada una de las variables restantes. Interpreta los resultados.

# Univariate plots: one histogram per quantitative variable on a 2 x 3 grid.
# par() returns the previous settings; they are restored at the end so the
# grid layout does not leak into plots drawn later in the script.
old_par <- par(mfrow = c(2, 3))

# Column to plot, together with its title, x-axis label and fill colour.
hist_specs <- list(
  list(column = "Avg..Session.Length", main = "Avg. Session Length", xlab = "Length", col = "lightblue"),
  list(column = "Time.on.App", main = "Time on App", xlab = "Time", col = "lightgreen"),
  list(column = "Time.on.Website", main = "Time on Website", xlab = "Time", col = "lightcoral"),
  list(column = "Length.of.Membership", main = "Length of Membership", xlab = "Length", col = "lightyellow"),
  list(column = "Yearly.Amount.Spent", main = "Yearly Amount Spent", xlab = "Amount", col = "lightpink")
)

for (spec in hist_specs) {
  hist(
    datos_ecommerce[[spec$column]],
    main = spec$main,
    xlab = spec$xlab,
    col = spec$col,
    border = "black"
  )
}

par(old_par)

# Bivariate plots: Yearly.Amount.Spent against each remaining quantitative
# variable, on a 2 x 2 grid. The previous graphical settings are saved and
# restored so that later plots are not squeezed into one cell of this grid.
old_par <- par(mfrow = c(2, 2))

# Column name -> human-readable label used on the axis and in the title.
predictor_labels <- c(
  Avg..Session.Length = "Avg. Session Length",
  Time.on.App = "Time on App",
  Time.on.Website = "Time on Website",
  Length.of.Membership = "Length of Membership"
)

for (predictor in names(predictor_labels)) {
  plot(
    datos_ecommerce[[predictor]],
    datos_ecommerce$Yearly.Amount.Spent,
    xlab = predictor_labels[[predictor]],
    ylab = "Yearly Amount Spent",
    main = paste0("Scatterplot of ", predictor_labels[[predictor]], " vs. Yearly Amount Spent")
  )
}

par(old_par)

# 5.- Construye la variable “AvgSessionGroup” a partir de la variable “Avg. Session Length”, con rangos [29, 32), [32, 33), [33, 34), y [34, 36.2). Descríbela desde la perspectiva de una variable de agrupación, realiza un gráfico adecuado, y para cada una de las nuevas categorías determina la media, la mediana y la desviación estándar de la variable “Yearly Amount Spent”. Comenta tus resultados.

# Bin the average session length into four left-closed, right-open intervals
# and store the result as a factor column.
datos_ecommerce[["AvgSessionGroup"]] <- with(
  datos_ecommerce,
  cut(
    Avg..Session.Length,
    breaks = c(29, 32, 33, 34, 36.2),
    labels = c("[29, 32)", "[32, 33)", "[33, 34)", "[34, 36.2)"),
    right = FALSE
  )
)

# Number of customers falling in each interval.
table(datos_ecommerce[["AvgSessionGroup"]])

## 
##   [29, 32)   [32, 33)   [33, 34) [34, 36.2) 
##         69        167        179         85

# Bar chart of the group frequencies.
barplot(
  height = table(datos_ecommerce[["AvgSessionGroup"]]),
  main = "Distribución de Avg. Session Length por Grupo",
  xlab = "Grupos",
  ylab = "Frecuencia"
)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Mean, median and standard deviation of yearly spending within each
# session-length group.
# NOTE(review): with default tibble printing the recorded output truncated the
# StdDev column; consider printing with width = Inf if it must be visible.
summarise(
  group_by(datos_ecommerce, AvgSessionGroup),
  Mean_Yearly_Amount_Spent = mean(Yearly.Amount.Spent),
  Median_Yearly_Amount_Spent = median(Yearly.Amount.Spent),
  StdDev_Yearly_Amount_Spent = sd(Yearly.Amount.Spent)
)

## # A tibble: 4 × 4
##   AvgSessionGroup Mean_Yearly_Amount_Spent Median_Yearly_Amount_Spent
##   <fct>                              <dbl>                      <dbl>
## 1 [29, 32)                            453.                       446.
## 2 [32, 33)                            484.                       487.
## 3 [33, 34)                            509.                       506.
## 4 [34, 36.2)                          545.                       549.
## # ℹ 1 more variable: StdDev_Yearly_Amount_Spent <dbl>

#Estadísticas descriptivas para cada categoría:

# 6.- Para cada una de las categorías de la variable “AvgSessionGroup”, genera un boxplot de la variable “Yearly Amount Spent” e interpreta cada uno de ellos.

library(ggplot2)

# One boxplot of yearly spending per session-length group.
ggplot(datos_ecommerce, aes(x = AvgSessionGroup, y = Yearly.Amount.Spent)) +
  geom_boxplot() +
  labs(
    x = "AvgSessionGroup",
    y = "Yearly Amount Spent",
    title = "Boxplot de Yearly Amount Spent por AvgSessionGroup"
  )

# Group labels, in factor-level order.
categories <- levels(datos_ecommerce$AvgSessionGroup)

# Split yearly spending by group and keep, for each group, the five numbers
# that boxplot.stats() reports in $stats (lower whisker, lower hinge, median,
# upper hinge, upper whisker). split() names the list by factor level.
spent_by_group <- split(
  datos_ecommerce$Yearly.Amount.Spent,
  datos_ecommerce$AvgSessionGroup
)
list_stats <- lapply(spent_by_group, function(values) boxplot.stats(values)$stats)

list_stats

## $`[29, 32)`
## [1] 319.9289 416.3584 446.4187 494.6386 591.7811
## 
## $`[32, 33)`
## [1] 314.4385 440.1572 486.8389 532.7211 662.9611
## 
## $`[33, 34)`
## [1] 357.5914 460.5868 505.7711 558.2911 689.7876
## 
## $`[34, 36.2)`
## [1] 402.1671 506.1323 548.5185 584.1059 700.9171

# 7.- Para todas las variables cuantitativas, calcula la matriz de las correlaciones. Interpreta la magnitud de las correlaciones de las variables cuantitativas con la variable “Yearly Amount Spent”.

# Pairwise correlation matrix of the five quantitative variables.
cor_matrix <- cor(
  datos_ecommerce[c(
    "Avg..Session.Length",
    "Time.on.App",
    "Time.on.Website",
    "Length.of.Membership",
    "Yearly.Amount.Spent"
  )]
)
print(cor_matrix)

##                      Avg..Session.Length Time.on.App Time.on.Website
## Avg..Session.Length           1.00000000 -0.02782598    -0.034986900
## Time.on.App                  -0.02782598  1.00000000     0.082388273
## Time.on.Website              -0.03498690  0.08238827     1.000000000
## Length.of.Membership          0.06024739  0.02914256    -0.047581819
## Yearly.Amount.Spent           0.35508829  0.49932777    -0.002640845
##                      Length.of.Membership Yearly.Amount.Spent
## Avg..Session.Length            0.06024739         0.355088295
## Time.on.App                    0.02914256         0.499327770
## Time.on.Website               -0.04758182        -0.002640845
## Length.of.Membership           1.00000000         0.809083568
## Yearly.Amount.Spent            0.80908357         1.000000000

# 8.- Supón que se desea realizar un modelo de regresión lineal múltiple considerando como variable respuesta “Yearly Amount Spent” y como variables independientes:


# datos_ecommerce comes from read.csv(), which always returns a data.frame,
# so no coercion is needed before fitting.

# Multiple linear regression of yearly spending on the four usage variables.
lm_result <- lm(
  Yearly.Amount.Spent ~ Avg..Session.Length + Time.on.App + Time.on.Website +
    Length.of.Membership,
  data = datos_ecommerce
)

# Coefficient table, residual summary and goodness-of-fit statistics.
summary(lm_result)

## 
## Call:
## lm(formula = Yearly.Amount.Spent ~ Avg..Session.Length + Time.on.App + 
##     Time.on.Website + Length.of.Membership, data = datos_ecommerce)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.4059  -6.2191  -0.1364   6.6048  30.3085 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1051.5943    22.9925 -45.736   <2e-16 ***
## Avg..Session.Length     25.7343     0.4510  57.057   <2e-16 ***
## Time.on.App             38.7092     0.4510  85.828   <2e-16 ***
## Time.on.Website          0.4367     0.4441   0.983    0.326    
## Length.of.Membership    61.5773     0.4483 137.346   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.973 on 495 degrees of freedom
## Multiple R-squared:  0.9843, Adjusted R-squared:  0.9842 
## F-statistic:  7766 on 4 and 495 DF,  p-value: < 2.2e-16

#9- Finalmente, para comunicar tus resultados elabora un reporte con la librería flexdashboard, en donde la respuesta completa de cada pregunta/inciso debe estar en una hoja aparte: por ejemplo, una hoja con la tabla de estadísticas y el texto de interpretación del inciso 3, otra con lo correspondiente del inciso 4, etc. La orientación y orden dentro de cada hoja queda a disposición tuya. Las restricciones son que los gráficos deben ser interactivos, las tablas deben poder verse completas (si son muy largas, debe ser posible arrastrar) y no debe tener problemas de exportación por errores en el código al crear el reporte.

#install.packages(c("flexdashboard", "plotly", "dplyr", "broom"))
library(flexdashboard)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(dplyr)
library(broom)

# Load the data for the dashboard section.
# Adjust the path if the file lives somewhere else on your machine.
datos_ecommerce <- read.csv(
  file = "C:/Users/joseb/OneDrive/Escritorio/A/R/Ecommerce.txt",
  header = TRUE
)

# Fit the multiple linear regression used by the table and plot below.
model <- lm(
  formula = Yearly.Amount.Spent ~ Avg..Session.Length + Time.on.App +
    Time.on.Website + Length.of.Membership,
  data = datos_ecommerce
)

# Turn the model coefficients into a data frame (one row per term) and show
# it as an interactive plotly table.
tidy_model <- broom::tidy(model)

# The column list is unnamed before being handed to plotly so that it is
# passed as a plain sequence of columns rather than a named list.
# NOTE(review): a named list is presumably serialised as a JSON object, which
# the table trace does not read as columns; confirm against the plotly docs.
plot_ly(
  type = "table",
  header = list(values = names(tidy_model)),
  cells = list(values = unname(as.list(tidy_model)))
)

# Interactive scatter plot of residuals against fitted values.
residual_plot <- plotly::layout(
  plot_ly(
    datos_ecommerce,
    x = ~fitted(model),
    y = ~resid(model),
    type = "scatter",
    mode = "markers"
  ),
  title = "Residuos vs Valores Ajustados",
  xaxis = list(title = "Valores Ajustados"),
  yaxis = list(title = "Residuos")
)
residual_plot

PONTIFICIA UNIVERSIDAD CATÓLICA DE CHILE

JOSE BAEZA

2023-10-02