## Exploración inicial de datos

# Carga de librerías
library(readxl)
library(ggplot2)
library(reshape2)
# Read the ifood_df.xlsx workbook into `df` and preview its first six rows.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact drive layout exists.
df <- read_excel(
  path = "G:/TRABAJO/DOCENCIA/UNIANDES/Esp. Inteligencia de Mercados/001 DataViz I/archive/ifood_df.xlsx"
)
head(df, n = 6L)
## # A tibble: 6 × 39
##   Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts
##    <dbl>   <dbl>    <dbl>   <dbl>    <dbl>     <dbl>           <dbl>
## 1  58138       0        0      58      635        88             546
## 2  46344       1        1      38       11         1               6
## 3  71613       0        0      26      426        49             127
## 4  26646       1        0      26       11         4              20
## 5  58293       1        0      94      173        43             118
## 6  62513       0        1      16      520        42              98
## # ℹ 32 more variables: MntFishProducts <dbl>, MntSweetProducts <dbl>,
## #   MntGoldProds <dbl>, NumDealsPurchases <dbl>, NumWebPurchases <dbl>,
## #   NumCatalogPurchases <dbl>, NumStorePurchases <dbl>,
## #   NumWebVisitsMonth <dbl>, AcceptedCmp3 <dbl>, AcceptedCmp4 <dbl>,
## #   AcceptedCmp5 <dbl>, AcceptedCmp1 <dbl>, AcceptedCmp2 <dbl>, Complain <dbl>,
## #   Z_CostContact <dbl>, Z_Revenue <dbl>, Response <dbl>, Age <dbl>,
## #   Customer_Days <dbl>, marital_Divorced <dbl>, marital_Married <dbl>, …
# Size of the data: number of rows, then number of columns
c(nrow(df), ncol(df))
## [1] 2205   39
# Class of every column, returned as a named character vector.
# vapply() guarantees exactly one string per column: a column carrying several
# classes (e.g. a date-time) is collapsed with "/" instead of silently turning
# the whole result into a list, which is what sapply() would do.
vapply(
  df,
  function(columna) paste(class(columna), collapse = "/"),
  character(1)
)
##               Income              Kidhome             Teenhome 
##            "numeric"            "numeric"            "numeric" 
##              Recency             MntWines            MntFruits 
##            "numeric"            "numeric"            "numeric" 
##      MntMeatProducts      MntFishProducts     MntSweetProducts 
##            "numeric"            "numeric"            "numeric" 
##         MntGoldProds    NumDealsPurchases      NumWebPurchases 
##            "numeric"            "numeric"            "numeric" 
##  NumCatalogPurchases    NumStorePurchases    NumWebVisitsMonth 
##            "numeric"            "numeric"            "numeric" 
##         AcceptedCmp3         AcceptedCmp4         AcceptedCmp5 
##            "numeric"            "numeric"            "numeric" 
##         AcceptedCmp1         AcceptedCmp2             Complain 
##            "numeric"            "numeric"            "numeric" 
##        Z_CostContact            Z_Revenue             Response 
##            "numeric"            "numeric"            "numeric" 
##                  Age        Customer_Days     marital_Divorced 
##            "numeric"            "numeric"            "numeric" 
##      marital_Married       marital_Single     marital_Together 
##            "numeric"            "numeric"            "numeric" 
##        marital_Widow   education_2n Cycle      education_Basic 
##            "numeric"            "numeric"            "numeric" 
## education_Graduation     education_Master        education_PhD 
##            "numeric"            "numeric"            "numeric" 
##             MntTotal      MntRegularProds   AcceptedCmpOverall 
##            "numeric"            "numeric"            "numeric"
# Per-column descriptive statistics (min, quartiles, mean, max)
summary(object = df)
##      Income          Kidhome          Teenhome         Recency     
##  Min.   :  1730   Min.   :0.0000   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.: 35196   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:24.00  
##  Median : 51287   Median :0.0000   Median :0.0000   Median :49.00  
##  Mean   : 51622   Mean   :0.4422   Mean   :0.5066   Mean   :49.01  
##  3rd Qu.: 68281   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:74.00  
##  Max.   :113734   Max.   :2.0000   Max.   :2.0000   Max.   :99.00  
##     MntWines        MntFruits     MntMeatProducts  MntFishProducts 
##  Min.   :   0.0   Min.   :  0.0   Min.   :   0.0   Min.   :  0.00  
##  1st Qu.:  24.0   1st Qu.:  2.0   1st Qu.:  16.0   1st Qu.:  3.00  
##  Median : 178.0   Median :  8.0   Median :  68.0   Median : 12.00  
##  Mean   : 306.2   Mean   : 26.4   Mean   : 165.3   Mean   : 37.76  
##  3rd Qu.: 507.0   3rd Qu.: 33.0   3rd Qu.: 232.0   3rd Qu.: 50.00  
##  Max.   :1493.0   Max.   :199.0   Max.   :1725.0   Max.   :259.00  
##  MntSweetProducts  MntGoldProds    NumDealsPurchases NumWebPurchases 
##  Min.   :  0.00   Min.   :  0.00   Min.   : 0.000    Min.   : 0.000  
##  1st Qu.:  1.00   1st Qu.:  9.00   1st Qu.: 1.000    1st Qu.: 2.000  
##  Median :  8.00   Median : 25.00   Median : 2.000    Median : 4.000  
##  Mean   : 27.13   Mean   : 44.06   Mean   : 2.318    Mean   : 4.101  
##  3rd Qu.: 34.00   3rd Qu.: 56.00   3rd Qu.: 3.000    3rd Qu.: 6.000  
##  Max.   :262.00   Max.   :321.00   Max.   :15.000    Max.   :27.000  
##  NumCatalogPurchases NumStorePurchases NumWebVisitsMonth  AcceptedCmp3    
##  Min.   : 0.000      Min.   : 0.000    Min.   : 0.000    Min.   :0.00000  
##  1st Qu.: 0.000      1st Qu.: 3.000    1st Qu.: 3.000    1st Qu.:0.00000  
##  Median : 2.000      Median : 5.000    Median : 6.000    Median :0.00000  
##  Mean   : 2.645      Mean   : 5.824    Mean   : 5.337    Mean   :0.07392  
##  3rd Qu.: 4.000      3rd Qu.: 8.000    3rd Qu.: 7.000    3rd Qu.:0.00000  
##  Max.   :28.000      Max.   :13.000    Max.   :20.000    Max.   :1.00000  
##   AcceptedCmp4      AcceptedCmp5      AcceptedCmp1     AcceptedCmp2    
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.00000   Median :0.0000   Median :0.00000  
##  Mean   :0.07438   Mean   :0.07302   Mean   :0.0644   Mean   :0.01361  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.0000   Max.   :1.00000  
##     Complain       Z_CostContact   Z_Revenue     Response          Age      
##  Min.   :0.00000   Min.   :3     Min.   :11   Min.   :0.000   Min.   :24.0  
##  1st Qu.:0.00000   1st Qu.:3     1st Qu.:11   1st Qu.:0.000   1st Qu.:43.0  
##  Median :0.00000   Median :3     Median :11   Median :0.000   Median :50.0  
##  Mean   :0.00907   Mean   :3     Mean   :11   Mean   :0.151   Mean   :51.1  
##  3rd Qu.:0.00000   3rd Qu.:3     3rd Qu.:11   3rd Qu.:0.000   3rd Qu.:61.0  
##  Max.   :1.00000   Max.   :3     Max.   :11   Max.   :1.000   Max.   :80.0  
##  Customer_Days  marital_Divorced marital_Married  marital_Single  
##  Min.   :2159   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:2339   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :2515   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :2513   Mean   :0.1043   Mean   :0.3873   Mean   :0.2163  
##  3rd Qu.:2688   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :2858   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  marital_Together marital_Widow     education_2n Cycle education_Basic  
##  Min.   :0.0000   Min.   :0.00000   Min.   :0.0000     Min.   :0.00000  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000     1st Qu.:0.00000  
##  Median :0.0000   Median :0.00000   Median :0.0000     Median :0.00000  
##  Mean   :0.2576   Mean   :0.03447   Mean   :0.0898     Mean   :0.02449  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.0000     3rd Qu.:0.00000  
##  Max.   :1.0000   Max.   :1.00000   Max.   :1.0000     Max.   :1.00000  
##  education_Graduation education_Master education_PhD       MntTotal     
##  Min.   :0.0000       Min.   :0.0000   Min.   :0.0000   Min.   :   4.0  
##  1st Qu.:0.0000       1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:  56.0  
##  Median :1.0000       Median :0.0000   Median :0.0000   Median : 343.0  
##  Mean   :0.5048       Mean   :0.1651   Mean   :0.2159   Mean   : 562.8  
##  3rd Qu.:1.0000       3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.: 964.0  
##  Max.   :1.0000       Max.   :1.0000   Max.   :1.0000   Max.   :2491.0  
##  MntRegularProds  AcceptedCmpOverall
##  Min.   :-283.0   Min.   :0.0000    
##  1st Qu.:  42.0   1st Qu.:0.0000    
##  Median : 288.0   Median :0.0000    
##  Mean   : 518.7   Mean   :0.2993    
##  3rd Qu.: 884.0   3rd Qu.:0.0000    
##  Max.   :2458.0   Max.   :4.0000

## Análisis gráfico

# Box plot of Income. The empty string mapped to x puts every row in a single
# group, so one box is drawn.
ggplot(data = df, mapping = aes(x = "", y = Income)) +
  geom_boxplot() +
  ggtitle("Boxplot de Income") +
  ylab("Income") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) # hjust = 0.5 centres the title

# Histogram of Income drawn on the density scale, with a kernel density curve
# layered on top so both share the same y axis.
# after_stat(density) replaces the `..density..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0 and raised a deprecation warning.
ggplot(df, aes(x = Income)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "blue", color = "black", alpha = 0.4
  ) +
  geom_density(fill = "blue", alpha = 0.4) +
  labs(title = "Diagrama de Densidad de Income", x = "Income", y = "Densidad") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) # hjust = 0.5 centres the title

# Bar chart with the number of rows for each Kidhome value. Kidhome is turned
# into a factor so every distinct value gets its own discrete bar.
ggplot(data = df, mapping = aes(x = factor(Kidhome))) +
  geom_bar(fill = "blue") +
  ggtitle("Distribución de Kidhome") +
  xlab("Número de niños en el hogar") +
  ylab("Frecuencia") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) # hjust = 0.5 centres the title

# Scatter plot of Income against MntTotal with a linear-regression trend line.
# The formula is passed explicitly so the fitted model is visible in the code
# and geom_smooth() no longer prints its "using formula = 'y ~ x'" message.
ggplot(df, aes(x = MntTotal, y = Income)) +
  geom_point(color = "blue", alpha = 0.5) +
  geom_smooth(
    method = "lm", formula = y ~ x,
    color = "red", se = FALSE # se = FALSE hides the confidence band
  ) +
  labs(
    title = "Diagrama de Dispersión entre MntTotal e Income con Línea de Tendencia",
    x = "MntTotal",
    y = "Income"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) # hjust = 0.5 centres the title

# Cargar las librerías necesarias
library(ggplot2)


# Names of the numeric columns that go into the correlation analysis
columnas_a_incluir <- c(
  "Income", "MntMeatProducts", "MntWines", "MntFruits",
  "MntFishProducts", "MntSweetProducts", "MntGoldProds"
)

# Keep only those columns
df_numeric <- df[columnas_a_incluir]

# Pairwise correlations; "complete.obs" uses only rows without missing values
corr_matrix <- cor(x = df_numeric, use = "complete.obs")

# Reshape the square matrix to long format (Var1, Var2, value) so ggplot2 can
# draw one tile per pair of variables
melted_corr_matrix <- melt(corr_matrix)

# Heatmap of the correlation matrix: one tile per variable pair, coloured on a
# diverging blue-white-red scale centred at 0 and labelled with the rounded
# coefficient.
ggplot(melted_corr_matrix, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0,
    # Spelled out as `limits`: the previous `limit` only worked through
    # partial argument matching. Fixing the range to [-1, 1] keeps the colour
    # scale comparable regardless of the observed correlations.
    limits = c(-1, 1),
    space = "Lab",
    name = "Correlación"
  ) +
  geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
  labs(title = "Matriz de Correlación", x = "", y = "") +
  theme_minimal() +
  theme(
    # Rotate the x labels so the long variable names do not overlap
    axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1),
    plot.title = element_text(hjust = 0.5) # hjust = 0.5 centres the title
  )

# Columns to drop; in the summary output above both are constant
# (Z_CostContact is always 3, Z_Revenue is always 11)
columnas_a_eliminar <- c("Z_CostContact", "Z_Revenue")

# Keep every column whose name is not in the drop list
df <- df[!(names(df) %in% columnas_a_eliminar)]