##Exploracion inicial de datos
#Cargue Librerias
library(readxl)
library(ggplot2)
library(reshape2)
# Read the ifood_df.xlsx workbook into `df` (read_excel returns a tibble).
# NOTE(review): the path is absolute and specific to one machine; it has to be
# adjusted before running the script anywhere else.
df <- read_excel(
  "G:/TRABAJO/DOCENCIA/UNIANDES/Esp. Inteligencia de Mercados/001 DataViz I/archive/ifood_df.xlsx"
)
# Preview the first rows of the data.
head(df)
## # A tibble: 6 × 39
## Income Kidhome Teenhome Recency MntWines MntFruits MntMeatProducts
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 58138 0 0 58 635 88 546
## 2 46344 1 1 38 11 1 6
## 3 71613 0 0 26 426 49 127
## 4 26646 1 0 26 11 4 20
## 5 58293 1 0 94 173 43 118
## 6 62513 0 1 16 520 42 98
## # ℹ 32 more variables: MntFishProducts <dbl>, MntSweetProducts <dbl>,
## # MntGoldProds <dbl>, NumDealsPurchases <dbl>, NumWebPurchases <dbl>,
## # NumCatalogPurchases <dbl>, NumStorePurchases <dbl>,
## # NumWebVisitsMonth <dbl>, AcceptedCmp3 <dbl>, AcceptedCmp4 <dbl>,
## # AcceptedCmp5 <dbl>, AcceptedCmp1 <dbl>, AcceptedCmp2 <dbl>, Complain <dbl>,
## # Z_CostContact <dbl>, Z_Revenue <dbl>, Response <dbl>, Age <dbl>,
## # Customer_Days <dbl>, marital_Divorced <dbl>, marital_Married <dbl>, …
# Size of the data: number of rows followed by number of columns
c(nrow(df), ncol(df))
## [1] 2205 39
# Class of every column, as a named character vector.
# vapply() guarantees one string per column; a column carrying several classes
# is reported as "class1/class2" instead of silently turning the result into a
# list, which is what sapply() would do.
vapply(
  df,
  function(columna) paste(class(columna), collapse = "/"),
  character(1)
)
## Income Kidhome Teenhome
## "numeric" "numeric" "numeric"
## Recency MntWines MntFruits
## "numeric" "numeric" "numeric"
## MntMeatProducts MntFishProducts MntSweetProducts
## "numeric" "numeric" "numeric"
## MntGoldProds NumDealsPurchases NumWebPurchases
## "numeric" "numeric" "numeric"
## NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
## "numeric" "numeric" "numeric"
## AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## "numeric" "numeric" "numeric"
## AcceptedCmp1 AcceptedCmp2 Complain
## "numeric" "numeric" "numeric"
## Z_CostContact Z_Revenue Response
## "numeric" "numeric" "numeric"
## Age Customer_Days marital_Divorced
## "numeric" "numeric" "numeric"
## marital_Married marital_Single marital_Together
## "numeric" "numeric" "numeric"
## marital_Widow education_2n Cycle education_Basic
## "numeric" "numeric" "numeric"
## education_Graduation education_Master education_PhD
## "numeric" "numeric" "numeric"
## MntTotal MntRegularProds AcceptedCmpOverall
## "numeric" "numeric" "numeric"
# Per-column descriptive statistics (min, quartiles, median, mean, max)
summary(object = df)
## Income Kidhome Teenhome Recency
## Min. : 1730 Min. :0.0000 Min. :0.0000 Min. : 0.00
## 1st Qu.: 35196 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:24.00
## Median : 51287 Median :0.0000 Median :0.0000 Median :49.00
## Mean : 51622 Mean :0.4422 Mean :0.5066 Mean :49.01
## 3rd Qu.: 68281 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:74.00
## Max. :113734 Max. :2.0000 Max. :2.0000 Max. :99.00
## MntWines MntFruits MntMeatProducts MntFishProducts
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.00
## 1st Qu.: 24.0 1st Qu.: 2.0 1st Qu.: 16.0 1st Qu.: 3.00
## Median : 178.0 Median : 8.0 Median : 68.0 Median : 12.00
## Mean : 306.2 Mean : 26.4 Mean : 165.3 Mean : 37.76
## 3rd Qu.: 507.0 3rd Qu.: 33.0 3rd Qu.: 232.0 3rd Qu.: 50.00
## Max. :1493.0 Max. :199.0 Max. :1725.0 Max. :259.00
## MntSweetProducts MntGoldProds NumDealsPurchases NumWebPurchases
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.00 1st Qu.: 9.00 1st Qu.: 1.000 1st Qu.: 2.000
## Median : 8.00 Median : 25.00 Median : 2.000 Median : 4.000
## Mean : 27.13 Mean : 44.06 Mean : 2.318 Mean : 4.101
## 3rd Qu.: 34.00 3rd Qu.: 56.00 3rd Qu.: 3.000 3rd Qu.: 6.000
## Max. :262.00 Max. :321.00 Max. :15.000 Max. :27.000
## NumCatalogPurchases NumStorePurchases NumWebVisitsMonth AcceptedCmp3
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :0.00000
## 1st Qu.: 0.000 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.:0.00000
## Median : 2.000 Median : 5.000 Median : 6.000 Median :0.00000
## Mean : 2.645 Mean : 5.824 Mean : 5.337 Mean :0.07392
## 3rd Qu.: 4.000 3rd Qu.: 8.000 3rd Qu.: 7.000 3rd Qu.:0.00000
## Max. :28.000 Max. :13.000 Max. :20.000 Max. :1.00000
## AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.07438 Mean :0.07302 Mean :0.0644 Mean :0.01361
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## Complain Z_CostContact Z_Revenue Response Age
## Min. :0.00000 Min. :3 Min. :11 Min. :0.000 Min. :24.0
## 1st Qu.:0.00000 1st Qu.:3 1st Qu.:11 1st Qu.:0.000 1st Qu.:43.0
## Median :0.00000 Median :3 Median :11 Median :0.000 Median :50.0
## Mean :0.00907 Mean :3 Mean :11 Mean :0.151 Mean :51.1
## 3rd Qu.:0.00000 3rd Qu.:3 3rd Qu.:11 3rd Qu.:0.000 3rd Qu.:61.0
## Max. :1.00000 Max. :3 Max. :11 Max. :1.000 Max. :80.0
## Customer_Days marital_Divorced marital_Married marital_Single
## Min. :2159 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:2339 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :2515 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :2513 Mean :0.1043 Mean :0.3873 Mean :0.2163
## 3rd Qu.:2688 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :2858 Max. :1.0000 Max. :1.0000 Max. :1.0000
## marital_Together marital_Widow education_2n Cycle education_Basic
## Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.0000 Median :0.00000
## Mean :0.2576 Mean :0.03447 Mean :0.0898 Mean :0.02449
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.00000
## education_Graduation education_Master education_PhD MntTotal
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. : 4.0
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 56.0
## Median :1.0000 Median :0.0000 Median :0.0000 Median : 343.0
## Mean :0.5048 Mean :0.1651 Mean :0.2159 Mean : 562.8
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 964.0
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :2491.0
## MntRegularProds AcceptedCmpOverall
## Min. :-283.0 Min. :0.0000
## 1st Qu.: 42.0 1st Qu.:0.0000
## Median : 288.0 Median :0.0000
## Mean : 518.7 Mean :0.2993
## 3rd Qu.: 884.0 3rd Qu.:0.0000
## Max. :2458.0 Max. :4.0000
##Analisis Grafico
# Box plot of Income drawn as a single group (empty string on the x axis)
ggplot(data = df, mapping = aes(x = "", y = Income)) +
  geom_boxplot() +
  ggtitle("Boxplot de Income") +
  ylab("Income") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Histogram of Income on the density scale with a density curve drawn on top.
# after_stat(density) replaces the `..density..` dot-dot notation, which was
# deprecated in ggplot2 3.4.0 and raised a lifecycle warning.
ggplot(df, aes(x = Income)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "blue", color = "black", alpha = 0.4
  ) + # Histogram scaled to density
  geom_density(fill = "blue", alpha = 0.4) + # Density curve
  labs(title = "Diagrama de Densidad de Income", x = "Income", y = "Densidad") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Bar chart with the number of rows for each distinct value of Kidhome
ggplot(data = df, mapping = aes(x = factor(Kidhome))) +
  geom_bar(fill = "blue") +
  ggtitle("Distribución de Kidhome") +
  xlab("Número de niños en el hogar") +
  ylab("Frecuencia") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) # Center the title
# Scatter plot of Income against MntTotal with a linear-regression trend line.
# The plot title previously contained a mis-encoded "í" in "Línea"; the string
# is now valid UTF-8 so the title renders correctly.
ggplot(df, aes(x = MntTotal, y = Income)) +
  geom_point(color = "blue", alpha = 0.5) + # Scatter points
  geom_smooth(method = "lm", color = "red", se = FALSE) + # Linear fit, no band
  labs(
    title = "Diagrama de Dispersión entre MntTotal e Income con Línea de Tendencia",
    x = "MntTotal",
    y = "Income"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) # Center the title
## `geom_smooth()` using formula = 'y ~ x'
# Load the required libraries (ggplot2 is also attached at the top of the file)
library(ggplot2)
# Names of the numeric columns to include in the correlation analysis
columnas_a_incluir <- c(
  "Income", "MntMeatProducts", "MntWines", "MntFruits",
  "MntFishProducts", "MntSweetProducts", "MntGoldProds"
)
# Keep only the selected columns
df_numeric <- df[, columnas_a_incluir]
# Correlation matrix computed on rows without missing values
corr_matrix <- cor(df_numeric, use = "complete.obs")
# Reshape the matrix to long format (Var1, Var2, value) for ggplot2
melted_corr_matrix <- melt(corr_matrix)
# Heatmap of the correlations, with each coefficient printed inside its tile.
# `limits` is spelled out in full; the previous `limit` only worked through
# partial argument matching.
ggplot(melted_corr_matrix, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "blue", high = "red", mid = "white",
    midpoint = 0, limits = c(-1, 1), space = "Lab",
    name = "Correlación"
  ) +
  geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
  labs(title = "Matriz de Correlación", x = "", y = "") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )
# Columns to discard from the data
columnas_a_eliminar <- c("Z_CostContact", "Z_Revenue")
# Keep every column whose name has no match in the discard list
df <- df[, is.na(match(names(df), columnas_a_eliminar))]