# Load the wholesale-customers data and turn the coded Channel / Region
# columns into labelled factors.

# Install dplyr only when it is missing, so the script does not reinstall it
# on every run.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

# plyr is attached BEFORE dplyr: attaching it afterwards would mask dplyr
# verbs such as mutate() and summarise() with the plyr versions.
library("plyr")
library("dplyr")

df <- read.csv("Wholesale_customers_data.csv", header = TRUE)

str(df)
head(df)

df$Channel <- as.factor(df$Channel)
df$Region <- as.factor(df$Region)

# Replace the numeric codes with readable level names.
df$Channel <- revalue(df$Channel, c("1" = "Horeca", "2" = "Retail"))
df$Region <- revalue(
  df$Region,
  c("1" = "Lisbon", "2" = "Oporto", "3" = "Other")
)
library(ggplot2)
library(gridExtra)

#' Boxplot of one spending column split by sales channel
#'
#' @param data Data frame with a `Channel` column and the column named in
#'   `variable`.
#' @param variable Name (string) of the column to put on the y axis.
#' @return A ggplot object with the legend hidden and an empty x-axis title.
channel_boxplot <- function(data, variable) {
  ggplot(data, aes(x = Channel, y = .data[[variable]], fill = Channel)) +
    geom_boxplot() +
    theme(legend.position = "none") +
    xlab("") +
    # .data[[...]] would otherwise show up as the axis title.
    ylab(variable)
}

grocery <- channel_boxplot(df, "Grocery")
delicassen <- channel_boxplot(df, "Delicassen")
detergents <- channel_boxplot(df, "Detergents_Paper")
milk <- channel_boxplot(df, "Milk")
fresh <- channel_boxplot(df, "Fresh")
frozen <- channel_boxplot(df, "Frozen")

# Six panels laid out on a grid with three columns.
grid.arrange(fresh, milk, grocery, frozen, detergents, delicassen, ncol = 3)
# Boxplot of Fresh for the Horeca channel only, with the individual
# observations overlaid as semi-transparent jittered points.
ggplot(
  dplyr::filter(df, Channel == "Horeca"),
  aes(x = Channel, y = Fresh, fill = Channel)
) +
  geom_boxplot() +
  theme(legend.position = "none") +
  xlab("") +
  geom_jitter(alpha = 0.25)
###########################
# HIERARCHICAL ALGORITHMS #
###########################
# Keep the Horeca customers and drop the first column (Channel, which is
# constant after the subset).
df_horeca <- subset(df, Channel == "Horeca")
df_horeca <- df_horeca[, -1]

library(tidyverse)
library(cluster)
library(factoextra)
library(dendextend)

# One subset per region.
df_horeca_op <- subset(df_horeca, Region == "Oporto")
df_horeca_li <- subset(df_horeca, Region == "Lisbon")
df_horeca_oth <- subset(df_horeca, Region == "Other")

# Drop the first remaining column (Region) so only the spending columns
# are left.
df_horeca_op <- df_horeca_op[-1]
df_horeca_li <- df_horeca_li[-1]
df_horeca_oth <- df_horeca_oth[-1]

# Standardise each column; scale() returns a numeric matrix.
df_horeca_op <- scale(df_horeca_op)
df_horeca_li <- scale(df_horeca_li)
df_horeca_oth <- scale(df_horeca_oth)

# Pairwise Euclidean distances between customers.
dist_matrix_op <- dist(df_horeca_op, method = "euclidean")
dist_matrix_li <- dist(df_horeca_li, method = "euclidean")
dist_matrix_oth <- dist(df_horeca_oth, method = "euclidean")

# Agglomerative clustering with the "ward.D2" linkage method.
hc_op <- hclust(dist_matrix_op, method = "ward.D2")
hc_li <- hclust(dist_matrix_li, method = "ward.D2")
hc_oth <- hclust(dist_matrix_oth, method = "ward.D2")
# Dendrogram per region, with rectangles drawn around the 4-cluster cut.
plot(hc_op, hang = -1, cex = 0.5)
rect.hclust(hc_op, k = 4, border = 2:6)

plot(hc_li, hang = -1, cex = 0.6)
rect.hclust(hc_li, k = 4, border = 2:6)

plot(hc_oth, hang = -1, cex = 0.6)
rect.hclust(hc_oth, k = 4, border = 2:6)
# Cut the "Other" region tree into 4 groups.
groups <- cutree(hc_oth, k = 4)

#' Print the observations that belong to each cluster
#'
#' @param labels Vector of cluster ids, one per row of `data`.
#' @param k Number of clusters to print (ids 1..k).
#' @param data Matrix or data frame holding the six spending columns.
#'   Defaults to the global `df_horeca_oth`, which is what the function
#'   always used before this parameter existed.
#' @return Called for its printing side effect.
print_clusters <- function(labels, k, data = df_horeca_oth) {
  spending_cols <- c(
    "Fresh", "Milk", "Grocery", "Frozen", "Detergents_Paper", "Delicassen"
  )
  # seq_len() yields an empty loop for k = 0, where 1:k would give c(1, 0).
  for (i in seq_len(k)) {
    print(paste("cluster", i))
    # drop = FALSE keeps a one-row cluster as a table instead of a vector.
    print(data[labels == i, spending_cols, drop = FALSE])
  }
}

# Run the function on the 4-group cut.
print_clusters(groups, 4)
# Install devtools / ggfortify only when missing.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
# devtools must be attached before install_github() can be called.
library(devtools)

if (!requireNamespace("ggfortify", quietly = TRUE)) {
  install_github("sinhrks/ggfortify")
}
library(ggfortify)
library(ggplot2)
#' Replace outliers with NA using the 1.5 * IQR rule
#'
#' A value is treated as an outlier when it lies below the first quartile
#' minus 1.5 * IQR or above the third quartile plus 1.5 * IQR.
#'
#' @param x Numeric vector.
#' @param na.rm Passed to `quantile()` and `IQR()`; whether missing values
#'   are ignored when computing the fences.
#' @return A vector the same length as `x` with outliers set to `NA`.
remove_outliers <- function(x, na.rm = TRUE) {
  qnt <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm)
  fence <- 1.5 * IQR(x, na.rm = na.rm)
  lower <- qnt[[1]] - fence
  upper <- qnt[[2]] + fence

  y <- x
  # Missing inputs stay missing; only observed values are compared.
  y[!is.na(x) & (x < lower | x > upper)] <- NA
  y
}
# Copy of the data with outliers blanked out in every spending column.
df2 <- df
outlier_cols <- c(
  "Delicassen", "Detergents_Paper", "Frozen", "Grocery", "Milk", "Fresh"
)
df2[outlier_cols] <- lapply(df2[outlier_cols], remove_outliers)

# Drop every row that had at least one outlier (now NA).
df2 <- na.omit(df2)

# PCA on everything except the first two columns (Channel, Region),
# with variables scaled to unit variance.
pca <- prcomp(df2[-(1:2)], scale. = TRUE)
# PCA biplot: individuals as points filled by Channel, variables coloured
# and faded by their contribution.
fviz_pca_biplot(
  pca,
  geom.ind = "point",
  fill.ind = df2$Channel,
  col.ind = "black",
  pointshape = 21,
  pointsize = 2,
  palette = "jco",
  addEllipses = TRUE,
  alpha.var = "contrib",
  col.var = "contrib",
  gradient.cols = "RdYlBu",
  legend.title = list(fill = "Channel", color = "Contrib", alpha = "Contrib")
)
# k-means starts from random centres; fix the seed so the clustering (and
# the cluster numbering) is the same on every run.
set.seed(123)

# Standardised spending columns, computed once and reused below.
df2_scaled <- scale(df2[-(1:2)])

# Silhouette plot to choose the number of clusters.
fviz_nbclust(df2_scaled, kmeans, method = "silhouette")

k2 <- kmeans(df2_scaled, centers = 2, nstart = 25)

fviz_cluster(
  k2,
  geom = "point",
  data = df2_scaled,
  main = "Cluster plot",
  xlab = "Primera Componente Principal",
  ylab = "Segunda Componente Principal",
  ellipse.type = "norm",
  ggtheme = theme_classic(),
  palette = "jco"
)
# Prepend the cluster assignment to the data. cbind() names the new first
# column after the expression, i.e. literally "k2$cluster".
df3 <- cbind(k2$cluster, df2)
df3[["k2$cluster"]] <- as.factor(df3[["k2$cluster"]])

# NOTE(review): k-means cluster ids are arbitrary, so cluster 1 is not
# guaranteed to be the Horeca-like group -- confirm against the data before
# trusting this mapping.
df3[["k2$cluster"]] <- revalue(
  df3[["k2$cluster"]],
  c("1" = "Horeca", "2" = "Retail")
)

# Pairwise scatter plots of the spending columns, coloured first by the
# true channel and then by the k-means cluster.
plot(
  df3[-(1:3)],
  col = df3$Channel,
  main = "Representación de los Datos",
  pch = 20,
  cex = 0.5
)
plot(
  df3[-(1:3)],
  col = k2$cluster,
  main = "Representación del Cluster",
  pch = 20,
  cex = 0.5
)
# Install caret only when missing.
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret")
}
library("caret")

# Confusion matrix of the relabelled k-means clusters (first argument)
# against the true Channel (second argument).
matriz_de_confusion <- confusionMatrix(df3[["k2$cluster"]], df3$Channel)
# Install GGally only when missing.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)

# Correlation (FIGURE 10): spending columns of the raw data and of the
# outlier-filtered data.
ggcorr(df[-(1:2)], label = TRUE, legend.position = "left", hjust = 0.8)
ggcorr(df2[-(1:2)], label = TRUE, legend.position = "left", hjust = 0.8)

# Alternative:
# ggcorr(df[-(1:2)], label = TRUE, hjust = 0.75, geom = "blank") +
#   geom_point(
#     size = 15,
#     aes(color = coefficient < 0, alpha = abs(coefficient) > 0.4)
#   ) +
#   scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
#   guides(color = "none", alpha = "none")
# One data frame per Channel x Region combination.
df_li_ho <- dplyr::filter(df, Channel == "Horeca", Region == "Lisbon")
df_op_ho <- dplyr::filter(df, Channel == "Horeca", Region == "Oporto")
df_ot_ho <- dplyr::filter(df, Channel == "Horeca", Region == "Other")
df_ot_re <- dplyr::filter(df, Channel == "Retail", Region == "Other")
df_op_re <- dplyr::filter(df, Channel == "Retail", Region == "Oporto")
df_li_re <- dplyr::filter(df, Channel == "Retail", Region == "Lisbon")
#' Correlation matrix plot that highlights strong correlations
#'
#' Draws the labelled coefficients only (`geom = "blank"`) and puts a large
#' semi-transparent point behind every coefficient whose absolute value
#' exceeds `threshold`, coloured by whether the coefficient is negative.
#'
#' @param data Data frame whose first two columns are dropped before the
#'   correlations are computed.
#' @param threshold Absolute correlation above which a cell is highlighted.
#' @return A ggplot object with the colour and alpha legends hidden.
highlight_strong_correlations <- function(data, threshold = 0.7) {
  ggcorr(data[-(1:2)], label = TRUE, hjust = 0.8, geom = "blank") +
    geom_point(
      size = 15,
      aes(color = coefficient < 0, alpha = abs(coefficient) > threshold)
    ) +
    scale_alpha_manual(values = c("TRUE" = 0.25, "FALSE" = 0)) +
    # "none" replaces the deprecated guides(... = FALSE) form.
    guides(color = "none", alpha = "none")
}

# Horeca: Lisbon, Oporto, Other.
p1 <- highlight_strong_correlations(df_li_ho)
p2 <- highlight_strong_correlations(df_op_ho)
p3 <- highlight_strong_correlations(df_ot_ho)
# Retail: Lisbon, Oporto, Other.
p4 <- highlight_strong_correlations(df_li_re)
p5 <- highlight_strong_correlations(df_op_re)
p6 <- highlight_strong_correlations(df_ot_re)
# Install devtools / easyGgplot2 only when missing.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)

# install_github() takes a single "user/repo" string; the old
# (repo, username) two-argument form is no longer supported.
if (!requireNamespace("easyGgplot2", quietly = TRUE)) {
  install_github("kassambara/easyGgplot2")
}
library(easyGgplot2)

# All six segment correlation plots on one page, three columns wide.
ggplot2.multiplot(p1, p2, p3, p4, p5, p6, cols = 3)
# Install rgl only when missing.
if (!requireNamespace("rgl", quietly = TRUE)) {
  install.packages("rgl")
}
library("rgl")
# library() instead of require(): fail here if plotly is not installed
# rather than later at the plot_ly() call.
library(plotly)

# Interactive 3D scatter of three spending columns, coloured by channel.
plot_ly(
  df2,
  x = ~Detergents_Paper,
  y = ~Milk,
  z = ~Grocery,
  color = ~Channel
) %>%
  add_markers()
# Standardise the spending columns and put the first two columns
# (Channel, Region) back in front of them.
df_est <- scale(df[-c(1, 2)])
df_est <- cbind(df[1:2], df_est)

# Pair plots coloured by channel (Region column dropped) ...
ggpairs(
  df_est[-2],
  aes(colour = Channel, alpha = 0.4),
  title = "Resumen estadistico por canales"
)
# ... and by region (Channel column dropped).
ggpairs(
  df_est[-1],
  aes(colour = Region, alpha = 0.4),
  title = "Resumen estadistico por regiones"
)