library(dplyr)
library(ggplot2)
library(lattice)
#library(caret)
Df <- read.csv("C:/Users/HP/Documents/EDA/R/ulabox_orders_with_categories_partials_2017/ulabox_orders_with_categories_partials_2017.csv")
# Feature Re-Engineering
Df$customer <- as.factor(Df$customer)
Df$order <- as.factor(Df$order)
Df$weekday <- as.factor(Df$weekday)
Df$hour <- as.factor(Df$hour)
str(Df)
## 'data.frame': 30000 obs. of 14 variables:
## $ customer : Factor w/ 10239 levels "0","1","2","3",..: 1 1 1 2 2 2 2 2 2 2 ...
## $ order : Factor w/ 30000 levels "0","1","2","3",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ total_items: int 45 38 51 57 53 8 35 12 35 44 ...
## $ discount. : num 23.03 1.22 18.08 16.51 18.31 ...
## $ weekday : Factor w/ 7 levels "1","2","3","4",..: 4 5 4 1 2 4 1 4 1 2 ...
## $ hour : Factor w/ 24 levels "0","1","2","3",..: 14 14 14 13 12 14 11 9 13 13 ...
## $ Food. : num 9.46 15.87 16.88 28.81 24.13 ...
## $ Fresh. : num 87.1 75.8 56.8 36 60.4 ...
## $ Drinks. : num 3.48 6.22 3.37 11.78 7.78 ...
## $ Home. : num 0 2.12 16.48 4.62 7.72 ...
## $ Beauty. : num 0 0 6.53 2.87 0 ...
## $ Health. : num 0 0 0 15.9 0 ...
## $ Baby. : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Pets. : num 0 0 0 0 0 0 0 0 0 0 ...
ggplot(Df, aes(total_items)) +
geom_histogram(binwidth = 3, alpha = 0.63,fill = "grey75") +
theme_bw() +
labs(title = "Total Items' on histogram", x = "Total items per order", y ="Frequency")
Plot is displaying a Log-normal distribution.
Df %>% count(customer, sort =T) %>% head(20)
## # A tibble: 20 x 2
## customer n
## <fct> <int>
## 1 6560 52
## 2 7688 40
## 3 7776 37
## 4 657 36
## 5 4713 31
## 6 4 29
## 7 9220 29
## 8 2245 28
## 9 2616 28
## 10 2931 28
## 11 4707 28
## 12 1276 26
## 13 2632 26
## 14 4114 26
## 15 7032 26
## 16 7973 26
## 17 3195 25
## 18 4940 25
## 19 1656 24
## 20 2428 24
CFreq <- Df %>% count(customer, sort =T) #Descending order of customer orders
colnames(CFreq)[2] <- "cfreq" ## Renaming the second column
Df <- full_join(Df, CFreq, by = "customer") #Attaching new feature to the dataset
WFreq <- Df %>% count(weekday, sort = F) #Desc order of weekday orders
colnames(WFreq)[2] <- "dfreq"
Df <- full_join(Df, WFreq, by = "weekday")
HFreq <- Df %>% count(hour, sort = F)
colnames(HFreq)[2] <- 'hreq'
Df <- full_join(Df, HFreq, by = "hour")
Df <- Df %>% mutate(Paid.Pcnt = (1-(discount./100))*100)
Price.item <- round(12000000/sum(Df$total_items), 2) #12,000,000 here represents the est. annual revenue {12 * 1,000,000+
Df <- Df %>% mutate(Paid.Price = total_items*Price.item)
Df$Sell.Price <- if_else(Df$Paid.Pcnt == 0, 0, Df$Paid.Pcnt*100/Df$Paid.Pcnt) # 100%
Df$Sell.Price <- round(Df$Sell.Price)
-5 Geography served due to Fresh Food category’s contraint
Note F Food was only served to Madrid & Barcelona
Df$geo <- if_else(Df$Fresh. >0, "Madrid.Barcelona", "Rest.of.Spain")
# The code creates a dichotomony, if Df$Fresh. is 0, then create "M.B"; else (Df$Fresh is not 0) create "RoS"
Df$geo <- as.factor(Df$geo) # Makes our new column a Factor
Df %>%
filter(discount. > 0) %>%
ggplot(aes(x = total_items, y = (discount.))) +
geom_density_2d(aes(), col = "royalblue", alpha = 0.9) +
facet_grid(geo~weekday, scales = "free") +
theme_bw() +
coord_cartesian(ylim = c(0, 80))
## Warning: Computation failed in `stat_density2d()`:
## Number of x coordinates must match number of columns in density matrix.
## Warning: Computation failed in `stat_density2d()`:
## Number of x coordinates must match number of columns in density matrix.
## Why is day 3 not returning any value.?
MAdrid and Barcelena despite their excusive offer get less discounts. The first conclusion will be left as reference for bad data habits, and a reminder to standardize the graph’s scale which I did with coord_cartesian.Df %>% count(weekday, sort = TRUE)
## # A tibble: 7 x 2
## weekday n
## <fct> <int>
## 1 1 6237
## 2 7 5160
## 3 2 4970
## 4 3 4708
## 5 4 4081
## 6 5 2538
## 7 6 2306
Df %>%
count(weekday, sort = T) %>%
ggplot(aes(x = reorder(weekday, -n), y=n)) +
geom_histogram(stat = "identity", fill = "gray75") +
labs(x = "WeekDays", y="Purchases")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Attach Labels to Weekday.
Df %>% count(hour, sort = T)
## # A tibble: 24 x 2
## hour n
## <fct> <int>
## 1 22 2491
## 2 21 2306
## 3 12 2211
## 4 11 2184
## 5 20 2040
## 6 10 2024
## 7 13 1983
## 8 23 1875
## 9 19 1775
## 10 18 1539
## # ... with 14 more rows
Df %>%
count(hour, sort = T) %>%
ggplot(aes(x = reorder(hour, -n), y = n)) +
geom_histogram(stat = "identity") +
labs(x = "Hours", y = "Purchases")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
Most orders past 10am, the highest been at 10pm at night; creating a nice dicothomy of peak periods in the day and at night.
Df %>%
group_by(total_items) %>%
summarize(m = mean(discount.), n=n()) %>%
ggplot(aes(x=n, y=m))+
geom_point() +
geom_smooth(color="red")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
From the above graph, the relationship is quite responsive and discount. does seem to matter up to a limit, we see a minor increase in purchases when discounts are highest, however discounts hit a ceiling - given the spread of purchases of general discounts.
Df_LR <- lm(total_items ~ Food. + Fresh. + Drinks. + Home. + Beauty. + Health. + Baby. + Pets.,data=Df)
summary(Df_LR)
##
## Call:
## lm(formula = total_items ~ Food. + Fresh. + Drinks. + Home. +
## Beauty. + Health. + Baby. + Pets., data = Df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.53 -11.08 -3.53 7.44 262.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.36188 5.80077 -0.235 0.81438
## Food. 0.45892 0.05818 7.887 3.19e-15 ***
## Fresh. 0.43846 0.05815 7.540 4.82e-14 ***
## Drinks. 0.26872 0.05820 4.617 3.91e-06 ***
## Home. 0.33079 0.05820 5.683 1.33e-08 ***
## Beauty. 0.16262 0.05928 2.743 0.00609 **
## Health. 0.17691 0.06146 2.878 0.00400 **
## Baby. 0.11773 0.05822 2.022 0.04319 *
## Pets. 0.19406 0.06055 3.205 0.00135 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.89 on 29991 degrees of freedom
## Multiple R-squared: 0.1576, Adjusted R-squared: 0.1574
## F-statistic: 701.4 on 8 and 29991 DF, p-value: < 2.2e-16
Food. and ’Fresh.` contribute the most to our basket.