Reading Parquet files
train_data<-arrow::read_parquet("datasets/train_data.parquet")
train_data %>% group_by(sku) %>% summarise(n=n())
train_data %>% filter(sku==12101)
NA
skimr::skim(
train_data %>% select_if(is.numeric)) %>%
yank("numeric")
── Variable type: numeric ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
1 sku 0 1 330374. 190792. 0 165125 330380 495596 660915 ▇▇▇▇▇
2 sold_quantity 0 1 0.990 9.99 0 0 0 1 6951 ▇▁▁▁▁
3 current_price 0 1 2358. 1263914. 0.87 48.7 119 299. 999999999 ▇▁▁▁▁
4 minutes_active 0 1 1043. 630. 0 0 1440 1440 1440 ▃▁▁▁▇
Read JSON lines files
library(tidyjson)
library(tidyr)
metadata<-jsonlite::stream_in(file("datasets/items_static_metadata_full.jl"),verbose = 0)
metadata %>% filter(item_domain_id=="MLB-SNEAKERS")
Countries and item_id are separated into two different columns
metadata<-metadata %>% separate(item_domain_id,sep = "-", into=c("country","item_domain_id"))
metadata %>% group_by(country,item_domain_id) %>% summarise(n=n())
`summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
JOIN both data.frames
train_data_meta<-train_data %>% inner_join(metadata %>% select(country,sku,item_domain_id), by="sku")
train_data_meta
Now we can explore…
library(ggplot2)
library(lubridate)
library(ggdark)
serie_ex<-train_data_meta %>% filter(country=="MLM" & item_domain_id=="ADHESIVE_TAPES") %>% group_by(date) %>% summarise(sold=sum(sold_quantity))
serie_ex %>% ggplot()+
geom_point(aes(x=ymd(date),y=sold),color='orange')+
geom_line(aes(x=ymd(date),y=sold),color='red')+
ggdark::dark_theme_bw()+
xlab("Date")+
ylab("Sold Quantity")+
labs(title="SNEAKERS Sold in Mexico for 30 days")+
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))

#train_data_meta %>% filter(country=="MLM" & item_domain_id=="SNEAKERS") %>% filter(date=="2021-02-01") %>% filter(sold_quantity!=0)
Boxplots
train_data_meta_sum<-train_data_meta %>% group_by(sku,dow) %>% summarise(total_sold_qty=sum(sold_quantity))
`summarise()` has grouped output by 'sku'. You can override using the `.groups` argument.
Warning message:
package ‘arrow’ was built under R version 4.0.5
train_data_meta_sum<-train_data_meta %>% group_by(sku,dow) %>% summarise(total_sold_qty=sum(sold_quantity))
`summarise()` has grouped output by 'sku'. You can override using the `.groups` argument.
train_data_meta_sum %>%
ggplot(aes(x=dow,y=total_sold_qty))+
# geom_point()+
#geom_violin()+
geom_boxplot(outlier.shape=NA)+
ylim(0,50)+
xlab("day_of_the_week")+
ggdark::dark_theme_bw()
Inverted geom defaults of fill and color/colour.
To change them back, use invert_geom_defaults().

Multitple time series.
train_data_meta<-train_data_meta %>% mutate(dow=ymd(date) %>% wday(label=TRUE), week=week(date))
train_data_meta_sum_m <- train_data_meta %>% group_by(item_domain_id,week,dow) %>% summarise(total_sold_qty=sum(sold_quantity))
`summarise()` has grouped output by 'item_domain_id', 'week'. You can override using the `.groups` argument.
sku_ex<-train_data_meta_sum_m %>% filter(week!=5) %>% filter(week!=13) %>% filter(item_domain_id=="ADHESIVE_TAPES")
sku_ex %>%
ggplot(aes(x=dow,y=total_sold_qty))+
# facet_wrap(~week)+
geom_point(color='orange')+
geom_smooth(aes(x=as.numeric(dow)),method = "loess",se = TRUE)+
#geom_area(aes(x=dow,y=total_sold_qty,group=week,fill = week), position = "stack") +
#geom_line(aes(group=week))+
#geom_smooth(aes(x=dow,y=total_sold_qty, gr))+
xlab("DOW")+
ylab("Sold Quantity")+
ggdark::dark_theme_bw()+
labs(title="ADHESIVE_TAPES: Total Qty sold per day of the week")+
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))

## read https://stackoverflow.com/questions/10357768/plotting-lines-and-the-group-aesthetic-in-ggplot2
sku_ex %>%
ggplot(aes(x=dow,y=total_sold_qty))+
geom_point(color='orange')+
geom_line(aes(group=week,color=as.factor(week)))+
xlab("DOW")+
ylab("Sold Quantity")+
ggdark::dark_theme_bw()+
labs(title="ADHESIVE_TAPES: Total Qty sold per day of the week")+
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))

using FACET
sku_ex %>%
ggplot(aes(x=dow,y=total_sold_qty))+
facet_wrap(~week)+
geom_point(color='orange')+
geom_line(aes(group=1))+
xlab("DOW")+
ylab("Sold Quantity")+
ggdark::dark_theme_bw()+
labs(title="ADHESIVE_TAPES: Total Qty sold per day of the week")+
theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))

LS0tCnRpdGxlOiAiTWVsaSAyMDIxIERhdGFzZXQiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CmxpYnJhcnkoYXJyb3cpCmxpYnJhcnkoc2tpbXIpCmxpYnJhcnkoanNvbmxpdGUpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoc2tpbXIpCmBgYAoKCiMgUmVhZGluZyBQYXJxdWV0IGZpbGVzCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CnRyYWluX2RhdGE8LWFycm93OjpyZWFkX3BhcnF1ZXQoImRhdGFzZXRzL3RyYWluX2RhdGEucGFycXVldCIpCnRyYWluX2RhdGEgJT4lIGdyb3VwX2J5KHNrdSkgJT4lIHN1bW1hcmlzZShuPW4oKSkKCnRyYWluX2RhdGEgJT4lIGZpbHRlcihza3U9PTEyMTAxKSAgCgpgYGAKCmBgYHtyfQoKc2tpbXI6OnNraW0oCnRyYWluX2RhdGEgJT4lIHNlbGVjdF9pZihpcy5udW1lcmljKSkgJT4lCiAgeWFuaygibnVtZXJpYyIpCgoKYGBgCiMjIFJlYWQgSlNPTiBsaW5lcyBmaWxlcwoKYGBge3IgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KbGlicmFyeSh0aWR5anNvbikKbGlicmFyeSh0aWR5cikKbWV0YWRhdGE8LWpzb25saXRlOjpzdHJlYW1faW4oZmlsZSgiZGF0YXNldHMvaXRlbXNfc3RhdGljX21ldGFkYXRhX2Z1bGwuamwiKSx2ZXJib3NlID0gMCkKbWV0YWRhdGEgJT4lIGZpbHRlcihpdGVtX2RvbWFpbl9pZD09Ik1MQi1TTkVBS0VSUyIpIApgYGAKIyMjIENvdW50cmllcyBhbmQgaXRlbV9pZCBhcmUgc2VwYXJhdGVkIGludG8gdHdvIGRpZmZlcmVudCBjb2x1bW5zCmBgYHtyfQptZXRhZGF0YTwtbWV0YWRhdGEgJT4lIHNlcGFyYXRlKGl0ZW1fZG9tYWluX2lkLHNlcCA9ICItIiwgaW50bz1jKCJjb3VudHJ5IiwiaXRlbV9kb21haW5faWQiKSkKbWV0YWRhdGEgICU+JSBncm91cF9ieShjb3VudHJ5LGl0ZW1fZG9tYWluX2lkKSAlPiUgc3VtbWFyaXNlKG49bigpKQpgYGAKCiMjIEpPSU4gYm90aCBkYXRhLmZyYW1lcwoKYGBge3J9CnRyYWluX2RhdGFfbWV0YTwtdHJhaW5fZGF0YSAlPiUgaW5uZXJfam9pbihtZXRhZGF0YSAlPiUgc2VsZWN0KGNvdW50cnksc2t1LGl0ZW1fZG9tYWluX2lkKSwgYnk9InNrdSIpCnRyYWluX2RhdGFfbWV0YQpgYGAKCiMjIE5vdyB3ZSBjYW4gZXhwbG9yZS4uLgoKCmBgYHtyfQoKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KGx1YnJpZGF0ZSkKbGlicmFyeShnZ2RhcmspCnNlcmllX2V4PC10cmFpbl9kYXRhX21ldGEgJT4lIGZpbHRlcihjb3VudHJ5PT0iTUxNIiAmIGl0ZW1fZG9tYWluX2lkPT0iQURIRVNJVkVfVEFQRVMiKSAlPiUgZ3JvdXBfYnkoZGF0ZSkgJT4lIHN1bW1hcmlzZShzb2xkPXN1bShzb2xkX3F1YW50aXR5KSkgCgoKc2VyaWVfZXggJT4lIGdncGxvdCgpKwogIGdlb21fcG9pbnQoYWVzKHg9eW1kKGRhdGUpLHk9c29sZCksY29sb3I9J29yYW5nZScpKwogIGdlb21fbGluZShhZXMoeD15bWQoZGF0ZSkseT1zb2xkKSxjb2xvcj0ncmVkJykrCiAgZ2dkYXJrOjpkYXJrX3RoZW1lX2J3KCkrCiAgeGxhYigiRGF0ZSIpKwogIHlsYWIoIlNvbGQgUXVhbnRpdHkiKSsKICBsYWJzKHRpdGxlPSJTTkVBS0VSUyBTb2xkIGluIE1leGljbyBmb3IgMzAgZGF5cyIpKwogIHRoZW1lKGF4aXMudGV4dC54ID0gZWxlbWVudF90ZXh0KGFuZ2xlID0gNDUsIHZqdXN0ID0gMC41LCBoanVzdD0xKSkKCiN0cmFpbl9kYXRhX21ldGEgJT4lIGZpbHRlcihjb3VudHJ5PT0iTUxNIiAmIGl0ZW1fZG9tYWluX2lkPT0iU05FQUtFUlMiKSAlPiUgZmlsdGVyKGRhdGU9PSIyMDIxLTAyLTAxIikgJT4lIGZpbHRlcihzb2xkX3F1YW50aXR5IT0wKQpgYGAKCiMjIEJveHBsb3RzCmBgYHtyfQpsaWJyYXJ5KGx1YnJpZGF0ZSkKdHJhaW5fZGF0YV9tZXRhPC10cmFpbl9kYXRhX21ldGEgJT4lIG11dGF0ZShkb3c9eW1kKGRhdGUpICU+JSB3ZGF5KGxhYmVsPVRSVUUpLCB3ZWVrPXdlZWsoZGF0ZSkpCnRyYWluX2RhdGFfbWV0YV9zdW08LXRyYWluX2RhdGFfbWV0YSAlPiUgZ3JvdXBfYnkoc2t1LGRvdykgJT4lIHN1bW1hcmlzZSh0b3RhbF9zb2xkX3F0eT1zdW0oc29sZF9xdWFudGl0eSkpIAp0cmFpbl9kYXRhX21ldGFfc3VtICU+JQogIGdncGxvdChhZXMoeD1kb3cseT10b3RhbF9zb2xkX3F0eSkpKwogIyBnZW9tX3BvaW50KCkrCiAgI2dlb21fdmlvbGluKCkrCiAgZ2VvbV9ib3hwbG90KG91dGxpZXIuc2hhcGU9TkEpKwogIHlsaW0oMCw1MCkrCiAgeGxhYigiZGF5X29mX3RoZV93ZWVrIikrCiAgZ2dkYXJrOjpkYXJrX3RoZW1lX2J3KCkKYGBgCgojIyBNdWx0aXRwbGUgdGltZSBzZXJpZXMuCmBgYHtyIGVjaG89VFJVRX0KdHJhaW5fZGF0YV9tZXRhPC10cmFpbl9kYXRhX21ldGEgJT4lIG11dGF0ZShkb3c9eW1kKGRhdGUpICU+JSB3ZGF5KGxhYmVsPVRSVUUpLCB3ZWVrPXdlZWsoZGF0ZSkpCnRyYWluX2RhdGFfbWV0YV9zdW1fbSA8LSB0cmFpbl9kYXRhX21ldGEgJT4lIGdyb3VwX2J5KGl0ZW1fZG9tYWluX2lkLHdlZWssZG93KSAlPiUgc3VtbWFyaXNlKHRvdGFsX3NvbGRfcXR5PXN1bShzb2xkX3F1YW50aXR5KSkgCgpza3VfZXg8LXRyYWluX2RhdGFfbWV0YV9zdW1fbSAgJT4lIGZpbHRlcih3ZWVrIT01KSAlPiUgZmlsdGVyKHdlZWshPTEzKSAlPiUgZmlsdGVyKGl0ZW1fZG9tYWluX2lkPT0iQURIRVNJVkVfVEFQRVMiKQpza3VfZXggJT4lCiAgZ2dwbG90KGFlcyh4PWRvdyx5PXRvdGFsX3NvbGRfcXR5KSkrCiAjIGZhY2V0X3dyYXAofndlZWspKwogIGdlb21fcG9pbnQoY29sb3I9J29yYW5nZScpKwogIGdlb21fc21vb3RoKGFlcyh4PWFzLm51bWVyaWMoZG93KSksbWV0aG9kID0gImxvZXNzIixzZSA9IFRSVUUpKwogICNnZW9tX2FyZWEoYWVzKHg9ZG93LHk9dG90YWxfc29sZF9xdHksZ3JvdXA9d2VlayxmaWxsID0gd2VlayksIHBvc2l0aW9uID0gInN0YWNrIikgKwogICNnZW9tX2xpbmUoYWVzKGdyb3VwPXdlZWspKSsKICAjZ2VvbV9zbW9vdGgoYWVzKHg9ZG93LHk9dG90YWxfc29sZF9xdHksIGdyKSkrCiAgeGxhYigiRE9XIikrCiAgCiAgeWxhYigiU29sZCBRdWFudGl0eSIpKwogIGdnZGFyazo6ZGFya190aGVtZV9idygpKwogIGxhYnModGl0bGU9IkFESEVTSVZFX1RBUEVTOiBUb3RhbCBRdHkgc29sZCBwZXIgZGF5IG9mIHRoZSB3ZWVrIikrCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgdmp1c3QgPSAwLjUsIGhqdXN0PTEpKQoKIyMgcmVhZCBodHRwczovL3N0YWNrb3ZlcmZsb3cuY29tL3F1ZXN0aW9ucy8xMDM1Nzc2OC9wbG90dGluZy1saW5lcy1hbmQtdGhlLWdyb3VwLWFlc3RoZXRpYy1pbi1nZ3Bsb3QyCiAgCmBgYApgYGB7cn0KCnNrdV9leCAlPiUKICBnZ3Bsb3QoYWVzKHg9ZG93LHk9dG90YWxfc29sZF9xdHkpKSsKICBnZW9tX3BvaW50KGNvbG9yPSdvcmFuZ2UnKSsKICBnZW9tX2xpbmUoYWVzKGdyb3VwPXdlZWssY29sb3I9YXMuZmFjdG9yKHdlZWspKSkrCiAgeGxhYigiRE9XIikrCiAgeWxhYigiU29sZCBRdWFudGl0eSIpKwogIGdnZGFyazo6ZGFya190aGVtZV9idygpKwogIGxhYnModGl0bGU9IkFESEVTSVZFX1RBUEVTOiBUb3RhbCBRdHkgc29sZCBwZXIgZGF5IG9mIHRoZSB3ZWVrIikrCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgdmp1c3QgPSAwLjUsIGhqdXN0PTEpKQoKYGBgCgojIyB1c2luZyBGQUNFVApgYGB7cn0Kc2t1X2V4ICU+JQogIGdncGxvdChhZXMoeD1kb3cseT10b3RhbF9zb2xkX3F0eSkpKwogIGZhY2V0X3dyYXAofndlZWspKwogIGdlb21fcG9pbnQoY29sb3I9J29yYW5nZScpKwogIGdlb21fbGluZShhZXMoZ3JvdXA9MSkpKwogIHhsYWIoIkRPVyIpKwogIHlsYWIoIlNvbGQgUXVhbnRpdHkiKSsKICBnZ2Rhcms6OmRhcmtfdGhlbWVfYncoKSsKICBsYWJzKHRpdGxlPSJBREhFU0lWRV9UQVBFUzogVG90YWwgUXR5IHNvbGQgcGVyIGRheSBvZiB0aGUgd2VlayIpKwogIHRoZW1lKGF4aXMudGV4dC54ID0gZWxlbWVudF90ZXh0KGFuZ2xlID0gNDUsIHZqdXN0ID0gMC41LCBoanVzdD0xKSkKCmBgYAo=