library(arrow)
library(skimr)
library(jsonlite)
library(dplyr)
library(skimr)

Reading Parquet files

train_data<-arrow::read_parquet("datasets/train_data.parquet")
train_data %>% group_by(sku) %>% summarise(n=n())

train_data %>% filter(sku==12101)  
NA

skimr::skim(
train_data %>% select_if(is.numeric)) %>%
  yank("numeric")

── Variable type: numeric ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────
  skim_variable  n_missing complete_rate       mean         sd    p0      p25    p50     p75      p100 hist 
1 sku                    0             1 330374.     190792.    0    165125   330380 495596     660915 ▇▇▇▇▇
2 sold_quantity          0             1      0.990       9.99  0         0        0      1       6951 ▇▁▁▁▁
3 current_price          0             1   2358.    1263914.    0.87     48.7    119    299. 999999999 ▇▁▁▁▁
4 minutes_active         0             1   1043.        630.    0         0     1440   1440       1440 ▃▁▁▁▇

Read JSON lines files

library(tidyjson)
library(tidyr)
metadata<-jsonlite::stream_in(file("datasets/items_static_metadata_full.jl"),verbose = 0)
metadata %>% filter(item_domain_id=="MLB-SNEAKERS") 

Countries and item_id are separated into two different columns

metadata<-metadata %>% separate(item_domain_id,sep = "-", into=c("country","item_domain_id"))
metadata  %>% group_by(country,item_domain_id) %>% summarise(n=n())
`summarise()` has grouped output by 'country'. You can override using the `.groups` argument.

JOIN both data.frames

train_data_meta<-train_data %>% inner_join(metadata %>% select(country,sku,item_domain_id), by="sku")
train_data_meta

Now we can explore…


library(ggplot2)
library(lubridate)
library(ggdark)
serie_ex<-train_data_meta %>% filter(country=="MLM" & item_domain_id=="ADHESIVE_TAPES") %>% group_by(date) %>% summarise(sold=sum(sold_quantity)) 


serie_ex %>% ggplot()+
  geom_point(aes(x=ymd(date),y=sold),color='orange')+
  geom_line(aes(x=ymd(date),y=sold),color='red')+
  ggdark::dark_theme_bw()+
  xlab("Date")+
  ylab("Sold Quantity")+
  labs(title="SNEAKERS Sold in Mexico for 30 days")+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))


#train_data_meta %>% filter(country=="MLM" & item_domain_id=="SNEAKERS") %>% filter(date=="2021-02-01") %>% filter(sold_quantity!=0)

Boxplots

train_data_meta_sum<-train_data_meta %>% group_by(sku,dow) %>% summarise(total_sold_qty=sum(sold_quantity)) 
`summarise()` has grouped output by 'sku'. You can override using the `.groups` argument.
Warning message:
package ‘arrow’ was built under R version 4.0.5 
train_data_meta_sum<-train_data_meta %>% group_by(sku,dow) %>% summarise(total_sold_qty=sum(sold_quantity)) 
`summarise()` has grouped output by 'sku'. You can override using the `.groups` argument.
train_data_meta_sum %>%
  ggplot(aes(x=dow,y=total_sold_qty))+
 # geom_point()+
  #geom_violin()+
  geom_boxplot(outlier.shape=NA)+
  ylim(0,50)+
  xlab("day_of_the_week")+
  ggdark::dark_theme_bw()
Inverted geom defaults of fill and color/colour.
To change them back, use invert_geom_defaults().

Multitple time series.

train_data_meta<-train_data_meta %>% mutate(dow=ymd(date) %>% wday(label=TRUE), week=week(date))
train_data_meta_sum_m <- train_data_meta %>% group_by(item_domain_id,week,dow) %>% summarise(total_sold_qty=sum(sold_quantity)) 
`summarise()` has grouped output by 'item_domain_id', 'week'. You can override using the `.groups` argument.
sku_ex<-train_data_meta_sum_m  %>% filter(week!=5) %>% filter(week!=13) %>% filter(item_domain_id=="ADHESIVE_TAPES")
sku_ex %>%
  ggplot(aes(x=dow,y=total_sold_qty))+
 # facet_wrap(~week)+
  geom_point(color='orange')+
  geom_smooth(aes(x=as.numeric(dow)),method = "loess",se = TRUE)+
  #geom_area(aes(x=dow,y=total_sold_qty,group=week,fill = week), position = "stack") +
  #geom_line(aes(group=week))+
  #geom_smooth(aes(x=dow,y=total_sold_qty, gr))+
  xlab("DOW")+
  
  ylab("Sold Quantity")+
  ggdark::dark_theme_bw()+
  labs(title="ADHESIVE_TAPES: Total Qty sold per day of the week")+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))


## read https://stackoverflow.com/questions/10357768/plotting-lines-and-the-group-aesthetic-in-ggplot2
  

sku_ex %>%
  ggplot(aes(x=dow,y=total_sold_qty))+
  geom_point(color='orange')+
  geom_line(aes(group=week,color=as.factor(week)))+
  xlab("DOW")+
  ylab("Sold Quantity")+
  ggdark::dark_theme_bw()+
  labs(title="ADHESIVE_TAPES: Total Qty sold per day of the week")+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))

using FACET

sku_ex %>%
  ggplot(aes(x=dow,y=total_sold_qty))+
  facet_wrap(~week)+
  geom_point(color='orange')+
  geom_line(aes(group=1))+
  xlab("DOW")+
  ylab("Sold Quantity")+
  ggdark::dark_theme_bw()+
  labs(title="ADHESIVE_TAPES: Total Qty sold per day of the week")+
  theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1))

LS0tCnRpdGxlOiAiTWVsaSAyMDIxIERhdGFzZXQiCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CmxpYnJhcnkoYXJyb3cpCmxpYnJhcnkoc2tpbXIpCmxpYnJhcnkoanNvbmxpdGUpCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoc2tpbXIpCmBgYAoKCiMgUmVhZGluZyBQYXJxdWV0IGZpbGVzCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0V9CnRyYWluX2RhdGE8LWFycm93OjpyZWFkX3BhcnF1ZXQoImRhdGFzZXRzL3RyYWluX2RhdGEucGFycXVldCIpCnRyYWluX2RhdGEgJT4lIGdyb3VwX2J5KHNrdSkgJT4lIHN1bW1hcmlzZShuPW4oKSkKCnRyYWluX2RhdGEgJT4lIGZpbHRlcihza3U9PTEyMTAxKSAgCgpgYGAKCmBgYHtyfQoKc2tpbXI6OnNraW0oCnRyYWluX2RhdGEgJT4lIHNlbGVjdF9pZihpcy5udW1lcmljKSkgJT4lCiAgeWFuaygibnVtZXJpYyIpCgoKYGBgCiMjIFJlYWQgSlNPTiBsaW5lcyBmaWxlcwoKYGBge3IgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KbGlicmFyeSh0aWR5anNvbikKbGlicmFyeSh0aWR5cikKbWV0YWRhdGE8LWpzb25saXRlOjpzdHJlYW1faW4oZmlsZSgiZGF0YXNldHMvaXRlbXNfc3RhdGljX21ldGFkYXRhX2Z1bGwuamwiKSx2ZXJib3NlID0gMCkKbWV0YWRhdGEgJT4lIGZpbHRlcihpdGVtX2RvbWFpbl9pZD09Ik1MQi1TTkVBS0VSUyIpIApgYGAKIyMjIENvdW50cmllcyBhbmQgaXRlbV9pZCBhcmUgc2VwYXJhdGVkIGludG8gdHdvIGRpZmZlcmVudCBjb2x1bW5zCmBgYHtyfQptZXRhZGF0YTwtbWV0YWRhdGEgJT4lIHNlcGFyYXRlKGl0ZW1fZG9tYWluX2lkLHNlcCA9ICItIiwgaW50bz1jKCJjb3VudHJ5IiwiaXRlbV9kb21haW5faWQiKSkKbWV0YWRhdGEgICU+JSBncm91cF9ieShjb3VudHJ5LGl0ZW1fZG9tYWluX2lkKSAlPiUgc3VtbWFyaXNlKG49bigpKQpgYGAKCiMjIEpPSU4gYm90aCBkYXRhLmZyYW1lcwoKYGBge3J9CnRyYWluX2RhdGFfbWV0YTwtdHJhaW5fZGF0YSAlPiUgaW5uZXJfam9pbihtZXRhZGF0YSAlPiUgc2VsZWN0KGNvdW50cnksc2t1LGl0ZW1fZG9tYWluX2lkKSwgYnk9InNrdSIpCnRyYWluX2RhdGFfbWV0YQpgYGAKCiMjIE5vdyB3ZSBjYW4gZXhwbG9yZS4uLgoKCmBgYHtyfQoKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KGx1YnJpZGF0ZSkKbGlicmFyeShnZ2RhcmspCnNlcmllX2V4PC10cmFpbl9kYXRhX21ldGEgJT4lIGZpbHRlcihjb3VudHJ5PT0iTUxNIiAmIGl0ZW1fZG9tYWluX2lkPT0iQURIRVNJVkVfVEFQRVMiKSAlPiUgZ3JvdXBfYnkoZGF0ZSkgJT4lIHN1bW1hcmlzZShzb2xkPXN1bShzb2xkX3F1YW50aXR5KSkgCgoKc2VyaWVfZXggJT4lIGdncGxvdCgpKwogIGdlb21fcG9pbnQoYWVzKHg9eW1kKGRhdGUpLHk9c29sZCksY29sb3I9J29yYW5nZScpKwogIGdlb21fbGluZShhZXMoeD15bWQoZGF0ZSkseT1zb2xkKSxjb2xvcj0ncmVkJykrCiAgZ2dkYXJrOjpkYXJrX3RoZW1lX2J3KCkrCiAgeGxhYigiRGF0ZSIpKwogIHlsYWIoIlNvbGQgUXVhbnRpdHkiKSsKICBsYWJzKHRpdGxlPSJTTkVBS0VSUyBTb2xkIGluIE1leGljbyBmb3IgMzAgZGF5cyIpKwogIHRoZW1lKGF4aXMudGV4dC54ID0gZWxlbWVudF90ZXh0KGFuZ2xlID0gNDUsIHZqdXN0ID0gMC41LCBoanVzdD0xKSkKCiN0cmFpbl9kYXRhX21ldGEgJT4lIGZpbHRlcihjb3VudHJ5PT0iTUxNIiAmIGl0ZW1fZG9tYWluX2lkPT0iU05FQUtFUlMiKSAlPiUgZmlsdGVyKGRhdGU9PSIyMDIxLTAyLTAxIikgJT4lIGZpbHRlcihzb2xkX3F1YW50aXR5IT0wKQpgYGAKCiMjIEJveHBsb3RzCmBgYHtyfQpsaWJyYXJ5KGx1YnJpZGF0ZSkKdHJhaW5fZGF0YV9tZXRhPC10cmFpbl9kYXRhX21ldGEgJT4lIG11dGF0ZShkb3c9eW1kKGRhdGUpICU+JSB3ZGF5KGxhYmVsPVRSVUUpLCB3ZWVrPXdlZWsoZGF0ZSkpCnRyYWluX2RhdGFfbWV0YV9zdW08LXRyYWluX2RhdGFfbWV0YSAlPiUgZ3JvdXBfYnkoc2t1LGRvdykgJT4lIHN1bW1hcmlzZSh0b3RhbF9zb2xkX3F0eT1zdW0oc29sZF9xdWFudGl0eSkpIAp0cmFpbl9kYXRhX21ldGFfc3VtICU+JQogIGdncGxvdChhZXMoeD1kb3cseT10b3RhbF9zb2xkX3F0eSkpKwogIyBnZW9tX3BvaW50KCkrCiAgI2dlb21fdmlvbGluKCkrCiAgZ2VvbV9ib3hwbG90KG91dGxpZXIuc2hhcGU9TkEpKwogIHlsaW0oMCw1MCkrCiAgeGxhYigiZGF5X29mX3RoZV93ZWVrIikrCiAgZ2dkYXJrOjpkYXJrX3RoZW1lX2J3KCkKYGBgCgojIyBNdWx0aXRwbGUgdGltZSBzZXJpZXMuCmBgYHtyIGVjaG89VFJVRX0KdHJhaW5fZGF0YV9tZXRhPC10cmFpbl9kYXRhX21ldGEgJT4lIG11dGF0ZShkb3c9eW1kKGRhdGUpICU+JSB3ZGF5KGxhYmVsPVRSVUUpLCB3ZWVrPXdlZWsoZGF0ZSkpCnRyYWluX2RhdGFfbWV0YV9zdW1fbSA8LSB0cmFpbl9kYXRhX21ldGEgJT4lIGdyb3VwX2J5KGl0ZW1fZG9tYWluX2lkLHdlZWssZG93KSAlPiUgc3VtbWFyaXNlKHRvdGFsX3NvbGRfcXR5PXN1bShzb2xkX3F1YW50aXR5KSkgCgpza3VfZXg8LXRyYWluX2RhdGFfbWV0YV9zdW1fbSAgJT4lIGZpbHRlcih3ZWVrIT01KSAlPiUgZmlsdGVyKHdlZWshPTEzKSAlPiUgZmlsdGVyKGl0ZW1fZG9tYWluX2lkPT0iQURIRVNJVkVfVEFQRVMiKQpza3VfZXggJT4lCiAgZ2dwbG90KGFlcyh4PWRvdyx5PXRvdGFsX3NvbGRfcXR5KSkrCiAjIGZhY2V0X3dyYXAofndlZWspKwogIGdlb21fcG9pbnQoY29sb3I9J29yYW5nZScpKwogIGdlb21fc21vb3RoKGFlcyh4PWFzLm51bWVyaWMoZG93KSksbWV0aG9kID0gImxvZXNzIixzZSA9IFRSVUUpKwogICNnZW9tX2FyZWEoYWVzKHg9ZG93LHk9dG90YWxfc29sZF9xdHksZ3JvdXA9d2VlayxmaWxsID0gd2VlayksIHBvc2l0aW9uID0gInN0YWNrIikgKwogICNnZW9tX2xpbmUoYWVzKGdyb3VwPXdlZWspKSsKICAjZ2VvbV9zbW9vdGgoYWVzKHg9ZG93LHk9dG90YWxfc29sZF9xdHksIGdyKSkrCiAgeGxhYigiRE9XIikrCiAgCiAgeWxhYigiU29sZCBRdWFudGl0eSIpKwogIGdnZGFyazo6ZGFya190aGVtZV9idygpKwogIGxhYnModGl0bGU9IkFESEVTSVZFX1RBUEVTOiBUb3RhbCBRdHkgc29sZCBwZXIgZGF5IG9mIHRoZSB3ZWVrIikrCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgdmp1c3QgPSAwLjUsIGhqdXN0PTEpKQoKIyMgcmVhZCBodHRwczovL3N0YWNrb3ZlcmZsb3cuY29tL3F1ZXN0aW9ucy8xMDM1Nzc2OC9wbG90dGluZy1saW5lcy1hbmQtdGhlLWdyb3VwLWFlc3RoZXRpYy1pbi1nZ3Bsb3QyCiAgCmBgYApgYGB7cn0KCnNrdV9leCAlPiUKICBnZ3Bsb3QoYWVzKHg9ZG93LHk9dG90YWxfc29sZF9xdHkpKSsKICBnZW9tX3BvaW50KGNvbG9yPSdvcmFuZ2UnKSsKICBnZW9tX2xpbmUoYWVzKGdyb3VwPXdlZWssY29sb3I9YXMuZmFjdG9yKHdlZWspKSkrCiAgeGxhYigiRE9XIikrCiAgeWxhYigiU29sZCBRdWFudGl0eSIpKwogIGdnZGFyazo6ZGFya190aGVtZV9idygpKwogIGxhYnModGl0bGU9IkFESEVTSVZFX1RBUEVTOiBUb3RhbCBRdHkgc29sZCBwZXIgZGF5IG9mIHRoZSB3ZWVrIikrCiAgdGhlbWUoYXhpcy50ZXh0LnggPSBlbGVtZW50X3RleHQoYW5nbGUgPSA0NSwgdmp1c3QgPSAwLjUsIGhqdXN0PTEpKQoKYGBgCgojIyB1c2luZyBGQUNFVApgYGB7cn0Kc2t1X2V4ICU+JQogIGdncGxvdChhZXMoeD1kb3cseT10b3RhbF9zb2xkX3F0eSkpKwogIGZhY2V0X3dyYXAofndlZWspKwogIGdlb21fcG9pbnQoY29sb3I9J29yYW5nZScpKwogIGdlb21fbGluZShhZXMoZ3JvdXA9MSkpKwogIHhsYWIoIkRPVyIpKwogIHlsYWIoIlNvbGQgUXVhbnRpdHkiKSsKICBnZ2Rhcms6OmRhcmtfdGhlbWVfYncoKSsKICBsYWJzKHRpdGxlPSJBREhFU0lWRV9UQVBFUzogVG90YWwgUXR5IHNvbGQgcGVyIGRheSBvZiB0aGUgd2VlayIpKwogIHRoZW1lKGF4aXMudGV4dC54ID0gZWxlbWVudF90ZXh0KGFuZ2xlID0gNDUsIHZqdXN0ID0gMC41LCBoanVzdD0xKSkKCmBgYAo=