library(dplyr)
library(splitstackshape)
library(ggplot2)
library(knitr)
library(data.table)
library(zoo)
library(tidyr)
library(kSamples)
library(cluster)
library(NbClust)
library(mclust)

Description of the Analysis

Since the files are sent in buckets of 1000 there exists 45 unique End Dates in 11-month period. Thus the first part of the analysis is to model the Time between two End Dates. The next step is to model model the Average Number of Files per bulk across time and the final step is to model the Distribution of the Duration per bulk across time.

Summary Table of Days Difference and Number of Files by End Date

df<-read.csv("fn_data.csv", sep=';')
df<-df[,2:8]
df$EndDate<-as.Date(df$end_date, format="%Y-%m-%d")
df_group<-df%>%select(EndDate)%>%group_by(EndDate)%>%summarise(Files=n())%>%arrange(EndDate)%>%
  mutate(Lag_date=lag(EndDate), Diff=EndDate-lag(EndDate))%>%select(EndDate, Lag_date, Diff, Files)

kable(df_group, format="markdown")
EndDate Lag_date Diff Files
2016-01-18 NA NA 1835
2016-01-25 2016-01-18 7 days 921
2016-01-28 2016-01-25 3 days 997
2016-02-08 2016-01-28 11 days 1137
2016-02-15 2016-02-08 7 days 893
2016-02-22 2016-02-15 7 days 1023
2016-02-25 2016-02-22 3 days 899
2016-03-04 2016-02-25 8 days 667
2016-03-10 2016-03-04 6 days 862
2016-03-16 2016-03-10 6 days 835
2016-03-29 2016-03-16 13 days 694
2016-04-01 2016-03-29 3 days 846
2016-04-06 2016-04-01 5 days 676
2016-04-15 2016-04-06 9 days 585
2016-04-21 2016-04-15 6 days 857
2016-05-03 2016-04-21 12 days 750
2016-05-10 2016-05-03 7 days 812
2016-05-12 2016-05-10 2 days 578
2016-05-20 2016-05-12 8 days 842
2016-05-27 2016-05-20 7 days 1051
2016-06-06 2016-05-27 10 days 993
2016-06-08 2016-06-06 2 days 1041
2016-06-20 2016-06-08 12 days 955
2016-06-27 2016-06-20 7 days 1111
2016-06-29 2016-06-27 2 days 1171
2016-07-07 2016-06-29 8 days 950
2016-07-12 2016-07-07 5 days 1044
2016-07-20 2016-07-12 8 days 848
2016-08-01 2016-07-20 12 days 900
2016-08-03 2016-08-01 2 days 937
2016-08-11 2016-08-03 8 days 805
2016-08-23 2016-08-11 12 days 725
2016-08-25 2016-08-23 2 days 674
2016-09-02 2016-08-25 8 days 737
2016-09-08 2016-09-02 6 days 861
2016-09-15 2016-09-08 7 days 743
2016-09-21 2016-09-15 6 days 887
2016-09-28 2016-09-21 7 days 821
2016-10-06 2016-09-28 8 days 932
2016-10-12 2016-10-06 6 days 1009
2016-10-20 2016-10-12 8 days 1104
2016-10-26 2016-10-20 6 days 902
2016-11-03 2016-10-26 8 days 1715
2016-11-10 2016-11-03 7 days 994
2016-11-14 2016-11-10 4 days 955

Plots of Days Difference and Number of Files by End Date

df_group$MA_5_Files<-stats::filter(df_group$Files, rep(1/5, 5))
df_group$MA_5_Diff<-stats::filter(df_group$Diff, rep(1/5, 5))

ggplot(df_group, aes(x=EndDate,y=Diff))+geom_point(aes(size=Files))+stat_smooth()+ggtitle("Difference Between Two Consecutive Days")+xlab("End Date")+ylab("Days Difference")

ggplot(df_group, aes(x=EndDate,y=MA_5_Diff))+geom_point()+stat_smooth()+ggtitle("Moving Average (k=5) of Days Difference by End Date")+xlab("End Date")+ylab("Days Difference ")

ggplot(df_group, aes(x=EndDate,y=Files))+geom_point()+stat_smooth()+ggtitle("Files by End Date")+xlab("End Date")+ylab("Files")

ggplot(df_group, aes(x=EndDate,y=MA_5_Files))+geom_point()+stat_smooth()+ggtitle("Moving Average (k=5) of Files by End Date")+xlab("End Date")+ylab("Files")

cat("Correlation Between Days Difference and Files is ",round(cor(as.numeric(df_group$Diff), df_group$Files, use="complete.obs"),2))
## Correlation Between Days Difference and Files is  -0.04

Confidence Interval of Days Difference and Number of Files

cat("The Average of the Days Difference between two buckets is")
## The Average of the Days Difference between two buckets is
round(mean(df_group$Diff, na.rm=TRUE),2)
## Time difference of 6.84 days
cat("The 95% Confidence Interval of the Days Difference between two buckets is")
## The 95% Confidence Interval of the Days Difference between two buckets is
t.test(df_group$Diff)$conf.int
## Time differences in days
## [1] 5.949220 7.732598
## attr(,"conf.level")
## [1] 0.95
cat("The Average of the Files per bucket is")
## The Average of the Files per bucket is
round(mean(df_group$Files),2)
## [1] 923.87
cat("The 95% Confidence Interval of the Files per bucket is")
## The 95% Confidence Interval of the Files per bucket is
t.test(df_group$Files)$conf.int
## [1] 853.6457 994.0876
## attr(,"conf.level")
## [1] 0.95

Summary Table of Duration by End Date

df_summary<-df%>%select(EndDate, duration)%>%group_by(EndDate)%>%summarise(Average=round(mean(duration),0), StDev=round(sd(duration),0), LowerBound=quantile(duration, 0.025), Q1=quantile(duration, 0.25), 
                                                                       ThirtyFive=quantile(duration, 0.35), Median_Duration=median(duration),  Sixty=quantile(duration, 0.60), Q3_Duration=quantile(duration, 0.75), EigthyFive=quantile(duration, 0.85), UpperBound=quantile(duration, 0.975))

kable(df_summary, format="markdown")
EndDate Average StDev LowerBound Q1 ThirtyFive Median_Duration Sixty Q3_Duration EigthyFive UpperBound
2016-01-18 243 135 84.0 210 217 224.0 231 245 286.30 589.050
2016-01-25 237 127 84.0 210 217 217.0 224 245 273.00 560.000
2016-01-28 226 147 80.0 206 213 213.0 220 234 262.00 374.000
2016-02-08 233 124 84.0 210 217 217.0 224 245 273.00 501.200
2016-02-15 247 158 84.0 217 217 224.0 224 245 280.00 623.000
2016-02-22 257 169 91.0 217 217 224.0 231 252 333.90 651.000
2016-02-25 241 144 52.0 213 213 220.0 220 234 276.00 535.000
2016-03-04 271 184 102.0 214 221 221.0 228 263 354.00 687.900
2016-03-10 250 130 66.0 220 220 227.0 234 248 290.00 548.650
2016-03-16 252 162 72.0 219 219 226.0 226 240 275.00 513.000
2016-03-29 252 181 71.0 225 225 232.0 239 253 295.00 596.000
2016-04-01 245 118 74.0 221 228 228.0 235 249 270.00 445.000
2016-04-06 280 221 79.0 226 226 233.0 233 254 303.00 842.875
2016-04-15 260 115 88.0 228 235 235.0 242 256 312.00 565.400
2016-04-21 259 149 94.0 234 234 241.0 241 255 276.00 458.000
2016-05-03 272 150 169.0 239 239 246.0 246 253 274.00 473.850
2016-05-10 278 119 127.0 239 246 246.0 253 267 302.00 491.000
2016-05-12 250 102 80.0 234 234 241.0 241 255 269.00 469.025
2016-05-20 248 96 88.0 235 235 242.0 242 256 277.00 458.825
2016-05-27 249 127 95.0 235 235 242.0 242 256 277.00 445.000
2016-06-06 255 142 105.0 238 238 245.0 245 259 280.00 455.000
2016-06-08 266 139 100.0 233 233 240.0 247 268 303.00 506.000
2016-06-20 265 119 98.0 238 238 245.0 252 266 294.00 491.050
2016-06-27 265 126 105.0 238 245 245.0 252 266 294.00 500.500
2016-06-29 260 112 100.0 233 233 240.0 247 254 289.00 485.000
2016-07-07 256 105 115.0 234 234 241.0 241 255 276.00 500.000
2016-07-12 252 102 120.0 225 232 232.0 239 253 288.00 498.000
2016-07-20 249 125 114.0 198 198 236.5 240 254 275.00 504.775
2016-08-01 258 134 126.0 210 210 210.0 252 266 294.00 518.000
2016-08-03 243 100 142.0 205 212 212.0 212 254 282.00 485.000
2016-08-11 254 166 129.0 213 213 213.0 220 255 283.00 542.000
2016-08-23 266 126 127.0 218 218 225.0 232 274 323.00 553.300
2016-08-25 241 94 115.0 213 213 220.0 220 234 290.00 500.000
2016-09-02 238 120 123.0 214 214 221.0 221 235 284.00 536.000
2016-09-08 243 115 129.0 213 220 220.0 227 234 248.00 542.000
2016-09-15 255 101 136.0 220 220 227.0 227 241 301.90 531.150
2016-09-21 233 87 128.0 212 212 219.0 226 233 254.00 476.950
2016-09-28 264 169 135.0 219 226 226.0 233 254 310.00 544.500
2016-10-06 247 96 143.0 220 220 227.0 227 248 264.45 547.075
2016-10-12 257 132 142.0 219 219 226.0 226 247 268.00 569.000
2016-10-20 256 108 157.0 220 220 227.0 227 248 279.85 556.000
2016-10-26 258 93 142.0 219 219 226.0 240 261 296.00 562.000
2016-11-03 242 86 171.0 213 220 220.0 227 241 262.00 437.000
2016-11-10 245 95 171.0 213 213 220.0 227 248 276.00 482.675
2016-11-14 244 92 165.9 210 217 217.0 224 238 273.00 574.000

Plots of Duration Distribution by End Date

ggplot(df, aes(x=duration))+geom_histogram(bins=200)+coord_cartesian(xlim=c(0,500))+ggtitle("Histogram of the Duration")+xlab("Duration")+ylab("Count")

ggplot(df, aes(x=as.factor(EndDate),y=duration))+geom_boxplot()+coord_cartesian(ylim=c(0,500))+ggtitle("Box of the Duration")+xlab("End Date")+theme(axis.text.x = element_text(angle = 90, hjust = 1))

Summary Table of Duration Quantiles by End Date

probs = seq(0, 1, 0.1)
df_q<-df %>%
  group_by(EndDate) %>%
  do(data.frame(prob = probs, stat = quantile(.$duration, probs = probs))) %>%
  spread(prob, stat)
kable(df_q, format="markdown")
EndDate 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
2016-01-18 56 119.0 210 217 217.0 224.0 231 238 259.0 329.0 2163
2016-01-25 21 105.0 210 210 217.0 217.0 224 238 259.0 315.0 1666
2016-01-28 24 94.0 206 206 213.0 213.0 220 227 248.0 276.0 2082
2016-02-08 35 98.0 210 210 217.0 217.0 224 238 252.0 322.0 1743
2016-02-15 35 105.0 217 217 217.0 224.0 224 231 252.0 322.0 2107
2016-02-22 28 112.0 210 217 217.0 224.0 231 238 273.0 371.0 2086
2016-02-25 38 143.0 213 213 220.0 220.0 220 234 248.0 332.0 1844
2016-03-04 25 207.0 214 221 221.0 221.0 228 242 284.0 382.0 2503
2016-03-10 31 185.0 220 220 220.0 227.0 234 241 262.0 346.0 1480
2016-03-16 30 184.0 219 219 219.0 226.0 226 233 254.0 342.2 2144
2016-03-29 29 85.0 190 225 232.0 232.0 239 246 267.0 372.0 2276
2016-04-01 32 137.0 221 228 228.0 228.0 235 242 256.0 326.0 1362
2016-04-06 37 205.0 226 226 226.0 233.0 233 247 268.0 387.0 1878
2016-04-15 32 179.0 228 228 235.0 235.0 242 249 277.0 382.0 1138
2016-04-21 24 199.0 227 234 234.0 241.0 241 248 262.0 334.8 2677
2016-05-03 43 232.0 239 239 239.0 246.0 246 253 260.0 310.4 2276
2016-05-10 43 232.0 239 239 246.0 246.0 253 260 279.6 393.0 1849
2016-05-12 31 175.9 234 234 234.0 241.0 241 248 255.0 290.0 1130
2016-05-20 39 130.0 235 235 237.8 242.0 242 249 263.0 305.0 1558
2016-05-27 39 109.0 228 235 242.0 242.0 242 249 263.0 298.0 1810
2016-06-06 49 147.0 238 238 238.0 245.0 245 252 266.0 308.0 2821
2016-06-08 44 156.0 233 233 240.0 240.0 247 261 282.0 352.0 1969
2016-06-20 77 161.0 238 238 245.0 245.0 252 259 280.0 326.2 1491
2016-06-27 56 175.0 238 238 245.0 245.0 252 259 273.0 322.0 1820
2016-06-29 79 163.0 233 233 233.0 240.0 247 254 268.0 345.0 1633
2016-07-07 45 171.0 227 234 234.0 241.0 241 248 262.0 325.0 1578
2016-07-12 36 183.0 225 232 232.0 232.0 239 246 260.0 327.9 1891
2016-07-20 51 191.0 198 198 233.0 236.5 240 247 261.0 317.0 1913
2016-08-01 49 210.0 210 210 210.0 210.0 252 259 273.0 336.0 1792
2016-08-03 93 198.0 205 205 212.0 212.0 212 247 268.0 310.0 1612
2016-08-11 59 199.0 206 213 213.0 213.0 220 220 262.0 332.0 2250
2016-08-23 57 211.0 211 218 225.0 225.0 232 232 288.0 418.2 1240
2016-08-25 66 199.0 213 213 213.0 220.0 220 227 269.0 311.0 1193
2016-09-02 60 137.0 193 214 214.0 221.0 221 228 242.0 328.8 1642
2016-09-08 66 150.0 213 220 220.0 220.0 227 234 241.0 318.0 1984
2016-09-15 66 199.0 220 220 220.0 227.0 227 241 255.0 372.6 948
2016-09-21 65 149.0 205 212 219.0 219.0 226 226 240.0 303.0 1255
2016-09-28 93 149.0 219 219 226.0 226.0 233 247 268.0 373.0 2347
2016-10-06 66 192.0 213 220 220.0 227.0 227 241 255.0 318.0 1109
2016-10-12 79 212.0 219 219 219.0 226.0 226 240 254.0 332.4 1948
2016-10-20 73 213.0 220 220 220.0 227.0 227 241 262.0 343.9 1557
2016-10-26 72 212.0 219 219 219.0 226.0 240 254 273.6 359.0 779
2016-11-03 52 213.0 213 220 220.0 220.0 227 234 248.0 276.0 1893
2016-11-10 80 206.0 213 213 220.0 220.0 227 234 255.0 297.0 1396
2016-11-14 70 203.0 210 210 217.0 217.0 224 231 252.0 294.0 1351

Anderson-Darling k-Sample Test

At this point we will perform the Anderson Darlin k-Sample Test in order to test the hypothesis that k independent samples with sample sizes n_1,., n_k arose from a common unspecified distribution function F(x) and testing is done conditionally given the observed tie pattern. Thus this is a permutation test. Both versions of the AD statistic are computed.

df_n<-as.matrix(sapply(df, as.numeric)) 
u_dates<-unique(df_n[,8])



s1  <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 1   ])
s2  <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 2   ])
s3  <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 3   ])
s4  <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 4   ])
s5  <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 5   ])
s6  <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 6   ])
s7  <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 7   ])
s8  <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 8   ])
s9  <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 9   ])
s10 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 10  ])
s11 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 11  ])
s12 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 12  ])
s13 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 13  ])
s14 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 14  ])
s15 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 15  ])
s16 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 16  ])
s17 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 17  ])
s18 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 18  ])
s19 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 19  ])
s20 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 20  ])
s21 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 21  ])
s22 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 22  ])
s23 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 23  ])
s24 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 24  ])
s25 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 25  ])
s26 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 26  ])
s27 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 27  ])
s28 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 28  ])
s29 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 29  ])
s30 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 30  ])
s31 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 31  ])
s32 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 32  ])
s33 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 33  ])
s34 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 34  ])
s35 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 35  ])
s36 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 36  ])
s37 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 37  ])
s38 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 38  ])
s39 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 39  ])
s40 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 40  ])
s41 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 41  ])
s42 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 42  ])
s43 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 43  ])
s44 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 44  ])
s45 <-subset(df_n,  select=duration, df_n[,8]==u_dates[ 45  ])


ad.test(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21,s22,s23,s24,s25,s26,s7,s28,s29,s30,s31,s32,s33,s34,s35,s36,s37,s38,s39,s40,s41,s42,s43,s44,s45)
## 
## 
##  Anderson-Darling k-sample test.
## 
## Number of samples:  45
## Sample sizes:  993, 857, 667, 694, 750, 997, 1835, 821, 835, 893, 1023, 805, 676, 812, 899, 1044, 1111, 1041, 1137, 861, 848, 1009, 1051, 921, 1715, 900, 1835, 737, 1171, 846, 1104, 862, 937, 950, 955, 994, 585, 955, 932, 725, 578, 887, 674, 743, 902
## Number of ties: 41812
## 
## Mean of  Anderson-Darling  Criterion: 44
## Standard deviation of  Anderson-Darling  Criterion: 5.04784
## 
## T.AD = ( Anderson-Darling  Criterion - mean)/sigma
## 
## Null Hypothesis: All samples come from a common population.
## 
##              AD  T.AD  asympt. P-value
## version 1: 3302 645.4                0
## version 2: 3220 628.3                0

Cluster Analysis

From the k-sample Anderson-Darling test we found that there are different distributions in Duration across End Dates. However we need to mention that the Distribution Test like “Anderson-Darling”, “Cramer”, “Kolmogorov-Smirnov” are relative strict.
Our scope is to create clusters of the Distribution of the Duration by End Date. For this Analysis we will use the 20%-80% quantiles

cat("Using the Elbow-Rule we define the number of clusters to be equal to 3")
## Using the Elbow-Rule we define the number of clusters to be equal to 3
k.max <- 15 # Maximal number of clusters
data=df_q[,4:10]
wss <- sapply(1:k.max, 
        function(k){kmeans(data, k, nstart=10 )$tot.withinss})
plot(1:k.max, wss,
       type="b", pch = 19, frame = FALSE, 
       xlab="Number of clusters K",
       ylab="Total within-clusters sum of squares")
abline(v = 3, lty =2)

fit <- kmeans(data, 3)
df_q2<-data.frame(df_q, cluster=fit$cluster)
df_q2<-df_q2[, c("EndDate", "cluster")]
df2<-merge(df, df_q2)
df2$cluster<-as.factor(df2$cluster)

clusplot(data, fit$cluster, color=TRUE, shade=TRUE,     labels=1, lines=0)

ggplot(df2, aes(x=as.factor(EndDate),y=duration))+geom_boxplot(aes(fill=cluster))+coord_cartesian(ylim=c(0,500))+ggtitle("Box Plor of the Duration")+xlab("End Date")+theme(axis.text.x = element_text(angle = 90, hjust = 1))

Discussion

Below we represent some findings regarding the available Data

  • The Days Difference between two consecutive bulking dates has been stabilized during the last months and can be considered equal to 7 Days. Also the 95% Condifence Interval is (5.94 days , 7.73 days). Thus in order to sum up, we should expect a bulk once a week!
  • Every bulk contains on average 920 files with a confidence interval of (853, 994). However from the data we can observe a seasonality, but since we have only one year of data we cannot make any statistical inference yet.
  • We examine if there was a correlation between the Number of Files per Bulk vs the Days Between two Bulks and we found that there is no correlation (-0.04)
  • The distribution of the duration can be considered stable across End Dates where during last months the Median is around 220 Days and during the middle of the observation period was around 240. The Summary Table of Duration Quantiles by End Date shows the Quartiles of the Duration by End Date
  • We can argue that the distribution across End Dates are not the same (i.e. are not equivalent) since the p-value of the Anderson-Darling of k-Sample test is equal to 0
  • We defined three clusters of the distribution of the Duration