library(dplyr)
library(splitstackshape)
library(ggplot2)
library(knitr)
library(data.table)
library(zoo)
library(tidyr)
library(kSamples)
library(cluster)
library(NbClust)
library(mclust)
Since the files are sent in buckets of 1000 there exists 45 unique End Dates in 11-month period. Thus the first part of the analysis is to model the Time between two End Dates. The next step is to model model the Average Number of Files per bulk across time and the final step is to model the Distribution of the Duration per bulk across time.
df<-read.csv("fn_data.csv", sep=';')
df<-df[,2:8]
df$EndDate<-as.Date(df$end_date, format="%Y-%m-%d")
df_group<-df%>%select(EndDate)%>%group_by(EndDate)%>%summarise(Files=n())%>%arrange(EndDate)%>%
mutate(Lag_date=lag(EndDate), Diff=EndDate-lag(EndDate))%>%select(EndDate, Lag_date, Diff, Files)
kable(df_group, format="markdown")
| EndDate | Lag_date | Diff | Files |
|---|---|---|---|
| 2016-01-18 | NA | NA | 1835 |
| 2016-01-25 | 2016-01-18 | 7 days | 921 |
| 2016-01-28 | 2016-01-25 | 3 days | 997 |
| 2016-02-08 | 2016-01-28 | 11 days | 1137 |
| 2016-02-15 | 2016-02-08 | 7 days | 893 |
| 2016-02-22 | 2016-02-15 | 7 days | 1023 |
| 2016-02-25 | 2016-02-22 | 3 days | 899 |
| 2016-03-04 | 2016-02-25 | 8 days | 667 |
| 2016-03-10 | 2016-03-04 | 6 days | 862 |
| 2016-03-16 | 2016-03-10 | 6 days | 835 |
| 2016-03-29 | 2016-03-16 | 13 days | 694 |
| 2016-04-01 | 2016-03-29 | 3 days | 846 |
| 2016-04-06 | 2016-04-01 | 5 days | 676 |
| 2016-04-15 | 2016-04-06 | 9 days | 585 |
| 2016-04-21 | 2016-04-15 | 6 days | 857 |
| 2016-05-03 | 2016-04-21 | 12 days | 750 |
| 2016-05-10 | 2016-05-03 | 7 days | 812 |
| 2016-05-12 | 2016-05-10 | 2 days | 578 |
| 2016-05-20 | 2016-05-12 | 8 days | 842 |
| 2016-05-27 | 2016-05-20 | 7 days | 1051 |
| 2016-06-06 | 2016-05-27 | 10 days | 993 |
| 2016-06-08 | 2016-06-06 | 2 days | 1041 |
| 2016-06-20 | 2016-06-08 | 12 days | 955 |
| 2016-06-27 | 2016-06-20 | 7 days | 1111 |
| 2016-06-29 | 2016-06-27 | 2 days | 1171 |
| 2016-07-07 | 2016-06-29 | 8 days | 950 |
| 2016-07-12 | 2016-07-07 | 5 days | 1044 |
| 2016-07-20 | 2016-07-12 | 8 days | 848 |
| 2016-08-01 | 2016-07-20 | 12 days | 900 |
| 2016-08-03 | 2016-08-01 | 2 days | 937 |
| 2016-08-11 | 2016-08-03 | 8 days | 805 |
| 2016-08-23 | 2016-08-11 | 12 days | 725 |
| 2016-08-25 | 2016-08-23 | 2 days | 674 |
| 2016-09-02 | 2016-08-25 | 8 days | 737 |
| 2016-09-08 | 2016-09-02 | 6 days | 861 |
| 2016-09-15 | 2016-09-08 | 7 days | 743 |
| 2016-09-21 | 2016-09-15 | 6 days | 887 |
| 2016-09-28 | 2016-09-21 | 7 days | 821 |
| 2016-10-06 | 2016-09-28 | 8 days | 932 |
| 2016-10-12 | 2016-10-06 | 6 days | 1009 |
| 2016-10-20 | 2016-10-12 | 8 days | 1104 |
| 2016-10-26 | 2016-10-20 | 6 days | 902 |
| 2016-11-03 | 2016-10-26 | 8 days | 1715 |
| 2016-11-10 | 2016-11-03 | 7 days | 994 |
| 2016-11-14 | 2016-11-10 | 4 days | 955 |
df_group$MA_5_Files<-stats::filter(df_group$Files, rep(1/5, 5))
df_group$MA_5_Diff<-stats::filter(df_group$Diff, rep(1/5, 5))
ggplot(df_group, aes(x=EndDate,y=Diff))+geom_point(aes(size=Files))+stat_smooth()+ggtitle("Difference Between Two Consecutive Days")+xlab("End Date")+ylab("Days Difference")
ggplot(df_group, aes(x=EndDate,y=MA_5_Diff))+geom_point()+stat_smooth()+ggtitle("Moving Average (k=5) of Days Difference by End Date")+xlab("End Date")+ylab("Days Difference ")
ggplot(df_group, aes(x=EndDate,y=Files))+geom_point()+stat_smooth()+ggtitle("Files by End Date")+xlab("End Date")+ylab("Files")
ggplot(df_group, aes(x=EndDate,y=MA_5_Files))+geom_point()+stat_smooth()+ggtitle("Moving Average (k=5) of Files by End Date")+xlab("End Date")+ylab("Files")
cat("Correlation Between Days Difference and Files is ",round(cor(as.numeric(df_group$Diff), df_group$Files, use="complete.obs"),2))
## Correlation Between Days Difference and Files is -0.04
cat("The Average of the Days Difference between two buckets is")
## The Average of the Days Difference between two buckets is
round(mean(df_group$Diff, na.rm=TRUE),2)
## Time difference of 6.84 days
cat("The 95% Confidence Interval of the Days Difference between two buckets is")
## The 95% Confidence Interval of the Days Difference between two buckets is
t.test(df_group$Diff)$conf.int
## Time differences in days
## [1] 5.949220 7.732598
## attr(,"conf.level")
## [1] 0.95
cat("The Average of the Files per bucket is")
## The Average of the Files per bucket is
round(mean(df_group$Files),2)
## [1] 923.87
cat("The 95% Confidence Interval of the Files per bucket is")
## The 95% Confidence Interval of the Files per bucket is
t.test(df_group$Files)$conf.int
## [1] 853.6457 994.0876
## attr(,"conf.level")
## [1] 0.95
df_summary<-df%>%select(EndDate, duration)%>%group_by(EndDate)%>%summarise(Average=round(mean(duration),0), StDev=round(sd(duration),0), LowerBound=quantile(duration, 0.025), Q1=quantile(duration, 0.25),
ThirtyFive=quantile(duration, 0.35), Median_Duration=median(duration), Sixty=quantile(duration, 0.60), Q3_Duration=quantile(duration, 0.75), EigthyFive=quantile(duration, 0.85), UpperBound=quantile(duration, 0.975))
kable(df_summary, format="markdown")
| EndDate | Average | StDev | LowerBound | Q1 | ThirtyFive | Median_Duration | Sixty | Q3_Duration | EigthyFive | UpperBound |
|---|---|---|---|---|---|---|---|---|---|---|
| 2016-01-18 | 243 | 135 | 84.0 | 210 | 217 | 224.0 | 231 | 245 | 286.30 | 589.050 |
| 2016-01-25 | 237 | 127 | 84.0 | 210 | 217 | 217.0 | 224 | 245 | 273.00 | 560.000 |
| 2016-01-28 | 226 | 147 | 80.0 | 206 | 213 | 213.0 | 220 | 234 | 262.00 | 374.000 |
| 2016-02-08 | 233 | 124 | 84.0 | 210 | 217 | 217.0 | 224 | 245 | 273.00 | 501.200 |
| 2016-02-15 | 247 | 158 | 84.0 | 217 | 217 | 224.0 | 224 | 245 | 280.00 | 623.000 |
| 2016-02-22 | 257 | 169 | 91.0 | 217 | 217 | 224.0 | 231 | 252 | 333.90 | 651.000 |
| 2016-02-25 | 241 | 144 | 52.0 | 213 | 213 | 220.0 | 220 | 234 | 276.00 | 535.000 |
| 2016-03-04 | 271 | 184 | 102.0 | 214 | 221 | 221.0 | 228 | 263 | 354.00 | 687.900 |
| 2016-03-10 | 250 | 130 | 66.0 | 220 | 220 | 227.0 | 234 | 248 | 290.00 | 548.650 |
| 2016-03-16 | 252 | 162 | 72.0 | 219 | 219 | 226.0 | 226 | 240 | 275.00 | 513.000 |
| 2016-03-29 | 252 | 181 | 71.0 | 225 | 225 | 232.0 | 239 | 253 | 295.00 | 596.000 |
| 2016-04-01 | 245 | 118 | 74.0 | 221 | 228 | 228.0 | 235 | 249 | 270.00 | 445.000 |
| 2016-04-06 | 280 | 221 | 79.0 | 226 | 226 | 233.0 | 233 | 254 | 303.00 | 842.875 |
| 2016-04-15 | 260 | 115 | 88.0 | 228 | 235 | 235.0 | 242 | 256 | 312.00 | 565.400 |
| 2016-04-21 | 259 | 149 | 94.0 | 234 | 234 | 241.0 | 241 | 255 | 276.00 | 458.000 |
| 2016-05-03 | 272 | 150 | 169.0 | 239 | 239 | 246.0 | 246 | 253 | 274.00 | 473.850 |
| 2016-05-10 | 278 | 119 | 127.0 | 239 | 246 | 246.0 | 253 | 267 | 302.00 | 491.000 |
| 2016-05-12 | 250 | 102 | 80.0 | 234 | 234 | 241.0 | 241 | 255 | 269.00 | 469.025 |
| 2016-05-20 | 248 | 96 | 88.0 | 235 | 235 | 242.0 | 242 | 256 | 277.00 | 458.825 |
| 2016-05-27 | 249 | 127 | 95.0 | 235 | 235 | 242.0 | 242 | 256 | 277.00 | 445.000 |
| 2016-06-06 | 255 | 142 | 105.0 | 238 | 238 | 245.0 | 245 | 259 | 280.00 | 455.000 |
| 2016-06-08 | 266 | 139 | 100.0 | 233 | 233 | 240.0 | 247 | 268 | 303.00 | 506.000 |
| 2016-06-20 | 265 | 119 | 98.0 | 238 | 238 | 245.0 | 252 | 266 | 294.00 | 491.050 |
| 2016-06-27 | 265 | 126 | 105.0 | 238 | 245 | 245.0 | 252 | 266 | 294.00 | 500.500 |
| 2016-06-29 | 260 | 112 | 100.0 | 233 | 233 | 240.0 | 247 | 254 | 289.00 | 485.000 |
| 2016-07-07 | 256 | 105 | 115.0 | 234 | 234 | 241.0 | 241 | 255 | 276.00 | 500.000 |
| 2016-07-12 | 252 | 102 | 120.0 | 225 | 232 | 232.0 | 239 | 253 | 288.00 | 498.000 |
| 2016-07-20 | 249 | 125 | 114.0 | 198 | 198 | 236.5 | 240 | 254 | 275.00 | 504.775 |
| 2016-08-01 | 258 | 134 | 126.0 | 210 | 210 | 210.0 | 252 | 266 | 294.00 | 518.000 |
| 2016-08-03 | 243 | 100 | 142.0 | 205 | 212 | 212.0 | 212 | 254 | 282.00 | 485.000 |
| 2016-08-11 | 254 | 166 | 129.0 | 213 | 213 | 213.0 | 220 | 255 | 283.00 | 542.000 |
| 2016-08-23 | 266 | 126 | 127.0 | 218 | 218 | 225.0 | 232 | 274 | 323.00 | 553.300 |
| 2016-08-25 | 241 | 94 | 115.0 | 213 | 213 | 220.0 | 220 | 234 | 290.00 | 500.000 |
| 2016-09-02 | 238 | 120 | 123.0 | 214 | 214 | 221.0 | 221 | 235 | 284.00 | 536.000 |
| 2016-09-08 | 243 | 115 | 129.0 | 213 | 220 | 220.0 | 227 | 234 | 248.00 | 542.000 |
| 2016-09-15 | 255 | 101 | 136.0 | 220 | 220 | 227.0 | 227 | 241 | 301.90 | 531.150 |
| 2016-09-21 | 233 | 87 | 128.0 | 212 | 212 | 219.0 | 226 | 233 | 254.00 | 476.950 |
| 2016-09-28 | 264 | 169 | 135.0 | 219 | 226 | 226.0 | 233 | 254 | 310.00 | 544.500 |
| 2016-10-06 | 247 | 96 | 143.0 | 220 | 220 | 227.0 | 227 | 248 | 264.45 | 547.075 |
| 2016-10-12 | 257 | 132 | 142.0 | 219 | 219 | 226.0 | 226 | 247 | 268.00 | 569.000 |
| 2016-10-20 | 256 | 108 | 157.0 | 220 | 220 | 227.0 | 227 | 248 | 279.85 | 556.000 |
| 2016-10-26 | 258 | 93 | 142.0 | 219 | 219 | 226.0 | 240 | 261 | 296.00 | 562.000 |
| 2016-11-03 | 242 | 86 | 171.0 | 213 | 220 | 220.0 | 227 | 241 | 262.00 | 437.000 |
| 2016-11-10 | 245 | 95 | 171.0 | 213 | 213 | 220.0 | 227 | 248 | 276.00 | 482.675 |
| 2016-11-14 | 244 | 92 | 165.9 | 210 | 217 | 217.0 | 224 | 238 | 273.00 | 574.000 |
ggplot(df, aes(x=duration))+geom_histogram(bins=200)+coord_cartesian(xlim=c(0,500))+ggtitle("Histogram of the Duration")+xlab("Duration")+ylab("Count")
ggplot(df, aes(x=as.factor(EndDate),y=duration))+geom_boxplot()+coord_cartesian(ylim=c(0,500))+ggtitle("Box of the Duration")+xlab("End Date")+theme(axis.text.x = element_text(angle = 90, hjust = 1))
probs = seq(0, 1, 0.1)
df_q<-df %>%
group_by(EndDate) %>%
do(data.frame(prob = probs, stat = quantile(.$duration, probs = probs))) %>%
spread(prob, stat)
kable(df_q, format="markdown")
| EndDate | 0 | 0.1 | 0.2 | 0.3 | 0.4 | 0.5 | 0.6 | 0.7 | 0.8 | 0.9 | 1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 2016-01-18 | 56 | 119.0 | 210 | 217 | 217.0 | 224.0 | 231 | 238 | 259.0 | 329.0 | 2163 |
| 2016-01-25 | 21 | 105.0 | 210 | 210 | 217.0 | 217.0 | 224 | 238 | 259.0 | 315.0 | 1666 |
| 2016-01-28 | 24 | 94.0 | 206 | 206 | 213.0 | 213.0 | 220 | 227 | 248.0 | 276.0 | 2082 |
| 2016-02-08 | 35 | 98.0 | 210 | 210 | 217.0 | 217.0 | 224 | 238 | 252.0 | 322.0 | 1743 |
| 2016-02-15 | 35 | 105.0 | 217 | 217 | 217.0 | 224.0 | 224 | 231 | 252.0 | 322.0 | 2107 |
| 2016-02-22 | 28 | 112.0 | 210 | 217 | 217.0 | 224.0 | 231 | 238 | 273.0 | 371.0 | 2086 |
| 2016-02-25 | 38 | 143.0 | 213 | 213 | 220.0 | 220.0 | 220 | 234 | 248.0 | 332.0 | 1844 |
| 2016-03-04 | 25 | 207.0 | 214 | 221 | 221.0 | 221.0 | 228 | 242 | 284.0 | 382.0 | 2503 |
| 2016-03-10 | 31 | 185.0 | 220 | 220 | 220.0 | 227.0 | 234 | 241 | 262.0 | 346.0 | 1480 |
| 2016-03-16 | 30 | 184.0 | 219 | 219 | 219.0 | 226.0 | 226 | 233 | 254.0 | 342.2 | 2144 |
| 2016-03-29 | 29 | 85.0 | 190 | 225 | 232.0 | 232.0 | 239 | 246 | 267.0 | 372.0 | 2276 |
| 2016-04-01 | 32 | 137.0 | 221 | 228 | 228.0 | 228.0 | 235 | 242 | 256.0 | 326.0 | 1362 |
| 2016-04-06 | 37 | 205.0 | 226 | 226 | 226.0 | 233.0 | 233 | 247 | 268.0 | 387.0 | 1878 |
| 2016-04-15 | 32 | 179.0 | 228 | 228 | 235.0 | 235.0 | 242 | 249 | 277.0 | 382.0 | 1138 |
| 2016-04-21 | 24 | 199.0 | 227 | 234 | 234.0 | 241.0 | 241 | 248 | 262.0 | 334.8 | 2677 |
| 2016-05-03 | 43 | 232.0 | 239 | 239 | 239.0 | 246.0 | 246 | 253 | 260.0 | 310.4 | 2276 |
| 2016-05-10 | 43 | 232.0 | 239 | 239 | 246.0 | 246.0 | 253 | 260 | 279.6 | 393.0 | 1849 |
| 2016-05-12 | 31 | 175.9 | 234 | 234 | 234.0 | 241.0 | 241 | 248 | 255.0 | 290.0 | 1130 |
| 2016-05-20 | 39 | 130.0 | 235 | 235 | 237.8 | 242.0 | 242 | 249 | 263.0 | 305.0 | 1558 |
| 2016-05-27 | 39 | 109.0 | 228 | 235 | 242.0 | 242.0 | 242 | 249 | 263.0 | 298.0 | 1810 |
| 2016-06-06 | 49 | 147.0 | 238 | 238 | 238.0 | 245.0 | 245 | 252 | 266.0 | 308.0 | 2821 |
| 2016-06-08 | 44 | 156.0 | 233 | 233 | 240.0 | 240.0 | 247 | 261 | 282.0 | 352.0 | 1969 |
| 2016-06-20 | 77 | 161.0 | 238 | 238 | 245.0 | 245.0 | 252 | 259 | 280.0 | 326.2 | 1491 |
| 2016-06-27 | 56 | 175.0 | 238 | 238 | 245.0 | 245.0 | 252 | 259 | 273.0 | 322.0 | 1820 |
| 2016-06-29 | 79 | 163.0 | 233 | 233 | 233.0 | 240.0 | 247 | 254 | 268.0 | 345.0 | 1633 |
| 2016-07-07 | 45 | 171.0 | 227 | 234 | 234.0 | 241.0 | 241 | 248 | 262.0 | 325.0 | 1578 |
| 2016-07-12 | 36 | 183.0 | 225 | 232 | 232.0 | 232.0 | 239 | 246 | 260.0 | 327.9 | 1891 |
| 2016-07-20 | 51 | 191.0 | 198 | 198 | 233.0 | 236.5 | 240 | 247 | 261.0 | 317.0 | 1913 |
| 2016-08-01 | 49 | 210.0 | 210 | 210 | 210.0 | 210.0 | 252 | 259 | 273.0 | 336.0 | 1792 |
| 2016-08-03 | 93 | 198.0 | 205 | 205 | 212.0 | 212.0 | 212 | 247 | 268.0 | 310.0 | 1612 |
| 2016-08-11 | 59 | 199.0 | 206 | 213 | 213.0 | 213.0 | 220 | 220 | 262.0 | 332.0 | 2250 |
| 2016-08-23 | 57 | 211.0 | 211 | 218 | 225.0 | 225.0 | 232 | 232 | 288.0 | 418.2 | 1240 |
| 2016-08-25 | 66 | 199.0 | 213 | 213 | 213.0 | 220.0 | 220 | 227 | 269.0 | 311.0 | 1193 |
| 2016-09-02 | 60 | 137.0 | 193 | 214 | 214.0 | 221.0 | 221 | 228 | 242.0 | 328.8 | 1642 |
| 2016-09-08 | 66 | 150.0 | 213 | 220 | 220.0 | 220.0 | 227 | 234 | 241.0 | 318.0 | 1984 |
| 2016-09-15 | 66 | 199.0 | 220 | 220 | 220.0 | 227.0 | 227 | 241 | 255.0 | 372.6 | 948 |
| 2016-09-21 | 65 | 149.0 | 205 | 212 | 219.0 | 219.0 | 226 | 226 | 240.0 | 303.0 | 1255 |
| 2016-09-28 | 93 | 149.0 | 219 | 219 | 226.0 | 226.0 | 233 | 247 | 268.0 | 373.0 | 2347 |
| 2016-10-06 | 66 | 192.0 | 213 | 220 | 220.0 | 227.0 | 227 | 241 | 255.0 | 318.0 | 1109 |
| 2016-10-12 | 79 | 212.0 | 219 | 219 | 219.0 | 226.0 | 226 | 240 | 254.0 | 332.4 | 1948 |
| 2016-10-20 | 73 | 213.0 | 220 | 220 | 220.0 | 227.0 | 227 | 241 | 262.0 | 343.9 | 1557 |
| 2016-10-26 | 72 | 212.0 | 219 | 219 | 219.0 | 226.0 | 240 | 254 | 273.6 | 359.0 | 779 |
| 2016-11-03 | 52 | 213.0 | 213 | 220 | 220.0 | 220.0 | 227 | 234 | 248.0 | 276.0 | 1893 |
| 2016-11-10 | 80 | 206.0 | 213 | 213 | 220.0 | 220.0 | 227 | 234 | 255.0 | 297.0 | 1396 |
| 2016-11-14 | 70 | 203.0 | 210 | 210 | 217.0 | 217.0 | 224 | 231 | 252.0 | 294.0 | 1351 |
At this point we will perform the Anderson Darlin k-Sample Test in order to test the hypothesis that k independent samples with sample sizes n_1,., n_k arose from a common unspecified distribution function F(x) and testing is done conditionally given the observed tie pattern. Thus this is a permutation test. Both versions of the AD statistic are computed.
df_n<-as.matrix(sapply(df, as.numeric))
u_dates<-unique(df_n[,8])
s1 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 1 ])
s2 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 2 ])
s3 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 3 ])
s4 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 4 ])
s5 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 5 ])
s6 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 6 ])
s7 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 7 ])
s8 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 8 ])
s9 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 9 ])
s10 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 10 ])
s11 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 11 ])
s12 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 12 ])
s13 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 13 ])
s14 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 14 ])
s15 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 15 ])
s16 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 16 ])
s17 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 17 ])
s18 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 18 ])
s19 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 19 ])
s20 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 20 ])
s21 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 21 ])
s22 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 22 ])
s23 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 23 ])
s24 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 24 ])
s25 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 25 ])
s26 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 26 ])
s27 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 27 ])
s28 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 28 ])
s29 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 29 ])
s30 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 30 ])
s31 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 31 ])
s32 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 32 ])
s33 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 33 ])
s34 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 34 ])
s35 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 35 ])
s36 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 36 ])
s37 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 37 ])
s38 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 38 ])
s39 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 39 ])
s40 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 40 ])
s41 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 41 ])
s42 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 42 ])
s43 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 43 ])
s44 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 44 ])
s45 <-subset(df_n, select=duration, df_n[,8]==u_dates[ 45 ])
ad.test(s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21,s22,s23,s24,s25,s26,s7,s28,s29,s30,s31,s32,s33,s34,s35,s36,s37,s38,s39,s40,s41,s42,s43,s44,s45)
##
##
## Anderson-Darling k-sample test.
##
## Number of samples: 45
## Sample sizes: 993, 857, 667, 694, 750, 997, 1835, 821, 835, 893, 1023, 805, 676, 812, 899, 1044, 1111, 1041, 1137, 861, 848, 1009, 1051, 921, 1715, 900, 1835, 737, 1171, 846, 1104, 862, 937, 950, 955, 994, 585, 955, 932, 725, 578, 887, 674, 743, 902
## Number of ties: 41812
##
## Mean of Anderson-Darling Criterion: 44
## Standard deviation of Anderson-Darling Criterion: 5.04784
##
## T.AD = ( Anderson-Darling Criterion - mean)/sigma
##
## Null Hypothesis: All samples come from a common population.
##
## AD T.AD asympt. P-value
## version 1: 3302 645.4 0
## version 2: 3220 628.3 0
From the k-sample Anderson-Darling test we found that there are different distributions in Duration across End Dates. However we need to mention that the Distribution Test like “Anderson-Darling”, “Cramer”, “Kolmogorov-Smirnov” are relative strict.
Our scope is to create clusters of the Distribution of the Duration by End Date. For this Analysis we will use the 20%-80% quantiles
cat("Using the Elbow-Rule we define the number of clusters to be equal to 3")
## Using the Elbow-Rule we define the number of clusters to be equal to 3
k.max <- 15 # Maximal number of clusters
data=df_q[,4:10]
wss <- sapply(1:k.max,
function(k){kmeans(data, k, nstart=10 )$tot.withinss})
plot(1:k.max, wss,
type="b", pch = 19, frame = FALSE,
xlab="Number of clusters K",
ylab="Total within-clusters sum of squares")
abline(v = 3, lty =2)
fit <- kmeans(data, 3)
df_q2<-data.frame(df_q, cluster=fit$cluster)
df_q2<-df_q2[, c("EndDate", "cluster")]
df2<-merge(df, df_q2)
df2$cluster<-as.factor(df2$cluster)
clusplot(data, fit$cluster, color=TRUE, shade=TRUE, labels=1, lines=0)
ggplot(df2, aes(x=as.factor(EndDate),y=duration))+geom_boxplot(aes(fill=cluster))+coord_cartesian(ylim=c(0,500))+ggtitle("Box Plor of the Duration")+xlab("End Date")+theme(axis.text.x = element_text(angle = 90, hjust = 1))
Below we represent some findings regarding the available Data