Andrey Danilchenko <danilchenko@yandex-team.ru> — Feb 18, 2014, 2:18 AM
# NOTE: every relative path below is resolved against this project directory
setwd('~/workspace/cs-matstat/')
#download.file(url='http://beta.compscicenter.ru/media/homework_supplement/82/13_6', destfile='data/week1-home.tsv')
# Tab-separated file without a header row. stringsAsFactors = TRUE keeps the
# text columns as factors (the default before R 4.0), so that summary() below
# still reports per-level counts on current R versions.
bad.friday <- read.csv('data/week1-home.tsv', sep = '\t', header = FALSE,
                       stringsAsFactors = TRUE)
names(bad.friday) <- c('shopping.type', 'date', 'customers.6', 'customers.13', 'shop')
# shopping.type is unused
# we can build an additional column for the difference (6th minus 13th) and work with it
bad.friday$diff <- bad.friday$customers.6 - bad.friday$customers.13
# getting summary
summary(bad.friday)
shopping.type date customers.6 customers.13
shopping:45 1990, July :9 Min. :3558 Min. :3554
1991, December :9 1st Qu.:3954 1st Qu.:3926
1991, September:9 Median :4805 Median :4736
1992, March :9 Mean :4971 Mean :5017
1992, November :9 3rd Qu.:6026 3rd Qu.:6162
Max. :7138 Max. :7057
shop diff
Chichester : 5 Min. :-774.0
Crystal Palace: 5 1st Qu.:-136.0
Dorking : 5 Median : -11.0
East Grinstead: 5 Mean : -46.5
Epsom : 5 3rd Qu.: 47.0
Guildford : 5 Max. : 302.0
(Other) :15
# customers.6 and customers.13 cover about the same range (max - min)
diff(range(bad.friday$customers.6))
[1] 3580
diff(range(bad.friday$customers.13))
[1] 3503
# the mean of customers.13 is greater than the mean of customers.6,
# but the median of customers.6 is greater than the median of customers.13,
# so we can suppose that the customers.* distributions are not normal
# and so we see:
# Bin both samples on the same breaks and size the y axis for the taller of
# the two histograms, so the overlaid bars line up and none of them is clipped.
shared.breaks <- hist(c(bad.friday$customers.6, bad.friday$customers.13), plot = FALSE)$breaks
h.13 <- hist(bad.friday$customers.13, breaks = shared.breaks, plot = FALSE)
h.6 <- hist(bad.friday$customers.6, breaks = shared.breaks, plot = FALSE)
plot(h.13, col = rgb(1, 0, 0, 0.3), ylim = c(0, max(h.13$counts, h.6$counts)),
     main = 'Histogram of customers count', xlab = 'Count')
plot(h.6, col = rgb(0, 1, 1, 0.3), add = TRUE)
# frequency polygons through the bin midpoints: blue = 6th, red = 13th
lines(h.6$counts ~ h.6$mids, col = 'blue')
lines(h.13$counts ~ h.13$mids, col = 'red')
legend('topright', c('6', '13'), col = c('blue', 'red'), pch = 15)
# the histogram of the differences, however, looks like a fairly good normal
# distribution with a slight negative skew
h <- hist(bad.friday$diff,
          main = 'Histogram of diffs 6-13',
          xlab = 'Diff',
          breaks = 10)
# Is there any difference between the dates?
boxplot(bad.friday$diff ~ bad.friday$date)
# overall median of the differences as a reference line
abline(lwd = 2, col = 2, h = median(bad.friday$diff))
# now we see that 1990, July looks different;
# this date may be what skews the histogram
filtered <- bad.friday[!(bad.friday$date == '1990, July'), ]
with(filtered,
     hist(diff,
          main = 'Histogram of diffs 6-13 (filtered by date)',
          xlab = 'Diff',
          breaks = 10))
# and now the histogram looks much more like a normal distribution, except for some outliers on the left
# Next, check whether the shops differ from each other
boxplot(bad.friday$diff ~ bad.friday$shop)
# Oh, Lewisham looks really bad!
# (the filter is applied to the full data set, not to `filtered`)
filtered2 <- bad.friday[!(bad.friday$shop == 'Lewisham'), ]
with(filtered2,
     hist(diff,
          main = 'Histogram of diffs 6-13 (filtered)',
          xlab = 'Diff'))
# and now it looks pretty good
# let's check what the mean difference is
mean(filtered2[['diff']])
[1] -37.17
# not big compared with the standard deviation
sd(filtered2[['diff']])
[1] 120.6
# mean/sd is a standardized effect size: it does not take the sample size into
# account, so on its own it cannot show whether the mean differs from zero.
# Also run a one-sample t-test on the paired differences (H0: mean diff = 0);
# inspect diff.test for the t statistic, confidence interval and p-value.
diff.test <- t.test(filtered2$diff)
# mean difference expressed in units of its standard deviation
mean(filtered2$diff)/sd(filtered2$diff)
[1] -0.3083
# So I suppose that there is no difference in the number of customers.
# So the 13th is not such a bad day =)