bad-friday.R

Andrey Danilchenko <danilchenko@yandex-team.ru> — Feb 18, 2014, 2:18 AM

# NOTE(review): setwd() ties the script to one machine's directory layout.
# It is kept here so that the relative data path below resolves as before.
setwd('~/workspace/cs-matstat/')

#download.file(url='http://beta.compscicenter.ru/media/homework_supplement/82/13_6', destfile='data/week1-home.tsv')

# Read the headerless, tab-separated data set.
# stringsAsFactors is set explicitly: since R 4.0 the default is FALSE, which
# would leave the text columns as character vectors, and summary() would then
# no longer report per-level counts for them.
bad.friday <- read.csv('data/week1-home.tsv', sep = '\t', header = FALSE,
                       stringsAsFactors = TRUE)
names(bad.friday) <- c('shopping.type', 'date', 'customers.6', 'customers.13', 'shop')
# shopping.type is unused

# Add a per-row difference column (customers.6 minus customers.13) so the
# two counts can be analysed as a single variable.
bad.friday$diff <- bad.friday$customers.6 - bad.friday$customers.13

# Per-column overview of the whole data frame.
summary(bad.friday)

  shopping.type              date    customers.6    customers.13 
 shopping:45    1990, July     :9   Min.   :3558   Min.   :3554  
                1991, December :9   1st Qu.:3954   1st Qu.:3926  
                1991, September:9   Median :4805   Median :4736  
                1992, March    :9   Mean   :4971   Mean   :5017  
                1992, November :9   3rd Qu.:6026   3rd Qu.:6162  
                                    Max.   :7138   Max.   :7057  

             shop         diff       
 Chichester    : 5   Min.   :-774.0  
 Crystal Palace: 5   1st Qu.:-136.0  
 Dorking       : 5   Median : -11.0  
 East Grinstead: 5   Mean   : -46.5  
 Epsom         : 5   3rd Qu.:  47.0  
 Guildford     : 5   Max.   : 302.0  
 (Other)       :15


# customers.6 and customers.13 span about the same range;
# width of the customers.6 range (largest minus smallest value):
diff(range(bad.friday$customers.6))

[1] 3580

# width of the customers.13 range (largest minus smallest value):
diff(range(bad.friday$customers.13))

[1] 3503


# the mean of customers.13 is greater than the mean of customers.6,
# but the median of customers.6 is greater than the median of customers.13,
# so we can suppose that the customers.* distributions are not normal

# and so we see: overlaid histograms of the two customer counts.
# Both histograms use one shared set of breaks so their bars line up, and the
# y-axis is sized for the taller of the two so the overlaid one is not clipped.
common.breaks <- pretty(
  range(bad.friday$customers.6, bad.friday$customers.13, finite = TRUE),
  n = nclass.Sturges(bad.friday$customers.13)
)
h.13 <- hist(bad.friday$customers.13, breaks = common.breaks, plot = FALSE)
h.6  <- hist(bad.friday$customers.6, breaks = common.breaks, plot = FALSE)
plot(h.13, col = rgb(1, 0, 0, 0.3), main = 'Histogram of customers count', xlab = 'Count',
     ylim = c(0, max(h.13$counts, h.6$counts)))
plot(h.6, col = rgb(0, 1, 1, 0.3), add = TRUE)
# Frequency polygons: bin counts joined at the bin midpoints.
lines(h.6$counts ~ h.6$mids, col = 'blue')
lines(h.13$counts ~ h.13$mids, col = 'red')
legend('topright', c('6', '13'), col = c('blue', 'red'), pch = 15)

plot of chunk unnamed-chunk-1


# The histogram of the differences, in contrast, looks close to a normal
# distribution with a slight negative skew.
h <- hist(
  bad.friday$diff,
  breaks = 10,
  xlab = 'Diff',
  main = 'Histogram of diffs 6-13'
)

plot of chunk unnamed-chunk-1


# Is there any difference between the dates?
# One box of differences per date, with the overall median drawn as a
# horizontal reference line.
boxplot(bad.friday$diff ~ bad.friday$date)
abline(lwd = 2, col = 2, h = median(bad.friday$diff))

plot of chunk unnamed-chunk-1

# '1990, July' looks different from the other dates in the boxplot,
# and it may be what skews the histogram.

# Drop the '1990, July' rows and redraw the histogram of differences.
filtered <- bad.friday[!(bad.friday$date == '1990, July'), ]
hist(
  filtered$diff,
  breaks = 10,
  xlab = 'Diff',
  main = 'Histogram of diffs 6-13 (filtered by date)'
)

plot of chunk unnamed-chunk-1

# and now the histogram looks much more like a normal distribution, except for some outliers on the left

# Next, check whether the shops differ from each other:
# one box of differences per shop.
boxplot(bad.friday$diff ~ bad.friday$shop)

plot of chunk unnamed-chunk-1

# Oh, Lewisham looks really bad!

# Drop the 'Lewisham' rows from the full data set and redraw the histogram
# of differences.
filtered2 <- bad.friday[!(bad.friday$shop == 'Lewisham'), ]
hist(
  filtered2$diff,
  xlab = 'Diff',
  main = 'Histogram of diffs 6-13 (filtered)'
)

plot of chunk unnamed-chunk-1

# and now it's pretty good

# Mean difference (customers.6 minus customers.13) once Lewisham is excluded.
mean(filtered2[['diff']])

[1] -37.17

# The mean is small compared with the standard deviation of the differences.
sqrt(var(filtered2$diff))

[1] 120.6

# Mean difference expressed in units of its standard deviation.
# NOTE(review): this ratio is a standardized effect size, not a significance
# test; consider t.test(filtered2$diff), which compares the mean with its
# standard error, before concluding that there is no difference.
with(filtered2, mean(diff) / sd(diff))

[1] -0.3083


# so I suppose that there's no difference in number of customers. 
# So 13 is not such a bad day =)