*A statistical exploration

# Load the yogurt purchase data. `id` is stored as a factor so that
# summary() reports row counts per id rather than numeric quantiles.
yo <- read.csv("yogurt.csv")
yo$id <- as.factor(yo$id)

library(ggplot2)

# Histogram of price with a fine bin width (one price unit per bin).
ggplot(yo, aes(x = price)) +
  geom_histogram(binwidth = 1)

# Same histogram with a coarse bin width: the wide bins smooth over the
# individual price points, so this view has higher bias.
ggplot(yo, aes(x = price)) +
  geom_histogram(binwidth = 20)

summary(yo)
##       obs               id            time         strawberry     
##  Min.   :   1.0   2132290:  74   Min.   : 9662   Min.   : 0.0000  
##  1st Qu.: 696.5   2130583:  59   1st Qu.: 9843   1st Qu.: 0.0000  
##  Median :1369.5   2124073:  50   Median :10045   Median : 0.0000  
##  Mean   :1367.8   2149500:  50   Mean   :10050   Mean   : 0.6492  
##  3rd Qu.:2044.2   2101790:  47   3rd Qu.:10255   3rd Qu.: 1.0000  
##  Max.   :2743.0   2129528:  39   Max.   :10459   Max.   :11.0000  
##                   (Other):2061                                    
##    blueberry        pina.colada          plain         mixed.berry    
##  Min.   : 0.0000   Min.   : 0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median : 0.0000   Median : 0.0000   Median :0.0000   Median :0.0000  
##  Mean   : 0.3571   Mean   : 0.3584   Mean   :0.2176   Mean   :0.3887  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :12.0000   Max.   :10.0000   Max.   :6.0000   Max.   :8.0000  
##                                                                       
##      price      
##  Min.   :20.00  
##  1st Qu.:50.00  
##  Median :65.04  
##  Mean   :59.25  
##  3rd Qu.:68.96  
##  Max.   :68.96  
## 
# Number of distinct price values in the data.
length(unique(yo[["price"]]))
## [1] 20
# Frequency of each distinct price value.
table(yo[["price"]])
## 
##    20 24.96 33.04  33.2 33.28 33.36 33.52 39.04    44 45.04 48.96 49.52 
##     2    11    54     1     1    22     1   234    21    11    81     1 
##  49.6    50 55.04 58.96    62 63.04 65.04 68.96 
##     1   205     6   303    15     2   799   609

*Transforming and plotting

# Total number of yogurt units in each row: the sum of the five flavour
# count columns. `price` is intentionally NOT part of this sum -- it is a
# monetary value, not a unit count, and adding it would distort the totals.
yo <- transform(
  yo,
  all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
)

# Distribution of total units per row.
ggplot(yo, aes(x = all.purchases)) +
  geom_histogram(binwidth = 1, fill = "#099009")

# Price against time, jittered and semi-transparent to reduce overplotting.
ggplot(yo, aes(x = time, y = price)) +
  geom_jitter(
    alpha = 0.25,
    shape = 21,
    fill = "#F79420"
  )

*When the data look like the graph above (mostly constant), it is really interesting to explore them by looking at a sample of individual households.

# Draw 16 ids at random (seed fixed so the same ids are picked every run)
# and plot each one's price over time in its own panel.
set.seed(4231)
sample.ids <- sample(levels(yo$id), 16)

ggplot(yo[yo$id %in% sample.ids, ], aes(x = time, y = price)) +
  geom_line() +
  # Open circles (shape 1) sized by the all.purchases value of each row.
  geom_point(aes(size = all.purchases), shape = 1) +
  facet_wrap(~id)

*Scatterplot matrices

# install.packages('GGally')
library(GGally)

# Use the minimal theme with a base font size of 20 for subsequent plots.
theme_set(theme_minimal(base_size = 20))

# Fix the seed so the random row sample below is reproducible.
set.seed(123)

# Keep columns 3 through 9 and draw a scatterplot matrix of 1000 random rows.
yo_subset <- yo[3:9]
ggpairs(
  yo_subset[sample.int(n = nrow(yo_subset), size = 1000), ]
)

*Create heat maps

library(reshape2)

# Read the nci table and melt its first 200 rows into long format:
# one row per (gene, case) pair with the corresponding value.
nci <- read.table(file = "nci.tsv")
nci.long.samp <- setNames(
  melt(as.matrix(nci[seq_len(200), ])),
  c("gene", "case", "value")
)
head(nci.long.samp)
##   gene case  value
## 1    1   V1  0.300
## 2    2   V1  1.180
## 3    3   V1  0.550
## 4    4   V1  1.140
## 5    5   V1 -0.265
## 6    6   V1 -0.070
# Heat map: one tile per (case, gene) pair, coloured by value on a
# 100-step blue-to-red gradient.
ggplot(nci.long.samp, aes(x = case, y = gene, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(
    colours = colorRampPalette(c("blue", "red"))(100)
  )