*A statistical exploration
library(ggplot2)

# Read the yogurt data and treat id as a categorical label, not a number.
yo <- read.csv("yogurt.csv")
yo$id <- as.factor(yo$id)

# Read the NCI data up front; it is reshaped for the heat map later on.
nci <- read.table(file = "nci.tsv")

# Histogram of price with narrow bins (bin width 1).
ggplot(yo, aes(x = price)) +
  geom_histogram(binwidth = 1)

# Same histogram with very wide bins (bin width 20): this has higher bias,
# because the coarse bins smooth over the individual price values.
ggplot(yo, aes(x = price)) +
  geom_histogram(binwidth = 20)

# Per-column summary of the data frame.
summary(yo)
## obs id time strawberry
## Min. : 1.0 2132290: 74 Min. : 9662 Min. : 0.0000
## 1st Qu.: 696.5 2130583: 59 1st Qu.: 9843 1st Qu.: 0.0000
## Median :1369.5 2124073: 50 Median :10045 Median : 0.0000
## Mean :1367.8 2149500: 50 Mean :10050 Mean : 0.6492
## 3rd Qu.:2044.2 2101790: 47 3rd Qu.:10255 3rd Qu.: 1.0000
## Max. :2743.0 2129528: 39 Max. :10459 Max. :11.0000
## (Other):2061
## blueberry pina.colada plain mixed.berry
## Min. : 0.0000 Min. : 0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.0000 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median : 0.0000 Median : 0.0000 Median :0.0000 Median :0.0000
## Mean : 0.3571 Mean : 0.3584 Mean :0.2176 Mean :0.3887
## 3rd Qu.: 0.0000 3rd Qu.: 0.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :12.0000 Max. :10.0000 Max. :6.0000 Max. :8.0000
##
## price
## Min. :20.00
## 1st Qu.:50.00
## Median :65.04
## Mean :59.25
## 3rd Qu.:68.96
## Max. :68.96
##
# How many distinct prices are there?
length(unique(yo[["price"]]))
## [1] 20
# Tabulate how often each distinct price value occurs.
table(yo[["price"]])
##
## 20 24.96 33.04 33.2 33.28 33.36 33.52 39.04 44 45.04 48.96 49.52
## 2 11 54 1 1 22 1 234 21 11 81 1
## 49.6 50 55.04 58.96 62 63.04 65.04 68.96
## 1 205 6 303 15 2 799 609
*Transforming and plotting
# Add a per-row total of yogurts purchased: the sum of the five flavour
# counts. price is intentionally left out of the sum -- it is a price, not a
# purchase count, and including it would inflate every total by the price.
yo <- transform(
  yo,
  all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry
)

# Histogram of the purchase totals, one bin per unit.
ggplot(yo, aes(x = all.purchases)) +
  geom_histogram(binwidth = 1, fill = "#099009")
# Jittered scatterplot of price against time; the points are semi-transparent
# filled circles so that overlapping observations remain visible.
ggplot(yo, aes(x = time, y = price)) +
  geom_jitter(alpha = 0.25, shape = 21, fill = "#F79420")
*When the data look like the graph above, where the values are mostly constant, it is really interesting to explore them by taking samples.
# Fix the seed so the same ids are drawn on every run.
set.seed(4231)
# Draw 16 ids at random from the levels of the id factor.
sample.ids <- sample(levels(yo$id), size = 16)

# Price over time for the sampled ids, one panel per id; the size of each
# open circle reflects all.purchases for that row.
ggplot(yo[yo$id %in% sample.ids, ], aes(x = time, y = price)) +
  facet_wrap(~ id) +
  geom_line() +
  geom_point(aes(size = all.purchases), shape = 1)
*Scatterplot matrices
# GGally provides ggpairs(); install it first if it is missing:
# install.packages('GGally')
library(GGally)

# Minimal theme with a base font size of 20 for all following plots.
theme_set(theme_minimal(base_size = 20))

# Fix the seed so the random row sample is reproducible.
set.seed(123)

# Keep columns 3 to 9 of yo, then draw the scatterplot matrix from a random
# sample of 1000 of those rows.
yo_subset <- yo[3:9]
ggpairs(yo_subset[sample.int(n = nrow(yo_subset), size = 1000), ])
*Create heat maps
# reshape2 provides melt().
library(reshape2)

# Read the NCI table, take its first 200 rows as a matrix and melt that into
# long format: one row per matrix cell, with columns gene, case and value.
nci <- read.table(file = "nci.tsv")
nci.long.samp <- setNames(
  melt(as.matrix(nci[seq_len(200), ])),
  c("gene", "case", "value")
)

# Show the first rows of the long-format data.
head(nci.long.samp)
## gene case value
## 1 1 V1 0.300
## 2 2 V1 1.180
## 3 3 V1 0.550
## 4 4 V1 1.140
## 5 5 V1 -0.265
## 6 6 V1 -0.070
# Draw the heat map: one tile per (case, gene) pair, filled by value on a
# 100-step colour ramp running from blue to red.
ggplot(nci.long.samp, aes(x = case, y = gene, fill = value)) +
  geom_tile() +
  scale_fill_gradientn(colors = colorRampPalette(c("blue", "red"))(100))