Reading the Data
# Load the experiment export, keep only columns 1, 2, 5 and 6, and print the
# structure of the trimmed data frame.
df <- read.csv(file = "test_dad.csv")[, c(1, 2, 5, 6)]
str(df)
## 'data.frame': 9346 obs. of 4 variables:
## $ buckt : Factor w/ 2 levels "control","test": 1 1 2 2 2 2 2 1 2 2 ...
## $ slr_id : Factor w/ 9346 levels "seller_1","seller_10",..: 1 1112 2223 3334 4445 5556 6667 7778 8889 2 ...
## $ pre_GMV: num 0 0 0 0 99.2 ...
## $ pos_GMV: num 0 0 0 0 2.12 4.32 6.31 6.95 8.19 8.42 ...
Merging pre and post GMV (row-wise maximum of the two periods)
# Row-wise maximum of the pre- and post-period GMV for every seller.
# pmax() is vectorised, so the data frame is no longer coerced to a matrix
# and walked row by row as apply(..., 1, max) did.
df$gmv <- pmax(df$pre_GMV, df$pos_GMV)
head(df)

# Overlay pre-period (black) and post-period (red) GMV as lines.
# The x positions are the integer factor codes of slr_id, made explicit here:
# plot() on a factor x draws boxplots instead of a line, and on a character x
# (what read.csv returns when stringsAsFactors = FALSE) there are no numeric
# x values at all. Points are drawn in data-frame row order.
seller_pos <- as.integer(as.factor(df$slr_id))
plot(seller_pos, df$pre_GMV, type = "l", xlab = "df$slr_id")
points(seller_pos, df$pos_GMV, col = 2, type = "l")

Dividing the data into control and test groups
# Partition the sellers by experiment bucket, then preview the test rows.
df_test <- df[df[["buckt"]] == "test", ]
df_control <- df[df[["buckt"]] == "control", ]
head(df_test)
Analyze pre GMV trends of control
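This computes the mean and standard deviation of pre GMV within each 5% quantile bin; the 95% limits (mean ± 1.96 · sd) are currently commented out.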
# Bin control-group pre-period GMV at its 5% quantiles and summarise each bin
# by mean and standard deviation.
# `probs` is spelled out in full; `p =` only worked through partial argument
# matching.
brks <- quantile(df_control$pre_GMV, probs = seq(0, 1, 0.05))
# cut() builds right-closed intervals by default, so a value exactly equal to
# the lowest break is assigned NA and is left out of the per-bin summaries.
df_control$cats <- cut(df_control$pre_GMV, breaks = brks)
tbl1 <- data.frame(tapply(df_control$pre_GMV, df_control$cats, mean))
names(tbl1) <- "mean"
tbl1$sd <- tapply(df_control$pre_GMV, df_control$cats, sd)
# Approximate 95% limits (mean +/- 1.96 * sd), currently disabled:
#tbl1$ci1=round(tbl1$mean-1.96*tbl1$sd,2)
#tbl1$ci2=round(tbl1$mean+1.96*tbl1$sd,2)
tbl1
Analyze post GMV trends of control
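This computes the mean and standard deviation of post GMV within each quantile bin (from the 30% quantile upwards); the 95% limits (mean ± 1.96 · sd) are currently commented out.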
# Bin control-group post-period GMV at its 5% quantiles and summarise each bin
# by mean and standard deviation.
# `probs` is spelled out in full; `p =` only worked through partial argument
# matching.
brks <- quantile(df_control$pos_GMV, probs = seq(0, 1, 0.05))
# Only breaks 7 to 21 (the 30% through 100% quantiles) are used, so values at
# or below the 30% quantile are assigned NA and left out of the bins.
# NOTE(review): presumably the lower quantiles are tied (e.g. many zeros),
# which would make the full break vector non-unique -- confirm.
df_control$cats <- cut(df_control$pos_GMV, breaks = brks[7:21])
tbl2 <- data.frame(tapply(df_control$pos_GMV, df_control$cats, mean))
names(tbl2) <- "mean"
tbl2$sd <- tapply(df_control$pos_GMV, df_control$cats, sd)
# Approximate 95% limits (mean +/- 1.96 * sd), currently disabled:
#tbl2$ci1=round(tbl2$mean-1.96*tbl2$sd,2)
#tbl2$ci2=round(tbl2$mean+1.96*tbl2$sd,2)
tbl2
# Horizontal bars of per-bin mean GMV for the control group:
# post period (red, tbl2) against pre period (blue, tbl1).
# tbl2 is built from only a subset of the quantile breaks, so it can have
# fewer rows than tbl1. Passing vectors of unequal length to rbind() recycles
# the shorter one (with a "not a multiple of vector length" warning) and pairs
# bars from unrelated bins. Both tables end at the 100% quantile, so align
# them on their top-most bins instead.
n_bins <- min(length(tbl1$mean), length(tbl2$mean))
barplot(
  height = rbind(tail(tbl2$mean, n_bins), tail(tbl1$mean, n_bins)),
  horiz = TRUE,
  col = c(2, 4)
)

# Scatter of the per-bin post-period means of the control group.
plot(tbl2$mean)

# Tabulate how many rows of df have a pre-period GMV in each interval
# delimited by breaks 7..21, plot those counts, and overlay the per-bin
# post-period means in blue on the same axes.
tabl3 <- table(cut(df$pre_GMV, breaks = brks[7:21]))
plot(tabl3, ylim = c(0, 64000))
points(tbl2$mean, type = "b", col = 4)

Barplots of average GMV for the pre and post periods
# Re-read the full export; this replaces the trimmed df built earlier.
df <- read.csv("test_dad.csv")
##----------------------------------------------
# Shared 5% quantile breaks over the pooled pre- and post-period GMV values.
# `probs` is spelled out in full; `p =` only worked through partial argument
# matching.
cbrks <- quantile(c(df$pre_GMV, df$pos_GMV), probs = seq(0, 1, 0.05))
# Drop the first two breaks (the 0% and 5% quantiles).
cbrks <- cbrks[-(1:2)]
##------------ control and test groups combined
df$pos_cat <- cut(df$pos_GMV, breaks = cbrks)
df$pre_cat <- cut(df$pre_GMV, breaks = cbrks)
# Both means are grouped by the pre-period bin, so each pair of bars follows
# the same set of sellers from the pre to the post period.
# NOTE(review): pos_cat is not used in this section -- confirm that grouping
# pos_GMV by pre_cat (rather than pos_cat) is intended.
tbl <- data.frame(tapply(df$pre_GMV, df$pre_cat, mean))
colnames(tbl) <- "pre_mean"
tbl$pos_mean <- tapply(df$pos_GMV, df$pre_cat, mean)
tbl
# One pair of side-by-side bars per bin: pre (black) and post (red).
barplot(
  height = t(tbl), col = 1:2, beside = TRUE, axis.lty = 2,
  main = "average GMV pre and post periods (both control and test groups)"
)
legend(
  "topleft",
  c("pre_GMV", "post_GMV"),
  col = c(1, 2),
  lty = c(1, 1), lwd = c(2, 2)
)

###---------------------------------------
Separate Barplots for Control Group
##------------only control group ----------
# Restrict df to the control bucket. This relies on df still holding the full
# export and on cbrks from the combined section above.
df <- df[df$buckt == "control", ]
df$pos_cat <- cut(df$pos_GMV, breaks = cbrks)
df$pre_cat <- cut(df$pre_GMV, breaks = cbrks)
# Both means are grouped by the pre-period bin, so each pair of bars follows
# the same set of sellers from the pre to the post period.
# NOTE(review): pos_cat is not used in this section -- confirm that grouping
# pos_GMV by pre_cat (rather than pos_cat) is intended.
tbl <- data.frame(tapply(df$pre_GMV, df$pre_cat, mean))
colnames(tbl) <- "pre_mean"
tbl$pos_mean <- tapply(df$pos_GMV, df$pre_cat, mean)
tbl
# One pair of side-by-side bars per bin: pre (black) and post (blue).
barplot(
  height = t(tbl), col = c(1, 4), beside = TRUE, axis.lty = 2,
  main = "average GMV pre and post periods (control groups)"
)
legend(
  "topleft",
  c("pre_GMV", "post_GMV"),
  col = c(1, 4),
  lty = c(1, 1), lwd = c(2, 2)
)

###---------------------------------------
Barplot for Test Group
##------------only test group ----------
# Re-read the full export (df was narrowed to the control bucket above) and
# keep only the test bucket. cbrks still comes from the combined section.
df <- read.csv("test_dad.csv")
df <- df[df$buckt == "test", ]
df$pos_cat <- cut(df$pos_GMV, breaks = cbrks)
df$pre_cat <- cut(df$pre_GMV, breaks = cbrks)
# Both means are grouped by the pre-period bin, so each pair of bars follows
# the same set of sellers from the pre to the post period.
# NOTE(review): pos_cat is not used in this section -- confirm that grouping
# pos_GMV by pre_cat (rather than pos_cat) is intended.
tbl <- data.frame(tapply(df$pre_GMV, df$pre_cat, mean))
colnames(tbl) <- "pre_mean"
tbl$pos_mean <- tapply(df$pos_GMV, df$pre_cat, mean)
tbl
# One pair of side-by-side bars per bin: pre (black) and post (green).
barplot(
  height = t(tbl), col = c(1, 3), beside = TRUE, axis.lty = 2,
  main = "average GMV pre and post periods (test group)"
)
legend(
  "topleft",
  c("pre_GMV", "post_GMV"),
  col = c(1, 3),
  lty = c(1, 1), lwd = c(2, 2)
)

###---------------------------------------