tt <- function(x,y,mu0=0,ts=TRUE) { # two-sided t-test
nx <- length(x)
ny <- length(y)
sp <- sqrt(((nx-1)*var(x)+(ny-1)*var(y))/(nx+ny-2))
t <- (mean(x)-mean(y)-mu0) / (sp*sqrt(1/nx+1/ny))
df <- (var(x)/nx + var(y)/ny)^2 /
((var(x)/nx)^2/(nx-1) + (var(y)/ny)^2/(ny-1))
sample_sizes <- c(nx, ny)
names(sample_sizes) <- c("x","y")
sample_means <- c(mean(x), mean(y))
names(sample_means) <- c("x", "y")
pvalue <- ifelse(ts,2,1)*(1-pt(abs(t),df=df))
cohensd <- (mean(x) - mean(y))/sp
list(sample_sizes=sample_sizes, sample_means=sample_means,
df=df, pvalue=pvalue, cohensd=cohensd)
}
tt_significant <- function(tt.df, alpha=0.05) {
pvalues <- tt.df$pvalue
pvalues <- as.numeric(pvalues < alpha)
names(pvalues) <- tt.df$variable
pvalues
}
plot_alpha <- function(tt.df, from=0.01, to=1.0, by=0.01) {
alpha <- seq(from, to, by)
prop_significant <- sapply(alpha, function(x) mean(tt_significant(tt.df, x)))
plot(x=alpha, y=prop_significant, ylab="Proportion significant", xlab="Alpha",
type='l')
}
expr_data <- data.frame(matrix(
c("1007_s_at", "11.376519", "11.826743", "11.123022", "11.743439", "12.172961",
"11.522009", "11.288407", "11.364544", "11.783231", "12.102697", "12.141934",
"12.141672", "11.541161", "12.069206", "11.529456", "9.692066", "11.242988",
"DDR1", "U48705", "1053_at", "7.270398", "7.534450", "7.169297", "7.730833",
"6.728914", "7.033900", "8.152550", "7.357942", "7.811469", "6.704366",
"7.723678", "7.607720", "6.904579", "6.837490", "7.437899", "7.608960",
"6.704648", "RFC2", "M87338", "117_at", "8.172823", "8.350568", "8.216073",
"8.052177", "7.940714", "8.122496", "8.246269", "7.597745", "8.809971",
"7.299070", "7.808597", "8.390707", "7.653514", "8.680945", "8.050873",
"9.242006", "8.253535", "HSPA6", "X51757", "121_at", "10.064195",
"11.193688", "9.846189", "10.549992", "10.172722", "10.357284", "11.361081",
"10.446139", "11.165541", "10.285435", "10.123556", "10.532735",
"10.379335", "10.487541", "10.542419", "10.248043", "10.207259", "PAX8",
"X69699", "1255_g_at", "6.256425", "6.830607", "5.825010", "6.098157",
"6.104971", "5.818458", "6.355995", "6.311312", "7.366574", "5.577412",
"4.570794", "5.046956", "6.561945", "5.897955", "5.402725", "5.957542",
"6.201037", "GUCA1A", "L36861", "1294_at", "9.347887", "9.540260",
"9.229501", "9.464348", "9.764903", "9.962180", "9.300450", "9.230649",
"9.783263","8.749285", "9.466965", "9.653450", "9.076623", "9.827835",
"9.096732", "9.441370", "9.102000", "UBA7", "L13852"), nrow=6, byrow=TRUE))
row.names(expr_data) <- expr_data[,1]
expr_data <- expr_data[,-1]
names(expr_data) <- c("GSM119944", "GSM119945", "GSM119946", "GSM119947",
"GSM119948", "GSM119949", "GSM119950", "GSM119951",
"GSM119952", "GSM119953", "GSM119954", "GSM119955",
"GSM119956", "GSM119957", "GSM119958", "GSM119959",
"GSM119960", "GeneSymbol", "PublicID")
expr_data[,1:17] <- sapply(expr_data[,1:17], function(x)
as.numeric(as.character(x)))
groups_data <- data.frame(
PatientID=c('GSM119946','GSM119948','GSM119951','GSM119955','GSM119956',
'GSM119959','GSM119947','GSM119950','GSM119952','GSM119953',
'GSM119957','GSM119958','GSM119944','GSM119945','GSM119949',
'GSM119954','GSM119960'),
TreatmentGroup = c(rep('Control',6), rep('Treatment1',6),
rep('Treatment2',5))
)
control_cols <- groups_data$PatientID[which(groups_data$TreatmentGroup=="Control")]
treatment_cols <- groups_data$PatientID[which(groups_data$TreatmentGroup=="Treatment1")]
# nrows <- dim(expr_data)[1]
# for(i in 1:nrows) {
# control_group <- expr_data[i,control_cols]
# treatment_group <- expr_data[i,treatment_cols]
#
# cat("T-test for control vs treatment (", row.names(treatment_group), ")\n")
# result <- tt(as.numeric(control_group), as.numeric(treatment_group), 0, FALSE)
# cat(" sample sizes:", as.numeric(result$sample_sizes),"\n")
# cat(" sample means:", as.numeric(result$sample_means),"\n")
# cat(" degrees of freedom:", as.numeric(result$df),"\n")
# cat(" p-value:", as.numeric(result$pvalue),"\n\n")
# }
tt.df <- data.frame(Variable=character(0), SampleSize1=numeric(0), SampleSize2=numeric(0),
SampleMean1=numeric(0), SampleMean2=numeric(0), DegreesFreedom=numeric(0),
Pvalue=numeric(0), CohensD=numeric(0))
# store results in data.frame
nrows <- dim(expr_data)[1]
for(i in 1:nrows) {
control_group <- expr_data[i,control_cols]
treatment_group <- expr_data[i,treatment_cols]
result <- tt(as.numeric(control_group), as.numeric(treatment_group), 0, FALSE)
tt.df <- rbind(tt.df, data.frame(variable=row.names(treatment_group),
size1=result$sample_sizes[1],
size2=result$sample_sizes[2],
mean1=result$sample_means[1],
mean2=result$sample_means[2],
df=result$df,
pvalue=result$pvalue,
cohensd=result$cohensd))
}
row.names(tt.df) <- 1:nrow(tt.df)
print(tt.df)
## variable size1 size2 mean1 mean2 df pvalue
## 1 1007_s_at 6 6 11.339238 11.752739 6.165043 0.16600503
## 2 1053_at 6 6 7.229569 7.445768 8.484730 0.22795627
## 3 117_at 6 6 8.173460 8.189884 9.865556 0.48076769
## 4 121_at 6 6 10.270860 10.732001 7.980862 0.02555726
## 5 1255_g_at 6 6 5.967956 6.116470 9.223889 0.34351776
## 6 1294_at 6 6 9.399416 9.370319 8.588961 0.44419898
## cohensd
## 1 -0.60773394
## 2 -0.45104152
## 3 -0.02855896
## 4 -1.32376621
## 5 -0.24008516
## 6 0.08345823
cat("\nSignificance at alpha=0.05:\n-----------------------------\n")
##
## Significance at alpha=0.05:
## -----------------------------
print(tt_significant(tt.df))
## 1007_s_at 1053_at 117_at 121_at 1255_g_at 1294_at
## 0 0 0 1 0 0
plot_alpha(tt.df)
