This is example data from an experiment in which the CG8177 gene was knocked down with CG8177 RNAi using a VDRC line. The data this script refers to was collected by Bryne, Mari and Sumitra in 7/2016, and analyzed by Bryne in 10/12/2016 to 12/12/2016
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
library(plyr)
## Warning: package 'plyr' was built under R version 3.1.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.1.2
#imports useful packages
new = read.csv("160728_CG8177_VDRC.csv")
#assumes this file is located within your working directory, imports data into R
splith <- function (df1) {
h <- subset(df1, pH == "HCO3", drop = TRUE)
return(h)
}
#this is a function that separates HCO3 values from the rest of the dataset
splitp <- function (df1) {
ph <- subset( df1, pH != "HCO3", drop = TRUE )
nph <- ph$pH
nph <- as.numeric(as.character(nph))
ph$nph <- nph
return(ph)
}
#this is a function that separates Nigericin pH = 6.5 and pH = 7.5 values and makes sure they are defined as numeric
pnew <- splitp(new)
#stores calibration values in variable called pnew
hnew <- splith(new)
#stores HCO3 values in a variable callled hnew
Now we plot the regression lines to take a look at the data..
ggplot(pnew, aes(x=nph, y = Ratio, fill = genotype )) + geom_point() + stat_smooth(method = "lm") + facet_wrap(~region)
Further breaking down data by genotype.
pctrl <- pnew[pnew$genotype == "ctrl",]
poe <- pnew[pnew$genotype == "CG8177",]
Calculating linear regression models for each genotype and region
mylm <- function(df) {lm(Ratio ~ nph, data= df)}
#computes linear regression for any given data_frame with Ratio and pH columns
all_lm <- function(df, g ) {dlply(df, .(g), mylm)}
#applies lm to grouping specified by g, and values specified by yvar and x var
#this function will throw an error if you have rows with NA values
#g should be of equivalent length to the df (ie df$myvariable works a list
#of variable names doesnt)
ctrl.lm <- all_lm(pctrl, pctrl$region)
mut.lm <- all_lm(poe, poe$region)
#linear regression models are now stored in a variable as a list
Pulling out the values in the linear regression model and storing them in a dataframe instead of a list.
rsq <- function(x) summary(x)$r.squared
#pulls out the rsquared value from a linear mode
mycoefs <- function(x) { ldply(x, function(x) c(coef(x), rsquare = rsq(x)))}
#returns vector of intercept and slope
df.ctrl <- mycoefs(ctrl.lm)
df.ctrl$genotype <- "ctrl"
#adds column for genotype
df.oe <- mycoefs(mut.lm)
df.oe$genotype <- "CG8177"
#make sure genotype name matches original file
df.all <- rbind(df.ctrl, df.oe)
names(df.all) = c("region", "int", "slope", "rsquared", "genotype")
#changes names of columns
All the values of the linear regression model are now stored in a single dataframe. Now combine this dataframe with the data containing the ratio values of cells treated in bicarbonate buffer and calculate pH values
calclm = function(val, slope, int) {newval = (val - int)/slope
return(newval)
}
#this is a function that takes three arguments - a ratio value (val) and the slope and y-intercept from a linear regression model to calculate a pH value
merged = merge(hnew, df.all, by = c("region", "genotype"))
merged$calcpH = calclm(merged$Ratio, merged$slope, merged$int)
#this adds a new column for calculated pH values based on the above function
set2 = read.csv("160707_8177vdrc.csv")
set1 = read.csv("160705_8177vdrc.csv")
#importing two additional data sets that already have pH values calculated
set1$genotype <- gsub("CG8177_vdrc", "CG8177", set1$genotype)
#replacing genotype names so all sets match
Next bit of code takes the combined data set from three separate experiemnts, removes values outside the range of the sensor, calculates statistics via t-test, and generates summary statistics of the data
all = rbind(set1, set2, merged)
#combines three sets into one
all.clean = all[all$calcpH < 7.8 & all$calcpH > 6.5,]
#eliminate values outside range of sensor
all.clean$region = relevel(all.clean$region, ref = "stem")
all.clean$genotype = as.factor(all.clean$genotype)
all.clean$genotype = relevel(all.clean$genotype, ref = "ctrl")
#reorder factors to appear in correct order
p3 = ggplot(all.clean, aes(x=region, y = calcpH, fill = genotype) ) + geom_boxplot() + theme_classic() + theme(axis.text = element_text(size = 15), line = element_line(size =2 ) )
p3 + ylim(6.0, 8.0)
#plots data with boxplot using ggplot2 package
myt <- function(df) t.test(df[["calcpH"]] ~ df[["genotype"]])
#assumes that you have a list of dataframes split by a relevant grouping
#this works as long as you manually type in the name the subelement in the
#using either $element or [["elemen"]]
getp <-function(x) x[["p.value"]]
#pulls p-value from a list
getpvals <- function (df, test ) {
byregion<- dlply(df, .(region), myt)
#spits out a list of t.tests
pvals <- ldply(byregion, .fun = getp)
#puts into a df
return(pvals)
}
pvals.all = getpvals(all.clean, myt)
summary.all = ddply(all.clean, .(genotype, region), summarise, ph = mean(calcpH), sd = sd(calcpH), se = sd/sqrt(3))