Example script for calculating pH values from Ratio values

This is example data from an experiment in which the CG8177 gene was knocked down with CG8177 RNAi using a VDRC line. The data this script refers to was collected by Bryne, Mari and Sumitra in 7/2016, and analyzed by Bryne in 10/12/2016 to 12/12/2016

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.3
library(plyr)
## Warning: package 'plyr' was built under R version 3.1.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.1.2
#imports useful packages 

new = read.csv("160728_CG8177_VDRC.csv")
#assumes this file is located within your working directory, imports data into R

splith <- function (df1) {
  h <- subset(df1, pH == "HCO3", drop = TRUE)
  return(h) 
} 
#this is a function that separates HCO3 values from the rest of the dataset 
splitp <- function (df1) {
  ph <- subset( df1, pH != "HCO3", drop = TRUE )

  nph <- ph$pH
  nph <- as.numeric(as.character(nph))
  ph$nph <- nph
  return(ph) 
}
#this is a function that separates Nigericin pH = 6.5 and pH = 7.5 values and makes sure they are defined as numeric 
pnew <- splitp(new)
#stores calibration values in variable called pnew 
hnew <- splith(new)
#stores HCO3 values in a variable callled hnew 

Now we plot the regression lines to take a look at the data..

ggplot(pnew, aes(x=nph, y = Ratio, fill = genotype )) + geom_point() + stat_smooth(method = "lm") + facet_wrap(~region)

plot of chunk unnamed-chunk-2 Further breaking down data by genotype.

pctrl <- pnew[pnew$genotype == "ctrl",]
poe <- pnew[pnew$genotype == "CG8177",] 

Calculating linear regression models for each genotype and region

mylm <- function(df) {lm(Ratio ~ nph, data= df)}
#computes linear regression for any given data_frame with Ratio and pH columns

all_lm <- function(df, g ) {dlply(df, .(g), mylm)}
#applies lm to grouping specified by g, and values specified by yvar and x var 
#this function will throw an error if you have rows with NA values
#g should be of equivalent length to the df (ie df$myvariable works a list 
#of variable names doesnt)
ctrl.lm <- all_lm(pctrl, pctrl$region)
mut.lm <- all_lm(poe, poe$region)
#linear regression models are now stored in a variable as a list 

Pulling out the values in the linear regression model and storing them in a dataframe instead of a list.

rsq <- function(x) summary(x)$r.squared
#pulls out the rsquared value from a linear mode
mycoefs <- function(x) { ldply(x, function(x) c(coef(x), rsquare = rsq(x)))}
#returns vector of intercept and slope 
df.ctrl <- mycoefs(ctrl.lm)
df.ctrl$genotype <- "ctrl"
#adds column for genotype 
df.oe <- mycoefs(mut.lm)
df.oe$genotype <- "CG8177"
#make sure genotype name matches original file 
df.all <- rbind(df.ctrl, df.oe)
names(df.all) = c("region", "int", "slope", "rsquared", "genotype")
#changes names of columns

All the values of the linear regression model are now stored in a single dataframe. Now combine this dataframe with the data containing the ratio values of cells treated in bicarbonate buffer and calculate pH values

calclm = function(val, slope, int) {newval = (val - int)/slope  
return(newval)
}
#this is a function that takes three arguments - a ratio value (val) and the slope and y-intercept from a linear regression model to calculate a pH value 
merged = merge(hnew, df.all, by = c("region", "genotype"))
merged$calcpH = calclm(merged$Ratio, merged$slope, merged$int)
#this adds a new column for calculated pH values based on the above function 
set2 = read.csv("160707_8177vdrc.csv")
set1 = read.csv("160705_8177vdrc.csv")
#importing two additional data sets that already have pH values calculated 

set1$genotype <- gsub("CG8177_vdrc", "CG8177", set1$genotype)
#replacing genotype names so all sets match 

Next bit of code takes the combined data set from three separate experiemnts, removes values outside the range of the sensor, calculates statistics via t-test, and generates summary statistics of the data

all = rbind(set1, set2, merged) 
#combines three sets into one 
all.clean = all[all$calcpH < 7.8 & all$calcpH > 6.5,]
#eliminate values outside range of sensor 
all.clean$region = relevel(all.clean$region, ref = "stem")
all.clean$genotype = as.factor(all.clean$genotype)
all.clean$genotype = relevel(all.clean$genotype, ref = "ctrl")
#reorder factors to appear in correct order  

p3 = ggplot(all.clean, aes(x=region, y = calcpH, fill = genotype) ) + geom_boxplot() + theme_classic() + theme(axis.text = element_text(size = 15), line = element_line(size =2 ) ) 
p3 + ylim(6.0, 8.0)

plot of chunk unnamed-chunk-8

#plots data with boxplot using ggplot2 package 

myt <- function(df) t.test(df[["calcpH"]] ~ df[["genotype"]])
#assumes that you have a list of dataframes split by a relevant grouping
#this works as long as you manually type in the name the subelement in the 
#using either $element or [["elemen"]]

getp <-function(x) x[["p.value"]]
#pulls p-value from a list 
getpvals <- function (df, test ) {
  byregion<- dlply(df, .(region), myt)
  #spits out a list of t.tests
  pvals <- ldply(byregion, .fun = getp)
  #puts into a df 
  return(pvals)
}

pvals.all = getpvals(all.clean, myt)
summary.all = ddply(all.clean, .(genotype, region), summarise, ph = mean(calcpH), sd = sd(calcpH), se = sd/sqrt(3))