rm(list = ls())
library(readr)
library(ggplot2)
#weight and systolic blood pressure
setwd("c:/users/Paul/Documents/Rworksmsu")
ttcelldata<- read.csv(file="ttcelldata.csv")
#Let's check the structure of our data
str(ttcelldata)
## 'data.frame':    51 obs. of  4 variables:
##  $ Obs     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ glucose : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ conc    : int  631000 592000 563000 475000 461000 416000 385000 321000 302000 199000 ...
##  $ diameter: num  21.2 21.5 21.3 21 21.5 21.3 20.3 22.7 21.5 22.2 ...
head(ttcelldata)
##   Obs glucose   conc diameter
## 1   1       1 631000     21.2
## 2   2       1 592000     21.5
## 3   3       1 563000     21.3
## 4   4       1 475000     21.0
## 5   5       1 461000     21.5
## 6   6       1 416000     21.3
ttcelldata$glucoseF<-as.factor(ttcelldata$glucose)
head(ttcelldata)
##   Obs glucose   conc diameter glucoseF
## 1   1       1 631000     21.2        1
## 2   2       1 592000     21.5        1
## 3   3       1 563000     21.3        1
## 4   4       1 475000     21.0        1
## 5   5       1 461000     21.5        1
## 6   6       1 416000     21.3        1
#Let's use ggplot to visualiaze the scatter plot bewtween average cell diameter and cell concentration
ttcellplot1<-ggplot(ttcelldata,aes(x=conc,y=diameter))+geom_point()
ttcellplot1

#Let's try transforming the response variable (average diameter), but before then, let's look at the lambda value
library(MASS)           # need the MASS library for Box-Cox procedure
library(car)            #need the car package for boxTidwell procedure
## Loading required package: carData
transcheck1<-boxcox(diameter~conc, data=ttcelldata,lambda=seq(-5,3,length=10))   #Box Cox analysis

transcheck1
## $x
##   [1] -5.00000000 -4.91919192 -4.83838384 -4.75757576 -4.67676768 -4.59595960
##   [7] -4.51515152 -4.43434343 -4.35353535 -4.27272727 -4.19191919 -4.11111111
##  [13] -4.03030303 -3.94949495 -3.86868687 -3.78787879 -3.70707071 -3.62626263
##  [19] -3.54545455 -3.46464646 -3.38383838 -3.30303030 -3.22222222 -3.14141414
##  [25] -3.06060606 -2.97979798 -2.89898990 -2.81818182 -2.73737374 -2.65656566
##  [31] -2.57575758 -2.49494949 -2.41414141 -2.33333333 -2.25252525 -2.17171717
##  [37] -2.09090909 -2.01010101 -1.92929293 -1.84848485 -1.76767677 -1.68686869
##  [43] -1.60606061 -1.52525253 -1.44444444 -1.36363636 -1.28282828 -1.20202020
##  [49] -1.12121212 -1.04040404 -0.95959596 -0.87878788 -0.79797980 -0.71717172
##  [55] -0.63636364 -0.55555556 -0.47474747 -0.39393939 -0.31313131 -0.23232323
##  [61] -0.15151515 -0.07070707  0.01010101  0.09090909  0.17171717  0.25252525
##  [67]  0.33333333  0.41414141  0.49494949  0.57575758  0.65656566  0.73737374
##  [73]  0.81818182  0.89898990  0.97979798  1.06060606  1.14141414  1.22222222
##  [79]  1.30303030  1.38383838  1.46464646  1.54545455  1.62626263  1.70707071
##  [85]  1.78787879  1.86868687  1.94949495  2.03030303  2.11111111  2.19191919
##  [91]  2.27272727  2.35353535  2.43434343  2.51515152  2.59595960  2.67676768
##  [97]  2.75757576  2.83838384  2.91919192  3.00000000
## 
## $y
##   [1] 47.62206 47.82493 48.02438 48.22040 48.41299 48.60213 48.78783 48.97009
##   [9] 49.14889 49.32424 49.49612 49.66455 49.82950 49.99098 50.14898 50.30350
##  [17] 50.45453 50.60206 50.74608 50.88660 51.02361 51.15709 51.28705 51.41349
##  [25] 51.53639 51.65576 51.77162 51.88395 51.99275 52.09805 52.19982 52.29808
##  [33] 52.39283 52.48407 52.57180 52.65603 52.73677 52.81403 52.88780 52.95810
##  [41] 53.02494 53.08833 53.14826 53.20475 53.25781 53.30744 53.35366 53.39647
##  [49] 53.43589 53.47193 53.50461 53.53394 53.55993 53.58258 53.60192 53.61796
##  [57] 53.63071 53.64018 53.64640 53.64937 53.64912 53.64566 53.63901 53.62917
##  [65] 53.61618 53.60005 53.58079 53.55841 53.53295 53.50441 53.47281 53.43819
##  [73] 53.40054 53.35990 53.31628 53.26970 53.22018 53.16774 53.11239 53.05417
##  [81] 52.99308 52.92916 52.86241 52.79287 52.72055 52.64546 52.56765 52.48711
##  [89] 52.40389 52.31798 52.22942 52.13823 52.04442 51.94802 51.84905 51.74752
##  [97] 51.64346 51.53689 51.42783 51.31629
#For the response variable,"average diameter", let's try different transformation means using the lambda value
#lambda=3
ttcelldata$cbdiameter<-(ttcelldata$diameter)^3
ttcellplot2<-ggplot(ttcelldata,aes(x=conc,y=cbdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot2
## `geom_smooth()` using formula 'y ~ x'

#lambda=2
ttcelldata$sqdiameter<-(ttcelldata$diameter)^2
ttcellplot3<-ggplot(ttcelldata,aes(x=conc,y=sqdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot3
## `geom_smooth()` using formula 'y ~ x'

#lambda=1
ttcellplot4<-ggplot(ttcelldata,aes(x=conc,y=diameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot4
## `geom_smooth()` using formula 'y ~ x'

#lambda=0.5
ttcelldata$sqrdiameter<-(ttcelldata$diameter)^0.5
ttcellplot5<-ggplot(ttcelldata,aes(x=conc,y=sqrdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot5
## `geom_smooth()` using formula 'y ~ x'

#lambda=0
ttcelldata$logdiameter<-log(ttcelldata$diameter)
ttcellplot6<-ggplot(ttcelldata,aes(x=conc,y=logdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot6
## `geom_smooth()` using formula 'y ~ x'

#lambda=-0.5
ttcelldata$invsqrdiameter<-(ttcelldata$diameter)^-(0.5)
ttcellplot7<-ggplot(ttcelldata,aes(x=conc,y=invsqrdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot7
## `geom_smooth()` using formula 'y ~ x'

#lambda=-1
ttcelldata$invsdiameter<-(ttcelldata$diameter)^-1
ttcellplot8<-ggplot(ttcelldata,aes(x=conc,y=invsdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot8
## `geom_smooth()` using formula 'y ~ x'

#lambda=-2
ttcelldata$invssqdiameter<-(ttcelldata$diameter)^-2
ttcellplot9<-ggplot(ttcelldata,aes(x=conc,y=invssqdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot9
## `geom_smooth()` using formula 'y ~ x'

#lambda=-3
ttcelldata$invscbdiameter<-(ttcelldata$diameter)^-3
ttcellplot10<-ggplot(ttcelldata,aes(x=conc,y=invscbdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot10
## `geom_smooth()` using formula 'y ~ x'

#Lambda values of -3,-2,-1,-0.5,0,0.5,1,2,3 do NOT seem to change the distribution of the response around the fitted line.
#Thus, transformation of the response variable might not be ncessary.
#Next
#Let's check for the transformation procedure for the explanatory variable
transcheck2<-boxTidwell(diameter~conc, data=ttcelldata)   #Box Cox analysis
transcheck2
##  MLE of lambda Score Statistic (z)  Pr(>|z|)    
##      -0.039951              5.7582 8.501e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## iterations =  5
#The MLE of lambda from the boxTiidwell procedure (-0.0399), close to "0" suggests that a log transformation of the explanatory variable might be necessary
#Log transform the "conc" and name the new colunm as "logconc"
ttcelldata$logconc<-log(ttcelldata$conc)
#Now, replot the scatter plot of the original "average diameter vs the log(conc)"
ttcellplot11<-ggplot(ttcelldata,aes(x=logconc,y=diameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot11
## `geom_smooth()` using formula 'y ~ x'

#Let's fit a linear model to diameter vs logconc
ttcellmodel<-lm(diameter~logconc, data=ttcelldata)
summary(ttcellmodel)
## 
## Call:
## lm(formula = diameter ~ logconc, data = ttcelldata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0739 -0.7417  0.2351  0.6923  1.4475 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.03015    1.06614   34.73   <2e-16 ***
## logconc     -1.24437    0.09395  -13.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8593 on 49 degrees of freedom
## Multiple R-squared:  0.7817, Adjusted R-squared:  0.7772 
## F-statistic: 175.4 on 1 and 49 DF,  p-value: < 2.2e-16
plot(ttcellmodel)

#Plots of a separate line of best fit for each glucose group
ttcellplot3<-ggplot(ttcelldata,aes(x=logconc,y=diameter, color=glucoseF))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot3
## `geom_smooth()` using formula 'y ~ x'

ttcellconf<-confint(ttcellmodel)
ttcellconf
##                 2.5 %    97.5 %
## (Intercept) 34.887659 39.172637
## logconc     -1.433179 -1.055561
#Confidence interval on the mean of average diameter
x<-log(100000)
C.I<-predict(ttcellmodel,newdata=data.frame(logconc=x),interval=c("confidence"),level=.95)
(C.I)
##        fit      lwr      upr
## 1 22.70381 22.45785 22.94976
#Prediction interval on the true average diameter
P.I<-predict(ttcellmodel,newdata=data.frame(logconc=x),interval=c("prediction"),level=.95)
(P.I)
##        fit      lwr      upr
## 1 22.70381 20.95948 24.44813