rm(list = ls())
library(readr)
library(ggplot2)
#weight and systolic blood pressure
setwd("c:/users/Paul/Documents/Rworksmsu")
ttcelldata<- read.csv(file="ttcelldata.csv")
#Let's check the structure of our data
str(ttcelldata)
## 'data.frame': 51 obs. of 4 variables:
## $ Obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ glucose : int 1 1 1 1 1 1 1 1 1 1 ...
## $ conc : int 631000 592000 563000 475000 461000 416000 385000 321000 302000 199000 ...
## $ diameter: num 21.2 21.5 21.3 21 21.5 21.3 20.3 22.7 21.5 22.2 ...
head(ttcelldata)
## Obs glucose conc diameter
## 1 1 1 631000 21.2
## 2 2 1 592000 21.5
## 3 3 1 563000 21.3
## 4 4 1 475000 21.0
## 5 5 1 461000 21.5
## 6 6 1 416000 21.3
ttcelldata$glucoseF<-as.factor(ttcelldata$glucose)
head(ttcelldata)
## Obs glucose conc diameter glucoseF
## 1 1 1 631000 21.2 1
## 2 2 1 592000 21.5 1
## 3 3 1 563000 21.3 1
## 4 4 1 475000 21.0 1
## 5 5 1 461000 21.5 1
## 6 6 1 416000 21.3 1
#Let's use ggplot to visualiaze the scatter plot bewtween average cell diameter and cell concentration
ttcellplot1<-ggplot(ttcelldata,aes(x=conc,y=diameter))+geom_point()
ttcellplot1

#Let's try transforming the response variable (average diameter), but before then, let's look at the lambda value
library(MASS) # need the MASS library for Box-Cox procedure
library(car) #need the car package for boxTidwell procedure
## Loading required package: carData
transcheck1<-boxcox(diameter~conc, data=ttcelldata,lambda=seq(-5,3,length=10)) #Box Cox analysis

transcheck1
## $x
## [1] -5.00000000 -4.91919192 -4.83838384 -4.75757576 -4.67676768 -4.59595960
## [7] -4.51515152 -4.43434343 -4.35353535 -4.27272727 -4.19191919 -4.11111111
## [13] -4.03030303 -3.94949495 -3.86868687 -3.78787879 -3.70707071 -3.62626263
## [19] -3.54545455 -3.46464646 -3.38383838 -3.30303030 -3.22222222 -3.14141414
## [25] -3.06060606 -2.97979798 -2.89898990 -2.81818182 -2.73737374 -2.65656566
## [31] -2.57575758 -2.49494949 -2.41414141 -2.33333333 -2.25252525 -2.17171717
## [37] -2.09090909 -2.01010101 -1.92929293 -1.84848485 -1.76767677 -1.68686869
## [43] -1.60606061 -1.52525253 -1.44444444 -1.36363636 -1.28282828 -1.20202020
## [49] -1.12121212 -1.04040404 -0.95959596 -0.87878788 -0.79797980 -0.71717172
## [55] -0.63636364 -0.55555556 -0.47474747 -0.39393939 -0.31313131 -0.23232323
## [61] -0.15151515 -0.07070707 0.01010101 0.09090909 0.17171717 0.25252525
## [67] 0.33333333 0.41414141 0.49494949 0.57575758 0.65656566 0.73737374
## [73] 0.81818182 0.89898990 0.97979798 1.06060606 1.14141414 1.22222222
## [79] 1.30303030 1.38383838 1.46464646 1.54545455 1.62626263 1.70707071
## [85] 1.78787879 1.86868687 1.94949495 2.03030303 2.11111111 2.19191919
## [91] 2.27272727 2.35353535 2.43434343 2.51515152 2.59595960 2.67676768
## [97] 2.75757576 2.83838384 2.91919192 3.00000000
##
## $y
## [1] 47.62206 47.82493 48.02438 48.22040 48.41299 48.60213 48.78783 48.97009
## [9] 49.14889 49.32424 49.49612 49.66455 49.82950 49.99098 50.14898 50.30350
## [17] 50.45453 50.60206 50.74608 50.88660 51.02361 51.15709 51.28705 51.41349
## [25] 51.53639 51.65576 51.77162 51.88395 51.99275 52.09805 52.19982 52.29808
## [33] 52.39283 52.48407 52.57180 52.65603 52.73677 52.81403 52.88780 52.95810
## [41] 53.02494 53.08833 53.14826 53.20475 53.25781 53.30744 53.35366 53.39647
## [49] 53.43589 53.47193 53.50461 53.53394 53.55993 53.58258 53.60192 53.61796
## [57] 53.63071 53.64018 53.64640 53.64937 53.64912 53.64566 53.63901 53.62917
## [65] 53.61618 53.60005 53.58079 53.55841 53.53295 53.50441 53.47281 53.43819
## [73] 53.40054 53.35990 53.31628 53.26970 53.22018 53.16774 53.11239 53.05417
## [81] 52.99308 52.92916 52.86241 52.79287 52.72055 52.64546 52.56765 52.48711
## [89] 52.40389 52.31798 52.22942 52.13823 52.04442 51.94802 51.84905 51.74752
## [97] 51.64346 51.53689 51.42783 51.31629
#For the response variable,"average diameter", let's try different transformation means using the lambda value
#lambda=3
ttcelldata$cbdiameter<-(ttcelldata$diameter)^3
ttcellplot2<-ggplot(ttcelldata,aes(x=conc,y=cbdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot2
## `geom_smooth()` using formula 'y ~ x'

#lambda=2
ttcelldata$sqdiameter<-(ttcelldata$diameter)^2
ttcellplot3<-ggplot(ttcelldata,aes(x=conc,y=sqdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot3
## `geom_smooth()` using formula 'y ~ x'

#lambda=1
ttcellplot4<-ggplot(ttcelldata,aes(x=conc,y=diameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot4
## `geom_smooth()` using formula 'y ~ x'

#lambda=0.5
ttcelldata$sqrdiameter<-(ttcelldata$diameter)^0.5
ttcellplot5<-ggplot(ttcelldata,aes(x=conc,y=sqrdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot5
## `geom_smooth()` using formula 'y ~ x'

#lambda=0
ttcelldata$logdiameter<-log(ttcelldata$diameter)
ttcellplot6<-ggplot(ttcelldata,aes(x=conc,y=logdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot6
## `geom_smooth()` using formula 'y ~ x'

#lambda=-0.5
ttcelldata$invsqrdiameter<-(ttcelldata$diameter)^-(0.5)
ttcellplot7<-ggplot(ttcelldata,aes(x=conc,y=invsqrdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot7
## `geom_smooth()` using formula 'y ~ x'

#lambda=-1
ttcelldata$invsdiameter<-(ttcelldata$diameter)^-1
ttcellplot8<-ggplot(ttcelldata,aes(x=conc,y=invsdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot8
## `geom_smooth()` using formula 'y ~ x'

#lambda=-2
ttcelldata$invssqdiameter<-(ttcelldata$diameter)^-2
ttcellplot9<-ggplot(ttcelldata,aes(x=conc,y=invssqdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot9
## `geom_smooth()` using formula 'y ~ x'

#lambda=-3
ttcelldata$invscbdiameter<-(ttcelldata$diameter)^-3
ttcellplot10<-ggplot(ttcelldata,aes(x=conc,y=invscbdiameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot10
## `geom_smooth()` using formula 'y ~ x'

#Lambda values of -3,-2,-1,-0.5,0,0.5,1,2,3 do NOT seem to change the distribution of the response around the fitted line.
#Thus, transformation of the response variable might not be ncessary.
#Next
#Let's check for the transformation procedure for the explanatory variable
transcheck2<-boxTidwell(diameter~conc, data=ttcelldata) #Box Cox analysis
transcheck2
## MLE of lambda Score Statistic (z) Pr(>|z|)
## -0.039951 5.7582 8.501e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## iterations = 5
#The MLE of lambda from the boxTiidwell procedure (-0.0399), close to "0" suggests that a log transformation of the explanatory variable might be necessary
#Log transform the "conc" and name the new colunm as "logconc"
ttcelldata$logconc<-log(ttcelldata$conc)
#Now, replot the scatter plot of the original "average diameter vs the log(conc)"
ttcellplot11<-ggplot(ttcelldata,aes(x=logconc,y=diameter))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot11
## `geom_smooth()` using formula 'y ~ x'

#Let's fit a linear model to diameter vs logconc
ttcellmodel<-lm(diameter~logconc, data=ttcelldata)
summary(ttcellmodel)
##
## Call:
## lm(formula = diameter ~ logconc, data = ttcelldata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0739 -0.7417 0.2351 0.6923 1.4475
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.03015 1.06614 34.73 <2e-16 ***
## logconc -1.24437 0.09395 -13.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8593 on 49 degrees of freedom
## Multiple R-squared: 0.7817, Adjusted R-squared: 0.7772
## F-statistic: 175.4 on 1 and 49 DF, p-value: < 2.2e-16
plot(ttcellmodel)




#Plots of a separate line of best fit for each glucose group
ttcellplot3<-ggplot(ttcelldata,aes(x=logconc,y=diameter, color=glucoseF))+geom_point()+geom_smooth(method="lm",se=FALSE)
ttcellplot3
## `geom_smooth()` using formula 'y ~ x'

ttcellconf<-confint(ttcellmodel)
ttcellconf
## 2.5 % 97.5 %
## (Intercept) 34.887659 39.172637
## logconc -1.433179 -1.055561
#Confidence interval on the mean of average diameter
x<-log(100000)
C.I<-predict(ttcellmodel,newdata=data.frame(logconc=x),interval=c("confidence"),level=.95)
(C.I)
## fit lwr upr
## 1 22.70381 22.45785 22.94976
#Prediction interval on the true average diameter
P.I<-predict(ttcellmodel,newdata=data.frame(logconc=x),interval=c("prediction"),level=.95)
(P.I)
## fit lwr upr
## 1 22.70381 20.95948 24.44813