R tutorial Part 2

Hypothesis test

The BodyTemp50 data were collected to see whether the mean human body temperature differs from 98.6°F. Using this sample, test whether there is evidence that the mean body temperature is different from 98.6°F.

\(H_0: \mu = 98.6\)

\(H_A: \mu \neq 98.6\)

# One-time setup: if the Lock5Data package is not installed yet, uncomment
# the next line (delete the leading "# ") and run it once.
# install.packages("Lock5Data")
library(Lock5Data)

## Warning: package 'Lock5Data' was built under R version 3.5.2

# Load the BodyTemp50 data set into the workspace and preview its first rows.
data("BodyTemp50")
head(BodyTemp50, n = 6L)

##   BodyTemp Pulse Gender
## 1     97.6    69      0
## 2     99.4    77      1
## 3     99.0    75      0
## 4     98.8    84      1
## 5     98.0    71      0
## 6     98.9    76      1

# Sample size: the number of rows in the data set.
nrow(BodyTemp50)

## [1] 50

# Sample mean, looking the column up inside the data frame with with() ...
with(BodyTemp50, mean(BodyTemp))

## [1] 98.26

# ... and the same sample mean using direct column extraction.
mean(BodyTemp50$BodyTemp)

## [1] 98.26

# Sample standard deviation of the body temperatures.
sd(BodyTemp50$BodyTemp)

## [1] 0.7653197

# Two-sided one-sample t test of H0: mu = 98.6 against HA: mu != 98.6.
t.test(
  BodyTemp50$BodyTemp,
  mu = 98.6,
  alternative = "two.sided"
)

## 
##  One Sample t-test
## 
## data:  BodyTemp50$BodyTemp
## t = -3.1414, df = 49, p-value = 0.002851
## alternative hypothesis: true mean is not equal to 98.6
## 95 percent confidence interval:
##  98.0425 98.4775
## sample estimates:
## mean of x 
##     98.26

# One-sided (lower-tail) version: HA is that the true mean is below 98.6.
t.test(
  BodyTemp50$BodyTemp,
  mu = 98.6,
  alternative = "less"
)

## 
##  One Sample t-test
## 
## data:  BodyTemp50$BodyTemp
## t = -3.1414, df = 49, p-value = 0.001425
## alternative hypothesis: true mean is less than 98.6
## 95 percent confidence interval:
##      -Inf 98.44146
## sample estimates:
## mean of x 
##     98.26

# One-sided (upper-tail) version: HA is that the true mean is above 98.6.
t.test(
  BodyTemp50$BodyTemp,
  mu = 98.6,
  alternative = "greater"
)

## 
##  One Sample t-test
## 
## data:  BodyTemp50$BodyTemp
## t = -3.1414, df = 49, p-value = 0.9986
## alternative hypothesis: true mean is greater than 98.6
## 95 percent confidence interval:
##  98.07854      Inf
## sample estimates:
## mean of x 
##     98.26

Linear regression

Course Example

# Course example data: four paired (x, y) observations.
x <- c(4.2, 3.1, 5.7, 2.9)
y <- c(7.1, 4.2, 10, 4.1)

# Simple linear regression of y on x, followed by the coefficient table.
fit <- lm(y ~ x)
summary(fit)

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##        1        2        3        4 
##  0.26043 -0.24613 -0.10334  0.08903 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -2.2990     0.5031  -4.569  0.04471 * 
## x             2.1758     0.1219  17.851  0.00312 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2711 on 2 degrees of freedom
## Multiple R-squared:  0.9938, Adjusted R-squared:  0.9906 
## F-statistic: 318.7 on 1 and 2 DF,  p-value: 0.003124

# Regression diagnostic plots for the fitted model.
plot(fit)

# Scatter plot of the observations with the fitted regression line in blue.
plot(x, y)
abline(reg = fit, col = "blue")

Let’s do prediction

# Grid of new x values at which to predict.
# NOTE(review): the x values used to fit the model run from 2.9 to 5.7, so
# most of this grid extrapolates beyond the data; confirm that is intended.
new <- data.frame(x = seq(-3, 3, 0.5))

# With se.fit = TRUE, predict() returns a list; its $fit element is a matrix
# with columns fit, lwr and upr (point prediction and prediction limits).
prediction <- predict(fit, new, se.fit = TRUE, interval = "prediction")
Data <- data.frame(new, prediction$fit)

# Size the y-axis from the interval limits; if it were sized from the fitted
# values alone, the red interval lines would be clipped at both ends.
plot(Data$x, Data$fit, ylim = range(Data$lwr, Data$upr))
abline(fit)
## add prediction interval
lines(Data$x, Data$lwr, col = "red")
lines(Data$x, Data$upr, col = "red")

Another example

# One-time setup: if the alr4 package is not installed yet, uncomment the
# next line (delete the leading "# ") and run it once.
# install.packages("alr4")
library("alr4")

## Warning: package 'alr4' was built under R version 3.5.3

## Loading required package: car

## Loading required package: carData

## Loading required package: effects

## Warning: package 'effects' was built under R version 3.5.3

## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

# Preview the first rows of the fuel2001 data set.
head(fuel2001, n = 6L)

##     Drivers    FuelC Income  Miles      MPC      Pop  Tax
## AL  3559897  2382507  23471  94440 12737.00  3451586 18.0
## AK   472211   235400  30064  13628  7639.16   457728  8.0
## AZ  3550367  2428430  25578  55245  9411.55  3907526 18.0
## AR  1961883  1358174  22257  98132 11268.40  2072622 21.7
## CA 21623793 14691753  32275 168771  8923.89 25599275 18.0
## CO  3287922  2048664  32949  85854  9722.73  3322455 22.0

# Scatter-plot matrix of selected variables, with Miles on the log scale.
pairs(~ Tax + Income + log(Miles) + FuelC, data = fuel2001)

# Scatter-plot matrix of every column in the data set.
pairs(~ ., data = fuel2001)

# Full model: regress FuelC on all other columns of fuel2001.
# This reuses the name `fit`, replacing the course-example model above.
fit <- lm(FuelC ~ ., data = fuel2001)
summary(fit)

## 
## Call:
## lm(formula = FuelC ~ ., data = fuel2001)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1480910  -158802    19267   174208  1090089 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.902e+05  8.199e+05  -0.598 0.552983    
## Drivers      6.368e-01  1.452e-01   4.386 7.09e-05 ***
## Income       7.690e+00  1.632e+01   0.471 0.639793    
## Miles        5.850e+00  1.621e+00   3.608 0.000784 ***
## MPC          4.562e+01  3.565e+01   1.280 0.207337    
## Pop         -1.945e-02  1.245e-01  -0.156 0.876586    
## Tax         -2.087e+04  1.324e+04  -1.576 0.122235    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 398400 on 44 degrees of freedom
## Multiple R-squared:  0.9808, Adjusted R-squared:  0.9782 
## F-statistic: 374.6 on 6 and 44 DF,  p-value: < 2.2e-16

# Reduced model: keep only the Drivers and Miles predictors.
fit1 <- lm(FuelC ~ Drivers + Miles, data = fuel2001)
summary(fit1)

## 
## Call:
## lm(formula = FuelC ~ Drivers + Miles, data = fuel2001)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1667584  -207441    63143   156912  1055581 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.207e+05  1.016e+05  -2.173 0.034783 *  
## Drivers      6.121e-01  1.938e-02  31.578  < 2e-16 ***
## Miles        6.041e+00  1.460e+00   4.137 0.000141 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 406800 on 48 degrees of freedom
## Multiple R-squared:  0.9782, Adjusted R-squared:  0.9773 
## F-statistic:  1075 on 2 and 48 DF,  p-value: < 2.2e-16

# Regression diagnostic plots for the reduced model fit1.
plot(fit1)

Homework

9.2.2

library(readxl)
# Read the exercise 9.2.2 spreadsheet. The path below is a user-specific
# absolute path; change it to wherever your copy of the file is stored.
cell <- read_excel(
  path = "C:/Users/zitao_000/Dropbox/homework/teaching/data/ds9.2.2-endothelial-cell-adherence.xls"
)
head(cell)

## # A tibble: 6 x 3
##   Sample `Type A` `Type B`
##    <dbl>    <dbl>    <dbl>
## 1      1      127      129
## 2      2      133      133
## 3      3      127      127
## 4      4      116      122
## 5      5      132      131
## 6      6      126      125

## Paired t test: Type A and Type B are compared within each row of `cell`.
# Side-by-side boxplots, labelled so the two groups can be told apart
# (without names= the boxes are only labelled "1" and "2").
boxplot(cell$`Type A`, cell$`Type B`, names = c("Type A", "Type B"))

# Row-wise differences; the paired test is a one-sample test on these values.
z <- cell$`Type A` - cell$`Type B`
z.bar <- mean(z)
# NOTE: despite its name, this is the sample standard deviation of the
# differences, not a pooled variance. The name is kept because later code
# in this tutorial reads `pool.sample.variance`.
pool.sample.variance <- sd(z)
pool.sample.variance

## [1] 6.084117

hypothesis test

\(H_0: \mu_z=\mu_A-\mu_B=0\)

\(H_A: \mu_z=\mu_A-\mu_B\neq0\)

# Number of paired differences, taken from the data instead of hard-coding 14
# so the calculation stays correct if the data change.
n <- length(z)
# One-sample t statistic for H0: mean difference = 0, i.e.
# z.bar / (sd of differences / sqrt(n)).
# NOTE: the name `t` is shared with base R's transpose function t(); it is
# kept because later code in this tutorial reads `t`.
t <- (sqrt(n) * z.bar) / pool.sample.variance
t

## [1] -0.8346262

## Two-sided p-value
# Double the lower-tail probability at -|t| so the result is valid whatever
# the sign of t; doubling pt(t, ...) directly exceeds 1 when t is positive.
2 * pt(-abs(t), df = n - 1)

## [1] 0.4190021

## Confidence-interval approach: check whether 0 lies inside the 95% CI
# Lower 2.5% quantile of the t distribution with n - 1 degrees of freedom.
t.score <- qt(p = 0.025, df = n - 1)
t.score

## [1] -2.160369

# Margin of error. abs() keeps it non-negative whether t.score holds the
# lower (negative) or the upper (positive) critical value, so the endpoints
# below always come out in (lower, upper) order.
moe <- abs(t.score) * pool.sample.variance / sqrt(n)
left.pt <- z.bar - moe
right.pt <- z.bar + moe
ci <- c(left.pt, right.pt)
ci

## [1] -4.870008  2.155722

### Quick way
## First way: paired t test on the two columns
t.test(
  cell$`Type A`,
  cell$`Type B`,
  paired = TRUE
)

## 
##  Paired t-test
## 
## data:  cell$`Type A` and cell$`Type B`
## t = -0.83463, df = 13, p-value = 0.419
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.870008  2.155722
## sample estimates:
## mean of the differences 
##               -1.357143

## Second way: one-sample t test on the row-wise differences
z <- cell$`Type A` - cell$`Type B`
t.test(z, mu = 0)

## 
##  One Sample t-test
## 
## data:  z
## t = -0.83463, df = 13, p-value = 0.419
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -4.870008  2.155722
## sample estimates:
## mean of x 
## -1.357143

## Caution: without paired = TRUE, t.test() treats the two columns as
## independent samples and runs a Welch two-sample test instead.
t.test(cell$`Type A`, cell$`Type B`, paired = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  cell$`Type A` and cell$`Type B`
## t = -0.50165, df = 24.134, p-value = 0.6205
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.939125  4.224840
## sample estimates:
## mean of x mean of y 
##  128.3571  129.7143