Bootstrap Exploration

This is the bootstrap distribution of the difference in proportions (teens minus adults) for the "Use Text Messages by Age" dataset. Because the bootstrap distribution is fairly symmetric, we can summarize it with the percentile method, the 2*SE method, or the entire distribution.

# Encode each group as a 0/1 vector (all zeros first, then all ones), so that
# mean() of the vector equals the group's sample proportion of ones.
# NOTE(review): presumably 1 = "uses text messages" -- confirm against the
# data source.

# Teen group: 800 observations, 696 of them coded 1.
zeros.teen <- numeric(800 - 696)
ones.teen <- rep(1, times = 696)
teen <- c(zeros.teen, ones.teen)

# Adult group: 2252 observations, 1621 of them coded 1.
zeros.adult <- numeric(2252 - 1621)
ones.adult <- rep(1, times = 1621)
adult <- c(zeros.adult, ones.adult)

# Sample proportion of ones in each group.
mean(teen)

## [1] 0.87

mean(adult)

## [1] 0.7198046

# Observed difference in proportions (teen minus adult).
diff <- mean(teen) - mean(adult)
diff

## [1] 0.1501954

## [1] 0.1501954

# Bootstrap the difference in sample proportions (teen minus adult).
# Each replicate resamples both 0/1 vectors with replacement, at their
# original lengths, and stores the difference of the two resampled means.
# No seed is set in this block, so the histogram varies from run to run.
N <- 10^4
diff.prop <- numeric(N)
for (i in seq_len(N)) {
  # Resample sizes are taken from the data so they cannot drift out of sync
  # with the group counts used to build `teen` and `adult`.
  teen.prop <- sample(teen, size = length(teen), replace = TRUE)
  adult.prop <- sample(adult, size = length(adult), replace = TRUE)
  diff.prop[i] <- mean(teen.prop) - mean(adult.prop)
}
hist(diff.prop)

This is the bootstrap distribution of the difference in proportions (smokers minus non-smokers) for the "Get Pregnant (by smoking status)" dataset. Because the bootstrap distribution is fairly symmetric, we can summarize it with the percentile method, the 2*SE method, or the entire distribution.

library(ggplot2)
library(readxl)

# Encode each group as a 0/1 vector (all zeros first, then all ones), so that
# mean() of the vector equals the group's sample proportion of ones.
# NOTE(review): presumably 1 = "became pregnant" -- confirm against the data
# source.

# Smoker group: 135 observations, 38 of them coded 1.
zeros.smoker <- numeric(135 - 38)
ones.smoker <- rep(1, times = 38)
smoker <- c(zeros.smoker, ones.smoker)

# Non-smoker group: 543 observations, 206 of them coded 1.
zeros.non <- numeric(543 - 206)
ones.non <- rep(1, times = 206)
non <- c(zeros.non, ones.non)

# Sample proportion of ones in each group.
mean(smoker)

## [1] 0.2814815

mean(non)

## [1] 0.3793738

# Observed difference in proportions (smoker minus non-smoker).
diff <- mean(smoker) - mean(non)
diff

## [1] -0.09789237

## [1] -0.09789237

# Bootstrap the difference in sample proportions (smoker minus non-smoker).
# Each replicate resamples both 0/1 vectors with replacement, at their
# original lengths, and stores the difference of the two resampled means.
# No seed is set in this block, so the histogram varies from run to run.
N <- 10^4
diff.prop <- numeric(N)
for (i in seq_len(N)) {
  # Resample sizes are taken from the data so they cannot drift out of sync
  # with the group counts used to build `smoker` and `non`.
  smoke.prop <- sample(smoker, size = length(smoker), replace = TRUE)
  non.prop <- sample(non, size = length(non), replace = TRUE)
  diff.prop[i] <- mean(smoke.prop) - mean(non.prop)
}
hist(diff.prop)

The bootstrap distribution of the correlation between Bill and Tip for the Restaurant Tips data is computed below. Because this distribution is heavily skewed, our best course of action is to give the entire distribution as a summary.

library(readr)
# Load the restaurant tips data from CSV.
# NOTE(review): the path points into one user's home directory, so this
# script only runs where that file exists -- consider a project-relative path.
restaurantTipsBill <- read_csv(
  file = "~/Desktop/M375T/restaurantTipsBill.csv"
)

## Parsed with column specification:
## cols(
##   Bill = col_double(),
##   Tip = col_double(),
##   Credit = col_character(),
##   Guests = col_double(),
##   Day = col_character(),
##   Server = col_character(),
##   PctTip = col_double()
## )

# Bootstrap several statistics of the restaurant tips data by resampling rows
# with replacement. Each replicate records:
#   sd.boot, mean.boot, median.boot -- SD, mean and median of Bill
#   cor.boot                        -- correlation between Bill and Tip
#   alpha.boot, beta.boot           -- intercept and slope of lm(Tip ~ Bill)
#   yPred.boot                      -- fitted tip at Bill = 20
# No seed is set in this block, so results vary from run to run.
N <- 10000
sd.boot <- numeric(N)
mean.boot <- numeric(N)
median.boot <- numeric(N)
alpha.boot <- numeric(N)
beta.boot <- numeric(N)
yPred.boot <- numeric(N)
cor.boot <- numeric(N)

# Resample as many rows as the data actually has instead of a literal count.
n <- nrow(restaurantTipsBill)
for (i in seq_len(N)) {
  index <- sample.int(n, size = n, replace = TRUE)
  tips.boot <- restaurantTipsBill[index, ]
  # Bare column names make lm() resolve Tip and Bill through `data`.
  tips.lm <- lm(Tip ~ Bill, data = tips.boot)
  sd.boot[i] <- sd(tips.boot$Bill)
  # Filled alongside sd/median of Bill so mean.boot is not left as all zeros.
  mean.boot[i] <- mean(tips.boot$Bill)
  median.boot[i] <- median(tips.boot$Bill)
  cor.boot[i] <- cor(tips.boot$Bill, tips.boot$Tip)
  alpha.boot[i] <- coef(tips.lm)[["(Intercept)"]]
  beta.boot[i] <- coef(tips.lm)[["Bill"]]
  yPred.boot[i] <- alpha.boot[i] + beta.boot[i] * 20
}

hist(cor.boot)

Finding the beta.boot distribution (slope coefficient). In the regression of tip on bill, for every 100 dollar increase in the bill, the fitted line predicts an increase of about $18 in the tip.

# Fit the least-squares line of Tip on Bill to the ORIGINAL sample.
# `tips.boot` only holds whichever bootstrap resample was drawn last, so
# fitting on it would report a random resample's coefficients rather than the
# point estimate.
# NOTE(review): the summary output recorded below this chunk shows a call on
# `tips.boot`; it was produced from a resample and should be regenerated.
tips.lm <- lm(Tip ~ Bill, data = restaurantTipsBill)
summary(tips.lm)

## 
## Call:
## lm(formula = tips.boot$Tip ~ tips.boot$Bill, data = tips.boot)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4471 -0.5423 -0.0429  0.3859  3.9397 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -0.580611   0.149211  -3.891 0.000148 ***
## tips.boot$Bill  0.191714   0.005421  35.365  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.88 on 155 degrees of freedom
## Multiple R-squared:  0.8897, Adjusted R-squared:  0.889 
## F-statistic:  1251 on 1 and 155 DF,  p-value: < 2.2e-16

This is the bootstrap distribution for the slope coefficient. We can use the percentile method, the 2*SE method, and give the entire distribution as a summary since the skew is very small.

# Bootstrap the regression of Tip on Bill by resampling rows with replacement.
# Each replicate records the intercept (alpha.boot), the slope (beta.boot) and
# the fitted tip at Bill = 20 (yPred.boot); the slope distribution is plotted.
# Relies on `N` (number of replicates) already being defined.
# No seed is set in this block, so results vary from run to run.
alpha.boot <- numeric(N)
beta.boot <- numeric(N)
yPred.boot <- numeric(N)

# Resample as many rows as the data actually has instead of a literal count.
n <- nrow(restaurantTipsBill)
for (i in seq_len(N)) {
  index <- sample.int(n, size = n, replace = TRUE)
  tips.boot <- restaurantTipsBill[index, ]
  # Bare column names make lm() resolve Tip and Bill through `data`.
  tips.lm <- lm(Tip ~ Bill, data = tips.boot)
  alpha.boot[i] <- coef(tips.lm)[["(Intercept)"]]
  beta.boot[i] <- coef(tips.lm)[["Bill"]]
  yPred.boot[i] <- alpha.boot[i] + beta.boot[i] * 20
}

hist(beta.boot)