Use Text Messages (by age)

n <- 800 - 696
zeros.teen <- rep(0, n)
n <- 696
ones.teen <- rep(1, n)
n <- 2252 - 1621
zeros.adult <- rep(0, n)
n <- 1621
ones.adult <- rep(1, n)
teen <- c(zeros.teen, ones.teen)
adult <- c(zeros.adult, ones.adult)
mean(teen)

## [1] 0.87

mean(adult)

## [1] 0.7198046

p = mean(teen) - mean(adult)

distribution = replicate(10000,mean(sample(teen, 200, replace = T)) - mean(sample(adult, 200, replace = T)))
mean(distribution)

## [1] 0.1498115

sd(distribution)

## [1] 0.03966704

hist(distribution, main ="Use Text Messages (by age)")

#Check with Normal Theory
sqrt(p*(1-p)/200)

## [1] 0.0252623

biasBoot = mean(distribution)-p
biasBoot/sd(distribution)

## [1] -0.009677603

quantile(distribution, c(.025,.975))

##  2.5% 97.5% 
##  0.07  0.23

Lowerbound = mean(distribution) - 2*sd(distribution)
Upperbound = mean(distribution) + 2*sd(distribution)
Lowerbound

## [1] 0.07047741

Upperbound

## [1] 0.2291456

Our bootstrap of the differences in proportions in text message use by age produced a bell-shaped distribution centered well within two standard deviations of the sample proportions by either measure. Either method of is sufficient to measure the bootstrap.

Get Pregnant (by Smoker Status)

n <- 135 - 38
zeros.smoker <- rep(0, n)
n <- 38
ones.smoker <- rep(1, n)
n <- 543 - 206
zeros.nonsmoker <- rep(0, n)
n <- 206
ones.nonsmoker <- rep(1, n)
smoker <- c(zeros.smoker, ones.smoker)
nonsmoker <- c(zeros.nonsmoker, ones.nonsmoker)
mean(smoker)

## [1] 0.2814815

mean(nonsmoker)

## [1] 0.3793738

p = mean(smoker) - mean(nonsmoker)

distribution = replicate(10000,mean(sample(smoker, 50, replace = T)) - mean(sample(nonsmoker, 50, replace = T)))
mean(distribution)

## [1] -0.096438

sd(distribution)

## [1] 0.09365975

hist(distribution, main="Get Pregnant (by smoking staus)")

#Check with Normal Theory
sqrt(abs(p*(1-p))/50)

## [1] 0.04636276

biasBoot = mean(distribution)-p
biasBoot/sd(distribution)

## [1] 0.0155282

quantile(distribution, c(.025,.975))

##  2.5% 97.5% 
## -0.28  0.08

Lowerbound = mean(distribution) - 2*sd(distribution)
Upperbound = mean(distribution) + 2*sd(distribution)
Lowerbound

## [1] -0.2837575

Upperbound

## [1] 0.09088151

The Pregnant smoker distribution is similar to the Text message distribution, except that it lost some of its bell-shape, due in part to the smaller sample replacements. This decision was made because the orginal sample sizes of the proportions is significantly smaller. Nevertheless, the distribution does fit around the sample proportion difference with minimal bias. Either method is useful.

Restaurant Tips - Correlation

Create Bootstrap

Rest = read.csv("C:/Users/Will/OneDrive/Documents/School/375T Predictive Analytics/HW2/data/restaurantTipsBill.csv")
attach(Rest)
N = 10^4
Tips = Rest$Tip
Bills = Rest$Bill

SumSqTips = mean((Tips - mean(Tips))^2)
SumSqBills = mean((Bills - mean(Bills))^2)
r2 = summary(lm(Tips ~ Bills))$r.squared
bHat = sqrt(r2)*(SumSqTips/SumSqBills)

cor.boot = numeric(N)
beta.boot = numeric(N)
alpha.boot = numeric(N)
yPred.boot = numeric(N)
n = length(Bills)

for (i in 1:N)
{
 index <- sample(n, replace = TRUE) # Sample rows from 1 to 25
 Rest.boot <- Rest[index, ] #resample data
    cor.boot[i] <- cor(Rest.boot$Bill, Rest.boot$Tip)
    
    r = summary(lm(Rest.boot$Tip ~ Rest.boot$Bill))$r.squared #find r^2 of sample
    SumSqTips2 = mean((Rest.boot$Tip - mean(Rest.boot$Tip))^2) #sum of squares sample tips
    SumSqBills2 = mean((Rest.boot$Bill - mean(Rest.boot$Bill))^2)  #sum of squares sample bills
    beta.boot[i] = sqrt(r)*(SumSqTips2/SumSqBills2) #sample beta
}

Correlation

hist(cor.boot, main="Distributation of the Bootstrap Correlation")

mean(cor.boot)

## [1] 0.9153987

sd(cor.boot)

## [1] 0.02499295

biasBoot = mean(cor.boot)-cor(Bills, Tips)
biasBoot/sd(cor.boot)

## [1] 0.01358399

quantile(cor.boot, c(.025,.975))

##      2.5%     97.5% 
## 0.8592791 0.9556681

Lowerbound = mean(cor.boot) - 2*sd(cor.boot)
Upperbound = mean(cor.boot) + 2*sd(cor.boot)
Lowerbound

## [1] 0.8654128

Upperbound

## [1] 0.9653846

The distribution of the boostrap correlation is visibily left-skewed, but not so signficantly that the bounds of the percentile method or 2*SE method are very different from one another. Either method is probably okay, but the percentile method is preferred given the assymetry.

Slope Coefficient

hist(beta.boot, main="Distributation of the Bootstrap Slope Coefficient")

mean(beta.boot)

## [1] 0.03639352

sd(beta.boot)

## [1] 0.003620137

biasBoot = mean(beta.boot)-bHat
biasBoot/sd(beta.boot)

## [1] 0.0301912

quantile(beta.boot, c(.025,.975))

##       2.5%      97.5% 
## 0.03020086 0.04416887

Lowerbound = mean(beta.boot) - 2*sd(beta.boot)
Upperbound = mean(beta.boot) + 2*sd(beta.boot)
Lowerbound

## [1] 0.02915324

Upperbound

## [1] 0.04363379

Our assessment of the distribution of the bootstrap slop coefficient is the same as the correlation, with the only exception being that the distribution is right-skewed. Either method will work, but the percentile method is preferred.

HW2 Bootstrap