n <- 800 - 696
zeros.teen <- rep(0, n)
n <- 696
ones.teen <- rep(1, n)
n <- 2252 - 1621
zeros.adult <- rep(0, n)
n <- 1621
ones.adult <- rep(1, n)
teen <- c(zeros.teen, ones.teen)
adult <- c(zeros.adult, ones.adult)
mean(teen)
## [1] 0.87
mean(adult)
## [1] 0.7198046
p = mean(teen) - mean(adult)
distribution = replicate(10000,mean(sample(teen, 200, replace = T)) - mean(sample(adult, 200, replace = T)))
mean(distribution)
## [1] 0.1498115
sd(distribution)
## [1] 0.03966704
hist(distribution, main ="Use Text Messages (by age)")
#Check with Normal Theory
sqrt(p*(1-p)/200)
## [1] 0.0252623
biasBoot = mean(distribution)-p
biasBoot/sd(distribution)
## [1] -0.009677603
quantile(distribution, c(.025,.975))
## 2.5% 97.5%
## 0.07 0.23
Lowerbound = mean(distribution) - 2*sd(distribution)
Upperbound = mean(distribution) + 2*sd(distribution)
Lowerbound
## [1] 0.07047741
Upperbound
## [1] 0.2291456
Our bootstrap of the differences in proportions in text message use by age produced a bell-shaped distribution centered well within two standard deviations of the sample proportions by either measure. Either method of is sufficient to measure the bootstrap.
n <- 135 - 38
zeros.smoker <- rep(0, n)
n <- 38
ones.smoker <- rep(1, n)
n <- 543 - 206
zeros.nonsmoker <- rep(0, n)
n <- 206
ones.nonsmoker <- rep(1, n)
smoker <- c(zeros.smoker, ones.smoker)
nonsmoker <- c(zeros.nonsmoker, ones.nonsmoker)
mean(smoker)
## [1] 0.2814815
mean(nonsmoker)
## [1] 0.3793738
p = mean(smoker) - mean(nonsmoker)
distribution = replicate(10000,mean(sample(smoker, 50, replace = T)) - mean(sample(nonsmoker, 50, replace = T)))
mean(distribution)
## [1] -0.096438
sd(distribution)
## [1] 0.09365975
hist(distribution, main="Get Pregnant (by smoking staus)")
#Check with Normal Theory
sqrt(abs(p*(1-p))/50)
## [1] 0.04636276
biasBoot = mean(distribution)-p
biasBoot/sd(distribution)
## [1] 0.0155282
quantile(distribution, c(.025,.975))
## 2.5% 97.5%
## -0.28 0.08
Lowerbound = mean(distribution) - 2*sd(distribution)
Upperbound = mean(distribution) + 2*sd(distribution)
Lowerbound
## [1] -0.2837575
Upperbound
## [1] 0.09088151
The Pregnant smoker distribution is similar to the Text message distribution, except that it lost some of its bell-shape, due in part to the smaller sample replacements. This decision was made because the orginal sample sizes of the proportions is significantly smaller. Nevertheless, the distribution does fit around the sample proportion difference with minimal bias. Either method is useful.
Rest = read.csv("C:/Users/Will/OneDrive/Documents/School/375T Predictive Analytics/HW2/data/restaurantTipsBill.csv")
attach(Rest)
N = 10^4
Tips = Rest$Tip
Bills = Rest$Bill
SumSqTips = mean((Tips - mean(Tips))^2)
SumSqBills = mean((Bills - mean(Bills))^2)
r2 = summary(lm(Tips ~ Bills))$r.squared
bHat = sqrt(r2)*(SumSqTips/SumSqBills)
cor.boot = numeric(N)
beta.boot = numeric(N)
alpha.boot = numeric(N)
yPred.boot = numeric(N)
n = length(Bills)
for (i in 1:N)
{
index <- sample(n, replace = TRUE) # Sample rows from 1 to 25
Rest.boot <- Rest[index, ] #resample data
cor.boot[i] <- cor(Rest.boot$Bill, Rest.boot$Tip)
r = summary(lm(Rest.boot$Tip ~ Rest.boot$Bill))$r.squared #find r^2 of sample
SumSqTips2 = mean((Rest.boot$Tip - mean(Rest.boot$Tip))^2) #sum of squares sample tips
SumSqBills2 = mean((Rest.boot$Bill - mean(Rest.boot$Bill))^2) #sum of squares sample bills
beta.boot[i] = sqrt(r)*(SumSqTips2/SumSqBills2) #sample beta
}
hist(cor.boot, main="Distributation of the Bootstrap Correlation")
mean(cor.boot)
## [1] 0.9153987
sd(cor.boot)
## [1] 0.02499295
biasBoot = mean(cor.boot)-cor(Bills, Tips)
biasBoot/sd(cor.boot)
## [1] 0.01358399
quantile(cor.boot, c(.025,.975))
## 2.5% 97.5%
## 0.8592791 0.9556681
Lowerbound = mean(cor.boot) - 2*sd(cor.boot)
Upperbound = mean(cor.boot) + 2*sd(cor.boot)
Lowerbound
## [1] 0.8654128
Upperbound
## [1] 0.9653846
The distribution of the boostrap correlation is visibily left-skewed, but not so signficantly that the bounds of the percentile method or 2*SE method are very different from one another. Either method is probably okay, but the percentile method is preferred given the assymetry.
hist(beta.boot, main="Distributation of the Bootstrap Slope Coefficient")
mean(beta.boot)
## [1] 0.03639352
sd(beta.boot)
## [1] 0.003620137
biasBoot = mean(beta.boot)-bHat
biasBoot/sd(beta.boot)
## [1] 0.0301912
quantile(beta.boot, c(.025,.975))
## 2.5% 97.5%
## 0.03020086 0.04416887
Lowerbound = mean(beta.boot) - 2*sd(beta.boot)
Upperbound = mean(beta.boot) + 2*sd(beta.boot)
Lowerbound
## [1] 0.02915324
Upperbound
## [1] 0.04363379
Our assessment of the distribution of the bootstrap slop coefficient is the same as the correlation, with the only exception being that the distribution is right-skewed. Either method will work, but the percentile method is preferred.