This is the bootstrap distribution for the "Use Text Messages by Age" dataset. Because the bootstrap distribution is fairly symmetric, we can use the percentile method, use the 2*SE method, or give the entire distribution as a summary.
# Bootstrap the difference in sample proportions, teens minus adults.
# Each group is rebuilt as a 0/1 vector from its counts: 696 ones out of 800
# for teens and 1621 ones out of 2252 for adults (1 presumably means "uses
# text messages" -- confirm against the data source).
teen.yes <- 696
teen.total <- 800
adult.yes <- 1621
adult.total <- 2252

# Zeros first, then ones; the order is irrelevant to the means and resamples.
teen <- rep(c(0, 1), times = c(teen.total - teen.yes, teen.yes))
adult <- rep(c(0, 1), times = c(adult.total - adult.yes, adult.yes))

mean(teen)
## [1] 0.87
mean(adult)
## [1] 0.7198046

# Observed difference in proportions.
diff <- mean(teen) - mean(adult)
diff
## [1] 0.1501954

# Draw N bootstrap replicates of the difference. Each group is resampled with
# replacement at its own original size, taken from the data rather than
# repeated as a literal.
N <- 10^4
diff.prop <- numeric(N)
for (i in seq_len(N)) {
  teen.prop <- sample(teen, size = length(teen), replace = TRUE)
  adult.prop <- sample(adult, size = length(adult), replace = TRUE)
  diff.prop[i] <- mean(teen.prop) - mean(adult.prop)
}
hist(diff.prop)

This is the bootstrap distribution for the "Get Pregnant (by smoking status)" dataset. Because the bootstrap distribution is fairly symmetric, we can use the percentile method, use the 2*SE method, or give the entire distribution as a summary.
library(ggplot2)
library(readxl)
# Bootstrap the difference in sample proportions, smokers minus non-smokers.
# Each group is rebuilt as a 0/1 vector from its counts: 38 ones out of 135
# for smokers and 206 ones out of 543 for non-smokers (1 presumably means
# "became pregnant" -- confirm against the data source).
smoker.yes <- 38
smoker.total <- 135
non.yes <- 206
non.total <- 543

# Zeros first, then ones; the order is irrelevant to the means and resamples.
smoker <- rep(c(0, 1), times = c(smoker.total - smoker.yes, smoker.yes))
non <- rep(c(0, 1), times = c(non.total - non.yes, non.yes))

mean(smoker)
## [1] 0.2814815
mean(non)
## [1] 0.3793738

# Observed difference in proportions.
diff <- mean(smoker) - mean(non)
diff
## [1] -0.09789237

# Draw N bootstrap replicates of the difference. Each group is resampled with
# replacement at its own original size, taken from the data rather than
# repeated as a literal.
N <- 10^4
diff.prop <- numeric(N)
for (i in seq_len(N)) {
  smoke.prop <- sample(smoker, size = length(smoker), replace = TRUE)
  non.prop <- sample(non, size = length(non), replace = TRUE)
  diff.prop[i] <- mean(smoke.prop) - mean(non.prop)
}
hist(diff.prop)

The bootstrap distribution of the correlation for the Restaurant Tips data is calculated below. Because this distribution is heavily skewed, our best course of action is to give the entire distribution as a summary.
library(readr)
# Load the Restaurant Tips data from a CSV file into `restaurantTipsBill`.
# The path points into one user's home directory (~/Desktop/M375T), so it must
# be adjusted before this runs on another machine.
# The `##` lines that follow are the column specification read_csv printed:
# Bill, Tip, Guests and PctTip parsed as doubles; Credit, Day and Server as
# character.
restaurantTipsBill <- read_csv("~/Desktop/M375T/restaurantTipsBill.csv")## Parsed with column specification:
## cols(
## Bill = col_double(),
## Tip = col_double(),
## Credit = col_character(),
## Guests = col_double(),
## Day = col_character(),
## Server = col_character(),
## PctTip = col_double()
## )
# Bootstrap the Restaurant Tips data by resampling whole rows with replacement.
# For every resample this records:
#   sd.boot, mean.boot, median.boot - spread and centre of Bill
#   cor.boot                        - correlation between Bill and Tip
#   alpha.boot, beta.boot           - intercept and slope of lm(Tip ~ Bill)
#   yPred.boot                      - fitted Tip at Bill = 20
N <- 10000
sd.boot <- numeric(N)
mean.boot <- numeric(N)
median.boot <- numeric(N)
alpha.boot <- numeric(N)
beta.boot <- numeric(N)
yPred.boot <- numeric(N)
cor.boot <- numeric(N)

# Resample as many rows as the data actually has, not a hard-coded count.
n <- nrow(restaurantTipsBill)

for (i in seq_len(N)) {
  index <- sample(n, replace = TRUE)
  tips.boot <- restaurantTipsBill[index, ]

  # Bare column names are resolved through `data`, so the fitted terms are
  # `Tip` and `Bill` rather than `tips.boot$Tip` and `tips.boot$Bill`.
  tips.lm <- lm(Tip ~ Bill, data = tips.boot)

  sd.boot[i] <- sd(tips.boot$Bill)
  # mean.boot was allocated but never filled; record the resampled mean of
  # Bill alongside its sd and median.
  mean.boot[i] <- mean(tips.boot$Bill)
  median.boot[i] <- median(tips.boot$Bill)
  cor.boot[i] <- cor(tips.boot$Bill, tips.boot$Tip)

  alpha.boot[i] <- coef(tips.lm)[1]
  beta.boot[i] <- coef(tips.lm)[2]
  yPred.boot[i] <- alpha.boot[i] + beta.boot[i] * 20
}
hist(cor.boot)

Finding the beta.boot distribution (slope coefficient). On average, every $100 increase in the bill is associated with roughly an $18 increase in the tip.
# Fit the simple linear regression of Tip on Bill to the observed data.
# The original call fitted `tips.boot`, which at this point is only the last
# resample left behind by the bootstrap loop above, so its estimates described
# one random resample rather than the data set itself.
tips.lm <- lm(Tip ~ Bill, data = restaurantTipsBill)
summary(tips.lm)
# NOTE(review): the output recorded below was produced by the earlier fit on
# `tips.boot` (see its Call line); re-run this chunk to refresh it.
##
## Call:
## lm(formula = tips.boot$Tip ~ tips.boot$Bill, data = tips.boot)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4471 -0.5423 -0.0429 0.3859 3.9397
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.580611 0.149211 -3.891 0.000148 ***
## tips.boot$Bill 0.191714 0.005421 35.365 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.88 on 155 degrees of freedom
## Multiple R-squared: 0.8897, Adjusted R-squared: 0.889
## F-statistic: 1251 on 1 and 155 DF, p-value: < 2.2e-16
This is the bootstrap distribution for the slope coefficient. We can use the percentile method, the 2*SE method, and give the entire distribution as a summary since the skew is very small.
# Bootstrap the regression of Tip on Bill and plot the slope's distribution.
# For every resample this records the intercept (alpha.boot), the slope
# (beta.boot) and the fitted Tip at Bill = 20 (yPred.boot).
# N, the number of replicates, is reused from earlier in the script.
alpha.boot <- numeric(N)
beta.boot <- numeric(N)
yPred.boot <- numeric(N)

# Resample as many rows as the data actually has, not a hard-coded count.
n <- nrow(restaurantTipsBill)

for (i in seq_len(N)) {
  index <- sample(n, replace = TRUE)
  tips.boot <- restaurantTipsBill[index, ]

  # Bare column names are resolved through `data`.
  tips.lm <- lm(Tip ~ Bill, data = tips.boot)

  alpha.boot[i] <- coef(tips.lm)[1]
  beta.boot[i] <- coef(tips.lm)[2]
  yPred.boot[i] <- alpha.boot[i] + beta.boot[i] * 20
}

hist(beta.boot)