Week 5 Homework Questions

Question 1

Discrete probability simulation: suppose that a basketball player has a 60% chance of making a shot, and he keeps taking shots until he misses two in a row. Also assume his shots are independent (so that each shot has 60% probability of success, no matter what happened before).

(a) Write an R function to simulate this process.
prob_hit <- .60
maximum <- 100
consec_miss <- 0
  
for (h in 1:maximum) {
  outcome <- rbinom (1, 1, prob_hit) 
  consec_miss <- ifelse (outcome == 0, consec_miss + 1,0) 
  if (consec_miss == 2) break
}

h
## [1] 6
(b) Put the R function in a loop to simulate the process 1000 times. Use the simulation to estimate the mean, standard deviation, and distribution of the total number of shots that the player will take.
prob_hit <- .60
maximum <- 100
num_simulations <- 1000 
shot_results <- rep(NA, num_simulations)

for (i in 1:num_simulations) {
  for (h in 1:maximum) {
  outcome <- rbinom (1, 1, prob_hit) 
  consec_miss <- ifelse (outcome == 0, consec_miss + 1,0) 
  if (consec_miss == 2) break
}

shot_results [i] <- h
}
hist(shot_results)

(c) Using your simulations, make a scatterplot of the number of shots the player will take and the proportion of shots that are successes.
prob_hit <- .60
maximum <- 100
num_simulations <- 1000 

trials <- rep(NA, num_simulations)
suc <- rep(NA, num_simulations)
prob_suc <- rep(NA, num_simulations) 

for (i in 1:num_simulations) {
  h <- 0
  for (t in 1:maximum) {
  outcome <- rbinom (1, 1, prob_hit) 
  h <- ifelse ( outcome == 1, h+1, h)
  consec_miss <- ifelse (outcome == 0, consec_miss + 1,0) 
  if (consec_miss == 2) break
}
trials[i]<- t
suc[i] <- h
prob_suc[i] <- h/t
}
plot(trials, prob_suc)

Questions 2: Propagation of uncertainty: we use a highly idealized setting to illustrate the use of simulations in combining uncertainties. Suppose a company changes its technology for widget production, and a study estimates the cost savings at 5 USD per unit, but with a standard error of 4 USD.Furthermore, a forecast estimates the size of the market (that is, the number of widgets that will be sold) at 40,000, with a standard error of 10,000. Assuming these two sources of uncertainty are independent, use simulation to estimate the total amount of money saved by the new product (that is, savings per unit, multiplied by size of the market).

mean_cost_savings <- 5
standard_error_savings <- 4

mean_size <- 40000
standard_error_size <- 10000

n_sims <- 1000

sim_total_savings <- rep(NA, n_sims)
for (k in 1:n_sims) {
  savings <- rnorm(1, mean_cost_savings, standard_error_savings)
  size <- rnorm(1, mean_size, standard_error_size)
  sim_total_savings [k] <- size * savings
}

hist(sim_total_savings)

is this another way of doing the above?

sim_total_savings <- function() { savings <- rnorm(1, mean_cost_savings, standard_error_savings) size <- rnorm(1, mean_size, standard_error_size) total_save <- savings * size return(total_save) }

total_save_sim <- replicate(n_sims, sim_total_savings())

mean_total_save <- mean(total_save_sim) standard_error_save <- sd(total_save_sim)

Question 3: Predictive simulation for linear regression: take one of the models from Exercise 3.5 or 4.8 that predicts course evaluations from beauty and other input variables. You will do some simulations.

(a) Instructor A is a 50-year-old woman who is a native English speaker and has a beauty score of −1. Instructor B is a 60-year-old man who is a native English speaker and has a beauty score of −0.5. Simulate 1000 random draws of the course evaluation rating of these two instructors. In your simulation, account for the uncertainty in the regression parameters (that is, use the sim() function) as well as the predictive uncertainty.
require(arm)
## Loading required package: arm
## Loading required package: MASS
## Loading required package: Matrix
## Loading required package: lme4
## 
## arm (Version 1.13-1, built: 2022-8-25)
## Working directory is /Users/user/Desktop/R - Spring
require(ggplot2)
## Loading required package: ggplot2
require(foreign)
## Loading required package: foreign
prof <- read.csv("ProfEvaltnsBeautyPublic.csv")
str(prof)
## 'data.frame':    463 obs. of  64 variables:
##  $ tenured          : int  0 1 1 1 0 1 0 1 0 0 ...
##  $ profnumber       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ minority         : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ age              : int  36 59 51 40 31 62 33 51 33 47 ...
##  $ beautyf2upper    : int  6 2 5 4 9 5 5 6 5 6 ...
##  $ beautyflowerdiv  : int  5 4 5 2 7 6 4 4 3 5 ...
##  $ beautyfupperdiv  : int  7 4 2 5 9 6 4 6 7 7 ...
##  $ beautym2upper    : int  6 3 3 2 6 6 4 3 5 6 ...
##  $ beautymlowerdiv  : int  2 2 2 3 7 5 4 2 5 3 ...
##  $ beautymupperdiv  : int  4 3 3 3 6 5 4 3 3 6 ...
##  $ btystdave        : num  0.202 -0.826 -0.66 -0.766 1.421 ...
##  $ btystdf2u        : num  0.289 -1.619 -0.188 -0.665 1.721 ...
##  $ btystdfl         : num  0.458 -0.0735 0.458 -1.1365 1.521 ...
##  $ btystdfu         : num  0.8758 -0.577 -1.5456 -0.0927 1.8444 ...
##  $ btystdm2u        : num  0.682 -1.132 -1.132 -1.736 0.682 ...
##  $ btystdml         : num  -0.9 -0.9 -0.9 -0.313 2.038 ...
##  $ btystdmu         : num  -0.195 -0.655 -0.655 -0.655 0.723 ...
##  $ class1           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class2           : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ class3           : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ class4           : int  0 0 1 0 0 0 1 0 0 1 ...
##  $ class5           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class6           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class7           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class8           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class9           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class10          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class11          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class12          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class13          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class14          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class15          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class16          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class17          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class18          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class19          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class20          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class21          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class22          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class23          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class24          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class25          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class26          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class27          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class28          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class29          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ class30          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ courseevaluation : num  4.3 4.5 3.7 4.3 4.4 4.2 4 3.4 4.5 3.9 ...
##  $ didevaluation    : int  24 17 55 40 42 182 33 25 48 16 ...
##  $ female           : int  1 0 0 1 1 0 1 1 1 0 ...
##  $ formal           : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ fulldept         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ lower            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ multipleclass    : int  1 0 1 1 0 0 1 0 0 1 ...
##  $ nonenglish       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ onecredit        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ percentevaluating: num  55.8 85 100 87 87.5 ...
##  $ profevaluation   : num  4.7 4.6 4.1 4.5 4.8 4.4 4.4 3.4 4.8 4 ...
##  $ students         : int  43 20 55 46 48 282 41 41 60 19 ...
##  $ tenuretrack      : int  1 1 1 1 1 1 1 1 1 0 ...
##  $ blkandwhite      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ btystdvariance   : num  2.13 1.39 2.54 1.76 1.69 ...
##  $ btystdavepos     : num  0.202 0 0 0 1.421 ...
##  $ btystdaveneg     : num  0 -0.826 -0.66 -0.766 0 ...
# convert into factors
prof$profnumber <- as.factor(prof$profnumber)
prof$female <- as.factor(prof$female)

# convert dummy `class*` variables into a factor
dummy_var <- prof[ , 18:47]
prof$class <- factor(apply (dummy_var, FUN = function(r) r %*% 1:30, MARGIN = 1))

# remove dummy variables
prof <- prof[ , -c (18:47)]

# normalize and center professor evaluation (all other predictors are binary)
prof$c.profevaluation <- prof$profevaluation - mean(prof$profevaluation) / (2*sd(prof$profevaluation))

# fit linear model
lm_prof <- lm(courseevaluation ~ female + multipleclass + onecredit + c.profevaluation*nonenglish, data = prof)

display(lm_prof)
## lm(formula = courseevaluation ~ female + multipleclass + onecredit + 
##     c.profevaluation * nonenglish, data = prof)
##                             coef.est coef.se
## (Intercept)                  3.70     0.02  
## female1                     -0.03     0.02  
## multipleclass               -0.01     0.02  
## onecredit                    0.10     0.04  
## c.profevaluation             0.94     0.02  
## nonenglish                  -0.07     0.04  
## c.profevaluation:nonenglish -0.17     0.09  
## ---
## n = 463, k = 7
## residual sd = 0.19, R-Squared = 0.88
n_sims <- 1000
simulation_prof <- sim(lm_prof, n_sims)
# Instructor A: 50-year-old woman who is a native English speaker and has a beauty score of −1 
simulation_1 <- coef(simulation_prof) [, 1] + 1 * coef(simulation_prof) [, 2] + rep(c (0,1), 500) * coef(simulation_prof) [, 3] + -1 * coef(simulation_prof) [, 4] + 0 * coef(simulation_prof) [, 5] + -1 * 0 * coef(simulation_prof) [, 6]

# Instructor B: 60-year-old man who is a native English speaker and has a beauty score of −0.5 
simulation_2 <- coef(simulation_prof) [, 1] + 0 * coef(simulation_prof) [, 2] + rep(c (0,1), 500) * coef(simulation_prof) [, 3] + -5 * coef(simulation_prof) [, 4] + 0 * coef(simulation_prof) [, 5] + -.5 * 0 * coef(simulation_prof) [, 6]

#plotting data
ggplot(data = data.frame (x1 = simulation_1, x2 = simulation_2)) + geom_histogram(aes(x = x1), fill = "#FF95BE", alpha = .35) + geom_histogram(aes(x = x2), fill = "#B0E0E6", alpha = .35)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

(b) Make a histogram of the difference between the course evaluations for A and B. What is the probability that A will have a higher evaluation?

eval_differences <- simulation_1 - simulation_2

ggplot(data = data.frame (x = eval_differences)) + geom_histogram(aes (x = x), fill = "#FF1493", alpha = .35) + labs (title = "Final Score Difference", x = "A vs B")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The probability the course A will have a higher score is zero

mean(eval_differences >= 0)
## [1] 0.993

Question 4: Continuous probability simulation: the logarithms of weights (in pounds) of men in the United States are approximately normally distributed with mean 5.13 and standard deviation 0.17; women with mean 4.96 and standard deviation 0.20. Suppose 10 adults selected at random step on an elevator with a capacity of 1750 pounds. What is the probability that the elevator cable breaks?

prop_f <- .50

men_weight <- 5.13
men_weight_sd <- 0.17

women_weight <- 4.96
women_weight_sd <- 0.20

number_adults <- 10
max_capacity <- 1750
n_sims <- 1000

prob_cable_breaks <- rep(NA, n_sims)
tot_weight <- rep(NA, number_adults)
for (k in 1:n_sims) {
   tot_weight <- rep(NA, number_adults)
    sexes <- rbinom(n=number_adults, size=1, prob=prop_f) 
    for (j in 1:number_adults) {
        tot_weight[j] <- exp(ifelse(sexes[j]==1, rnorm(n=1, mean=women_weight, sd=women_weight_sd), rnorm(n=1, mean=men_weight, sd=men_weight_sd)))
    }
    prob_cable_breaks[k] <- ifelse(max_capacity < sum(tot_weight), 1, 0)
}
    sum(prob_cable_breaks)/n_sims
## [1] 0.062

Question 5 Predictive checks: Using data of interest to you, fit a model of interest

(a) Simulate replicated data sets and visually compare to the actual data
# Read in the data and take a peek
my_data <- read.csv("wpa_lena_pos.csv")
str(my_data)
## 'data.frame':    1488 obs. of  11 variables:
##  $ id_uni         : int  801 801 801 801 801 801 801 801 801 801 ...
##  $ age            : num  10.8 10.8 10.8 10.8 10.8 ...
##  $ sit_time       : num  0.13 0.211 0.371 0.294 0.151 ...
##  $ held_time      : num  0.00669 0.17057 0.22408 0.08361 0.19732 ...
##  $ prone_time     : num  0.1973 0.2408 0.0803 0.2274 0.4047 ...
##  $ supine_time    : num  0.137 0.137 0.13 0.13 0 ...
##  $ upright_time   : num  0.528 0.241 0.194 0.264 0.247 ...
##  $ adult_word_cnt : num  331 500 471 288 407 ...
##  $ child_utt_cnt  : int  61 55 73 59 32 16 16 33 35 10 ...
##  $ conv_turn_count: int  14 24 25 13 6 4 2 6 10 5 ...
##  $ child_cry_scnds: num  12.1 7.31 12.16 9.6 9.31 ...
#data.frame(my_data)
lm_wpa_data <- lm(data = my_data, adult_word_cnt ~ sit_time + age)
summary(lm_wpa_data)
## 
## Call:
## lm(formula = adult_word_cnt ~ sit_time + age, data = my_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -317.25 -152.03  -53.53   90.33 1272.60 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  210.200     69.092   3.042  0.00239 ** 
## sit_time     150.061     19.133   7.843 8.34e-15 ***
## age           -3.874      5.821  -0.666  0.50583    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 209.9 on 1485 degrees of freedom
## Multiple R-squared:  0.03997,    Adjusted R-squared:  0.03868 
## F-statistic: 30.91 on 2 and 1485 DF,  p-value: 7.024e-14
n <- nrow(my_data)
mu <- mean(my_data$adult_word_cnt)
theta <- sd(my_data$adult_word_cnt)
wpa_data <- rep(NA, n)
wpa_data_1 <- rep(NA, n)

for(si in 1:n) { 
  wpa_data_1[si] <- ifelse(si <= .5*n, 1, 2)}

for (ti in 1:1000){
  wpa_data = rnorm(n, mean = mean(my_data$adult_word_cnt), sd = sd(my_data$adult_word_cnt))}

time_sitting <- my_data$sit_time
participant_age <- my_data$age

wpa_sim <- data.frame (wpa_data_1, time_sitting, participant_age, wpa_data)

ggplot(data = my_data, mapping = aes(x = sit_time, y = adult_word_cnt)) + geom_jitter(mapping = aes(color = age))

# Now sim data 
ggplot(data = wpa_sim, mapping = aes(x = time_sitting, y = wpa_data)) + geom_jitter(mapping = aes(color = participant_age))

How do the new simulated data look compared to the collected data? There are negative values which does not make sense.

Is there a way to make the simulated data more similar to the collected data?

wpa_sim_2 <- wpa_sim

for (si in 1:n) {
wpa_sim_2$wpa_data[si] <- ifelse(wpa_sim$wpa_data[si]<= 10, abs(wpa_sim$wpa_data[si]),wpa_sim$wpa_data[si])
}

# plot collected data
  ggplot(data = my_data, mapping = aes(x = sit_time,y = adult_word_cnt)) + geom_jitter(mapping = aes(color = age))

# plot simulated data 
  ggplot(data = wpa_sim_2, mapping = aes(x = time_sitting, y = wpa_data)) + geom_jitter(mapping = aes(color = participant_age))

(b) Summarize the data by a numerical test statistic, and compare to the values of the test statistic in the replicated data sets.
t_test <- t.test(my_data$adult_word_cnt, wpa_sim_2$wpa_data)
t_test_1 <- t.test(my_data[my_data$age < 12.5, "adult_word_cnt"], 
                   my_data[my_data$age > 12.5, "adult_word_cnt"])

t_test_2 <- t.test(wpa_sim_2[wpa_sim_2$participant_age < 12.5, "wpa_data"], 
                   wpa_sim_2[wpa_sim_2$participant_age > 12.5, "wpa_data"])

t_test_1
## 
##  Welch Two Sample t-test
## 
## data:  my_data[my_data$age < 12.5, "adult_word_cnt"] and my_data[my_data$age > 12.5, "adult_word_cnt"]
## t = -0.87965, df = 366.92, p-value = 0.3796
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -43.75016  16.70626
## sample estimates:
## mean of x mean of y 
##  229.3527  242.8746
t_test_2
## 
##  Welch Two Sample t-test
## 
## data:  wpa_sim_2[wpa_sim_2$participant_age < 12.5, "wpa_data"] and wpa_sim_2[wpa_sim_2$participant_age > 12.5, "wpa_data"]
## t = 0.53509, df = 388.51, p-value = 0.5929
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -16.98837  29.69306
## sample estimates:
## mean of x mean of y 
##  268.4256  262.0732

I don’t know what I’m doing below. Just messing around.

mu_1 <- mean(my_data$age)
theta_1 <- sd(my_data$age)
n_sims <- 1000

wpa_simulation <-  function() {
  words <- rnorm(1, mu, theta)
  age <- rnorm(1, mu_1, theta_1)
  x <- words * age
  return(x)
}

x_sim <- replicate(n_sims, wpa_simulation())

mean_x_sim <- mean(x_sim)
standard_error_x_sim <- sd(x_sim)

cat("Estimated mean x: ", mean_x_sim, "\n")
## Estimated mean x:  2591.418
cat("Standard deviation of x: ", standard_error_x_sim)
## Standard deviation of x:  2504.549
sim_lm <- arm::sim(lm_wpa_data, n_sims)
str(sim_lm)
## Formal class 'sim' [package "arm"] with 2 slots
##   ..@ coef : num [1:1000, 1:3] 156 240 195 150 183 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:3] "(Intercept)" "sit_time" "age"
##   ..@ sigma: num [1:1000] 213 216 207 209 208 ...
age_sim <- coef(sim_lm) [, "age"] + 
  coef(sim_lm) [, "age"]
mean(age_sim)
## [1] -7.234379