Week 5 Homework Questions

Question 1

Discrete probability simulation: suppose that a basketball player has a 60% chance of making a shot, and he keeps taking shots until he misses two in a row. Also assume his shots are independent (so that each shot has 60% probability of success, no matter what happened before).

(a) Write an R function to simulate this process.
prob_of_scoring<-.6
max_shots<-50
consec_miss<-0
for(q in 1:max_shots){
  outcome<-rbinom(1,1,prob_of_scoring)
  consec_miss<-ifelse(outcome==0,consec_miss +1,0)
  if (consec_miss==2)break
}
q
## [1] 2
(b) Put the R function in a loop to simulate the process 1000 times. Use the simulation to estimate the mean, standard deviation, and distribution of the total number of shots that the player will take.
prob_of_scoring<-.6
max_shots<-50
consec_miss<-0
bballsims1<-1000
results<-rep(NA,bballsims1)
for(b in 1:bballsims1){
for(q in 1:max_shots){
  outcome<-rbinom(1,1,prob_of_scoring)
  consec_miss<-ifelse(outcome==0,consec_miss +1,0)
  if (consec_miss==2)break
}
q
results[b]<-q
}
hist(results)

(c) Using your simulations, make a scatterplot of the number of shots the player will take and the proportion of shots that are successes.
prob_score<-.6
max_shots<-100
simbball1<-1000

trials<-rep(NA,simbball1)
number_of_scored_shots<-rep(NA,simbball1)
prob_of_succ_shots<-rep(NA,simbball1)

for (a in 1:simbball1){
  s<-0
  for(c in 1:max_shots){
  outcome<-rbinom(1,1,prob_score)
    s<-ifelse(outcome==1,s+1,s)
    consec_miss<-ifelse(outcome==0,consec_miss+1,0)
    if(consec_miss==2)break
  }
  trials[a]<-c
  number_of_scored_shots[a]<-s
  prob_of_succ_shots[a]<-s/c
}
plot(trials,prob_of_succ_shots)    

Questions 2: Propagation of uncertainty: we use a highly idealized setting to illustrate the use of simulations in combining uncertainties. Suppose a company changes its technology for widget production, and a study estimates the cost savings at 5 USD per unit, but with a standard error of 4 USD.Furthermore, a forecast estimates the size of the market (that is, the number of widgets that will be sold) at 40,000, with a standard error of 10,000. Assuming these two sources of uncertainty are independent, use simulation to estimate the total amount of money saved by the new product (that is, savings per unit, multiplied by size of the market).

mean.savings<-5
se.savings<-4
size.market<-40000
se.market<-1000
sims1<-1000
savings.total<-rep(NA,sims1)
for(r in 1:sims1){
  market<-rnorm(n=1,mean=size.market,sd=se.market)
  savings<-rnorm(n=1,mean=mean.savings,sd=se.savings)
  savings.total[r]<-market*savings
  
}
hist(savings.total)

Question 3: Predictive simulation for linear regression: take one of the models from Exercise 3.5 or 4.8 that predicts course evaluations from beauty and other input variables. You will do some simulations.

(a) Instructor A is a 50-year-old woman who is a native English speaker and has a beauty score of −1. Instructor B is a 60-year-old man who is a native English speaker and has a beauty score of −0.5. Simulate 1000 random draws of the course evaluation rating of these two instructors. In your simulation, account for the uncertainty in the regression parameters (that is, use the sim() function) as well as the predictive uncertainty.
require(arm)
## Loading required package: arm
## Loading required package: MASS
## Loading required package: Matrix
## Loading required package: lme4
## 
## arm (Version 1.13-1, built: 2022-8-25)
## Working directory is C:/Users/brand/Desktop/UCR/PhD CCN/PSYC213_Spring2023/Lab/Week 5
require(ggplot2)
## Loading required package: ggplot2
require(foreign)
## Loading required package: foreign
prof <- read.csv("http://www.stat.columbia.edu/~gelman/arm/examples/beauty/ProfEvaltnsBeautyPublic.csv")

# convert into factors
prof$profnumber <- as.factor(prof$profnumber)
prof$female <- as.factor(prof$female)
# convert dummy `class*` variables into a factor
dummies <- prof[, 18:47]
prof$class <- factor(apply(dummies, FUN=function(r) r %*% 1:30, MARGIN=1))
# remove dummy variables
prof <- prof[-c(18:47)]

# normalise and centre professor evaluation (all other predictors are binary)
prof$c.profevaluation <- prof$profevaluation - mean(prof$profevaluation) / (2 * sd(prof$profevaluation))

# fit linear model
m1 <- lm(courseevaluation ~ female + onecredit + c.profevaluation*nonenglish, data=prof)
display(m1)
## lm(formula = courseevaluation ~ female + onecredit + c.profevaluation * 
##     nonenglish, data = prof)
##                             coef.est coef.se
## (Intercept)                  3.69     0.01  
## female1                     -0.03     0.02  
## onecredit                    0.10     0.04  
## c.profevaluation             0.94     0.02  
## nonenglish                  -0.08     0.04  
## c.profevaluation:nonenglish -0.17     0.09  
## ---
## n = 463, k = 6
## residual sd = 0.19, R-Squared = 0.88
nsims1<-1000
sim1<-sim(m1,nsims1)
# Instructor A: 50-year-old woman who is a native English speaker and has a beauty score of −1 
sim.outcome <- coef(sim1)[, 1] + 1 * coef(sim1)[, 2] + 
    rep(c(0,1), 500) * coef(sim1)[, 3] + -1 *coef(sim1)[, 4] + 
    0 * coef(sim1)[, 5] + -1 * 0 * coef(sim1)[, 6] 

# Instructor B: 60-year-old man who is a native English speaker and has a beauty score of −0.5 
sim.outcome2 <- coef(sim1)[, 1] + 0 * coef(sim1)[, 2] + 
    rep(c(0,1), 500) * coef(sim1)[, 3] + -.5 * coef(sim1)[, 4] + 
    0 * coef(sim1)[, 5] + -.5 * 0 * coef(sim1)[, 6] 

ggplot(data=data.frame(x1=sim.outcome, x2=sim.outcome2)) + 
    geom_histogram(aes(x=x1), fill="red", alpha=.35) +
    geom_histogram(aes(x=x2), fill="green", alpha=.35)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

(b) Make a histogram of the difference between the course evaluations for A and B. What is the probability that A will have a higher evaluation?

sim.diff <- sim.outcome - sim.outcome2
ggplot(data=data.frame(x=sim.diff)) + geom_histogram(aes(x=x), fill="blue", alpha=.35) +
    labs(title="Difference in final score of courses ", x="difference between course A and B scores")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The probability the course A will have a higher score is zero

mean(sim.diff>=0)
## [1] 0

Question 4: Continuous probability simulation: the logarithms of weights (in pounds) of men in the United States are approximately normally distributed with mean 5.13 and standard deviation 0.17; women with mean 4.96 and standard deviation 0.20. Suppose 10 adults selected at random step on an elevator with a capacity of 1750 pounds. What is the probability that the elevator cable breaks?

prop.female <- .48
men.log.weight <- 5.13
men.log.sd <- .17
women.log.weight <- 4.96
women.log.sd <- .20
n <- 10
n.sims <- 1000
capacity <- 1750
breaks <- rep(NA, n.sims)
 weights <- rep(NA, n)
 
for (t in 1:n.sims) {
   weights <- rep(NA, n)
    sexes <- rbinom(n=n, size=1, prob=prop.female) # 1 if female, 0 if male
    for (i in 1:n) {
        weights[i] <- exp(ifelse(sexes[i]==1, rnorm(n=1, mean=women.log.weight, sd=women.log.sd), 
                          rnorm(n=1, mean=men.log.weight, sd=men.log.sd)))
    }
    breaks[t] <- ifelse(capacity< sum(weights), 1,0)
}
    sum(breaks)/n.sims
## [1] 0.061

Question 5 Predictive checks: Using data of interest to you, fit a model of interest

(a) Simulate replicated data sets and visually compare to the actual data

I prefer to use my own data. You are welcome to use mine as well but encouraged to use your own data.

# make sure I am in the right directory where the data is stored
getwd()
## [1] "C:/Users/brand/Desktop/UCR/PhD CCN/PSYC213_Spring2023/Lab/Week 5"
#setwd("C:/Usersbrand/Desktop/UCR/PhD CCN/PSYC213_Spring2023/Lab/Week 5")

# Read in the data and take a peek
TLE3<- read.csv("TLE3_Reaction_Times_PSYC213C_02.csv", header=TRUE, colClasses=c('numeric','numeric','numeric','numeric','numeric','numeric'))
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## cols = 3 != length(data) = 6
TLE3<-data.frame(TLE3)

colnames(TLE3) <- c('Move', 'Target', 'Reaction.Time', 'Trial', 'Participant', 'Teamsize')
# Create lists for Individuals and Dyads

Individuals<-c(4,5,6,7,8,9,10,11,12,13,15,19,21,22,24,25,26,27,28,30,31,33,35,36,37,38,39,40,41,42,44,46,48,54,56,57,59,61,63,64,67,69,71,72,74,78,80,81,84,85,86,87)

Dyads<-c(2,3,14,16,17,18,20,23,29,32,34,43,45,47,49,50,51,52,53,55,58,60,62,65,66,68,70,73,75,76,77,79,82,83)
filtered_TLE3<-subset(TLE3,Move >= 2 & Move <=19)
# Generate Trials

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
for (t in 1:nrow(filtered_TLE3)){
  filtered_TLE3$Trial[t]<-((t-1)%/%18)%%22+1
}
# Generate Participant

count<-as.double(2)
filtered_TLE3$Participant[1] <- count

for (q in 2:nrow(filtered_TLE3)) {
  
  if ( (filtered_TLE3$Trial[q-1] != 1) & (filtered_TLE3$Trial[q] == 1) ) {

    count <- count + 1
    filtered_TLE3$Participant[q] <- count
  } else {

    filtered_TLE3$Participant[q] <- count
  }
  
}
# Generate Teamsize

for (t in 1:nrow(filtered_TLE3)){
  
  if (filtered_TLE3$Participant[t] %in% Individuals) {
    
    filtered_TLE3$Teamsize[t]<- 1
    
  } else if (filtered_TLE3$Participant[t] %in% Dyads) {
    
    filtered_TLE3$Teamsize[t]<- 2
    
  } else {
    
    filtered_TLE3$Teamsize[t]<- -99
  }
}
library(ggplot2)
n<-nrow(filtered_TLE3)
mu<-mean(filtered_TLE3$Reaction.Time)
theta<-sd(filtered_TLE3$Reaction.Time)

fTLE3sim1<-rep(NA,n)
Exp<-rep(NA,n)

for (si in 1:n){
  Exp[si]<-ifelse(si<=.5*n,1,2)
}

fTLE3sim1=rnorm(n,mu,theta)

fTLE3sim2<-mutate(filtered_TLE3,sim.times=fTLE3sim1)

#fTLE3sim2=c(filtered_TLE3,fTLE3sim1)

# TS<-filtered_TLE3$Teamsize
# MN<-filtered_TLE3$Move
# TN<-filtered_TLE3$Trial

#FTLE3_sim<-data.frame(Exp,TS,MN,TN,fTLE3sim1)


  
filtered_TLE3$Teamsize<-as.factor(filtered_TLE3$Teamsize)

  # Now plot them for comparison 
ggplot(
  data = filtered_TLE3, mapping = aes(x = Trial, y= Reaction.Time)) +
  geom_jitter(mapping = aes(color= Teamsize))

fTLE3sim2$Teamsize<-as.factor(fTLE3sim2$Teamsize)

ggplot(
  data=fTLE3sim2, mapping = aes(x = Trial, y = sim.times))+
  geom_jitter(mapping = aes(color = Teamsize))

How do the new simulated data look compared to the collected data? The simulated data don’t look much like the collected data. We have negative numbers for simulated reaction times and in the lines below I took the absolute value of those simulated times to get a slightly better representation for the simulated data.

Is there a way to make the simulated data more similar to the collected data?

for (si in 1:n)
  fTLE3sim2$sim.times[si]<-abs(fTLE3sim2$sim.times[si])

ggplot(
  data=fTLE3sim2, mapping = aes(x = Trial, y = sim.times))+
  geom_jitter(mapping = aes(color = Teamsize))

(b) Summarize the data by a numerical test statistic, and compare to the values of the test statistic in the replicated data sets.
t.test(filtered_TLE3$Reaction.Time,fTLE3sim2$sim.times)
## 
##  Welch Two Sample t-test
## 
## data:  filtered_TLE3$Reaction.Time and fTLE3sim2$sim.times
## t = -25.716, df = 64770, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.10968561 -0.09414993
## sample estimates:
## mean of x mean of y 
## 0.5580634 0.6599812

Garbage tried follows:

# Increment Trial and Participant based on conditions
# for (p in 1:nrow(filtered_TLE3)) {
#   if (filtered_TLE3$Trial[p] <- filtered_TLE3$Participant[i - 1] + 1




#filtered_TLE3
# In this example, filtered_TLE3 is a data frame with three columns: Move, Trial, and Participant.
# 
# Move contains the values ranging from 2 to 19, repeated 18 times for each move number.
# Trial contains the values ranging from 1 to 22, repeating 18 times for each move number.
# Participant starts with a value of 1 and increases by 1 only when there is a shift in Trial from 22 to 1.
# The code iterates over the rows of the data frame, checking for the specified conditions. If Trial shifts from 22 to 1, both Trial and Participant are incremented by 1. If the value in Move increases from the previous row, only Trial is incremented by 1.
# 
# The resulting filtered_TLE3 data frame will show the updated values in Trial and Participant based on the provided conditions.







# #Check for shift and increment Participant column
# for (i in 2:nrow(filtered_TLE3)){
#   if (filtered_TLE3$Trial[i-1]==22 && filtered_TLE3$Trial[i]==1){
#     filtered_TLE3$Participant[i]<-filtered_TLE3$Participant[i-1]+1
#   }
# }


# library(dplyr)
# # Create a new column, MutatedParticipant to store the modified values of Participant column
# filtered_TLE3<- filtered_TLE3 %>%
#   mutate(
#   Participant = ifelse(
#     Trial ==1 & lag(Trial)==22,
#     lag(Participant)+1,
#     Participant
#   ))
# 
# filtered_TLE3


# # Initialize increment variable
# increment<-0
# 
# #Loop through each row
# for (p in 1:nrow(filtered_TLE3$)){
#   #Check if the value in Move shifts from 22 to 1
#   if (filtered_TLE3$Trial[p]==22 && filtered_TLE3$Trial[p+1]==1){
#     # Increment Participant by +1 and update increment variable
#     increment<-increment+1
#     filtered_TLE3$Participant[p + 1]<- increment
#   }
#}
# for (i in 2:19) {
#   cycle <- floor((i-2)/18)  # Calculate the cycle number
#   start <- cycle * 18 + 1  # Calculate the starting value for each cycle
#   end <- (cycle + 1) * 18  # Calculate the ending value for each cycle
#   values_col2 <- rep(1:22, each = 18)[start:end]
#   values_col3 <- ifelse(i == 2 || i == start, 1:(22 * (cycle + 1)), 1:(22 * cycle))
#   df$column1[(i-2)*18 + 1:(i-1)*18] <- i
#   df$column2[(i-2)*18 + 1:(i-1)*18] <- values_col2
#   df$column3[(i-2)*18 + 1:(i-1)*18] <- values_col3
# }
# for (p in 1:highest_value_in_trial_column) {
#   cycle <- floor((p-1)/22)  # Calculate the cycle number
#   start <- cycle * 22 + 1  # Calculate the starting value for each cycle
#   end <- (cycle + 1) * 22  # Calculate the ending value for each cycle
#   values <- ifelse(p == 1 || p == start, rep(cycle + 1, 22), rep(cycle, 22))
#   TLE3$Participant[(p-1)*22 + 1:p*22] <- values
#   TLE3$Trial[(p-1)*22 + 1:p*22] <- p
# }
# TLE3
# for (p in 1:nrow(TLE3)){
#   if(!is.na(TLE3$Trial[p])){
#     TLE3$Participant[p]<-TLE3$Participant[p]+1
#   }
# }

# Increment Trial and Participant based on conditions
# for (p in 1:nrow(filtered_TLE3)) {
#   if (filtered_TLE3$Trial[p] <- filtered_TLE3$Participant[i - 1] + 1




#filtered_TLE3
# In this example, filtered_TLE3 is a data frame with three columns: Move, Trial, and Participant.
# 
# Move contains the values ranging from 2 to 19, repeated 18 times for each move number.
# Trial contains the values ranging from 1 to 22, repeating 18 times for each move number.
# Participant starts with a value of 1 and increases by 1 only when there is a shift in Trial from 22 to 1.
# The code iterates over the rows of the data frame, checking for the specified conditions. If Trial shifts from 22 to 1, both Trial and Participant are incremented by 1. If the value in Move increases from the previous row, only Trial is incremented by 1.
# 
# The resulting filtered_TLE3 data frame will show the updated values in Trial and Participant based on the provided conditions.







# #Check for shift and increment Participant column
# for (i in 2:nrow(filtered_TLE3)){
#   if (filtered_TLE3$Trial[i-1]==22 && filtered_TLE3$Trial[i]==1){
#     filtered_TLE3$Participant[i]<-filtered_TLE3$Participant[i-1]+1
#   }
# }


# library(dplyr)
# # Create a new column, MutatedParticipant to store the modified values of Participant column
# filtered_TLE3<- filtered_TLE3 %>%
#   mutate(
#   Participant = ifelse(
#     Trial ==1 & lag(Trial)==22,
#     lag(Participant)+1,
#     Participant
#   ))
# 
# filtered_TLE3


# # Initialize increment variable
# increment<-0
# 
# #Loop through each row
# for (p in 1:nrow(filtered_TLE3$)){
#   #Check if the value in Move shifts from 22 to 1
#   if (filtered_TLE3$Trial[p]==22 && filtered_TLE3$Trial[p+1]==1){
#     # Increment Participant by +1 and update increment variable
#     increment<-increment+1
#     filtered_TLE3$Participant[p + 1]<- increment
#   }
#}
# for (i in 2:19) {
#   cycle <- floor((i-2)/18)  # Calculate the cycle number
#   start <- cycle * 18 + 1  # Calculate the starting value for each cycle
#   end <- (cycle + 1) * 18  # Calculate the ending value for each cycle
#   values_col2 <- rep(1:22, each = 18)[start:end]
#   values_col3 <- ifelse(i == 2 || i == start, 1:(22 * (cycle + 1)), 1:(22 * cycle))
#   df$column1[(i-2)*18 + 1:(i-1)*18] <- i
#   df$column2[(i-2)*18 + 1:(i-1)*18] <- values_col2
#   df$column3[(i-2)*18 + 1:(i-1)*18] <- values_col3
# }
# for (p in 1:highest_value_in_trial_column) {
#   cycle <- floor((p-1)/22)  # Calculate the cycle number
#   start <- cycle * 22 + 1  # Calculate the starting value for each cycle
#   end <- (cycle + 1) * 22  # Calculate the ending value for each cycle
#   values <- ifelse(p == 1 || p == start, rep(cycle + 1, 22), rep(cycle, 22))
#   TLE3$Participant[(p-1)*22 + 1:p*22] <- values
#   TLE3$Trial[(p-1)*22 + 1:p*22] <- p
# }
# TLE3
# for (p in 1:nrow(TLE3)){
#   if(!is.na(TLE3$Trial[p])){
#     TLE3$Participant[p]<-TLE3$Participant[p]+1
#   }
# }

# index <- 1
# 
# for (i in 1:nrow(TLE3)) {
#   if (TLE3$Trial[i] == 1) {
#     index <- index + 1
#   }
#   TLE3$Participant[i] <- index
# }
# increment <- FALSE

# index <- 1
# 
# for (i in 1:nrow(TLE3)) {
#   if (TLE3$column7[i] == 1) {
#     increment <- TRUE
#     index <- 1
#   }
#   
#   if (TLE3$Trial[i] == 1) {
#     increment <- TRUE
#     TLE3$Participant[i] <- index
#     index <- index + 1
#   } else {
#     increment <- FALSE
#   }
#   
#   if (increment) {
#     TLE3$Participant[i] <- index - 1
#   }
#   
#   TLE3$column6[i] <- index

# 
# for (t in 1:nrow(TLE3)){
#   if ((t-1)%%18==0){
#     if(TLE3$Trial[t]>22){
#       TLE3$Trial[t]<-1
#     }
#     TLE3$Trial[t:(t+17)]<-TLE3$Trial[t]
#   }
#   if(TLE3$Trial[t]==1){
#     TLE3$Participant[t]<-Participant.index
#     Participant.index<-Participant.index +1
#   }
#   TLE3$Trial[t]<-Trial.index
#   Trial.index<-Trial.index+1
# }