Discrete probability simulation: suppose that a basketball player has a 60% chance of making a shot, and he keeps taking shots until he misses two in a row. Also assume his shots are independent (so that each shot has 60% probability of success, no matter what happened before).
prob_hit <- .60
maximum <- 100
consec_miss <- 0
for (h in 1:maximum) {
outcome <- rbinom (1, 1, prob_hit)
consec_miss <- ifelse (outcome == 0, consec_miss + 1,0)
if (consec_miss == 2) break
}
h
## [1] 6
prob_hit <- .60
maximum <- 100
num_simulations <- 1000
shot_results <- rep(NA, num_simulations)
for (i in 1:num_simulations) {
for (h in 1:maximum) {
outcome <- rbinom (1, 1, prob_hit)
consec_miss <- ifelse (outcome == 0, consec_miss + 1,0)
if (consec_miss == 2) break
}
shot_results [i] <- h
}
hist(shot_results)
prob_hit <- .60
maximum <- 100
num_simulations <- 1000
trials <- rep(NA, num_simulations)
suc <- rep(NA, num_simulations)
prob_suc <- rep(NA, num_simulations)
for (i in 1:num_simulations) {
h <- 0
for (t in 1:maximum) {
outcome <- rbinom (1, 1, prob_hit)
h <- ifelse ( outcome == 1, h+1, h)
consec_miss <- ifelse (outcome == 0, consec_miss + 1,0)
if (consec_miss == 2) break
}
trials[i]<- t
suc[i] <- h
prob_suc[i] <- h/t
}
plot(trials, prob_suc)
mean_cost_savings <- 5
standard_error_savings <- 4
mean_size <- 40000
standard_error_size <- 10000
n_sims <- 1000
sim_total_savings <- rep(NA, n_sims)
for (k in 1:n_sims) {
savings <- rnorm(1, mean_cost_savings, standard_error_savings)
size <- rnorm(1, mean_size, standard_error_size)
sim_total_savings [k] <- size * savings
}
hist(sim_total_savings)
is this another way of doing the above?
sim_total_savings <- function() { savings <- rnorm(1, mean_cost_savings, standard_error_savings) size <- rnorm(1, mean_size, standard_error_size) total_save <- savings * size return(total_save) }
total_save_sim <- replicate(n_sims, sim_total_savings())
mean_total_save <- mean(total_save_sim) standard_error_save <- sd(total_save_sim)
require(arm)
## Loading required package: arm
## Loading required package: MASS
## Loading required package: Matrix
## Loading required package: lme4
##
## arm (Version 1.13-1, built: 2022-8-25)
## Working directory is /Users/user/Desktop/R - Spring
require(ggplot2)
## Loading required package: ggplot2
require(foreign)
## Loading required package: foreign
prof <- read.csv("ProfEvaltnsBeautyPublic.csv")
str(prof)
## 'data.frame': 463 obs. of 64 variables:
## $ tenured : int 0 1 1 1 0 1 0 1 0 0 ...
## $ profnumber : int 1 2 3 4 5 6 7 8 9 10 ...
## $ minority : int 1 0 0 0 0 0 0 0 0 0 ...
## $ age : int 36 59 51 40 31 62 33 51 33 47 ...
## $ beautyf2upper : int 6 2 5 4 9 5 5 6 5 6 ...
## $ beautyflowerdiv : int 5 4 5 2 7 6 4 4 3 5 ...
## $ beautyfupperdiv : int 7 4 2 5 9 6 4 6 7 7 ...
## $ beautym2upper : int 6 3 3 2 6 6 4 3 5 6 ...
## $ beautymlowerdiv : int 2 2 2 3 7 5 4 2 5 3 ...
## $ beautymupperdiv : int 4 3 3 3 6 5 4 3 3 6 ...
## $ btystdave : num 0.202 -0.826 -0.66 -0.766 1.421 ...
## $ btystdf2u : num 0.289 -1.619 -0.188 -0.665 1.721 ...
## $ btystdfl : num 0.458 -0.0735 0.458 -1.1365 1.521 ...
## $ btystdfu : num 0.8758 -0.577 -1.5456 -0.0927 1.8444 ...
## $ btystdm2u : num 0.682 -1.132 -1.132 -1.736 0.682 ...
## $ btystdml : num -0.9 -0.9 -0.9 -0.313 2.038 ...
## $ btystdmu : num -0.195 -0.655 -0.655 -0.655 0.723 ...
## $ class1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class2 : int 0 0 0 1 0 0 0 0 0 0 ...
## $ class3 : int 1 0 0 0 0 0 0 0 0 0 ...
## $ class4 : int 0 0 1 0 0 0 1 0 0 1 ...
## $ class5 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class6 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class7 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class8 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class9 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class10 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class11 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class12 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class13 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class14 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class15 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class16 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class17 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class18 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class19 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class20 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class21 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class22 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class23 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class24 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class25 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class26 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class27 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class28 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class29 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ class30 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ courseevaluation : num 4.3 4.5 3.7 4.3 4.4 4.2 4 3.4 4.5 3.9 ...
## $ didevaluation : int 24 17 55 40 42 182 33 25 48 16 ...
## $ female : int 1 0 0 1 1 0 1 1 1 0 ...
## $ formal : int 0 0 0 0 0 1 0 0 0 0 ...
## $ fulldept : int 1 1 1 1 1 1 1 1 1 1 ...
## $ lower : int 0 0 0 0 0 0 0 0 0 0 ...
## $ multipleclass : int 1 0 1 1 0 0 1 0 0 1 ...
## $ nonenglish : int 0 0 0 0 0 0 0 0 0 0 ...
## $ onecredit : int 0 0 0 0 0 0 0 0 0 0 ...
## $ percentevaluating: num 55.8 85 100 87 87.5 ...
## $ profevaluation : num 4.7 4.6 4.1 4.5 4.8 4.4 4.4 3.4 4.8 4 ...
## $ students : int 43 20 55 46 48 282 41 41 60 19 ...
## $ tenuretrack : int 1 1 1 1 1 1 1 1 1 0 ...
## $ blkandwhite : int 0 0 0 0 0 0 0 0 0 0 ...
## $ btystdvariance : num 2.13 1.39 2.54 1.76 1.69 ...
## $ btystdavepos : num 0.202 0 0 0 1.421 ...
## $ btystdaveneg : num 0 -0.826 -0.66 -0.766 0 ...
# convert into factors
prof$profnumber <- as.factor(prof$profnumber)
prof$female <- as.factor(prof$female)
# convert dummy `class*` variables into a factor
dummy_var <- prof[ , 18:47]
prof$class <- factor(apply (dummy_var, FUN = function(r) r %*% 1:30, MARGIN = 1))
# remove dummy variables
prof <- prof[ , -c (18:47)]
# normalize and center professor evaluation (all other predictors are binary)
prof$c.profevaluation <- prof$profevaluation - mean(prof$profevaluation) / (2*sd(prof$profevaluation))
# fit linear model
lm_prof <- lm(courseevaluation ~ female + multipleclass + onecredit + c.profevaluation*nonenglish, data = prof)
display(lm_prof)
## lm(formula = courseevaluation ~ female + multipleclass + onecredit +
## c.profevaluation * nonenglish, data = prof)
## coef.est coef.se
## (Intercept) 3.70 0.02
## female1 -0.03 0.02
## multipleclass -0.01 0.02
## onecredit 0.10 0.04
## c.profevaluation 0.94 0.02
## nonenglish -0.07 0.04
## c.profevaluation:nonenglish -0.17 0.09
## ---
## n = 463, k = 7
## residual sd = 0.19, R-Squared = 0.88
n_sims <- 1000
simulation_prof <- sim(lm_prof, n_sims)
# Instructor A: 50-year-old woman who is a native English speaker and has a beauty score of −1
simulation_1 <- coef(simulation_prof) [, 1] + 1 * coef(simulation_prof) [, 2] + rep(c (0,1), 500) * coef(simulation_prof) [, 3] + -1 * coef(simulation_prof) [, 4] + 0 * coef(simulation_prof) [, 5] + -1 * 0 * coef(simulation_prof) [, 6]
# Instructor B: 60-year-old man who is a native English speaker and has a beauty score of −0.5
simulation_2 <- coef(simulation_prof) [, 1] + 0 * coef(simulation_prof) [, 2] + rep(c (0,1), 500) * coef(simulation_prof) [, 3] + -5 * coef(simulation_prof) [, 4] + 0 * coef(simulation_prof) [, 5] + -.5 * 0 * coef(simulation_prof) [, 6]
#plotting data
ggplot(data = data.frame (x1 = simulation_1, x2 = simulation_2)) + geom_histogram(aes(x = x1), fill = "#FF95BE", alpha = .35) + geom_histogram(aes(x = x2), fill = "#B0E0E6", alpha = .35)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
eval_differences <- simulation_1 - simulation_2
ggplot(data = data.frame (x = eval_differences)) + geom_histogram(aes (x = x), fill = "#FF1493", alpha = .35) + labs (title = "Final Score Difference", x = "A vs B")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
The probability the course A will have a higher score is zero
mean(eval_differences >= 0)
## [1] 0.993
prop_f <- .50
men_weight <- 5.13
men_weight_sd <- 0.17
women_weight <- 4.96
women_weight_sd <- 0.20
number_adults <- 10
max_capacity <- 1750
n_sims <- 1000
prob_cable_breaks <- rep(NA, n_sims)
tot_weight <- rep(NA, number_adults)
for (k in 1:n_sims) {
tot_weight <- rep(NA, number_adults)
sexes <- rbinom(n=number_adults, size=1, prob=prop_f)
for (j in 1:number_adults) {
tot_weight[j] <- exp(ifelse(sexes[j]==1, rnorm(n=1, mean=women_weight, sd=women_weight_sd), rnorm(n=1, mean=men_weight, sd=men_weight_sd)))
}
prob_cable_breaks[k] <- ifelse(max_capacity < sum(tot_weight), 1, 0)
}
sum(prob_cable_breaks)/n_sims
## [1] 0.062
# Read in the data and take a peek
my_data <- read.csv("wpa_lena_pos.csv")
str(my_data)
## 'data.frame': 1488 obs. of 11 variables:
## $ id_uni : int 801 801 801 801 801 801 801 801 801 801 ...
## $ age : num 10.8 10.8 10.8 10.8 10.8 ...
## $ sit_time : num 0.13 0.211 0.371 0.294 0.151 ...
## $ held_time : num 0.00669 0.17057 0.22408 0.08361 0.19732 ...
## $ prone_time : num 0.1973 0.2408 0.0803 0.2274 0.4047 ...
## $ supine_time : num 0.137 0.137 0.13 0.13 0 ...
## $ upright_time : num 0.528 0.241 0.194 0.264 0.247 ...
## $ adult_word_cnt : num 331 500 471 288 407 ...
## $ child_utt_cnt : int 61 55 73 59 32 16 16 33 35 10 ...
## $ conv_turn_count: int 14 24 25 13 6 4 2 6 10 5 ...
## $ child_cry_scnds: num 12.1 7.31 12.16 9.6 9.31 ...
#data.frame(my_data)
lm_wpa_data <- lm(data = my_data, adult_word_cnt ~ sit_time + age)
summary(lm_wpa_data)
##
## Call:
## lm(formula = adult_word_cnt ~ sit_time + age, data = my_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -317.25 -152.03 -53.53 90.33 1272.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 210.200 69.092 3.042 0.00239 **
## sit_time 150.061 19.133 7.843 8.34e-15 ***
## age -3.874 5.821 -0.666 0.50583
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 209.9 on 1485 degrees of freedom
## Multiple R-squared: 0.03997, Adjusted R-squared: 0.03868
## F-statistic: 30.91 on 2 and 1485 DF, p-value: 7.024e-14
n <- nrow(my_data)
mu <- mean(my_data$adult_word_cnt)
theta <- sd(my_data$adult_word_cnt)
wpa_data <- rep(NA, n)
wpa_data_1 <- rep(NA, n)
for(si in 1:n) {
wpa_data_1[si] <- ifelse(si <= .5*n, 1, 2)}
for (ti in 1:1000){
wpa_data = rnorm(n, mean = mean(my_data$adult_word_cnt), sd = sd(my_data$adult_word_cnt))}
time_sitting <- my_data$sit_time
participant_age <- my_data$age
wpa_sim <- data.frame (wpa_data_1, time_sitting, participant_age, wpa_data)
ggplot(data = my_data, mapping = aes(x = sit_time, y = adult_word_cnt)) + geom_jitter(mapping = aes(color = age))
# Now sim data
ggplot(data = wpa_sim, mapping = aes(x = time_sitting, y = wpa_data)) + geom_jitter(mapping = aes(color = participant_age))
How do the new simulated data look compared to the collected data? There are negative values which does not make sense.
Is there a way to make the simulated data more similar to the collected data?
wpa_sim_2 <- wpa_sim
for (si in 1:n) {
wpa_sim_2$wpa_data[si] <- ifelse(wpa_sim$wpa_data[si]<= 10, abs(wpa_sim$wpa_data[si]),wpa_sim$wpa_data[si])
}
# plot collected data
ggplot(data = my_data, mapping = aes(x = sit_time,y = adult_word_cnt)) + geom_jitter(mapping = aes(color = age))
# plot simulated data
ggplot(data = wpa_sim_2, mapping = aes(x = time_sitting, y = wpa_data)) + geom_jitter(mapping = aes(color = participant_age))
t_test <- t.test(my_data$adult_word_cnt, wpa_sim_2$wpa_data)
t_test_1 <- t.test(my_data[my_data$age < 12.5, "adult_word_cnt"],
my_data[my_data$age > 12.5, "adult_word_cnt"])
t_test_2 <- t.test(wpa_sim_2[wpa_sim_2$participant_age < 12.5, "wpa_data"],
wpa_sim_2[wpa_sim_2$participant_age > 12.5, "wpa_data"])
t_test_1
##
## Welch Two Sample t-test
##
## data: my_data[my_data$age < 12.5, "adult_word_cnt"] and my_data[my_data$age > 12.5, "adult_word_cnt"]
## t = -0.87965, df = 366.92, p-value = 0.3796
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -43.75016 16.70626
## sample estimates:
## mean of x mean of y
## 229.3527 242.8746
t_test_2
##
## Welch Two Sample t-test
##
## data: wpa_sim_2[wpa_sim_2$participant_age < 12.5, "wpa_data"] and wpa_sim_2[wpa_sim_2$participant_age > 12.5, "wpa_data"]
## t = 0.53509, df = 388.51, p-value = 0.5929
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -16.98837 29.69306
## sample estimates:
## mean of x mean of y
## 268.4256 262.0732
I don’t know what I’m doing below. Just messing around.
mu_1 <- mean(my_data$age)
theta_1 <- sd(my_data$age)
n_sims <- 1000
wpa_simulation <- function() {
words <- rnorm(1, mu, theta)
age <- rnorm(1, mu_1, theta_1)
x <- words * age
return(x)
}
x_sim <- replicate(n_sims, wpa_simulation())
mean_x_sim <- mean(x_sim)
standard_error_x_sim <- sd(x_sim)
cat("Estimated mean x: ", mean_x_sim, "\n")
## Estimated mean x: 2591.418
cat("Standard deviation of x: ", standard_error_x_sim)
## Standard deviation of x: 2504.549
sim_lm <- arm::sim(lm_wpa_data, n_sims)
str(sim_lm)
## Formal class 'sim' [package "arm"] with 2 slots
## ..@ coef : num [1:1000, 1:3] 156 240 195 150 183 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : NULL
## .. .. ..$ : chr [1:3] "(Intercept)" "sit_time" "age"
## ..@ sigma: num [1:1000] 213 216 207 209 208 ...
age_sim <- coef(sim_lm) [, "age"] +
coef(sim_lm) [, "age"]
mean(age_sim)
## [1] -7.234379