Monte Carlo simulation taking a set of polls results.
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dslabs)
## Warning: package 'dslabs' was built under R version 4.0.5
d <- 0.039
Ns <- c(1298, 533, 1342, 897, 774, 254, 812, 324, 1291, 1056, 2172, 516)
p <- (d + 1) / 2
polls <- map_df(Ns, function(N) {
x <- sample(c(0,1), size=N, replace=TRUE, prob=c(1-p, p))
x_hat <- mean(x)
se_hat <- sqrt(x_hat * (1 - x_hat) / N)
list(estimate = 2 * x_hat - 1,
low = 2*(x_hat - 1.96*se_hat) - 1,
high = 2*(x_hat + 1.96*se_hat) - 1,
sample_size = N)
}) %>% mutate(poll = seq_along(Ns))
library(dslabs)
data(heights)
x <- heights %>% filter(sex == "Male") %>%
pull(height)
library(dslabs)
data(heights)
x <- heights %>% filter(sex == "Male") %>% pull(height)
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.00 67.00 69.00 69.31 72.00 82.68
sd(x)
## [1] 3.611024
mu <- mean(x)
mu
## [1] 69.31475
sig <- sd(x)
sig
## [1] 3.611024
N <- 50
X <- sample(x, N, replace = TRUE)
mean(X)
## [1] 69.47492
sd(X)
## [1] 4.51242
polls data have already been loaded for you. Use the head function to examine them.data(polls_us_election_2016)
polls <- polls_us_election_2016 %>%
filter(pollster %in% c("Rasmussen Reports/Pulse Opinion Research",
"The Times-Picayune/Lucid") &
enddate >= "2016-10-15" &
state == "U.S.") %>%
mutate(spread = rawpoll_clinton/100 - rawpoll_trump/100)
head(as.tibble(polls))
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## Please use `as_tibble()` instead.
## The signature and semantics have changed, see `?as_tibble`.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## # A tibble: 6 x 16
## state startdate enddate pollster grade samplesize population
## <fct> <date> <date> <fct> <fct> <int> <chr>
## 1 U.S. 2016-11-05 2016-11-07 The Times-Picayune/Lu~ <NA> 2521 lv
## 2 U.S. 2016-11-02 2016-11-06 Rasmussen Reports/Pul~ C+ 1500 lv
## 3 U.S. 2016-11-01 2016-11-03 Rasmussen Reports/Pul~ C+ 1500 lv
## 4 U.S. 2016-11-04 2016-11-06 The Times-Picayune/Lu~ <NA> 2584 lv
## 5 U.S. 2016-10-31 2016-11-02 Rasmussen Reports/Pul~ C+ 1500 lv
## 6 U.S. 2016-11-03 2016-11-05 The Times-Picayune/Lu~ <NA> 2526 lv
## # ... with 9 more variables: rawpoll_clinton <dbl>, rawpoll_trump <dbl>,
## # rawpoll_johnson <dbl>, rawpoll_mcmullin <dbl>, adjpoll_clinton <dbl>,
## # adjpoll_trump <dbl>, adjpoll_johnson <dbl>, adjpoll_mcmullin <dbl>,
## # spread <dbl>
sigma that contains a column for pollster and a column for s, the standard deviation of the spreadsigma <- polls %>% group_by(pollster) %>%
summarise(s = sd(spread)) %>%
ungroup()
sigma
## # A tibble: 2 x 2
## pollster s
## <fct> <dbl>
## 1 Rasmussen Reports/Pulse Opinion Research 0.0177
## 2 The Times-Picayune/Lucid 0.0268
res that summarizes the average, standard deviation, and number of polls for the two pollsters.res <- polls %>% group_by(pollster) %>%
summarise(avg = mean(spread),
s = sd(spread),
N = n()) %>%
ungroup()
estimate. Print this value to the console.estimate <- abs(res$avg[1]-res$avg[2])
estimate
## [1] 0.05229167
se_hat. Print this value to the console.se_hat <- sqrt(res$s[2]^2/res$N[2] + res$s[1]^2/res$N[1])
se_hat
## [1] 0.007031433
ci.Q <- qnorm(0.975)
lower <- estimate - Q*se_hat
upper <- estimate + Q*se_hat
ci <- c(lower, upper)
ci
## [1] 0.03851031 0.06607302
res to summarize the average, standard deviation, and number of polls for the two pollsters.res <- polls %>% group_by(pollster) %>%
summarize(avg = mean(spread), s = sd(spread), N = n())
estimate and se_hat contain the spread estimates and standard error, respectively.estimate <- res$avg[2] - res$avg[1]
se_hat <- sqrt(res$s[2]^2/res$N[2] + res$s[1]^2/res$N[1])
2*(1-pnorm(estimate/se_hat))
## [1] 1.030287e-13
polls <- polls_us_election_2016 %>%
filter(enddate >= "2016-10-15" &
state == "U.S.") %>%
group_by(pollster) %>%
filter(n() >= 5) %>%
mutate(spread = rawpoll_clinton/100 - rawpoll_trump/100) %>%
ungroup()
var that contains columns for the pollster, mean spread, and standard deviation. Print the contents of this object to the console.var <- polls %>% group_by(pollster) %>%
summarize(avg = mean(spread), s = sd(spread))
head(var)
## # A tibble: 6 x 3
## pollster avg s
## <fct> <dbl> <dbl>
## 1 ABC News/Washington Post 0.0373 0.0339
## 2 CVOTER International 0.0279 0.0180
## 3 Google Consumer Surveys 0.0303 0.0185
## 4 Gravis Marketing 0.016 0.0152
## 5 IBD/TIPP 0.00105 0.0168
## 6 Ipsos 0.0553 0.0195
\[\begin{align*} \mathrm{P}(1st)& = probability\,of\, developing\, SIDS = 1/8500 \\ \mathrm{P}(2nd \mid 1st)& =\, probability\, of\, developing\, SIDS\, given\, another\, sibling\, has\, the\, condition\, = 1/100 \\ \mathrm{P}(2nd \mid 1st)& =\frac{\mathrm{P}(2nd \cap 1st)}{\mathrm{P}(1st)} \\ \Leftrightarrow\, \\ \mathrm{P}(2nd \cap 1st)& =\mathrm{P}(2nd \mid 1st)*\mathrm{P}(1st)=\mathrm{P}(2nd \cap 1st)=\frac{1}{8500}*\frac{1}{100}=\frac{1}{850000} \end{align*}\]
library(tidyverse)
library(dslabs)
data(polls_us_election_2016)
polls <- polls_us_election_2016 %>%
filter(state == "Florida" & enddate >= "2016-11-04" ) %>%
mutate(spread = rawpoll_clinton/100 - rawpoll_trump/100)
results <- polls %>%
summarize(avg = mean(spread), se = sd(spread)/sqrt(n()))
results
## avg se
## 1 0.004154545 0.007218692
# Load the libraries and poll data
library(dplyr)
library(dslabs)
data(polls_us_election_2016)
# Create an object `polls` that contains the spread of predictions for each candidate in Florida during the last polling days
polls <- polls_us_election_2016 %>%
filter(state == "Florida" & enddate >= "2016-11-04" ) %>%
mutate(spread = rawpoll_clinton/100 - rawpoll_trump/100)
# Examine the `polls` object using the `head` function
head(polls)
## state startdate enddate pollster grade
## 1 Florida 2016-11-03 2016-11-06 Quinnipiac University A-
## 2 Florida 2016-11-02 2016-11-04 YouGov B
## 3 Florida 2016-11-01 2016-11-07 SurveyMonkey C-
## 4 Florida 2016-11-06 2016-11-06 Trafalgar Group C
## 5 Florida 2016-11-05 2016-11-06 Opinion Savvy/InsiderAdvantage C-
## 6 Florida 2016-11-02 2016-11-06 Rasmussen Reports/Pulse Opinion Research C+
## samplesize population rawpoll_clinton rawpoll_trump rawpoll_johnson
## 1 884 lv 46.00 45.00 2.00
## 2 1188 rv 45.00 45.00 NA
## 3 4092 lv 47.00 45.00 4.00
## 4 1100 lv 46.13 49.72 2.43
## 5 843 lv 48.40 46.40 3.00
## 6 525 lv 47.00 45.00 2.00
## rawpoll_mcmullin adjpoll_clinton adjpoll_trump adjpoll_johnson
## 1 NA 46.44315 43.93999 2.098310
## 2 NA 47.07455 46.99468 NA
## 3 NA 45.59190 44.32744 1.692430
## 4 NA 45.75904 46.82230 3.495849
## 5 NA 47.37976 45.68637 2.721857
## 6 NA 47.55885 45.13673 2.418502
## adjpoll_mcmullin spread
## 1 NA 0.0100
## 2 NA 0.0000
## 3 NA 0.0200
## 4 NA -0.0359
## 5 NA 0.0200
## 6 NA 0.0200
# Create an object called `results` that has two columns containing the average spread (`avg`) and the standard error (`se`). Print the results to the console.
results <- polls %>%
summarize(avg = mean(spread), se = sd(spread)/sqrt(n()))
results
## avg se
## 1 0.004154545 0.007218692
# The results` object has already been loaded. Examine the values stored: `avg` and `se` of the spread
results
## avg se
## 1 0.004154545 0.007218692
# Define `mu` and `tau`
mu <- 0
tau <- 0.01
# Define a variable called `sigma` that contains the standard error in the object `results`
sigma <- results %>% pull(se[1])
# Define a variable called `Y` that contains the average in the object `results`
Y <- results %>% pull(avg[1])
# Define a variable `B` using `sigma` and `tau`. Print this value to the console.
B <- sigma^2/(sigma^2+tau^2)
# Calculate the expected value of the posterior distribution
mu+(1-B)*(Y-mu)
## [1] 0.002731286
# Here are the variables we have defined
mu <- 0
tau <- 0.01
sigma <- results$se
Y <- results$avg
B <- sigma^2 / (sigma^2 + tau^2)
# Compute the standard error of the posterior distribution. Print this value to the console.
sqrt(1/((1/sigma^2)+(1/tau^2)))
## [1] 0.005853024
# Here are the variables we have defined in previous exercises
mu <- 0
tau <- 0.01
sigma <- results$se
Y <- results$avg
B <- sigma^2 / (sigma^2 + tau^2)
se <- sqrt( 1/ (1/sigma^2 + 1/tau^2))
E <- mu+(1-B)*(Y-mu) # Expected value
# Construct the 95% credible interval. Save the lower and then the upper confidence interval to a variable called `ci`.
lower <- E-qnorm(.975)*se
upper <- E+qnorm(.975)*se
ci <- c(lower, upper)
ci
## [1] -0.008740432 0.014203003
# Assign the expected value of the posterior distribution to the variable `exp_value`
exp_value <- B*mu + (1-B)*Y
exp_value
## [1] 0.002731286
# Assign the standard error of the posterior distribution to the variable `se`
se <- sqrt( 1/ (1/sigma^2 + 1/tau^2))
# Using the `pnorm` function, calculate the probability that the actual spread was less than 0 (in Trump's favor). Print this value to the console.
pnorm(0,exp_value, se)
## [1] 0.3203769
# Define the variables from previous exercises
mu <- 0
sigma <- results$se
Y <- results$avg
# Define a variable `taus` as different values of tau
taus <- seq(0.005, 0.05, len = 100)
# Create a function called `p_calc` that generates `B` and calculates the probability of the spread being less than 0
p_calc <- function(tau){
B <- sigma^2/(sigma^2+tau^2)
E <- B*mu + (1-B)*Y
se <- sqrt( 1/ (1/sigma^2 + 1/tau^2))
pnorm(0, E, se)
}
# Create a vector called `ps` by applying the function `p_calc` across values in `taus`
ps <- p_calc(taus)
# Plot `taus` on the x-axis and `ps` on the y-axis
chances_df <- data.frame(ps, taus)
ggplot(data = chances_df, mapping = aes(x = taus, y = ps)) +
geom_point(colour = "blue") +
ylab("ps") +
xlab("tau") +
ggtitle("Chances of winning in FL - US Presidential Elections 2016",
subtitle = "estimative based on a vector of values of tau")
# Load the libraries and data
library(dplyr)
library(dslabs)
data("polls_us_election_2016")
# Create a table called `polls` that filters by state, date, and reports the spread
polls <- polls_us_election_2016 %>%
filter(state != "U.S." & enddate >= "2016-10-31") %>%
mutate(spread = rawpoll_clinton/100 - rawpoll_trump/100)
# Create an object called `cis` that has the columns indicated in the instructions
cis <- polls %>% mutate(X_hat = (spread+1)/2,
se = 2*sqrt(X_hat*(1-X_hat)/samplesize),
lower = spread - qnorm(0.975)*se,
upper = spread + qnorm(0.975)*se) %>%
select(state, startdate, enddate, pollster, grade, spread, lower, upper)
# Add the actual results to the `cis` data set
add <- results_us_election_2016 %>% mutate(actual_spread = clinton/100 - trump/100) %>% select(state, actual_spread)
ci_data <- cis %>% mutate(state = as.character(state)) %>% left_join(add, by = "state")
# Create an object called `p_hits` that summarizes the proportion of confidence intervals that contain the actual value. Print this object to the console.
p_hits <- ci_data %>%
mutate(hit = c(actual_spread >= lower & actual_spread <= upper)) %>%
summarize(mean(hit == TRUE))
# The `cis` data have already been loaded for you
add <- results_us_election_2016 %>% mutate(actual_spread = clinton/100 - trump/100) %>% select(state, actual_spread)
ci_data <- cis %>% mutate(state = as.character(state)) %>% left_join(add, by = "state")
# Create an object called `p_hits` that summarizes the proportion of hits for each pollster that has at least 5 polls.
p_hits <- ci_data %>%
mutate(hit = actual_spread >= lower & actual_spread <= upper) %>%
group_by(pollster) %>%
filter(n() >= 5) %>%
summarize(proportion_hits = mean(hit), n = n(), grade = grade[1]) %>%
arrange(desc(proportion_hits))
# The `cis` data have already been loaded for you
add <- results_us_election_2016 %>% mutate(actual_spread = clinton/100 - trump/100) %>% select(state, actual_spread)
ci_data <- cis %>% mutate(state = as.character(state)) %>% left_join(add, by = "state")
# Create an object called `p_hits` that summarizes the proportion of hits for each state that has more than 5 polls.
p_hits <- ci_data %>%
mutate(hit = actual_spread >= lower & actual_spread <= upper) %>%
group_by(state) %>%
filter(n() > 5) %>%
summarize(proportion_hits = mean(hit), n = n()) %>%
arrange(desc(proportion_hits))
library(ggplot2)
# The `p_hits` data have already been loaded for you. Use the `head` function to examine it.
head(p_hits)
## # A tibble: 6 x 3
## state proportion_hits n
## <chr> <dbl> <int>
## 1 Connecticut 1 13
## 2 Delaware 1 12
## 3 Rhode Island 1 10
## 4 New Mexico 0.941 17
## 5 Washington 0.933 15
## 6 Oregon 0.929 14
# Make a barplot of the proportion of hits for each state
p_hits %>%
ggplot(aes(x= reorder(state, +proportion_hits), y= proportion_hits)) +
geom_bar(stat = "identity", width = 0.80, position = position_dodge(0.9), fill= "lightblue") +
ylab("Proportion of polls within the CI") +
xlab("State") +
theme(axis.text=element_text(size=7), aspect.ratio=16/9) +
coord_flip()
# The `cis` data have already been loaded. Examine it using the `head` function.
cis <- cis %>% mutate(state = as.character(state)) %>%
left_join(add, by = "state")
head(cis)
## state startdate enddate pollster grade spread
## 1 New Mexico 2016-11-06 2016-11-06 Zia Poll <NA> 0.02
## 2 Virginia 2016-11-03 2016-11-04 Public Policy Polling B+ 0.05
## 3 Iowa 2016-11-01 2016-11-04 Selzer & Company A+ -0.07
## 4 Wisconsin 2016-10-26 2016-10-31 Marquette University A 0.06
## 5 North Carolina 2016-11-04 2016-11-06 Siena College A 0.00
## 6 Georgia 2016-11-06 2016-11-06 Landmark Communications B -0.03
## lower upper actual_spread
## 1 -0.001331221 0.0413312213 0.083
## 2 -0.005634504 0.1056345040 0.054
## 3 -0.139125210 -0.0008747905 -0.094
## 4 0.004774064 0.1152259363 -0.007
## 5 -0.069295191 0.0692951912 -0.036
## 6 -0.086553820 0.0265538203 -0.051
# Create an object called `errors` that calculates the difference between the predicted and actual spread and indicates if the correct winner was predicted
errors <- cis %>% mutate(error = spread - actual_spread,
hit = sign(spread) == sign(actual_spread))
# Examine the last 6 rows of `errors`
tail(errors)
## state startdate enddate pollster grade spread
## 807 Utah 2016-10-04 2016-11-06 YouGov B -0.0910
## 808 Utah 2016-10-25 2016-10-31 Google Consumer Surveys B -0.0121
## 809 South Dakota 2016-10-28 2016-11-02 Ipsos A- -0.1875
## 810 Washington 2016-10-21 2016-11-02 Ipsos A- 0.0838
## 811 Utah 2016-11-01 2016-11-07 Google Consumer Surveys B -0.1372
## 812 Oregon 2016-10-21 2016-11-02 Ipsos A- 0.0905
## lower upper actual_spread error hit
## 807 -0.1660704570 -0.01592954 -0.180 0.0890 TRUE
## 808 -0.1373083389 0.11310834 -0.180 0.1679 TRUE
## 809 -0.3351563485 -0.03984365 -0.298 0.1105 TRUE
## 810 -0.0004028265 0.16800283 0.162 -0.0782 TRUE
## 811 -0.2519991224 -0.02240088 -0.180 0.0428 TRUE
## 812 -0.0019261469 0.18292615 0.110 -0.0195 TRUE
# Create an object called `errors` that calculates the difference between the predicted and actual spread and indicates if the correct winner was predicted
errors <- cis %>% mutate(error = spread - actual_spread, hit = sign(spread) == sign(actual_spread))
# Create an object called `p_hits` that summarizes the proportion of hits for each state that has 5 or more polls
p_hits <- errors %>% group_by(state) %>%
filter(state >= 5) %>%
summarize(proportion_hits = mean(hit), n = n())
# Make a barplot of the proportion of hits for each state
p_hits %>%
ggplot(aes(x= reorder(state,+proportion_hits), y=proportion_hits))+
geom_bar(stat = "identity", width = 0.80,
position = position_dodge(0.9),fill= "lightblue")+
ylab("Proportion of polls within the CI") +
xlab("State") +
theme(axis.text=element_text(size=7),
aspect.ratio=16/9) +
coord_flip()
# The `errors` data have already been loaded. Examine them using the `head` function.
head(errors)
## state startdate enddate pollster grade spread
## 1 New Mexico 2016-11-06 2016-11-06 Zia Poll <NA> 0.02
## 2 Virginia 2016-11-03 2016-11-04 Public Policy Polling B+ 0.05
## 3 Iowa 2016-11-01 2016-11-04 Selzer & Company A+ -0.07
## 4 Wisconsin 2016-10-26 2016-10-31 Marquette University A 0.06
## 5 North Carolina 2016-11-04 2016-11-06 Siena College A 0.00
## 6 Georgia 2016-11-06 2016-11-06 Landmark Communications B -0.03
## lower upper actual_spread error hit
## 1 -0.001331221 0.0413312213 0.083 -0.063 TRUE
## 2 -0.005634504 0.1056345040 0.054 -0.004 TRUE
## 3 -0.139125210 -0.0008747905 -0.094 0.024 TRUE
## 4 0.004774064 0.1152259363 -0.007 0.067 FALSE
## 5 -0.069295191 0.0692951912 -0.036 0.036 FALSE
## 6 -0.086553820 0.0265538203 -0.051 0.021 TRUE
# Generate a histogram of the error
hist(errors$error)
# Calculate the median of the errors. Print this value to the console.
median(errors$error)
## [1] 0.037
# The `errors` data have already been loaded. Examine them using the `head` function.
head(errors)
## state startdate enddate pollster grade spread
## 1 New Mexico 2016-11-06 2016-11-06 Zia Poll <NA> 0.02
## 2 Virginia 2016-11-03 2016-11-04 Public Policy Polling B+ 0.05
## 3 Iowa 2016-11-01 2016-11-04 Selzer & Company A+ -0.07
## 4 Wisconsin 2016-10-26 2016-10-31 Marquette University A 0.06
## 5 North Carolina 2016-11-04 2016-11-06 Siena College A 0.00
## 6 Georgia 2016-11-06 2016-11-06 Landmark Communications B -0.03
## lower upper actual_spread error hit
## 1 -0.001331221 0.0413312213 0.083 -0.063 TRUE
## 2 -0.005634504 0.1056345040 0.054 -0.004 TRUE
## 3 -0.139125210 -0.0008747905 -0.094 0.024 TRUE
## 4 0.004774064 0.1152259363 -0.007 0.067 FALSE
## 5 -0.069295191 0.0692951912 -0.036 0.036 FALSE
## 6 -0.086553820 0.0265538203 -0.051 0.021 TRUE
# Create a boxplot showing the errors by state for polls with grades B+ or higher
errors %>% filter(grade %in% c("A+", "A", "A-", "B+") | is.na(grade)) %>%
mutate(state = reorder(state, error)) %>%
ggplot(aes(x = state, y = error, fill = state)) +
geom_boxplot()+
geom_point() +
theme(axis.text.x = element_text(size = 7, angle = 90, vjust=.5, hjust=1), legend.position = "none")
# The `errors` data have already been loaded. Examine them using the `head` function.
head(errors)
## state startdate enddate pollster grade spread
## 1 New Mexico 2016-11-06 2016-11-06 Zia Poll <NA> 0.02
## 2 Virginia 2016-11-03 2016-11-04 Public Policy Polling B+ 0.05
## 3 Iowa 2016-11-01 2016-11-04 Selzer & Company A+ -0.07
## 4 Wisconsin 2016-10-26 2016-10-31 Marquette University A 0.06
## 5 North Carolina 2016-11-04 2016-11-06 Siena College A 0.00
## 6 Georgia 2016-11-06 2016-11-06 Landmark Communications B -0.03
## lower upper actual_spread error hit
## 1 -0.001331221 0.0413312213 0.083 -0.063 TRUE
## 2 -0.005634504 0.1056345040 0.054 -0.004 TRUE
## 3 -0.139125210 -0.0008747905 -0.094 0.024 TRUE
## 4 0.004774064 0.1152259363 -0.007 0.067 FALSE
## 5 -0.069295191 0.0692951912 -0.036 0.036 FALSE
## 6 -0.086553820 0.0265538203 -0.051 0.021 TRUE
# Create a boxplot showing the errors by state for states with at least 5 polls with grades B+ or higher.
errors %>% filter(grade %in% c("A+", "A", "A-", "B+") | is.na(grade)) %>%
group_by(state) %>%
filter(state >= 5) %>%
ungroup %>%
mutate(state = reorder(state, error)) %>%
ggplot(aes(x = state, y = error, fill = state)) +
geom_boxplot()+
geom_point() +
theme(axis.text.x = element_text(size = 7, angle = 90, vjust=.5, hjust=1), legend.position = "none")
# Calculate the probability of seeing t-distributed random variables being more than 2 in absolute value when 'df = 3'.
1-pt(2,3)+pt(-2,3)
## [1] 0.139326
# Generate a vector 'df' that contains a sequence of numbers from 3 to 50
df <- c(3:50)
# Make a function called 'pt_func' that calculates the probability that a value is more than |2| for any degrees of freedom
pt_func <- function(n) {
1-pt(2,n)+pt(-2,n)
}
# Generate a vector 'probs' that uses the `pt_func` function to calculate the probabilities
probs <- sapply(df, pt_func)
# Plot 'df' on the x-axis and 'probs' on the y-axis
plot(df, probs)
# Load the necessary libraries and data
library(dslabs)
library(dplyr)
data(heights)
# Use the sample code to generate 'x', a vector of male heights
x <- heights %>% filter(sex == "Male") %>%
.$height
# Create variables for the mean height 'mu', the sample size 'N', and the number of times the simulation should run 'B'
mu <- mean(x)
N <- 15
B <- 10000
se <- sd(x)/sqrt(N)
# Use the `set.seed` function to make sure your answer matches the expected result after random sampling
set.seed(1)
# Generate a logical vector 'res' that contains the results of the simulations
res <- replicate(B, {
X <- sample(x, N, replace = TRUE)
interval <- mean(X) + c(-1,1)*qnorm(0.975)*sd(X)/sqrt(N)
between(mu, interval[1], interval[2])
})
# Calculate the proportion of times the simulation produced values within the 95% confidence interval. Print this value to the console.
mean(res)
## [1] 0.9331
# The vector of filtered heights 'x' has already been loaded for you. Calculate the mean.
mu <- mean(x)
# Use the same sampling parameters as in the previous exercise.
set.seed(1)
N <- 15
B <- 10000
# Generate a logical vector 'res' that contains the results of the simulations using the t-distribution
res <- replicate(B, {
t <- sample(x, N, replace = TRUE)
interval <- mean(t) + c(-1,1)*qt(0.975, N-1)*sd(t)/sqrt(N)
between(mu, interval[1], interval[2])
})
# Calculate the proportion of times the simulation produced values within the 95% confidence interval. Print this value to the console.
mean(res)
## [1] 0.9512
# The 'errors' data have already been loaded. Examine them using the `head` function.
head(errors)
## state startdate enddate pollster grade spread
## 1 New Mexico 2016-11-06 2016-11-06 Zia Poll <NA> 0.02
## 2 Virginia 2016-11-03 2016-11-04 Public Policy Polling B+ 0.05
## 3 Iowa 2016-11-01 2016-11-04 Selzer & Company A+ -0.07
## 4 Wisconsin 2016-10-26 2016-10-31 Marquette University A 0.06
## 5 North Carolina 2016-11-04 2016-11-06 Siena College A 0.00
## 6 Georgia 2016-11-06 2016-11-06 Landmark Communications B -0.03
## lower upper actual_spread error hit
## 1 -0.001331221 0.0413312213 0.083 -0.063 TRUE
## 2 -0.005634504 0.1056345040 0.054 -0.004 TRUE
## 3 -0.139125210 -0.0008747905 -0.094 0.024 TRUE
## 4 0.004774064 0.1152259363 -0.007 0.067 FALSE
## 5 -0.069295191 0.0692951912 -0.036 0.036 FALSE
## 6 -0.086553820 0.0265538203 -0.051 0.021 TRUE
# Generate an object called 'totals' that contains the numbers of good and bad predictions for polls rated A- and C-
totals <- errors %>%
filter(grade %in% c("A-", "C-")) %>%
group_by(grade, hit) %>%
summarise(hits = n()) %>%
spread(grade, hits)
## `summarise()` has grouped output by 'grade'. You can override using the `.groups` argument.
head(totals)
## # A tibble: 2 x 3
## hit `C-` `A-`
## <lgl> <int> <int>
## 1 FALSE 50 26
## 2 TRUE 311 106
# Print the proportion of hits for grade A- polls to the console
totals[[2,3]]/sum(totals[[3]])
## [1] 0.8030303
# Print the proportion of hits for grade C- polls to the console
totals[[2,2]]/sum(totals[[2]])
## [1] 0.8614958
# The 'totals' data have already been loaded. Examine them using the `head` function.
head(totals)
## # A tibble: 2 x 3
## hit `C-` `A-`
## <lgl> <int> <int>
## 1 FALSE 50 26
## 2 TRUE 311 106
# Perform a chi-squared test on the hit data. Save the results as an object called 'chisq_test'.
chisq_test <- totals %>% select(!hit) %>% chisq.test()
chisq_test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: .
## X-squared = 2.1053, df = 1, p-value = 0.1468
# Print the p-value of the chi-squared test to the console
chisq_test$p.value
## [1] 0.1467902
# The 'totals' data have already been loaded. Examine them using the `head` function.
head(totals)
## # A tibble: 2 x 3
## hit `C-` `A-`
## <lgl> <int> <int>
## 1 FALSE 50 26
## 2 TRUE 311 106
# Generate a variable called `odds_C` that contains the odds of getting the prediction right for grade C- polls
odds_C <- with(totals, (totals[[2,2]]/sum(totals[[2]])) / (totals[[1,2]]/sum(totals[[2]])))
# Generate a variable called `odds_A` that contains the odds of getting the prediction right for grade A- polls
odds_A <- with(totals, (totals[[2,3]]/sum(totals[[3]])) / (totals[[1,3]]/sum(totals[[3]])))
# Calculate the odds ratio to determine how many times larger the odds ratio is for grade A- polls than grade C- polls
odds_A/odds_C
## [1] 0.6554539
# suggested libraries and options
library(tidyverse)
options(digits = 3)
# load brexit_polls object
library(dslabs)
data(brexit_polls)
p <- 0.481 # official proportion voting "Remain"
d <- 2*p-1 # official spread
brexit_polls <- brexit_polls %>%
mutate(x_hat = (spread + 1)/2)
# What is the average of the observed spreads (spread)?
mean(brexit_polls$spread)
## [1] 0.0201
# What is the standard deviation of the observed spreads?
sd(brexit_polls$spread)
## [1] 0.0588
# What is the average of x_hat, the estimates of the parameter p?
mean(brexit_polls$x_hat)
## [1] 0.51
# What is the standard deviation of x_hat?
sd(brexit_polls$x_hat)
## [1] 0.0294
brexit_polls[1,]
## startdate enddate pollster poll_type samplesize remain leave undecided
## 1 2016-06-23 2016-06-23 YouGov Online 4772 0.52 0.48 0
## spread x_hat
## 1 0.04 0.52
# What is the lower bound of the 95% confidence interval?
0.52 - qnorm(.975)*sqrt(.52*(1-.52)/4772)
## [1] 0.506
# What is the upper bound of the 95% confidence interval?
0.52 + qnorm(.975)*sqrt(.52*(1-.52)/4772)
## [1] 0.534
# Create the data frame june_polls containing only Brexit polls ending in June 2016
june_polls <- data.frame(brexit_polls %>%
filter(enddate >= "2016-06-01" & enddate <= "2016-06-30") %>%
mutate(se_x_hat = sqrt(x_hat * (1-x_hat)/samplesize),
se_d = 2*se_x_hat,
lower = (2 * x_hat - 1) - qnorm(.975)*se_d,
upper = (2 * x_hat - 1) + qnorm(.975)*se_d,
hit = -0.038 >= lower & -0.038 <= upper,
zero_ci = c(lower <= 0 & upper >= 0),
above_zero = c(lower > 0)
)
)
# How many polls are in june_polls?
print(nrow(june_polls))
## [1] 32
# What proportion of polls have a confidence interval that covers the value 0?
mean(june_polls$zero_ci)
## [1] 0.625
# What proportion of polls predict "Remain" (confidence interval entirely above 0)?
mean(june_polls$above_zero)
## [1] 0.125
# What proportion of polls have a confidence interval covering the true value of d?
mean(june_polls$hit)
## [1] 0.562
# Group and summarize the june_polls object by pollster to find the proportion of
# hits for each pollster and the number of polls per pollster. Use arrange() to sort by hit rate.
june_polls %>% group_by(pollster) %>%
summarize(proportion = mean(hit), polls = n()) %>%
arrange(desc(proportion))
## # A tibble: 12 x 3
## pollster proportion polls
## <fct> <dbl> <int>
## 1 ICM 1 3
## 2 Survation/IG Group 1 1
## 3 TNS 1 2
## 4 Opinium 0.667 3
## 5 ORB 0.667 3
## 6 YouGov 0.556 9
## 7 Ipsos MORI 0.5 2
## 8 Survation 0.5 2
## 9 ComRes 0.333 3
## 10 BMG Research 0 2
## 11 ORB/Telegraph 0 1
## 12 Populus 0 1
june_polls %>%
ggplot(aes(poll_type, spread, colour = poll_type))+
labs(title="Spread per poll types", x="Poll type", y = "Spread")+
geom_boxplot()+
geom_jitter(shape=16, position=position_jitter(0.2), colour = "black", legend.position = "none")
## Warning: Ignoring unknown parameters: legend.position
combined_by_type <- june_polls %>%
group_by(poll_type) %>%
summarize(N = sum(samplesize),
spread = sum(spread*samplesize)/N,
p_hat = (spread + 1)/2)
combined_by_type %>%
mutate(lower = spread - qnorm(.975)*2*sqrt(p_hat*(1-p_hat)/N),
upper = spread + qnorm(.975)*2*sqrt(p_hat*(1-p_hat)/N),
ci_amplitude = abs(upper-lower),
hit_true_d = lower <= -0.038 & upper >= -0.038
)
## # A tibble: 2 x 8
## poll_type N spread p_hat lower upper ci_amplitude hit_true_d
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl>
## 1 Online 46711 -0.00741 0.496 -0.0165 0.00165 0.0181 FALSE
## 2 Telephone 13490 0.0127 0.506 -0.00414 0.0296 0.0337 FALSE
# suggested libraries
library(tidyverse)
# load brexit_polls object and add x_hat column
library(dslabs)
data(brexit_polls)
brexit_polls <- brexit_polls %>%
mutate(x_hat = (spread + 1)/2)
# final proportion voting "Remain"
p <- 0.481
brexit_hit <- brexit_polls %>%
mutate(p_hat = (spread + 1)/2,
se_spread = 2*sqrt(p_hat*(1-p_hat)/samplesize),
spread_lower = spread - qnorm(.975)*se_spread,
spread_upper = spread + qnorm(.975)*se_spread,
hit = spread_lower < -0.038 & spread_upper > -0.038) %>%
select(poll_type, hit)
# Contingency table for brexit_hit
brexit_table <- t(table(brexit_hit$poll_type, brexit_hit$hit))
brexit_table
##
## Online Telephone
## FALSE 37 32
## TRUE 48 10
# Chi-square
chisq.test(brexit_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: brexit_table
## X-squared = 11, df = 1, p-value = 0.001
# Calculate the odds that an online poll generates a confidence interval that covers the actual value of the spread.
odds_table <- data.frame(poll_type = c("no hit", "hit"),
online = c(brexit_table[1,1], brexit_table[2,1]),
telephone = c(brexit_table[1,2], brexit_table[2,2])
)
odds_online <- with(odds_table, (online[2]/sum(online))/(online[1]/sum(online)))
odds_online
## [1] 1.3
# Calculate the odds that a telephone poll generates a confidence interval that covers the actual value of the spread.
odds_telephone <- with(odds_table, (telephone[2]/sum(telephone))/(telephone[1]/sum(telephone)))
odds_telephone
## [1] 0.312
# Calculate the odds ratio to determine how many times larger the odds are for online polls to hit versus telephone polls.
odds_online/odds_telephone
## [1] 4.15