Consider a random variable X with X ~ Bin(20, 0.4).
Using R, work out P(\(X \le 13\)).
Which Normal distribution would you use for Y to approximate X?
Calculate the equivalent probability for Y that you calculated above for X.
Now do the same for X ~ Bin(50, 0.8) and P(\(X \ge 41\)).
In both cases, note how close (or otherwise) the approximations are.
n1 <- 20
p1 <- 0.4
aa <- pbinom (13, n1, p1)
miu1 <- n1*p1
sigma1 <- sqrt (n1*p1*(1-p1))
bb <- pnorm (13+0.5, miu1, sigma1)
## The probability using binomial distribution is 0.9935341.
## The probability using approximated normal distribution is 0.9939702.
n2 <- 50
p2 <- 0.8
aa <- pbinom (41 - 1, n2, p2, lower.tail = FALSE)
miu2 <- n2*p2
sigma2 <- sqrt (n2*p2*(1-p2))
bb <- pnorm (41 - 1 - 0.5, miu2, sigma2)
## The probability using binomial distribution is 0.4437404.
## The probability using approximated normal distribution is 0.4298419.
___
Reproduce the plots in the lecture notes
binnor <- function (n, p) {
plot (0:n, dbinom (0:n, n, p),
col = "blue", type = "h",
main = paste("X ~ Binomial (", n, "," ,p, ")"),
xlab = "", ylab = "probability")
points (0:n, dbinom (0:n, n, p),
col = "blue", pch = 16)
miu <- n*p
sigma <- sqrt (n*p*(1-p))
x <- seq (0, n, length.out = 1000)
lines (x, dnorm (x, miu, sigma),
col = "red", lwd = 2)
legend ("topright", legend = c ("Binomial", "Normal"),
col = c ("blue", "red"),
lty = rep (1, times = 2), cex = 1.2)
}
par (mfrow = c (2, 2))
binnor (n = 20, p = 0.1)
binnor (n = 20, p = 0.3)
binnor (n = 40, p = 0.1)
binnor (n = 40, p = 0.3)
par (mfrow = c (2, 2))
ns <- c (20, 40)
ps <- c (0.1, 0.3)
for (n in ns) {
for (p in ps) {
binnor(n, p)
}
}
Plot the \(\chi_3^2\) distribution over the range (0, 10), using the function dchisq().
x <- seq (0, 10, length.out = 1000)
par (cex.lab = 0.8)
plot (x, dchisq (x, df = 3),
type = "l", col = "blue", lwd = 2,
ylab = substitute (chi[3]^2))
Create a function, named mean.dr(), which takes as inputs n and m and creates a vector of m means using the \(\chi_3^2\) distribution as described above.
The function should then plot these means as a histogram and add a line representing a normal distribution with mean 3 and standard deviation \(\sqrt{\frac{6}{n}}\) (the n variance of a \(\chi_3^2\) distribution is 6).
Finally, the function should output a list (see Section 4.1) containing n and m, together with the mean and standard deviation of that vector.
Run the function for a range of values of n and m and see how the points compare with the line.
If the histogram is not almost exactly on the line for large n, then you have made a mistake.
Finally, create a program which includes the function you have just created, which runs that function for 4 values of m and n of your choosing and creates a 4 × 4 plot of their outputs.
mean.dr <- function (n, m){
mtx <- matrix (nrow = n, ncol = m) # This gives me the structure of a matrix
mtx[1:n,] <- replicate (m, rchisq (n = n, df = 3))
# I put values into the matrix
M <- colMeans (mtx)
meanM <- mean(M)
sdM <- sd(M)
# save them as "result"
result_toplot <- as.numeric (M)
# plot these sample means as a histogram
hist (result_toplot, probability = TRUE,
col = "cornflowerblue", border = "darkgreen",
main = paste("histogram of means ( n =", n, ", df = " ,3, ")"))
# plot the corresponding curve of the normal distribution
x <- seq (min(result_toplot), max(result_toplot),
length.out = 1000)
mean_result <- meanM
sd_result <- sqrt (6 / n)
lines (x, dnorm (x, mean = mean_result, sd = sd_result),
col = "red", lwd = 3)
}
mean.dr (n = 500, m = 1000)
par (mfrow = c (2, 2))
mean.dr (n = 100, m = 1000)
mean.dr (n = 200, m = 1000)
mean.dr (n = 100, m = 2000)
mean.dr (n = 200, m = 2000)
par (mfrow = c (1, 1))
par (mfrow = c (2, 2))
ns <- c (100, 200)
ps <- c (1000, 2000)
for (n in ns) {
for (p in ps) {
mean.dr(n, p)
}
}
par (mfrow = c (1, 1))
Make a copy of the program from Exercise 7.4. Amend that copy so that it uses data from a N(3, 6) distribution rather than a \(\chi_3^2\) (they have the same mean and variance).
Compare the resulting plots with those from Exercise 7.4
mean.nor <- function (n, m, miu, var){
mtx <- matrix (nrow = n, ncol = m)
mtx[1:n,] <- replicate (m, rnorm (n = n, mean = miu, sd = sqrt(var)))
M <- colMeans (mtx)
meanM <- mean(M)
sdM <- sd(M)
# save them as "result"
result_toplot <- as.numeric (M)
# plot these means as a histogram
hist (result_toplot, probability = TRUE,
col = "cornflowerblue", border = "darkgreen",
main = paste("histogram of means (",substitute (miu), "=", miu, ", variance =" ,var, ")"))
x <- seq (min(result_toplot), max(result_toplot),
length.out = 1000)
mean_result <- meanM
sd_result <- sqrt (var/n)
lines (x, dnorm (x, mean = mean_result, sd = sd_result),
col = "red", lwd = 3)
}
par (mfrow = c (2, 2))
ns <- c (100, 200)
ms <- c (1000, 2000)
mius <- 3
vars <- 6
for (n in ns) {
for (m in ms) {
for (miu in mius){
for (var in vars){
mean.nor(n, m, miu, var)
}
}
}
}
This exercise is not critical, but you might be bored. Finish rewriting the loop with k specified in the for statement by replacing the command in the first line of the loop for k in terms of j with one with j in terms of k. Ensure that it produces the same matrix as the code above.
std_norm_table <- matrix(nrow = 601, ncol = 2)
for (j in 1:601) {
k <- -3 + (j - 1) / 100
std_norm_table[j,] <- c(k, round(x = pnorm(q = k), digits = 5))
}
colnames(std_norm_table) <- c("x", paste('U+03A6', "(x)", sep = ""))
The argument in the for loop () has changed instead to: for (k in seq(from = -3, to = 3, length = 601))
std_norm_table76 <- matrix(nrow = 601, ncol = 2)
for (k in seq(from = -3, to = 3, length = 601)) {
j <- round( ( k - (-3) ) *100 )
# float / double
std_norm_table76[j,] <- c(k, round(x = pnorm(q = k), digits = 5))
}
colnames(std_norm_table76) <- c("x", paste('U+03A6', "(x)", sep = ""))
j needs to be round because it represents the rows of a matrix, it should be an integer
try the function: typeof(0.8) ; typeof(1)
## x U+03A6(x)
## [1,] -2.99 0.00139
## [2,] -2.98 0.00144
## [3,] -2.97 0.00149
## [4,] -2.96 0.00154
## [5,] -2.95 0.00159
## [6,] -2.94 0.00164
Investigate the \(\chi^2\) distribution.
Write a function which takes the \(\chi^2\) parameter (e.g. 3 in Exercise 7.4) as an argument and which produces as output a table similar to that we have just produced for the standard normal distribution.
You should think of an appropriate range of values for the table (perhaps look at tables online) and use a loop.
chi_table77 <- matrix(nrow = 601, ncol = 2)
for (k in seq (from = 0,
to = 30,
length = nrow(chi_table77)
)
) {
j <- round (k * 20)
chi_table77[j,] <- c (k,
round (x = pchisq (q = k, df = 3),
digits = 5)
)
}
colnames (chi_table77) <- c ("x", "cdf")
plot (chi_table77[,1], chi_table77[,2])
## x cdf
## [1,] 0.05 0.00293
## [2,] 0.10 0.00816
## [3,] 0.15 0.01477
## [4,] 0.20 0.02241
## [5,] 0.25 0.03086
## [6,] 0.30 0.03997
Amend your program from Exercise 7.5 or that from the answer to Exercise 7.4 so that it uses a loop to write the 4 plots. Is it possible to write this program without a loop or without repeating the code four times?
Create a 1 × 3 plot where the three plots are the three log-log scatter plots you created in Exercise 3.5 (excluding those relating to population).
In each plot use six different colours to represent countries from each of the six continents.
Carry this out using one or more loops to cycle through the three plots and another one to cycle through the six continents.
Create a legend for the various colors.
cd <- read.csv ("Country_data.csv")
cd$continent <- as.factor(cd$continent)
levels(cd$continent) <- c(levels(cd$continent), "NAM")
cd[is.na (cd$continent) == TRUE, 2] <- "NAM"
df_cd_log <- data.frame (
age = log(cd$age_median),
kcal = log(cd$kcals_day),
gdp = log(cd$gdp_head),
con = cd$continent
)
par(mfrow = c(1, 3))
for (i in 1:2) {
for(j in 2:3) {
if (i != j) {
colors <- rainbow(6)
plot (df_cd_log [, i], df_cd_log [, j],
col = colors[df_cd_log$con],
pch = 16,
xlab = paste("log",colnames(df_cd_log)[i]),
ylab = paste("log",colnames(df_cd_log)[j]),
)
legend ("topleft", legend = levels(cd$continent),
col = colors, pch = 20,
title = "Continents")
fit <- lm (df_cd_log [, j] ~ df_cd_log [, i])
abline (fit, col = "red", lwd = 2)
}
}
}
par(mfrow = c(1, 1))