Exercise 7.1. ###############################################
### Expectation: (Note: in all of the following questions, p = 0.2, days = 365)
days <- 365
p <- 0.2
E <- p * days
print(E)
## [1] 73
### Probability that I wear an ironed shirt 16 days or fewer:
sum (dbinom (x = 1:16, size = days, prob = p))
## [1] 4.096763e-18
Exercise 7.2. ###############################################
### probability of the sum of the 17 right-hand-tail entries
sum_of_the_17_right_hand_tail <- 1 - pbinom (q = days - 17,
size = days,
prob = p)
print(sum_of_the_17_right_hand_tail)
## [1] 0
pbinom(q = days - 17, size = days, prob = p, lower.tail = FALSE)
## [1] 1.109471e-218
##########(what's the difference: the two expression above both give the same results,
########## the first expression with 1 - the probability is more preferred~~~)
they are not the same, because the expected value is 73, so the plot of this event is positively skewed:
par (mfrow = c (1,1))
plot (0:days,
dbinom ( x = 0:days, size = days, prob = p),
type = "l", lwd = 2, col = "blue",
xlab = "no. days wear ironed shirts",
ylab = "probability")
abline (v = c (73), col = "red", lty = 2)
In other words, it has longer tail, so the probability of the sum of 17 right-hand tail entries is much smaller than that of the left-hand tail counterpart.
### P (number of days begins with number 2)
sum ( dbinom (x = c (20:29, 200:299), size = days, prob = p))
## [1] 1.625303e-10
### sum of the log ?
sum ( dbinom (x = c (20:29, 200:299), size = days, prob = p,
log = TRUE))
## [1] -21077.44
### sum of all prob in X is 1:
pbinom (q = days, size = days, prob = p)
## [1] 1
Exercise 7.3. ###############################################
### skewness of the distribution:
the distribution of X is positively skewed, because the mean value is less than the half of the total size of the data
X <- dbinom (x = 0:days, size = days, prob = 0.2)
barplot ( X,
names.arg = 0:days,
main = "Binomial Distribution",
xlab = "Number of ironed shirts worn", ylab = "Probability")
### median and mode, and mean (comparison)
median_X <- median(order(X))
print (median_X)
## [1] 183.5
mode_X <- (0:365)[which.max(X)]
print (mode_X)
## [1] 73
mean_X <- sum (0:days * X)
print (mean_X)
## [1] 73
### I drew the three lines vertically into the graph to make it more visualize
abline ( v = c (median_X, mode_X, mean_X),
col = c ("blue", "red", "gold"), lwd = 2, lty = c (2, 2, 3))
# Comments on comparison: mean and mode shares the same value.
Exercise 7.4. ###############################################
### calculate P (Y = 7)
dgeom (x = 7, prob = 0.2)
## [1] 0.04194304
### shape of the distribution and expectation of Y
Y <- dgeom (x = 0:days, prob = p)
par (mfrow = c (1,2))
barplot (Y,
names.arg = 0:days,
main = "Geometric Distribution",
xlab = "Number of Successes (Y)",
ylab = "Probability")
### take a look at the part before Y = 35
barplot (dgeom (x = 0:35, prob = p),
main = "Geometric Distribution \n(closer look)",
xlab = "Number of Successes (Y)",
ylab = "Probability")
# This distribution of Y is positively skewed
mean_Y <- sum (0:days * Y)
print (mean_Y)
## [1] 4
Exercise 7.5. ###############################################
par (mfrow = c (1,1))
poi5 <- dpois(0:20, lambda = 5)
plot (poi5, pch = 20,
xlab = "value of x", ylab = "probability", main = "Lines of mean = 5",
ylim = c(0,0.25))
lines(poi5, lwd = 2)
## the mean of the distribution Poi(5)
mean_poi5 <- sum (0:20 * poi5)
print(mean_poi5)
## [1] 4.999998
################################ add the following lines (3):
### bin(10,0.5) distribution in red
bin10_0.5 <- dbinom (0:10, size = 10, prob = 0.5)
points(bin10_0.5, col = "red", pch = 17)
lines(bin10_0.5, col = "red", lwd = 2)
# the mean of this bin(10, 0.5) distribution
mean_bin10_0.5 <- sum (0:10 * bin10_0.5)
mean_bin10_0.5
## [1] 5
# the mean of this distribution is 5
### the bin(20, 0.25) distribution in blue and find its mean
bin20_0.25 <- dbinom (0:20, size = 20, prob = 0.25)
points (bin20_0.25, col = "blue", pch = 15)
lines (bin20_0.25, col = "blue", lwd = 2, lty = 3)
# the mean of this bin(20, 0.25) distribution
mean_bin20_0.25 <- sum (0:20 * bin20_0.25)
mean_bin20_0.25
## [1] 5
# the mean of this distribution is 5
### the bin (40, 0.125) distribution in gold and find its mean
bin40_0.125 <- dbinom (0:40, size = 40, prob = 0.125)
points (bin40_0.125, col = "gold", pch = 4)
lines (bin40_0.125, col = "gold", lwd = 2, lty = 8)
# the mean of this bin(40, 0.125) distribution
mean_bin40_0.125 <- sum (0:40 * bin40_0.125)
mean_bin40_0.125
## [1] 5
# the mean of this distribution is 5
abline(v = 5, lty = 2, col = "orchid")
# add a suitable legend and a title which describes what you are demonstrating in the plot
legend ("topright", legend = c("Po(5)", "B(10, 0.5)", "B(20, 0.25)", "B(40, 0.125)"),
col = c ("black", "red", "blue", "gold"),
lty = rep ( 1 , times = 3), cex = 0.7)
Exercise 7.6. ###############################################
dunif (0.3, min = -1, max = 1)
## [1] 0.5
# interpretation: a continuous random variable for which
# all values between -1 and 1 are equally likely
punif (1, min = -1, max = 1)
## [1] 1
prob_of_x_smaller_than_1 <- (1 - (-1)) / (1 - (-1))
print (prob_of_x_smaller_than_1)
## [1] 1
### There is no difference
Exercise 7.7. ###############################################
dexp (0.5, rate = 1)
## [1] 0.6065307
pexp (0.5, rate = 1)
## [1] 0.3934693
# as the default rate = 1, so the rate = 1 is not needed
dexp (0.5)
## [1] 0.6065307
pexp (0.5)
## [1] 0.3934693
# comments: it doesn't matter
lambda = 1
pdf_exponential <- function(x) ( lambda * exp (-lambda * x))
area_under_the_plot <- integrate (pdf_exponential,
lower = 0, upper = Inf)
print(area_under_the_plot)
## 1 with absolute error < 5.7e-05
# or:
pexp (q = Inf, rate = 1)
## [1] 1
Exercise 7.8. ###############################################
pnorm (4, mean= 2, sd = sqrt(4))
## [1] 0.8413447
# What are qX1 such that P(X < qX1) = 0.95 and qX2 such that P(X < qX2) = 0.975?
qx1 <- qnorm (0.95, mean = 2, sd = sqrt(4))
print (qx1)
## [1] 5.289707
qx2 <- qnorm (0.975, mean = 2, sd = sqrt (4))
print (qx2)
## [1] 5.919928
# Draw a manual plot of this normal distribution
#and mark roughly the two points that you have just calculated.
x_values <- seq (-5, 9, length.out = 1000)
plot (x_values, dnorm (x_values, mean = 2, sd = 2),
type = "l", col = "blue", lwd = 2,
ylab = "Density", xlab = "X")
abline (v = c (qx1, qx2), col = "red", lty = 2)
Now consider Y ∼ N(0, 1).
# Calculate qY 1 and qY 2 at the same percentage points as for X.
qy1 <- qnorm (0.95, mean = 0, sd = 1)
print (qy1)
## [1] 1.644854
qy2 <- qnorm (0.975, mean = 0, sd = 1)
print (qy2)
## [1] 1.959964
# Add the plot for Y and the two points to your original drawing.
y_values <- seq (-4, 4, length.out = 1000)
plot (y_values, dnorm (y_values, mean = 0, sd = 1),
type = "l", col = "orchid", lwd = 2,
ylab = "Density", xlab = "Y")
abline (v = c (qy1, qy2), col = "red", lty = 2)
# comments: (X - µ) / sigma = Y
Exercise 7.9. ###############################################
qanda <- read.csv("selective_affinities_aftermodified.csv")
par (mfrow = c (2,2))
Handspan_mean <- mean(qanda$Handspan)
Handspan_var <- var(qanda$Handspan)
# Histogram 1
hist(qanda$Handspan, probability = TRUE, ylim = c (0, 0.15),
main = "Handspan with default breakings",
xlab = "handspan(cm)")
Handspan_values <- seq(0, 30, length.out = 1000)
lines(Handspan_values, dnorm(Handspan_values, mean = Handspan_mean, sd = sqrt(Handspan_var)),
type = "l", col = "blue", lwd = 2, lty = 2)
# Histogram 2
hist( qanda$Handspan, probability = TRUE,
breaks = c (seq(from = 8, to = 26, by = 0.5)),
main = "Handspan with 0.5cm breakings",
xlab = "handspan(cm)")
lines(Handspan_values, dnorm(Handspan_values, mean = Handspan_mean, sd = sqrt(Handspan_var)),
type = "l", col = "blue", lwd = 2, lty = 2)
# plot 3 - correct to normal distribution
plot(Handspan_values, dnorm(Handspan_values, mean = Handspan_mean, sd = sqrt(Handspan_var)),
type = "l", col = "blue", lwd = 2,
xlim = c (5, 30), ylim = c (0, 0.15),
ylab = "Density", xlab = "Y",
main = "correction to normal distribution")
# plot 4 - use qq plot
qqnorm (y = qanda$Handspan, pch = 20)
qqline (qanda$Handspan, col = "red", lwd = 2)
# the data fits the normal distribution without splitting the class width
# into 0.5 (the top left graph), as it fits the normal curve better than the second
# while the qq plot at the bottom right reveals that it fits the normal distribution
Exercise 7.10. ###############################################
# Yes, the data statistics suggest that the data are normally distributed.
Exercise 7.11. ###############################################
qqnorm (y = qanda$Handspan, pch = 20)
qqline (qanda$Handspan, col = "red", lwd = 2)
# The plot shows that the data fits the normal distribution,
# because the points are close to a straight line.