STAT0004: Chapter 7 exercise

Exercise 7.1. ###############################################

### Expectation: (Note: in all of the following questions, p = 0.2, days = 365)
days <- 365
p <- 0.2
E <- p * days
print(E)

## [1] 73

### Probability that I wear an ironed shirt 16 days or fewer:
sum (dbinom (x = 1:16, size = days, prob = p))

## [1] 4.096763e-18

Exercise 7.2. ###############################################

### probability of the sum of the 17 right-hand-tail entries
sum_of_the_17_right_hand_tail <- 1 - pbinom (q = days - 17,
                                             size = days,
                                             prob = p)
print(sum_of_the_17_right_hand_tail)

## [1] 0

pbinom(q = days - 17, size = days, prob = p, lower.tail = FALSE)

## [1] 1.109471e-218

##########(what's the difference: the two expression above both give the same results,
########## the first expression with 1 - the probability is more preferred~~~)

they are not the same, because the expected value is 73, so the plot of this event is positively skewed:

par (mfrow = c (1,1))
plot (0:days, 
      dbinom ( x = 0:days, size = days, prob = p), 
      type = "l", lwd = 2, col = "blue",
      xlab = "no. days wear ironed shirts",
      ylab = "probability")
abline (v = c (73), col = "red", lty = 2)

In other words, it has longer tail, so the probability of the sum of 17 right-hand tail entries is much smaller than that of the left-hand tail counterpart.

###  P (number of days begins with number 2)
sum ( dbinom (x = c (20:29, 200:299), size = days, prob = p))

## [1] 1.625303e-10

### sum of the log ?
sum ( dbinom (x = c (20:29, 200:299), size = days, prob = p,
              log = TRUE))

## [1] -21077.44

### sum of all prob in X is 1:
pbinom (q = days, size = days, prob = p)

## [1] 1

Exercise 7.3. ###############################################

### skewness of the distribution:

the distribution of X is positively skewed, because the mean value is less than the half of the total size of the data

X <- dbinom (x = 0:days, size = days, prob = 0.2)
barplot ( X,
          names.arg = 0:days,
          main = "Binomial Distribution",
          xlab = "Number of ironed shirts worn", ylab = "Probability")
### median and mode, and mean (comparison)
median_X <- median(order(X))
print (median_X)

## [1] 183.5

mode_X <- (0:365)[which.max(X)]
print (mode_X)

## [1] 73

mean_X <- sum (0:days * X)
print (mean_X)

## [1] 73

### I drew the three lines vertically into the graph to make it more visualize
abline ( v = c (median_X, mode_X, mean_X), 
         col = c ("blue", "red", "gold"), lwd = 2, lty = c (2, 2, 3))

# Comments on comparison: mean and mode shares the same value.

Exercise 7.4. ###############################################

### calculate P (Y = 7)
dgeom (x = 7, prob = 0.2)

## [1] 0.04194304

### shape of the distribution and expectation of Y
Y <- dgeom (x = 0:days, prob = p)
par (mfrow = c (1,2))
barplot (Y,
         names.arg = 0:days,
         main = "Geometric Distribution",
         xlab = "Number of Successes (Y)",
         ylab = "Probability")
### take a look at the part before Y = 35
barplot (dgeom (x = 0:35, prob = p),
         main = "Geometric Distribution \n(closer look)",
         xlab = "Number of Successes (Y)",
         ylab = "Probability")

# This distribution of Y is positively skewed

mean_Y <- sum (0:days * Y)
print (mean_Y)

## [1] 4

Exercise 7.5. ###############################################

par (mfrow = c (1,1))
poi5 <- dpois(0:20, lambda = 5)
plot (poi5, pch = 20,
      xlab = "value of x", ylab = "probability", main = "Lines of mean = 5",
      ylim = c(0,0.25))
lines(poi5, lwd = 2)

## the mean of the distribution Poi(5)
mean_poi5 <- sum (0:20 * poi5)
print(mean_poi5)

## [1] 4.999998

################################ add the following lines (3):
### bin(10,0.5) distribution in red
bin10_0.5 <- dbinom (0:10, size = 10, prob = 0.5)
points(bin10_0.5, col = "red", pch = 17)
lines(bin10_0.5, col = "red", lwd = 2)

# the mean of this bin(10, 0.5) distribution
mean_bin10_0.5 <- sum (0:10 * bin10_0.5)
mean_bin10_0.5

## [1] 5

# the mean of this distribution is 5

### the bin(20, 0.25) distribution in blue and find its mean
bin20_0.25 <- dbinom (0:20, size = 20, prob = 0.25)
points (bin20_0.25, col = "blue", pch = 15)
lines (bin20_0.25, col = "blue", lwd = 2, lty = 3)

# the mean of this bin(20, 0.25) distribution
mean_bin20_0.25 <- sum (0:20 * bin20_0.25)
mean_bin20_0.25

## [1] 5

# the mean of this distribution is  5

### the bin (40, 0.125) distribution in gold and find its mean
bin40_0.125 <- dbinom (0:40, size = 40, prob = 0.125)
points (bin40_0.125, col = "gold", pch = 4)
lines (bin40_0.125, col = "gold", lwd = 2, lty = 8)

# the mean of this bin(40, 0.125) distribution
mean_bin40_0.125 <- sum (0:40 * bin40_0.125)
mean_bin40_0.125

## [1] 5

# the mean of this distribution is  5

abline(v = 5, lty = 2, col = "orchid")
# add a suitable legend and a title which describes what you are demonstrating in the plot
legend ("topright", legend = c("Po(5)", "B(10, 0.5)", "B(20, 0.25)", "B(40, 0.125)"),
        col = c ("black", "red", "blue", "gold"),
        lty = rep ( 1 , times = 3), cex = 0.7)

Exercise 7.6. ###############################################

dunif (0.3, min = -1, max = 1)

## [1] 0.5

# interpretation: a continuous random variable for which
# all values between -1 and 1 are equally likely
punif (1, min = -1, max = 1)

## [1] 1

prob_of_x_smaller_than_1 <- (1 - (-1)) / (1 - (-1))
print (prob_of_x_smaller_than_1)

## [1] 1

### There is no difference

Exercise 7.7. ###############################################

dexp (0.5, rate = 1)

## [1] 0.6065307

pexp (0.5, rate = 1)

## [1] 0.3934693

# as the default rate = 1, so the rate = 1 is not needed
dexp (0.5)

## [1] 0.6065307

pexp (0.5)

## [1] 0.3934693

# comments: it doesn't matter
lambda = 1
pdf_exponential <- function(x) ( lambda * exp (-lambda * x))
area_under_the_plot <- integrate (pdf_exponential,
                                  lower = 0, upper = Inf)
print(area_under_the_plot)

## 1 with absolute error < 5.7e-05

# or: 
pexp (q = Inf, rate = 1)

## [1] 1

Exercise 7.8. ###############################################

pnorm (4, mean= 2, sd = sqrt(4))

## [1] 0.8413447

# What are qX1 such that P(X < qX1) = 0.95 and qX2 such that P(X < qX2) = 0.975?

qx1 <- qnorm (0.95, mean = 2, sd = sqrt(4))
print (qx1)

## [1] 5.289707

qx2 <- qnorm (0.975, mean = 2, sd = sqrt (4))
print (qx2)

## [1] 5.919928

# Draw a manual plot of this normal distribution 
#and mark roughly the two points that you have just calculated.

x_values <- seq (-5, 9, length.out = 1000)
plot (x_values, dnorm (x_values, mean = 2, sd = 2),
      type = "l", col = "blue", lwd = 2,
      ylab = "Density", xlab = "X")
abline (v = c (qx1, qx2), col = "red", lty = 2)

Now consider Y ∼ N(0, 1).

# Calculate qY 1 and qY 2 at the same percentage points as for X. 

qy1 <- qnorm (0.95, mean = 0, sd = 1)
print (qy1)

## [1] 1.644854

qy2 <- qnorm (0.975, mean = 0, sd = 1)
print (qy2)

## [1] 1.959964

# Add the plot for Y and the two points to your original drawing. 

y_values <- seq (-4, 4, length.out = 1000)
plot (y_values, dnorm (y_values, mean = 0, sd = 1),
      type = "l", col = "orchid", lwd = 2,
      ylab = "Density", xlab = "Y")
abline (v = c (qy1, qy2), col = "red", lty = 2)

# comments: (X - µ) / sigma = Y

Exercise 7.9. ###############################################

qanda <- read.csv("selective_affinities_aftermodified.csv")
par (mfrow = c (2,2))

Handspan_mean <- mean(qanda$Handspan)
Handspan_var <- var(qanda$Handspan)

# Histogram 1 
hist(qanda$Handspan, probability = TRUE, ylim = c (0, 0.15),
     main = "Handspan with default breakings",
     xlab = "handspan(cm)")
Handspan_values <- seq(0, 30, length.out = 1000)
lines(Handspan_values, dnorm(Handspan_values, mean = Handspan_mean, sd = sqrt(Handspan_var)), 
      type = "l", col = "blue", lwd = 2, lty = 2)

# Histogram 2 
hist( qanda$Handspan, probability = TRUE,
      breaks = c (seq(from = 8, to = 26, by = 0.5)),
      main = "Handspan with 0.5cm breakings",
      xlab = "handspan(cm)")
lines(Handspan_values, dnorm(Handspan_values, mean = Handspan_mean, sd = sqrt(Handspan_var)), 
      type = "l", col = "blue", lwd = 2, lty = 2)

# plot 3 - correct to normal distribution
plot(Handspan_values, dnorm(Handspan_values, mean = Handspan_mean, sd = sqrt(Handspan_var)), 
     type = "l", col = "blue", lwd = 2, 
     xlim = c (5, 30), ylim = c (0, 0.15),
     ylab = "Density", xlab = "Y",
     main = "correction to normal distribution")

# plot 4 - use qq plot
qqnorm (y = qanda$Handspan, pch = 20)
qqline (qanda$Handspan, col = "red", lwd = 2)

# the data fits the normal distribution without splitting the class width
# into 0.5 (the top left graph), as it fits the normal curve better than the second
# while the qq plot at the bottom right reveals that it fits the normal distribution

Exercise 7.10. ###############################################

# Yes, the data statistics suggest that the data are normally distributed.

Exercise 7.11. ###############################################

qqnorm (y = qanda$Handspan, pch = 20)
qqline (qanda$Handspan, col = "red", lwd = 2)

# The plot shows that the data fits the normal distribution, 
# because the points are close to a straight line.

STAT0004: Chapter 7 exercise

Aardvark

21st Nov