Exercise 8

Exercise 8.1.

# Grid of x positions covering three standard deviations either side of 0.
x_values <- seq(from = -3, to = 3, length.out = 100000)

# Draw the N(0, 1) density over that grid as a continuous line.
plot(x_values, dnorm(x_values, mean = 0, sd = 1), type = "l")

# Mark the boundaries x = -3 and x = 3 with thick dashed red vertical lines.
abline(v = c(-3, 3), lty = 2, lwd = 2, col = "red")

# Total probability mass under the N(0, 1) density: the CDF evaluated at +Inf.
total_area <- pnorm(Inf, mean = 0, sd = 1)

# P(X < 3) for X ~ N(0, 1).
area_below_3 <- pnorm(3, mean = 0, sd = 1)

# By symmetry P(-3 < X < 3) = 2 * P(X < 3) - 1; express it as a percentage
# of the total area.
area_between <- ((2 * area_below_3 - 1) / total_area) * 100

print(paste(area_between, "%"))

## [1] "99.730020393674 %"

\[Area= \frac{(2 \times P(X<3) -1)}{P(total)} = 99.730020393674 \% \]

Exercise 8.2.

\[ P (exactly~one~of~each~value) = (\frac{1}{4})^4 \times 4! = \frac{3}{32}\]

\[ P (3~different~values~with~one~repeated) = \frac{4!}{2!} \times 1 \times (\frac{1}{4})^3 \times \binom{3}{1} = \frac{9}{16}\]

Exercise 8.3.

# Fix the RNG seed so the simulated sample is reproducible.
set.seed(1)

# Possible outcomes and the probability of drawing each of them.
values <- c("Buzz", "Michael", "Neil")
probabilities <- c(0.25, 0.01, 0.74)

# Draw 200 outcomes with replacement, weighted by `probabilities`.
sample_data_size_200 <- sample(values, size = 200,
                               replace = TRUE, prob = probabilities)

# Tabulate against the full set of outcomes rather than only the values that
# were actually drawn. With p = 0.01 "Michael" can easily be absent from a
# sample of 200; a plain table() would then have just two cells and no longer
# line up with the three theoretical probabilities. Fixing the factor levels
# reports such an outcome with a count of 0 instead. `dnn` keeps the printed
# header identical to the one table() derives from the variable name.
table_of_200_data <- table(factor(sample_data_size_200, levels = values),
                           dnn = "sample_data_size_200")
table_of_200_data

## sample_data_size_200
##    Buzz Michael    Neil 
##      48       3     149

For comparison, the theoretical probabilities are shown next to the sample proportions and counts:

# Side-by-side comparison of the theoretical probabilities with the
# proportions and raw counts observed in the sample of 200.
# c() flattens each one-dimensional table into a plain named vector.
data.frame(
  values = c("Buzz", "Michael", "Neil"),
  probabilities = c(0.25, 0.01, 0.74),
  sample_probabilities = c(table_of_200_data / 200),
  sample_frequencies = c(table_of_200_data)
)

##          values probabilities sample_probabilities sample_frequencies
## Buzz       Buzz          0.25                0.240                 48
## Michael Michael          0.01                0.015                  3
## Neil       Neil          0.74                0.745                149

Exercise 8.4.

# Seed the random number generator so the random draws are reproducible.
set.seed(12345)

Generate 4 sets of 100 points from the N(0, 1) distribution

# Four independent samples of 100 draws each from the standard normal
# distribution N(0, 1).
random_normal_1 <- rnorm(100, mean = 0, sd = 1)
random_normal_2 <- rnorm(100, mean = 0, sd = 1)
random_normal_3 <- rnorm(100, mean = 0, sd = 1)
random_normal_4 <- rnorm(100, mean = 0, sd = 1)

Compare each of the sets using a histogram with the fitted normal density overlaid

# Draw one panel: a density-scaled histogram of `sample_values` on a fixed
# x-range of [-3, 3], overlaid with the normal density that uses the sample's
# own mean and standard deviation (dashed blue line).
plot_normal_hist <- function(sample_values, title, fill) {
  hist(sample_values, freq = FALSE, xlim = c(-3, 3),
       main = title, col = fill, xlab = "")
  curve(dnorm(x, mean = mean(sample_values), sd = sd(sample_values)),
        add = TRUE, col = "blue", lty = 2, lwd = 2)
  invisible(NULL)
}

# Arrange the four panels in a 2 x 2 grid.
par(mfrow = c(2, 2))

plot_normal_hist(random_normal_1, "1st set of data \n normal 1",
                 "darkseagreen1")
plot_normal_hist(random_normal_2, "2nd set of data \n normal 2",
                 "lightsalmon")
plot_normal_hist(random_normal_3, "3rd set of data \n normal 3",
                 "moccasin")
plot_normal_hist(random_normal_4, "4th set of data \n normal 4",
                 "paleturquoise")

Generate 4 sets of 100 points from the exp(2) distribution

# Four independent samples of 100 draws each from the exponential
# distribution with rate 2.
random_exp_1 <- rexp(100, rate = 2)
random_exp_2 <- rexp(100, rate = 2)
random_exp_3 <- rexp(100, rate = 2)
random_exp_4 <- rexp(100, rate = 2)

Compare each of the sets, using a histogram, to the shape of the corresponding Exponential distribution

# Draw one panel: a density-scaled histogram of `sample_values` overlaid with
# the theoretical Exp(rate = 2) density (dashed red line).
# Every panel uses the same x-range of [0, 5] so the four samples can be
# compared on a common scale; previously the first panel was left on its
# automatic range while the other three were fixed.
plot_exp_hist <- function(sample_values, title, fill) {
  hist(sample_values, freq = FALSE, xlim = c(0, 5),
       main = title, col = fill, xlab = "")
  curve(dexp(x, rate = 2), add = TRUE,
        col = "red", lty = 2, lwd = 2)
  invisible(NULL)
}

# Arrange the four panels in a 2 x 2 grid.
par(mfrow = c(2, 2))

plot_exp_hist(random_exp_1, "1st set of data \n exp 1", "royalblue")
plot_exp_hist(random_exp_2, "2nd set of data \n exp 2", "seagreen1")
plot_exp_hist(random_exp_3, "3rd set of data \n exp 3", "plum")
plot_exp_hist(random_exp_4, "4th set of data \n exp 4", "wheat")

Q-Q plots comparing the exponential data to a normal distribution

# Draw one panel: a normal Q-Q plot of `sample_values` (blue dots) together
# with the reference line added by qqline() (red). qqline() compares against
# normal quantiles by default, so no distribution argument is needed.
plot_exp_vs_normal_qq <- function(sample_values, title) {
  qqnorm(sample_values, main = title, col = "blue", pch = 20)
  qqline(sample_values, col = "red")
  invisible(NULL)
}

# Arrange the four panels in a 2 x 2 grid.
par(mfrow = c(2, 2))

plot_exp_vs_normal_qq(random_exp_1,
                      "qq plot for both 1st sets of data \n exp to norm")
plot_exp_vs_normal_qq(random_exp_2,
                      "qq plot for both 2nd sets of data \n exp to norm")
plot_exp_vs_normal_qq(random_exp_3,
                      "qq plot for both 3rd sets of data \n exp to norm")
plot_exp_vs_normal_qq(random_exp_4,
                      "qq plot for both 4th sets of data \n exp to norm")

Does there appear to be a good fit?

What values for the parameters of the Normal distribution will be used by R in the comparison?

Why do the x axes have negative numbers but the y axes not?

From the diagrams, most of the samples fit the line poorly, because an exponential distribution differs considerably from a normal distribution. The parameters of the normal distribution used by R will be mean = 0, sd = 1. The x-axis represents the theoretical quantiles of a standard normal distribution, which include both negative and positive numbers. The y-axis represents the observed quantiles of the exponential data, which can never be negative.

Q-Q plots of the exponential data against the theoretical exponential quantiles

# Q-Q plots of each exponential sample against the theoretical quantiles of
# the Exp(rate = 2) distribution, evaluated at 100 probability points.
# The qqplot() arguments are deliberately kept inline: its default axis
# labels are built from the argument expressions themselves.
# Arrange the four panels in a 2 x 2 grid.
par(mfrow = c(2, 2))

# Sample 1, with the dashed red identity line y = x for reference.
qqplot(qexp(ppoints(100), rate = 2), random_exp_1,
       main = "qq plot for both 1st set of data \n exp to exp",
       pch = 20, col = "blue")
abline(a = 0, b = 1, lty = 2, col = "red")

# Sample 2.
qqplot(qexp(ppoints(100), rate = 2), random_exp_2,
       main = "qq plot for both 2nd set of data \n exp to exp",
       pch = 20, col = "blue")
abline(a = 0, b = 1, lty = 2, col = "red")

# Sample 3.
qqplot(qexp(ppoints(100), rate = 2), random_exp_3,
       main = "qq plot for both 3rd set of data \n exp to exp",
       pch = 20, col = "blue")
abline(a = 0, b = 1, lty = 2, col = "red")

# Sample 4.
qqplot(qexp(ppoints(100), rate = 2), random_exp_4,
       main = "qq plot for both 4th set of data \n exp to exp",
       pch = 20, col = "blue")
abline(a = 0, b = 1, lty = 2, col = "red")

They appear to be a good fit.