Exercise 1

(a)

# Simulate sample means of Uniform(0, 1) draws (Central Limit Theorem demo).
#
# Args:
#   n: number of sample means to generate.
#   m: number of Uniform(0, 1) observations averaged into each mean.
#
# Returns:
#   A numeric vector of length n (numeric(0) when n is 0).
CTL_unif <- function(n, m) {
  # vapply() pins the result type: replicate() simplifies like sapply() and
  # hands back an empty list() when n = 0. The draws are made in the same
  # order, so results under a given seed are unchanged.
  vapply(
    seq_len(n),
    function(i) mean(runif(m, 0, 1)),
    numeric(1)
  )
}

# Example: draw 5 sample means, each the average of 10 Uniform(0, 1) values.
# This call is not preceded by set.seed(), so the numbers printed below are
# from one particular run and will differ when the call is re-executed.
CTL_unif(n = 5, m = 10)
## [1] 0.4966503 0.3859125 0.4009755 0.4927957 0.4963499

(b)

set.seed(123)  # Fix the RNG state so the simulated means are reproducible

# Simulate 500 sample means for each per-sample size (m = 30 and m = 200)
data_30 <- CTL_unif(n = 500, m = 30)
data_200 <- CTL_unif(n = 500, m = 200)

# Reference normal densities: Uniform(0, 1) has mean 1/2 and variance 1/12,
# so the mean of m draws has standard deviation sqrt(1 / (12 * m)).
# Both curves are evaluated on a common grid over [0.3, 0.7].
x_vals <- seq(0.3, 0.7, length.out = 100)
y_30 <- dnorm(x_vals, mean = 1 / 2, sd = sqrt(1 / (12 * 30)))
y_200 <- dnorm(x_vals, mean = 1 / 2, sd = sqrt(1 / (12 * 200)))

# Plot the histograms side by side. par() returns the previous settings,
# which are restored afterwards so later plots are not forced into this grid.
old_par <- par(mfrow = c(1, 2))

# ylim covers both the bars and the curve: hist() on its own sizes the axis
# from the bars only, which can cut off the peak of the overlaid density.
hist(
  data_30,
  probability = TRUE,
  main = "Histogram of Means (m=30)",
  xlab = "Sample Means",
  xlim = c(0.3, 0.7),
  ylim = c(0, max(hist(data_30, plot = FALSE)$density, y_30))
)
lines(x_vals, y_30, col = "blue", lwd = 2)

hist(
  data_200,
  probability = TRUE,
  main = "Histogram of Means (m=200)",
  xlab = "Sample Means",
  xlim = c(0.3, 0.7),
  ylim = c(0, max(hist(data_200, plot = FALSE)$density, y_200))
)
lines(x_vals, y_200, col = "red", lwd = 2)

par(old_par)

(c)

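As m increases from 30 to 200, the distribution of sample means becomes more concentrated around 0.5: the standard deviation of the sample mean, sqrt(1/(12m)), falls from about 0.053 to about 0.020. The histogram for m = 200 also aligns more closely with the normal distribution predicted by the Central Limit Theorem.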


Exercise 2

(a)

data("airquality")

# Share of Ozone readings that are missing: NA count over number of readings
with(airquality, sum(is.na(Ozone)) / length(Ozone))
## [1] 0.2418301
# Average Ozone over the non-missing readings only
with(airquality, mean(Ozone, na.rm = TRUE))
## [1] 42.12931

(b)

# Keep only the rows that have no missing value in any column
airquality_clean <- na.omit(airquality)

# Drop the Month and Day columns; drop = FALSE keeps the result a data frame
# regardless of how many columns remain
kept_cols <- setdiff(names(airquality_clean), c("Month", "Day"))
airquality_clean <- airquality_clean[, kept_cols, drop = FALSE]

# Summary statistics
summary(airquality_clean)
##      Ozone          Solar.R           Wind            Temp      
##  Min.   :  1.0   Min.   :  7.0   Min.   : 2.30   Min.   :57.00  
##  1st Qu.: 18.0   1st Qu.:113.5   1st Qu.: 7.40   1st Qu.:71.00  
##  Median : 31.0   Median :207.0   Median : 9.70   Median :79.00  
##  Mean   : 42.1   Mean   :184.8   Mean   : 9.94   Mean   :77.79  
##  3rd Qu.: 62.0   3rd Qu.:255.5   3rd Qu.:11.50   3rd Qu.:84.50  
##  Max.   :168.0   Max.   :334.0   Max.   :20.70   Max.   :97.00
# One boxplot per remaining variable, arranged in a 2 x 2 grid. par() returns
# the previous settings, which are restored once the plots are drawn so the
# grid layout does not leak into later figures.
old_par <- par(mfrow = c(2, 2))
for (var_name in colnames(airquality_clean)) {
  boxplot(airquality_clean[[var_name]], main = var_name, xlab = var_name)
}
par(old_par)

(c)

# Correlation between Ozone and Temp (cor() defaults to Pearson); the value
# is stored in a variable, not printed by this block
cor_ozone_temp <- with(airquality_clean, cor(Ozone, Temp))

# Same correlation after a square-root transform of Ozone
cor_sqrt_ozone_temp <- with(airquality_clean, cor(sqrt(Ozone), Temp))

# Scatter plots of the raw and transformed Ozone against Temp, side by side.
# par() returns the previous settings, which are restored afterwards so the
# 1 x 2 layout does not leak into later figures.
old_par <- par(mfrow = c(1, 2))
with(
  airquality_clean,
  plot(
    Ozone, Temp,
    main = "Ozone vs Temp",
    xlab = "Ozone",
    ylab = "Temp",
    col = "blue"
  )
)
with(
  airquality_clean,
  plot(
    sqrt(Ozone), Temp,
    main = "sqrt(Ozone) vs Temp",
    xlab = "sqrt(Ozone)",
    ylab = "Temp",
    col = "red"
  )
)
par(old_par)