library(ggplot2)
library(dplyr)
library(kableExtra)
library(knitr)
#setwd("/Users/kevinpiger/Desktop/碩一下/統模/Homework02/")
options(scipen = 100)

Question 01

Uniform (0,1) transformation For uniform (0,1) random variables \(U_1,U_2, ... ,\) define N = min{n :\(\sum_{i=1}^{n}{U_i>1}\) }. That is, N is the number of random numbers that must be summed to exceed 1.

(a)

Estimate E(N) with standard errors by generating 1,000, 2,000, 5,000, 10,000, and 100,000 values of N, and check if there are any patterns in the estimate and its s.e.

# T 欲模擬次數 #
Q1 <- function(T){
N = NULL
S = NULL
for( i in 1:T){
  n = 0; i = 0
  while(n < 1){
    n = sum(n ,runif(1))
    i = i + 1}
    N <- c(N,i)
    }
  return(N)
}

Calculation and output

output <- NULL
output2 <- NULL
sim <- c(1000, 2000, 5000, 10000, 100000)
for(i in 1:5){
  output[i] <- mean(Q1(sim[i]));
  output2[i] <- var(Q1(sim[i]))
}
output <- t(output)
output2 <- t(output2)
output <- rbind(output,output2)
rownames(output) <- c("Output","var")
colnames(output) <- sim
kable(output,"html") %>% 
  kable_styling(bootstrap_options = 'striped', full_width = F) %>% 
  add_header_above(c(" " = 1, "# of Simulation" = 5))
# of Simulation
1000 2000 5000 10000 100000
Output 2.7080000 2.7345000 2.727600 2.7259000 2.7217900
var 0.7378378 0.7490855 0.752803 0.7639223 0.7639274

Question 02

You think you know baseball? Do home run hitters have the highest batting averages? Think about this hypothesis, then analyze the following experience based on a half season with the Braves:

# Question 02 #
x <- c(0.252,0.305,0.299,0.303,0.285,0.191,0.283,0.272,0.310,0.266,0.215,0.211,0.244,0.320)
y <- c(12,6,4,15,2,2,16,6,8,10,0,3,6,7)
Ques <- rbind(x[1:7], y[1:7], x[8:14], y[8:14]) 
rownames(Ques) <- c("Batting Average","Home Runs","Batting Average","Home Runs")
colnames(Ques) <- c(1:7)
kable(Ques, "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = F) %>% 
  add_header_above(c("Label" = 1, 'Baseball Data' = 7))
Label
Baseball Data
1 2 3 4 5 6 7
Batting Average 0.252 0.305 0.299 0.303 0.285 0.191 0.283
Home Runs 12.000 6.000 4.000 15.000 2.000 2.000 16.000
Batting Average 0.272 0.310 0.266 0.215 0.211 0.244 0.320
Home Runs 6.000 8.000 10.000 0.000 3.000 6.000 7.000
\(H_0:\) 打擊率與全壘打無關
\(H_1:\) 打擊率與全壘打有關

num = 100,000 為模擬次數, \(\alpha\) = 0.05

alpha = 0.05
num = 100000
N = NULL
for(i in 1:num){
x1 <- sample(x,14,F) 
y1 <- sample(y,14,F) 
N <- c(N, sum(x1*y1))}
t0 <- sum(x*y)
N1 <- data.frame("BatXHit" = N) %>% mutate(over = ifelse(BatXHit >=t0,  ">= critical value","< critical value"))
N1$over <- N1$over %>% as.factor()
ggplot(N1,aes(x = BatXHit,fill = over))+
  geom_histogram(bins = 100,alpha = 0.7)+
  scale_fill_manual(values = c("skyblue","blue"),
                    name = paste0("Label\n","k = ",t0),
                    breaks = c(">= critical value","< critical value k"),
                    labels = c(">= critical value","< critical value k"))+
  theme(plot.title = element_text(colour = "black", face = "bold", size = 25))+
  geom_vline(aes(xintercept = t0),linetype="dashed",color = "red")+
  labs(title = "Permutation ",x = "Simulation of BatXHit", y = "Count")+
  scale_x_continuous(breaks = c(24:27,t0,28),
                     labels = c(24:27,"k",28))

pv <- sum(N > t0 )/num
print(ifelse(pv > alpha, paste('p value',pv,'>',alpha,'Do not reject H0'),paste('p value',pv,'<=',alpha,'Reject H0')))
## [1] "p value 0.04776 <= 0.05 Reject H0"

Question 03

Similar to what Efron did in the Law school data example, compute the bootstrap simulation for 50, 100, …, 10,000 replications. But, instead of using the original 15 observations, we want to know if the number of observations plays an important role. Randomly select 10, 15, 20, and 25 observations and then see if the bootstrap variance converges as the number of replications increases. (Note: You also need to compare your results with that of population.)

library(bootstrap)
library(dplyr)
library(kableExtra)
library(knitr)
library(ggplot2)

Read data

data = law82
data %>% kable(.,"html") %>% kable_styling() %>% scroll_box(height = "200px")
School LSAT GPA
1 622 3.23
2 542 2.83
3 579 3.24
4 653 3.12
5 606 3.09
6 576 3.39
7 620 3.10
8 615 3.40
9 553 2.97
10 607 2.91
11 558 3.11
12 596 3.24
13 635 3.30
14 581 3.22
15 661 3.43
16 547 2.91
17 599 3.23
18 646 3.47
19 622 3.15
20 611 3.33
21 546 2.99
22 614 3.19
23 628 3.03
24 575 3.01
25 662 3.39
26 627 3.41
27 608 3.04
28 632 3.29
29 587 3.16
30 581 3.17
31 605 3.13
32 704 3.36
33 477 2.57
34 591 3.02
35 578 3.03
36 572 2.88
37 615 3.37
38 606 3.20
39 603 3.23
40 535 2.98
41 595 3.11
42 575 2.92
43 573 2.85
44 644 3.38
45 545 2.76
46 645 3.27
47 651 3.36
48 562 3.19
49 609 3.17
50 555 3.00
51 586 3.11
52 580 3.07
53 594 2.96
54 594 3.05
55 560 2.93
56 641 3.28
57 512 3.01
58 631 3.21
59 597 3.32
60 621 3.24
61 617 3.03
62 637 3.33
62 572 3.08
64 610 3.13
65 562 3.01
66 635 3.30
67 614 3.15
68 546 2.82
69 598 3.20
70 666 3.44
71 570 3.01
72 570 2.92
73 605 3.45
74 565 3.15
75 686 3.50
76 608 3.16
77 595 3.19
78 590 3.15
79 558 2.81
80 611 3.16
81 564 3.02
82 575 2.74
cor(data$LSAT,data$GPA)
## [1] 0.7599979

bootstrap correlatoin

# design function
Q3 <- function(num,Sim_num,rep_num){
  Sim_cor = NULL
  for(i in 1:rep_num){
    temp1 <- sample(num,Sim_num,T)
    temp <- data[temp1,2:3] %>% cor()
    Sim_cor <- c(Sim_cor,temp[1,2])}
  return(c(mean(Sim_cor),var(Sim_cor)))}

observation = 10

set.seed(106354012)
Sim_num = 10
num <- sample(1:82,Sim_num ,F)

A = NULL;B = NULL
for( i in c(seq(50,10000,by = 50))){
  A <- c(A,Q3(num,Sim_num,i)[2])
  B <- c(B,Q3(num,Sim_num,i)[1])}
# mean variance of correlation #
mean(A)
## [1] 0.04274998
# mean correlation #
mean(B)
## [1] 0.7444454
temp <- data.frame(num = c(1:200),boots_var = A)
ggplot(temp) + 
  geom_point(aes(x = temp$num, y = temp$boots_var)) +
  geom_smooth(aes(x = temp$num, y = temp$boots_var)) +
  labs(title="Bootstrap of variance with 10 Observarion ",
       x="Index",
       y="Variance of correlation") +
  theme_bw() 
## `geom_smooth()` using method = 'loess'

temp <- data.frame(num = c(1:200),boots_mean = B)
ggplot(temp) + 
  geom_point(aes(x = temp$num, y = temp$boots_mean)) +
  geom_smooth(aes(x = temp$num, y = temp$boots_mean)) +
  labs(title="Bootstrap of mean with 10 Observarion  ",
       x="Index",
       y="Mean of correlation") +
  theme_bw() 
## `geom_smooth()` using method = 'loess'

observation = 15

set.seed(106354012)
Sim_num = 15
num <- sample(1:82,Sim_num ,F)

A = NULL;B = NULL
for( i in c(seq(50,10000,by = 50))){
  A <- c(A,Q3(num,Sim_num,i)[2])
  B <- c(B,Q3(num,Sim_num,i)[1])}
# mean variance of correlation #
mean(A)
## [1] 0.01197177
# mean correlation #
mean(B)
## [1] 0.8001203
temp <- data.frame(num = c(1:200),boots_var = A)
ggplot(temp) + 
  geom_point(aes(x = temp$num, y = temp$boots_var)) +
  geom_smooth(aes(x = temp$num, y = temp$boots_var)) +
  labs(title="Bootstrap of variance with 15 Observarion ",
       x="Index",
       y="Variance of correlation") +
  theme_bw() 
## `geom_smooth()` using method = 'loess'

temp <- data.frame(num = c(1:200),boots_mean = B)
ggplot(temp) + 
  geom_point(aes(x = temp$num, y = temp$boots_mean)) +
  geom_smooth(aes(x = temp$num, y = temp$boots_mean)) +
  labs(title="Bootstrap of mean with 15 Observarion ",
       x="Index",
       y="Mean of correlation") +
  theme_bw() 
## `geom_smooth()` using method = 'loess'

observation = 20

set.seed(106354012)
Sim_num = 20
num <- sample(1:82,Sim_num ,F)

A = NULL;B = NULL
for( i in c(seq(50,10000,by = 50))){
  A <- c(A,Q3(num,Sim_num,i)[2])
  B <- c(B,Q3(num,Sim_num,i)[1])}
# mean variance of correlation #
mean(A)
## [1] 0.008962489
# mean correlation #
mean(B)
## [1] 0.7763331
temp <- data.frame(num = c(1:200),boots_var = A)

ggplot(temp) + 
  geom_point(aes(x = temp$num, y = temp$boots_var)) +
  geom_smooth(aes(x = temp$num, y = temp$boots_var)) +
  labs(title="Bootstrap of variance with 20 Observarion ",
       x="Index",
       y="Variance of correlation") +
  theme_bw() 
## `geom_smooth()` using method = 'loess'

temp <- data.frame(num = c(1:200),boots_mean = B)
ggplot(temp) + 
  geom_point(aes(x = temp$num, y = temp$boots_mean)) +
  geom_smooth(aes(x = temp$num, y = temp$boots_mean)) +
  labs(title="Bootstrap of mean with 20 Observarion ",
       x="Index",
       y="Mean of correlation") +
  theme_bw() 
## `geom_smooth()` using method = 'loess'

observation = 25

set.seed(106354012)
Sim_num = 25
num <- sample(1:82,Sim_num ,F)

A = NULL;B = NULL
for( i in c(seq(50,10000,by = 50))){
  A <- c(A,Q3(num,Sim_num,i)[2])
  B <- c(B,Q3(num,Sim_num,i)[1])}
# mean variance of correlation #
mean(A)
## [1] 0.008449368
# mean correlation #
mean(B)
## [1] 0.7656147
temp <- data.frame(num = c(1:200),boots_var = A)

ggplot(temp) + 
  geom_point(aes(x = temp$num, y = temp$boots_var)) +
  geom_smooth(aes(x = temp$num, y = temp$boots_var)) +
  labs(title="Bootstrap of variance with 25 Observarion ",
       x="Index",
       y="Variance of correlation") +
  theme_bw() 
## `geom_smooth()` using method = 'loess'

temp <- data.frame(num = c(1:200),boots_mean = B)
ggplot(temp) + 
  geom_point(aes(x = temp$num, y = temp$boots_mean)) +
  geom_smooth(aes(x = temp$num, y = temp$boots_mean)) +
  labs(title="Bootstrap of mean with 25 Observarion ",
       x="Index",
       y="Mean of correlation") +
  theme_bw() 
## `geom_smooth()` using method = 'loess'

Question 04

The block bootstrap can be used in prediction for dependent data. Use the built-in data “sunspot.year” in R, which is can be modeled as an AR(2) model, compare the difference of prediction via block bootstrap and AR(2) model. As a check, you can leave the final 10 observations as “testing” data.

library(dplyr)
library(kableExtra)
library(knitr)
library(ggplot2)
data <- sunspot.year
# 顯示不好,需要再討論 #
head(data)
## [1]  5 11 16 23 36 58

Read and Show data

使用前279筆資料去做訓練data, 後10筆資料為驗證data

train_data <- data[1:279]
test_data <- data[280:289]

#實際資料
test_data
##  [1] 155.4 154.7 140.5 115.9  66.6  45.9  17.9  13.4  29.2 100.2

Block bootstrap

temp <- matrix(0,1000,ncol = 10)
dr <- train_data[-1] - train_data[-279]
set.seed(106354012)
for(i in 1:1000){
st <- sample(1:269,1)
temp[i,] <- dr[c(st:(st+9))]}
# 取每一行值的中位數 #
md <- apply(temp,2,median)
md
##  [1] -2.6 -3.0 -3.0 -4.0 -4.8 -5.0 -4.2 -4.0 -2.6 -2.6
predict_value = NULL
for(i in 1:10){
predict_value <- c(predict_value,sum(md[c(1:i)]) + train_data[279])}

# 預測結果 #
predict_value
##  [1] 89.9 86.9 83.9 79.9 75.1 70.1 65.9 61.9 59.3 56.7
# 繪圖比較 #
temp = data.frame(num = 280:289,test=test_data, sim =predict_value)
ggplot(temp)+
  geom_line(aes(x = num, y = test))+
  geom_line(aes(x = num, y = sim), linetype = "dotdash")+
  labs(title="Prediction by Block Bootstrap",x="Year", y = "Predict_value")+
  scale_x_continuous(breaks = c(280:289),labels = c(1979:1988))

# test
abs(test_data-predict_value) %>% sum()
## [1] 428.7

AR(2)

# 求取AR(2)的參數 method = mle#
sunspot.ar <- ar(train_data,order.max = 2,method = "mle")

sunspot.ar
## 
## Call:
## ar(x = train_data, order.max = 2, method = "mle")
## 
## Coefficients:
##       1        2  
##  1.3781  -0.6819  
## 
## Order selected 2  sigma^2 estimated as  264.2
# 運用預測function去預測, n.head為預測10期 #
temp1 <- predict(sunspot.ar, n.ahead = 10)

# 預測結果 #
temp1$pred
## Time Series:
## Start = 280 
## End = 289 
## Frequency = 1 
##  [1] 123.25346 121.31135  97.66354  66.39796  39.43555  23.59818  20.15822
##  [8]  26.21727  36.91324  47.52198
# 繪圖比較 #
temp = data.frame(num = 280:289,test=test_data, sim = temp1$pred)
ggplot(temp)+
  geom_line(aes(x = num, y = test))+
  geom_line(aes(x = num, y = sim), linetype = 2 )+
  labs(title="Prediction by AR(2)",x="Year", y = "Predict_value")+
  scale_x_continuous(breaks = c(280:289),labels = c(1979:1988))

# test#
sum(abs(temp1$pred - test_data))
## [1] 282.8067

Question 05

If d is the minimum distance between any pair of n uniformly distributed points from a unit square, then \(n(n-1)d^2\) ~ \(Exp(\frac{2}{\pi})\) provided that n is sufficiently large. Using R to check this result: First, write a function to produce n points uniformly distributed from the unit square. Then, write a function to calculate the smallest distance between any pair of n points. Change the value of n, perform simulation, and comment on what you find.

library(foreach)
library(doSNOW)
library(dplyr)
library(ggplot2)

Function design

sim <- function(num){
  x <- runif(num) # 模擬num筆x座標
  y <- runif(num) # 模擬num筆y座標
  d = NULL
  while(length(x) > 1){
    a = (x[-1] - x[1])^2 
    b = (y[-1] - y[1])^2
    d = min(c(d,a+b)) 
    x <- x[-1]
    y <- y[-1]}
  return(d * num * (num - 1))}

使用分散式運算 開通道

cl<-makeCluster(3) #要開啟3個核心
registerDoSNOW(cl) #註冊後才真正開啟

N = 20

#在單位正方形亂數生成座標
A <- cbind(runif(20,0,1),runif(20,0,1))
A <- as.data.frame(A)
ggplot(A,aes(x=A[,1],y=A[,2]))+
  geom_point()+
  labs(x="",y="",title="單位矩形內亂數生成座標")

#計算
set.seed(123)
o= Sys.time()
pv <- NULL
N = 20
n=100
d=1000
for(j in 1:n){
sim2 <- foreach(i=1:d,.combine = cbind,.export ="sim")%dopar%{sim(N)}
# ks.test 
f <- ks.test(sim2, "pexp",pi/2)
pv[j] <- f$p.value}
mean(pv)
## [1] 0.4419295
sum((pv>0.05)*1)/100
## [1] 0.89
pv <- as.data.frame(pv)
ggplot(pv,aes(y=pv))+
  geom_point(aes(x = 1:100))+
  geom_hline(yintercept=0.05,color="red")+
  labs(x="",y="P_value",title="Simulation of P with 100 times ")

exp <- rexp(n*d,pi/2)
hist(sim2)

sim2 <- as.numeric(sim2)%>% as.data.frame()
ggplot(sim2,aes(sim2)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.2)
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

hist(exp, main = "Data from Exponential(pi/2)")

exp <- as.data.frame(exp)
ggplot(exp,aes(exp))+
geom_histogram(aes(y=..density..),fill="skyblue",binwidth = 0.2)

Sys.time()-o
## Time difference of 35.58508 secs

N = 100

#在單位正方形亂數生成座標
A <- cbind(runif(100,0,1),runif(100,0,1))
A <- as.data.frame(A)
ggplot(A,aes(x=A[,1],y=A[,2]))+
  geom_point()+
  labs(x="",y="",title="單位矩形內亂數生成座標")

#計算
set.seed(123)
o= Sys.time()
pv <- NULL
N = 100
n=100
d=1000
for(j in 1:n){
sim2 <- foreach(i=1:d,.combine = cbind,.export ="sim")%dopar%{sim(N)}
# ks.test 
f <- ks.test(sim2, "pexp",pi/2)
pv[j] <- f$p.value}
mean(pv)
## [1] 0.4889876
sum((pv>0.05)*1)/100
## [1] 0.96
pv <- as.data.frame(pv)
ggplot(pv,aes(y=pv))+
  geom_point(aes(x = 1:100))+
  geom_hline(yintercept=0.05,color="red")+
  labs(x="",y="P_value",title="Simulation of P with 100 times ")

exp <- rexp(n*d,pi/2)
hist(sim2)

sim2 <- as.numeric(sim2)%>% as.data.frame()
ggplot(sim2,aes(sim2)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.2)
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

hist(exp, main = "Data from Exponential(pi/2)")

exp <- as.data.frame(exp)
ggplot(exp,aes(exp))+
geom_histogram(aes(y=..density..),fill="skyblue",binwidth = 0.2)

Sys.time()-o
## Time difference of 1.097229 mins

N = 1000

#在單位正方形亂數生成座標
A <- cbind(runif(1000,0,1),runif(1000,0,1))
A <- as.data.frame(A)
ggplot(A,aes(x=A[,1],y=A[,2]))+
  geom_point()+
  labs(x="",y="",title="單位矩形內亂數生成座標")

#計算
set.seed(123)
o= Sys.time()
pv <- NULL
N = 1000
n=100
d=1000
for(j in 1:n){
sim2 <- foreach(i=1:d,.combine = cbind,.export ="sim")%dopar%{sim(N)}
# ks.test 
f <- ks.test(sim2, "pexp",pi/2)
pv[j] <- f$p.value}
mean(pv)
## [1] 0.5433344
sum((pv>0.05)*1)/100
## [1] 0.98
pv <- as.data.frame(pv)
ggplot(pv,aes(y=pv))+
  geom_point(aes(x = 1:100))+
  geom_hline(yintercept=0.05,color="red")+
  labs(x="",y="P_value",title="Simulation of P with 100 times ")

exp <- rexp(n*d,pi/2)
hist(sim2)

sim2 <- as.numeric(sim2)%>% as.data.frame()
ggplot(sim2,aes(sim2)) +
  geom_histogram(aes(y = ..density..), binwidth = 0.2)
## Don't know how to automatically pick scale for object of type data.frame. Defaulting to continuous.

hist(exp, main = "Data from Exponential(pi/2)")

exp <- as.data.frame(exp)
ggplot(exp,aes(exp))+
geom_histogram(aes(y=..density..),fill="skyblue",binwidth = 0.2)

Sys.time()-o
## Time difference of 21.60511 mins