library(ggplot2)
library(dplyr)
library(kableExtra)
library(knitr)
options(scipen = 100)
Uniform(0,1) transformation. For Uniform(0,1) random variables \(U_1, U_2, \ldots\), define \(N = \min\{n : \sum_{i=1}^{n} U_i > 1\}\). That is, N is the number of random numbers that must be summed to exceed 1.
Estimate E(N) and its standard error by generating 1,000, 2,000, 5,000, 10,000, and 100,000 values of N, and check whether any patterns emerge in the estimate and its s.e.
# n_sim: number of values of N to simulate #
Q1 <- function(n_sim){
N <- numeric(n_sim)
for(i in 1:n_sim){
s <- 0; count <- 0
while(s < 1){
s <- s + runif(1)     # keep adding uniforms until the sum exceeds 1
count <- count + 1}
N[i] <- count
}
return(N)
}
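A quick sanity check (illustrative only, with an arbitrary seed): every simulated value is at least 2, since a single Uniform(0,1) draw can never exceed 1.
set.seed(1)   # arbitrary seed, for reproducibility of this illustration only
Q1(5)         # five simulated values of N, each at least 2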
Calculation and output
output <- NULL
output2 <- NULL
sim <- c(1000, 2000, 5000, 10000, 100000)
for(i in 1:5){
res <- Q1(sim[i])       # one sample per simulation size, so mean and variance agree
output[i] <- mean(res)
output2[i] <- var(res)
}
output <- rbind(t(output), t(output2))
rownames(output) <- c("Mean","Variance")
colnames(output) <- sim
kable(output,"html") %>%
kable_styling(bootstrap_options = 'striped', full_width = F) %>%
add_header_above(c(" " = 1, "# of Simulation" = 5))
|          | 1000      | 2000      | 5000     | 10000     | 100000    |
|----------|-----------|-----------|----------|-----------|-----------|
| Mean     | 2.7080000 | 2.7345000 | 2.727600 | 2.7259000 | 2.7217900 |
| Variance | 0.7378378 | 0.7490855 | 0.752803 | 0.7639223 | 0.7639274 |
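As a benchmark, these estimates can be checked against the known theoretical values: \(P(N > n) = P(U_1 + \cdots + U_n \le 1) = 1/n!\), so \(E(N) = \sum_{n=0}^{\infty} 1/n! = e \approx 2.7183\), and a standard calculation gives \(Var(N) = 3e - e^2 \approx 0.7658\). The pattern in the table matches: the means settle toward e and the variances toward 0.766 as the number of simulations grows, while the s.e. of the mean shrinks roughly as \(1/\sqrt{n}\).
exp(1)                # 2.718282, the limit of the estimated means
3*exp(1) - exp(1)^2   # 0.7657894, the limit of the estimated variances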
You think you know baseball? Do home run hitters have the highest batting averages? Think about this hypothesis, then analyze the following data from a half season with the Braves:
# Question 02 #
x <- c(0.252,0.305,0.299,0.303,0.285,0.191,0.283,0.272,0.310,0.266,0.215,0.211,0.244,0.320)
y <- c(12,6,4,15,2,2,16,6,8,10,0,3,6,7)
Ques <- rbind(x[1:7], y[1:7], x[8:14], y[8:14])
rownames(Ques) <- c("Batting Average","Home Runs","Batting Average","Home Runs")
colnames(Ques) <- c(1:7)
kable(Ques, "html") %>%
kable_styling(bootstrap_options = "striped", full_width = F) %>%
add_header_above(c("Label" = 1, 'Baseball Data' = 7))
|                 | 1     | 2     | 3     | 4     | 5     | 6     | 7     |
|-----------------|-------|-------|-------|-------|-------|-------|-------|
| Batting Average | 0.252 | 0.305 | 0.299 | 0.303 | 0.285 | 0.191 | 0.283 |
| Home Runs       | 12    | 6     | 4     | 15    | 2     | 2     | 16    |
| Batting Average | 0.272 | 0.310 | 0.266 | 0.215 | 0.211 | 0.244 | 0.320 |
| Home Runs       | 6     | 8     | 10    | 0     | 3     | 6     | 7     |
num = 100,000 is the number of permutations; \(\alpha\) = 0.05.
alpha = 0.05
num = 100000
N = NULL
for(i in 1:num){
x1 <- sample(x)   # random re-pairing of batting averages and home runs
y1 <- sample(y)   # (permuting either vector alone would already suffice)
N <- c(N, sum(x1*y1))}
t0 <- sum(x*y)
N1 <- data.frame("BatXHit" = N) %>% mutate(over = ifelse(BatXHit >=t0, ">= critical value","< critical value"))
N1$over <- N1$over %>% as.factor()
ggplot(N1,aes(x = BatXHit,fill = over))+
geom_histogram(bins = 100,alpha = 0.7)+
scale_fill_manual(values = c("skyblue","blue"),
name = paste0("Label\n","k = ",t0),
breaks = c(">= critical value","< critical value k"),
labels = c(">= critical value","< critical value k"))+
theme(plot.title = element_text(colour = "black", face = "bold", size = 25))+
geom_vline(aes(xintercept = t0),linetype="dashed",color = "red")+
labs(title = "Permutation ",x = "Simulation of BatXHit", y = "Count")+
scale_x_continuous(breaks = c(24:27,t0,28),
labels = c(24:27,"k",28))
pv <- sum(N > t0 )/num
print(ifelse(pv > alpha, paste('p value',pv,'>',alpha,'Do not reject H0'),paste('p value',pv,'<=',alpha,'Reject H0')))
## [1] "p value 0.04776 <= 0.05 Reject H0"
Similar to what Efron did in the Law school data example, compute the bootstrap simulation for 50, 100, …, 10,000 replications. But instead of using the original 15 observations, we want to know whether the number of observations plays an important role. Randomly select 10, 15, 20, and 25 observations and see whether the bootstrap variance converges as the number of replications increases. (Note: you also need to compare your results with those of the population.)
library(bootstrap)
library(dplyr)
library(kableExtra)
library(knitr)
library(ggplot2)
data = law82
data %>% kable(.,"html") %>% kable_styling() %>% scroll_box(height = "200px")
| School | LSAT | GPA |
|---|---|---|
| 1 | 622 | 3.23 |
| 2 | 542 | 2.83 |
| 3 | 579 | 3.24 |
| 4 | 653 | 3.12 |
| 5 | 606 | 3.09 |
| 6 | 576 | 3.39 |
| 7 | 620 | 3.10 |
| 8 | 615 | 3.40 |
| 9 | 553 | 2.97 |
| 10 | 607 | 2.91 |
| 11 | 558 | 3.11 |
| 12 | 596 | 3.24 |
| 13 | 635 | 3.30 |
| 14 | 581 | 3.22 |
| 15 | 661 | 3.43 |
| 16 | 547 | 2.91 |
| 17 | 599 | 3.23 |
| 18 | 646 | 3.47 |
| 19 | 622 | 3.15 |
| 20 | 611 | 3.33 |
| 21 | 546 | 2.99 |
| 22 | 614 | 3.19 |
| 23 | 628 | 3.03 |
| 24 | 575 | 3.01 |
| 25 | 662 | 3.39 |
| 26 | 627 | 3.41 |
| 27 | 608 | 3.04 |
| 28 | 632 | 3.29 |
| 29 | 587 | 3.16 |
| 30 | 581 | 3.17 |
| 31 | 605 | 3.13 |
| 32 | 704 | 3.36 |
| 33 | 477 | 2.57 |
| 34 | 591 | 3.02 |
| 35 | 578 | 3.03 |
| 36 | 572 | 2.88 |
| 37 | 615 | 3.37 |
| 38 | 606 | 3.20 |
| 39 | 603 | 3.23 |
| 40 | 535 | 2.98 |
| 41 | 595 | 3.11 |
| 42 | 575 | 2.92 |
| 43 | 573 | 2.85 |
| 44 | 644 | 3.38 |
| 45 | 545 | 2.76 |
| 46 | 645 | 3.27 |
| 47 | 651 | 3.36 |
| 48 | 562 | 3.19 |
| 49 | 609 | 3.17 |
| 50 | 555 | 3.00 |
| 51 | 586 | 3.11 |
| 52 | 580 | 3.07 |
| 53 | 594 | 2.96 |
| 54 | 594 | 3.05 |
| 55 | 560 | 2.93 |
| 56 | 641 | 3.28 |
| 57 | 512 | 3.01 |
| 58 | 631 | 3.21 |
| 59 | 597 | 3.32 |
| 60 | 621 | 3.24 |
| 61 | 617 | 3.03 |
| 62 | 637 | 3.33 |
| 63 | 572 | 3.08 |
| 64 | 610 | 3.13 |
| 65 | 562 | 3.01 |
| 66 | 635 | 3.30 |
| 67 | 614 | 3.15 |
| 68 | 546 | 2.82 |
| 69 | 598 | 3.20 |
| 70 | 666 | 3.44 |
| 71 | 570 | 3.01 |
| 72 | 570 | 2.92 |
| 73 | 605 | 3.45 |
| 74 | 565 | 3.15 |
| 75 | 686 | 3.50 |
| 76 | 608 | 3.16 |
| 77 | 595 | 3.19 |
| 78 | 590 | 3.15 |
| 79 | 558 | 2.81 |
| 80 | 611 | 3.16 |
| 81 | 564 | 3.02 |
| 82 | 575 | 2.74 |
cor(data$LSAT,data$GPA)
## [1] 0.7599979
# design function: bootstrap the LSAT-GPA correlation #
# idx: row indices of the selected observations; n_obs: subsample size;
# rep_num: number of bootstrap replications
Q3 <- function(idx, n_obs, rep_num){
Sim_cor = NULL
for(i in 1:rep_num){
boot_idx <- sample(idx, n_obs, T)        # resample the indices with replacement
temp <- data[boot_idx, 2:3] %>% cor()
Sim_cor <- c(Sim_cor, temp[1,2])}
return(c(mean(Sim_cor), var(Sim_cor)))}
set.seed(106354012)
Sim_num = 10
num <- sample(1:82, Sim_num, F)   # draw 10 of the 82 schools without replacement
A = NULL; B = NULL
for(i in seq(50, 10000, by = 50)){
A <- c(A, Q3(num, Sim_num, i)[2])    # bootstrap variance of the correlation
B <- c(B, Q3(num, Sim_num, i)[1])}   # bootstrap mean of the correlation
# mean of the bootstrap variances of the correlation #
mean(A)
## [1] 0.04274998
# mean of the bootstrap correlations #
mean(B)
## [1] 0.7444454
temp <- data.frame(num = 1:200, boots_var = A)
ggplot(temp) +
geom_point(aes(x = num, y = boots_var)) +
geom_smooth(aes(x = num, y = boots_var)) +
labs(title = "Bootstrap variance with 10 observations",
x = "Index",
y = "Variance of correlation") +
theme_bw()
## `geom_smooth()` using method = 'loess'
temp <- data.frame(num = 1:200, boots_mean = B)
ggplot(temp) +
geom_point(aes(x = num, y = boots_mean)) +
geom_smooth(aes(x = num, y = boots_mean)) +
labs(title = "Bootstrap mean with 10 observations",
x = "Index",
y = "Mean of correlation") +
theme_bw()
## `geom_smooth()` using method = 'loess'
set.seed(106354012)
Sim_num = 15
num <- sample(1:82, Sim_num, F)   # draw 15 of the 82 schools without replacement
A = NULL; B = NULL
for(i in seq(50, 10000, by = 50)){
A <- c(A, Q3(num, Sim_num, i)[2])    # bootstrap variance of the correlation
B <- c(B, Q3(num, Sim_num, i)[1])}   # bootstrap mean of the correlation
# mean of the bootstrap variances of the correlation #
mean(A)
## [1] 0.01197177
# mean of the bootstrap correlations #
mean(B)
## [1] 0.8001203
temp <- data.frame(num = 1:200, boots_var = A)
ggplot(temp) +
geom_point(aes(x = num, y = boots_var)) +
geom_smooth(aes(x = num, y = boots_var)) +
labs(title = "Bootstrap variance with 15 observations",
x = "Index",
y = "Variance of correlation") +
theme_bw()
## `geom_smooth()` using method = 'loess'
temp <- data.frame(num = 1:200, boots_mean = B)
ggplot(temp) +
geom_point(aes(x = num, y = boots_mean)) +
geom_smooth(aes(x = num, y = boots_mean)) +
labs(title = "Bootstrap mean with 15 observations",
x = "Index",
y = "Mean of correlation") +
theme_bw()
## `geom_smooth()` using method = 'loess'
set.seed(106354012)
Sim_num = 20
num <- sample(1:82, Sim_num, F)   # draw 20 of the 82 schools without replacement
A = NULL; B = NULL
for(i in seq(50, 10000, by = 50)){
A <- c(A, Q3(num, Sim_num, i)[2])    # bootstrap variance of the correlation
B <- c(B, Q3(num, Sim_num, i)[1])}   # bootstrap mean of the correlation
# mean of the bootstrap variances of the correlation #
mean(A)
## [1] 0.008962489
# mean of the bootstrap correlations #
mean(B)
## [1] 0.7763331
temp <- data.frame(num = 1:200, boots_var = A)
ggplot(temp) +
geom_point(aes(x = num, y = boots_var)) +
geom_smooth(aes(x = num, y = boots_var)) +
labs(title = "Bootstrap variance with 20 observations",
x = "Index",
y = "Variance of correlation") +
theme_bw()
## `geom_smooth()` using method = 'loess'
temp <- data.frame(num = 1:200, boots_mean = B)
ggplot(temp) +
geom_point(aes(x = num, y = boots_mean)) +
geom_smooth(aes(x = num, y = boots_mean)) +
labs(title = "Bootstrap mean with 20 observations",
x = "Index",
y = "Mean of correlation") +
theme_bw()
## `geom_smooth()` using method = 'loess'
set.seed(106354012)
Sim_num = 25
num <- sample(1:82, Sim_num, F)   # draw 25 of the 82 schools without replacement
A = NULL; B = NULL
for(i in seq(50, 10000, by = 50)){
A <- c(A, Q3(num, Sim_num, i)[2])    # bootstrap variance of the correlation
B <- c(B, Q3(num, Sim_num, i)[1])}   # bootstrap mean of the correlation
# mean of the bootstrap variances of the correlation #
mean(A)
## [1] 0.008449368
# mean of the bootstrap correlations #
mean(B)
## [1] 0.7656147
temp <- data.frame(num = 1:200, boots_var = A)
ggplot(temp) +
geom_point(aes(x = num, y = boots_var)) +
geom_smooth(aes(x = num, y = boots_var)) +
labs(title = "Bootstrap variance with 25 observations",
x = "Index",
y = "Variance of correlation") +
theme_bw()
## `geom_smooth()` using method = 'loess'
temp <- data.frame(num = 1:200, boots_mean = B)
ggplot(temp) +
geom_point(aes(x = num, y = boots_mean)) +
geom_smooth(aes(x = num, y = boots_mean)) +
labs(title = "Bootstrap mean with 25 observations",
x = "Index",
y = "Mean of correlation") +
theme_bw()
## `geom_smooth()` using method = 'loess'
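To make the comparison with the population explicit, the reported bootstrap means and variances can be collected next to the population correlation computed earlier (a summary sketch; the numbers below are copied from the outputs above):
summary_tbl <- data.frame(
n_obs     = c(10, 15, 20, 25),
boot_mean = c(0.7444454, 0.8001203, 0.7763331, 0.7656147),
boot_var  = c(0.04274998, 0.01197177, 0.008962489, 0.008449368))
summary_tbl$bias <- summary_tbl$boot_mean - cor(data$LSAT, data$GPA)
summary_tbl   # the variance drops sharply as the subsample grows; the means straddle 0.76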
The block bootstrap can be used for prediction with dependent data. Use the built-in R data set “sunspot.year”, which can be modeled as an AR(2) process, and compare the predictions obtained via the block bootstrap and the AR(2) model. As a check, leave out the final 10 observations as “testing” data.
library(dplyr)
library(kableExtra)
library(knitr)
library(ggplot2)
data <- sunspot.year
# preview the first few values of the series #
head(data)
## [1] 5 11 16 23 36 58
Use the first 279 observations as training data and the final 10 observations as testing data.
train_data <- data[1:279]
test_data <- data[280:289]
# actual (held-out) testing data
test_data
## [1] 155.4 154.7 140.5 115.9 66.6 45.9 17.9 13.4 29.2 100.2
temp <- matrix(0, nrow = 1000, ncol = 10)
dr <- train_data[-1] - train_data[-279]   # first differences of the training series
set.seed(106354012)
for(i in 1:1000){
st <- sample(1:269, 1)                    # random start so a block of 10 consecutive differences fits
temp[i,] <- dr[st:(st+9)]}
# take the median of each column (i.e., of each forecast step) #
md <- apply(temp, 2, median)
md
## [1] -2.6 -3.0 -3.0 -4.0 -4.8 -5.0 -4.2 -4.0 -2.6 -2.6
predict_value = NULL
for(i in 1:10){
predict_value <- c(predict_value, sum(md[1:i]) + train_data[279])}   # last observed value plus cumulated median differences
# predicted values #
predict_value
## [1] 89.9 86.9 83.9 79.9 75.1 70.1 65.9 61.9 59.3 56.7
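In formula form, the rule implemented above is \(\hat{x}_{279+k} = x_{279} + \sum_{j=1}^{k} \tilde{d}_j\) for \(k = 1, \ldots, 10\), where \(\tilde{d}_j\) is the median, across the 1,000 resampled blocks, of the j-th within-block difference.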
# plot the comparison #
temp = data.frame(num = 280:289, test = test_data, sim = predict_value)
ggplot(temp) +
geom_line(aes(x = num, y = test)) +
geom_line(aes(x = num, y = sim), linetype = "dotdash") +
labs(title = "Prediction by Block Bootstrap", x = "Year", y = "Sunspot number") +
scale_x_continuous(breaks = 280:289, labels = 1979:1988)
# total absolute prediction error #
abs(test_data-predict_value) %>% sum()
## [1] 428.7
# estimate the AR(2) parameters, method = mle #
sunspot.ar <- ar(train_data,order.max = 2,method = "mle")
sunspot.ar
##
## Call:
## ar(x = train_data, order.max = 2, method = "mle")
##
## Coefficients:
## 1 2
## 1.3781 -0.6819
##
## Order selected 2 sigma^2 estimated as 264.2
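Written out, with \(\bar{x}\) denoting the estimated series mean (ar() fits the demeaned series), the fitted model is
\(x_t - \bar{x} = 1.3781\,(x_{t-1} - \bar{x}) - 0.6819\,(x_{t-2} - \bar{x}) + \varepsilon_t,\quad \varepsilon_t \sim N(0,\ 264.2).\)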
# forecast with the predict() function; n.ahead = 10 gives 10 steps ahead #
temp1 <- predict(sunspot.ar, n.ahead = 10)
# forecast values #
temp1$pred
## Time Series:
## Start = 280
## End = 289
## Frequency = 1
## [1] 123.25346 121.31135 97.66354 66.39796 39.43555 23.59818 20.15822
## [8] 26.21727 36.91324 47.52198
# plot the comparison #
temp = data.frame(num = 280:289, test = test_data, sim = temp1$pred)
ggplot(temp) +
geom_line(aes(x = num, y = test)) +
geom_line(aes(x = num, y = sim), linetype = 2) +
labs(title = "Prediction by AR(2)", x = "Year", y = "Sunspot number") +
scale_x_continuous(breaks = 280:289, labels = 1979:1988)
# total absolute prediction error #
sum(abs(temp1$pred - test_data))
## [1] 282.8067
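Dividing the two totals by the 10 held-out years gives per-year mean absolute errors, which makes the comparison concrete: on this split, the AR(2) forecasts are clearly closer than the block-bootstrap ones.
c(block = 428.7/10, ar2 = 282.8067/10)   # MAE: 42.87 vs 28.28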
If d is the minimum distance between any pair of n points uniformly distributed on the unit square, then \(n(n-1)d^2\) is approximately exponential with mean \(\frac{2}{\pi}\) (i.e., rate \(\frac{\pi}{2}\), the parametrization used by R's pexp/rexp below), provided that n is sufficiently large. Use R to check this result: first, write a function that produces n points uniformly distributed on the unit square. Then, write a function that calculates the smallest distance between any pair of the n points. Change the value of n, perform the simulation, and comment on what you find.
library(foreach)
library(doSNOW)
library(dplyr)
library(ggplot2)
sim <- function(num){
x <- runif(num)   # num x-coordinates in the unit square
y <- runif(num)   # num y-coordinates in the unit square
d = NULL
while(length(x) > 1){
a = (x[-1] - x[1])^2
b = (y[-1] - y[1])^2
d = min(c(d, a + b))   # running minimum of the squared pairwise distances
x <- x[-1]
y <- y[-1]}
return(d * num * (num - 1))}   # n(n-1)d^2, since d here is already the squared distance
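A more compact equivalent (a sketch, assuming memory is not a concern at these sample sizes; sim_dist is an illustrative name) computes all pairwise distances at once with dist():
sim_dist <- function(num){
pts <- cbind(runif(num), runif(num))   # num uniform points in the unit square
d2 <- min(dist(pts))^2                 # smallest pairwise distance, squared
num * (num - 1) * d2}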
Set up the parallel back end (open a cluster):
cl <- makeCluster(3)   # start 3 worker cores
registerDoSNOW(cl)     # register the cluster so foreach %dopar% can use it
# generate random coordinates in the unit square
A <- cbind(runif(20,0,1), runif(20,0,1))
A <- as.data.frame(A)
ggplot(A, aes(x = V1, y = V2)) +
geom_point() +
labs(x = "", y = "", title = "Random points in the unit square")
# simulation
set.seed(123)
o = Sys.time()
pv <- NULL
N = 20     # number of points per sample
n = 100    # number of KS tests
d = 1000   # simulated values of n(n-1)d^2 per test
for(j in 1:n){
sim2 <- foreach(i = 1:d, .combine = cbind, .export = "sim") %dopar% {sim(N)}
# KS test against the exponential with rate pi/2
f <- ks.test(sim2, "pexp", pi/2)
pv[j] <- f$p.value}
mean(pv)
## [1] 0.4419295
sum((pv>0.05)*1)/100   # proportion of tests not rejecting at the 5% level
## [1] 0.89
pv <- as.data.frame(pv)
ggplot(pv, aes(y = pv)) +
geom_point(aes(x = 1:100)) +
geom_hline(yintercept = 0.05, color = "red") +
labs(x = "", y = "P_value", title = "P-values from 100 KS tests")
exp_sample <- rexp(n*d, pi/2)   # reference sample from Exp(rate = pi/2); avoid shadowing exp()
hist(sim2)
sim2 <- data.frame(value = as.numeric(sim2))
ggplot(sim2, aes(value)) +
geom_histogram(aes(y = ..density..), binwidth = 0.2)
hist(exp_sample, main = "Data from Exponential(pi/2)")
exp_sample <- as.data.frame(exp_sample)
ggplot(exp_sample, aes(exp_sample)) +
geom_histogram(aes(y = ..density..), fill = "skyblue", binwidth = 0.2)
Sys.time()-o
## Time difference of 35.58508 secs
# generate random coordinates in the unit square
A <- cbind(runif(100,0,1), runif(100,0,1))
A <- as.data.frame(A)
ggplot(A, aes(x = V1, y = V2)) +
geom_point() +
labs(x = "", y = "", title = "Random points in the unit square")
# simulation
set.seed(123)
o = Sys.time()
pv <- NULL
N = 100    # number of points per sample
n = 100    # number of KS tests
d = 1000   # simulated values of n(n-1)d^2 per test
for(j in 1:n){
sim2 <- foreach(i = 1:d, .combine = cbind, .export = "sim") %dopar% {sim(N)}
# KS test against the exponential with rate pi/2
f <- ks.test(sim2, "pexp", pi/2)
pv[j] <- f$p.value}
mean(pv)
## [1] 0.4889876
sum((pv>0.05)*1)/100   # proportion of tests not rejecting at the 5% level
## [1] 0.96
pv <- as.data.frame(pv)
ggplot(pv, aes(y = pv)) +
geom_point(aes(x = 1:100)) +
geom_hline(yintercept = 0.05, color = "red") +
labs(x = "", y = "P_value", title = "P-values from 100 KS tests")
exp_sample <- rexp(n*d, pi/2)   # reference sample from Exp(rate = pi/2)
hist(sim2)
sim2 <- data.frame(value = as.numeric(sim2))
ggplot(sim2, aes(value)) +
geom_histogram(aes(y = ..density..), binwidth = 0.2)
hist(exp_sample, main = "Data from Exponential(pi/2)")
exp_sample <- as.data.frame(exp_sample)
ggplot(exp_sample, aes(exp_sample)) +
geom_histogram(aes(y = ..density..), fill = "skyblue", binwidth = 0.2)
Sys.time()-o
## Time difference of 1.097229 mins
# generate random coordinates in the unit square
A <- cbind(runif(1000,0,1), runif(1000,0,1))
A <- as.data.frame(A)
ggplot(A, aes(x = V1, y = V2)) +
geom_point() +
labs(x = "", y = "", title = "Random points in the unit square")
# simulation
set.seed(123)
o = Sys.time()
pv <- NULL
N = 1000   # number of points per sample
n = 100    # number of KS tests
d = 1000   # simulated values of n(n-1)d^2 per test
for(j in 1:n){
sim2 <- foreach(i = 1:d, .combine = cbind, .export = "sim") %dopar% {sim(N)}
# KS test against the exponential with rate pi/2
f <- ks.test(sim2, "pexp", pi/2)
pv[j] <- f$p.value}
mean(pv)
## [1] 0.5433344
sum((pv>0.05)*1)/100   # proportion of tests not rejecting at the 5% level
## [1] 0.98
pv <- as.data.frame(pv)
ggplot(pv, aes(y = pv)) +
geom_point(aes(x = 1:100)) +
geom_hline(yintercept = 0.05, color = "red") +
labs(x = "", y = "P_value", title = "P-values from 100 KS tests")
exp_sample <- rexp(n*d, pi/2)   # reference sample from Exp(rate = pi/2)
hist(sim2)
sim2 <- data.frame(value = as.numeric(sim2))
ggplot(sim2, aes(value)) +
geom_histogram(aes(y = ..density..), binwidth = 0.2)
hist(exp_sample, main = "Data from Exponential(pi/2)")
exp_sample <- as.data.frame(exp_sample)
ggplot(exp_sample, aes(exp_sample)) +
geom_histogram(aes(y = ..density..), fill = "skyblue", binwidth = 0.2)
Sys.time()-o
## Time difference of 21.60511 mins
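Across the three sample sizes, the fraction of KS tests that fail to reject rises from 0.89 (n = 20) through 0.96 (n = 100) to 0.98 (n = 1000), approaching (and here slightly overshooting) the 0.95 expected under the null: the exponential approximation for \(n(n-1)d^2\) indeed improves as n grows, as the theorem requires. Finally, a housekeeping step: the cluster opened with makeCluster(3) is never released. Assuming cl is still in scope, it should be shut down once the simulations finish:
stopCluster(cl)   # release the 3 worker cores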