Experiment with as many variance reduction techniques as you can think of to apply the problem of evaluating \(P(X>1)\) for \(X \sim Cauchy\).
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(kableExtra)
library(knitr)
library(dplyr)
knitr::opts_chunk$set(echo = TRUE)
options(knitr.table.format = "html")
options(scipen = 100)
set.seed(15) # 3 + 12
# X ~ Cauchy(0,1)
# P(X > 1)
1 - pcauchy(1)
## [1] 0.25
x = seq(0,3,by = 0.001)
fx = dcauchy(x)
data <- data.frame(x = x, Cauchy = dcauchy(x))
data <- mutate(data,new = ifelse(x > 1, "blue", "white"))
temp1 <- data[1:1000,]
temp2 <- data[1001:length(x),]
\(P(0 < X <=1 )\)
ggplot()+
geom_line(data = data,aes(x = x, y = Cauchy))+
geom_vline(xintercept = 1,linetype = 2)+
geom_ribbon(data = temp1, aes(x = x,ymin = 0, ymax = Cauchy), fill = "#FF8888",alpha = 0.5)+
geom_ribbon(data = temp2, aes(x = x,ymin = 0, ymax = Cauchy), fill = "white", alpha = 0.7)+
labs(title = "P(0 < X <= 1)")
\(P(X > 1)\)
ggplot()+
geom_line(data = data,aes(x = x, y = Cauchy))+
geom_vline(xintercept = 1,linetype = 2)+
geom_ribbon(data = temp1, aes(x = x,ymin = 0, ymax = Cauchy), fill = "white",alpha = 0.5)+
geom_ribbon(data = temp2, aes(x = x,ymin = 0, ymax = Cauchy), fill = "#FF8888", alpha = 0.7)+
labs(title = "P(X > 1)")
## Monte-Carlo Integration
N = 1000
cauchy <- function(x){
return(1/(pi*(1+x^2)))
}
theta.hat <- NULL
for(i in 1:1000){
# 從cauchy抽取樣本
x <- runif(N)
x <- cauchy(x)
theta.hat[i]<-0.5 - mean(x)}
Mc.mean <- mean(theta.hat)
Mc.var <- var(theta.hat)
N = 1000
theta.hat <- NULL
for ( i in 1:1000){
x <- rcauchy(N)
theta.hat[i] <- mean(x > 1)}
Hm1.mean <- mean(theta.hat)
Hm1.var <- var(theta.hat)
N = 1000
theta.hat = NULL
for(i in 1:1000){
x <- rcauchy(1000) %>% abs()
theta.hat[i] <- 0.5*mean(x > 1)
}
Hm2.mean <- mean(theta.hat)
Hm2.var <- var(theta.hat)
N = 1000
theta.hat <- NULL
cauchy <- function(x){
return(1/(pi*(1+x^2)))}
for(i in 1:1000){
x <- runif(N)
theta.hat[i] <- (1- mean(2*cauchy(x)))/2
}
Hm3.mean <- mean(theta.hat)
Hm3.var <- var(theta.hat)
N = 1000
for( i in 1:1000){
x <- runif(N/2)
y <- 1 - x
temp1 <- cauchy(x)
temp2 <- cauchy(y)
theta.hat[i] <-0.5 - 0.5*(mean(temp1)+mean(temp2))}
AV.mean <- mean(theta.hat)
AV.var <- var(theta.hat)
hes <- function(x){
return(1/(pi*(1 + x^(-2))))}
N = 1000
theta.hat <- NULL
for(i in 1:1000){
U <- runif(N)
x <- 1/U
theta.hat[i] <- mean(hes(x))
}
Is.mean <- mean(theta.hat)
Is.var <- var(theta.hat)
令另一個function為\(\frac{1}{1+x}\)
f <- function(x){
return(1/(1+x))
}
cauchy <- function(x){
return(1/(pi*(1+x^2)))
}
theta.hat = NULL
for(i in 1:N){
u <- runif(1000)
B <- f(u)
A <- cauchy(u)
a <- -cov(A,B) / var(B) #est of c*
x <- runif(N)
T1 <-0.5 - cauchy(x)
theta.hat[i]<- 0.5 - mean(T1 + a * (f(x) -log(2, base = exp(1))))
}
Cv.mean <- mean(theta.hat)
Cv.var <- var(theta.hat)
分五層
cauchy <- function(x){
return(1/(pi*(1+x^2)))
}
SS = NULL
for(i in 1:N){
x1 <- runif(N/5, 0, 0.2)
x2 <- runif(N/5, 0.2, 0.4)
x3 <- runif(N/5, 0.4, 0.6)
x4 <- runif(N/5, 0.6, 0.8)
x5 <- runif(N/5, 0.8, 1)
S1 <- 0.5 - mean(cauchy(x1))
S2 <- 0.5 - mean(cauchy(x2))
S3 <- 0.5 - mean(cauchy(x3))
S4 <- 0.5 - mean(cauchy(x4))
S5 <- 0.5 - mean(cauchy(x5))
SS[i] <- mean(c(S1, S2, S3, S4, S5))
}
SS.mean <- mean(SS)
SS.var <- var(SS)
tmp1 <- c(Mc.mean, Hm1.mean, Hm2.mean, Hm3.mean, AV.mean, Is.mean, Cv.mean, SS.mean)
tmp2 <- c(Mc.var, Hm1.var, Hm2.var, Hm3.var, AV.var, Is.var, Cv.var, SS.var)
table <- rbind(tmp1, tmp2)
rownames(table) <- c("Mean","Variance")
colnames(table) <- c("Monte-Carlo", "Hit or miss1", "Hit or miss2", "Hit or miss3", "Antithetic",
" Importance", "Control", "Stratified(5)")
temp <- table %>% round(., digits = 8)
kable(temp) %>% kable_styling()
| Monte-Carlo | Hit or miss1 | Hit or miss2 | Hit or miss3 | Antithetic | Importance | Control | Stratified(5) | |
|---|---|---|---|---|---|---|---|---|
| Mean | 0.2499791 | 0.2498500 | 0.2497935 | 0.2499957 | 0.2499947 | 0.2499771 | 0.2498210 | 0.2500137 |
| Variance | 0.0000027 | 0.0001932 | 0.0000607 | 0.0000029 | 0.0000000 | 0.0000024 | 0.0000098 | 0.0000001 |
Hammersley and Handscomb (1964) used the integration of \(\theta = \int_{0}^{1} \frac{e^x-1}{e-1}dx\) on \((0,1)\) as a test problem of variance reduction techniques (which is about 0.4180233). Achieve as large a variance reduction as you can. (They achieved 4 million.)
set.seed(15) # 3 + 12
# define function
Q2 <- function(x){
y <- (exp(x)-1)/(exp(1)-1)
return(y)}
x = seq(0,1,by = 0.001)
fx = Q2(x)
data <- data.frame(x = x, fx = Q2(x))
ggplot()+
geom_line(data = data,aes(x = x, y = fx))+
geom_vline(xintercept = 1,linetype = 2)+
geom_ribbon(data = data, aes(x = x,ymin = 0, ymax = fx), fill = "#00FF00",alpha = 0.7)+
labs(title = "Question 02")
## Monte-Carlo Integration
N = 1000
theta.hat = NULL
for(i in 1:N){
x <- runif(N)
theta.hat[i] <- mean(Q2(x))}
Mc.mean <- mean(theta.hat)
Mc.var <-var(theta.hat)
N = 1000
theta.hat = NULL
for(i in 1:N){
x <- runif(N/2)
y <- 1 - x
temp1 <- mean(Q2(x))
temp2 <- mean(Q2(y))
theta.hat[i] <- 0.5*(temp1+temp2)}
AV.mean <- mean(theta.hat)
AV.var <- var(theta.hat)
theta.hat1 = NULL
theta.hat2 = NULL
theta.hat3 = NULL
theta.hat4 = NULL
for( i in 1:N){
u <- runif(N) #f3, inverse transform method
x <- - log(1 - u * (1 - exp(-1)))
fg <- Q2(x) / (exp(-x) / (1 - exp(-1)))
theta.hat1[i] <- mean(fg)
u <- runif(N/2) #f3, inverse transform method + Antithetic Variate
v <- 1 - u
x1 <- - log(1 - u * (1 - exp(-1)))
x2 <- - log(1 - v * (1 - exp(-1)))
fg1 <- Q2(x1) / (exp(-x1) / (1 - exp(-1)))
fg2 <- Q2(x2) / (exp(-x2) / (1 - exp(-1)))
fg <- 0.5*(fg1+fg2)
theta.hat2[i] <- mean(fg)
u <- runif(N) #f4, inverse transform method
x <- tan(pi * u / 4)
fg <- Q2(x) / (4 / ((1 + x^2) * pi))
theta.hat3[i] <- mean(fg)
u <- runif(N/2) #f4, inverse transform method + Antithetic Variate
v <- 1 - u
x1 <- tan(pi * u / 4)
x2 <- tan(pi * v / 4)
fg1 <- Q2(x1) / (4 / ((1 + x1^2) * pi))
fg2 <- Q2(x2) / (4 / ((1 + x2^2) * pi))
fg <- 0.5*(fg1+fg2)
theta.hat4[i] <- mean(fg)
}
mean11 <- c(mean(theta.hat1 ),mean(theta.hat2 ),mean(theta.hat3 ),mean(theta.hat4 ))
var11 <- c(var(theta.hat1 ),var(theta.hat2 ),var(theta.hat3 ),var(theta.hat4))
Is.mean1 <- mean(theta.hat3 )
Is.mean2 <- mean(theta.hat4)
Is.var1 <- var(theta.hat3)
Is.var2 <- var(theta.hat4)
另一函式為 x
f <- function(x){
x}
theta.hat = NULL
for( i in 1:N){
u <- runif(10000)
B <- f(u)
A <- Q2(u)
a <- -cov(A,B) / var(B)
u <- runif(N)
T1 <- Q2(u)
T2 <- T1 + a * (f(u) - 1/2)
theta.hat[i] <- mean(T2)
}
Cv.mean <- mean(theta.hat)
Cv.var <- var(theta.hat)
Q2 <- function(x){
y <- (exp(x)-1)/(exp(1)-1)
return(y)}
N <- 1000
SS = NULL
for(i in 1:N){
x1 <- runif(N/5, 0, 0.2)
x2 <- runif(N/5, 0.2, 0.4)
x3 <- runif(N/5, 0.4, 0.6)
x4 <- runif(N/5, 0.6, 0.8)
x5 <- runif(N/5, 0.8, 1)
S1 <- mean(Q2(x1))
S2 <- mean(Q2(x2))
S3 <- mean(Q2(x3))
S4 <- mean(Q2(x4))
S5 <- mean(Q2(x5))
SS[i] <- mean(c(S1, S2, S3, S4, S5))
}
SS.mean <- mean(SS)
SS.var <- var(SS)
tmp1 <- c(Mc.mean, AV.mean, Is.mean1,Is.mean2, Cv.mean, SS.mean)
tmp2 <- c(Mc.var, AV.var, Is.var1, Is.var2, Cv.var, SS.var)
table <- rbind(tmp1, tmp2)
rownames(table) <- c("Mean","Variance")
colnames(table) <- c("Monte-Carlo", "Antithetic"," Importance", "Importance+Antithetic","Control", "Stratified(5)")
temp <- table %>% round(., digits = 8)
temp %>% kable() %>% kable_styling()
| Monte-Carlo | Antithetic | Importance | Importance+Antithetic | Control | Stratified(5) | |
|---|---|---|---|---|---|---|
| Mean | 0.4178798 | 0.4180536 | 0.4182496 | 0.4180618 | 0.4180334 | 0.4180431 |
| Variance | 0.0000845 | 0.0000028 | 0.0001548 | 0.0000414 | 0.0000013 | 0.0000034 |
Let \(X_i,i=1,2,3,4,5\) be independent exponential random variables each with mean 1, and consider the quantity \(\theta\) defined by \(\theta = P(\sum_{i=1}^{5}iX_i\geq 21.6)\).Propose at least three simulation methods to estimate \(\theta\) and compare their variances. ## 制定 \(\theta = P(\sum_{i=1}^{5}iX_i\geq 21.6)\)的機率形式
Q3<-function(x){
(312.5/60)*exp((-1/5)*x)-
(640/60)*exp((-1/4)*x)+
(405/60)*exp((-1/3)*x)+
(1/24)*exp((-1)*x)+
(-4/3)*exp((-1/2)*x)
}
set.seed(15) # 3 + 12
temp2 <- NULL
for(j in 1:1000){
temp <- NULL
for( i in 1:1000){
tmp <- rexp(5)
temp[i] <- sum(tmp*c(1:5))}
temp2[j] <- sum(temp>21.6)/1000
}
Hm.mean <- mean(temp2)
Hm.var <- var(temp2)
N <- 1000
theta.hat <- NULL
for(i in 1:1000){
x <- runif(N,0,21.6)
theta.hat[i] <-1 - mean(Q3(x)*21.6)
}
Mc.mean <- mean(theta.hat)
Mc.var <- var(theta.hat)
N <- 1000
theta.hat <- NULL
for(i in 1:1000){
x <- runif(N/2,0,21.6)
y <- 21.6 - x
theta.hat[i] <- (2 - mean(Q3(x)*21.6) - mean(Q3(y)*21.6))*0.5
}
Av.mean <- mean(theta.hat)
Av.var <- var(theta.hat)
N <- 1000
theta.hat <- NULL
for(i in 1:1000){
tmp <- c()
for(j in 1:5){
x <- runif(N/5,(j - 1)/5*21.6,j/5*21.6)
tmp <- c(tmp, Q3(x)*21.6)
}
theta.hat[i] <- 1 - mean(tmp)
}
SS.mean <- mean(theta.hat)
SS.var <- var(theta.hat)
table.mean <- c(Mc.mean, Av.mean, SS.mean, Hm.mean)
table.var <- c(Mc.var, Av.var, SS.var, Hm.var)
table <- rbind(table.mean, table.var)
rownames(table) <- c("mean", "var")
colnames(table) <- c("Monte", "Antithetic", "Stratified", "Hit")
temp <- table %>% round(., digits = 8)
temp %>% kable() %>% kable_styling()
| Monte | Antithetic | Stratified | Hit | |
|---|---|---|---|---|
| mean | 0.1681060 | 0.1673069 | 0.1689417 | 0.1681690 |
| var | 0.0001978 | 0.0002972 | 0.0000200 | 0.0001311 |
First, simulate 100 observations from \(Beta(2,3)\) and then use 3 density estimating methods to smooth the observations. You need to specify the parameters in the smoothing methods, and compare the results.
library(dplyr)
library(ggplot2)
library(magrittr)
library(tidyverse)
# First, simulate 100 observations from a mixed distribution of beta(2,3),
# each with probability 0.5. Then, use at least 3 density estimating methods
# to smooth the observations.
# You need to specify the parameters in the smoothing methods,
# and compare the results.
# 取出我們要的亂數
set.seed(106354012)
m = 100
x = rbeta(m,2,3)
# 畫個點的散佈圖
stripchart(x,pch=16,cex=0.5,col=3,main="Dotplot")
# 然後畫個直方圖
y = seq(0,1,length=100)
hist( x, breaks = 10,probability = T,ylim = c(0,3),xlim= c(0,1),col = "#AAAAAA",main = "Density Estimation (h=0.1)")
# 這是真實的beta(2,3)圖形
lines(y , dbeta(y,2,3) , col = "#00AAFF" , lwd = 3)
legend("topright" , "Beta(2,3)" , lty = 1 , col="#00AAFF" , lwd = 3)
# 這是直接用套件模擬出來的函數值
kernal_g = density(x, kernel = c("gaussian"), width = 0.1)
kernal_r = density(x, kernel = c("rectangular"), width = 0.1)
kernal_t = density(x, kernel = c("triangular"), width = 0.1)
# 把他們畫在plot裡面囉
lines(kernal_g, col = "#FF0000" , lwd = 2)
lines(kernal_r, col = "#00FF00" , lwd = 2)
lines(kernal_t, col = "#FFFF00" , lwd = 2)
legend("right",legend=c("gaussian","rectangular","triangular"), col=c("#FF0000","#00FF00","#FFFF00"),lwd=2,cex=1)
box()
# (h=0.2)
hist( x, breaks = 5,probability = T,ylim = c(0,3),xlim= c(0,1),col = "#AAAAAA",main = "Density Estimation (h=0.2)")
# 這是真實的beta(2,3)圖形
lines(y , dbeta(y,2,3) , col = "#00AAFF" , lwd = 3)
legend("topright" , "Beta(2,3)" , lty = 1 , col="#00AAFF" , lwd = 3)
# 這是直接用套件模擬出來的函數值
kernal_g = density(x, kernel = c("gaussian"), width = 0.2)
kernal_r = density(x, kernel = c("rectangular"), width = 0.2)
kernal_t = density(x, kernel = c("triangular"), width = 0.2)
# 把他們畫在plot裡面囉
lines(kernal_g, col = "#FF0000" , lwd = 2)
lines(kernal_r, col = "#00FF00" , lwd = 2)
lines(kernal_t, col = "#FFFF00" , lwd = 2)
legend("right",legend=c("gaussian","rectangular","triangular"), col=c("#FF0000","#00FF00","#FFFF00"),lwd=2,cex=1)
box()
# 這是不想用套件的人寫的code~
set.seed(106354012)
x <- rbeta(100,2,3)
h=0.1
# histogram density estimator (直方圖)
histogram = function(x,h){
g = function(x,a,b){
if (x<=b & x>=a) {return(1)}
else {return(0)}} # 寫出一個計算個數前需要的函數
n <- length(x)
seq <- seq(min(x),max(x),by=h) # 從(0,1)間隔h做出切割點
a = seq[-length(seq)] # 每組下界
b = seq[-1] # 每組上界
ni = NULL
for (k in 1:length(a)){
ni[k] = sum(x<=b[k] & x>=a[k])}
y_hat = NULL
for (i in sort(x)){
I=NULL
for (j in 1:length(a)){
gi=g(i,a[j],b[j])
I=c(I,gi)} # 指標函數
y=1/n*sum(ni/h*I) # 模擬樣本之函數估計值
y_hat=c(y_hat,y)}
return(y_hat)}
# naive density estimator
naive =function(x,h){
w = function(y){
if (abs(y)<1) {return(1/2)}
else {return(0)}} # 寫出公式裡的函數w
n = length(x)
seq = seq(min(x),max(x),length=length(x))
y_hat = NULL
for (i in seq){
W=NULL
for (j in x){
wi=w((i-j)/h)
W=c(W,wi)}
y=1/n*sum(1/h*W) # 樣本之函數估計值
y_hat=c(y_hat,y)}
return(y_hat)}
# kernel density estimator
# norm
kernel_norm=function(x,h){
w=function(y){dnorm(y)} # 核密度函數(常態)
n = length(x)
seq = seq(min(x),max(x),length=length(x))
y_hat = NULL
for (i in seq){
W=NULL
for (j in x){
wi=w((i-j)/h)
W=c(W,wi)}
y=1/n*sum(1/h*W)
y_hat=c(y_hat,y)}
return(y_hat)}
# Gamma核函數
kernel_gamma=function(x,h){
w=function(y){dgamma(y,shape = 1)} # 核密度函數(Gamma)
n = length(x)
seq = seq(min(x),max(x),length=length(x))
y_hat = NULL
for (i in seq){
W=NULL
for (j in x){
wi=w((i-j)/h)
W=c(W,wi)}
y=1/n*sum(1/h*W)
y_hat=c(y_hat,y)}
return(y_hat)}
# 將亂數帶入函數找出估計函數值(h=0.1)
y1 <- histogram(x,0.1) #h=0.1,也可使用其他h值
y2 <- naive(x,0.1) #h=0.1,也可使用其他h值
y3 <- kernel_norm(x,0.1) #h=0.1,也可使用其他h值
y4 <- kernel_gamma(x,0.1) #h=0.1,也可使用其他h值
# 畫圖囉
xx <- sort(x)
yy <- dbeta(xx,2,3)
y2 <- naive(xx,0.1)
y3 <- kernel_norm(xx,0.1)
y4 <- kernel_gamma(xx,0.1)
data <- cbind(xx,yy,y2,y3,y4) %>% as.data.frame()
colnames(data) <- c("sample","Beta(2,3)","naive","kernal(norm)","kernal(gamma)")
LBJ <- gather(data,key = "type",value = "value",2:5)
colnames(LBJ) <- c("sample","LineType","value")
library(magrittr)
LBJ$LineType %<>% as.factor()
library(ggplot2)
ggplot(data = LBJ) + labs(title = "Density Estimation (h=0.1)")+
xlim(0,1)+ ylim(0,2.5) +
geom_histogram(mapping = aes(x=sample,y=..density..),color="black",fill="gray",binwidth = 0.1)+
geom_line(mapping = aes(x=sample,y=value,color=LineType,group=LineType),size=1.2)+
theme(legend.title = element_text(colour="royalblue", size=20, face="bold"))+
theme(legend.text = element_text(size = 16))+
theme(legend.position = c(0.87,0.6))+
theme(legend.background = element_rect(fill="#FFFFF0",size=1, linetype="solid",colour ="darkblue"))+
theme(panel.grid.major = element_blank())+
theme(panel.grid.minor = element_blank())+
theme(panel.background = element_rect(fill="#EEFFFF",colour="black",size = 2))
# 畫圖囉 (h=0.2)
xx <- sort(x)
yy <- dbeta(xx,2,3)
y1 <- histogram(xx,0.2)
y2 <- naive(xx,0.2)
y3 <- kernel_norm(xx,0.2)
y4 <- kernel_gamma(xx,0.2)
data <- cbind(xx,yy,y2,y3,y4) %>% as.data.frame()
colnames(data) <- c("sample","Beta(2,3)","naive","kernal(norm)","kernal(gamma)")
LBJ <- gather(data,key = "type",value = "value",2:5)
colnames(LBJ) <- c("sample","LineType","value")
library(magrittr)
LBJ$LineType %<>% as.factor()
plot(sort(x),y1,typ="l",xlab="x",ylab="f(x)",xlim=c(0,1), ylim=c(0,3))
library(ggplot2)
ggplot(data = LBJ) + labs(title = "Density Estimation (h=0.2)")+
xlim(0,1)+ ylim(0,2.5) +
geom_histogram(mapping = aes(x=sample,y=..density..),color="black",fill="gray",binwidth = 0.2)+
geom_line(mapping = aes(x=sample,y=value,color=LineType,group=LineType),size=1.2)+
theme(legend.title = element_text(colour="royalblue", size=20, face="bold"))+
theme(legend.text = element_text(size = 16))+
theme(legend.position = c(0.87,0.6))+
theme(legend.background = element_rect(fill="#FFFFF0",size=1, linetype="solid",colour ="darkblue"))+
theme(panel.grid.major = element_blank())+
theme(panel.grid.minor = element_blank())+
theme(panel.background = element_rect(fill="#EEFEFF",colour="black",size = 2))
Let \(x\) be 100 equally spaced points on \([0,2π]\) and let \(y_i=sunx_i+\epsilon_i\) with \(\epsilon_i\sim N(0,0.09)\).Apply at least 3 linear smoothers and compare the differences, with respect to mean squares error (i.e., \(bias^2\) and variance) from 1,000 simulation runs.
library(plyr)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(KernSmooth)
library(broom)
library(magrittr)
library(igraph)
# kernel smooth
set.seed(106354012)
a <- seq(0,2*pi, length=100)
b <- sin(a) + rnorm(100,0,0.09)
c <- sin(a)
# kernel smooth (kernel is norm)
data <- ksmooth(a, b, kernel = "normal", bandwidth = 0.1)%>% as.data.frame()
data <- cbind(b,c,data)%>% as.data.frame()
ggplot(data,aes(x=x))+ labs(title="kernel smooth of sin(x) [h=0.1]",x="x",y="sin(x)")+
geom_point(aes(y=b),col="#3333FF")+
geom_vline(xintercept = 0,size=1)+
geom_hline(yintercept = 0,size=1)+
geom_line(aes(y=y),col="#00D0FF",lwd=1)+
scale_x_continuous(breaks = c(0:2*pi))+
theme(panel.grid.major = element_line(NA),panel.grid.minor =element_line(NA))+
theme(panel.background = element_rect(color = "black",size = 2))+
theme(plot.title = element_text(size = 30, face = "bold"))+
theme(legend.title=element_text(size=24))+
theme(legend.text=element_text(size=20))
# MSE
d <- c()
e <- c()
for(j in 1:1000){
b <- sin(a) + rnorm(100,0,0.09)
d <- ksmooth(a, b, kernel = "normal", bandwidth = 0.1)
e <- rbind(e,d$y)
}
bias <- c()
var <- c()
for(i in 1:100){
bias[i] = mean(e[i,])-c[i]
var[i] = var(e[i,])
}
MSE <- sum(bias^2)+sum(var) ; MSE
## [1] 100.1644
# spline
# plot
df_values <- c(3, 4, 8, 16)
x = seq(from=0, to=2*pi, length=100)
f_x = sin(x)
epsilon = rnorm(100, 0, sd = 0.3)
y = f_x + epsilon
values <- data_frame(
x = seq(from=0, to=2*pi, length=100),
f_x = sin(x),
epsilon = rnorm(100, 0, sd = 0.3),
y = f_x + epsilon
)
overall <- NULL
for(df in df_values){
overall <- smooth.spline(values$x, values$y, df=df) %>%
augment() %>%
mutate(df=df) %>%
bind_rows(overall)
}
overall <- cbind(overall,f_x) %>% as.data.frame()
multiple_df <- overall %>%
ggplot(aes(x=x)) +
geom_point(aes(y=y))+
geom_line(aes(y=.fitted,color="#123456"),size=1) +
facet_wrap(~df, nrow=2) +
labs(title="Splines fit w / different degrees of freedom")+
theme(legend.title=element_blank())+
theme(legend.text = element_blank())+
theme(panel.background = element_rect(colour = "black"))+
geom_vline(xintercept = 0,size=1)+
geom_hline(yintercept = 0,size=1)+
theme(panel.grid.major = element_line(NA),panel.grid.minor =element_line(NA))+
theme(panel.background = element_rect(color='#000000',size=2))+
theme(plot.title = element_text(size = 30, face = "bold"))+
theme(legend.title=element_text(size=24))+
theme(legend.text=element_text(size=20))
multiple_df
get.spline.info<-function(object){
data.frame(x=object$x,y=object$y,df=object$df)
}
# plot
a <- seq(0,2*pi, length=100)
b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
c <- sin(seq(0,2*pi, length=100))
spline1 <- smooth.spline(x = b, df = 3)
spline2 <- smooth.spline(x = b, df = 4)
spline3 <- smooth.spline(x = b, df = 8)
spline4 <- smooth.spline(x = b, df = 16)
data<-ldply(list(spline1,spline2,spline3,spline4),get.spline.info)
data <- cbind(a,b,c,data) %>% as.data.frame()
ggplot(data,aes(x=a))+ labs(title="Spline smoother of sin(x)") +
geom_line(aes(y=c),col="gray",size=2)+
geom_point(aes(y=b))+
geom_vline(xintercept = 0,size=1)+
geom_hline(yintercept = 0,size=1)+
geom_line(mapping = aes(y=y,color=factor(round(df,0)),group=df),lwd=1.2)+
scale_color_discrete("The number of nodes")+
theme(legend.position = c(0.8,0.8))+
theme(legend.background = element_rect(fill="#FFFFF0",colour = "black"))+
theme(panel.grid.major = element_line(NA),panel.grid.minor =element_line(NA))+
theme(panel.background = element_rect(colour = "black",size=2))+
theme(plot.title = element_text(size = 30, face = "bold"))+
theme(legend.title=element_text(size=24))+
theme(legend.text=element_text(size=20))
# 最後我們將各節點數分別計算MSE
spline1$y
## [1] 0.714474448 0.711408317 0.708332040 0.705227909 0.702074825
## [6] 0.698845329 0.695506577 0.692023408 0.688355424 0.684460421
## [11] 0.680293859 0.675809336 0.670959703 0.665696557 0.659972586
## [16] 0.653741237 0.646957489 0.639577624 0.631561872 0.622872469
## [21] 0.613472881 0.603330426 0.592414368 0.580698753 0.568161939
## [26] 0.554784175 0.540550967 0.525452893 0.509483419 0.492643542
## [31] 0.474936955 0.456371502 0.436959464 0.416715775 0.395662348
## [36] 0.373823823 0.351229795 0.327914995 0.303916311 0.279275827
## [41] 0.254039664 0.228255921 0.201977687 0.175259756 0.148158988
## [46] 0.120733735 0.093043137 0.065148453 0.037111772 0.008996355
## [51] -0.019134278 -0.047216727 -0.075188474 -0.102988873 -0.130558183
## [56] -0.157838544 -0.184773308 -0.211309321 -0.237396167 -0.262984657
## [61] -0.288028515 -0.312482666 -0.336304848 -0.359456935 -0.381902592
## [66] -0.403609819 -0.424550822 -0.444700249 -0.464039174 -0.482551497
## [71] -0.500227178 -0.517062218 -0.533055344 -0.548211387 -0.562537780
## [76] -0.576047207 -0.588757152 -0.600686770 -0.611859080 -0.622301470
## [81] -0.632043252 -0.641117799 -0.649560093 -0.657408003 -0.664702038
## [86] -0.671483884 -0.677797914 -0.683689190 -0.689203044 -0.694385605
## [91] -0.699282848 -0.703938865 -0.708395447 -0.712692564 -0.716864869
## [96] -0.720944634 -0.724959038 -0.728930567 -0.732879123 -0.736817897
# ns=3
d <- c()
for (i in 1:1000) {
b <- NULL
b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
b1 <- smooth.spline(x = b,df=3)$y
d <- rbind(d,b1)
}
# MSE
bias <- c()
var <- c()
for (j in 1:100) {
bias[j] = mean(d[j,])-c[j]
var[j] = var(d[j,])
}
MSE <- c()
MSE[1] <- sum(bias^2)+sum(var) ; MSE
## [1] 80.0254
# nodes=4
d <- c()
for (i in 1:1000) {
b <- NULL
b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
b1 <- smooth.spline(x = b,df=4)$y
d <- rbind(d,b1)
}
# MSE
bias <- c()
var <- c()
for (j in 1:100) {
bias[j] = mean(d[j,])-c[j]
var[j] = var(d[j,])
}
MSE[2] <- sum(bias^2)+sum(var) ; MSE
## [1] 80.02540 85.99742
# nodes=8
d <- c()
for (i in 1:1000) {
b <- NULL
b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
b1 <- smooth.spline(x = b,df=8)$y
d <- rbind(d,b1)
}
# MSE
bias <- c()
var <- c()
for (j in 1:100) {
bias[j] = mean(d[j,])-c[j]
var[j] = var(d[j,])
}
MSE[3] <- sum(bias^2)+sum(var) ; MSE
## [1] 80.02540 85.99742 98.42961
# nodes=16
d <- c()
for (i in 1:1000) {
b <- NULL
b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
b1 <- smooth.spline(x = b,df=16)$y
d <- rbind(d,b1)
}
# MSE
bias <- c()
var <- c()
for (j in 1:100) {
bias[j] = mean(d[j,])-c[j]
var[j] = var(d[j,])
}
MSE[4] <- sum(bias^2)+sum(var) ; MSE
## [1] 80.02540 85.99742 98.42961 99.48683
# MSE比較
MSE
## [1] 80.02540 85.99742 98.42961 99.48683
# 我們發現當節點為3的時候MSE較小其曲線也較為平滑 (怪怪)
# lowess
set.seed(106354012)
a <- seq(0,2*pi, length=100)
b <- sin(a) + rnorm(100,0,0.09)
c <- sin(a)
# 計算f值為0.23
lowess(x = a, y = b, f = 0.23)
## $x
## [1] 0.00000000 0.06346652 0.12693304 0.19039955 0.25386607 0.31733259
## [7] 0.38079911 0.44426563 0.50773215 0.57119866 0.63466518 0.69813170
## [13] 0.76159822 0.82506474 0.88853126 0.95199777 1.01546429 1.07893081
## [19] 1.14239733 1.20586385 1.26933037 1.33279688 1.39626340 1.45972992
## [25] 1.52319644 1.58666296 1.65012947 1.71359599 1.77706251 1.84052903
## [31] 1.90399555 1.96746207 2.03092858 2.09439510 2.15786162 2.22132814
## [37] 2.28479466 2.34826118 2.41172769 2.47519421 2.53866073 2.60212725
## [43] 2.66559377 2.72906028 2.79252680 2.85599332 2.91945984 2.98292636
## [49] 3.04639288 3.10985939 3.17332591 3.23679243 3.30025895 3.36372547
## [55] 3.42719199 3.49065850 3.55412502 3.61759154 3.68105806 3.74452458
## [61] 3.80799110 3.87145761 3.93492413 3.99839065 4.06185717 4.12532369
## [67] 4.18879020 4.25225672 4.31572324 4.37918976 4.44265628 4.50612280
## [73] 4.56958931 4.63305583 4.69652235 4.75998887 4.82345539 4.88692191
## [79] 4.95038842 5.01385494 5.07732146 5.14078798 5.20425450 5.26772102
## [85] 5.33118753 5.39465405 5.45812057 5.52158709 5.58505361 5.64852012
## [91] 5.71198664 5.77545316 5.83891968 5.90238620 5.96585272 6.02931923
## [97] 6.09278575 6.15625227 6.21971879 6.28318531
##
## $y
## [1] 0.080996850 0.131167242 0.181498128 0.231974823 0.282592602
## [6] 0.333358109 0.384225001 0.435089782 0.485829313 0.536293354
## [11] 0.586394479 0.636417319 0.685275062 0.731675670 0.774570960
## [16] 0.813749007 0.848324236 0.877647517 0.901037529 0.918444799
## [21] 0.930810255 0.939512243 0.944997102 0.946731373 0.944775893
## [26] 0.938700830 0.928911475 0.916683271 0.902802549 0.887170332
## [31] 0.868966913 0.848022179 0.823456204 0.794822085 0.762526080
## [36] 0.726881220 0.687887607 0.645527580 0.599517042 0.550401766
## [41] 0.499086872 0.445878869 0.390986974 0.335550632 0.280263952
## [46] 0.224638884 0.168360705 0.111889093 0.055344523 -0.001540862
## [51] -0.058464533 -0.114384150 -0.168921679 -0.221611357 -0.272610930
## [56] -0.322542916 -0.372094766 -0.421232541 -0.469656094 -0.516963250
## [61] -0.562492965 -0.605821278 -0.646555566 -0.685238559 -0.722671295
## [66] -0.758665593 -0.792578152 -0.824611191 -0.855050512 -0.883277728
## [71] -0.908284203 -0.928711529 -0.943698808 -0.952864649 -0.956468893
## [76] -0.955116277 -0.949607346 -0.940016759 -0.927129255 -0.911130714
## [81] -0.891134866 -0.866768440 -0.838438199 -0.807104371 -0.773500081
## [86] -0.737929222 -0.700771927 -0.662060134 -0.621134083 -0.572504823
## [91] -0.522227790 -0.471726853 -0.421119498 -0.370340871 -0.319315641
## [96] -0.267958749 -0.216193888 -0.163951926 -0.111207817 -0.057986372
# plot
data <- lowess(x = a, y = b, f = 0.23) %>% as.data.frame()
data <- cbind(b,c,data)%>% as.data.frame()
ggplot(data,aes(x=x))+ labs(title="lowess smooth of sin(x) [h=0.1]",x="x",y="sin(x)")+
geom_point(aes(y=b))+
geom_vline(xintercept = 0,size=1)+
geom_hline(yintercept = 0,size=1)+
geom_line(aes(y=y),col="#00D0FF",lwd=1)+
scale_x_continuous(breaks = c(0:2*pi))+
theme(panel.background = element_rect(colour = "black",size=2))+
theme(panel.grid.major = element_line(NA),panel.grid.minor =element_line(NA))+
theme(plot.title = element_text(size = 30, face = "bold"))+
theme(legend.title=element_text(size=24))+
theme(legend.text=element_text(size=20))
# MSE
d <- c()
e <- c()
for(j in 1:1000){
b <- sin(a) + rnorm(100,0,0.09)
d <- ksmooth(a, b, kernel = "normal", bandwidth = 0.1)
e <- rbind(e,d$y)
}
bias <- c()
var <- c()
for(i in 1:100){
bias[i] = mean(e[i,])-c[i]
var[i] = var(e[i,])
}
MSE <- sum(bias^2)+sum(var) ; MSE
## [1] 100.1644
# running mean
set.seed(106354012)
a <- seq(0,2*pi, length=100)
b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
c <- sin(seq(0,2*pi, length=100))
mse = NULL
for (k in 1:20){
r <- running_mean(b, binwidth=k)
x = NULL
for(i in 1:(100-k+1)){
x[i] <- mean(a[i:(i+k-1)])
}
mse[k] <- mean((sin(x)-r)^2)
num = which(mse==min(mse))
}
mse
## [1] 0.007862650 0.004218592 0.002964615 0.002349078 0.002004996
## [6] 0.001715843 0.001529201 0.001419541 0.001347135 0.001330776
## [11] 0.001341624 0.001404192 0.001494370 0.001619996 0.001792292
## [16] 0.002035965 0.002335629 0.002702095 0.003136849 0.003648907
paste("the minimum mse of k is",num)
## [1] "the minimum mse of k is 10"
paste("the minimum mse is",min(mse))
## [1] "the minimum mse is 0.00133077571460659"
# plot
b <- running_mean(b, binwidth=which(mse==min(mse)))
plot(b)
model <- lm(b ~ poly(seq(0,2*pi, length=100-num+1),3))
y <- fitted.values(model)
lines(fitted.values(model)) # 模擬出來的資料線段
lines(c) # 真實的線段
data <- cbind(a[1:length(b)],b,c[1:length(b)])%>%as.data.frame()
colnames(data) <- c("x","running mean","sin(x)")
library(magrittr)
data2 <- gather(data,key = "type",value = "value",2:3)
data2$type %<>% as.factor()
ggplot(data2)+ labs(title = "Running mean of sin(x)")+
theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank())+
xlim(0,2*pi)+ ylim(-1,1) +
geom_vline(xintercept = 0,size=1)+
geom_hline(yintercept = 0,size=1)+
geom_line(mapping = aes(x=x,y=value,color=type,group=type),lwd=1.87)+
geom_point(mapping = aes(x=x,y=value),color="blue",size=1)+
theme(legend.text = element_text(size = 16))+
theme(legend.position = c(0.8,0.8))+
theme(legend.background = element_rect(size=0.5, linetype="solid",fill ="#FFFFF0",colour ="black"))+
theme(panel.background = element_rect(color='#000000',size=2))+
theme(plot.title = element_text(size = 30, face = "bold"))+
theme(legend.title=element_text(size=24))+
theme(legend.text=element_text(size=20))
# 1,000 simulation runs ( 設定k=2 )
a <- seq(0,2*pi, length=100)
b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
c <- sin(seq(0,2*pi, length=100))
d <- c()
for (i in 1:1000) {
b <- NULL
b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
b1 <- running_mean(b, binwidth=2)
d <- rbind(d,b1)
}
# MSE
bias <- c()
var <- c()
for (j in 1:100) {
bias[j] = mean(d[j,])-c[j]
var[j] = var(d[j,])
}
MSE <- sum(bias^2)+sum(var) ; MSE
## [1] 100.6761