# Question 01

Experiment with as many variance reduction techniques as you can think of, applied to the problem of evaluating \(P(X>1)\) for \(X \sim \text{Cauchy}(0,1)\).
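The target has a closed form, which makes this a convenient benchmark:

\[
P(X>1) = \frac{1}{2} - \int_0^1 \frac{dx}{\pi(1+x^2)} = \frac{1}{2} - \frac{\arctan 1}{\pi} = \frac{1}{4}.
\]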

library(dplyr)
library(ggplot2)
library(kableExtra)
library(knitr)
knitr::opts_chunk$set(echo = TRUE)
options(knitr.table.format = "html")
options(scipen = 100)
set.seed(15) # 3 + 12
# X ~ Cauchy(0,1)
# P(X > 1)
1 - pcauchy(1)
## [1] 0.25

Plot the target region.

x = seq(0,3,by = 0.001)
fx = dcauchy(x)
data <- data.frame(x = x, Cauchy = dcauchy(x))
data <- mutate(data,new = ifelse(x > 1, "blue", "white"))
temp1 <- data[1:1000,]
temp2 <- data[1001:length(x),]

\(P(0 < X \le 1)\)

ggplot()+
  geom_line(data = data,aes(x = x, y = Cauchy))+
  geom_vline(xintercept = 1,linetype = 2)+
  geom_ribbon(data = temp1, aes(x = x,ymin = 0, ymax = Cauchy), fill = "#FF8888",alpha = 0.5)+
  geom_ribbon(data = temp2, aes(x = x,ymin = 0, ymax = Cauchy), fill = "white", alpha = 0.7)+
  labs(title = "P(0 < X <= 1)")

\(P(X > 1)\)

ggplot()+
  geom_line(data = data,aes(x = x, y = Cauchy))+
  geom_vline(xintercept = 1,linetype = 2)+
  geom_ribbon(data = temp1, aes(x = x,ymin = 0, ymax = Cauchy), fill = "white",alpha = 0.5)+
  geom_ribbon(data = temp2, aes(x = x,ymin = 0, ymax = Cauchy), fill = "#FF8888", alpha = 0.7)+
  labs(title = "P(X > 1)")

## Monte-Carlo Integration

N = 1000
cauchy <- function(x){
  return(1/(pi*(1+x^2)))
}
theta.hat <- NULL
for(i in 1:1000){
  # P(X > 1) = 0.5 - integral of the density over (0, 1),
  # estimated by evaluating the density at Uniform(0,1) draws
  x <- runif(N)
  x <- cauchy(x)
  theta.hat[i] <- 0.5 - mean(x)}
Mc.mean <- mean(theta.hat)
Mc.var <- var(theta.hat)

## Hit or miss 1

N = 1000
theta.hat <- NULL
for ( i in 1:1000){
x <- rcauchy(N)
theta.hat[i] <- mean(x > 1)}
Hm1.mean <- mean(theta.hat)
Hm1.var <- var(theta.hat)

## Hit or miss 2

N = 1000
theta.hat = NULL
for(i in 1:1000){
  x <- rcauchy(N) %>% abs()
  theta.hat[i] <- 0.5*mean(x > 1)
}
Hm2.mean <- mean(theta.hat)
Hm2.var <- var(theta.hat)

## Hit or miss 3
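Estimators 2 and 3 both rest on the symmetry of the Cauchy density \(f\) about zero:

\[
P(X>1) = \frac{1}{2}\,P(|X|>1) = \frac{1 - 2\int_0^1 f(x)\,dx}{2},
\]

where estimator 2 applies the first identity to draws of \(|X|\), and estimator 3 applies the second to a uniform-sample estimate of \(\int_0^1 f\).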

N = 1000
theta.hat <- NULL
cauchy <- function(x){
  return(1/(pi*(1+x^2)))}
for(i in 1:1000){
  x <- runif(N)
  theta.hat[i] <- (1- mean(2*cauchy(x)))/2
  }
Hm3.mean <- mean(theta.hat)
Hm3.var <- var(theta.hat)

## Antithetic Variate
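Pairing \(U\) with \(1-U\) gives negatively correlated replicates whenever the integrand \(g\) is monotone:

\[
\operatorname{Var}\!\left(\frac{g(U)+g(1-U)}{2}\right) = \frac{\operatorname{Var}\big(g(U)\big)}{2}\,(1+\rho),
\qquad \rho = \operatorname{Corr}\big(g(U),\,g(1-U)\big) \le 0.
\]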

N = 1000
theta.hat <- NULL
for( i in 1:1000){
  x <- runif(N/2)
  y <- 1 - x            # antithetic counterpart of x
  temp1 <- cauchy(x)
  temp2 <- cauchy(y)
  theta.hat[i] <- 0.5 - 0.5*(mean(temp1)+mean(temp2))}
AV.mean <- mean(theta.hat)
AV.var <- var(theta.hat)

## Importance Sampling
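The sampler below uses the proposal \(X = 1/U\), \(U \sim \text{Unif}(0,1)\), whose density is \(h(x) = x^{-2}\) on \((1,\infty)\). The importance weight is then

\[
\frac{f(x)}{h(x)} = \frac{x^2}{\pi(1+x^2)} = \frac{1}{\pi(1+x^{-2})},
\]

which is exactly the function `hes()` defined next.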

hes <- function(x){
  return(1/(pi*(1 + x^(-2))))}
N = 1000
theta.hat <- NULL
for(i in 1:1000){
  U <- runif(N)
  x <- 1/U
  theta.hat[i] <- mean(hes(x))
}
Is.mean <- mean(theta.hat)
Is.var <- var(theta.hat)

## Control variate

Take \(f(x)=\frac{1}{1+x}\) as the control function; its mean under \(U\sim\text{Unif}(0,1)\) is \(\int_0^1 \frac{dx}{1+x} = \ln 2\).
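For reference, the control-variate estimator of \(E[A]\) with a control \(B\) of known mean is

\[
\hat\theta = \overline{A + c^*\big(B - E[B]\big)}, \qquad c^* = -\frac{\operatorname{Cov}(A,B)}{\operatorname{Var}(B)},
\]

with \(c^*\) estimated from a pilot sample, as in the code below.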

f <- function(x){
  return(1/(1+x))
  }
cauchy <- function(x){
  return(1/(pi*(1+x^2)))
}
theta.hat = NULL
for(i in 1:N){
  # pilot sample to estimate c*
  u <- runif(1000)
  B <- f(u)
  A <- cauchy(u)
  a <- -cov(A,B) / var(B)   # estimate of c*

  # P(X > 1) = 0.5 - E[cauchy(U)]; apply the control (mean log 2) to the mean estimate
  x <- runif(N)
  T1 <- cauchy(x)
  theta.hat[i] <- 0.5 - mean(T1 + a * (f(x) - log(2)))
}
Cv.mean <- mean(theta.hat)
Cv.var <- var(theta.hat)

## Stratified Sampling

Split \((0,1)\) into five equal-width strata.
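With strata of width \(0.2\) and \(\bar g_j\) the sample mean of the density over stratum \(j\), the estimator is

\[
\hat\theta = \frac{1}{2} - \sum_{j=1}^{5} 0.2\,\bar g_j = \frac{1}{2} - \frac{1}{5}\sum_{j=1}^{5}\bar g_j,
\]

which is what averaging the five values \(S_j = 0.5 - \bar g_j\) below computes.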

cauchy <- function(x){
  return(1/(pi*(1+x^2)))
}

SS = NULL
for(i in 1:N){
x1 <- runif(N/5, 0, 0.2)
x2 <- runif(N/5, 0.2, 0.4)
x3 <- runif(N/5, 0.4, 0.6)
x4 <- runif(N/5, 0.6, 0.8)
x5 <- runif(N/5, 0.8, 1)
S1 <- 0.5 - mean(cauchy(x1))
S2 <- 0.5 - mean(cauchy(x2))
S3 <- 0.5 - mean(cauchy(x3))
S4 <- 0.5 - mean(cauchy(x4))
S5 <- 0.5 - mean(cauchy(x5))
SS[i] <- mean(c(S1, S2, S3, S4, S5))
}
SS.mean <- mean(SS)
SS.var <- var(SS)

## Summary

tmp1 <- c(Mc.mean, Hm1.mean, Hm2.mean, Hm3.mean, AV.mean, Is.mean, Cv.mean, SS.mean)
tmp2 <- c(Mc.var, Hm1.var, Hm2.var, Hm3.var, AV.var, Is.var, Cv.var, SS.var)
table <- rbind(tmp1, tmp2)
rownames(table) <- c("Mean","Variance")
colnames(table) <- c("Monte-Carlo", "Hit or miss 1", "Hit or miss 2", "Hit or miss 3", "Antithetic",
                     "Importance", "Control", "Stratified(5)")
temp <- table %>% round(., digits = 8)
kable(temp) %>% kable_styling() 
|          | Monte-Carlo | Hit or miss 1 | Hit or miss 2 | Hit or miss 3 | Antithetic | Importance | Control | Stratified(5) |
|----------|------------:|--------------:|--------------:|--------------:|-----------:|-----------:|--------:|--------------:|
| Mean     | 0.2499791 | 0.2498500 | 0.2497935 | 0.2499957 | 0.2499947 | 0.2499771 | 0.2498210 | 0.2500137 |
| Variance | 0.0000027 | 0.0001932 | 0.0000607 | 0.0000029 | 0.0000000 | 0.0000024 | 0.0000098 | 0.0000001 |

# Question 02

Hammersley and Handscomb (1964) used the integral \(\theta = \int_{0}^{1} \frac{e^x-1}{e-1}\,dx\) on \((0,1)\) (approximately 0.4180233) as a test problem for variance reduction techniques. Achieve as large a variance reduction as you can. (They achieved a reduction factor of about 4 million.)
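The exact value follows by direct integration:

\[
\theta = \frac{1}{e-1}\int_0^1 \big(e^x - 1\big)\,dx = \frac{e-2}{e-1} \approx 0.4180233.
\]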

set.seed(15) # 3 + 12
# define function
Q2 <- function(x){
 y <- (exp(x)-1)/(exp(1)-1)
 return(y)}

x = seq(0,1,by = 0.001)
fx = Q2(x)
data <- data.frame(x = x, fx = Q2(x))
ggplot()+
  geom_line(data = data,aes(x = x, y = fx))+
  geom_vline(xintercept = 1,linetype = 2)+
  geom_ribbon(data = data, aes(x = x,ymin = 0, ymax = fx), fill = "#00FF00",alpha = 0.7)+
  labs(title = "Question 02")

## Monte-Carlo Integration

N = 1000
theta.hat = NULL
for(i in 1:N){
x <- runif(N)
theta.hat[i] <- mean(Q2(x))}
Mc.mean <- mean(theta.hat)
Mc.var <-var(theta.hat)

## Antithetic Variate

N = 1000
theta.hat = NULL
for(i in 1:N){
x <- runif(N/2)
y <- 1 - x
temp1 <- mean(Q2(x))
temp2 <- mean(Q2(y))
theta.hat[i] <- 0.5*(temp1+temp2)}
AV.mean <- mean(theta.hat)
AV.var <- var(theta.hat)

## Importance Sampling

theta.hat1 = NULL
theta.hat2 = NULL
theta.hat3 = NULL
theta.hat4 = NULL
for( i in 1:N){
  
  u <- runif(N)     # proposal f3(x) = exp(-x)/(1 - exp(-1)) on (0,1), inverse transform
  x <- - log(1 - u * (1 - exp(-1)))
  fg <- Q2(x) / (exp(-x) / (1 - exp(-1)))
  theta.hat1[i] <- mean(fg)
  
  u <- runif(N/2)     # f3, inverse transform + antithetic variates
  v <- 1 - u
  x1 <- - log(1 - u * (1 - exp(-1)))
  x2 <- - log(1 - v * (1 - exp(-1)))
  fg1 <- Q2(x1) / (exp(-x1) / (1 - exp(-1)))
  fg2 <- Q2(x2) / (exp(-x2) / (1 - exp(-1)))
  fg <- 0.5*(fg1+fg2)
  theta.hat2[i] <- mean(fg)

  u <- runif(N)    # proposal f4(x) = 4/(pi*(1 + x^2)) on (0,1), inverse transform
  x <- tan(pi * u / 4)
  fg <- Q2(x) / (4 / ((1 + x^2) * pi))
  theta.hat3[i] <- mean(fg)

  u <- runif(N/2)    # f4, inverse transform + antithetic variates
  v <- 1 - u
  x1 <- tan(pi * u / 4)
  x2 <- tan(pi * v / 4)
  fg1 <- Q2(x1) / (4 / ((1 + x1^2) * pi))
  fg2 <- Q2(x2) / (4 / ((1 + x2^2) * pi))
  fg <- 0.5*(fg1+fg2)
  theta.hat4[i] <- mean(fg)
}

mean11 <- c(mean(theta.hat1 ),mean(theta.hat2 ),mean(theta.hat3 ),mean(theta.hat4 ))
var11 <- c(var(theta.hat1 ),var(theta.hat2 ),var(theta.hat3 ),var(theta.hat4))
Is.mean1 <- mean(theta.hat3 )
Is.mean2 <- mean(theta.hat4)
Is.var1 <- var(theta.hat3)
Is.var2 <- var(theta.hat4)

## Control variate

Take \(f(x) = x\) as the control function, with \(E[f(U)] = \tfrac{1}{2}\) for \(U \sim \text{Unif}(0,1)\).

f <- function(x){
  x}
theta.hat = NULL
for( i in 1:N){
  u <- runif(10000)
  B <- f(u)
  A <- Q2(u)
  a <- -cov(A,B) / var(B)
  u <- runif(N)
  T1 <- Q2(u)
  T2 <- T1 + a * (f(u) - 1/2)
  theta.hat[i] <- mean(T2)
  }
Cv.mean <- mean(theta.hat)
Cv.var <- var(theta.hat)

## Stratified Sampling

Q2 <- function(x){
  y <- (exp(x)-1)/(exp(1)-1)
  return(y)}
N <- 1000
SS = NULL
for(i in 1:N){
  x1 <- runif(N/5, 0, 0.2)
  x2 <- runif(N/5, 0.2, 0.4)
  x3 <- runif(N/5, 0.4, 0.6)
  x4 <- runif(N/5, 0.6, 0.8)
  x5 <- runif(N/5, 0.8, 1)
  S1 <- mean(Q2(x1))
  S2 <- mean(Q2(x2))
  S3 <- mean(Q2(x3))
  S4 <- mean(Q2(x4))
  S5 <- mean(Q2(x5))
  SS[i] <- mean(c(S1, S2, S3, S4, S5))
}
SS.mean <- mean(SS)
SS.var <- var(SS)

## Summary

tmp1 <- c(Mc.mean, AV.mean, Is.mean1,Is.mean2, Cv.mean, SS.mean)
tmp2 <- c(Mc.var, AV.var, Is.var1, Is.var2, Cv.var, SS.var)
table <- rbind(tmp1, tmp2)
rownames(table) <- c("Mean","Variance")
colnames(table) <- c("Monte-Carlo", "Antithetic", "Importance", "Importance+Antithetic", "Control", "Stratified(5)")
temp <- table %>% round(., digits = 8)
temp %>% kable() %>% kable_styling()
|          | Monte-Carlo | Antithetic | Importance | Importance+Antithetic | Control | Stratified(5) |
|----------|------------:|-----------:|-----------:|----------------------:|--------:|--------------:|
| Mean     | 0.4178798 | 0.4180536 | 0.4182496 | 0.4180618 | 0.4180334 | 0.4180431 |
| Variance | 0.0000845 | 0.0000028 | 0.0001548 | 0.0000414 | 0.0000013 | 0.0000034 |

# Question 03

Let \(X_i,\ i=1,2,3,4,5\), be independent exponential random variables, each with mean 1, and consider the quantity \(\theta\) defined by \(\theta = P(\sum_{i=1}^{5} iX_i \geq 21.6)\). Propose at least three simulation methods to estimate \(\theta\) and compare their variances.

## Deriving the density of \(\sum_{i=1}^{5} iX_i\)

# density of S = sum_{i=1}^{5} i*X_i (hypoexponential; see the formula below)
Q3 <- function(x){
  (312.5/60)*exp((-1/5)*x)-
    (640/60)*exp((-1/4)*x)+
    (405/60)*exp((-1/3)*x)+
    (1/24)*exp((-1)*x)+
    (-4/3)*exp((-1/2)*x)
}
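A sketch of where `Q3` comes from: \(iX_i \sim \text{Exp}(\lambda_i)\) with rate \(\lambda_i = 1/i\), and a sum of independent exponentials with distinct rates has the hypoexponential density

\[
f_S(x) = \sum_{i=1}^{5} \Bigg(\prod_{j \ne i} \frac{\lambda_j}{\lambda_j - \lambda_i}\Bigg)\,\lambda_i\, e^{-\lambda_i x}, \qquad x > 0.
\]

With \(\lambda_i = 1/i\) the coefficients reduce to the constants in `Q3` (for instance, the \(e^{-x/5}\) term has coefficient \(125/24 = 312.5/60\)), and the target is \(\theta = 1 - \int_0^{21.6} f_S(x)\,dx\).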

## Hit or miss

set.seed(15) # 3 + 12
temp2 <- NULL
for(j in 1:1000){
  temp <- NULL
  for( i in 1:1000){
    tmp <- rexp(5)
    temp[i] <- sum(tmp*c(1:5))}
  temp2[j] <- sum(temp>21.6)/1000
}
Hm.mean <- mean(temp2)
Hm.var <- var(temp2)

## Monte-Carlo Integration

N <- 1000
theta.hat <- NULL
for(i in 1:1000){
  x <- runif(N,0,21.6)
  theta.hat[i] <-1 -  mean(Q3(x)*21.6)
}
Mc.mean <- mean(theta.hat)
Mc.var <- var(theta.hat)

## Antithetic Variate

N <- 1000
theta.hat <- NULL
for(i in 1:1000){
  x <- runif(N/2,0,21.6)
  y <- 21.6 - x
  theta.hat[i] <- (2 - mean(Q3(x)*21.6) - mean(Q3(y)*21.6))*0.5
}
Av.mean <- mean(theta.hat)
Av.var <- var(theta.hat)

## Stratified Sampling

N <- 1000
theta.hat <- NULL
for(i in 1:1000){
  tmp <- c()
  for(j in 1:5){
    x <- runif(N/5,(j - 1)/5*21.6,j/5*21.6)
    tmp <- c(tmp, Q3(x)*21.6)
  }
  theta.hat[i] <- 1 - mean(tmp)
}
SS.mean <- mean(theta.hat)
SS.var <- var(theta.hat)

## Summary

table.mean <- c(Mc.mean, Av.mean, SS.mean, Hm.mean)
table.var <- c(Mc.var, Av.var, SS.var, Hm.var)
table <- rbind(table.mean, table.var)
rownames(table) <- c("mean", "var")
colnames(table) <- c("Monte", "Antithetic", "Stratified", "Hit")
temp <- table %>% round(., digits = 8)
temp %>% kable() %>% kable_styling()
|      | Monte | Antithetic | Stratified | Hit |
|------|------:|-----------:|-----------:|----:|
| mean | 0.1681060 | 0.1673069 | 0.1689417 | 0.1681690 |
| var  | 0.0001978 | 0.0002972 | 0.0000200 | 0.0001311 |

# Question 04

First, simulate 100 observations from \(\text{Beta}(2,3)\), and then use three density estimation methods to smooth the observations. You need to specify the parameters in the smoothing methods and compare the results.
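The naive and kernel estimators used below share the general form

\[
\hat f_h(x) = \frac{1}{nh}\sum_{i=1}^{n} K\!\left(\frac{x - X_i}{h}\right),
\]

where the naive estimator is the special case \(K(y) = \tfrac{1}{2}\,\mathbf{1}\{|y|<1\}\) and the bandwidth \(h\) controls the smoothness.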

library(dplyr)
library(ggplot2)
library(magrittr)
library(tidyverse)

## Shiuan

# First, simulate 100 observations from Beta(2,3). Then, use at least
# 3 density estimating methods to smooth the observations.
# You need to specify the parameters in the smoothing methods,
# and compare the results.

# draw the random sample
set.seed(106354012)
m = 100
x = rbeta(m,2,3)
# dotplot of the observations
stripchart(x,pch=16,cex=0.5,col=3,main="Dotplot")

# histogram of the sample
y = seq(0,1,length=100)
hist( x, breaks = 10,probability = T,ylim = c(0,3),xlim= c(0,1),col = "#AAAAAA",main = "Density Estimation (h=0.1)")
# true Beta(2,3) density
lines(y , dbeta(y,2,3) , col = "#00AAFF" , lwd = 3)
legend("topright" , "Beta(2,3)" , lty = 1 , col="#00AAFF" , lwd = 3)
# kernel density estimates from density()
kernel_g = density(x, kernel = c("gaussian"), width = 0.1)
kernel_r = density(x, kernel = c("rectangular"), width = 0.1)
kernel_t = density(x, kernel = c("triangular"), width = 0.1)
# overlay the three kernel estimates
lines(kernel_g, col = "#FF0000" , lwd = 2)
lines(kernel_r, col = "#00FF00" , lwd = 2)
lines(kernel_t, col = "#FFFF00" , lwd = 2)
legend("right",legend=c("gaussian","rectangular","triangular"), col=c("#FF0000","#00FF00","#FFFF00"),lwd=2,cex=1)
box()
box()

# same comparison with h = 0.2
hist( x, breaks = 5,probability = T,ylim = c(0,3),xlim= c(0,1),col = "#AAAAAA",main = "Density Estimation (h=0.2)")
# true Beta(2,3) density
lines(y , dbeta(y,2,3) , col = "#00AAFF" , lwd = 3)
legend("topright" , "Beta(2,3)" , lty = 1 , col="#00AAFF" , lwd = 3)
# kernel density estimates from density()
kernel_g = density(x, kernel = c("gaussian"), width = 0.2)
kernel_r = density(x, kernel = c("rectangular"), width = 0.2)
kernel_t = density(x, kernel = c("triangular"), width = 0.2)
# overlay the three kernel estimates
lines(kernel_g, col = "#FF0000" , lwd = 2)
lines(kernel_r, col = "#00FF00" , lwd = 2)
lines(kernel_t, col = "#FFFF00" , lwd = 2)
legend("right",legend=c("gaussian","rectangular","triangular"), col=c("#FF0000","#00FF00","#FFFF00"),lwd=2,cex=1)
box()

## Cao

# hand-rolled versions of the estimators (no density())

set.seed(106354012)
x <- rbeta(100,2,3)
h=0.1

# histogram density estimator

histogram = function(x,h){
  g = function(x,a,b){
    if (x<=b & x>=a) {return(1)}
    else {return(0)}}            # indicator of x falling in [a, b]
  n <- length(x)
  seq <- seq(min(x),max(x),by=h) # cut points spaced h apart over the sample range
  a = seq[-length(seq)]          # lower bin edges
  b = seq[-1]                    # upper bin edges
  ni = NULL
  for (k in 1:length(a)){
    ni[k] = sum(x<=b[k] & x>=a[k])}
  y_hat = NULL
  for (i in sort(x)){
    I=NULL
    for (j in 1:length(a)){
      gi=g(i,a[j],b[j])
      I=c(I,gi)}                 # indicator vector over the bins
    y=1/n*sum(ni/h*I)            # density estimate at this sample point
    y_hat=c(y_hat,y)}
  return(y_hat)}

# naive density estimator
naive =function(x,h){
  w = function(y){
    if (abs(y)<1) {return(1/2)}
    else {return(0)}}               # weight function w of the naive estimator
  n = length(x)
  seq = seq(min(x),max(x),length=length(x))
  y_hat = NULL
  for (i in seq){
    W=NULL
    for (j in x){
      wi=w((i-j)/h)
      W=c(W,wi)}
    y=1/n*sum(1/h*W)                # density estimate at this grid point
    y_hat=c(y_hat,y)}
  return(y_hat)}

# kernel density estimator
# Gaussian kernel
kernel_norm=function(x,h){
  w=function(y){dnorm(y)}           # standard normal kernel
  n = length(x)
  seq = seq(min(x),max(x),length=length(x))
  y_hat = NULL
  for (i in seq){
    W=NULL
    for (j in x){
      wi=w((i-j)/h)
      W=c(W,wi)}
    y=1/n*sum(1/h*W)
    y_hat=c(y_hat,y)}
  return(y_hat)}

# Gamma kernel
kernel_gamma=function(x,h){
  w=function(y){dgamma(y,shape = 1)}           # Gamma(shape = 1) kernel
  n = length(x)
  seq = seq(min(x),max(x),length=length(x))
  y_hat = NULL
  for (i in seq){
    W=NULL
    for (j in x){
      wi=w((i-j)/h)
      W=c(W,wi)}
    y=1/n*sum(1/h*W)
    y_hat=c(y_hat,y)}
  return(y_hat)}

# evaluate the estimators on the sample (h = 0.1; other bandwidths also work)
y1 <- histogram(x,0.1)
y2 <- naive(x,0.1)
y3 <- kernel_norm(x,0.1)
y4 <- kernel_gamma(x,0.1)
# plot (h = 0.1)
xx <- sort(x)
yy <- dbeta(xx,2,3)
y2 <- naive(xx,0.1)
y3 <- kernel_norm(xx,0.1)
y4 <- kernel_gamma(xx,0.1)
data <- cbind(xx,yy,y2,y3,y4) %>% as.data.frame()
colnames(data) <- c("sample","Beta(2,3)","naive","kernel(norm)","kernel(gamma)")
LBJ <- gather(data,key = "type",value = "value",2:5)
colnames(LBJ) <- c("sample","LineType","value")  
library(magrittr)
LBJ$LineType %<>% as.factor()
library(ggplot2)
ggplot(data = LBJ) + labs(title = "Density Estimation (h=0.1)")+
  xlim(0,1)+ ylim(0,2.5) +
  geom_histogram(mapping = aes(x=sample,y=..density..),color="black",fill="gray",binwidth = 0.1)+
  geom_line(mapping = aes(x=sample,y=value,color=LineType,group=LineType),size=1.2)+
  theme(legend.title = element_text(colour="royalblue", size=20, face="bold"))+
  theme(legend.text = element_text(size = 16))+
  theme(legend.position = c(0.87,0.6))+
  theme(legend.background = element_rect(fill="#FFFFF0",size=1, linetype="solid",colour ="darkblue"))+
  theme(panel.grid.major = element_blank())+
  theme(panel.grid.minor = element_blank())+
  theme(panel.background = element_rect(fill="#EEFFFF",colour="black",size = 2))

# plot (h = 0.2)
xx <- sort(x)
yy <- dbeta(xx,2,3)
y1 <- histogram(xx,0.2)
y2 <- naive(xx,0.2)
y3 <- kernel_norm(xx,0.2)
y4 <- kernel_gamma(xx,0.2)
data <- cbind(xx,yy,y2,y3,y4) %>% as.data.frame()
colnames(data) <- c("sample","Beta(2,3)","naive","kernel(norm)","kernel(gamma)")
LBJ <- gather(data,key = "type",value = "value",2:5)
colnames(LBJ) <- c("sample","LineType","value")  
library(magrittr)
LBJ$LineType %<>% as.factor()
plot(sort(x),y1,typ="l",xlab="x",ylab="f(x)",xlim=c(0,1), ylim=c(0,3))

library(ggplot2)
ggplot(data = LBJ) + labs(title = "Density Estimation (h=0.2)")+
  xlim(0,1)+ ylim(0,2.5) +
  geom_histogram(mapping = aes(x=sample,y=..density..),color="black",fill="gray",binwidth = 0.2)+
  geom_line(mapping = aes(x=sample,y=value,color=LineType,group=LineType),size=1.2)+
  theme(legend.title = element_text(colour="royalblue", size=20, face="bold"))+
  theme(legend.text = element_text(size = 16))+
  theme(legend.position = c(0.87,0.6))+
  theme(legend.background = element_rect(fill="#FFFFF0",size=1, linetype="solid",colour ="darkblue"))+
  theme(panel.grid.major = element_blank())+
  theme(panel.grid.minor = element_blank())+
  theme(panel.background = element_rect(fill="#EEFEFF",colour="black",size = 2))

# Question 05

Let \(x\) be 100 equally spaced points on \([0, 2\pi]\) and let \(y_i = \sin x_i + \epsilon_i\) with \(\epsilon_i \sim N(0, 0.09)\). Apply at least 3 linear smoothers and compare the differences with respect to mean squared error (i.e., \(\text{bias}^2\) and variance) from 1,000 simulation runs.
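Throughout this question the MSE is accumulated over the 100 grid points: with \(\hat f^{(r)}(x_j)\) the smoothed value at \(x_j\) in run \(r\),

\[
\text{MSE} = \sum_{j=1}^{100} \Big[\big(\bar{\hat f}(x_j) - \sin x_j\big)^2 + \widehat{\operatorname{Var}}\big(\hat f(x_j)\big)\Big],
\]

where the mean and variance at each grid point are taken across the 1,000 runs.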

library(plyr)
library(dplyr)
library(tidyverse)
library(ggplot2)
library(KernSmooth)
library(broom)
library(magrittr)
library(igraph)

## Kernel smooth
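`ksmooth` computes the Nadaraya-Watson kernel regression estimate

\[
\hat f(x) = \frac{\sum_i K\big(\tfrac{x - x_i}{h}\big)\, y_i}{\sum_i K\big(\tfrac{x - x_i}{h}\big)},
\]

here with a normal kernel and bandwidth \(h = 0.1\).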

# kernel smooth
set.seed(106354012)

a <- seq(0,2*pi, length=100)
b <- sin(a) + rnorm(100,0,0.09)
c <- sin(a)

# kernel smooth (kernel is norm)
data <- ksmooth(a, b, kernel = "normal", bandwidth = 0.1)%>% as.data.frame()
data <- cbind(b,c,data)%>% as.data.frame()
ggplot(data,aes(x=x))+ labs(title="kernel smooth of sin(x) [h=0.1]",x="x",y="sin(x)")+
  geom_point(aes(y=b),col="#3333FF")+
  geom_vline(xintercept = 0,size=1)+
  geom_hline(yintercept = 0,size=1)+
  geom_line(aes(y=y),col="#00D0FF",lwd=1)+
  scale_x_continuous(breaks = c(0:2*pi))+
  theme(panel.grid.major = element_line(NA),panel.grid.minor =element_line(NA))+
  theme(panel.background = element_rect(color = "black",size = 2))+
  theme(plot.title = element_text(size = 30, face = "bold"))+
  theme(legend.title=element_text(size=24))+
  theme(legend.text=element_text(size=20))

# MSE: rows of e are simulation runs, columns are the 100 grid points
d <- c()
e <- c()
for(j in 1:1000){
  b <- sin(a) + rnorm(100,0,0.09)
  d <- ksmooth(a, b, kernel = "normal", bandwidth = 0.1)
  e <- rbind(e,d$y)
}
bias <- c()
var <- c()
for(i in 1:100){
  bias[i] = mean(e[,i])-c[i]   # pointwise bias across runs
  var[i] = var(e[,i])          # pointwise variance across runs
}
MSE <- sum(bias^2)+sum(var) ; MSE

## Spline smooth
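A smoothing spline minimizes the penalized least-squares criterion

\[
\sum_{i=1}^{n} \big(y_i - g(x_i)\big)^2 + \lambda \int g''(t)^2\, dt,
\]

with the penalty \(\lambda\) specified here through the equivalent degrees of freedom `df`.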

# spline

# plot
df_values <- c(3, 4, 8, 16)

x = seq(from=0, to=2*pi, length=100)
f_x = sin(x)
epsilon = rnorm(100, 0, sd = 0.3)
y = f_x + epsilon

values <- tibble(
  x = seq(from=0, to=2*pi, length=100),
  f_x = sin(x),
  epsilon = rnorm(100, 0, sd = 0.3),
  y = f_x + epsilon
)
overall <- NULL
for(df in df_values){
  overall <- smooth.spline(values$x, values$y, df=df) %>%
    augment() %>%
    mutate(df=df) %>%
    bind_rows(overall)
}
overall <- cbind(overall,f_x) %>% as.data.frame()
multiple_df <- overall %>% 
  ggplot(aes(x=x)) +
  geom_point(aes(y=y))+
  geom_line(aes(y=.fitted,color="#123456"),size=1) +
  facet_wrap(~df, nrow=2) +
  labs(title="Splines fit w / different degrees of freedom")+
  theme(legend.title=element_blank())+
  theme(legend.text = element_blank())+
  theme(panel.background = element_rect(colour = "black"))+
  geom_vline(xintercept = 0,size=1)+
  geom_hline(yintercept = 0,size=1)+
  theme(panel.grid.major = element_line(NA),panel.grid.minor =element_line(NA))+
  theme(panel.background = element_rect(color='#000000',size=2))+
  theme(plot.title = element_text(size = 30, face = "bold"))+
  theme(legend.title=element_text(size=24))+
  theme(legend.text=element_text(size=20))

multiple_df

get.spline.info<-function(object){
  data.frame(x=object$x,y=object$y,df=object$df)
}

# plot

a <- seq(0,2*pi, length=100)
b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
c <- sin(seq(0,2*pi, length=100))

spline1 <- smooth.spline(x = a, y = b, df = 3)
spline2 <- smooth.spline(x = a, y = b, df = 4)
spline3 <- smooth.spline(x = a, y = b, df = 8)
spline4 <- smooth.spline(x = a, y = b, df = 16)

data<-ldply(list(spline1,spline2,spline3,spline4),get.spline.info)
data <- cbind(a,b,c,data) %>% as.data.frame()
ggplot(data,aes(x=a))+ labs(title="Spline smoother of sin(x)") +
  geom_line(aes(y=c),col="gray",size=2)+
  geom_point(aes(y=b))+
  geom_vline(xintercept = 0,size=1)+
  geom_hline(yintercept = 0,size=1)+
  geom_line(mapping = aes(y=y,color=factor(round(df,0)),group=df),lwd=1.2)+
  scale_color_discrete("Degrees of freedom")+
  theme(legend.position = c(0.8,0.8))+
  theme(legend.background = element_rect(fill="#FFFFF0",colour = "black"))+
  theme(panel.grid.major = element_line(NA),panel.grid.minor =element_line(NA))+
  theme(panel.background = element_rect(colour = "black",size=2))+
  theme(plot.title = element_text(size = 30, face = "bold"))+
  theme(legend.title=element_text(size=24))+
  theme(legend.text=element_text(size=20))

# finally, compute the MSE for each df over 1,000 runs

# df = 3
d <- c()
for (i in 1:1000) {
  b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
  b1 <- smooth.spline(x = a, y = b, df = 3)$y
  d <- rbind(d,b1)
}
# MSE: columns of d are grid points, rows are simulation runs
bias <- c()
var <- c()
for (j in 1:100) {
  bias[j] = mean(d[,j])-c[j]
  var[j] = var(d[,j])
}
MSE <- c()
MSE[1] <- sum(bias^2)+sum(var)
# df = 4
d <- c()
for (i in 1:1000) {
  b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
  b1 <- smooth.spline(x = a, y = b, df = 4)$y
  d <- rbind(d,b1)
}
# MSE
bias <- c()
var <- c()
for (j in 1:100) {
  bias[j] = mean(d[,j])-c[j]
  var[j] = var(d[,j])
}
MSE[2] <- sum(bias^2)+sum(var)
# df = 8
d <- c()
for (i in 1:1000) {
  b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
  b1 <- smooth.spline(x = a, y = b, df = 8)$y
  d <- rbind(d,b1)
}
# MSE
bias <- c()
var <- c()
for (j in 1:100) {
  bias[j] = mean(d[,j])-c[j]
  var[j] = var(d[,j])
}
MSE[3] <- sum(bias^2)+sum(var)
# df = 16
d <- c()
for (i in 1:1000) {
  b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
  b1 <- smooth.spline(x = a, y = b, df = 16)$y
  d <- rbind(d,b1)
}
# MSE
bias <- c()
var <- c()
for (j in 1:100) {
  bias[j] = mean(d[,j])-c[j]
  var[j] = var(d[,j])
}
MSE[4] <- sum(bias^2)+sum(var)
# compare the MSEs across df: the smallest value identifies the df
# that best balances bias and variance
MSE

## Lowess smooth
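At each point, `lowess` fits a weighted linear regression to the nearest fraction \(f\) of the data, using tricube weights \(w_i = (1 - |d_i|^3)^3\) on the distances scaled to the window, followed by a few robustifying iterations; larger \(f\) gives a smoother curve.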

# lowess
set.seed(106354012)

a <- seq(0,2*pi, length=100)
b <- sin(a) + rnorm(100,0,0.09)
c <- sin(a)

# use span f = 0.23
# plot
data <- lowess(x = a, y = b, f = 0.23) %>% as.data.frame()
data <- cbind(b,c,data)%>% as.data.frame()
ggplot(data,aes(x=x))+ labs(title="lowess smooth of sin(x) [f=0.23]",x="x",y="sin(x)")+
  geom_point(aes(y=b))+
  geom_vline(xintercept = 0,size=1)+
  geom_hline(yintercept = 0,size=1)+
  geom_line(aes(y=y),col="#00D0FF",lwd=1)+
  scale_x_continuous(breaks = c(0:2*pi))+
  theme(panel.background = element_rect(colour = "black",size=2))+
  theme(panel.grid.major = element_line(NA),panel.grid.minor =element_line(NA))+
  theme(plot.title = element_text(size = 30, face = "bold"))+
  theme(legend.title=element_text(size=24))+
  theme(legend.text=element_text(size=20))

# MSE (smooth each simulated data set with lowess, matching the plot above)
d <- c()
e <- c()
for(j in 1:1000){
  b <- sin(a) + rnorm(100,0,0.09)
  d <- lowess(x = a, y = b, f = 0.23)
  e <- rbind(e,d$y)
}
bias <- c()
var <- c()
for(i in 1:100){
  bias[i] = mean(e[,i])-c[i]
  var[i] = var(e[,i])
}
MSE <- sum(bias^2)+sum(var) ; MSE

## Running mean smooth
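The running mean with window size \(k\) averages each block of \(k\) consecutive observations,

\[
\hat y_i = \frac{1}{k}\sum_{j=0}^{k-1} y_{i+j}, \qquad i = 1,\dots,n-k+1,
\]

which we evaluate at the window-center \(x\) values.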

# running mean
set.seed(106354012)

a <- seq(0,2*pi, length=100)
b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
c <- sin(seq(0,2*pi, length=100))

mse = NULL
for (k in 1:20){
  r <- running_mean(b, binwidth=k)  # length 100 - k + 1
  x = NULL
  for(i in 1:(100-k+1)){
    x[i] <- mean(a[i:(i+k-1)])      # window-center x value
  }
  mse[k] <- mean((sin(x)-r)^2)
}
num = which(mse==min(mse))
mse
##  [1] 0.007862650 0.004218592 0.002964615 0.002349078 0.002004996
##  [6] 0.001715843 0.001529201 0.001419541 0.001347135 0.001330776
## [11] 0.001341624 0.001404192 0.001494370 0.001619996 0.001792292
## [16] 0.002035965 0.002335629 0.002702095 0.003136849 0.003648907
paste("the minimum mse of k is",num)
## [1] "the minimum mse of k is 10"
paste("the minimum mse is",min(mse))
## [1] "the minimum mse is 0.00133077571460659"
# plot

b <- running_mean(b, binwidth=which(mse==min(mse)))
plot(b)
model <- lm(b ~ poly(seq(0,2*pi, length=100-num+1),3))
y <- fitted.values(model)
lines(fitted.values(model))  # cubic polynomial fit to the smoothed values
lines(c)                     # true curve

data <- cbind(a[1:length(b)],b,c[1:length(b)])%>%as.data.frame()
colnames(data) <- c("x","running mean","sin(x)")
library(magrittr)
data2 <- gather(data,key = "type",value = "value",2:3)
data2$type %<>% as.factor()
ggplot(data2)+ labs(title = "Running mean of sin(x)")+
  theme(panel.grid.major=element_blank(),panel.grid.minor=element_blank())+
  xlim(0,2*pi)+ ylim(-1,1) +
  geom_vline(xintercept = 0,size=1)+
  geom_hline(yintercept = 0,size=1)+
  geom_line(mapping = aes(x=x,y=value,color=type,group=type),lwd=1.87)+
  geom_point(mapping = aes(x=x,y=value),color="blue",size=1)+
  theme(legend.text = element_text(size = 16))+
  theme(legend.position = c(0.8,0.8))+
  theme(legend.background = element_rect(size=0.5, linetype="solid",fill ="#FFFFF0",colour ="black"))+
  theme(panel.background = element_rect(color='#000000',size=2))+
  theme(plot.title = element_text(size = 30, face = "bold"))+
  theme(legend.title=element_text(size=24))+
  theme(legend.text=element_text(size=20))

# 1,000 simulation runs (window size k = 2)

a <- seq(0,2*pi, length=100)
c <- sin(seq(0,2*pi, length=100))
xc <- (a[1:99] + a[2:100])/2          # window-center x values for k = 2
d <- c()
for (i in 1:1000) {
  b <- sin(seq(0,2*pi, length=100)) + rnorm(100,0,0.09)
  b1 <- running_mean(b, binwidth=2)   # length 99
  d <- rbind(d,b1)
}
# MSE: compare each window (column) with the sine at its center
bias <- c()
var <- c()
for (j in 1:99) {
  bias[j] = mean(d[,j])-sin(xc[j])
  var[j] = var(d[,j])
}
MSE <- sum(bias^2)+sum(var) ; MSE