A Brief Overview
One of the most fundamental examples of almost sure convergence is provided by the Glivenko-Cantelli theorem. To state the result, let us first define the Kolmogorov-Smirnov distance between c.d.f.s \(F\) and \(G\) as
\[ d_K\left(F,G\right)=\underset{t\in\mathbb{R}}{\sup}\left|F\left(t\right)-G\left(t\right)\right| \]
Suppose \(X_1,X_2,\ldots,X_n\) are i.i.d. real-valued random variables with c.d.f. \(F\). Let \(\hat{F}_n\) be the empirical c.d.f. defined by
\[ \hat{F}_n\left(t\right)=\frac{1}{n}\sum_{i=1}^{n}\boldsymbol{\mathrm{1}}\left[X_i\leq t\right] \]
Then,
\[ d_K\left(\hat{F}_n,F\right)\rightarrow0\,\,\,\,\,\,a.s. \]
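Before automating the simulation, it is worth computing \(d_K\left(\hat{F}_n,F\right)\) once directly. For continuous \(F\) the supremum is attained at the data points, where \(\hat{F}_n\) jumps from \(\frac{i-1}{n}\) to \(\frac{i}{n}\). A minimal standalone sketch (the N(0,1) sample and the helper name dK are illustrative choices, not part of the simulation code that follows):
#---Minimal sketch: d_K(F_n, F) evaluated at the order statistics,
#---comparing F to i/n (at X_(i)) and (i-1)/n (just before X_(i)).
set.seed(42)
x <- sort(rnorm(1000))
n <- length(x)
dK <- max(pmax(abs((1:n)/n - pnorm(x)), abs((0:(n-1))/n - pnorm(x))))
dK  # small already at n = 1000, and it shrinks as n grows, as the theorem predicts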
To investigate this empirically, we first create a function in R, named sampler, which generates samples from a distribution specified by the user.
#---Packages used throughout. (Assumption: rlaplace/rrayleigh and their
#---CDFs plaplace/prayleigh come from the extraDistr package.)---
library(ggplot2)
library(plyr)
library(gridExtra)
library(extraDistr)

#---Generates a sample of a given size from the named distribution.
#---Extra parameters are passed through '...' to the underlying r* function.
#---Poisson and geometric carry fixed parameters (lambda=60, prob=0.01)
#---to match the CDF wrappers ppois2/pgeom2 defined below.
sampler=function(dist,size=1,...){
  sample=switch(dist,
    normal      = rnorm(size,...),
    uniform     = runif(size,...),
    poisson     = rpois(size,lambda=60,...),
    cauchy      = rcauchy(size,...),
    exponential = rexp(size,...),
    gamma       = rgamma(size,...),
    beta        = rbeta(size,...),
    laplace     = rlaplace(size,...),
    binomial    = rbinom(size,...),
    geometric   = rgeom(size,prob=0.01,...),
    logistic    = rlogis(size,...),
    lognormal   = rlnorm(size,...),
    t           = rt(size,...),
    chisq       = rchisq(size,...),
    f           = rf(size,...),
    rayleigh    = rrayleigh(size,...),
    stop("Unsupported distribution")
  )
  return(sample)
}
#---One-argument CDF wrappers so ks.test() can be given a single string;
#---the parameters match those fixed inside sampler().
ppois2=function(x){return(ppois(x,lambda=60))}
pgeom2=function(x){return(pgeom(x,prob=0.01))}
#---Returns (as a string) the name of the CDF matching each distribution,
#---in the form ks.test() expects.
indsamp=function(dist){
  sample=switch(dist,
    normal      = "pnorm",
    uniform     = "punif",
    poisson     = "ppois2",
    cauchy      = "pcauchy",
    exponential = "pexp",
    gamma       = "pgamma",
    beta        = "pbeta",
    laplace     = "plaplace",
    binomial    = "pbinom",
    geometric   = "pgeom2",
    logistic    = "plogis",
    lognormal   = "plnorm",
    t           = "pt",
    chisq       = "pchisq",
    f           = "pf",
    rayleigh    = "prayleigh",
    stop("Unsupported distribution")
  )
  return(sample)
}

#---For each distribution, computes the KS distance between the empirical
#---and true c.d.f. for every sample size from 1 to n, and plots them.
GCPlot=function(n=1000,distvec=c("normal","cauchy","laplace","lognormal"))
{
vec=NULL
dKS=function(x,den="pnorm"){return(ks.test(x,den)$statistic)}
for(j in 1:length(distvec))
{
for(i in 1:n)
{
vec=c(vec,dKS(sampler(dist=distvec[j],size=i),den=indsamp(distvec[j])))
}
}
DataMat=data.frame("Values"=vec,"x"=rep(1:n,times=length(distvec)),"Distribution"=as.factor(rep(distvec,each=n)))
ggplot(data=DataMat,mapping=aes(x=x,y=Values,group=Distribution,col=Distribution))+geom_line()+geom_hline(yintercept=0,col="grey",lwd=0.7)+
ggtitle("Plotting KS Distance w.r.t sample sizes \n for different distributions.")+labs(x="Sample Size",y="Kolmogorov Smirnov Distance")
}
GCPlot(n=2000,distvec=c("normal","cauchy","laplace","lognormal",'rayleigh','uniform','poisson','exponential','geometric'))

Suppose we have a random sample \(X_1,X_2,\ldots,X_n\) from some unknown population. We want to check whether it can be regarded as a random sample from a population with a continuous distribution function \(F_0\) that is completely specified.
Hence we set our null hypothesis as,
\[ \mathcal{H}_0:F\left(x\right)=F_0\left(x\right)\,\,\forall x\in \mathbb{R} \]
We can consider several alternative hypotheses of the form,
\[ \mathcal{H}_1:F\left(x\right)\neq F_0\left(x\right)\,\,\ \mathrm{for\,\,some}\,\, x\in \mathbb{R} \]
\[ \mathcal{H}_2:F\left(x\right)\leq F_0\left(x\right)\,\,\forall x\in \mathbb{R} ,F\left(x\right) < F_0\left(x\right)\,\,\ \mathrm{for\, \, some}\,\, x\in \mathbb{R} \]
\[ \mathcal{H}_3:F\left(x\right)\geq F_0\left(x\right)\,\,\forall x\in \mathbb{R} ,F\left(x\right) > F_0\left(x\right)\,\,\ \mathrm{for\, \, some}\,\, x\in \mathbb{R} \]
For this, we define \[ D_n=d_K\left(\hat{F}_n,F_0\right)\]
\[ D_n^+=\underset{x\in\mathbb{R}}{\sup}\left(\hat{F}_n\left(x\right)-F_0\left(x\right)\right) \]
\[ D_n^-=\underset{x\in\mathbb{R}}{\sup}\left(F_0\left(x\right)-\hat{F}_n\left(x\right)\right) \]
We reject \(\mathcal{H}_0\) in favour of \(\mathcal{H}_1\), \(\mathcal{H}_2\) or \(\mathcal{H}_3\) for large values of \(D_n\), \(D_n^-\) and \(D_n^+\) respectively.
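All three statistics are available in R from ks.test() via its alternative argument: "two.sided" returns \(D_n\), "greater" returns \(D_n^+\) and "less" returns \(D_n^-\). A quick sketch (the shifted normal sample is an illustrative choice):
#---Sketch: extracting Dn, Dn+ and Dn- from ks.test()---
set.seed(1)
x <- rnorm(50, mean = 0.3)                             # illustrative data
ks.test(x, "pnorm")$statistic                          # Dn
ks.test(x, "pnorm", alternative = "greater")$statistic # Dn+ = sup(Fn - F0)
ks.test(x, "pnorm", alternative = "less")$statistic    # Dn- = sup(F0 - Fn)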
#---Suppress warnings (ks.test warns about ties for discrete distributions)---
options(warn=-1)
#---Generates observations on Dn---
KSSampler=function(dist,size,iter=100,...)
{
sam=NULL
for(i in 1:iter)
{
x=sampler(dist=dist,size=size,...)
sam=c(sam,ks.test(x,indsamp(dist))$statistic)
}
names(sam)=NULL
return(sam)
}
#---Generates density plot----
KSDensity=function(size=100,iter=1000,distvec=c("normal","cauchy","laplace","lognormal","rayleigh",'uniform'))
{
values=NULL
for(j in 1:length(distvec))
{
values=c(values,KSSampler(dist=distvec[j],size=size,iter=iter))
}
#---Matrix Containing KS Statistic values for different Distributions---
DataMat=data.frame("Values"=values,"Distribution"=rep(distvec,each=iter))
ggplot(data=DataMat,aes(x=Values,col=Distribution,fill=Distribution))+geom_density(alpha=0.5,lwd=0.5,col="black")+
theme_minimal()+labs(y="Density",x=expression(D[n]),title=paste("n =",size),caption=paste("#iterations = ",iter))+
theme(plot.title = element_text(family="Playfair",hjust = 1, size = 16, face = "bold.italic"))
}
gd = grid.arrange(KSDensity(size=5)+labs(tag = "Fig.1"),KSDensity(size=10)+labs(tag = "Fig.2"),KSDensity(size=20)+labs(tag = "Fig.3"),KSDensity(size=50)+labs(tag = "Fig.4"),nrow=2,top=paste("Distribution under", expression(H[0])))

Here we took \(\mathcal{H}_0:\mathcal{N}\left(0,1\right)\).
KSHSampler=function(nulldist,altdist,size,iter=100,...)
{
sam=NULL
for(i in 1:iter)
{
x=sampler(dist=altdist,size=size,...)
sam=c(sam,ks.test(x,indsamp(nulldist))$statistic)
}
names(sam)=NULL
return(sam)
}
KSHDensity=function(size=100,iter=1000,nulldist='normal',distvec=c("cauchy","laplace","lognormal","rayleigh",'uniform'))
{
values=NULL
for(j in 1:length(distvec))
{
values=c(values,KSHSampler(nulldist=nulldist,altdist=distvec[j],size=size,iter=iter))
}
#---Matrix Containing KS Statistic values for different Distributions---
DataMat=data.frame("Values"=values,"Distribution"=rep(distvec,each=iter))
ggplot(data=DataMat,aes(x=Values,col=Distribution,fill=Distribution))+geom_density(alpha=0.5,lwd=0.5,col="black")+
theme_minimal()+labs(y="Density",x=expression(D[n]),title=paste("n =",size),caption=paste("#iterations = ",iter))+
theme(plot.title = element_text(family="Playfair",hjust = 1, size = 16, face = "bold.italic"))
}
gd2 = grid.arrange(KSHDensity(size=5)+labs(tag = "Fig.1"),KSHDensity(size=10)+labs(tag = "Fig.2"),KSHDensity(size=50)+labs(tag = "Fig.3"),KSHDensity(size=100)+labs(tag = "Fig.4"),nrow=2,top=paste("Distribution under", expression(H[1])))

Here we restrict ourselves to a location family, i.e. \(X_1,X_2,\ldots,X_n\overset{iid}{\sim}F\left(x-\theta\right)\), and we are testing,
\[ \mathcal{H}_0:\theta=0\,\,\mathrm{against}\,\,\mathcal{H}_1:\theta\neq0 \]
So, the power function would be,
\[ \Psi\left(\theta\right)=\mathbb{P}\left(\mathrm{reject}\,\,\mathcal{H}_0\,\middle|\,\theta\right) \]
#---Estimates the power: the proportion of 'iter' simulated samples
#---(each of the given size) for which H0 is rejected at the given level.
KSPower=function(iter=1000,size=20,dist="normal",level=0.05,...)
{
  sampmat=matrix(sampler(dist=dist,size=iter*size,...),ncol=size)
  return(mean(apply(sampmat,MARGIN=1,FUN=function(x){ks.test(x,indsamp(dist=dist))$p.value < level})))
}
thetaseq=seq(from=-0.75,to=0.75,length.out=1000)
#-----Normal(theta,1)-----
#---H0:N(0,1) vs. H1:N(theta,1)----
pseq=aaply(thetaseq,1,.fun=function(x){KSPower(mean=x)})
#-----Cauchy(\theta,1)-----
#---H0:C(0,1) vs. H1:C(theta,1)----
Cauchyfunc=function(x){KSPower(dist="cauchy",location=x)}
pseq2=aaply(thetaseq,1,.fun=Cauchyfunc)
#------Laplace(\theta,1)-----
#-----H0:L(0,1) vs. H1:L(theta,1)----
Lapfunc=function(x){KSPower(dist="laplace",mu=x)}
pseq3=aaply(thetaseq,1,.fun=Lapfunc)
#------Lognormal(theta,1)-----
#-----H0:LN(0,1) vs. H1:LN(theta,1)----
Lfunc=function(x){KSPower(dist="lognormal",meanlog=x)}
pseq4=aaply(thetaseq,1,.fun=Lfunc)
distvec=c("normal","cauchy","laplace","lognormal")
pmat=data.frame("Theta"=rep(thetaseq,times=length(distvec)),"Power"=c(pseq,pseq2,pseq3,pseq4),"Distribution"=rep(distvec,each=length(thetaseq)))
#----ggplot---
ggplot(data=pmat,aes(x=Theta,y=Power,group=Distribution,col=Distribution))+geom_line(lwd=0.5)+geom_hline(yintercept=0.05,col="grey",lwd=0.7)+ggtitle("Power Curves for Different Location Families")+
scale_color_manual(values = c("normal"="violet","cauchy"="tomato","laplace"="#9090EE90","lognormal"="#87CEFA"))+labs(caption = "#iterations=1000,size=20")+xlab(expression(theta))+ylab("Power")pseq=aaply(thetaseq,1,.fun=function(x){KSPower(mean=x,size=100)})
#-----Cauchy(\theta,1)-----
#---H0:C(0,1) vs. H1:C(theta,1)----
Cauchyfunc=function(x){KSPower(dist="cauchy",location=x,size=100)}
pseq2=aaply(thetaseq,1,.fun=Cauchyfunc)
#------Laplace(\theta,1)-----
#-----H0:L(0,1) vs. H1:L(theta,1)----
Lapfunc=function(x){KSPower(dist="laplace",mu=x,size=100)}
pseq3=aaply(thetaseq,1,.fun=Lapfunc)
#------Lognormal(theta,1)-----
#-----H0:LN(0,1) vs. H1:LN(theta,1)----
Lfunc=function(x){KSPower(dist="lognormal",meanlog=x,size=100)}
pseq4=aaply(thetaseq,1,.fun=Lfunc)
distvec=c("normal","cauchy","laplace","lognormal")
pmat=data.frame("Theta"=rep(thetaseq,times=length(distvec)),"Power"=c(pseq,pseq2,pseq3,pseq4),"Distribution"=rep(distvec,each=length(thetaseq)))
#----ggplot---
ggplot(data=pmat,aes(x=Theta,y=Power,group=Distribution,col=Distribution))+geom_line(lwd=0.5)+geom_hline(yintercept=0.05,col="grey",lwd=0.7)+ggtitle("Power Curves for Different Location Families")+
scale_color_manual(values = c("normal"="violet","cauchy"="tomato","laplace"="#9090EE90","lognormal"="#87CEFA"))+labs(caption = "#iterations=1000,size=100")+xlab(expression(theta))+ylab("Power")Similarly here we restrict ourselves within a location family. i.e. \(X_1,X_2,\ldots,X_n\overset{iid}{\sim}F\left(\frac{x}{\sigma}\right);\sigma>0\) and we are testing,
\[ \mathcal{H}_0:\sigma=1\,\,\mathrm{against}\,\,\mathcal{H}_1:\sigma\neq1 \]
So, the power function would be,
\[ \Psi\left(\sigma\right)=\mathbb{P}\left(\mathrm{reject}\,\,\mathcal{H}_0\,\middle|\,\sigma\right) \]
distvec2=c("normal","cauchy","laplace","lognormal","exponential")
thetaseq2=seq(from=0.2,to=3.5,length.out=2000)
pseq5=aaply(thetaseq2,1,.fun=function(s){KSPower(dist="normal",sd=s,iter=1000,size=20)})
pseq6=aaply(thetaseq2,1,.fun=function(s){KSPower(dist="cauchy",scale=s,iter=1000,size=20)})
pseq7=aaply(thetaseq2,1,.fun=function(s){KSPower(dist="laplace",sigma=s,iter=1000,size=20)})
pseq8=aaply(thetaseq2,1,.fun=function(s){KSPower(dist="lognormal",sdlog=s,iter=1000,size=20)})
pseq9=aaply(thetaseq2,1,.fun=function(s){KSPower(dist="exponential",rate=s,iter=1000,size=20)})
pmat2=data.frame("Sigma"=rep(thetaseq2,times=length(distvec2)),"Power"=c(pseq5,pseq6,pseq7,pseq8,pseq9),"Distribution"=rep(distvec2,each=length(thetaseq2)))
ggplot(data=pmat2,aes(x=Sigma,y=Power,group=Distribution,col=Distribution))+geom_line(lwd=0.7)+geom_hline(yintercept=0.05,col="grey",lwd=0.7)+ggtitle("Power Curves for Different Scale Families")+
scale_color_manual(values = c("normal"="violet","cauchy"="tomato","laplace"="#90EE90","lognormal"="#87CEFA","exponential"="darkgrey"))+xlim(0.2,3)+labs(caption = "#iterations=1000,size=20")+xlab(expression(sigma))+ylab("Power")pseq52=aaply(thetaseq2,1,.fun=function(s){KSPower(dist="normal",sd=s,iter=1000,size=100)})
pseq62=aaply(thetaseq2,1,.fun=function(s){KSPower(dist="cauchy",scale=s,iter=1000,size=100)})
pseq72=aaply(thetaseq2,1,.fun=function(s){KSPower(dist="laplace",sigma=s,iter=1000,size=100)})
pseq82=aaply(thetaseq2,1,.fun=function(s){KSPower(dist="lognormal",sdlog=s,iter=1000,size=100)})
pseq92=aaply(thetaseq2,1,.fun=function(s){KSPower(dist="exponential",rate=s,iter=1000,size=100)})
pmat22=data.frame("Sigma"=rep(thetaseq2,times=length(distvec2)),"Power"=c(pseq52,pseq62,pseq72,pseq82,pseq92),"Distribution"=rep(distvec2,each=length(thetaseq2)))
ggplot(data=pmat22,aes(x=Sigma,y=Power,group=Distribution,col=Distribution))+geom_line(lwd=0.7)+geom_hline(yintercept=0.05,col="grey",lwd=0.7)+ggtitle("Power Curves for Different Scale Families")+
scale_color_manual(values = c("normal"="violet","cauchy"="tomato","laplace"="#90EE90","lognormal"="#87CEFA","exponential"="darkgrey"))+xlim(0.2,3)+labs(caption = "#iterations=1000,size=100")+xlab(expression(sigma))+ylab("Power")KSPower2=function(nulldist,altdist,iter=1000,size=100,level=0.05,...)
{
sampmat=matrix(sampler(dist=altdist,size=iter*size,...),ncol=size)
return(mean(apply(sampmat,MARGIN=1,FUN = function(x){return(ks.test(x,indsamp(nulldist))$p.value < level)})))
}
#---Cross-comparison: rows = null distribution, columns = true (alternative) distribution---
distvec2=c(distvec,"rayleigh")
names(distvec2)=distvec2
obj=adply(distvec2,1,.fun=function(x){return(aaply(distvec2,1,.fun=function(y){return(KSPower2(nulldist=x,altdist=y))}))})
rownames(obj)=distvec2
obj[1]=NULL
matplot(t(obj),type="l",xlim=c(1,7),xaxt="n",ylab="Power",main="Power Comparisons for Different Families of Distributions")
legend("right",distvec2,col=1:5,lty=1:5,title="NullDist")
axis(side=1,at=1:5,labels=distvec2,las=3)
box(lwd=3)

#---One-sided version: uses alternative="less", i.e. rejects for large Dn- ---
KSPower3=function(nulldist,altdist,iter=1000,size=100,level=0.05,...)
{
sampmat=matrix(sampler(dist=altdist,size=iter*size,...),ncol=size)
return(mean(apply(sampmat,MARGIN=1,FUN = function(x){return(ks.test(x,indsamp(nulldist),alternative = "less")$p.value < level)})))
}
distvec3=c("normal","cauchy","lognormal","laplace")
pseq10=aaply(thetaseq,1,.fun=function(x){KSPower3(nulldist='normal',altdist='normal',mean=x)})
pseq11=aaply(thetaseq,1,.fun=function(x){KSPower3(nulldist='cauchy',altdist='cauchy',location=x)})
pseq12=aaply(thetaseq,1,.fun=function(x){KSPower3(nulldist='lognormal',altdist='lognormal',meanlog=x)})
pseq13=aaply(thetaseq,1,.fun=function(x){KSPower3(nulldist='laplace',altdist='laplace',mu=x)})
pmat3=data.frame("Theta"=rep(thetaseq,times=length(distvec3)),"Power"=c(pseq10,pseq11,pseq12,pseq13),"Distribution"=rep(distvec3,each=length(thetaseq)))
ggplot(data=pmat3,aes(x=Theta,y=Power,group=Distribution,col=Distribution))+geom_line()+ggtitle("Power Curves for Different Location Families (One Sided Alternative)",subtitle = "1000 simulations of size 100")+
xlab(expression(theta))+geom_hline(yintercept = 0.05,col='grey',lwd=0.7)+geom_vline(xintercept = 0,col='grey',lwd=0.7)

#---Same one-sided test, now over scale parameters---
pseq14=aaply(thetaseq2,1,.fun=function(x){KSPower3(nulldist='normal',altdist='normal',sd=x)})
pseq15=aaply(thetaseq2,1,.fun=function(x){KSPower3(nulldist='cauchy',altdist='cauchy',scale=x)})
pseq16=aaply(thetaseq2,1,.fun=function(x){KSPower3(nulldist='lognormal',altdist='lognormal',sdlog=x)})
pseq17=aaply(thetaseq2,1,.fun=function(x){KSPower3(nulldist='logistic',altdist='logistic',scale=x)})
pseq18=aaply(thetaseq2,1,.fun=function(x){KSPower3(nulldist='exponential',altdist='exponential',rate=x)})
distvec4=c('normal','cauchy','lognormal','logistic','exponential')
pmat4=data.frame("Theta"=rep(thetaseq2,times=length(distvec4)),"Power"=c(pseq14,pseq15,pseq16,pseq17,pseq18),"Distribution"=rep(distvec4,each=length(thetaseq2)))
ggplot(data=pmat4,aes(x=Theta,y=Power,group=Distribution,col=Distribution))+geom_line()+ggtitle("Power Curves for Different Scale Families (One Sided Alternative)",subtitle = "1000 simulations of size 100")+
xlab(expression(sigma))+geom_hline(yintercept = 0.05,col='grey',lwd=0.7)+geom_vline(xintercept = 1,col='grey',lwd=0.7)

A confidence band is a random band that covers the unknown distribution function for all \(x\) simultaneously, with a preassigned probability:
\[ \left(1-\alpha\right)=\mathbb{P}\left(D_n\leq d_{n,\alpha}\right) \]
\[ =\mathbb{P}\left[\hat{F}_n\left(x\right)-d_{n,\alpha}\leq F\left(x\right)\leq \hat{F}_n\left(x\right)+d_{n,\alpha}\,\,\forall x \in \mathbb{R}\right] \]
In practice, since a c.d.f. must lie in \([0,1]\), the band is truncated to
\[ L_n\left(x\right)=\max \left\{0,\hat{F}_n\left(x\right)-d_{n,\alpha}\right\}\,,\,U_n\left(x\right)=\min \left\{1,\hat{F}_n\left(x\right)+d_{n,\alpha}\right\} \]
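The code below obtains \(d_{n,\alpha}\) by brute-force simulation. For large \(n\) one could instead use the asymptotic approximation \(d_{n,\alpha}\approx K_{1-\alpha}/\sqrt{n}\), where \(K_{1-\alpha}\) is the \(\left(1-\alpha\right)\) quantile of the limiting distribution of \(\sqrt{n}D_n\); for \(\alpha=0.05\), \(K_{0.95}\approx1.358\). A sketch (the helper name dAsymp is ours, and the constant is specific to this level):
#---Sketch (asymptotic, alpha = 0.05 only): band half-width from the
#---Kolmogorov limit; 1.358 is the well-known 95% point of that limit.
dAsymp <- function(n) 1.358 / sqrt(n)
dAsymp(100)  # ~ 0.136; compare with the simulated qKS(p = 0.95, ...) below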
#---Simulated c.d.f. of Dn: estimates P(Dn <= x)---
KSCDF=function(x,size,iter,dist)
{
  sampmat=matrix(sampler(dist=dist,size=iter*size),ncol=size)
  return(mean(apply(sampmat,1,FUN = function(y){return(ks.test(y,indsamp(dist))$statistic < x)})))
}
#---Approximate p-quantile of Dn: the largest grid point whose simulated
#---c.d.f. is still below p---
qKS=function(p,dist,size=100,iter=1000)
{
  y=seq(from=0,to=2,by=0.01)
  prob=aaply(y,1,.fun = function(x){return(KSCDF(x=x,dist=dist,iter=iter,size=size))})
  return(max(y[prob < p]))
}
#---Empirical c.d.f. of the sample 'sam' at the point x---
ECDF=function(x,sam)
{
  return(mean(sam <= x))
}
#---Confidence band endpoints [L,U] at a single point x. Note that d depends
#---only on level, size and iter, so it could be computed once outside the
#---plotting loop below to save simulation time.
confband=function(x,sam,level,dist,size=100,iter=1000)
{
d=qKS(p=1-level,dist=dist,size=size,iter=iter)
f=ECDF(x=x,sam=sam)
vec=c(max(0,f-d),min(1,f+d))
names(vec)=c("L","U")
return(vec)
}
#---------Plotting the Confidence Bands---
xseq=seq(from=-4,to=4,length.out=50)
sam=rnorm(100)
df=adply(xseq,1,.fun=function(z){return(confband(x=z,sam=sam,level=0.05,dist='normal'))})
df[1]=NULL
ggplot()+geom_function(fun=pnorm,lwd=1,col='red')+xlim(-4,4)+geom_segment(aes(x=xseq,xend=xseq,y=L,yend=U),col='darkgrey',data=cbind(df,xseq))+
geom_point(data=df,aes(x=xseq,y=L))+geom_point(data=df,aes(x=xseq,y=U))+geom_line(aes(x=xseq,y=L),data=df)+geom_line(aes(x=xseq,y=U),data=df)+ggtitle("Confidence band for N(0,1) CDF")+xlab("x")+ylab("Cumulative Probability")+labs(caption = "#iterations = 1000, sample size = 100")

In case \(F\) is continuous,
\[ \underset{n\rightarrow \infty}{\lim}\mathbb{P}\left(D_n^+\leq\frac{z}{\sqrt{n}}\right)=1-e^{-2z^2}\,;\,z>0 \]
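This limit also yields a closed-form asymptotic critical value: setting \(1-e^{-2z^2}=1-\alpha\) gives \(z_\alpha=\sqrt{\frac{1}{2}\log\frac{1}{\alpha}}\). A sketch (the helper name critDnPlus is ours):
#---Sketch: asymptotic one-sided critical value; reject H0 against H3
#---when Dn+ exceeds z_alpha/sqrt(n), with z_alpha = sqrt(log(1/alpha)/2).
critDnPlus <- function(n, alpha = 0.05) sqrt(log(1/alpha)/2) / sqrt(n)
critDnPlus(100)  # ~ 0.122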
#---Empirical estimate of P(Dn+ <= z/sqrt(n)) from 'iter' simulated samples---
DnCDF=function(z,n,dist,iter=1000)
{
sampmat=matrix(sampler(dist=dist,size=iter*n),nc=n)
vec=aaply(sampmat,1,.fun=function(x){return(ks.test(x,indsamp(dist),alternative="greater")$statistic < z/sqrt(n))})
return(mean(vec))
}
r=seq(0,4,length.out=1000)
n=c(10,100,500,1000)
df2=sapply(n,FUN=function(x){aaply(r,1,.fun=function(y){return(DnCDF(z=y,n=x,dist='normal'))})})
df2=data.frame(df2)
df3=data.frame("r"=rep(r,times=length(n)),"val"=c((df2[,1]),(df2[,2]),(df2[,3]),(df2[,4])),"n"=as.factor(rep(n,each=length(r))))
#--True Function--
f=function(x){return(1-exp(-2*(x^2)))}
ggplot(data=df3,aes(x=r,y=val,col=n))+geom_point()+geom_line(aes(x=r,y=f(r)),col="black",lwd=0.7)+xlim(0,2)+xlab("x")+ylab("Sample Proportions")+ggtitle("Limiting CDF of Dn+")

In case \(F\) is continuous,
\[ 4n\left(D_n^+\right)^2 \overset{\mathcal{D}}{\longrightarrow}\chi^2_{2}\,\,\mathrm{as}\,\,n\rightarrow \infty \]
This follows from the previous limit: \(\mathbb{P}\left(4n\left(D_n^+\right)^2\leq w\right)=\mathbb{P}\left(\sqrt{n}D_n^+\leq\frac{\sqrt{w}}{2}\right)\rightarrow1-e^{-w/2}\), which is exactly the \(\chi^2_2\) c.d.f.
#---Simulates m values of 4n(Dn+)^2, each computed from a sample of size n,
#---and overlays the chi-squared(2) density on their histogram.
Dn=function(m=1000,n,dist='normal',...)
{
  sampmat=matrix(sampler(dist=dist,size=m*n,...),ncol=n)
  vec=apply(sampmat,MARGIN=1,FUN = function(x){4*n*((ks.test(x,indsamp(dist),alternative="greater")$statistic)^2)})
  ggplot(data.frame("vec"=vec),aes(x=vec))+geom_histogram(aes(y=after_stat(density)),alpha=0.5,binwidth=1,fill="#69b3a2",col="#e9ecef")+
    geom_function(fun=function(x){dchisq(x,df=2)},col="orange")
}
Dn(m=1000,n=150000)+xlab("Values")+ylab("Frequency Density")+ggtitle("Histogram of the Generated sample \n for large values of n")+labs(caption="#iterations = 1000, sample size = 150000")

In case \(F\) is continuous,
\[ \underset{n\rightarrow \infty}{\lim}\mathbb{P}\left(D_n\leq\frac{z}{\sqrt{n}}\right)=L\left(z\right)\,;\,z>0 \]
where,
\[ L\left(z\right)=1-2\sum_{i=1}^{\infty}\left(-1\right)^{i-1}e^{-2i^2z^2}\,;\,z>0 \]
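The series converges very fast (terms decay like \(e^{-2i^2z^2}\)), so a modest truncation is effectively exact away from \(z=0\). A sketch (the helper name Lz, the 100-term cutoff and the small-\(z\) guard are pragmatic choices of ours) that can be overlaid on the simulated CDFs produced below:
#---Truncated Kolmogorov c.d.f. L(z); near z = 0 the series converges
#---slowly, but L(z) -> 0 there, so we return 0 directly.
Lz <- function(z, terms = 100) {
  if (z <= 0.05) return(0)
  i <- 1:terms
  1 - 2 * sum((-1)^(i - 1) * exp(-2 * i^2 * z^2))
}
Lz <- Vectorize(Lz, "z")
#---e.g. add the true limit to the plot below:
#---gg2 + geom_line(aes(x = r, y = Lz(r)), col = "black")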
#---Empirical estimate of P(Dn <= z/sqrt(n)) from 'iter' simulated samples---
DnCDF2=function(z,n,dist,iter=1000)
{
sampmat=matrix(sampler(dist=dist,size=iter*n),nc=n)
vec=aaply(sampmat,1,.fun=function(x){return(ks.test(x,indsamp(dist))$statistic < z/sqrt(n))})
return(mean(vec))
}
r=seq(0,4,length.out=1000)
n2=c(10,50,100)
df4=sapply(n2,FUN=function(x){aaply(r,1,.fun=function(y){return(DnCDF2(z=y,n=x,dist='normal'))})})
df4=data.frame(df4)
df5=data.frame("r"=rep(r,times=length(n2)),"val"=c((df4[,1]),(df4[,2]),(df4[,3])),"n"=as.factor(rep(n2,each=length(r))))
gg2=ggplot(data=df5,aes(x=r,y=val,group=n,col=n))+geom_point()+
ggtitle(label=expression(paste("Limiting CDF of ",D[n])),subtitle="# iterations = 1000")+xlab("z")+ylab(expression(paste("CDF of ",sqrt(n),D[n])))+xlim(0,3)
gg2

Indian Statistical Institute, Delhi.