Practice_set_8.2

Practice Set 8.2

library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.3.3
Warning: package 'tibble' was built under R version 4.3.3
Warning: package 'tidyr' was built under R version 4.3.3
Warning: package 'readr' was built under R version 4.3.3
Warning: package 'purrr' was built under R version 4.3.3
Warning: package 'dplyr' was built under R version 4.3.3
Warning: package 'stringr' was built under R version 4.3.3
Warning: package 'forcats' was built under R version 4.3.3
Warning: package 'lubridate' was built under R version 4.3.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(purrrfect)

Attaching package: 'purrrfect'

The following objects are masked from 'package:base':

    replicate, tabulate
library(knitr)
Warning: package 'knitr' was built under R version 4.3.3
library(kableExtra)
Warning: package 'kableExtra' was built under R version 4.3.3

Attaching package: 'kableExtra'

The following object is masked from 'package:dplyr':

    group_rows

Problem 5

Creating coverage dataframe

(q5df <- parameters(~n, c(5, 10, 20, 40, 80, 160,320))
           %>% add_trials(10000)
           %>% mutate(Ys = map(n,\(n) rbeta(n,2,1)))
           %>% mutate(Ybar = map_dbl(Ys, mean), S2 = map_dbl(Ys, var),Ymax = map_dbl(Ys,max))
           %>% mutate(UCL_exact = log(.025)/(n*log(Ymax)), 
                      LCL_exact = log(.975)/(n*log(Ymax)),
                      LCL_asymptotic = -(Ybar - 1.96*sqrt(S2/n))/(Ybar - 1.96*sqrt(S2/n) - 1), 
                      UCL_asymptotic = -(Ybar + 1.96*sqrt(S2/n))/(Ybar + 1.96*sqrt(S2/n) - 1))
           %>% mutate(exact_covers = ifelse(LCL_exact < 2 & UCL_exact > 2, 1, 0))
           %>%mutate(asymptotic_covers = ifelse(LCL_asymptotic < 2 & UCL_asymptotic > 2, 1, 0))
) %>% head
# A tibble: 6 × 12
      n .trial Ys         Ybar      S2  Ymax UCL_exact LCL_exact LCL_asymptotic
  <dbl>  <dbl> <list>    <dbl>   <dbl> <dbl>     <dbl>     <dbl>          <dbl>
1     5      1 <dbl [5]> 0.797 0.0427  0.947     13.5     0.0927          1.60 
2     5      2 <dbl [5]> 0.781 0.00594 0.893      6.55    0.0450          2.48 
3     5      3 <dbl [5]> 0.655 0.0335  0.842      4.28    0.0294          0.979
4     5      4 <dbl [5]> 0.794 0.0167  0.923      9.19    0.0631          2.13 
5     5      5 <dbl [5]> 0.737 0.0643  0.999   1173.      8.05            1.06 
6     5      6 <dbl [5]> 0.635 0.168   1.000   4235.     29.1             0.380
# ℹ 3 more variables: UCL_asymptotic <dbl>, exact_covers <dbl>,
#   asymptotic_covers <dbl>

Summarizing coverage to find percent exact versus asymptotic coverage

(covq5df <- q5df %>%
    summarize("excov" = mean(exact_covers),"asycov" = mean(asymptotic_covers), .by = n)
    )
# A tibble: 7 × 3
      n excov asycov
  <dbl> <dbl>  <dbl>
1     5 0.951  0.823
2    10 0.948  0.911
3    20 0.952  0.932
4    40 0.946  0.937
5    80 0.948  0.950
6   160 0.950  0.945
7   320 0.948  0.950

Plotting exact versus asymptotic coverage

ggplot(data = covq5df) + 
  geom_line(aes(x = n, y = excov, color = 'exact')) + 
  geom_line(aes(x = n, y = asycov, color = 'asymptotic')) +
  geom_hline(aes(yintercept = 0.95)) + 
  scale_color_discrete(name='') + ylab('Coverage') +
  theme_classic() 

As you can see in the above plot, the exact coverage is almost always \(95\%\) , regardless of \(n\) , but the asymptotic coverage only starts “kicking in” to \(95\%\) at around \(n=40\) ,and then appears to fall back down again slightly at \(n=300\) . This issue of falling down again could most likely be due to sampling error and should not occur at even larger for larger \(n\).

Problem 6

Creating coverage dataframe

vardist <- \(x){
  y <- case_when(x=="UNIF"~16/3,
                 x=="EXP"~16,
                 x=="GEO"~20
              )
  return(y)
}
q6df <- (parameters(~n,c(5,20,50,100))
         %>%add_trials(10000)
         %>%mutate(UNIF = map(n,\(n) runif(n,0,8)),EXP = map(n,\(n) rexp(n,1/4)),GEO = map(n,\(n) rgeom(n,.2)))
         %>%pivot_longer(UNIF:EXP:GEO,
                         names_to = "Distribution",
                         values_to = "Ysample")
         %>%mutate(Ybar = map_dbl(Ysample,mean),S2 = map_dbl(Ysample,var),tv = map_dbl(Distribution,vardist))
         %>%mutate(muLCL= Ybar - qt(.975,n-1)*sqrt(S2/n),
                   muUCL= Ybar + qt(.975,n-1)*sqrt(S2/n),
                   sigLCL = ((n-1)*S2)/qgamma(.975,(n-1)/2,1/2),
                   sigUCL = ((n-1)*S2)/qgamma(.025,(n-1)/2,1/2)
                   )
         %>% mutate(mucovers = ifelse(muLCL < 4 & muUCL > 4, 1, 0))
         %>%mutate(sigcovers = ifelse(sigLCL < tv & sigUCL > tv, 1, 0))
)
Warning in x:y: numerical expression has 2 elements: only the first used

Summarizing and tabulating \(\mu\) coverage across distributions.

covq6df <- (q6df %>%
              summarize("mu_coverage" = mean(mucovers),"sig_coverage" = mean(sigcovers), .by = c(n,Distribution))
)
tabmuq6 <- (covq6df %>%
            select(-sig_coverage) %>%
            pivot_wider(names_from = Distribution,values_from = mu_coverage)
            %>%mutate(n = paste("$n=",n,"$",sep =""))
)
tabmuq6 <- rbind(list("$\\mu$","$4$","$4$","$4$"),tabmuq6)
kable(tabmuq6,col.names = c("","$\\text{UNIF}(0,8)$","$\\text{EXP}(\\beta=4)$","$\\text{GEO}(p=2)$"))
\(\text{UNIF}(0,8)\) \(\text{EXP}(\beta=4)\) \(\text{GEO}(p=2)\)
\(\mu\) \(4\) \(4\) \(4\)
\(n=5\) 0.9327 0.8829 0.8779
\(n=20\) 0.9486 0.9196 0.9154
\(n=50\) 0.9499 0.9405 0.9338
\(n=100\) 0.9553 0.9428 0.9412

The least severe violation of normality for \(\mu\) coverage is the Uniform distribution, as for all \(n\) , the coverage is relatively high, and it approaches \(95\%\) as \(n\) goes to \(100\). Next is the Exponential distribution which the coverage does start out relatively low, but as \(n\) increases, the coverage becomes closer to the desired \(95\%\). Lastly the Geometric distribution seems to have the most severe consequences as it starts out relatively low, and it approaches \(95\%\) slower than the other two distributions.

Summarizing and tabulating \(\sigma^2\) coverage across distributions.

tabsigq6 <- (covq6df %>%
            select(-mu_coverage) %>%
            pivot_wider(names_from = Distribution,values_from = sig_coverage)
            %>%mutate(n = paste("$n=",n,"$",sep =""))
)
tabsigq6 <- rbind(list("$\\sigma^2$","$16/3$","$16$","$20$"),tabsigq6)
kable(tabsigq6,col.names = c("","$\\text{UNIF}(0,8)$","$\\text{EXP}(\\beta=4)$","$\\text{GEO}(p=.2)$"))
\(\text{UNIF}(0,8)\) \(\text{EXP}(\beta=4)\) \(\text{GEO}(p=.2)\)
\(\sigma^2\) \(16/3\) \(16\) \(20\)
\(n=5\) 0.9843 0.8177 0.8203
\(n=20\) 0.9952 0.7218 0.7249
\(n=50\) 0.9968 0.7051 0.7014
\(n=100\) 0.9982 0.6938 0.6901

Again, the uniform distribution appears to have the least severe consequences of violating normality for \(\sigma^2\) coverage, with consistently having close to \(99\%\) coverage for all \(n\). Then the other two distributions both have poor coverage of \(\sigma^2\), which appears to get even worse as \(n\) increases. It appears that in this case the geometric distribution is slightly better at coverage than the Exponential distribution for all \(n\) except for \(n=5\).