1 Group Homework

You will work with your group to complete this assignment.
Upload your html file on RPubs and include the link when you submit your submission files on Collab.
Submit your group’s shared .Rmd AND “knitted”.html files on Collab.
Your “knitted .html” submission must be created from your “group .Rmd” but be created on your own computer.
Confirm this with the following comment included in your submission text box: “Honor Pledge: I have recreated my group submission using using the tools I have installed on my own computer”
Name the files with a group name and YOUR name for your submission.
One of the group members will present R codes and plots for Parts 2 and 3 in class on Nov. 2 (Tue). Please e-mail the instructor if you’re a presenter by 11:59pm, Nov. 1. Also, if you’re a presenter, please bring your laptop so that you can share your screen on zoom for the presentation.

2 Part 1

Use the SCS Data set you downloaded from the previous group assignments, and then investigate the relationship between the mathematics achievement score (“mathpre”) and the math anxiety score (“mars”).
Plot the data, linear line, and bootstrap confidence envelopes. Use 2,000 bootstrap replicates (i.e., R=2000) in function boot, and add appropriate x- and y- labels, and a title to the graph.
Please refer to section: Linear regression with bootstrap confidence intervals in DS3003_visualizingerrors_reg_note.html, DS3003_visualizingerrors_reg_code.html, and DS3003_visualizingerrors_reg_code.html

# add your codes
library(foreign)
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.6.2

SCS <- read.spss('SCS_QE.sav', to.data.frame = TRUE)

## re-encoding from CP1252

## Warning in read.spss("SCS_QE.sav", to.data.frame = TRUE): Undeclared
## level(s) 0 added in variable: married

## Boot Strap
library(boot)
b.stat <- function(data, i)
{
   b.dat <- data[i ,]
   out.lm <- lm(mars ~ mathpre, b.dat)
   predict(out.lm, data.frame(mathpre=SCS$mathpre))   
}

b.out <- boot(SCS, b.stat, R = 2000) # R = num of replications

boot.ci(b.out, index = 1, type = 'perc') # 95% CI for the first observation

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = b.out, type = "perc", index = 1)
## 
## Intervals : 
## Level     Percentile     
## 95%   (53.56, 59.03 )  
## Calculations and Intervals on Original Scale

b.ci <- t(sapply(1:nrow(SCS), function(x) boot.ci(b.out, index = x, type = 'perc')$percent))[, 4:5]
dimnames(b.ci) <- list(rownames(SCS), c('lower', 'upper'))
kable(head(b.ci, 4))

lower	upper
53.56206	59.02661
61.09778	71.03611
57.75656	64.71544
59.43509	67.82575

SCS2 <- cbind(SCS, b.ci) # combine two datasets


## Plot data
SCS_plot <- ggplot(SCS2, aes(x = mathpre, y = mars)) + geom_jitter(color="steel blue") + xlab("Mathematics Acievement Score") + ylab("Mathematics Anxiety Score") + geom_smooth(method='lm', formula= y~x, color="red", se=FALSE) + ggtitle("Mathematics Anxiety Score vs. Mathermatics Achievement Score") + theme(plot.title = element_text(hjust = 0.5)) + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3, fill="#69b3a2")

SCS_plot

3 Part 2

Create WHO Reporting Barplots with error bars separated by WHO region using either facet_grid or facet_wrap.
First, get the latest data from from https://covid19.who.int/table.
- The file should likely be named “WHO COVID-19 global table data October XXth 2021 at XXXXX.csv”
- Don’t use the data that I uploaded on Collab. It’s not the most recent data.
Second, create a subset including 3 countries per WHO region (Africa, Americas, Eastern Mediterranean, Europe, South-East Asia, Western Pacific). You can choose any three countries within each WHO region to compare the mortality rate (mutate(rate = "Deaths - cumulative total"/"Cases - cumulative total")).
Third, draw bar plots with error bars using your subset, but adjust the graph in the facets using either facet_grid or facet_wrap (e.g., facet_grid(~ "WHO region", scale="free"). Please include scale="free" in your facet function.

# add your codes
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.6.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

who <- read.csv('WHO COVID-19 global table data October 31st 2021 at 6.47.33 PM.csv')

who_2 <- who[(who$Name == 'Sierra Leone' | who$Name == 'Mali' | who$Name == 'Mauritius' |   who$Name == 'Nicaragua' | who$Name == 'Aruba' | who$Name == 'Cuba' | who$Name == 'Somalia' | who$Name == 'Sudan' | who$Name == 'Iraq' | who$Name == 'Luxembourg' | who$Name == 'Montenegro' | who$Name == 'Finland' | who$Name == 'Myanmar' | who$Name == 'Sri Lanka' | who$Name == 'Nepal' | who$Name == 'Viet Nam' | who$Name == 'Japan' | who$Name == 'Malaysia'),]

mortality <- who_2 %>%
  select( 'Deaths...cumulative.total', 'Cases...cumulative.total','WHO.Region', 'Name' ) %>%
  mutate(rate = `Deaths...cumulative.total`/`Cases...cumulative.total`, SE = sqrt(rate*(1-rate)/`Cases...cumulative.total`))
mortality

##     Deaths...cumulative.total Cases...cumulative.total
## 20                      28769                  2454749
## 23                      23083                  2052123
## 26                      18237                  1717980
## 38                       8219                   949747
## 40                      21910                   905477
## 44                      11388                   811897
## 56                      13696                   538860
## 60                      18622                   497700
## 102                      1158                   157531
## 105                      2082                   142808
## 118                       843                    81501
## 135                      2995                    40433
## 151                      1208                    21998
## 156                       166                    17698
## 161                       562                    15948
## 162                       171                    15865
## 169                       208                    12866
## 181                       121                     6397
##                WHO.Region         Name        rate           SE
## 20        Western Pacific     Malaysia 0.011719732 6.869029e-05
## 23  Eastern Mediterranean         Iraq 0.011248351 7.361840e-05
## 26        Western Pacific        Japan 0.010615374 7.818819e-05
## 38               Americas         Cuba 0.008653884 9.504170e-05
## 40        Western Pacific     Viet Nam 0.024197191 1.614823e-04
## 44        South-East Asia        Nepal 0.014026410 1.305135e-04
## 56        South-East Asia    Sri Lanka 0.025416620 2.144028e-04
## 60        South-East Asia      Myanmar 0.037416114 2.690078e-04
## 102                Europe      Finland 0.007350934 2.152217e-04
## 105                Europe   Montenegro 0.014579015 3.171748e-04
## 118                Europe   Luxembourg 0.010343431 3.543995e-04
## 135 Eastern Mediterranean        Sudan 0.074073158 1.302419e-03
## 151 Eastern Mediterranean      Somalia 0.054914083 1.535981e-03
## 156                Africa    Mauritius 0.009379591 7.245752e-04
## 161                Africa         Mali 0.035239528 1.460063e-03
## 162              Americas        Aruba 0.010778443 8.197941e-04
## 169              Americas    Nicaragua 0.016166641 1.111857e-03
## 181                Africa Sierra Leone 0.018915116 1.703216e-03

ggplot(mortality, aes(x = Name, y = rate, fill = as.factor(WHO.Region))) +
  geom_bar(stat = "identity") + facet_grid(~ WHO.Region, scale="free") + geom_errorbar(aes(ymin=rate-1.96*SE, ymax=rate+1.96*SE), width=.2) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Example:

4 Part 3

See TABLE 2. COVID-19 vaccine effectiveness against COVID-19–associated hospitalization among adults without immunocompromising conditions, by vaccine product — 21 hospitals in 18 U.S. states, March–August 2021 from a recent study on Comparative Effectiveness of Moderna, Pfizer-BioNTech, and Janssen (Johnson & Johnson) Vaccines.
Draw your best plot to visualize results of VE against COVID-19 hospitalization (95% CI), i.e., the third column of TABLE 2.
First, save data about Vaccine/Period and VE against COVID-19 hospitalization (95% CI).
- You could create a .csv file including VE against COVID-19 hospitalization (95% CI) and Vaccine/Period.
- Or, you could save data in R directly, say using data.frame() or tibble().
Second, draw bar plots with error bars, using the data you saved in the first step.
- You could use the facets function as in Part 2.
- Or, you could draw separate bar plots for each vaccine and collect three bar plots into a single figure using something like, e.g., gridExtra::grid.arrange(). Here are some helpul notes and explanations.

# add your codes
vax <- read.csv('Vaccine Efficacy.csv')

vax2 <- vax %>%
  mutate(SE = sqrt(VE.Against.COVID.19.hospitalization*(100-VE.Against.COVID.19.hospitalization)/`Total.Patients`))

ggplot(vax2, aes(x = Period, y = VE.Against.COVID.19.hospitalization, fill = as.factor(Vaccine))) +
  geom_bar(stat = "identity") + facet_grid(~ Vaccine, scale="free") + geom_errorbar(aes(ymin=VE.Against.COVID.19.hospitalization-1.96*SE, ymax=VE.Against.COVID.19.hospitalization+1.96*SE), width=.2) + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

Assignment 12 - Visualizing errors

Group #7, Nicholas Kalinowski

Due Date: 11:59pm, Nov 1

1 Group Homework

2 Part 1

3 Part 2

4 Part 3