#Upload data set
COVID <- read.csv('C:/Users/mbarker/Desktop/NCU - Data Sci PhD/RStudio Files/COVID-19.csv', stringsAsFactors = T)
#Load packages from library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(naniar)
library(stargazer)
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
#Count missing values
sum(is.na(COVID))
## [1] 114413
#Find percent of rows with a missing value
pct_miss(COVID)
## [1] 6.993802
#Check for missing data by creating missingness maps
gg_miss_var(COVID, show_pct = TRUE)

vis_miss(COVID, warn_large_data = FALSE)

missmap(COVID)

#Determine how many rows have more than 20% missing values
myr = function(x){
temp=rep(0,nrow(x))
for (i in 1:nrow(x)){
temp[i]=sum(is.na(x[i, 1:ncol(x)]))/ncol(x)}
x$ROWMISS=temp
x=x[order(-x$ROWMISS), ]
return(x)
}
COVID=myr(COVID)
length(COVID$ROWMISS[COVID$ROWMISS>.2])
## [1] 11492
#Delete rows missing more than 20% of their data
COVID <- COVID[COVID$ROWMISS <= 0.2,]
#Detect the variables and data type
str(COVID)
## 'data.frame': 90753 obs. of 17 variables:
## $ Last.update.date : Factor w/ 605 levels "1/1/2021","1/10/2021",..: 12 12 12 12 12 12 12 12 12 12 ...
## $ Town.number : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Town : Factor w/ 169 levels "Andover","Ansonia",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Total.cases : int 118 1236 158 614 115 362 1081 269 1368 139 ...
## $ Confirmed.cases : int 109 1131 155 591 112 343 1035 249 1200 128 ...
## $ Probable.cases : int 9 105 3 23 3 19 46 20 168 11 ...
## $ Case.rate : int 3652 6602 3708 3355 3173 5856 5291 4910 6939 4062 ...
## $ Total.deaths : int 2 20 3 63 1 4 26 2 42 3 ...
## $ Confirmed.deaths : int 2 13 3 52 1 4 22 1 33 2 ...
## $ Probable.deaths : int 0 7 0 11 0 0 4 1 9 1 ...
## $ People.tested : int 1386 8358 1917 8980 1521 2855 9439 2597 9901 1470 ...
## $ Rate.tested.per.100k : int 42897 44645 44989 49066 41970 46182 46197 47399 50223 42957 ...
## $ Number.of.tests : int 3018 22604 4432 23448 3877 7753 26565 6684 26133 3817 ...
## $ Number.of.positives : int 134 1451 180 700 126 416 1274 332 1605 159 ...
## $ Number.of.negatives : int 2882 21074 4249 22712 3743 7314 25268 6339 24507 3653 ...
## $ Number.of.indeterminates: int 2 79 3 36 8 23 23 13 21 5 ...
## $ ROWMISS : num 0 0 0 0 0 0 0 0 0 0 ...
summary(COVID)
## Last.update.date Town.number Town Total.cases
## 1/1/2021 : 169 Min. : 1 Andover : 537 Min. : 0
## 1/10/2021: 169 1st Qu.: 43 Ansonia : 537 1st Qu.: 191
## 1/10/2022: 169 Median : 85 Ashford : 537 Median : 700
## 1/11/2021: 169 Mean : 85 Avon : 537 Mean : 2095
## 1/11/2022: 169 3rd Qu.:127 Barkhamsted : 537 3rd Qu.: 2111
## 1/12/2021: 169 Max. :169 Beacon Falls: 537 Max. :42752
## (Other) :89739 (Other) :87531
## Confirmed.cases Probable.cases Case.rate Total.deaths
## Min. : 0 Min. : 0.0 Min. : 0 Min. : 0.00
## 1st Qu.: 177 1st Qu.: 9.0 1st Qu.: 2216 1st Qu.: 3.00
## Median : 644 Median : 52.0 Median : 6996 Median : 17.00
## Mean : 1903 Mean : 191.9 Mean : 8205 Mean : 45.24
## 3rd Qu.: 1896 3rd Qu.: 186.0 3rd Qu.:12041 3rd Qu.: 56.00
## Max. :38547 Max. :4252.0 Max. :31406 Max. :503.00
##
## Confirmed.deaths Probable.deaths People.tested Rate.tested.per.100k
## Min. : 0.00 Min. : 0.000 Min. : 17 Min. : 1170
## 1st Qu.: 2.00 1st Qu.: 0.000 1st Qu.: 2217 1st Qu.: 33637
## Median : 13.00 Median : 3.000 Median : 6025 Median : 62315
## Mean : 36.94 Mean : 8.301 Mean : 12434 Mean : 56437
## 3rd Qu.: 44.00 3rd Qu.:11.000 3rd Qu.: 14959 3rd Qu.: 77800
## Max. :442.00 Max. :81.000 Max. :140154 Max. :143111
##
## Number.of.tests Number.of.positives Number.of.negatives
## Min. : 17 Min. : 0 Min. : 12
## 1st Qu.: 5261 1st Qu.: 226 1st Qu.: 5009
## Median : 18388 Median : 833 Median : 17512
## Mean : 49617 Mean : 2560 Mean : 46999
## 3rd Qu.: 54237 3rd Qu.: 2535 3rd Qu.: 51556
## Max. :928333 Max. :55043 Max. :882389
##
## Number.of.indeterminates ROWMISS
## Min. : 0.00 Min. :0
## 1st Qu.: 4.00 1st Qu.:0
## Median : 18.00 Median :0
## Mean : 58.35 Mean :0
## 3rd Qu.: 59.00 3rd Qu.:0
## Max. :1296.00 Max. :0
##
#Obtain frequency information for each variable
# Apply summary function to each variable
result <- lapply(COVID, function(x) length(unique(x)))
# Create a data frame from the result
table1 <- data.frame(Variable = names(result), Count = unlist(result))
table1
## Variable Count
## Last.update.date Last.update.date 537
## Town.number Town.number 169
## Town Town 169
## Total.cases Total.cases 11456
## Confirmed.cases Confirmed.cases 10792
## Probable.cases Probable.cases 2347
## Case.rate Case.rate 21940
## Total.deaths Total.deaths 468
## Confirmed.deaths Confirmed.deaths 401
## Probable.deaths Probable.deaths 81
## People.tested People.tested 30934
## Rate.tested.per.100k Rate.tested.per.100k 54863
## Number.of.tests Number.of.tests 53234
## Number.of.positives Number.of.positives 12805
## Number.of.negatives Number.of.negatives 52304
## Number.of.indeterminates Number.of.indeterminates 915
## ROWMISS ROWMISS 1
#Use describe() function to obtain basic statistics and information on the shape of the data
describe(COVID)
## vars n mean sd median trimmed mad
## Last.update.date* 1 90753 300.43 184.38 288 299.79 249.08
## Town.number 2 90753 85.00 48.79 85 85.00 62.27
## Town* 3 90753 85.00 48.79 85 85.00 62.27
## Total.cases 4 90753 2095.05 4082.14 700 1161.72 920.69
## Confirmed.cases 5 90753 1903.15 3722.61 644 1052.56 840.63
## Probable.cases 6 90753 191.90 389.03 52 100.98 72.65
## Case.rate 7 90753 8205.13 6653.14 6996 7450.34 7238.05
## Total.deaths 8 90753 45.24 68.42 17 29.86 23.72
## Confirmed.deaths 9 90753 36.94 57.32 13 23.97 19.27
## Probable.deaths 10 90753 8.30 12.40 3 5.48 4.45
## People.tested 11 90753 12433.95 18244.44 6025 8394.68 6914.85
## Rate.tested.per.100k 12 90753 56437.17 27716.63 62315 57525.18 27608.98
## Number.of.tests 13 90753 49617.16 87103.46 18388 29509.23 23402.84
## Number.of.positives 14 90753 2559.58 5103.29 833 1393.99 1098.61
## Number.of.negatives 15 90753 46999.23 82213.37 17512 28051.55 22296.82
## Number.of.indeterminates 16 90753 58.35 113.17 18 32.10 23.72
## ROWMISS 17 90753 0.00 0.00 0 0.00 0.00
## min max range skew kurtosis se
## Last.update.date* 1 605 604 0.04 -1.37 0.61
## Town.number 1 169 168 0.00 -1.20 0.16
## Town* 1 169 168 0.00 -1.20 0.16
## Total.cases 0 42752 42752 4.52 26.99 13.55
## Confirmed.cases 0 38547 38547 4.54 26.96 12.36
## Probable.cases 0 4252 4252 4.60 30.35 1.29
## Case.rate 0 31406 31406 0.83 0.00 22.08
## Total.deaths 0 503 503 2.54 7.97 0.23
## Confirmed.deaths 0 442 442 2.71 9.30 0.19
## Probable.deaths 0 81 81 2.17 5.11 0.04
## People.tested 17 140154 140137 3.28 13.40 60.56
## Rate.tested.per.100k 1170 143111 141941 -0.33 -0.76 92.00
## Number.of.tests 17 928333 928316 3.95 21.03 289.14
## Number.of.positives 0 55043 55043 4.66 28.66 16.94
## Number.of.negatives 12 882389 882377 3.94 21.02 272.91
## Number.of.indeterminates 0 1296 1296 4.38 26.13 0.38
## ROWMISS 0 0 0 NaN NaN 0.00
#Obtain descriptive statistics
stargazer(COVID, type = "text", title="Descriptive statistics", digits=1, out="table1.txt")
##
## Descriptive statistics
## ===============================================================
## Statistic N Mean St. Dev. Min Max
## ---------------------------------------------------------------
## Town.number 90,753 85.0 48.8 1 169
## Total.cases 90,753 2,095.1 4,082.1 0 42,752
## Confirmed.cases 90,753 1,903.2 3,722.6 0 38,547
## Probable.cases 90,753 191.9 389.0 0 4,252
## Case.rate 90,753 8,205.1 6,653.1 0 31,406
## Total.deaths 90,753 45.2 68.4 0 503
## Confirmed.deaths 90,753 36.9 57.3 0 442
## Probable.deaths 90,753 8.3 12.4 0 81
## People.tested 90,753 12,433.9 18,244.4 17 140,154
## Rate.tested.per.100k 90,753 56,437.2 27,716.6 1,170 143,111
## Number.of.tests 90,753 49,617.2 87,103.5 17 928,333
## Number.of.positives 90,753 2,559.6 5,103.3 0 55,043
## Number.of.negatives 90,753 46,999.2 82,213.4 12 882,389
## Number.of.indeterminates 90,753 58.3 113.2 0 1,296
## ROWMISS 90,753 0.0 0.0 0 0
## ---------------------------------------------------------------
# Print the table
print(table)
## function (..., exclude = if (useNA == "no") c(NA, NaN), useNA = c("no",
## "ifany", "always"), dnn = list.names(...), deparse.level = 1)
## {
## list.names <- function(...) {
## l <- as.list(substitute(list(...)))[-1L]
## if (length(l) == 1L && is.list(..1) && !is.null(nm <- names(..1)))
## return(nm)
## nm <- names(l)
## fixup <- if (is.null(nm))
## seq_along(l)
## else nm == ""
## dep <- vapply(l[fixup], function(x) switch(deparse.level +
## 1, "", if (is.symbol(x)) as.character(x) else "",
## deparse(x, nlines = 1)[1L]), "")
## if (is.null(nm))
## dep
## else {
## nm[fixup] <- dep
## nm
## }
## }
## miss.use <- missing(useNA)
## miss.exc <- missing(exclude)
## useNA <- if (miss.use && !miss.exc && !match(NA, exclude,
## nomatch = 0L))
## "ifany"
## else match.arg(useNA)
## doNA <- useNA != "no"
## if (!miss.use && !miss.exc && doNA && match(NA, exclude,
## nomatch = 0L))
## warning("'exclude' containing NA and 'useNA' != \"no\"' are a bit contradicting")
## args <- list(...)
## if (length(args) == 1L && is.list(args[[1L]])) {
## args <- args[[1L]]
## if (length(dnn) != length(args))
## dnn <- paste(dnn[1L], seq_along(args), sep = ".")
## }
## if (!length(args))
## stop("nothing to tabulate")
## bin <- 0L
## lens <- NULL
## dims <- integer()
## pd <- 1L
## dn <- NULL
## for (a in args) {
## if (is.null(lens))
## lens <- length(a)
## else if (length(a) != lens)
## stop("all arguments must have the same length")
## fact.a <- is.factor(a)
## if (doNA)
## aNA <- anyNA(a)
## if (!fact.a) {
## a0 <- a
## op <- options(warn = 2)
## on.exit(options(op))
## a <- factor(a, exclude = exclude)
## options(op)
## }
## add.na <- doNA
## if (add.na) {
## ifany <- (useNA == "ifany")
## anNAc <- anyNA(a)
## add.na <- if (!ifany || anNAc) {
## ll <- levels(a)
## if (add.ll <- !anyNA(ll)) {
## ll <- c(ll, NA)
## TRUE
## }
## else if (!ifany && !anNAc)
## FALSE
## else TRUE
## }
## else FALSE
## }
## if (add.na)
## a <- factor(a, levels = ll, exclude = NULL)
## else ll <- levels(a)
## a <- as.integer(a)
## if (fact.a && !miss.exc) {
## ll <- ll[keep <- which(match(ll, exclude, nomatch = 0L) ==
## 0L)]
## a <- match(a, keep)
## }
## else if (!fact.a && add.na) {
## if (ifany && !aNA && add.ll) {
## ll <- ll[!is.na(ll)]
## is.na(a) <- match(a0, c(exclude, NA), nomatch = 0L) >
## 0L
## }
## else {
## is.na(a) <- match(a0, exclude, nomatch = 0L) >
## 0L
## }
## }
## nl <- length(ll)
## dims <- c(dims, nl)
## if (prod(dims) > .Machine$integer.max)
## stop("attempt to make a table with >= 2^31 elements")
## dn <- c(dn, list(ll))
## bin <- bin + pd * (a - 1L)
## pd <- pd * nl
## }
## names(dn) <- dnn
## bin <- bin[!is.na(bin)]
## if (length(bin))
## bin <- bin + 1L
## y <- array(tabulate(bin, pd), dims, dimnames = dn)
## class(y) <- "table"
## y
## }
## <bytecode: 0x0000018627f97008>
## <environment: namespace:base>
#Create frequency distribution histograms
# Set graphical parameters
par(mar = c(1, 1, 1, 1))
par(mfrow = c(3, 5))
# Loop over columns and plot histograms
for (i in 4:16) {
hist(COVID[, i], main = colnames(COVID)[i], breaks = "Sturges")
}

#Create box-plots for numerical variables
#Specify path to save PDF to
destination = 'C:\\Users\\mbarker\\Desktop\\NCU - Data Sci PhD\\RStudio Files.pdf'
#open PDF
pdf(file = destination)
#Create box-plots and save to PDF
for (col in colnames(COVID)){
p <- ggplot(COVID, aes(x = .data[[col]])) +
geom_boxplot(outlier.color = "red", outlier.shape =4)+ theme_gray()+
labs(title = paste("Boxplot of", col), x = col, y = "Frequency")
print(p)
}
#Turn off PDF plotting
dev.off()
## png
## 2