#Upload data set

COVID <- read.csv('C:/Users/mbarker/Desktop/NCU - Data Sci PhD/RStudio Files/COVID-19.csv', stringsAsFactors = T)
#Load packages from library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Amelia)
## Loading required package: Rcpp
## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2023 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(naniar)
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
#Count missing values

sum(is.na(COVID))
## [1] 114413
#Find percent of rows with a missing value 

pct_miss(COVID)
## [1] 6.993802
#Check for missing data by creating missingness maps

gg_miss_var(COVID, show_pct = TRUE)

vis_miss(COVID, warn_large_data = FALSE)

missmap(COVID)

#Determine how many rows have more than 20% missing values
myr = function(x){
  temp=rep(0,nrow(x))
  for (i in 1:nrow(x)){
    temp[i]=sum(is.na(x[i, 1:ncol(x)]))/ncol(x)}
  x$ROWMISS=temp
  x=x[order(-x$ROWMISS), ]
  return(x)
}
COVID=myr(COVID)

length(COVID$ROWMISS[COVID$ROWMISS>.2])
## [1] 11492
#Delete rows missing more than 20% of their data
COVID <- COVID[COVID$ROWMISS <= 0.2,]
#Detect the variables and data type

str(COVID)
## 'data.frame':    90753 obs. of  17 variables:
##  $ Last.update.date        : Factor w/ 605 levels "1/1/2021","1/10/2021",..: 12 12 12 12 12 12 12 12 12 12 ...
##  $ Town.number             : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Town                    : Factor w/ 169 levels "Andover","Ansonia",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Total.cases             : int  118 1236 158 614 115 362 1081 269 1368 139 ...
##  $ Confirmed.cases         : int  109 1131 155 591 112 343 1035 249 1200 128 ...
##  $ Probable.cases          : int  9 105 3 23 3 19 46 20 168 11 ...
##  $ Case.rate               : int  3652 6602 3708 3355 3173 5856 5291 4910 6939 4062 ...
##  $ Total.deaths            : int  2 20 3 63 1 4 26 2 42 3 ...
##  $ Confirmed.deaths        : int  2 13 3 52 1 4 22 1 33 2 ...
##  $ Probable.deaths         : int  0 7 0 11 0 0 4 1 9 1 ...
##  $ People.tested           : int  1386 8358 1917 8980 1521 2855 9439 2597 9901 1470 ...
##  $ Rate.tested.per.100k    : int  42897 44645 44989 49066 41970 46182 46197 47399 50223 42957 ...
##  $ Number.of.tests         : int  3018 22604 4432 23448 3877 7753 26565 6684 26133 3817 ...
##  $ Number.of.positives     : int  134 1451 180 700 126 416 1274 332 1605 159 ...
##  $ Number.of.negatives     : int  2882 21074 4249 22712 3743 7314 25268 6339 24507 3653 ...
##  $ Number.of.indeterminates: int  2 79 3 36 8 23 23 13 21 5 ...
##  $ ROWMISS                 : num  0 0 0 0 0 0 0 0 0 0 ...
summary(COVID)
##   Last.update.date  Town.number            Town        Total.cases   
##  1/1/2021 :  169   Min.   :  1   Andover     :  537   Min.   :    0  
##  1/10/2021:  169   1st Qu.: 43   Ansonia     :  537   1st Qu.:  191  
##  1/10/2022:  169   Median : 85   Ashford     :  537   Median :  700  
##  1/11/2021:  169   Mean   : 85   Avon        :  537   Mean   : 2095  
##  1/11/2022:  169   3rd Qu.:127   Barkhamsted :  537   3rd Qu.: 2111  
##  1/12/2021:  169   Max.   :169   Beacon Falls:  537   Max.   :42752  
##  (Other)  :89739                 (Other)     :87531                  
##  Confirmed.cases Probable.cases     Case.rate      Total.deaths   
##  Min.   :    0   Min.   :   0.0   Min.   :    0   Min.   :  0.00  
##  1st Qu.:  177   1st Qu.:   9.0   1st Qu.: 2216   1st Qu.:  3.00  
##  Median :  644   Median :  52.0   Median : 6996   Median : 17.00  
##  Mean   : 1903   Mean   : 191.9   Mean   : 8205   Mean   : 45.24  
##  3rd Qu.: 1896   3rd Qu.: 186.0   3rd Qu.:12041   3rd Qu.: 56.00  
##  Max.   :38547   Max.   :4252.0   Max.   :31406   Max.   :503.00  
##                                                                   
##  Confirmed.deaths Probable.deaths  People.tested    Rate.tested.per.100k
##  Min.   :  0.00   Min.   : 0.000   Min.   :    17   Min.   :  1170      
##  1st Qu.:  2.00   1st Qu.: 0.000   1st Qu.:  2217   1st Qu.: 33637      
##  Median : 13.00   Median : 3.000   Median :  6025   Median : 62315      
##  Mean   : 36.94   Mean   : 8.301   Mean   : 12434   Mean   : 56437      
##  3rd Qu.: 44.00   3rd Qu.:11.000   3rd Qu.: 14959   3rd Qu.: 77800      
##  Max.   :442.00   Max.   :81.000   Max.   :140154   Max.   :143111      
##                                                                         
##  Number.of.tests  Number.of.positives Number.of.negatives
##  Min.   :    17   Min.   :    0       Min.   :    12     
##  1st Qu.:  5261   1st Qu.:  226       1st Qu.:  5009     
##  Median : 18388   Median :  833       Median : 17512     
##  Mean   : 49617   Mean   : 2560       Mean   : 46999     
##  3rd Qu.: 54237   3rd Qu.: 2535       3rd Qu.: 51556     
##  Max.   :928333   Max.   :55043       Max.   :882389     
##                                                          
##  Number.of.indeterminates    ROWMISS 
##  Min.   :   0.00          Min.   :0  
##  1st Qu.:   4.00          1st Qu.:0  
##  Median :  18.00          Median :0  
##  Mean   :  58.35          Mean   :0  
##  3rd Qu.:  59.00          3rd Qu.:0  
##  Max.   :1296.00          Max.   :0  
## 
#Obtain frequency information for each variable
# Apply summary function to each variable
result <- lapply(COVID, function(x) length(unique(x)))

# Create a data frame from the result
table1 <- data.frame(Variable = names(result), Count = unlist(result))
table1
##                                          Variable Count
## Last.update.date                 Last.update.date   537
## Town.number                           Town.number   169
## Town                                         Town   169
## Total.cases                           Total.cases 11456
## Confirmed.cases                   Confirmed.cases 10792
## Probable.cases                     Probable.cases  2347
## Case.rate                               Case.rate 21940
## Total.deaths                         Total.deaths   468
## Confirmed.deaths                 Confirmed.deaths   401
## Probable.deaths                   Probable.deaths    81
## People.tested                       People.tested 30934
## Rate.tested.per.100k         Rate.tested.per.100k 54863
## Number.of.tests                   Number.of.tests 53234
## Number.of.positives           Number.of.positives 12805
## Number.of.negatives           Number.of.negatives 52304
## Number.of.indeterminates Number.of.indeterminates   915
## ROWMISS                                   ROWMISS     1
#Use describe() function to obtain basic statistics and information on the shape of the data

describe(COVID)
##                          vars     n     mean       sd median  trimmed      mad
## Last.update.date*           1 90753   300.43   184.38    288   299.79   249.08
## Town.number                 2 90753    85.00    48.79     85    85.00    62.27
## Town*                       3 90753    85.00    48.79     85    85.00    62.27
## Total.cases                 4 90753  2095.05  4082.14    700  1161.72   920.69
## Confirmed.cases             5 90753  1903.15  3722.61    644  1052.56   840.63
## Probable.cases              6 90753   191.90   389.03     52   100.98    72.65
## Case.rate                   7 90753  8205.13  6653.14   6996  7450.34  7238.05
## Total.deaths                8 90753    45.24    68.42     17    29.86    23.72
## Confirmed.deaths            9 90753    36.94    57.32     13    23.97    19.27
## Probable.deaths            10 90753     8.30    12.40      3     5.48     4.45
## People.tested              11 90753 12433.95 18244.44   6025  8394.68  6914.85
## Rate.tested.per.100k       12 90753 56437.17 27716.63  62315 57525.18 27608.98
## Number.of.tests            13 90753 49617.16 87103.46  18388 29509.23 23402.84
## Number.of.positives        14 90753  2559.58  5103.29    833  1393.99  1098.61
## Number.of.negatives        15 90753 46999.23 82213.37  17512 28051.55 22296.82
## Number.of.indeterminates   16 90753    58.35   113.17     18    32.10    23.72
## ROWMISS                    17 90753     0.00     0.00      0     0.00     0.00
##                           min    max  range  skew kurtosis     se
## Last.update.date*           1    605    604  0.04    -1.37   0.61
## Town.number                 1    169    168  0.00    -1.20   0.16
## Town*                       1    169    168  0.00    -1.20   0.16
## Total.cases                 0  42752  42752  4.52    26.99  13.55
## Confirmed.cases             0  38547  38547  4.54    26.96  12.36
## Probable.cases              0   4252   4252  4.60    30.35   1.29
## Case.rate                   0  31406  31406  0.83     0.00  22.08
## Total.deaths                0    503    503  2.54     7.97   0.23
## Confirmed.deaths            0    442    442  2.71     9.30   0.19
## Probable.deaths             0     81     81  2.17     5.11   0.04
## People.tested              17 140154 140137  3.28    13.40  60.56
## Rate.tested.per.100k     1170 143111 141941 -0.33    -0.76  92.00
## Number.of.tests            17 928333 928316  3.95    21.03 289.14
## Number.of.positives         0  55043  55043  4.66    28.66  16.94
## Number.of.negatives        12 882389 882377  3.94    21.02 272.91
## Number.of.indeterminates    0   1296   1296  4.38    26.13   0.38
## ROWMISS                     0      0      0   NaN      NaN   0.00
#Obtain descriptive statistics 

stargazer(COVID, type = "text", title="Descriptive statistics", digits=1, out="table1.txt") 
## 
## Descriptive statistics
## ===============================================================
## Statistic                  N      Mean   St. Dev.  Min    Max  
## ---------------------------------------------------------------
## Town.number              90,753   85.0     48.8     1     169  
## Total.cases              90,753 2,095.1  4,082.1    0   42,752 
## Confirmed.cases          90,753 1,903.2  3,722.6    0   38,547 
## Probable.cases           90,753  191.9    389.0     0    4,252 
## Case.rate                90,753 8,205.1  6,653.1    0   31,406 
## Total.deaths             90,753   45.2     68.4     0     503  
## Confirmed.deaths         90,753   36.9     57.3     0     442  
## Probable.deaths          90,753   8.3      12.4     0     81   
## People.tested            90,753 12,433.9 18,244.4  17   140,154
## Rate.tested.per.100k     90,753 56,437.2 27,716.6 1,170 143,111
## Number.of.tests          90,753 49,617.2 87,103.5  17   928,333
## Number.of.positives      90,753 2,559.6  5,103.3    0   55,043 
## Number.of.negatives      90,753 46,999.2 82,213.4  12   882,389
## Number.of.indeterminates 90,753   58.3    113.2     0    1,296 
## ROWMISS                  90,753   0.0      0.0      0      0   
## ---------------------------------------------------------------
# Print the table
print(table)
## function (..., exclude = if (useNA == "no") c(NA, NaN), useNA = c("no", 
##     "ifany", "always"), dnn = list.names(...), deparse.level = 1) 
## {
##     list.names <- function(...) {
##         l <- as.list(substitute(list(...)))[-1L]
##         if (length(l) == 1L && is.list(..1) && !is.null(nm <- names(..1))) 
##             return(nm)
##         nm <- names(l)
##         fixup <- if (is.null(nm)) 
##             seq_along(l)
##         else nm == ""
##         dep <- vapply(l[fixup], function(x) switch(deparse.level + 
##             1, "", if (is.symbol(x)) as.character(x) else "", 
##             deparse(x, nlines = 1)[1L]), "")
##         if (is.null(nm)) 
##             dep
##         else {
##             nm[fixup] <- dep
##             nm
##         }
##     }
##     miss.use <- missing(useNA)
##     miss.exc <- missing(exclude)
##     useNA <- if (miss.use && !miss.exc && !match(NA, exclude, 
##         nomatch = 0L)) 
##         "ifany"
##     else match.arg(useNA)
##     doNA <- useNA != "no"
##     if (!miss.use && !miss.exc && doNA && match(NA, exclude, 
##         nomatch = 0L)) 
##         warning("'exclude' containing NA and 'useNA' != \"no\"' are a bit contradicting")
##     args <- list(...)
##     if (length(args) == 1L && is.list(args[[1L]])) {
##         args <- args[[1L]]
##         if (length(dnn) != length(args)) 
##             dnn <- paste(dnn[1L], seq_along(args), sep = ".")
##     }
##     if (!length(args)) 
##         stop("nothing to tabulate")
##     bin <- 0L
##     lens <- NULL
##     dims <- integer()
##     pd <- 1L
##     dn <- NULL
##     for (a in args) {
##         if (is.null(lens)) 
##             lens <- length(a)
##         else if (length(a) != lens) 
##             stop("all arguments must have the same length")
##         fact.a <- is.factor(a)
##         if (doNA) 
##             aNA <- anyNA(a)
##         if (!fact.a) {
##             a0 <- a
##             op <- options(warn = 2)
##             on.exit(options(op))
##             a <- factor(a, exclude = exclude)
##             options(op)
##         }
##         add.na <- doNA
##         if (add.na) {
##             ifany <- (useNA == "ifany")
##             anNAc <- anyNA(a)
##             add.na <- if (!ifany || anNAc) {
##                 ll <- levels(a)
##                 if (add.ll <- !anyNA(ll)) {
##                   ll <- c(ll, NA)
##                   TRUE
##                 }
##                 else if (!ifany && !anNAc) 
##                   FALSE
##                 else TRUE
##             }
##             else FALSE
##         }
##         if (add.na) 
##             a <- factor(a, levels = ll, exclude = NULL)
##         else ll <- levels(a)
##         a <- as.integer(a)
##         if (fact.a && !miss.exc) {
##             ll <- ll[keep <- which(match(ll, exclude, nomatch = 0L) == 
##                 0L)]
##             a <- match(a, keep)
##         }
##         else if (!fact.a && add.na) {
##             if (ifany && !aNA && add.ll) {
##                 ll <- ll[!is.na(ll)]
##                 is.na(a) <- match(a0, c(exclude, NA), nomatch = 0L) > 
##                   0L
##             }
##             else {
##                 is.na(a) <- match(a0, exclude, nomatch = 0L) > 
##                   0L
##             }
##         }
##         nl <- length(ll)
##         dims <- c(dims, nl)
##         if (prod(dims) > .Machine$integer.max) 
##             stop("attempt to make a table with >= 2^31 elements")
##         dn <- c(dn, list(ll))
##         bin <- bin + pd * (a - 1L)
##         pd <- pd * nl
##     }
##     names(dn) <- dnn
##     bin <- bin[!is.na(bin)]
##     if (length(bin)) 
##         bin <- bin + 1L
##     y <- array(tabulate(bin, pd), dims, dimnames = dn)
##     class(y) <- "table"
##     y
## }
## <bytecode: 0x0000018627f97008>
## <environment: namespace:base>
#Create frequency distribution histograms 
# Set graphical parameters
par(mar = c(1, 1, 1, 1))
par(mfrow = c(3, 5))

# Loop over columns and plot histograms
for (i in 4:16) {
  hist(COVID[, i], main = colnames(COVID)[i], breaks = "Sturges")
}

#Create box-plots for numerical variables
#Specify path to save PDF to
destination = 'C:\\Users\\mbarker\\Desktop\\NCU - Data Sci PhD\\RStudio Files.pdf'

#open PDF
pdf(file = destination)
 
#Create box-plots and save to PDF
for (col in colnames(COVID)){
    p <- ggplot(COVID, aes(x = .data[[col]])) +
    geom_boxplot(outlier.color = "red", outlier.shape =4)+ theme_gray()+
    labs(title = paste("Boxplot of", col), x = col, y = "Frequency")
       
    print(p)
    }

#Turn off PDF plotting
dev.off()
## png 
##   2