#Exporation of US covid statistics
#Author: Andrew Meyenn
#Date: 12/7/2022
library(data.table)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library("GGally")
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.7 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::between() masks data.table::between()
## x dplyr::filter() masks stats::filter()
## x dplyr::first() masks data.table::first()
## x dplyr::lag() masks stats::lag()
## x dplyr::last() masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(tidygraph)
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:stats':
##
## filter
library(dplyr)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.2.1
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
#The data file containes a range of stats, 20 fields. We will conside firsly
#WP white proportion of State, BP black proportion of State and compare these
#as expected again the WD white deaths and BD black deaths
#a code book has been placed on the githib site
setwd("C:/Users/Cathie/SkyDrive/Rprogs")
df<-fread("./Covid/USCovid.csv") #available on github site
df<-df %>% remove_rownames %>% column_to_rownames(var="Code") #set rownames
options(digits=12)
####Check distribution and correlations. PCA works well with good correlation
ggpairs(df[, c(2,4:12)]) #library(GGally)

####The data file contains diffW and diffB fields, these are the difference
####between observed and expected deaths from covide
#The plot shows large deviation between W and less for B
plot(df$diffW, type="l", xaxt="n", main="US WD%-WP% and BD%-BP%", ylim=c(-0.20, 0.30))
abline(h=0)
lines(df$diffB, cex=0.85, col="red")
axis(1, 1:nrow(df), labels=rownames(df), cex.axis=0.7)

#basic stats
#there seems to be a significant negative correlation between diffW nd diffB.
#NOTE the outlier, which DC where many more B deaths occured compared to W.
#are we convinced!
cor(df$diffW, df$diffB)
## [1] -0.634400772856
cor.test(df$diffW, df$diffB)
##
## Pearson's product-moment correlation
##
## data: df$diffW and df$diffB
## t = -5.744865025, df = 49, p-value = 5.77182887e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.774566585264 -0.434840477172
## sample estimates:
## cor
## -0.634400772856
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
scatterplot(x=df$diffW, y=df$diffB)

#PCA, but before do some tests. We already know that there is little correlation
#and little normal distribution, although PCA does not need this, and we can certanly
#do some factor analysis with non-normal data
cm <- cor(df[, c(2, 4:6)])
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
KMO(cm) #in theory Pov should come out <0.5, the others are ok, overall 0.73 OK
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cm)
## Overall MSA = 0.73
## MSA for each item =
## Pov TC TD Pop
## 0.24 0.67 0.87 0.71
#hence we can say that these are contributing to the overall variance so it
#is useful to use PCA.
#you can run PCA just be careful with Pov in, compare to if not included.
colnames(df)
## [1] "State" "Pov" "Party" "TC" "TD" "Pop"
## [7] "WD" "WP" "diffW" "BD" "BP" "diffB"
## [13] "Dunder55" "Dover55" "D55-65" "Deaths65" "65Over" "D65"
## [19] "Wd65" "Bd65" "Wp65" "Bp65"
#PCA does it seem reasonable
pca<-prcomp(df[,c(2, 4:6)], scale=TRUE)
screeplot(pca)

fviz_pca_biplot(pca, habillage = df$Party)

#check the correlations
#these accord with the biplot. Pov is at 90degree and this means no correlation
#TC, TD and POP are close and lie along PC1 or DIM1 and capture 74% of variance
cor(df[,c(2,4:6)])
## Pov TC TD Pop
## Pov 1.0000000000000 0.0918562508896 0.157258219733 0.0697198024058
## TC 0.0918562508896 1.0000000000000 0.974045345601 0.9928630246602
## TD 0.1572582197335 0.9740453456009 1.000000000000 0.9661043830228
## Pop 0.0697198024058 0.9928630246602 0.966104383023 1.0000000000000
#this is saying that cases and deaths and pop are correlated, each pop is bigger
#we get more cases and deaths.
#we can look what are termed rotations to the contribution of each variable/field
#to each PC, this is what the biplot is saying.
as.data.frame(pca$rotation[,1:2])
## PC1 PC2
## Pov -0.0928771666331 0.993360596291321
## TC -0.5766478610703 -0.069346843768385
## TD -0.5734929108676 0.000790233488597
## Pop -0.5744187996800 -0.091788433509614
#### INVESTIAGTE Dendrogram and clusters
hc <- hclust(dist(as.data.table(df[,c(2,4:12)])), "ave")
dend1 <- as.dendrogram(hc)
plot(dend1)

####check clusterabilit and plot Clusters
cl<-get_clust_tendency(data = df[,c(2,4:12)], n = 5)
cl$hopkins_stat #near 1 good to go
## [1] 0.939360283212
sub_grp <- cutree(hc, k = 3)
table(sub_grp)
## sub_grp
## 1 2 3
## 47 1 3
plot(hc, cex = 0.6)
rect.hclust(hc, k = 6, border = 2:5)

fviz_cluster(list(data = df[,c(2, 4:12)], cluster = sub_grp))

# Compute PAM Clusters version 1 - simplest syntax structure - many available
library("cluster")
p<- pam(df[,c(2, 4:12)], 3) #3 is the desired clusters, three 3 etc. Observation looks like Pop
# Visualize
fviz_cluster(p)

##Heatmap
dfScale<-scale(df[,c(2, 4:12)])
heatmap(dfScale, scale = "col")

##Inference Stats using non-parametic tests
##test proportion difference for WD v's WP observed v expected via pop proportion
results <- c()
for (i in 1:51){
pp<-prop.test(x=df[i,5]*df[i,7], n=df[i,5], p=df[i, 8], alternative="two.sided")
results[[i]]<-pp$p.value
}
results #many states have p<0.05 reject Ho, proprotions different than expected
## [[1]]
## [1] 4.26525531175e-49
##
## [[2]]
## [1] 8.9626831611e-09
##
## [[3]]
## [1] 2.48504123644e-12
##
## [[4]]
## [1] 6.95703280038e-82
##
## [[5]]
## [1] 2.34438824678e-10
##
## [[6]]
## [1] 0.013604292105
##
## [[7]]
## [1] 2.47782752361e-70
##
## [[8]]
## [1] 4.57101513912e-41
##
## [[9]]
## [1] 2.34954875699e-74
##
## [[10]]
## [1] 1.29871887236e-167
##
## [[11]]
## [1] 3.72304599449e-274
##
## [[12]]
## [1] 3.51157598274e-18
##
## [[13]]
## [1] 4.87787639135e-20
##
## [[14]]
## [1] 1.3614800383e-33
##
## [[15]]
## [1] 1.22799359685e-114
##
## [[16]]
## [1] 7.68903964589e-83
##
## [[17]]
## [1] 3.02848169564e-40
##
## [[18]]
## [1] 3.96526316881e-130
##
## [[19]]
## [1] 8.36629506114e-08
##
## [[20]]
## [1] 1.71002954424e-14
##
## [[21]]
## [1] 2.42424221461e-22
##
## [[22]]
## [1] 6.98621947058e-219
##
## [[23]]
## [1] 1
##
## [[24]]
## [1] 3.21115415621e-86
##
## [[25]]
## [1] 6.41317715297e-06
##
## [[26]]
## [1] 1.43663101189e-69
##
## [[27]]
## [1] 4.41673966393e-07
##
## [[28]]
## [1] 6.61603890244e-48
##
## [[29]]
## [1] 5.4318766264e-49
##
## [[30]]
## [1] 3.75570099182e-24
##
## [[31]]
## [1] 1.70405118165e-76
##
## [[32]]
## [1] 2.9737532138e-20
##
## [[33]]
## [1] 1.79727566081e-26
##
## [[34]]
## [1] 3.41909537506e-117
##
## [[35]]
## [1] 9.91618839392e-05
##
## [[36]]
## [1] 2.7817375765e-185
##
## [[37]]
## [1] 1.84759455121e-90
##
## [[38]]
## [1] 6.8147656131e-60
##
## [[39]]
## [1] 2.92687096907e-269
##
## [[40]]
## [1] 1.19031792291e-87
##
## [[41]]
## [1] 5.33237046098e-17
##
## [[42]]
## [1] 2.57005809962e-05
##
## [[43]]
## [1] 1.20028936084e-110
##
## [[44]]
## [1] 2.22847183985e-202
##
## [[45]]
## [1] 0.0967060930034
##
## [[46]]
## [1] 5.76700719968e-05
##
## [[47]]
## [1] 8.29688041996e-70
##
## [[48]]
## [1] 6.61645021362e-67
##
## [[49]]
## [1] 6.27078060316e-23
##
## [[50]]
## [1] 3.54856417096e-54
##
## [[51]]
## [1] 1
#repeat for BD v BP
results <- c()
for (i in 1:51){
pp<-prop.test(x=df[i,5]*df[i,10], n=df[i,5], p=df[i, 11], alternative="two.sided")
results[[i]]<-pp$p.value
}
results #many more p>0.05 accept Ho, proprotions not different, but are in some
## [[1]]
## [1] 1
##
## [[2]]
## [1] 1
##
## [[3]]
## [1] 1
##
## [[4]]
## [1] 0.0026879858028
##
## [[5]]
## [1] 1.51767771481e-171
##
## [[6]]
## [1] 1
##
## [[7]]
## [1] 9.7649336478e-26
##
## [[8]]
## [1] 0.194772921376
##
## [[9]]
## [1] 2.68331036802e-88
##
## [[10]]
## [1] 1.11046392712e-53
##
## [[11]]
## [1] 2.22152768951e-05
##
## [[12]]
## [1] 1
##
## [[13]]
## [1] 1
##
## [[14]]
## [1] 1.69153129452e-113
##
## [[15]]
## [1] 7.09731435415e-08
##
## [[16]]
## [1] 5.82467994301e-07
##
## [[17]]
## [1] 1.58173416555e-05
##
## [[18]]
## [1] 2.9679501893e-06
##
## [[19]]
## [1] 2.25976049689e-17
##
## [[20]]
## [1] 1
##
## [[21]]
## [1] 3.84427492624e-40
##
## [[22]]
## [1] 1.5161931952e-08
##
## [[23]]
## [1] 2.04263601565e-257
##
## [[24]]
## [1] 1.5396232887e-06
##
## [[25]]
## [1] 1
##
## [[26]]
## [1] 4.62201358458e-06
##
## [[27]]
## [1] 1
##
## [[28]]
## [1] 0.00280257466583
##
## [[29]]
## [1] 0.000257486066358
##
## [[30]]
## [1] 1
##
## [[31]]
## [1] 2.72242526507e-177
##
## [[32]]
## [1] 2.49808525262e-10
##
## [[33]]
## [1] 0
##
## [[34]]
## [1] 6.78105271511e-15
##
## [[35]]
## [1] 0.000813736361832
##
## [[36]]
## [1] 1
##
## [[37]]
## [1] 1
##
## [[38]]
## [1] 1
##
## [[39]]
## [1] 1.0544519934e-12
##
## [[40]]
## [1] 0.0125859012563
##
## [[41]]
## [1] 4.76001078493e-20
##
## [[42]]
## [1] 0.00014136010559
##
## [[43]]
## [1] 8.58858815376e-06
##
## [[44]]
## [1] 4.15009836288e-20
##
## [[45]]
## [1] 1
##
## [[46]]
## [1] 1
##
## [[47]]
## [1] 1.65141388885e-48
##
## [[48]]
## [1] 4.69867500862e-09
##
## [[49]]
## [1] 1
##
## [[50]]
## [1] 3.2814691603e-07
##
## [[51]]
## [1] 1
#Investigate the Means Pop and TD and compare
sumPop<-sum(df$Pop)
sumDth<-sum(df$TD)
grPop<-aggregate(df$Pop, by = list(gr=df$Party), FUN = sum)
grDeaths<-aggregate(df$TD, by = list(gr=df$Party), FUN = sum)
grPop #Democrat = 2, larger than Rep
## gr x
## 1 1 145921772
## 2 2 182317751
grDeaths #Deaths Rep > Deaths Dem
## gr x
## 1 1 479940
## 2 2 533157
#check democrat deaths v pop proportion
a1<-grDeaths[1,2]/sumDth #observed
a2<-grPop[1,2]/sumPop #expected
a1 #2=Dem 0.52, 1=Rep 0.47
## [1] 0.473735486335
a2 #2=Dem 0.55, 1=Rep 0.44
## [1] 0.444558810793
#observed death rate is less than democrat pop proportion
#observed deaths, observed deaths, expect pop prop
prop.test(x=a1, n=sumDth, p=a2, alternative="two.sided")
##
## 1-sample proportions test with continuity correction
##
## data: a1 out of sumDth, null probability a2
## X-squared = 810849.5712, df = 1, p-value < 2.220446e-16
## alternative hypothesis: true p is not equal to 0.444558810793
## 95 percent confidence interval:
## 0.00000000000e+00 5.54754646524e-06
## sample estimates:
## p
## 4.67611182676e-07
#reject Ho, p<0.05, observed not same as expected, p-value=2.2e-16
#if the distributions are normal can use parametric tests
#check normality, p>0.05 is the H0, if p<0.05 not normal
apply(df[,c(4:7,10)],2, shapiro.test)
## $TC
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.7071079139, p-value = 8.83637606e-09
##
##
## $TD
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.7637293276, p-value = 1.14148118e-07
##
##
## $Pop
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.7071891254, p-value = 8.86677819e-09
##
##
## $WD
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.8958545817, p-value = 0.000306703814
##
##
## $BD
##
## Shapiro-Wilk normality test
##
## data: newX[, i]
## W = 0.7828280348, p-value = 2.94608334e-07
library("ggpubr")
ggboxplot(df, x = "Party", y = c("Pop"),
color = "Party", palette = c("#00AFBB", "#E7B800"),
ylab = "Pop", xlab = "Party")

library(dplyr)
dd<-df %>%
group_by(Party) %>%
summarise_each(funs(sum, mean,), Pop, TC, TD)
## Warning: `summarise_each_()` was deprecated in dplyr 0.7.0.
## Please use `across()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
dd
## # A tibble: 2 x 7
## Party Pop_sum TC_sum TD_sum Pop_mean TC_mean TD_mean
## <int> <int> <int> <int> <dbl> <dbl> <dbl>
## 1 1 145921772 39683255 479940 5836871. 1587330. 19198.
## 2 2 182317751 47491813 533157 7012221. 1826608. 20506.
#non-parametric test as data not normally distributed
pop <- wilcox.test(TD*BD ~ Party, data = df,
exact = FALSE)
pop #p>0.05 hence accept Ho BD proportion same/similar for each party
##
## Wilcoxon rank sum test with continuity correction
##
## data: TD * BD by Party
## W = 304, p-value = 0.699299908
## alternative hypothesis: true location shift is not equal to 0
#Investigate deaths > 65, roughly 75% > 65y are W AND 10% B,
#overall 16% of US pop is 65years or greater
#Pop is overall W 62% and B 13.4%
#The number values are in the data file, see the plot names below
plot(df$Dunder55, type="l",main="Deaths <55, 55 to 65, >65", xaxt="n", ylim=c(0, 100000))
lines(df$`D55-65`, type="l", col="red")
lines(df$D65, type="l", col="blue")
axis(1, 1:nrow(df), labels=rownames(df), cex.axis=0.7)

#note the large discrpency of the blue line >65!
#the two means of the difference in observed deaths vs expected based on pop proportion
#are shown below
mean(df$diffW) #0.036
## [1] 0.0364705882353
mean(df$diffB) #0.014
## [1] 0.0135294117647
#another way to look at this is to note that 75% of people aged over 65 are W and
#we would expect that 75% of the deaths over 65 are W, 10% would be B.
#hence 75-62 = 13% More W, and 10-13.4 = -3.4 less are B
#BUT is this expected, I can't find observed data to finish the story