#Exploration of US COVID statistics
#Author: Andrew Meyenn
#Date: 12/7/2022

library(data.table)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library("GGally")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.7     v dplyr   1.0.9
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::between()   masks data.table::between()
## x dplyr::filter()    masks stats::filter()
## x dplyr::first()     masks data.table::first()
## x dplyr::lag()       masks stats::lag()
## x dplyr::last()      masks data.table::last()
## x purrr::transpose() masks data.table::transpose()
library(tidygraph)
## 
## Attaching package: 'tidygraph'
## The following object is masked from 'package:stats':
## 
##     filter
library(dplyr)
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.2.1
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
#The data file contains a range of stats, 20 fields. We will first consider
#WP (white proportion of the state) and BP (black proportion of the state), and compare these,
#as expected shares, against WD (white deaths) and BD (black deaths).
#A code book has been placed on the github site.
setwd("C:/Users/Cathie/SkyDrive/Rprogs")

df<-fread("./Covid/USCovid.csv") #available on github site
df<-df %>% remove_rownames %>% column_to_rownames(var="Code") #set rownames

options(digits=12)
####Check distributions and correlations; PCA works well when variables are well correlated
ggpairs(df[, c(2,4:12)]) #library(GGally)

####The data file contains diffW and diffB fields; these are the differences
####between observed and expected deaths from covid (death share minus population share)
#The plot shows larger deviations for W and smaller ones for B

plot(df$diffW, type="l", xaxt="n", main="US WD%-WP% and BD%-BP%", ylim=c(-0.20, 0.30)) 
abline(h=0)
lines(df$diffB, cex=0.85, col="red")
axis(1, 1:nrow(df), labels=rownames(df), cex.axis=0.7)
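
#A legend to distinguish the two series in the plot above (a small sketch; the
#position is chosen arbitrarily):
legend("topright", legend = c("diffW", "diffB"), col = c("black", "red"), lty = 1, cex = 0.8)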

#basic stats
#there seems to be a significant negative correlation between diffW and diffB.
#NOTE the outlier, which is DC, where many more B deaths occurred compared to W.
#are we convinced? A check excluding DC is sketched after the correlation test below.

cor(df$diffW, df$diffB)
## [1] -0.634400772856
cor.test(df$diffW, df$diffB)
## 
##  Pearson's product-moment correlation
## 
## data:  df$diffW and df$diffB
## t = -5.744865025, df = 49, p-value = 5.77182887e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.774566585264 -0.434840477172
## sample estimates:
##             cor 
## -0.634400772856
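#As a quick robustness check on the DC outlier noted above (a sketch: it assumes the
#DC row is labelled "DC", as per the Code column used to set the rownames):
not_dc <- rownames(df) != "DC"
cor(df$diffW[not_dc], df$diffB[not_dc])
cor.test(df$diffW[not_dc], df$diffB[not_dc])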
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
scatterplot(x=df$diffW, y=df$diffB)      

#PCA, but first do some tests. From the ggpairs plot we already know that some variables
#(e.g. Pov) show little correlation and the data are not normally distributed; PCA does not
#strictly require normality, and factor analysis can certainly be run with non-normal data.

cm <- cor(df[, c(2, 4:6)])
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:car':
## 
##     logit
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
KMO(cm) #Pov comes out <0.5 (0.24, poor); the others are OK; overall MSA of 0.73 is acceptable
## Kaiser-Meyer-Olkin factor adequacy
## Call: KMO(r = cm)
## Overall MSA =  0.73
## MSA for each item = 
##  Pov   TC   TD  Pop 
## 0.24 0.67 0.87 0.71
#hence we can say that the remaining variables contribute to the overall variance, so PCA
#is useful here.
#you can run PCA with Pov included, but be careful; compare it with a run without Pov
#(a comparison is sketched after the rotations below).
colnames(df)
##  [1] "State"    "Pov"      "Party"    "TC"       "TD"       "Pop"     
##  [7] "WD"       "WP"       "diffW"    "BD"       "BP"       "diffB"   
## [13] "Dunder55" "Dover55"  "D55-65"   "Deaths65" "65Over"   "D65"     
## [19] "Wd65"     "Bd65"     "Wp65"     "Bp65"
#PCA: does it seem reasonable?
pca<-prcomp(df[,c(2, 4:6)], scale=TRUE) 
screeplot(pca)

fviz_pca_biplot(pca, habillage = df$Party) 

#check the correlations
#these accord with the biplot: Pov sits at roughly 90 degrees to the others, which means
#near-zero correlation; TC, TD and Pop are close together and lie along PC1 (Dim1),
#which captures about 74% of the variance

cor(df[,c(2,4:6)])
##                 Pov              TC             TD             Pop
## Pov 1.0000000000000 0.0918562508896 0.157258219733 0.0697198024058
## TC  0.0918562508896 1.0000000000000 0.974045345601 0.9928630246602
## TD  0.1572582197335 0.9740453456009 1.000000000000 0.9661043830228
## Pop 0.0697198024058 0.9928630246602 0.966104383023 1.0000000000000
#this is saying that cases, deaths and population are correlated: the bigger a state's
#population, the more cases and deaths we get (a per-capita sketch follows below).
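
#Since TC and TD track Pop so closely, a per-capita view removes the size effect.
#A minimal sketch (tc_rate and td_rate are illustrative names, not fields in the codebook):
tc_rate <- df$TC / df$Pop * 1e5  #cases per 100,000
td_rate <- df$TD / df$Pop * 1e5  #deaths per 100,000
cor(cbind(Pov = df$Pov, tc_rate = tc_rate, td_rate = td_rate))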

#we can look at what are termed rotations (loadings), i.e. the contribution of each
#variable/field to each PC; this is what the biplot is showing.

as.data.frame(pca$rotation[,1:2])
##                  PC1                PC2
## Pov -0.0928771666331  0.993360596291321
## TC  -0.5766478610703 -0.069346843768385
## TD  -0.5734929108676  0.000790233488597
## Pop -0.5744187996800 -0.091788433509614
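#As the KMO output flagged, Pov has a low MSA. A sketch of the comparison mentioned
#earlier, looking at the variance explained with and without Pov:
summary(pca)                                    #PCs with Pov included
pca_noPov <- prcomp(df[, 4:6], scale = TRUE)    #TC, TD, Pop only
summary(pca_noPov)
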
#### INVESTIGATE dendrogram and clusters
hc <- hclust(dist(as.data.table(df[,c(2,4:12)])), "ave")
dend1 <- as.dendrogram(hc)
plot(dend1)
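
#Note that dist() above works on the raw scales, so Pop, TC and TD dominate the distances.
#A sketch of the same dendrogram on standardised data, for comparison:
hc_scaled <- hclust(dist(scale(df[, c(2, 4:12)])), "ave")
plot(as.dendrogram(hc_scaled))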

####check clusterability and plot clusters
cl<-get_clust_tendency(data = df[,c(2,4:12)], n = 5)
cl$hopkins_stat #near 1 good to go
## [1] 0.939360283212
sub_grp <- cutree(hc, k = 3)
table(sub_grp)
## sub_grp
##  1  2  3 
## 47  1  3
plot(hc, cex = 0.6)
rect.hclust(hc, k = 6, border = 2:5) #note: rectangles are drawn for k = 6 here, whereas sub_grp above used k = 3

fviz_cluster(list(data = df[,c(2, 4:12)], cluster = sub_grp))

# Compute PAM Clusters version 1 - simplest syntax structure - many available
library("cluster")
p<- pam(df[,c(2, 4:12)], 3) #3 is the desired number of clusters; the grouping looks largely driven by Pop
# Visualize
fviz_cluster(p)
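
#k = 3 was fixed above; a sketch of an average-silhouette check on k (factoextra),
#plus the average silhouette width of the k = 3 PAM fit:
fviz_nbclust(scale(df[, c(2, 4:12)]), pam, method = "silhouette")
p$silinfo$avg.width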

##Heatmap
dfScale<-scale(df[,c(2, 4:12)])
heatmap(dfScale, scale = "col") #the data are already standardised above, so the extra column scaling is effectively a no-op

##Inference stats using non-parametric tests
##test the proportion difference for WD vs WP: observed vs expected death share via population proportion
results <- c()
for (i in 1:51){
  #col 5 = TD (total deaths), col 7 = WD (white share of deaths), col 8 = WP (white share of pop)
  pp<-prop.test(x=df[i,5]*df[i,7], n=df[i,5], p=df[i, 8], alternative="two.sided")
  results[[i]]<-pp$p.value
}
results #many states have p<0.05: reject H0, proportions differ from expected (a tally is sketched after the output)
## [[1]]
## [1] 4.26525531175e-49
## 
## [[2]]
## [1] 8.9626831611e-09
## 
## [[3]]
## [1] 2.48504123644e-12
## 
## [[4]]
## [1] 6.95703280038e-82
## 
## [[5]]
## [1] 2.34438824678e-10
## 
## [[6]]
## [1] 0.013604292105
## 
## [[7]]
## [1] 2.47782752361e-70
## 
## [[8]]
## [1] 4.57101513912e-41
## 
## [[9]]
## [1] 2.34954875699e-74
## 
## [[10]]
## [1] 1.29871887236e-167
## 
## [[11]]
## [1] 3.72304599449e-274
## 
## [[12]]
## [1] 3.51157598274e-18
## 
## [[13]]
## [1] 4.87787639135e-20
## 
## [[14]]
## [1] 1.3614800383e-33
## 
## [[15]]
## [1] 1.22799359685e-114
## 
## [[16]]
## [1] 7.68903964589e-83
## 
## [[17]]
## [1] 3.02848169564e-40
## 
## [[18]]
## [1] 3.96526316881e-130
## 
## [[19]]
## [1] 8.36629506114e-08
## 
## [[20]]
## [1] 1.71002954424e-14
## 
## [[21]]
## [1] 2.42424221461e-22
## 
## [[22]]
## [1] 6.98621947058e-219
## 
## [[23]]
## [1] 1
## 
## [[24]]
## [1] 3.21115415621e-86
## 
## [[25]]
## [1] 6.41317715297e-06
## 
## [[26]]
## [1] 1.43663101189e-69
## 
## [[27]]
## [1] 4.41673966393e-07
## 
## [[28]]
## [1] 6.61603890244e-48
## 
## [[29]]
## [1] 5.4318766264e-49
## 
## [[30]]
## [1] 3.75570099182e-24
## 
## [[31]]
## [1] 1.70405118165e-76
## 
## [[32]]
## [1] 2.9737532138e-20
## 
## [[33]]
## [1] 1.79727566081e-26
## 
## [[34]]
## [1] 3.41909537506e-117
## 
## [[35]]
## [1] 9.91618839392e-05
## 
## [[36]]
## [1] 2.7817375765e-185
## 
## [[37]]
## [1] 1.84759455121e-90
## 
## [[38]]
## [1] 6.8147656131e-60
## 
## [[39]]
## [1] 2.92687096907e-269
## 
## [[40]]
## [1] 1.19031792291e-87
## 
## [[41]]
## [1] 5.33237046098e-17
## 
## [[42]]
## [1] 2.57005809962e-05
## 
## [[43]]
## [1] 1.20028936084e-110
## 
## [[44]]
## [1] 2.22847183985e-202
## 
## [[45]]
## [1] 0.0967060930034
## 
## [[46]]
## [1] 5.76700719968e-05
## 
## [[47]]
## [1] 8.29688041996e-70
## 
## [[48]]
## [1] 6.61645021362e-67
## 
## [[49]]
## [1] 6.27078060316e-23
## 
## [[50]]
## [1] 3.54856417096e-54
## 
## [[51]]
## [1] 1
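#A quick tally of the W comparison above (a sketch using the saved p-values):
sum(unlist(results) < 0.05)                #states where we reject H0
rownames(df)[unlist(results) >= 0.05]      #states where the W share is not significantly different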
#repeat for BD v BP
results <- c()
for (i in 1:51){
  pp<-prop.test(x=df[i,5]*df[i,10], n=df[i,5], p=df[i, 11], alternative="two.sided")
  results[[i]]<-pp$p.value
}
results #many more have p>0.05: fail to reject H0, proportions not different from expected, though some states still differ (listed after the output)
## [[1]]
## [1] 1
## 
## [[2]]
## [1] 1
## 
## [[3]]
## [1] 1
## 
## [[4]]
## [1] 0.0026879858028
## 
## [[5]]
## [1] 1.51767771481e-171
## 
## [[6]]
## [1] 1
## 
## [[7]]
## [1] 9.7649336478e-26
## 
## [[8]]
## [1] 0.194772921376
## 
## [[9]]
## [1] 2.68331036802e-88
## 
## [[10]]
## [1] 1.11046392712e-53
## 
## [[11]]
## [1] 2.22152768951e-05
## 
## [[12]]
## [1] 1
## 
## [[13]]
## [1] 1
## 
## [[14]]
## [1] 1.69153129452e-113
## 
## [[15]]
## [1] 7.09731435415e-08
## 
## [[16]]
## [1] 5.82467994301e-07
## 
## [[17]]
## [1] 1.58173416555e-05
## 
## [[18]]
## [1] 2.9679501893e-06
## 
## [[19]]
## [1] 2.25976049689e-17
## 
## [[20]]
## [1] 1
## 
## [[21]]
## [1] 3.84427492624e-40
## 
## [[22]]
## [1] 1.5161931952e-08
## 
## [[23]]
## [1] 2.04263601565e-257
## 
## [[24]]
## [1] 1.5396232887e-06
## 
## [[25]]
## [1] 1
## 
## [[26]]
## [1] 4.62201358458e-06
## 
## [[27]]
## [1] 1
## 
## [[28]]
## [1] 0.00280257466583
## 
## [[29]]
## [1] 0.000257486066358
## 
## [[30]]
## [1] 1
## 
## [[31]]
## [1] 2.72242526507e-177
## 
## [[32]]
## [1] 2.49808525262e-10
## 
## [[33]]
## [1] 0
## 
## [[34]]
## [1] 6.78105271511e-15
## 
## [[35]]
## [1] 0.000813736361832
## 
## [[36]]
## [1] 1
## 
## [[37]]
## [1] 1
## 
## [[38]]
## [1] 1
## 
## [[39]]
## [1] 1.0544519934e-12
## 
## [[40]]
## [1] 0.0125859012563
## 
## [[41]]
## [1] 4.76001078493e-20
## 
## [[42]]
## [1] 0.00014136010559
## 
## [[43]]
## [1] 8.58858815376e-06
## 
## [[44]]
## [1] 4.15009836288e-20
## 
## [[45]]
## [1] 1
## 
## [[46]]
## [1] 1
## 
## [[47]]
## [1] 1.65141388885e-48
## 
## [[48]]
## [1] 4.69867500862e-09
## 
## [[49]]
## [1] 1
## 
## [[50]]
## [1] 3.2814691603e-07
## 
## [[51]]
## [1] 1
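#Same tally for the B comparison, plus the states that do differ (sketch):
sum(unlist(results) < 0.05)
rownames(df)[unlist(results) < 0.05]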
#Investigate Pop and TD by party (sums here, means later) and compare

sumPop<-sum(df$Pop)
sumDth<-sum(df$TD)
grPop<-aggregate(df$Pop, by = list(gr=df$Party), FUN = sum)
grDeaths<-aggregate(df$TD, by = list(gr=df$Party), FUN = sum)
grPop #Democrat = 2, larger than Rep
##   gr         x
## 1  1 145921772
## 2  2 182317751
grDeaths #Deaths Rep > Deaths Dem
##   gr      x
## 1  1 479940
## 2  2 533157
#check the death share vs the population share by party (row 1 = Rep; the Dem shares are the complements)
a1<-grDeaths[1,2]/sumDth #observed
a2<-grPop[1,2]/sumPop    #expected
a1 #2=Dem 0.52, 1=Rep 0.47
## [1] 0.473735486335
a2 #2=Dem 0.55, 1=Rep 0.44
## [1] 0.444558810793
#the observed Democrat death share (0.53) is less than the Democrat population share (0.56);
#equivalently, the Republican death share (0.47) exceeds its population share (0.44)
#prop.test arguments: x = observed, n = total deaths, p = expected share from population proportion

prop.test(x=a1, n=sumDth, p=a2, alternative="two.sided") 
## 
##  1-sample proportions test with continuity correction
## 
## data:  a1 out of sumDth, null probability a2
## X-squared = 810849.5712, df = 1, p-value < 2.220446e-16
## alternative hypothesis: true p is not equal to 0.444558810793
## 95 percent confidence interval:
##  0.00000000000e+00 5.54754646524e-06
## sample estimates:
##                 p 
## 4.67611182676e-07
#p < 2.2e-16, so the test rejects H0: observed is not the same as expected. Note, however,
#that x was passed as the proportion a1 rather than a count, so a corrected version using
#the actual death count is sketched below (it should lead to the same conclusion).
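#Corrected sketch: prop.test expects a count for x, so use the gr = 1 death count
#instead of the proportion a1:
prop.test(x = grDeaths[1, 2], n = sumDth, p = a2, alternative = "two.sided")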
#if the distributions are normal we can use parametric tests
#check normality: H0 is normality, so p>0.05 is consistent with normal and p<0.05 means not normal
apply(df[,c(4:7,10)],2, shapiro.test)
## $TC
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.7071079139, p-value = 8.83637606e-09
## 
## 
## $TD
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.7637293276, p-value = 1.14148118e-07
## 
## 
## $Pop
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.7071891254, p-value = 8.86677819e-09
## 
## 
## $WD
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.8958545817, p-value = 0.000306703814
## 
## 
## $BD
## 
##  Shapiro-Wilk normality test
## 
## data:  newX[, i]
## W = 0.7828280348, p-value = 2.94608334e-07
library("ggpubr")
ggboxplot(df, x = "Party", y = c("Pop"), 
          color = "Party", palette = c("#00AFBB", "#E7B800"),
          ylab = "Pop", xlab = "Party")

library(dplyr)
dd<-df %>%
  group_by(Party) %>%
  summarise_each(funs(sum, mean,), Pop, TC, TD)
## Warning: `summarise_each_()` was deprecated in dplyr 0.7.0.
## Please use `across()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
dd  
## # A tibble: 2 x 7
##   Party   Pop_sum   TC_sum TD_sum Pop_mean  TC_mean TD_mean
##   <int>     <int>    <int>  <int>    <dbl>    <dbl>   <dbl>
## 1     1 145921772 39683255 479940 5836871. 1587330.  19198.
## 2     2 182317751 47491813 533157 7012221. 1826608.  20506.
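#The deprecation warnings above suggest across(); an equivalent sketch with current dplyr:
df %>%
  group_by(Party) %>%
  summarise(across(c(Pop, TC, TD), list(sum = sum, mean = mean)))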
#non-parametric test as the data are not normally distributed
pop <- wilcox.test(TD*BD ~ Party, data = df,  #TD*BD gives the black death count per state
                   exact = FALSE)
pop #p>0.05 hence fail to reject H0: black death counts are similar for each party
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  TD * BD by Party
## W = 304, p-value = 0.699299908
## alternative hypothesis: true location shift is not equal to 0
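#The test above is on black death counts (TD*BD); a sketch testing the BD share itself:
wilcox.test(BD ~ Party, data = df, exact = FALSE)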
#Investigate deaths >65: roughly 75% of those aged >65 are W and 10% are B;
#overall, 16% of the US pop is 65 years or older.
#The overall population is W 62% and B 13.4%.
#The numeric values are in the data file; see the column names used in the plot below.

plot(df$Dunder55, type="l",main="Deaths <55, 55 to 65, >65", xaxt="n", ylim=c(0, 100000))
lines(df$`D55-65`, type="l", col="red")
lines(df$D65, type="l", col="blue")
axis(1, 1:nrow(df), labels=rownames(df), cex.axis=0.7)

#note the large discrepancy of the blue line (>65)!
#the two means of the difference between observed deaths and expected deaths based on
#population proportion are shown below
mean(df$diffW) #0.036
## [1] 0.0364705882353
mean(df$diffB) #0.014
## [1] 0.0135294117647
#another way to look at this: 75% of people aged over 65 are W, so we would expect
#roughly 75% of the deaths over 65 to be W and 10% to be B.
#hence 75 - 62 = 13 percentage points more W than their population share, and
#10 - 13.4 = -3.4 points, i.e. fewer B.

#BUT is this what we observe? I can't find observed data to finish the story.