library(readxl)
## Warning: package 'readxl' was built under R version 3.6.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
setwd("E:/mikhilesh/HU Sem VI ANLY 510 and 506/ANLY 506 Doha Exploratory Data Analysis/assignment and project")
data <- read_xlsx("EDA Project data COVID-19 death count.xlsx")
#View(data)
names(data)
## [1] "Data as of"
## [2] "State"
## [3] "Indicator"
## [4] "Non-Hispanic White"
## [5] "Non-Hispanic Black or African American"
## [6] "Non-Hispanic American Indian or Alaska Native"
## [7] "Non-Hispanic Asian"
## [8] "Hispanic or Latino"
## [9] "Other"
## [10] "Footnote"
str(data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 126 obs. of 10 variables:
## $ Data as of : POSIXct, format: "2020-06-03" "2020-06-03" ...
## $ State : chr "United States" "United States" "United States" "Alabama" ...
## $ Indicator : chr "Distribution of COVID-19 deaths (%)" "Weighted distribution of population (%)" "Unweighted distribution of population (%)" "Distribution of COVID-19 deaths (%)" ...
## $ Non-Hispanic White : num 53.2 42.2 60.4 51.5 54.3 65.4 53 54.7 54.4 64.8 ...
## $ Non-Hispanic Black or African American : num 23 17.7 12.5 45.7 38.1 26.5 3.1 5.2 4.4 30.5 ...
## $ Non-Hispanic American Indian or Alaska Native: num 0.5 0.3 0.7 NA 0.4 0.6 21.4 2 4 0 ...
## $ Non-Hispanic Asian : num 5.3 11 5.7 NA 2 1.5 1.8 4.1 3.4 0 ...
## $ Hispanic or Latino : num 16.4 27 18.3 1.8 3.8 4.4 18.4 31.8 31.6 NA ...
## $ Other : num 1.6 1.9 2.4 0 1.4 1.6 2.3 2.2 2.2 NA ...
## $ Footnote : chr NA NA NA "One or more data cells have counts between 1–9 and have been suppressed in accordance with NCHS confidentiality standards." ...
dim(data) #shape(data), glimpse(data), dim() are function to shws how many rows/records and number of variables/columns
## [1] 126 10
summary(data)
## Data as of State Indicator Non-Hispanic White
## Min. :2020-06-03 Length:126 Length:126 Min. :12.70
## 1st Qu.:2020-06-03 Class :character Class :character 1st Qu.:49.27
## Median :2020-06-03 Mode :character Mode :character Median :60.95
## Mean :2020-06-03 Mean :59.97
## 3rd Qu.:2020-06-03 3rd Qu.:72.08
## Max. :2020-06-03 Max. :93.60
## NA's :3
## Non-Hispanic Black or African American
## Min. : 1.400
## 1st Qu.: 8.225
## Median :16.050
## Mean :19.253
## 3rd Qu.:26.575
## Max. :75.700
## NA's :4
## Non-Hispanic American Indian or Alaska Native Non-Hispanic Asian
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.200 1st Qu.: 2.450
## Median : 0.300 Median : 4.000
## Mean : 1.521 Mean : 5.023
## 3rd Qu.: 0.600 3rd Qu.: 6.500
## Max. :45.600 Max. :17.100
## NA's :25 NA's :15
## Hispanic or Latino Other Footnote
## Min. : 1.10 Min. :0.000 Length:126
## 1st Qu.: 6.20 1st Qu.:1.600 Class :character
## Median :10.95 Median :1.900 Mode :character
## Mean :14.36 Mean :2.003
## 3rd Qu.:19.23 3rd Qu.:2.300
## Max. :49.10 Max. :5.800
## NA's :8 NA's :23
names(data)[names(data) == "Non-Hispanic White"] <- "Non.Hispanic.White"
names(data)[names(data) == "Non-Hispanic Black or African American"] <- "Non.Hispanic.Black.or.African.American"
names(data)[names(data) == "Non-Hispanic American Indian or Alaska Native"] <- "Non.Hispanic.American.Indian.or.Alaska.Native"
names(data)[names(data) == "Non-Hispanic Asian"] <- "Non.Hispanic.Asian"
names(data)[names(data) == "Hispanic or Latino"] <- "Hispanic.or.Latino"
data$Non.Hispanic.Black.or.African.American <- ifelse(is.na(data$Non.Hispanic.Black.or.African.American),
ave(data$Non.Hispanic.Black.or.African.American, FUN = function(x)
mean(x, na.rm = TRUE)),
data$Non.Hispanic.Black.or.African.American)
data$Non.Hispanic.American.Indian.or.Alaska.Native <- ifelse(is.na(data$Non.Hispanic.American.Indian.or.Alaska.Native),
ave(data$Non.Hispanic.American.Indian.or.Alaska.Native, FUN = function(x)
mean(x, na.rm = TRUE)),
data$Non.Hispanic.American.Indian.or.Alaska.Native)
data$Non.Hispanic.Asian <- ifelse(is.na(data$Non.Hispanic.Asian),
ave(data$Non.Hispanic.Asian, FUN = function(x)
mean(x, na.rm = TRUE)),
data$Non.Hispanic.Asian)
data$Hispanic.or.Latino <- ifelse(is.na(data$Hispanic.or.Latino),
ave(data$Hispanic.or.Latino, FUN = function(x)
mean(x, na.rm = TRUE)),
data$Hispanic.or.Latino)
data$Other <- ifelse(is.na(data$Other),
ave(data$Other, FUN = function(x)
mean(x, na.rm = TRUE)),
data$Other)
subdata <- data[,c(-1,-10)]
#View(subdata)
subdata <- subdata[which(subdata$Indicator == "Weighted distribution of population (%)"),]
subdata <- subdata[,c(-2,-5,-6,-8)]
summary(subdata)
## State Non.Hispanic.White Non.Hispanic.Black.or.African.American
## Length:42 Min. :27.90 Min. : 2.00
## Class :character 1st Qu.:43.70 1st Qu.: 8.55
## Mode :character Median :57.05 Median :17.95
## Mean :55.09 Mean :19.39
## 3rd Qu.:62.55 3rd Qu.:25.90
## Max. :86.60 Max. :50.10
## Hispanic.or.Latino
## Min. : 2.700
## 1st Qu.: 7.425
## Median :12.700
## Mean :16.486
## 3rd Qu.:22.875
## Max. :47.200
#reshape - long to wide format
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.6.2
longdata <- melt(subdata)
## Using State as id variables
#View(longdata)
summary(longdata)
## State variable value
## Length:126 Non.Hispanic.White :42 Min. : 2.00
## Class :character Non.Hispanic.Black.or.African.American:42 1st Qu.:11.22
## Mode :character Hispanic.or.Latino :42 Median :24.25
## Mean :30.32
## 3rd Qu.:47.02
## Max. :86.60
colnames(longdata)[c(1,2,3)] <- c("State","Race","Percentage") #Using State as id variables
library(moments)
plot(density(longdata$Percentage), main = "Density Plot")
qqnorm(longdata$Percentage)
qqline(longdata$Percentage)
qqnorm(log(longdata$Percentage)) #transformation of data using log() function
qqline(log(longdata$Percentage))
qqnorm(sqrt(longdata$Percentage)) #transformation of data using sqrt() function
qqline(sqrt(longdata$Percentage))
agostino.test(longdata$Percentage)
##
## D'Agostino skewness test
##
## data: longdata$Percentage
## skew = 0.55041, z = 2.50894, p-value = 0.01211
## alternative hypothesis: data have a skewness
shapiro.test(longdata$Percentage)
##
## Shapiro-Wilk normality test
##
## data: longdata$Percentage
## W = 0.92041, p-value = 1.536e-06
anscombe.test(longdata$Percentage)
##
## Anscombe-Glynn kurtosis test
##
## data: longdata$Percentage
## kurt = 2.1196, z = -3.3505, p-value = 0.0008068
## alternative hypothesis: kurtosis is not equal to 3
# par(mfrow = c(2,2))
# Data has a skewness and it's not perfectly normalized. Transformation of data with log() function. However, sample size >20. And The F-test could still be robust to this assumption the sizes of each group are equal.
#Residual plot (lm - linear model)
eruption.lm <- lm(longdata$Percentage ~ factor(longdata$Race), data = longdata)
eruption.lm
##
## Call:
## lm(formula = longdata$Percentage ~ factor(longdata$Race), data = longdata)
##
## Coefficients:
## (Intercept)
## 55.09
## factor(longdata$Race)Non.Hispanic.Black.or.African.American
## -35.70
## factor(longdata$Race)Hispanic.or.Latino
## -38.61
eruption.res <- resid(eruption.lm)
eruption.res
## 1 2 3 4 5 6
## -12.89285714 -0.79285714 -0.39285714 -0.09285714 -27.19285714 7.30714286
## 7 8 9 10 11 12
## 6.80714286 6.60714286 -17.99285714 -23.49285714 -13.09285714 -11.89285714
## 13 14 15 16 17 18
## 4.70714286 24.20714286 14.40714286 14.00714286 -6.59285714 -13.99285714
## 19 20 21 22 23 24
## 13.40714286 4.10714286 13.70714286 -10.19285714 10.30714286 17.00714286
## 25 26 27 28 29 30
## -11.79285714 31.50714286 -6.09285714 -16.99285714 7.50714286 -24.29285714
## 31 32 33 34 35 36
## -0.19285714 11.20714286 5.10714286 15.00714286 2.40714286 7.20714286
## 37 38 39 40 41 42
## 4.30714286 -4.99285714 -21.89285714 -3.29285714 5.80714286 1.50714286
## 43 44 45 46 47 48
## -1.69285714 18.70714286 -14.19285714 14.40714286 -11.79285714 -12.89285714
## 49 50 51 52 53 54
## -7.39285714 2.30714286 25.50714286 -1.19285714 19.00714286 2.90714286
## 55 56 57 58 59 60
## 4.10714286 -12.99285714 -10.69285714 0.50714286 18.80714286 13.40714286
## 61 62 63 64 65 66
## -11.69285714 8.80714286 -6.69285714 30.70714286 5.30714286 -10.89285714
## 67 68 69 70 71 72
## -7.99285714 -17.39285714 -4.99285714 -17.19285714 -8.79285714 3.90714286
## 73 74 75 76 77 78
## 6.90714286 2.90714286 -7.59285714 -15.09285714 3.70714286 -11.09285714
## 79 80 81 82 83 84
## 10.60714286 18.40714286 -2.39285714 -5.79285714 -13.29285714 2.80714286
## 85 86 87 88 89 90
## 10.51428571 -12.68571429 15.31428571 -9.78571429 30.71428571 7.31428571
## 91 92 93 94 95 96
## 2.51428571 -6.58571429 -5.18571429 30.01428571 -5.68571429 8.61428571
## 97 98 99 100 101 102
## -5.58571429 -8.98571429 -2.58571429 -10.88571429 -8.08571429 -2.18571429
## 103 104 105 106 107 108
## -3.78571429 -11.18571429 -9.58571429 -13.78571429 -12.98571429 -3.78571429
## 109 110 111 112 113 114
## 14.91428571 -10.68571429 6.51428571 28.51428571 2.01428571 12.01428571
## 115 116 117 118 119 120
## -5.28571429 -11.18571429 -1.98571429 -2.68571429 -5.48571429 6.01428571
## 121 122 123 124 125 126
## -10.28571429 -9.08571429 25.01428571 -1.48571429 -5.88571429 -2.58571429
plot(longdata$Percentage, eruption.res, ylab = "Residuals", xlab = "Group", main = "Independence of observations(driving school)")
abline(0,0)
#Data Meets the criterion for Independence of Observations
bartlett.test(longdata$Percentage, factor(longdata$Race))
##
## Bartlett test of homogeneity of variances
##
## data: longdata$Percentage and factor(longdata$Race)
## Bartlett's K-squared = 0.5516, df = 2, p-value = 0.759
# Variance are equal. p-value = 0.759 failed to reject the null hypothesis
#Checking variance among groups
tapply(longdata$Percentage,longdata$Race,var)
## Non.Hispanic.White Non.Hispanic.Black.or.African.American
## 182.4846 152.7934
## Hispanic.or.Latino
## 146.8983
#Ratio of largest variance to smallest variance 182.28/146.89 = 1.24 - is way less than 3 (signifies that there is no issue with failing to reject null hypothesis)
#We may have a problem with variance equality as it did not pass the bartlett test and the ratio of largest variance and smallest variance is slightly greater than 3. DELETE AFTER VERIFYING WITH PINWEN
library(car)
## Warning: package 'car' was built under R version 3.6.2
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
scatterplot(longdata$Percentage ~ longdata$Race)
## [1] "89" "94"
#After checking that all the assumptions fulfill our criteria, now we will perform ANOVA
#With only one “Predictor” and more than two groups
#IV - race (predictor) - Race
#DV - mortality (outcome) - Percentage
#Perform the linear model:
summary(aov(Percentage ~ factor(Race), data = longdata)) #(Don't forget to factor()ize your predictor)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(Race) 2 38828 19414 120.8 <2e-16 ***
## Residuals 123 19769 161
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#We can also create a model:
model <- aov(aov(Percentage ~ factor(Race), data = longdata))
model
## Call:
## aov(formula = aov(Percentage ~ factor(Race), data = longdata))
##
## Terms:
## factor(Race) Residuals
## Sum of Squares 38828.34 19769.23
## Deg. of Freedom 2 123
##
## Residual standard error: 12.67775
## Estimated effects may be unbalanced
summary(model)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(Race) 2 38828 19414 120.8 <2e-16 ***
## Residuals 123 19769 161
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#To find out what group differs from the others
library(pgirmess)
## Warning: package 'pgirmess' was built under R version 3.6.3
#Pairwise comparisons using t tests - Bonferroni Method
pairwise.t.test(longdata$Percentage, longdata$Race, paired = FALSE, p.adjust.method = "bonferroni") #method can be "none", "bonferroni", "holm", "hochberg", "hommel", "BH", or "BY“
##
## Pairwise comparisons using t tests with pooled SD
##
## data: longdata$Percentage and longdata$Race
##
## Non.Hispanic.White
## Non.Hispanic.Black.or.African.American <2e-16
## Hispanic.or.Latino <2e-16
## Non.Hispanic.Black.or.African.American
## Non.Hispanic.Black.or.African.American -
## Hispanic.or.Latino 0.89
##
## P value adjustment method: bonferroni
#Kurskal-Wallis:
kruskalmc(Percentage ~ factor(Race), data = longdata)
## Multiple comparison test after Kruskal-Wallis
## p.value: 0.05
## Comparisons
## obs.dif
## Non.Hispanic.White-Non.Hispanic.Black.or.African.American 56.011905
## Non.Hispanic.White-Hispanic.or.Latino 62.380952
## Non.Hispanic.Black.or.African.American-Hispanic.or.Latino 6.369048
## critical.dif
## Non.Hispanic.White-Non.Hispanic.Black.or.African.American 19.07688
## Non.Hispanic.White-Hispanic.or.Latino 19.07688
## Non.Hispanic.Black.or.African.American-Hispanic.or.Latino 19.07688
## difference
## Non.Hispanic.White-Non.Hispanic.Black.or.African.American TRUE
## Non.Hispanic.White-Hispanic.or.Latino TRUE
## Non.Hispanic.Black.or.African.American-Hispanic.or.Latino FALSE
#Tukey’s Test:
TukeyHSD(model)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = aov(Percentage ~ factor(Race), data = longdata))
##
## $`factor(Race)`
## diff lwr
## Non.Hispanic.Black.or.African.American-Non.Hispanic.White -35.700000 -42.26334
## Hispanic.or.Latino-Non.Hispanic.White -38.607143 -45.17048
## Hispanic.or.Latino-Non.Hispanic.Black.or.African.American -2.907143 -9.47048
## upr p adj
## Non.Hispanic.Black.or.African.American-Non.Hispanic.White -29.136663 0.0000000
## Hispanic.or.Latino-Non.Hispanic.White -32.043806 0.0000000
## Hispanic.or.Latino-Non.Hispanic.Black.or.African.American 3.656194 0.5463936
library(pastecs)
## Warning: package 'pastecs' was built under R version 3.6.3
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(compute.es)
## Warning: package 'compute.es' was built under R version 3.6.3
#First get the relevant stats for each factor level (only relevant output pasted below):
by(longdata$Percentage, longdata$Race, stat.desc)
## longdata$Race: Non.Hispanic.White
## nbr.val nbr.null nbr.na min max range
## 42.0000000 0.0000000 0.0000000 27.9000000 86.6000000 58.7000000
## sum median mean SE.mean CI.mean.0.95 var
## 2313.9000000 57.0500000 55.0928571 2.0844354 4.2096027 182.4845819
## std.dev coef.var
## 13.5086854 0.2451985
## ------------------------------------------------------------
## longdata$Race: Non.Hispanic.Black.or.African.American
## nbr.val nbr.null nbr.na min max range
## 42.0000000 0.0000000 0.0000000 2.0000000 50.1000000 48.1000000
## sum median mean SE.mean CI.mean.0.95 var
## 814.5000000 17.9500000 19.3928571 1.9073377 3.8519467 152.7933624
## std.dev coef.var
## 12.3609612 0.6373976
## ------------------------------------------------------------
## longdata$Race: Hispanic.or.Latino
## nbr.val nbr.null nbr.na min max range
## 42.0000000 0.0000000 0.0000000 2.7000000 47.2000000 44.5000000
## sum median mean SE.mean CI.mean.0.95 var
## 692.4000000 12.7000000 16.4857143 1.8701816 3.7769084 146.8983275
## std.dev coef.var
## 12.1201620 0.7351918
#n - sample size, M - mean, SD - standard deviation
#Race: Non.Hispanic.White (n = 42, M = 55.09 , SD = 13.50)
#Race: Non.Hispanic.Black.or.African.American (n = 42, M = 19.39, SD = 12.36)
#Race: Hispanic.or.Latino (n = 42, M = 16.48, SD = 12.12)
#Use these values to compute a few standard Measures of Effect Size (MES or mes) for any pair of interest
#OPtion 1 will compare Race: Non.Hispanic.White and Non.Hispanic.Black.or.African.American ~ Percentage vs Race
mes(19.39, 55.09, 12.36, 13.50, 42, 42)
## Mean Differences ES:
##
## d [ 95 %CI] = -2.76 [ -3.36 , -2.16 ]
## var(d) = 0.09
## p-value(d) = 0
## U3(d) = 0.29 %
## CLES(d) = 2.56 %
## Cliff's Delta = -0.95
##
## g [ 95 %CI] = -2.73 [ -3.32 , -2.14 ]
## var(g) = 0.09
## p-value(g) = 0
## U3(g) = 0.31 %
## CLES(g) = 2.66 %
##
## Correlation ES:
##
## r [ 95 %CI] = -0.81 [ -0.87 , -0.72 ]
## var(r) = 0
## p-value(r) = 0
##
## z [ 95 %CI] = -1.14 [ -1.35 , -0.92 ]
## var(z) = 0.01
## p-value(z) = 0
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.01 [ 0 , 0.02 ]
## p-value(OR) = 0
##
## Log OR [ 95 %CI] = -5 [ -6.09 , -3.92 ]
## var(lOR) = 0.31
## p-value(Log OR) = 0
##
## Other:
##
## NNT = -5
## Total N = 84
#OPtion 2 will compare Race: Non.Hispanic.Black.or.African.American and Hispanic.or.Latino ~ Percentage vs Race
mes(16.48, 19.39, 12.12, 12.36, 42, 42)
## Mean Differences ES:
##
## d [ 95 %CI] = -0.24 [ -0.67 , 0.19 ]
## var(d) = 0.05
## p-value(d) = 0.28
## U3(d) = 40.6 %
## CLES(d) = 43.33 %
## Cliff's Delta = -0.13
##
## g [ 95 %CI] = -0.24 [ -0.66 , 0.19 ]
## var(g) = 0.05
## p-value(g) = 0.28
## U3(g) = 40.69 %
## CLES(g) = 43.39 %
##
## Correlation ES:
##
## r [ 95 %CI] = -0.12 [ -0.33 , 0.1 ]
## var(r) = 0.01
## p-value(r) = 0.28
##
## z [ 95 %CI] = -0.12 [ -0.34 , 0.1 ]
## var(z) = 0.01
## p-value(z) = 0.28
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0.65 [ 0.3 , 1.42 ]
## p-value(OR) = 0.28
##
## Log OR [ 95 %CI] = -0.43 [ -1.21 , 0.35 ]
## var(lOR) = 0.16
## p-value(Log OR) = 0.28
##
## Other:
##
## NNT = -16.73
## Total N = 84
#OPtion 3 will compare Race: Non.Hispanic.White and Hispanic.or.Latino ~ Percentage vs Race
mes(16.48, 55.09, 12.12, 13.50, 42, 42)
## Mean Differences ES:
##
## d [ 95 %CI] = -3.01 [ -3.63 , -2.39 ]
## var(d) = 0.1
## p-value(d) = 0
## U3(d) = 0.13 %
## CLES(d) = 1.67 %
## Cliff's Delta = -0.97
##
## g [ 95 %CI] = -2.98 [ -3.6 , -2.36 ]
## var(g) = 0.1
## p-value(g) = 0
## U3(g) = 0.14 %
## CLES(g) = 1.75 %
##
## Correlation ES:
##
## r [ 95 %CI] = -0.84 [ -0.89 , -0.76 ]
## var(r) = 0
## p-value(r) = 0
##
## z [ 95 %CI] = -1.21 [ -1.43 , -0.99 ]
## var(z) = 0.01
## p-value(z) = 0
##
## Odds Ratio ES:
##
## OR [ 95 %CI] = 0 [ 0 , 0.01 ]
## p-value(OR) = 0
##
## Log OR [ 95 %CI] = -5.46 [ -6.59 , -4.33 ]
## var(lOR) = 0.33
## p-value(Log OR) = 0
##
## Other:
##
## NNT = -5
## Total N = 84
#Effect size (more than or equal to) < = 0.1 is small, 0.25 is medium, 0.4 is large (Cohen, 1988)
#Summary of ANOVA:
Observations from the study were analyzed by conducting a one-way analysis of variance using R version 3.6.1. First, all assumptions are met and only adjustment made is transformation of data using log (and sqrt) function during normality testing. Results suggest that the Race has a significant influence on measure of the Percentage death (F(2, 123) = 120.8, p < .001). Continuing the discussion with specifically which Race experienced the signiificantaly differed measures of the Percentage death, a Tukey’s hoc test was established. The result suggested that there is a significant difference between Race:- Non.Hispanic.White VS Non.Hispanic.Black.or.African.American, and Non.Hispanic.White Vs Hispanic.or.Latino (p-value < 0.001). The effect was large:- Cohen’s D = 3.28 for Race: Non.Hispanic.White and Non.Hispanic.Black.or.African.American, and Cohen’s D = 2.41 for Race: Non.Hispanic.White and Hispanic.or.Latino.