DATA 607 Final Project

Introduction

In this project I will analyze the relationship between the Novel Coronavirus (COVID-19)and population around the world, to demonstrate dataprocessing and visualisation with R, tidyverse and ggplot2.

Loading Libraries

library(magrittr) # pipe operations
library(tidyverse) # ggplot2, tidyr, dplyr...

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  3.0.0     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::extract()   masks magrittr::extract()
## x dplyr::filter()    masks stats::filter()
## x dplyr::lag()       masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()

library(gridExtra) # multiple grid-based plots on a page

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(ggforce) # accelerating ggplot2
library(kableExtra) # complex tables

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(leaflet) # map
library(ggrepel)

Data

The data source used for this analysis is from Our World in Data github repository found here: https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.csv

Load Data in R

At first, the dataset, which is a CSV files, is downloaded and saved as local files and then are loaded into R from github.

url <- 'https://raw.githubusercontent.com/Vthomps000/DATA607_VT/Final-Project/owid-covid-data.csv'
COVID19 <- read.csv(url)
c19 <- data.frame(COVID19)
head(c19)

##   iso_code location       date total_cases new_cases total_deaths new_deaths
## 1      ABW    Aruba 2020-03-13           2         2            0          0
## 2      ABW    Aruba 2020-03-20           4         2            0          0
## 3      ABW    Aruba 2020-03-24          12         8            0          0
## 4      ABW    Aruba 2020-03-25          17         5            0          0
## 5      ABW    Aruba 2020-03-26          19         2            0          0
## 6      ABW    Aruba 2020-03-27          28         9            0          0
##   total_cases_per_million new_cases_per_million total_deaths_per_million
## 1                  18.733                18.733                        0
## 2                  37.465                18.733                        0
## 3                 112.395                74.930                        0
## 4                 159.227                46.831                        0
## 5                 177.959                18.733                        0
## 6                 262.256                84.296                        0
##   new_deaths_per_million total_tests new_tests total_tests_per_thousand
## 1                      0          NA        NA                       NA
## 2                      0          NA        NA                       NA
## 3                      0          NA        NA                       NA
## 4                      0          NA        NA                       NA
## 5                      0          NA        NA                       NA
## 6                      0          NA        NA                       NA
##   new_tests_per_thousand tests_units population population_density median_age
## 1                     NA                 106766              584.8       41.2
## 2                     NA                 106766              584.8       41.2
## 3                     NA                 106766              584.8       41.2
## 4                     NA                 106766              584.8       41.2
## 5                     NA                 106766              584.8       41.2
## 6                     NA                 106766              584.8       41.2
##   aged_65_older aged_70_older gdp_per_capita extreme_poverty cvd_death_rate
## 1        13.085         7.452       35973.78              NA             NA
## 2        13.085         7.452       35973.78              NA             NA
## 3        13.085         7.452       35973.78              NA             NA
## 4        13.085         7.452       35973.78              NA             NA
## 5        13.085         7.452       35973.78              NA             NA
## 6        13.085         7.452       35973.78              NA             NA
##   diabetes_prevalence female_smokers male_smokers handwashing_facilities
## 1               11.62             NA           NA                     NA
## 2               11.62             NA           NA                     NA
## 3               11.62             NA           NA                     NA
## 4               11.62             NA           NA                     NA
## 5               11.62             NA           NA                     NA
## 6               11.62             NA           NA                     NA
##   hospital_beds_per_100k
## 1                     NA
## 2                     NA
## 3                     NA
## 4                     NA
## 5                     NA
## 6                     NA

glimpse(c19)

## Rows: 17,013
## Columns: 29
## $ iso_code                 <fct> ABW, ABW, ABW, ABW, ABW, ABW, ABW, ABW, AB...
## $ location                 <fct> Aruba, Aruba, Aruba, Aruba, Aruba, Aruba, ...
## $ date                     <fct> 2020-03-13, 2020-03-20, 2020-03-24, 2020-0...
## $ total_cases              <int> 2, 4, 12, 17, 19, 28, 28, 28, 50, 55, 55, ...
## $ new_cases                <int> 2, 2, 8, 5, 2, 9, 0, 0, 22, 5, 0, 5, 2, 2,...
## $ total_deaths             <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ new_deaths               <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ total_cases_per_million  <dbl> 18.733, 37.465, 112.395, 159.227, 177.959,...
## $ new_cases_per_million    <dbl> 18.733, 18.733, 74.930, 46.831, 18.733, 84...
## $ total_deaths_per_million <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, ...
## $ new_deaths_per_million   <dbl> 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, ...
## $ total_tests              <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ new_tests                <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ total_tests_per_thousand <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ new_tests_per_thousand   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ tests_units              <fct> , , , , , , , , , , , , , , , , , , , , , ...
## $ population               <dbl> 106766, 106766, 106766, 106766, 106766, 10...
## $ population_density       <dbl> 584.8, 584.8, 584.8, 584.8, 584.8, 584.8, ...
## $ median_age               <dbl> 41.2, 41.2, 41.2, 41.2, 41.2, 41.2, 41.2, ...
## $ aged_65_older            <dbl> 13.085, 13.085, 13.085, 13.085, 13.085, 13...
## $ aged_70_older            <dbl> 7.452, 7.452, 7.452, 7.452, 7.452, 7.452, ...
## $ gdp_per_capita           <dbl> 35973.78, 35973.78, 35973.78, 35973.78, 35...
## $ extreme_poverty          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ cvd_death_rate           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ diabetes_prevalence      <dbl> 11.62, 11.62, 11.62, 11.62, 11.62, 11.62, ...
## $ female_smokers           <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ male_smokers             <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ handwashing_facilities   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ hospital_beds_per_100k   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...

The dataset has 171013 rows and 29 columns shown above.

Data Cleaning

For this analysis I will be specifically looking at location,total_cases, and population.

c19_2<- c19 %>%
  select(location, total_cases, population) %>%
  arrange(location)

The csv file entries were configured by date, causing duplicate entries for location. I collapsed all row entries by the number of total_cases.

c19_3 <- count(c19_2, location, wt= total_cases)
names(c19_3)[names(c19_3) == "n"] <- "cases"

Merging Datasets

Here, I merged the dataframes to regain the population column. Next I removed and replaced, total_cases with the column cases I created above.

c19_4 <- merge(c19_2,c19_3, all.x = TRUE, all.y = TRUE)

c19_5 <- c19_4 %>%
  select(-total_cases)

Here, I removed all the duplicate entries and row names.

df_new <- unique(c19_5)

rownames(df_new) <- c()

Now, I have a tidied dataset to work with.

kable(df_new) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

location	population	cases
Afghanistan	38928341	70317
Albania	2877800	27856
Algeria	43851043	135473
Andorra	77265	30959
Angola	32866268	1105
Anguilla	15002	137
Antigua and Barbuda	97928	965
Argentina	45195777	149894
Armenia	2963234	75662
Aruba	106766	4084
Australia	25499881	310390
Austria	9006400	691090
Azerbaijan	10139175	63793
Bahamas	393248	2780
Bahrain	1701583	106630
Bangladesh	164689383	216405
Barbados	287371	3338
Belarus	9449321	387860
Belgium	11589616	1699109
Belize	397621	645
Benin	12123198	3486
Bermuda	62273	3737
Bhutan	771612	306
Bolivia	11673029	40930
Bonaire Sint Eustatius and Saba	26221	191
Bosnia and Herzegovina	3280815	59786
Botswana	2351625	719
Brazil	212559409	2657049
British Virgin Islands	30237	215
Brunei	437483	7460
Bulgaria	6948445	50992
Burkina Faso	20903278	25438
Burundi	11890781	447
Cambodia	16718971	6305
Cameroon	26545864	58769
Canada	37742157	1733054
Cape Verde	555988	3921
Cayman Islands	65720	2660
Central African Republic	4829764	1985
Chad	16425859	3451
Chile	19116209	571538
China	1439323774	7741044
Colombia	50882884	220423
Comoros	869595	81
Congo	5518092	6912
Costa Rica	5094114	30125
Cote d’Ivoire	26378275	40866
Croatia	4105268	81633
Cuba	11326616	46900
Curacao	164100	668
Cyprus	875899	32513
Czech Republic	10708982	306744
Democratic Republic of Congo	89561404	19959
Denmark	5792203	348720
Djibouti	988002	30280
Dominica	71991	732
Dominican Republic	10847904	230499
Ecuador	17643060	669292
Egypt	102334403	183290
El Salvador	6486201	14320
Equatorial Guinea	1402985	7725
Eritrea	3546427	1560
Estonia	1326539	69009
Ethiopia	114963583	4951
Faeroe Islands	48865	9428
Falkland Islands	3483	400
Fiji	896444	740
Finland	5540718	179980
France	65273512	5060467
French Polynesia	280904	2597
Gabon	2225728	9537
Gambia	2416664	489
Georgia	3989175	18337
Germany	83783945	6377123
Ghana	31072945	65624
Gibraltar	33691	5924
Greece	10423056	109731
Greenland	56772	534
Grenada	112519	734
Guam	168783	6140
Guatemala	17915567	18371
Guernsey	NA	9589
Guinea	13132792	38707
Guinea-Bissau	1967998	7713
Guyana	786559	2813
Haiti	11402533	3067
Honduras	9904608	33022
Hong Kong	7496988	0
Hungary	9660350	93043
Iceland	341250	83094
India	1380004385	1117151
Indonesia	273523621	327931
International	NA	11737
Iran	83992953	4012174
Iraq	40222503	78618
Ireland	4937796	657036
Isle of Man	85032	11292
Israel	8655541	590112
Italy	60461828	8719677
Jamaica	2961161	11327
Japan	126476458	472469
Jersey	NA	10424
Jordan	10203140	20023
Kazakhstan	18776707	103799
Kenya	53771300	14502
Kosovo	NA	24495
Kuwait	4270563	136477
Kyrgyzstan	6524191	25419
Laos	7275556	775
Latvia	1886202	34390
Lebanon	6825442	33418
Liberia	5057677	4313
Libya	6871287	1971
Liechtenstein	38137	4122
Lithuania	2722291	54831
Luxembourg	625976	162754
Macedonia	2083380	50292
Madagascar	27691019	5345
Malawi	19129955	1080
Malaysia	32365998	250652
Maldives	540542	10730
Mali	20250834	13595
Malta	441539	18902
Mauritania	4649660	333
Mauritius	1271767	13728
Mexico	128932753	586738
Moldova	4033963	119559
Monaco	39244	4200
Mongolia	3278292	1420
Montenegro	628062	12785
Montserrat	4999	447
Morocco	36910558	141238
Mozambique	31255435	2150
Myanmar	54409794	4505
Namibia	2540916	750
Nepal	29136808	2105
Netherlands	17134873	1442888
New Caledonia	285491	885
New Zealand	4822233	48800
Nicaragua	6624554	492
Niger	24206636	24561
Nigeria	206139587	61827
Northern Mariana Islands	57557	548
Norway	5421242	340647
Oman	5106622	69941
Pakistan	220892331	554032
Palestine	5101416	16893
Panama	4314768	218704
Papua New Guinea	8947027	246
Paraguay	7132530	12025
Peru	32971846	1092421
Philippines	109581085	296471
Poland	37846605	430155
Portugal	10196707	899217
Puerto Rico	2860840	53363
Qatar	2881060	394270
Romania	19237682	408618
Russia	145934460	3208483
Rwanda	12952209	8087
Saint Kitts and Nevis	53192	597
Saint Lucia	183629	722
Saint Vincent and the Grenadines	110947	546
San Marino	33938	22898
Sao Tome and Principe	219161	1963
Saudi Arabia	34813867	670642
Senegal	16743930	31122
Serbia	6804596	273044
Seychelles	98340	565
Sierra Leone	7976985	3941
Singapore	5850343	448956
Sint Maarten (Dutch part)	42882	2558
Slovakia	5459643	49033
Slovenia	2078932	62685
Somalia	15893219	15924
South Africa	59308690	198463
South Korea	51269183	709850
South Sudan	11193729	1354
Spain	46754783	8362973
Sri Lanka	21413250	20098
Sudan	43849269	14801
Suriname	586634	504
Swaziland	1160164	2590
Sweden	10099270	713499
Switzerland	8654618	1268153
Syria	17500657	1514
Taiwan	23816775	21662
Tajikistan	9537642	4637
Tanzania	59734213	10595
Thailand	69799978	128736
Timor	1318442	663
Togo	8278737	4333
Trinidad and Tobago	1399491	5482
Tunisia	11818618	37468
Turkey	84339067	3745928
Turks and Caicos Islands	38718	454
Uganda	45741000	3235
Ukraine	43733759	306585
United Arab Emirates	9890400	374047
United Kingdom	67886004	5608894
United States	331002647	35598223
United States Virgin Islands	104423	2485
Uruguay	3473727	25980
Uzbekistan	33469199	63850
Vatican	809	430
Venezuela	28435943	12679
Vietnam	97338583	14186
Western Sahara	597330	108
World	7794798729	118037984
Yemen	29825968	385
Zambia	18383956	4257
Zimbabwe	14862927	1079

Data Analysis and Visualization

I wanted to take a look at the trends of all the available countries in the data set. However, that contained a large number of rows, so I decided to rank the top 20 countries by both cases, and population for comparison.

ggplot(df_new, aes(x= log(population) , y= log(cases), label=location))+
  geom_point() +geom_text(aes(label=location),hjust=0, vjust=0, size=4)

## Warning: Removed 4 rows containing missing values (geom_point).

## Warning: Removed 4 rows containing missing values (geom_text).

linearMod <- lm(population ~ cases, data=df_new)  # build linear regression model on full data
print(linearMod)

## 
## Call:
## lm(formula = population ~ cases, data = df_new)
## 
## Coefficients:
## (Intercept)        cases  
##   5.785e+06    6.088e+01

cor.test(df_new$population, df_new$cases)

## 
##  Pearson's product-moment correlation
## 
## data:  df_new$population and df_new$cases
## t = 39.608, df = 205, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9223627 0.9544103
## sample estimates:
##       cor 
## 0.9404415

If the p Value is less than 0.05 and the cor test reult is 0.9404415, we reject the null hypothesis that the true correlation is zero (i.e. they are independent). Here, we reject the null hypothesis and conclude that cases is dependent on population.

AIC(linearMod)

## [1] 8483.602

BIC(linearMod)

## [1] 8493.6

The Akaike’s information criterion - AIC (Akaike, 1974) and the Bayesian information criterion - BIC (Schwarz, 1978) are measures of the goodness of fit of an estimated statistical model and can also be used for model selection. Both criteria depend on the maximized value of the likelihood function L for the estimated model. Here, they are both quite high, indicating a poor fit.

summary(linearMod)

## 
## Call:
## lm(formula = population ~ cases, data = df_new)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.842e+09 -6.241e+06 -4.507e+06  9.102e+06  1.306e+09 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.785e+06  1.333e+07   0.434    0.665    
## cases       6.088e+01  1.537e+00  39.608   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 190100000 on 205 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.8844, Adjusted R-squared:  0.8839 
## F-statistic:  1569 on 1 and 205 DF,  p-value: < 2.2e-16

The mean of residuals is close to 0.

mod <- lm(population ~ cases, data=df_new)
mean(mod$residuals)

## [1] -3.07193e-09

mod2 <- lm(population ~ cases, data=df_new[1:211, ])  #  linear model
plot(mod2)

The Q-Q plot the points fall along a line in the middle of the graph, but curve off in the extremities, this indicates that the data has more extreme values than would be expected if they truly came from a Normal distribution. The distribution appears not normal.

Based on the linear model, the slope is positive, indicating a relationship between the population and number of cases. The \(R^2\) is 0.8844, adjusted \(R^2\) is 0.8839, the p value is far less than 0.5. However, there are many outliers in both variables, that standard error is very high, and 4 variables were deleted due to NA values..

outlier_values <- boxplot.stats(df_new$cases)$out
outlier_values

##  [1]    691090    387860   1699109   2657049   1733054    571538   7741044
##  [8]    669292   5060467   6377123   1117151   4012174    657036    590112
## [15]   8719677    472469    586738   1442888    554032   1092421    430155
## [22]    899217    394270    408618   3208483    670642    448956    709850
## [29]   8362973    713499   1268153   3745928   5608894  35598223 118037984

outlier_values <- boxplot.stats(df_new$population)$out
outlier_values

##  [1]  164689383  212559409 1439323774   89561404  102334403  114963583
##  [7]   83783945 1380004385  273523621   83992953  126476458  128932753
## [13]  206139587  220892331  109581085  145934460   69799978   84339067
## [19]   67886004  331002647   97338583 7794798729

Top 20 Countries Ranked

## ranking by confirmed cases
df_20 <- df_new %>% 
select(location, population, cases) %>%
mutate(ranking = dense_rank(desc(cases)))
k <- 20
## top 20 countries: 21 incl. 'World'
top.countries <- df_20 %>% filter(ranking <= k + 1) %>%
arrange(ranking)

kable(top.countries) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

location	population	cases	ranking
World	7794798729	118037984	1
United States	331002647	35598223	2
Italy	60461828	8719677	3
Spain	46754783	8362973	4
China	1439323774	7741044	5
Germany	83783945	6377123	6
United Kingdom	67886004	5608894	7
France	65273512	5060467	8
Iran	83992953	4012174	9
Turkey	84339067	3745928	10
Russia	145934460	3208483	11
Brazil	212559409	2657049	12
Canada	37742157	1733054	13
Belgium	11589616	1699109	14
Netherlands	17134873	1442888	15
Switzerland	8654618	1268153	16
India	1380004385	1117151	17
Peru	32971846	1092421	18
Portugal	10196707	899217	19
Sweden	10099270	713499	20
South Korea	51269183	709850	21

## ranking by population
df2_20 <- df_new %>% 
select(location, population, cases) %>%
mutate(ranking = dense_rank(desc(population)))
k <- 20
## top 20 countries: 21 incl. 'World'
top.countriespop <- df2_20 %>% filter(ranking <= k + 1) %>%
arrange(ranking)

kable(top.countriespop) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

location	population	cases	ranking
World	7794798729	118037984	1
China	1439323774	7741044	2
India	1380004385	1117151	3
United States	331002647	35598223	4
Indonesia	273523621	327931	5
Pakistan	220892331	554032	6
Brazil	212559409	2657049	7
Nigeria	206139587	61827	8
Bangladesh	164689383	216405	9
Russia	145934460	3208483	10
Mexico	128932753	586738	11
Japan	126476458	472469	12
Ethiopia	114963583	4951	13
Philippines	109581085	296471	14
Egypt	102334403	183290	15
Vietnam	97338583	14186	16
Democratic Republic of Congo	89561404	19959	17
Turkey	84339067	3745928	18
Iran	83992953	4012174	19
Germany	83783945	6377123	20
Thailand	69799978	128736	21

ggplot(top.countries, aes(x= log(population) , y= log(cases), label=location))+
  geom_point() +geom_text(aes(label=location),hjust=0, vjust=0, size=4)

ggplot(top.countriespop, aes(x= log(population) , y= log(cases), label=location))+
  geom_point() +geom_text(aes(label=location),hjust=0, vjust=0, size=4)

ggplot()+
  geom_point(data = top.countries, aes(x = log(population), y = log(cases), color = "blue"))+
  geom_text(data = top.countries, aes(x = log(population), y = log(cases), label= location, color = "blue"))+
  geom_point(data = top.countriespop, aes(x = log(population), y = log(cases), color = "red"), show.legend = TRUE)+
  geom_text(data = top.countriespop, aes(x = log(population), y = log(cases), label= location, color = "red"))+
  scale_color_manual(values = c("blue","red"), labels = c("top.countries","top.countriespop"), name = "")

Conclusion

Based on the graphs and tables above there does appear to be some relationship between the population and the number of worlwide COVID-19 cases. However, this is just an observational study, as many variables were removed. Due to several other factors, including pre-existing health conditions and age, etc., we cannot confirm this hypothesis.