Loading the data and basic EDA
library(readxl)
# Load the raw case line list. stringsAsFactors is set explicitly because the
# read.csv() default differs between R versions; the str() output recorded
# below shows the text columns as factors.
covid <- read.csv("covid.csv", stringsAsFactors = TRUE)
co <- covid # untouched backup copy of the raw data
dim(covid) # rows x columns; the data contains 34 attributes
## [1] 117771 34
# Column types and a preview of the first values in each column
str(covid)
## 'data.frame': 117771 obs. of 34 variables:
## $ ID : Factor w/ 115846 levels "000-1-","000-1-1",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ age : Factor w/ 184 levels "","0-10","0-18",..: 1 109 1 1 1 1 1 130 109 69 ...
## $ sex : Factor w/ 3 levels "","female","male": 1 3 1 1 1 1 1 2 3 2 ...
## $ city : Factor w/ 2366 levels "","Aa en Hunze",..: 1 1839 1 1 1 1 1 1839 1839 1839 ...
## $ province : Factor w/ 513 levels "","Aargau","Abruzzo",..: 342 495 1 1 1 1 205 495 495 495 ...
## $ country : Factor w/ 106 levels "","Afghanistan",..: 33 103 1 1 1 1 43 103 103 103 ...
## $ wuhan.0._not_wuhan.1. : int 1 1 NA NA NA NA 1 1 1 1 ...
## $ latitude : num 47.5 48 NA NA NA ...
## $ longitude : num -0.811 -121.696 NA NA NA ...
## $ geo_resolution : Factor w/ 7 levels "","admin","admin0",..: 4 5 1 1 1 1 4 5 5 5 ...
## $ date_onset_symptoms : Factor w/ 84 levels "","- 25.02.2020",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date_admission_hospital : Factor w/ 79 levels "","01.01.2020",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ date_confirmation : Factor w/ 96 levels "","01.02.2020",..: 9 39 1 1 1 1 3 39 39 39 ...
## $ symptoms : Factor w/ 354 levels "","19","20","21",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ lives_in_Wuhan : Factor w/ 4 levels "","no","no, work in Wuhan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ travel_history_dates : Factor w/ 149 levels "","- 01.03.2020",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ travel_history_location : Factor w/ 482 levels "",";;Iran-;;Qatar",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ reported_market_exposure: Factor w/ 65 levels "","AM","Austria",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ additional_information : Factor w/ 1732 levels "","\"children\"",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ chronic_disease_binary : Factor w/ 243 levels "","0","1","AIPAC conference in Washington D.C.",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ chronic_disease : Factor w/ 31 levels "","\"thought to have had other pre-existing conditions\"",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ source : Factor w/ 2770 levels "","(1) http://www.ms.ro/2020/03/03/al-patrulea-caz-de-infectare-cu-noul-coronavirus-in-romania/; (2) https://www."| __truncated__,..: 2350 2396 1 1 1 1 2765 2396 2396 2396 ...
## $ sequence_available : Factor w/ 212 levels "","(a) https://for.ge/view/181718/koronavirusiani-meoTxe-pacientis-mdgomareoba-mZimea---ra-safrTxe-emuqreba-italii"| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ outcome : Factor w/ 17 levels "","critical condition, intubated as of 14.02.2020",..: 1 11 1 1 1 1 1 1 1 1 ...
## $ date_death_or_discharge : Factor w/ 57 levels "","01.02.2020",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ notes_for_discussion : Factor w/ 62 levels "","Asymptomatic but placed in quarantine: https://www.thestar.com.my/news/nation/2020/01/25/three-chinese-national"| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
## $ location : Factor w/ 332 levels "","#N/A","Abu Dhabi",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ admin3 : Factor w/ 410 levels "","#N/A","Baihe County",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ admin2 : Factor w/ 877 levels "","#N/A","Abu Dhabi",..: 1 699 1 1 1 1 1 699 699 699 ...
## $ admin1 : Factor w/ 370 levels "","#N/A","Aargau",..: 250 355 1 1 1 1 159 355 355 355 ...
## $ country_new : Factor w/ 97 levels "","#N/A","19",..: 31 95 1 1 1 1 39 95 95 95 ...
## $ admin_id : Factor w/ 1219 levels "","#N/A","#REF!",..: 64 564 1 1 1 1 183 564 564 564 ...
## $ data_moderator_initials : Factor w/ 5 levels "","DSC","FS",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ travel_history_binary : int NA NA NA NA NA NA NA NA NA NA ...
# List the column names of the data frame
names(covid)
## [1] "ID" "age"
## [3] "sex" "city"
## [5] "province" "country"
## [7] "wuhan.0._not_wuhan.1." "latitude"
## [9] "longitude" "geo_resolution"
## [11] "date_onset_symptoms" "date_admission_hospital"
## [13] "date_confirmation" "symptoms"
## [15] "lives_in_Wuhan" "travel_history_dates"
## [17] "travel_history_location" "reported_market_exposure"
## [19] "additional_information" "chronic_disease_binary"
## [21] "chronic_disease" "source"
## [23] "sequence_available" "outcome"
## [25] "date_death_or_discharge" "notes_for_discussion"
## [27] "location" "admin3"
## [29] "admin2" "admin1"
## [31] "country_new" "admin_id"
## [33] "data_moderator_initials" "travel_history_binary"
Checking missing data
# Bias number printing strongly toward fixed (non-scientific) notation.
options(scipen = 99)
# Count NA entries per column, largest first.
# NOTE(review): is.na() does not count empty strings; several factor columns in
# the str() output above have "" as a level, so blank entries are not part of
# these totals -- confirm whether they should be treated as missing too.
sort(colSums(is.na(covid)), decreasing = TRUE)
## travel_history_binary wuhan.0._not_wuhan.1. latitude
## 117170 53027 3332
## longitude age sex
## 3332 234 191
## reported_market_exposure travel_history_dates travel_history_location
## 36 12 6
## chronic_disease_binary chronic_disease date_onset_symptoms
## 6 6 2
## lives_in_Wuhan ID city
## 2 0 0
## province country geo_resolution
## 0 0 0
## date_admission_hospital date_confirmation symptoms
## 0 0 0
## additional_information source sequence_available
## 0 0 0
## outcome date_death_or_discharge notes_for_discussion
## 0 0 0
## location admin3 admin2
## 0 0 0
## admin1 country_new admin_id
## 0 0 0
## data_moderator_initials
## 0
# Remove the two columns with by far the most NA entries (117170 and 53027 of
# 117771 rows respectively); every other column has comparatively few NA
# values, so the variables kept for the analysis are in reasonable shape.
covid[["travel_history_binary"]] <- NULL
covid[["wuhan.0._not_wuhan.1."]] <- NULL
Plotting part of a world map
library(maps)
library(mapdata)
library(mapproj)
library(ggmap)
## Loading required package: ggplot2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
# World map: fetch the full outline data, then a subset limited to four
# countries, and draw that subset as filled polygons.
w <- map_data("world")
icj <- map_data(
  "world",
  region = c("USA", "India", "China", "Japan")
)
ggplot(data = icj, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "orange")

# USA map: state outlines, each state filled with its own colour.
# Note that this reassigns `w` (previously the world outline).
w <- map_data("state")
ggplot(w, aes(x = long, y = lat, group = group, fill = region)) +
  geom_polygon(color = "black") +
  coord_map("polyconic") +
  # Hide the fill legend; "none" replaces the deprecated logical form.
  guides(fill = "none")

Merge data
# NOTE(review): `usa` is not created in the visible part of this file; per the
# str() output below it holds one row per province with a case `count`.
# Lower-case the province names, presumably so they match the `region` key of
# the state map used in the merge below -- confirm.
usa$province <- tolower(usa$province)
str(usa)
## Classes 'tbl_df', 'tbl' and 'data.frame': 51 obs. of 2 variables:
## $ province: chr "new york" "washington" "new jersey" "california" ...
## $ count : int 4154 1006 742 674 520 482 422 331 271 246 ...
# Attach the per-state counts to the polygon vertices. This is an inner join:
# rows without a match on the other side are dropped.
data <- merge(w, usa, by.x = "region", by.y = "province")
# merge() does not promise to keep the vertex sequence within a state; restore
# it from the map's `order` column so geom_polygon() traces each outline
# correctly instead of drawing criss-crossing shapes.
data <- data[order(data$order), ]
ggplot(data, aes(x = long, y = lat, group = group, fill = count)) +
  geom_polygon(color = "gray") +
  ggtitle("No of Cases in USA by states")
