Data as of Mar 28th 2020

Loading the data and basic EDA
library(readxl)
# Read the case line list from the CSV in the working directory.
covid <- read.csv(file = "covid.csv")

# Backup copy of the data, taken before any columns are modified.
co <- covid

# Rows and columns; the data contains 34 attributes.
c(nrow(covid), ncol(covid))
## [1] 117771     34
# Column-by-column overview of types and leading values.
str(covid)
## 'data.frame':    117771 obs. of  34 variables:
##  $ ID                      : Factor w/ 115846 levels "000-1-","000-1-1",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ age                     : Factor w/ 184 levels "","0-10","0-18",..: 1 109 1 1 1 1 1 130 109 69 ...
##  $ sex                     : Factor w/ 3 levels "","female","male": 1 3 1 1 1 1 1 2 3 2 ...
##  $ city                    : Factor w/ 2366 levels "","Aa en Hunze",..: 1 1839 1 1 1 1 1 1839 1839 1839 ...
##  $ province                : Factor w/ 513 levels "","Aargau","Abruzzo",..: 342 495 1 1 1 1 205 495 495 495 ...
##  $ country                 : Factor w/ 106 levels "","Afghanistan",..: 33 103 1 1 1 1 43 103 103 103 ...
##  $ wuhan.0._not_wuhan.1.   : int  1 1 NA NA NA NA 1 1 1 1 ...
##  $ latitude                : num  47.5 48 NA NA NA ...
##  $ longitude               : num  -0.811 -121.696 NA NA NA ...
##  $ geo_resolution          : Factor w/ 7 levels "","admin","admin0",..: 4 5 1 1 1 1 4 5 5 5 ...
##  $ date_onset_symptoms     : Factor w/ 84 levels "","- 25.02.2020",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ date_admission_hospital : Factor w/ 79 levels "","01.01.2020",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ date_confirmation       : Factor w/ 96 levels "","01.02.2020",..: 9 39 1 1 1 1 3 39 39 39 ...
##  $ symptoms                : Factor w/ 354 levels "","19","20","21",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ lives_in_Wuhan          : Factor w/ 4 levels "","no","no, work in Wuhan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ travel_history_dates    : Factor w/ 149 levels "","- 01.03.2020",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ travel_history_location : Factor w/ 482 levels "",";;Iran-;;Qatar",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ reported_market_exposure: Factor w/ 65 levels "","AM","Austria",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ additional_information  : Factor w/ 1732 levels "","\"children\"",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ chronic_disease_binary  : Factor w/ 243 levels "","0","1","AIPAC conference in Washington D.C.",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ chronic_disease         : Factor w/ 31 levels "","\"thought to have had other pre-existing conditions\"",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ source                  : Factor w/ 2770 levels "","(1) http://www.ms.ro/2020/03/03/al-patrulea-caz-de-infectare-cu-noul-coronavirus-in-romania/;  (2) https://www."| __truncated__,..: 2350 2396 1 1 1 1 2765 2396 2396 2396 ...
##  $ sequence_available      : Factor w/ 212 levels "","(a) https://for.ge/view/181718/koronavirusiani-meoTxe-pacientis-mdgomareoba-mZimea---ra-safrTxe-emuqreba-italii"| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ outcome                 : Factor w/ 17 levels "","critical condition, intubated as of 14.02.2020",..: 1 11 1 1 1 1 1 1 1 1 ...
##  $ date_death_or_discharge : Factor w/ 57 levels "","01.02.2020",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ notes_for_discussion    : Factor w/ 62 levels "","Asymptomatic but placed in quarantine: https://www.thestar.com.my/news/nation/2020/01/25/three-chinese-national"| __truncated__,..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ location                : Factor w/ 332 levels "","#N/A","Abu Dhabi",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ admin3                  : Factor w/ 410 levels "","#N/A","Baihe County",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ admin2                  : Factor w/ 877 levels "","#N/A","Abu Dhabi",..: 1 699 1 1 1 1 1 699 699 699 ...
##  $ admin1                  : Factor w/ 370 levels "","#N/A","Aargau",..: 250 355 1 1 1 1 159 355 355 355 ...
##  $ country_new             : Factor w/ 97 levels "","#N/A","19",..: 31 95 1 1 1 1 39 95 95 95 ...
##  $ admin_id                : Factor w/ 1219 levels "","#N/A","#REF!",..: 64 564 1 1 1 1 183 564 564 564 ...
##  $ data_moderator_initials : Factor w/ 5 levels "","DSC","FS",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ travel_history_binary   : int  NA NA NA NA NA NA NA NA NA NA ...
# List the column names of the data frame.
names(covid)
##  [1] "ID"                       "age"                     
##  [3] "sex"                      "city"                    
##  [5] "province"                 "country"                 
##  [7] "wuhan.0._not_wuhan.1."    "latitude"                
##  [9] "longitude"                "geo_resolution"          
## [11] "date_onset_symptoms"      "date_admission_hospital" 
## [13] "date_confirmation"        "symptoms"                
## [15] "lives_in_Wuhan"           "travel_history_dates"    
## [17] "travel_history_location"  "reported_market_exposure"
## [19] "additional_information"   "chronic_disease_binary"  
## [21] "chronic_disease"          "source"                  
## [23] "sequence_available"       "outcome"                 
## [25] "date_death_or_discharge"  "notes_for_discussion"    
## [27] "location"                 "admin3"                  
## [29] "admin2"                   "admin1"                  
## [31] "country_new"              "admin_id"                
## [33] "data_moderator_initials"  "travel_history_binary"

Checking missing data

# Strongly prefer fixed notation over scientific notation when printing.
options(scipen = 99)

# Checking missing values: count NA cells per column, largest count first.
# Only true NA values are counted; empty strings ("" levels in the str()
# output) are not NA and are therefore not included in these totals.
sort(colSums(is.na(covid)), decreasing = TRUE)
##    travel_history_binary    wuhan.0._not_wuhan.1.                 latitude 
##                   117170                    53027                     3332 
##                longitude                      age                      sex 
##                     3332                      234                      191 
## reported_market_exposure     travel_history_dates  travel_history_location 
##                       36                       12                        6 
##   chronic_disease_binary          chronic_disease      date_onset_symptoms 
##                        6                        6                        2 
##           lives_in_Wuhan                       ID                     city 
##                        2                        0                        0 
##                 province                  country           geo_resolution 
##                        0                        0                        0 
##  date_admission_hospital        date_confirmation                 symptoms 
##                        0                        0                        0 
##   additional_information                   source       sequence_available 
##                        0                        0                        0 
##                  outcome  date_death_or_discharge     notes_for_discussion 
##                        0                        0                        0 
##                 location                   admin3                   admin2 
##                        0                        0                        0 
##                   admin1              country_new                 admin_id 
##                        0                        0                        0 
##  data_moderator_initials 
##                        0
# Remove the two columns with the highest NA counts.
covid[c("travel_history_binary", "wuhan.0._not_wuhan.1.")] <- NULL

# This shows that the data contains a substantial number of missing values; however, most of the variables needed for our analysis are in good shape.
Creating a part of a World Map
library(maps)
library(mapdata)
library(mapproj)
library(ggmap)
## Loading required package: ggplot2
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
# World map: outline data for the whole world ...
w <- map_data("world")
# ... and for the four selected countries only.
icj <- map_data(map = "world", region = c("USA", "India", "China", "Japan"))
# Draw the selected countries as filled orange polygons.
ggplot(data = icj, mapping = aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "orange")

# Creating a USA map

# State-level outline data; `w` is reused later when merging in case counts.
w <- map_data("state")

# One fill colour per state, black borders, polyconic projection.
# The legend would list every state, so it is hidden. "none" is used
# instead of the logical form, which is deprecated in ggplot2 and relied on
# the reassignable shorthand `F`.
ggplot(w, aes(x = long, y = lat, group = group, fill = region)) +
  geom_polygon(color = "black") +
  coord_map("polyconic") +
  guides(fill = "none")

Extracting the USA Data

# NOTE(review): this reads the same file that was already loaded into `covid`,
# and `data` is reassigned by merge() further down before it is used.
# `header = TRUE` is spelled out because `T` is an ordinary, reassignable
# variable.
data <- read.csv("covid.csv", header = TRUE)

# COVID data - USA

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Number of US cases per province (state), most affected first.
# filter() keeps only rows where the comparison is TRUE, so rows with a
# missing country are dropped instead of turning into all-NA rows, which is
# what base `[` does with an NA in a logical index.
usa <- covid %>%
  filter(country == "United States") %>%
  group_by(province) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

Merge data

# Lower-case the province names so they can be matched against the
# lower-case `region` values used as the merge key.
usa[["province"]] <- tolower(as.character(usa[["province"]]))

# Inspect the resulting two-column summary.
str(usa)
## Classes 'tbl_df', 'tbl' and 'data.frame':    51 obs. of  2 variables:
##  $ province: chr  "new york" "washington" "new jersey" "california" ...
##  $ count   : int  4154 1006 742 674 520 482 422 331 271 246 ...
# Attach the per-state case count to every vertex of that state's outline.
# Regions without a matching province are dropped (inner join).
data <- merge(w, usa,
              by.x = "region",
              by.y = "province")

# merge() does not promise to keep the input row order, while geom_polygon()
# connects vertices in the order the rows appear. Restore the drawing
# sequence using the `order` column that map_data() provides, otherwise the
# state outlines can come out scrambled.
data <- data[order(data$order), ]

# Choropleth: each state filled according to its case count.
ggplot(data, aes(x = long, y = lat,
                 group = group,
                 fill = count)) +
  geom_polygon(color = "gray") +
  ggtitle("No of Cases in USA by states")