# 4.a
# Load covid 19 data from `/data/covid_19_data.csv`, format as needed
library(readr)
# Read the raw CSV. ObservationDate is parsed as a Date from its
# month/day/year text form; no other column type is specified here.
coronaData <- read_csv(
  file = "data/covid_19_data.csv",
  col_types = cols(
    ObservationDate = col_date(format = "%m/%d/%Y")
  )
)
# Cant put `Confirmed` as int?, return Nah
# View(coronaData)
# Build a row key from country and province, joined with '.', and store it
# as the (last) column `ID` of coronaData.
# Open question kept from the original notes: does a '_' separator break this?
coronaData[["ID"]] <- paste(
  coronaData[["Country/Region"]],
  coronaData[["Province/State"]],
  sep = "."
)
# Earlier attempt (kept for reference): assign each column to a plain vector.
# for(col in colnames(coronaData)) { assign(col, coronaData[[col]])}
# View(`Country/Region`)
# Plain vectors cannot be opened with View(), so data frames are used instead.
#
# For every column except the trailing ID column, create a two-column data
# frame (ID + that column) and bind it to a variable named after the column,
# e.g. `Confirmed` ends up holding the columns ID and Confirmed.
for (col in head(names(coronaData), -1L)) {
  tmpFrame <- setNames(
    data.frame(coronaData[["ID"]], coronaData[[col]]),
    c("ID", col)
  )
  assign(col, tmpFrame)
}
# Use unique(ID) to compress frame?
# New created dataframes have no link?
# Recompute the country/province key column `ID`. This is the same
# expression as the earlier ID assignment, so the values do not change.
coronaData[["ID"]] <- with(
  coronaData,
  paste(`Country/Region`, `Province/State`, sep = ".")
)
# if separator = '_', the code break?
# Number of columns in coronaData (includes the derived ID column)
ncol(coronaData)
## [1] 9
# Number of rows (observations) in coronaData
nrow(coronaData)
## [1] 306429
# Preview the first 10 rows of coronaData
head(coronaData, 10)
## # A tibble: 10 × 9
## SNo ObservationDate Province…¹ Count…² Last …³ Confi…⁴ Deaths Recov…⁵ ID
## <dbl> <date> <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 1 2020-01-22 Anhui Mainla… 1/22/2… 1 0 0 Main…
## 2 2 2020-01-22 Beijing Mainla… 1/22/2… 14 0 0 Main…
## 3 3 2020-01-22 Chongqing Mainla… 1/22/2… 6 0 0 Main…
## 4 4 2020-01-22 Fujian Mainla… 1/22/2… 1 0 0 Main…
## 5 5 2020-01-22 Gansu Mainla… 1/22/2… 0 0 0 Main…
## 6 6 2020-01-22 Guangdong Mainla… 1/22/2… 26 0 0 Main…
## 7 7 2020-01-22 Guangxi Mainla… 1/22/2… 2 0 0 Main…
## 8 8 2020-01-22 Guizhou Mainla… 1/22/2… 1 0 0 Main…
## 9 9 2020-01-22 Hainan Mainla… 1/22/2… 4 0 0 Main…
## 10 10 2020-01-22 Hebei Mainla… 1/22/2… 1 0 0 Main…
## # … with abbreviated variable names ¹`Province/State`, ²`Country/Region`,
## # ³`Last Update`, ⁴Confirmed, ⁵Recovered
# List the column names of coronaData
names(coronaData)
## [1] "SNo" "ObservationDate" "Province/State" "Country/Region"
## [5] "Last Update" "Confirmed" "Deaths" "Recovered"
## [9] "ID"
# Get country from coronaData
#print(`Country/Region`)
# Largest single Confirmed value in the data set. `Confirmed` is the
# ID/Confirmed data frame produced by the per-column loop earlier in the file.
maxConfirmedCases <- max(Confirmed[["Confirmed"]])
print(maxConfirmedCases)
## [1] 5863138
# Rows whose Country/Region is 'Mainland China'
coronaChina <- coronaData[
  which(coronaData[["Country/Region"]] == "Mainland China"),
]
# Abandoned idea: create one data frame per country name in a loop.
# for(countryName in `Country/Region`) { assign(paste(countryName, "data"), coronaData[which(coronaData$`Country/Region` == countryName),])}
# That loop was too heavy to run; another approach is needed.
# Country/Region of the row(s) that hold the overall maximum Confirmed value
maxCountryConfirmedCorona <- coronaData[
  which(coronaData[["Confirmed"]] == maxConfirmedCases),
]["Country/Region"]
# Province/State of the row(s) that hold the overall maximum Confirmed value
maxStateConfirmedCorona <- coronaData[
  which(coronaData[["Confirmed"]] == maxConfirmedCases),
]["Province/State"]
# Observations dated within January 2020 (1st to 31st, inclusive)
data_jan <- coronaData[
  which(
    coronaData[["ObservationDate"]] >= "2020-01-01" &
      coronaData[["ObservationDate"]] <= "2020-01-31"
  ),
]
# 4.b
# All observations whose Country/Region is 'Vietnam'
coronaVietnam <- coronaData[
  which(coronaData[["Country/Region"]] == "Vietnam"),
]
# 4.c
# Highest Confirmed value recorded for Vietnam
print(max(coronaVietnam[["Confirmed"]]))
## [1] 6908
# 4.d
# Vietnam observations from 2021-01-01 through 2021-02-28 (inclusive)
coronaVietnamJanFeb2021 <- coronaVietnam[
  which(
    coronaVietnam[["ObservationDate"]] >= as.Date("2021-01-01") &
      coronaVietnam[["ObservationDate"]] <= as.Date("2021-02-28")
  ),
]
# use subset ?
# 4.e
# Highest Confirmed value for Vietnam within that January-February window
print(max(coronaVietnamJanFeb2021[["Confirmed"]]))
## [1] 2448
# 4.f
# Same as 4.e, for Indonesia and the Philippines: subset by country, keep
# 2021-01-01 through 2021-02-28, then print the highest Confirmed value.
coronaIndonesia <- coronaData[
  which(coronaData[["Country/Region"]] == "Indonesia"),
]
coronaIndonesiaJanFeb2021 <- coronaIndonesia[
  which(
    coronaIndonesia[["ObservationDate"]] >= as.Date("2021-01-01") &
      coronaIndonesia[["ObservationDate"]] <= as.Date("2021-02-28")
  ),
]
print(max(coronaIndonesiaJanFeb2021[["Confirmed"]]))
## [1] 1334634
coronaPhilippines <- coronaData[
  which(coronaData[["Country/Region"]] == "Philippines"),
]
coronaPhilippinesJanFeb2021 <- coronaPhilippines[
  which(
    coronaPhilippines[["ObservationDate"]] >= as.Date("2021-01-01") &
      coronaPhilippines[["ObservationDate"]] <= as.Date("2021-02-28")
  ),
]
print(max(coronaPhilippinesJanFeb2021[["Confirmed"]]))
## [1] 576352
# 4.g
# Mainland China observations from 2021-02-01 through 2021-02-15 (inclusive)
coronaChina15Feb <- coronaChina[
  which(
    coronaChina[["ObservationDate"]] >= as.Date("2021-02-01") &
      coronaChina[["ObservationDate"]] <= as.Date("2021-02-15")
  ),
]
print(coronaChina15Feb)
## # A tibble: 480 × 9
## SNo ObservationDate Provinc…¹ Count…² Last …³ Confi…⁴ Deaths Recov…⁵ ID
## <dbl> <date> <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr>
## 1 216354 2021-02-01 Anhui Mainla… 2021-0… 994 6 987 Main…
## 2 216388 2021-02-01 Beijing Mainla… 2021-0… 1039 9 977 Main…
## 3 216438 2021-02-01 Chongqing Mainla… 2021-0… 591 6 584 Main…
## 4 216480 2021-02-01 Fujian Mainla… 2021-0… 541 1 525 Main…
## 5 216485 2021-02-01 Gansu Mainla… 2021-0… 187 2 181 Main…
## 6 216503 2021-02-01 Guangdong Mainla… 2021-0… 2127 8 2077 Main…
## 7 216504 2021-02-01 Guangxi Mainla… 2021-0… 267 2 263 Main…
## 8 216507 2021-02-01 Guizhou Mainla… 2021-0… 147 2 145 Main…
## 9 216510 2021-02-01 Hainan Mainla… 2021-0… 171 6 165 Main…
## 10 216516 2021-02-01 Hebei Mainla… 2021-0… 1313 7 976 Main…
## # … with 470 more rows, and abbreviated variable names ¹`Province/State`,
## # ²`Country/Region`, ³`Last Update`, ⁴Confirmed, ⁵Recovered
# 4.h
# Mainland China observations for February 2021 (1st to 28th, inclusive),
# followed by a count of rows per Province/State.
coronaChinaFeb <- coronaChina[
  which(
    coronaChina[["ObservationDate"]] >= as.Date("2021-02-01") &
      coronaChina[["ObservationDate"]] <= as.Date("2021-02-28")
  ),
]
library(gridExtra)
library(grid)
table(coronaChinaFeb[["Province/State"]])
##
## Anhui Beijing Chongqing Fujian Gansu
## 28 28 28 28 28
## Guangdong Guangxi Guizhou Hainan Hebei
## 28 28 28 28 28
## Heilongjiang Henan Hubei Hunan Inner Mongolia
## 28 28 28 28 28
## Jiangsu Jiangxi Jilin Liaoning Ningxia
## 28 28 28 28 28
## Qinghai Shaanxi Shandong Shanxi Shanghai
## 28 28 28 28 28
## Sichuan Tianjin Tibet Unknown Xinjiang
## 28 28 28 28 28
## Yunnan Zhejiang
## 28 28
# another way to get coronaChina in feb 2021
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# coronaChinaFeb = filter(coronaData, Country/Region == "China", year(ObservationDate) == 2021, month(ObservationDate) == 2)
# 4.i
# Change in Confirmed per province across February 2021 for Mainland China:
# value on the latest observation date minus value on the earliest one.
chinaGroupedData <- group_by(coronaChinaFeb, `Province/State`)
# first()/last() are tied to ObservationDate via `order_by`, so the result
# no longer depends on the rows happening to be stored in date order.
chinaLastDayData <- summarise(
  chinaGroupedData,
  confirmed = last(Confirmed, order_by = ObservationDate),
  ObservationDate = max(ObservationDate)
)
chinaFirstDayData <- summarise(
  chinaGroupedData,
  confirmed = first(Confirmed, order_by = ObservationDate),
  ObservationDate = min(ObservationDate)
)
# After the join, the `.x` columns come from the last-day summary and the
# `.y` columns from the first-day summary.
chinaCombinedData <- inner_join(
  chinaLastDayData,
  chinaFirstDayData,
  by = "Province/State"
)
chinaCombinedData$confirmedDiff <-
  chinaCombinedData$confirmed.x - chinaCombinedData$confirmed.y
# Cross-tabulate province against its difference.
table(chinaCombinedData$`Province/State`, chinaCombinedData$confirmedDiff)
##
## 0 1 2 3 4 5 7 10 11 12 18 19 29 85 91
## Anhui 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Beijing 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## Chongqing 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Fujian 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
## Gansu 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Guangdong 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
## Guangxi 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Guizhou 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Hainan 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Hebei 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## Heilongjiang 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
## Henan 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## Hubei 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## Hunan 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## Inner Mongolia 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## Jiangsu 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## Jiangxi 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Jilin 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## Liaoning 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## Ningxia 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Qinghai 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Shaanxi 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## Shandong 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
## Shanxi 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
## Shanghai 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## Sichuan 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## Tianjin 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
## Tibet 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Unknown 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Xinjiang 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## Yunnan 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
## Zhejiang 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
# Two-column summary: province name and its February 2021 Confirmed difference
chinaDiffProvinceFeb2021 <- data.frame(
  Province = chinaCombinedData[["Province/State"]],
  Diff = chinaCombinedData[["confirmedDiff"]]
)
# table(chinaDiffProvinceFeb2021)
#print(chinaDiffProvinceFeb2021)
# 4.k
# Deaths column of the Mainland China rows for 2021-02-01 .. 2021-02-15
# (one value per row of coronaChina15Feb)
print(coronaChina15Feb[["Deaths"]])
## [1] 6 9 6 1 2 8 2 2 6 7 13 22 4512 4 1
## [16] 0 1 3 2 0 0 3 7 7 0 3 3 0 0 3
## [31] 2 1 6 9 6 1 2 8 2 2 6 7 13 22 4512
## [46] 4 1 0 1 3 2 0 0 3 7 7 0 3 3 0
## [61] 0 3 2 1 6 9 6 1 2 8 2 2 6 7 13
## [76] 22 4512 4 1 0 1 3 2 0 0 3 7 7 0 3
## [91] 3 0 0 3 2 1 6 9 6 1 2 8 2 2 6
## [106] 7 13 22 4512 4 1 0 1 3 2 0 0 3 7 7
## [121] 0 3 3 0 0 3 2 1 6 9 6 1 2 8 2
## [136] 2 6 7 13 22 4512 4 1 0 1 3 2 0 0 3
## [151] 7 7 0 3 3 0 0 3 2 1 6 9 6 1 2
## [166] 8 2 2 6 7 13 22 4512 4 1 0 1 3 2 0
## [181] 0 3 7 7 0 3 3 0 0 3 2 1 6 9 6
## [196] 1 2 8 2 2 6 7 13 22 4512 4 1 0 1 3
## [211] 2 0 0 3 7 7 0 3 3 0 0 3 2 1 6
## [226] 9 6 1 2 8 2 2 6 7 13 22 4512 4 1 0
## [241] 1 3 2 0 0 3 7 7 0 3 3 0 0 3 2
## [256] 1 6 9 6 1 2 8 2 2 6 7 13 22 4512 4
## [271] 1 0 1 3 2 0 0 3 7 7 0 3 3 0 0
## [286] 3 2 1 6 9 6 1 2 8 2 2 6 7 13 22
## [301] 4512 4 1 0 1 3 2 0 0 3 7 7 0 3 3
## [316] 0 0 3 2 1 6 9 6 1 2 8 2 2 6 7
## [331] 13 22 4512 4 1 0 1 3 2 0 0 3 7 7 0
## [346] 3 3 0 0 3 2 1 6 9 6 1 2 8 2 2
## [361] 6 7 13 22 4512 4 1 0 1 3 2 0 0 3 7
## [376] 7 0 3 3 0 0 3 2 1 6 9 6 1 2 8
## [391] 2 2 6 7 13 22 4512 4 1 0 1 3 2 0 0
## [406] 3 7 7 0 3 3 0 0 3 2 1 6 9 6 1
## [421] 2 8 2 2 6 7 13 22 4512 4 1 0 1 3 2
## [436] 0 0 3 7 7 0 3 3 0 0 3 2 1 6 9
## [451] 6 1 2 8 2 2 6 7 13 22 4512 4 1 0 1
## [466] 3 2 0 0 3 7 7 0 3 3 0 0 3 2 1
# 4.l
# Daily new confirmed cases in Vietnam for May 2020 and May 2021.
#
# New cases are the day-over-day difference of the per-date Confirmed totals.
# The difference is taken over the WHOLE Vietnam series and only then
# restricted to the two May months. Differencing the May rows alone would
# make the 2021-05-01 value the jump from 2020-05-31 (a year of cases) and
# would always leave 2020-05-01 as NA.
mayMonths <- c("2020-05", "2021-05")

# Raw Vietnam rows observed in May 2020 or May 2021. ObservationDate is
# already a Date (parsed at load), so no re-parsing is needed.
vietnamData <- coronaVietnam[
  format(coronaVietnam$ObservationDate, "%Y-%m") %in% mayMonths,
]

# Per-date Confirmed totals for every Vietnam observation date; tapply()
# returns them ordered by date, with the ISO date strings as names.
vietnamDailyTotals <- tapply(
  coronaVietnam$Confirmed,
  coronaVietnam$ObservationDate,
  sum
)
vietnamDailyDates <- as.Date(as.character(names(vietnamDailyTotals)))
# The first date in the series has no predecessor, hence the leading NA;
# the index keeps the lengths aligned even for an empty series.
vietnamDailyDiff <- c(NA, diff(as.vector(vietnamDailyTotals)))[
  seq_along(vietnamDailyTotals)
]

isMayDay <- format(vietnamDailyDates, "%Y-%m") %in% mayMonths

# Per-date totals restricted to May 2020 / May 2021
vietnamSummarizedData <- data.frame(
  ObservationDate = vietnamDailyDates[isMayDay],
  confirmed = as.vector(vietnamDailyTotals)[isMayDay]
)
# Per-date new cases restricted to May 2020 / May 2021
vietnamDiffData <- data.frame(
  ObservationDate = vietnamDailyDates[isMayDay],
  diff = vietnamDailyDiff[isMayDay]
)
vietnamMayData <- vietnamDiffData[
  format(vietnamDiffData$ObservationDate, "%m") == "05",
]
#print(vietnamMayData$diff)
# Vietnam rows for May 2020 and for May 2021
coronaMay2020 <- coronaVietnam[
  format(coronaVietnam[["ObservationDate"]], "%m/%Y") == "05/2020",
]
coronaMay2021 <- coronaVietnam[
  format(coronaVietnam[["ObservationDate"]], "%m/%Y") == "05/2021",
]
# Dark-blue background with white labels, axes, title and drawing colour;
# the settings stay in effect for every plot that follows.
par(
  bg = "darkblue",
  col = "white",
  col.axis = "white",
  col.lab = "white",
  col.main = "white"
)
# Confirmed values over May 2020
plot(
  x = coronaMay2020[["ObservationDate"]],
  y = coronaMay2020[["Confirmed"]],
  type = "b",
  main = "Confirmed Cases in May 2020 (Vietnam)",
  xlab = "Date",
  ylab = "Confirmed"
)
grid(lty = "dotted")

# Confirmed values over May 2021
plot(
  x = coronaMay2021[["ObservationDate"]],
  y = coronaMay2021[["Confirmed"]],
  type = "b",
  main = "Confirmed Cases in May 2021 (Vietnam)",
  xlab = "Date",
  ylab = "Confirmed"
)
grid(lty = "dotted")

# New confirmed cases per day, May 2020
vietnamDiffData2020 <- vietnamDiffData[
  format(vietnamDiffData[["ObservationDate"]], "%m/%Y") == "05/2020",
]
plot(
  x = vietnamDiffData2020[["ObservationDate"]],
  y = vietnamDiffData2020[["diff"]],
  type = "o",
  main = "New confirmed cases in May 2020 (Vietnam)",
  xlab = "Date",
  ylab = "New cases"
)
grid(lty = "dotted")

# New confirmed cases per day, May 2021
vietnamDiffData2021 <- vietnamDiffData[
  format(vietnamDiffData[["ObservationDate"]], "%m/%Y") == "05/2021",
]
plot(
  x = vietnamDiffData2021[["ObservationDate"]],
  y = vietnamDiffData2021[["diff"]],
  type = "o",
  main = "New confirmed cases in May 2021 (Vietnam)",
  xlab = "Date",
  ylab = "New cases",
  col = "white"
)
grid(lty = "dotted")
