# 4.a
# Load covid 19 data from `/data/covid_19_data.csv`, format as needed
library(readr)
# Read the raw CSV. ObservationDate is parsed from month/day/year text into a
# Date; every other column keeps the type readr guesses for it.
coronaData <- read_csv(
  file = "data/covid_19_data.csv",
  col_types = cols(
    ObservationDate = col_date(format = "%m/%d/%Y")
  )
)
# Can't read `Confirmed` as an integer column? The attempt returned NA.
# View(coronaData)

# Add an ID column built as "<Country/Region>.<Province/State>".
# paste() writes a missing province as the literal text "NA".
coronaData[["ID"]] <- paste(
  coronaData[["Country/Region"]],
  coronaData[["Province/State"]],
  sep = "."
)
# Open question: does the code break if the separator is '_'?

# Put each col into variables according to col's name, for easy access
# for(col in colnames(coronaData)) { assign(col, coronaData[[col]])}
# View(`Country/Region`)
# Variable cannot be used with View(), try frame
# For every column except the last one (ID), create a two-column data frame
# (ID + that column) and bind it to a variable named after the column.
for (col in head(names(coronaData), -1)) {
  tmpFrame <- data.frame(coronaData[["ID"]], coronaData[[col]])
  # Set both names afterwards so names such as "Country/Region" stay verbatim.
  colnames(tmpFrame) <- c("ID", col)
  assign(col, tmpFrame)
}
# Use unique(ID) to compress frame?

# New created dataframes have no link?
# Recompute the ID column from Country/Region and Province/State, joined by ".".
coronaData$ID <- with(
  coronaData,
  paste(`Country/Region`, `Province/State`, sep = ".")
)
# Open question: does the code break if the separator is '_'?

# Number of columns in coronaData
dim(coronaData)[2]
## [1] 9
# Number of rows in coronaData
dim(coronaData)[1]
## [1] 306429
# First 10 rows of coronaData
head(coronaData, n = 10)
## # A tibble: 10 × 9
##      SNo ObservationDate Province…¹ Count…² Last …³ Confi…⁴ Deaths Recov…⁵ ID   
##    <dbl> <date>          <chr>      <chr>   <chr>     <dbl>  <dbl>   <dbl> <chr>
##  1     1 2020-01-22      Anhui      Mainla… 1/22/2…       1      0       0 Main…
##  2     2 2020-01-22      Beijing    Mainla… 1/22/2…      14      0       0 Main…
##  3     3 2020-01-22      Chongqing  Mainla… 1/22/2…       6      0       0 Main…
##  4     4 2020-01-22      Fujian     Mainla… 1/22/2…       1      0       0 Main…
##  5     5 2020-01-22      Gansu      Mainla… 1/22/2…       0      0       0 Main…
##  6     6 2020-01-22      Guangdong  Mainla… 1/22/2…      26      0       0 Main…
##  7     7 2020-01-22      Guangxi    Mainla… 1/22/2…       2      0       0 Main…
##  8     8 2020-01-22      Guizhou    Mainla… 1/22/2…       1      0       0 Main…
##  9     9 2020-01-22      Hainan     Mainla… 1/22/2…       4      0       0 Main…
## 10    10 2020-01-22      Hebei      Mainla… 1/22/2…       1      0       0 Main…
## # … with abbreviated variable names ¹​`Province/State`, ²​`Country/Region`,
## #   ³​`Last Update`, ⁴​Confirmed, ⁵​Recovered
# Column names of coronaData
colnames(coronaData)
## [1] "SNo"             "ObservationDate" "Province/State"  "Country/Region" 
## [5] "Last Update"     "Confirmed"       "Deaths"          "Recovered"      
## [9] "ID"
# Get country from coronaData
#print(`Country/Region`)

# Highest single Confirmed value among all rows of coronaData.
# The value is read straight from coronaData instead of the `Confirmed` data
# frame that the assign() loop creates as a side effect, so this step does not
# depend on those generated variables. It is computed once and reused for the
# printout. na.rm = TRUE keeps a missing count from turning the maximum into NA.
maxConfirmedCases <- max(coronaData$Confirmed, na.rm = TRUE)
print(maxConfirmedCases)
## [1] 5863138

# Rows whose Country/Region is exactly "Mainland China"
coronaChina <- coronaData[
  which(coronaData[["Country/Region"]] == "Mainland China"),
]

# put coronaData into variable according to country
# for(countryName in `Country/Region`) { assign(paste(countryName, "data"), coronaData[which(coronaData$`Country/Region` == countryName),])}
# This kills the system; find another approach.

# Country/Region of the row(s) whose Confirmed equals maxConfirmedCases
maxCountryConfirmedCorona <- coronaData[
  which(coronaData[["Confirmed"]] == maxConfirmedCases),
  "Country/Region",
  drop = FALSE
]

# Province/State of the same row(s)
maxStateConfirmedCorona <- coronaData[
  which(coronaData[["Confirmed"]] == maxConfirmedCases),
  "Province/State",
  drop = FALSE
]

# Get coronaData in Jan 2020 (observations dated 2020-01-01 through 2020-01-31).
# The bounds are explicit Dates, matching the other date filters in this
# script, instead of bare strings that are coerced during the comparison.
data_jan <- coronaData[
  which(
    coronaData$ObservationDate >= as.Date("2020-01-01") &
      coronaData$ObservationDate <= as.Date("2020-01-31")
  ),
]

# 4.b
# All rows whose Country/Region is "Vietnam"
coronaVietnam <- coronaData[
  which(coronaData[["Country/Region"]] == "Vietnam"),
]

# 4.c
# Largest Confirmed value among the Vietnam rows
print(max(coronaVietnam[["Confirmed"]]))
## [1] 6908
# 4.d
# Vietnam rows observed from 2021-01-01 through 2021-02-28 (January + February)
coronaVietnamJanFeb2021 <- coronaVietnam[
  which(
    coronaVietnam$ObservationDate >= as.Date("2021-01-01") &
      coronaVietnam$ObservationDate <= as.Date("2021-02-28")
  ),
]
# use subset ?

# 4.e
# Largest Confirmed value inside that Jan-Feb 2021 window
print(max(coronaVietnamJanFeb2021[["Confirmed"]]))
## [1] 2448
# 4.f
# Same as 4.e, repeated for Indonesia and the Philippines: largest Confirmed
# value among rows dated 2021-01-01 through 2021-02-28.
coronaIndonesia <- coronaData[
  which(coronaData[["Country/Region"]] == "Indonesia"),
]
coronaIndonesiaJanFeb2021 <- coronaIndonesia[
  which(
    coronaIndonesia$ObservationDate >= as.Date("2021-01-01") &
      coronaIndonesia$ObservationDate <= as.Date("2021-02-28")
  ),
]
print(max(coronaIndonesiaJanFeb2021[["Confirmed"]]))
## [1] 1334634
coronaPhilippines <- coronaData[
  which(coronaData[["Country/Region"]] == "Philippines"),
]
coronaPhilippinesJanFeb2021 <- coronaPhilippines[
  which(
    coronaPhilippines$ObservationDate >= as.Date("2021-01-01") &
      coronaPhilippines$ObservationDate <= as.Date("2021-02-28")
  ),
]
print(max(coronaPhilippinesJanFeb2021[["Confirmed"]]))
## [1] 576352
# 4.g 
# Mainland China rows observed from 2021-02-01 through 2021-02-15, printed
coronaChina15Feb <- coronaChina[
  which(
    coronaChina$ObservationDate >= as.Date("2021-02-01") &
      coronaChina$ObservationDate <= as.Date("2021-02-15")
  ),
]
print(coronaChina15Feb)
## # A tibble: 480 × 9
##       SNo ObservationDate Provinc…¹ Count…² Last …³ Confi…⁴ Deaths Recov…⁵ ID   
##     <dbl> <date>          <chr>     <chr>   <chr>     <dbl>  <dbl>   <dbl> <chr>
##  1 216354 2021-02-01      Anhui     Mainla… 2021-0…     994      6     987 Main…
##  2 216388 2021-02-01      Beijing   Mainla… 2021-0…    1039      9     977 Main…
##  3 216438 2021-02-01      Chongqing Mainla… 2021-0…     591      6     584 Main…
##  4 216480 2021-02-01      Fujian    Mainla… 2021-0…     541      1     525 Main…
##  5 216485 2021-02-01      Gansu     Mainla… 2021-0…     187      2     181 Main…
##  6 216503 2021-02-01      Guangdong Mainla… 2021-0…    2127      8    2077 Main…
##  7 216504 2021-02-01      Guangxi   Mainla… 2021-0…     267      2     263 Main…
##  8 216507 2021-02-01      Guizhou   Mainla… 2021-0…     147      2     145 Main…
##  9 216510 2021-02-01      Hainan    Mainla… 2021-0…     171      6     165 Main…
## 10 216516 2021-02-01      Hebei     Mainla… 2021-0…    1313      7     976 Main…
## # … with 470 more rows, and abbreviated variable names ¹​`Province/State`,
## #   ²​`Country/Region`, ³​`Last Update`, ⁴​Confirmed, ⁵​Recovered
# 4.h
# Mainland China rows observed from 2021-02-01 through 2021-02-28; the
# per-province statistics below are computed from this subset.
coronaChinaFeb <- coronaChina[
  which(
    coronaChina$ObservationDate >= as.Date("2021-02-01") &
      coronaChina$ObservationDate <= as.Date("2021-02-28")
  ),
]
library(gridExtra)
library(grid)
# Number of February 2021 rows per Province/State
table(coronaChinaFeb[["Province/State"]])
## 
##          Anhui        Beijing      Chongqing         Fujian          Gansu 
##             28             28             28             28             28 
##      Guangdong        Guangxi        Guizhou         Hainan          Hebei 
##             28             28             28             28             28 
##   Heilongjiang          Henan          Hubei          Hunan Inner Mongolia 
##             28             28             28             28             28 
##        Jiangsu        Jiangxi          Jilin       Liaoning        Ningxia 
##             28             28             28             28             28 
##        Qinghai        Shaanxi       Shandong         Shanxi       Shanghai 
##             28             28             28             28             28 
##        Sichuan        Tianjin          Tibet        Unknown       Xinjiang 
##             28             28             28             28             28 
##         Yunnan       Zhejiang 
##             28             28
# another way to get coronaChina in feb 2021
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
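# (year() and month() come from lubridate; the column name needs backticks,
# and the country is stored as "Mainland China".)
# coronaChinaFeb = filter(coronaData, `Country/Region` == "Mainland China", year(ObservationDate) == 2021, month(ObservationDate) == 2)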

# 4.i
# Change in Confirmed per province between the earliest and the latest
# observation in coronaChinaFeb.
# The two values are picked by ObservationDate (which.min / which.max) rather
# than first() / last(), which only give the right rows when the data already
# happens to be in date order within each province.
chinaGroupedData <- group_by(coronaChinaFeb, `Province/State`)
chinaLastDayData <- summarise(
  chinaGroupedData,
  # `confirmed` is computed before ObservationDate is overwritten below.
  confirmed = Confirmed[which.max(ObservationDate)],
  ObservationDate = max(ObservationDate)
)
chinaFirstDayData <- summarise(
  chinaGroupedData,
  confirmed = Confirmed[which.min(ObservationDate)],
  ObservationDate = min(ObservationDate)
)
# After the join, the .x columns come from the last day and .y from the first.
chinaCombinedData <- inner_join(
  chinaLastDayData,
  chinaFirstDayData,
  by = "Province/State"
)
chinaCombinedData$confirmedDiff <-
  chinaCombinedData$confirmed.x - chinaCombinedData$confirmed.y
# Cross-tabulation of province against its difference value
table(chinaCombinedData$`Province/State`, chinaCombinedData$confirmedDiff)
##                 
##                  0 1 2 3 4 5 7 10 11 12 18 19 29 85 91
##   Anhui          1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Beijing        0 0 0 0 0 0 0  1  0  0  0  0  0  0  0
##   Chongqing      1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Fujian         0 0 0 0 0 0 0  1  0  0  0  0  0  0  0
##   Gansu          1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Guangdong      0 0 0 0 0 0 0  0  0  0  0  0  0  1  0
##   Guangxi        1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Guizhou        1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Hainan         1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Hebei          0 0 0 0 1 0 0  0  0  0  0  0  0  0  0
##   Heilongjiang   0 0 0 0 0 0 0  0  0  0  0  1  0  0  0
##   Henan          0 1 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Hubei          0 1 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Hunan          0 0 0 0 1 0 0  0  0  0  0  0  0  0  0
##   Inner Mongolia 0 1 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Jiangsu        0 0 0 0 0 0 1  0  0  0  0  0  0  0  0
##   Jiangxi        1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Jilin          0 0 0 0 0 0 0  0  0  0  1  0  0  0  0
##   Liaoning       0 0 0 0 1 0 0  0  0  0  0  0  0  0  0
##   Ningxia        1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Qinghai        1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Shaanxi        0 0 0 0 0 0 0  0  1  0  0  0  0  0  0
##   Shandong       0 0 0 1 0 0 0  0  0  0  0  0  0  0  0
##   Shanxi         0 1 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Shanghai       0 0 0 0 0 0 0  0  0  0  0  0  0  0  1
##   Sichuan        0 0 0 0 0 0 0  0  0  0  0  0  1  0  0
##   Tianjin        0 0 0 0 0 0 0  0  0  1  0  0  0  0  0
##   Tibet          1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Unknown        1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Xinjiang       1 0 0 0 0 0 0  0  0  0  0  0  0  0  0
##   Yunnan         0 0 1 0 0 0 0  0  0  0  0  0  0  0  0
##   Zhejiang       0 0 0 0 0 1 0  0  0  0  0  0  0  0  0
# Two-column summary: each province and its confirmedDiff value
chinaDiffProvinceFeb2021 <- with(
  chinaCombinedData,
  data.frame(Province = `Province/State`, Diff = confirmedDiff)
)
# table(chinaDiffProvinceFeb2021)
#print(chinaDiffProvinceFeb2021)

# 4.k
# Deaths column of the Mainland China rows dated 2021-02-01 through 2021-02-15
print(coronaChina15Feb[["Deaths"]])
##   [1]    6    9    6    1    2    8    2    2    6    7   13   22 4512    4    1
##  [16]    0    1    3    2    0    0    3    7    7    0    3    3    0    0    3
##  [31]    2    1    6    9    6    1    2    8    2    2    6    7   13   22 4512
##  [46]    4    1    0    1    3    2    0    0    3    7    7    0    3    3    0
##  [61]    0    3    2    1    6    9    6    1    2    8    2    2    6    7   13
##  [76]   22 4512    4    1    0    1    3    2    0    0    3    7    7    0    3
##  [91]    3    0    0    3    2    1    6    9    6    1    2    8    2    2    6
## [106]    7   13   22 4512    4    1    0    1    3    2    0    0    3    7    7
## [121]    0    3    3    0    0    3    2    1    6    9    6    1    2    8    2
## [136]    2    6    7   13   22 4512    4    1    0    1    3    2    0    0    3
## [151]    7    7    0    3    3    0    0    3    2    1    6    9    6    1    2
## [166]    8    2    2    6    7   13   22 4512    4    1    0    1    3    2    0
## [181]    0    3    7    7    0    3    3    0    0    3    2    1    6    9    6
## [196]    1    2    8    2    2    6    7   13   22 4512    4    1    0    1    3
## [211]    2    0    0    3    7    7    0    3    3    0    0    3    2    1    6
## [226]    9    6    1    2    8    2    2    6    7   13   22 4512    4    1    0
## [241]    1    3    2    0    0    3    7    7    0    3    3    0    0    3    2
## [256]    1    6    9    6    1    2    8    2    2    6    7   13   22 4512    4
## [271]    1    0    1    3    2    0    0    3    7    7    0    3    3    0    0
## [286]    3    2    1    6    9    6    1    2    8    2    2    6    7   13   22
## [301] 4512    4    1    0    1    3    2    0    0    3    7    7    0    3    3
## [316]    0    0    3    2    1    6    9    6    1    2    8    2    2    6    7
## [331]   13   22 4512    4    1    0    1    3    2    0    0    3    7    7    0
## [346]    3    3    0    0    3    2    1    6    9    6    1    2    8    2    2
## [361]    6    7   13   22 4512    4    1    0    1    3    2    0    0    3    7
## [376]    7    0    3    3    0    0    3    2    1    6    9    6    1    2    8
## [391]    2    2    6    7   13   22 4512    4    1    0    1    3    2    0    0
## [406]    3    7    7    0    3    3    0    0    3    2    1    6    9    6    1
## [421]    2    8    2    2    6    7   13   22 4512    4    1    0    1    3    2
## [436]    0    0    3    7    7    0    3    3    0    0    3    2    1    6    9
## [451]    6    1    2    8    2    2    6    7   13   22 4512    4    1    0    1
## [466]    3    2    0    0    3    7    7    0    3    3    0    0    3    2    1
# 4.l
# New confirmed cases per day in Vietnam for May 2020 and May 2021.
# New cases are derived as the day-over-day difference of the daily Confirmed
# total, as in the original analysis.
#
# The difference is taken over the WHOLE Vietnam series and only then cut down
# to the two May months. Differencing the two months glued together made the
# 2021-05-01 value equal to the jump from 2020-05-31 and left 2020-05-01 as NA.
# NOTE: diff() compares consecutive rows, so a gap in the dates would still
# fold several days into a single value.
targetMonths <- c("2020-05", "2021-05")

# Raw Vietnam rows of the two May months. ObservationDate is already a Date,
# so it is not re-parsed.
vietnamData <- coronaVietnam[
  format(coronaVietnam$ObservationDate, "%Y-%m") %in% targetMonths,
]

# One row per date with the summed Confirmed value, sorted by date.
vietnamDailyTotals <- aggregate(
  Confirmed ~ ObservationDate,
  data = as.data.frame(coronaVietnam),
  FUN = sum
)
# The first day of the whole series has no predecessor, hence the leading NA.
vietnamDailyTotals$diff <- c(NA, diff(vietnamDailyTotals$Confirmed))

inTargetMonths <-
  format(vietnamDailyTotals$ObservationDate, "%Y-%m") %in% targetMonths

# Daily totals restricted to May 2020 / May 2021
vietnamSummarizedData <- data.frame(
  ObservationDate = vietnamDailyTotals$ObservationDate[inTargetMonths],
  confirmed = vietnamDailyTotals$Confirmed[inTargetMonths]
)
# Daily new cases restricted to May 2020 / May 2021
vietnamDiffData <- data.frame(
  ObservationDate = vietnamDailyTotals$ObservationDate[inTargetMonths],
  diff = vietnamDailyTotals$diff[inTargetMonths]
)
vietnamMayData <- vietnamDiffData[
  format(vietnamDiffData$ObservationDate, "%m") == "05",
]
#print(vietnamMayData$diff)
# Vietnam rows for May 2020 and, separately, for May 2021
coronaMay2020 <- coronaVietnam[
  format(coronaVietnam[["ObservationDate"]], "%m/%Y") == "05/2020",
]
coronaMay2021 <- coronaVietnam[
  format(coronaVietnam[["ObservationDate"]], "%m/%Y") == "05/2021",
]
# Graphics defaults for the plots that follow: dark blue background with white
# drawing colour, axis annotation, axis labels and titles.
par(
  bg = "darkblue",
  col = "white",
  col.axis = "white",
  col.lab = "white",
  col.main = "white"
)

# Confirmed counts for Vietnam, May 2020
plot(
  x = coronaMay2020$ObservationDate,
  y = coronaMay2020$Confirmed,
  type = "b",
  main = "Confirmed Cases in May 2020 (Vietnam)",
  xlab = "Date",
  ylab = "Confirmed"
)
grid(lty = "dotted")

# Confirmed counts for Vietnam, May 2021
plot(
  x = coronaMay2021$ObservationDate,
  y = coronaMay2021$Confirmed,
  type = "b",
  main = "Confirmed Cases in May 2021 (Vietnam)",
  xlab = "Date",
  ylab = "Confirmed"
)
grid(lty = "dotted")

# New confirmed cases for Vietnam, May 2020 (rows of vietnamDiffData)
vietnamDiffData2020 <- vietnamDiffData[
  format(vietnamDiffData[["ObservationDate"]], "%m/%Y") == "05/2020",
]
plot(
  x = vietnamDiffData2020$ObservationDate,
  y = vietnamDiffData2020$diff,
  type = "o",
  main = "New confirmed cases in May 2020 (Vietnam)",
  xlab = "Date",
  ylab = "New cases"
)
grid(lty = "dotted")

# New confirmed cases for Vietnam, May 2021 (rows of vietnamDiffData)
vietnamDiffData2021 <- vietnamDiffData[
  format(vietnamDiffData[["ObservationDate"]], "%m/%Y") == "05/2021",
]
plot(
  x = vietnamDiffData2021$ObservationDate,
  y = vietnamDiffData2021$diff,
  type = "o",
  main = "New confirmed cases in May 2021 (Vietnam)",
  xlab = "Date",
  ylab = "New cases",
  col = "white"
)
grid(lty = "dotted")