https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge
coronavirus_confirmed_cases <- read.csv("time_series_covid_19_confirmed.csv")
str(coronavirus_confirmed_cases)
## 'data.frame': 258 obs. of 76 variables:
## $ Province.State: chr "" "" "" "" ...
## $ Country.Region: chr "Afghanistan" "Albania" "Algeria" "Andorra" ...
## $ Lat : num 33 41.2 28 42.5 -11.2 ...
## $ Long : num 65 20.17 1.66 1.52 17.87 ...
## $ X1.22.20 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X1.23.20 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X1.24.20 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X1.25.20 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ X1.26.20 : int 0 0 0 0 0 0 0 0 0 3 ...
## $ X1.27.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X1.28.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X1.29.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X1.30.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X1.31.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.1.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.2.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.3.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.4.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.5.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.6.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.7.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.8.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.9.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.10.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.11.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.12.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.13.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.14.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.15.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.16.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.17.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.18.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.19.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.20.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.21.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.22.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.23.20 : int 0 0 0 0 0 0 0 0 0 4 ...
## $ X2.24.20 : int 1 0 0 0 0 0 0 0 0 4 ...
## $ X2.25.20 : int 1 0 1 0 0 0 0 0 0 4 ...
## $ X2.26.20 : int 1 0 1 0 0 0 0 0 0 4 ...
## $ X2.27.20 : int 1 0 1 0 0 0 0 0 0 4 ...
## $ X2.28.20 : int 1 0 1 0 0 0 0 0 0 4 ...
## $ X2.29.20 : int 1 0 1 0 0 0 0 0 0 4 ...
## $ X3.1.20 : int 1 0 1 0 0 0 0 1 0 6 ...
## $ X3.2.20 : int 1 0 3 1 0 0 0 1 0 6 ...
## $ X3.3.20 : int 1 0 5 1 0 0 1 1 0 13 ...
## $ X3.4.20 : int 1 0 12 1 0 0 1 1 0 22 ...
## $ X3.5.20 : int 1 0 12 1 0 0 1 1 0 22 ...
## $ X3.6.20 : int 1 0 17 1 0 0 2 1 0 26 ...
## $ X3.7.20 : int 1 0 17 1 0 0 8 1 0 28 ...
## $ X3.8.20 : int 4 0 19 1 0 0 12 1 0 38 ...
## $ X3.9.20 : int 4 2 20 1 0 0 12 1 0 48 ...
## $ X3.10.20 : int 5 10 20 1 0 0 17 1 0 55 ...
## $ X3.11.20 : int 7 12 20 1 0 0 19 1 0 65 ...
## $ X3.12.20 : int 7 23 24 1 0 0 19 4 0 65 ...
## $ X3.13.20 : int 7 33 26 1 0 1 31 8 1 92 ...
## $ X3.14.20 : int 11 38 37 1 0 1 34 18 1 112 ...
## $ X3.15.20 : int 16 42 48 1 0 1 45 26 1 134 ...
## $ X3.16.20 : int 21 51 54 2 0 1 56 52 2 171 ...
## $ X3.17.20 : int 22 55 60 39 0 1 68 78 2 210 ...
## $ X3.18.20 : int 22 59 74 39 0 1 79 84 3 267 ...
## $ X3.19.20 : int 22 64 87 53 0 1 97 115 4 307 ...
## $ X3.20.20 : int 24 70 90 75 1 1 128 136 6 353 ...
## $ X3.21.20 : int 24 76 139 88 2 1 158 160 9 436 ...
## $ X3.22.20 : int 40 89 201 113 2 1 266 194 19 669 ...
## $ X3.23.20 : int 40 104 230 133 3 3 301 235 32 669 ...
## $ X3.24.20 : int 74 123 264 164 3 3 387 249 39 818 ...
## $ X3.25.20 : int 84 146 302 188 3 3 387 265 39 1029 ...
## $ X3.26.20 : int 94 174 367 224 4 7 502 290 53 1219 ...
## $ X3.27.20 : int 110 186 409 267 4 7 589 329 62 1405 ...
## $ X3.28.20 : int 110 197 454 308 5 7 690 407 71 1617 ...
## $ X3.29.20 : int 120 212 511 334 7 7 745 424 77 1791 ...
## $ X3.30.20 : int 170 223 584 370 7 7 820 482 78 2032 ...
## $ X3.31.20 : int 174 243 716 376 7 7 1054 532 80 2032 ...
## $ X4.1.20 : int 237 259 847 390 8 7 1054 571 84 2182 ...
## $ X4.2.20 : int 273 277 986 428 8 9 1133 663 87 2298 ...
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.1 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
japan <- coronavirus_confirmed_cases %>% filter(Country.Region=="Japan") %>%
select(-c(1:4)) %>%
gather("date","cases",1:72)
japan$date <- str_sub(japan$date, 2) %>% mdy()
glimpse(japan)
## Rows: 72
## Columns: 2
## $ date <date> 2020-01-22, 2020-01-23, 2020-01-24, 2020-01-25, 2020-01-26, 202…
## $ cases <int> 2, 2, 2, 2, 4, 4, 7, 7, 11, 15, 20, 20, 20, 22, 22, 22, 25, 25, …
japan %>% ggplot(aes(x=date, y=cases)) +
geom_point() +
labs(title = "Japan Coronavirus Known Cases",
x = "",
y = "") +
theme_minimal()
japan %>% ggplot(aes(x=date, y=cases)) +
geom_line() +
scale_y_log10() +
labs(title = "Japan Coronavirus Cases",
subtitle = "Log Scale Base 10",
x = "",
y = "") +
theme_minimal()
model <- lm(log(cases)~date, data = japan)
summary(model)
##
## Call:
## lm(formula = log(cases) ~ date, data = japan)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1067 -0.2789 0.1624 0.2921 0.5299
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.738e+03 3.979e+01 -43.68 <2e-16 ***
## date 9.514e-02 2.172e-03 43.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.383 on 70 degrees of freedom
## Multiple R-squared: 0.9648, Adjusted R-squared: 0.9643
## F-statistic: 1919 on 1 and 70 DF, p-value: < 2.2e-16
test <- data.frame(date = seq(ymd('2020-02-01'),ymd('2020-07-01'),by='day'))
test$preds <- exp(predict(model, newdata=test))
test %>% ggplot(aes(x=date, y=preds)) +
geom_line() +
scale_y_log10() +
labs(title = "Coronavirus predictions together with known cases (red)",
x="",
y="Log Scale 10") +
geom_point(aes(x=date,y=cases), data=japan, col="red")