Loading the libraries

library(dplyr)
library(lubridate)
library(ggplot2)
library(GGally)
library(corrplot)
library(plotly)
library(car)

Loading the dataset

df <- read.csv("covid3.csv")
df$date <- mdy(df$date) ### change the date type

Filling the NA’s of 14,15 and 16 of October using the median cases rate of idiotiki protovoulia

of October

df$private_cases_rate <- round(df$private_cases/df$new_cases,3) ### idiotiki protovoulia case rate
private_median_case_rate <- median(df$private_cases_rate[204:229], na.rm=T) ### median case rate for October

### calculating the cases using the October's median
df[which(is.na(df$private_cases)),]$private_cases <- round(df[which(is.na(df$private_cases)),]$new_cases * private_median_case_rate) 

df[which(is.na(df$private_cases_rate)),]$private_cases_rate[22:24]<- round(df$private_cases[217:219]/df$new_cases[217:219],3)

df[which(is.na(df$private_cases_rate)),]$private_cases_rate[1:21] <- as.numeric("0.000")
df$total_private_tests <- cumsum(df$private_test)
df$public_tests <- df$new_tests-df$private_test
df$public_cases <- df$new_cases - df$private_cases
df$pos_rate_total <- round(df$new_cases/df$new_tests , 3)
df$pos_rate_private <- round(df$private_cases/df$private_test,3)
df$pos_rate_public <- round(df$public_cases/df$public_tests,3)
df$private_perc <- round(df$private_test/df$new_tests,3)
ggplotly(ggplot(df, aes(x=date, y= total_cases,colour="Total"))+ geom_line(size=1) + ylab("Number of Cases") + xlab("Days") + ggtitle("Total cases") + geom_line(aes(x=date, y= cumsum(public_cases),colour="Public")) + theme_bw() + geom_line(aes(x=date, y= cumsum(private_cases),colour="Private")) + scale_color_discrete(name = "Test type", labels = c(Total="Red", Public="Blue", Private="Green")))
ggplot(df[18:229,], aes(x=date, y=cumsum(private_tests))) + geom_line(colour="Red", size=1) + geom_line(aes(x=date, y=cumsum(public_tests)), colour="Blue", size=1) + ylab("Number of tests") + ggtitle("Total number of tests") + theme_bw()

ggplotly(ggplot(df, aes(x=date,y=new_cases)) + geom_line(colour="Red", size=1) + ylab("Number of cases") + ggtitle("New total cases") + theme_bw())
ggplotly(ggplot(df, aes(x=date,y=private_cases,colour="Private")) + geom_line( size=1) + ylab("Number of cases") + ggtitle("New cases") + theme_bw() + geom_line(aes(x=date,y=public_cases, colour="Public"), size=1)+ scale_color_discrete(name = "Test type", labels = c(Private="Red", Public="Blue")))
ggplotly(ggplot(df, aes(x=date,y=private_tests)) + geom_line(aes(colour="Private"), size=1) + ylab("Number of cases") + ggtitle("Daily number of tests") + theme_bw() + geom_line(aes(x=date,y=new_tests, colour="Total"), size=1) + scale_color_discrete(name = "Test type", labels = c(Private="Red", Total="Blue")))
ggplotly(ggplot(df[50:229,]) + geom_line(aes(x=date, y=pos_rate_total,colour="Total"), size=1) + geom_line(aes(x=date,y=pos_rate_public,colour="Public"), size=1) + ggtitle("Positive rate per type") + geom_line(aes(x=date,y=pos_rate_private,colour="Private"), size=1) + theme_bw() + ylab("Positive rate")+ scale_color_discrete(name = "Test type", labels = c(Total="Red", Public="Blue", Private="Green")))
ggplotly(ggplot(df, aes(x=date,y=private_perc))+geom_line(size=0.7, colour="Orange")+theme_bw()+geom_smooth(span=0.1)+ylab("Percentage")+ ggtitle("Private test percentage"))
ylim.prim <- c(0,5000)
ylim.sec <- c(0,0.08)
b <- diff(ylim.prim)/diff(ylim.sec)
a <- b*(ylim.prim[1] - ylim.sec[1])
ggplot(df, aes(x=date, y=new_tests)) + geom_bar(stat="identity", colour="white", fill="Red") + geom_bar(aes(x=date,y=new_cases), stat="identity",colour="blue", fill="blue", size=1) + theme_minimal() + geom_line(aes(y= b*pos_rate_total), size=1, colour="black") + scale_y_continuous(name = "Number of cases", 
    sec.axis = sec_axis(~ (. - a)/b, name = "Positive tests"))

cor_cov <- cor(df[,2:16],use = "complete.obs")
corrplot(cor_cov,order = "hclust")

corrplot(cor_cov, method = "color",type = "upper", order = "hclust",addCoef.col = "red",diag = FALSE)

plot(df[50:229,]$pos_rate_private,df[50:229,]$pos_rate_total)

model <- lm( pos_rate_private~pos_rate_total, data=df[50:229,])
summary(model)
## 
## Call:
## lm(formula = pos_rate_private ~ pos_rate_total, data = df[50:229, 
##     ])
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.019978 -0.002391 -0.000195  0.001356  0.047609 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.0001952  0.0005611   0.348    0.728    
## pos_rate_total 1.2989253  0.0603999  21.505   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.006369 on 177 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.7232, Adjusted R-squared:  0.7217 
## F-statistic: 462.5 on 1 and 177 DF,  p-value: < 2.2e-16
symptomatic <- read.csv("symptomatic.csv")
symptomatic$date <- ymd(symptomatic$date)
ggplotly(ggplot(symptomatic, aes(x=date, y=sym_cases))+ geom_bar(stat="identity", fill="blue", colour="white") + theme_minimal() + ylab("N. of cases") + ggtitle("Cases with symptoms. Date of first day of symptoms appeared"))
merged <- merge(df, symptomatic, all.x=T, by="date")
ggplotly(ggplot(merged, aes(x=date,y=private_cases, colour="Private"))+ geom_line(size=1)+ geom_line(aes(x=date,y=sym_cases,colour="Symptomatic"), size=1)+ theme_minimal() + scale_color_discrete(name = "Test type", labels = c(Private="Red", Symptomatic="Blue"))+ ylab("N. of cases"))
model2 <- lm(new_cases~new_tests+private_cases+private_tests+sym_cases + private_cases * sym_cases, data=merged)
summary(model2)
## 
## Call:
## lm(formula = new_cases ~ new_tests + private_cases + private_tests + 
##     sym_cases + private_cases * sym_cases, data = merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.260  -5.165  -2.486   2.191  42.605 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              7.2911227  1.6501059   4.419 1.61e-05 ***
## new_tests               -0.0005625  0.0009695  -0.580  0.56242    
## private_cases            1.5596013  0.0832931  18.724  < 2e-16 ***
## private_tests           -0.0046256  0.0029573  -1.564  0.11934    
## sym_cases                0.5339078  0.0580170   9.203  < 2e-16 ***
## private_cases:sym_cases -0.0042702  0.0015114  -2.825  0.00519 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.006 on 204 degrees of freedom
##   (19 observations deleted due to missingness)
## Multiple R-squared:  0.9069, Adjusted R-squared:  0.9046 
## F-statistic: 397.4 on 5 and 204 DF,  p-value: < 2.2e-16
### VAF
predicted <- predict(model2, merged[,c(6:8,18)])
(1-var(merged$new_cases[20:227]-predicted[20:227])/var(merged$new_cases[20:227]))*100
## [1] 91.02076
### VIF 
vif1 <- vif(model2)
vif1
##               new_tests           private_cases           private_tests 
##                2.237857                5.677579                3.886717 
##               sym_cases private_cases:sym_cases 
##                1.955732                4.794124
barplot(vif1, main = "VIF Values", col = "steelblue",cex.names=0.7)

ggplot(merged, aes(x=sym_cases,y=new_cases)) + geom_point() + theme_minimal() + xlab("Symptomatic cases") + ylab("New cases")
## Warning: Removed 2 rows containing missing values (geom_point).

model_sn <- lm(new_cases~sym_cases, data=merged)
summary(model_sn)
## 
## Call:
## lm(formula = new_cases ~ sym_cases, data = merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -66.257  -6.189  -2.947   0.811 150.766 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.94712    1.64017   3.016  0.00285 ** 
## sym_cases    1.24179    0.09819  12.647  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.55 on 225 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.4155, Adjusted R-squared:  0.4129 
## F-statistic:   160 on 1 and 225 DF,  p-value: < 2.2e-16
ggplot(merged, aes(x=sym_cases, y=private_cases)) + geom_point() + theme_minimal() + xlab("Symptomatic cases") + ylab("Private cases")
## Warning: Removed 2 rows containing missing values (geom_point).

model_sp <- lm(private_cases~sym_cases, data=merged)
summary(model_sp)
## 
## Call:
## lm(formula = private_cases ~ sym_cases, data = merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.803  -3.360  -0.513   0.487 124.340 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.09326    1.12487  -0.083    0.934    
## sym_cases    0.60591    0.06734   8.998   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.78 on 225 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2646, Adjusted R-squared:  0.2614 
## F-statistic: 80.96 on 1 and 225 DF,  p-value: < 2.2e-16
ggplot(merged, aes(x=private_cases, y=new_cases)) + geom_point() + theme_minimal() + xlab("Private cases") + ylab("New cases") + geom_smooth()

model_pn <- lm(new_cases~private_cases, data=merged)
summary(model_pn)
## 
## Call:
## lm(formula = new_cases ~ private_cases, data = merged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.800  -6.843  -3.825   2.480  50.175 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.82498    0.74898   10.45   <2e-16 ***
## private_cases  1.53554    0.04178   36.75   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.86 on 226 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8567, Adjusted R-squared:  0.856 
## F-statistic:  1351 on 1 and 226 DF,  p-value: < 2.2e-16
ggplotly(ggplot(merged, aes(x=sym_cases, y=private_cases, colour="Private")) + geom_point( size=3, alpha=0.5) + theme_minimal() + xlab("Symptomatic cases") + ylab("Cases") + geom_point(aes(x=sym_cases, y=new_cases, colour="Daily"), size=3, alpha=0.5) + geom_smooth(aes(x=sym_cases, y=private_cases), fill="Blue", alpha=0.2) + geom_smooth(aes(x=sym_cases, y=new_cases), fill="Red", alpha=0.2) + scale_color_discrete(name = "Test type", labels = c(Private="Red", Daily="Blue")))
corr2 <- cor(merged[,c(7,9,18)],use = "complete.obs")
corrplot(corr2, method = "color",type = "upper", order = "hclust",addCoef.col = "red",diag = FALSE)

model3 <- lm(new_cases~new_tests + private_tests, data=merged, na.action = na.omit)
summary(model3)
## 
## Call:
## lm(formula = new_cases ~ new_tests + private_tests, data = merged, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.266 -12.212  -4.060   6.704 118.245 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2.411704   3.967254  -0.608    0.544    
## new_tests     -0.002607   0.002300  -1.134    0.258    
## private_tests  0.052331   0.005320   9.837   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.06 on 208 degrees of freedom
##   (18 observations deleted due to missingness)
## Multiple R-squared:  0.4529, Adjusted R-squared:  0.4476 
## F-statistic: 86.09 on 2 and 208 DF,  p-value: < 2.2e-16
vif(model3)
##     new_tests private_tests 
##       2.10257       2.10257
model4 <-lm(new_cases~private_tests + public_tests, data=merged, na.action = na.omit)
summary(model4)
## 
## Call:
## lm(formula = new_cases ~ private_tests + public_tests, data = merged, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.266 -12.212  -4.060   6.704 118.245 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -2.411704   3.967254  -0.608    0.544    
## private_tests  0.049724   0.003984  12.481   <2e-16 ***
## public_tests  -0.002607   0.002300  -1.134    0.258    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.06 on 208 degrees of freedom
##   (18 observations deleted due to missingness)
## Multiple R-squared:  0.4529, Adjusted R-squared:  0.4476 
## F-statistic: 86.09 on 2 and 208 DF,  p-value: < 2.2e-16
vif(model4)
## private_tests  public_tests 
##      1.179121      1.179121

```