library(tidyverse)
library(car)

輸入資料

DATE <- c("324", "325", "326", "327", "328", "329", "330", "331", "401", "402", "403", "404", "405", "406", "407", "408", "409", "410", "411", "412", "413", "414", "415", "416", "417", "418", "419")
Domestic <- c(15, 14, 21, 83, 34, 33, 56, 87, 104, 160, 183, 133, 216, 281, 382, 384, 442, 431, 439, 551, 744, 874, 1209, 1199, 1210, 1390, 1626) 
Oversea <- c(124, 122, 82, 120, 93, 63, 107, 152, 132, 244, 97, 142, 65, 78, 149, 123, 136, 144, 191, 112, 189, 108, 75, 152, 93, 90, 101)
Weekday <- c("Y", "Y", "N", "N", "Y", "Y", "Y", "Y", "Y", "N", "N", "Y", "Y", "Y", "Y", "Y", "N", "N", "Y", "Y", "Y", "Y", "Y", "N", "N", "Y", "Y")

建立一個data名為cvdta

cvdta <- data.frame(DATE, Domestic, Oversea, Weekday)

檢視前六筆資料

head(cvdta)
##   DATE Domestic Oversea Weekday
## 1  324       15     124       Y
## 2  325       14     122       Y
## 3  326       21      82       N
## 4  327       83     120       N
## 5  328       34      93       Y
## 6  329       33      63       Y

檢視資料結構

str(cvdta)
## 'data.frame':    27 obs. of  4 variables:
##  $ DATE    : chr  "324" "325" "326" "327" ...
##  $ Domestic: num  15 14 21 83 34 33 56 87 104 160 ...
##  $ Oversea : num  124 122 82 120 93 63 107 152 132 244 ...
##  $ Weekday : chr  "Y" "Y" "N" "N" ...

將DATE轉換成類別變項

cvdta$DATE <- as.factor(cvdta$DATE)

將WEEKDAY轉換成一個類別變項

cvdta$Weekday <- as.factor(cvdta$Weekday)

畫出Oversea和Domestic在不同Weekday水準下的相關

ggplot(aes(y = Domestic, x = Oversea, color = Weekday), data = cvdta) +
  geom_point() +
  geom_smooth(method = lm, se = F) + 
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'

可看出Oversea與Domestic的斜率有差異

因此提出假設:
H0:Oversea與Domestic的確診數有相關。 H1:Oversea與Domestic的確診數沒有相關。

多元迴歸看Oversea與Weekday與國內確診的數值和相關

cvmod <- lm(Domestic ~ Oversea + Weekday, data = cvdta)
summary(cvmod)
## 
## Call:
## lm(formula = Domestic ~ Oversea + Weekday, data = cvdta)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -506.0 -364.0 -175.5  236.4 1156.4 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  623.923    367.071   1.700    0.102
## Oversea       -1.182      2.410  -0.490    0.628
## WeekdayY     -34.906    214.547  -0.163    0.872
## 
## Residual standard error: 499.8 on 24 degrees of freedom
## Multiple R-squared:  0.01013,    Adjusted R-squared:  -0.07236 
## F-statistic: 0.1228 on 2 and 24 DF,  p-value: 0.885

得出公式 : Domestic = 623.923- 1.182 Oversea- 34.906 Weekday+ 499.8

本土案例的平均值為截距623.92,然而t值1.700小於1.96且p值大於0.05,無法推翻虛無假設,表示平均值未顯著大於零。

境外移入案例的平均值為截距-1.182,然而t值-0.490的絕對值小於1.96且p值大於0.05,無法推翻虛無假設,表示境外移入案例與本土案例的關係未達顯著水準。

Weekday是類別變項為參照組。相較週末的本土案例,週間本土案例的平均數低34.906。然而t值-0.163絕對值小於1.96且p值大於0.05,表示兩者的平均數未有顯著差異,對本土案例亦未有顯著影響力(或預測力)。因此兩個自變項(海外案例和工作日)顯然都不是很理想的預測變項(predictors)。

此迴歸模式的R-squared為0.0101,表示本土案例的變異量僅有1.01%可被解釋。

#對迴歸進行診斷,藉找出最小的殘差進而找到最佳的迴歸線

畫直方圖檢視殘差是否常態分配

hist( x = residuals(cvmod),
      xlab = "Value of residual",
      main = "",
      breaks = 20)

畫Normal Q-Q圖再次檢視殘差是否呈常態分布

plot(cvmod, which = 2)

由上面兩個圖可知殘差非常態分佈,且平均數不為0,表示預測值-觀察值的誤差不集中在0,因為點皆不落在線上,尤其24、26、27的點為outlier。

畫圖看預測值與殘差間關係

plot(cvmod, which  = 1)

當紅線未呈筆直的水平線,表示尚有變異未被模型所解釋,或有更理想的自變項未被納入模型且點分布呈離散代表預測值與殘差間無關係,意即殘差不會隨預測值改變。

畫出個別自變項與殘差間關係

residualPlots(cvmod)

##            Test stat Pr(>|Test stat|)
## Oversea      -0.1894           0.8514
## Weekday                              
## Tukey test   -0.3492           0.7269

畫出預測值變異數與殘差變異數間關係

plot(cvmod, which = 3)