library(tidyverse)
library(car)
# Observation dates kept as character codes; presumably month-day
# (e.g. "324" = March 24) -- TODO confirm. Two consecutive runs: 324-331
# and 401-412.
DATE <- as.character(c(324:331, 401:412))

# Confirmed case counts, one value per entry in DATE.
Domestic <- c(
  15, 14, 21, 83, 34, 33, 56, 87, 104, 160,
  183, 133, 216, 281, 382, 384, 442, 431, 439, 551
)

# Daily minimum temperature in Taipei (Taiwan's capital), one value per entry
# in DATE. The minimum was chosen because epidemiologists have said the virus
# survives more easily at autumn/winter temperatures.
Temperature <- c(
  15, 19, 17, 16, 15, 18, 21, 17, 15, 13,
  13, 17, 19, 18, 20, 20, 20, 21, 23, 22
)
建立一個資料名為cvdta
# Assemble the three vectors into the analysis data frame `cvdta`.
# Column names are given explicitly instead of being derived from the
# argument expressions; the resulting names are the same.
cvdta <- data.frame(
  DATE = DATE,
  Domestic = Domestic,
  Temperature = Temperature
)
將DATE命名成一個類別變項
# Convert the DATE column to a factor so it is treated as a categorical
# variable rather than plain character data.
cvdta[["DATE"]] <- as.factor(cvdta[["DATE"]])
用scatterplot圖檢視Domestic可不可以被Temperature預測及其關係,還有Domestic、Temperature分別的變異量
# Scatterplot (car::scatterplot) of Domestic against Temperature, used to
# inspect whether Temperature can predict Domestic and how the two relate.
scatterplot(
  Domestic ~ Temperature,
  data = cvdta,
  # Written as FALSE, not F: `F` is an ordinary variable that can be
  # reassigned, whereas FALSE is a reserved word.
  smooth = FALSE
)
# The plot shows a positive relationship between the two variables.
用ggplot圖繼續檢視兩者的線性關係
# Scatter of Domestic against Temperature with a fitted linear regression
# line, drawn with ggplot2 to inspect the linear relationship further.
ggplot(data = cvdta, mapping = aes(x = Temperature, y = Domestic)) +
  geom_point() +
  # method is passed as the string "lm" (documented form) and se as TRUE
  # rather than the reassignable alias `T`; se = TRUE draws the shaded
  # confidence band around the fitted line.
  geom_smooth(method = "lm", se = TRUE) +
  theme_bw()
此圖中的迴歸線是根據最小平方法繪製而成,灰色範圍為迴歸線的信賴區間(預設95%);灰色範圍越小,表示標準誤越小、估計越精確,也就越接近母體
將資料丟進簡單迴歸中
# Fit a simple linear regression of Domestic on Temperature using the rows
# of `cvdta`, and keep the fitted model in `vmod`.
vmod <- lm(
  formula = Domestic ~ Temperature,
  data = cvdta
)

# Print the coefficient table, residual summary, R-squared and F-test.
summary(vmod)
##
## Call:
## lm(formula = Domestic ~ Temperature, data = cvdta)
##
## Residuals:
## Min 1Q Median 3Q Max
## -265.524 -73.800 -5.361 103.506 190.435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -498.33 200.99 -2.479 0.0233 *
## Temperature 39.04 11.06 3.529 0.0024 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 138.6 on 18 degrees of freedom
## Multiple R-squared: 0.4089, Adjusted R-squared: 0.3761
## F-statistic: 12.45 on 1 and 18 DF, p-value: 0.002398
得出公式 : Domestic=39.04*Temperature-498.33
###結論
因為t value=3.529 > 2.101(自由度18、α=0.05的雙尾臨界值),顯著大於0 ;p-value 0.0024 <0.05,所以可以拒絕虛無假設。
因為F-statistic=12.45 > 1;p-value=0.002398 < 0.05,所以結果顯著
調整後決定係數(Adjusted R-squared)為0.3761(未調整的決定係數Multiple R-squared為0.4089),表示依變項Domestic確診數可被自變項Temperature預測的變異量百分比有37.61%,高於20%,表示預測力佳