packages = c(
"dplyr","ggplot2","d3heatmap","googleVis","devtools","plotly",
"magrittr","caTools","ROCR","corrplot",
"doParallel", "caret", "glmnet"
)
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr); library(ggplot2)x = seq(-2,2,length=200)
y = x + x^2 + 0.5 * rnorm(length(x))
new = data.frame(x = seq(-2, 2, length=200))
set.seed(5); i = sample(1:length(x),10)
par(cex=0.7,mar=c(2,2,2,2),mfrow=c(3,3))
for(k in 1:9) {
plot(x,y,col='gray')
points(x[i],y[i],pch=20,cex=2,col='blue')
fit = lm(y ~ poly(x, k), data.frame(x=x[i], y=y[i]))
lines(new$x, predict(fit, new), col='red')
text(-1.5, 5.5, k, cex=4, col='pink')
}model 1: \(y = b_0 + b_1 x + \epsilon\)
model 2: \(y = b_0 + b_1 x + b_2 x^2 + \epsilon\)
model 3: \(y = b_0 + b_1 x + b_2 x^2 + b_3 x^3 + \epsilon\)
…
model 9: \(y = b_0+b_1x+b_2x^2+b_3x^3+b_4x^4+b_5x^5+b_6x^6+b_7x^7+b_8x^8+b_9x^9+\epsilon\)
建模工具其實就是優化工具(Estimators are Optimizers),線性回歸最基本的優化方法是OLS(Ordinary Least Square),它的優化公式是:
\[ \underset{\{b_0, ... ,b_k\}}{\operatorname{argmin}} \sum_{i=1}^n (\hat{y}-y_i)^2 \]
lm()D = read.csv(unzip("Unit2.zip","Unit2/climate_change.csv"))
TR = subset(D, Year <= 2006)
TS = subset(D, Year > 2006)m1 = lm(Temp~MEI+CO2+CH4+N2O+CFC.11+CFC.12+TSI+Aerosols,TR)options(digits=4, scipen=10)
summary(m1)##
## Call:
## lm(formula = Temp ~ MEI + CO2 + CH4 + N2O + CFC.11 + CFC.12 +
## TSI + Aerosols, data = TR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2589 -0.0591 -0.0008 0.0565 0.3243
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -124.594260 19.886800 -6.27 0.0000000014310 ***
## MEI 0.064205 0.006470 9.92 < 2e-16 ***
## CO2 0.006457 0.002285 2.83 0.00505 **
## CH4 0.000124 0.000516 0.24 0.81015
## N2O -0.016528 0.008565 -1.93 0.05467 .
## CFC.11 -0.006630 0.001626 -4.08 0.0000595728765 ***
## CFC.12 0.003808 0.001014 3.76 0.00021 ***
## TSI 0.093141 0.014755 6.31 0.0000000010959 ***
## Aerosols -1.537613 0.213252 -7.21 0.0000000000054 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0917 on 275 degrees of freedom
## Multiple R-squared: 0.751, Adjusted R-squared: 0.744
## F-statistic: 104 on 8 and 275 DF, p-value: <2e-16
\(y\) 的 預測值:
簡單講,對每一個自變數而言:
嚴格來講:
幸好:
vx = c(3:5,7); cx = summary(m1)$coef
par(cex=0.8)
plot(0,0,xlim=c(-0.04,0.02),ylim=c(0,800),pch=20,
xlab="coefficients: the estimated value of b's",
ylab='probability density',
main='Probability Density Function of Coefficients')
abline(h=seq(0,800,100),v=seq(-0.04,0.03,0.0025),col='lightgray',lty=3)
for(i in vx) curve(dnorm(x,cx[i,1],cx[i,2]),add=T,col=i-1,lwd=2,n=1000)
abline(v=0,col='orange')
legend("topleft",col=vx-1,lwd=2,legend=paste0(
"b",vx," ( ",rownames(cx)[vx]," )", c("**","","。","***") ) )summary(m1)$coef[vx,]## Estimate Std. Error t value Pr(>|t|)
## CO2 0.006457 0.0022846 2.8264 0.0050525
## CH4 0.000124 0.0005158 0.2405 0.8101456
## N2O -0.016528 0.0085649 -1.9297 0.0546693
## CFC.12 0.003808 0.0010135 3.7573 0.0002097
【Q1】依據這個模型,在CO2、CH4、N2O、CFC12之中:
哪一種氣體的效果估計值最大? 它的效果是顯著的嗎?
哪一種氣體的效果最顯著? 它的效果估計值是最大的嗎?
除了係數的估計值之外,還有會影響自變數的顯著性?
【Q2】如果樣本變大的話:
係數的估計值會(a)變大、(b)變小、或(c)不變?
係數的估計標準差會(a)變大、(b)變小、或(c)不變?
自變數的\(p\)-value會(a)變大、(b)變小、或(c)不變?
自變數的顯著性會(a)變大、(b)變小、或(c)不變?
cor(TR[3:10]) %>% round(2) # 自變數之間的相關性## MEI CO2 CH4 N2O CFC.11 CFC.12 TSI Aerosols
## MEI 1.00 -0.04 -0.03 -0.05 0.07 0.01 -0.15 0.34
## CO2 -0.04 1.00 0.88 0.98 0.51 0.85 0.18 -0.36
## CH4 -0.03 0.88 1.00 0.90 0.78 0.96 0.25 -0.27
## N2O -0.05 0.98 0.90 1.00 0.52 0.87 0.20 -0.34
## CFC.11 0.07 0.51 0.78 0.52 1.00 0.87 0.27 -0.04
## CFC.12 0.01 0.85 0.96 0.87 0.87 1.00 0.26 -0.23
## TSI -0.15 0.18 0.25 0.20 0.27 0.26 1.00 0.05
## Aerosols 0.34 -0.36 -0.27 -0.34 -0.04 -0.23 0.05 1.00
library(corrplot)## corrplot 0.84 loaded
cor(TR[3:10]) %>% corrplot.mixed(tl.cex=0.7, tl.col='black')m2 = lm(Temp~MEI+N2O+TSI+Aerosols,TR) # 手動挑選自變數
summary(m2)##
## Call:
## lm(formula = Temp ~ MEI + N2O + TSI + Aerosols, data = TR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2792 -0.0598 -0.0060 0.0567 0.3419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -116.22686 20.22303 -5.75 0.00000002373584 ***
## MEI 0.06419 0.00665 9.65 < 2e-16 ***
## N2O 0.02532 0.00131 19.31 < 2e-16 ***
## TSI 0.07949 0.01488 5.34 0.00000018937322 ***
## Aerosols -1.70174 0.21800 -7.81 0.00000000000012 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0955 on 279 degrees of freedom
## Multiple R-squared: 0.726, Adjusted R-squared: 0.722
## F-statistic: 185 on 4 and 279 DF, p-value: <2e-16
m3 = step(m1) # 自動挑選自變數## Start: AIC=-1348
## Temp ~ MEI + CO2 + CH4 + N2O + CFC.11 + CFC.12 + TSI + Aerosols
##
## Df Sum of Sq RSS AIC
## - CH4 1 0.000 2.31 -1350
## <none> 2.31 -1348
## - N2O 1 0.031 2.34 -1346
## - CO2 1 0.067 2.38 -1342
## - CFC.12 1 0.119 2.43 -1336
## - CFC.11 1 0.140 2.45 -1333
## - TSI 1 0.335 2.65 -1312
## - Aerosols 1 0.437 2.75 -1301
## - MEI 1 0.828 3.14 -1263
##
## Step: AIC=-1350
## Temp ~ MEI + CO2 + N2O + CFC.11 + CFC.12 + TSI + Aerosols
##
## Df Sum of Sq RSS AIC
## <none> 2.31 -1350
## - N2O 1 0.031 2.35 -1348
## - CO2 1 0.067 2.38 -1344
## - CFC.12 1 0.130 2.44 -1337
## - CFC.11 1 0.139 2.45 -1335
## - TSI 1 0.335 2.65 -1314
## - Aerosols 1 0.440 2.75 -1303
## - MEI 1 0.831 3.14 -1265
summary(m3)##
## Call:
## lm(formula = Temp ~ MEI + CO2 + N2O + CFC.11 + CFC.12 + TSI +
## Aerosols, data = TR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2577 -0.0599 -0.0010 0.0559 0.3220
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -124.515178 19.850113 -6.27 0.0000000013651 ***
## MEI 0.064068 0.006434 9.96 < 2e-16 ***
## CO2 0.006401 0.002269 2.82 0.0051 **
## N2O -0.016021 0.008287 -1.93 0.0542 .
## CFC.11 -0.006609 0.001621 -4.08 0.0000595362634 ***
## CFC.12 0.003868 0.000981 3.94 0.0001 ***
## TSI 0.093116 0.014729 6.32 0.0000000010355 ***
## Aerosols -1.540206 0.212616 -7.24 0.0000000000044 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0916 on 276 degrees of freedom
## Multiple R-squared: 0.751, Adjusted R-squared: 0.745
## F-statistic: 119 on 7 and 276 DF, p-value: <2e-16
基礎模型的差方和(SST):\(\sum_i (y_i-\bar{y})^2\)
回歸模型的差方和(SSE):\(\sum_i (\hat{y_i}-y_i)^2\)
準確性指標(\(R^2\)):\(1 - \frac{SSE}{SST}\)
平均誤差(RMSE):\(\sqrt{SSE/n}\)
pred = predict(m3) # 使用訓練資料做預測
SSE = sum((pred - TR$Temp)^2) # SSE.train
SST = sum((mean(TR$Temp) - TR$Temp)^2) # SST.train
R2 = 1 - SSE/SST # R2.train
RMSE = sqrt(SSE/nrow(TR)) # RMSE.train
c(SSE, SST, R2, RMSE)## [1] 2.31351 9.28526 0.75084 0.09026
pred = predict(m3, TS) # 使用測試資料做預測
SSE = sum((pred - TS$Temp)^2) # SSE.test
SST = sum((mean(TR$Temp) - TS$Temp)^2) # SST.test
R2 = 1 - SSE/SST # R2.test
RMSE = sqrt(SSE/nrow(TS)) # RMSE.test
c(SSE, SST, R2, RMSE)## [1] 0.21764 0.58602 0.62861 0.09523
注意一下,我們計算 測試資料 的差方和是使用 訓練資料 的平均值做基準
\[ SST_{test} = \sum_{i=1}^n(\bar{y}_{train} - y_{i,test})^2\] 所以 \(R_{test}^2\) 有可能會小於零。
【Q3】在商務數據分析的角度,我們應該關心的是自變數的:
(a) 係數估計值
(b) \(p\)-value
(c) 顯著性
(d) 其他,請說明為甚麼?