library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
sjPlot::plot_scatter(Lib, ALP, data = AustralianElectionPolling)
## Warning in check_dep_version(): ABI version mismatch:
## lme4 was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install lme4 from source or restore original 'Matrix' package
Figure 2.1: 散佈圖
#A significant p-value indicates non-linearity using the Tukey test
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
Aus <- lm(ALP ~ Lib + org, data = pscl::AustralianElectionPolling)
# Tukey test
residualPlots(Aus)
Figure 2.2: 線性假設檢定圖
## Test stat Pr(>|Test stat|)
## Lib 2.78 0.0059 **
## org
## Tukey test 2.63 0.0086 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
違反線性假設表示\(\beta\)可能會在不同的X有不同的值。當\(Y=\beta_{0}+\beta_{1}X\),X=0時,\(Y=\beta_{0}\),X=1時,如果符合線性假設,\(Y=\beta_{0}+\beta_{1}\),X從0變成1,Y的變化是\(Y=\beta_{0}\)。如果不符合線性假設,\(Y=\beta_{0}+a*\beta_{1}X\),X從0變成1,Y的變化是\(\beta_{0}+(a-1)*\beta_{1}\)。因此,違反線性假設時難以詮釋Y的變化程度。
如果真的違反線性假設,必須特別注意Y相對於不同的X值可能有不同的線性變化。
\[\hat{\beta_{1}}=\frac{\sum_{i=1}^n(x_{i}-\bar{x})(y_{i}-\bar{y})}{\sum_{i=1}^n(x_{i}-\bar{x})^2}\]
\(\epsilon\)代表所有影響Y,但是觀察不到的變數。如果有一個未觀察的變數,同時影響Y以及X,就違反這個假設。
例如薪資=\(\beta_{0}+\beta_{1}\)教育程度+\(\epsilon\)
在這個模型中,「能力」(\(q_{i}\))並未被我們觀察,進入誤差項,\(\epsilon_{i}=\gamma q_{i}+v_{i}\)。如果\(Cov(q_{i},X_{i})\neq 0\), 那麼\(Cov(u_{i},X_{i})\neq 0\)。
如果\(\gamma>0\),\(Cov(q_{i},X_{i})>0\),迴歸係數估計會變大,也就是高估自變數與依變數的關係。
set.seed(02138)
X=rbinom(100, 20, prob=0.45)
Y=X+runif(100, 0, 0.5)
e=ifelse(X>=9, 1, 0)
OLS1=lm(Y ~ X)
#summary(OLS1)
OLS2=lm(Y+e ~ X)
#summary(OLS2)
stargazer(OLS1,OLS2,align = T,
title='不同依變數迴歸模型比較',
type=ifelse(knitr::is_latex_output(),"latex","html"),
label=knitr::opts_current$get("label"))
Dependent variable: | ||
Y | Y + e | |
(1) | (2) | |
X | 1.004*** | 1.177*** |
(0.006) | (0.014) | |
Constant | 0.213*** | -0.736*** |
(0.057) | (0.127) | |
Observations | 100 | 100 |
R2 | 0.996 | 0.987 |
Adjusted R2 | 0.996 | 0.986 |
Residual Std. Error (df = 98) | 0.143 | 0.317 |
F Statistic (df = 1; 98) | 25,878.000*** | 7,206.000*** |
Note: | p<0.1; p<0.05; p<0.01 |
R
的plot(OLS1,1, yaxt='n')
axis(2, c(-0.6, -0.2, 0, 0.2, 0.6))
abline(h = 0, lty=2)
Figure 2.3: 條件平均值為0檢驗圖1
plot(OLS2,1, yaxt='n')
axis(2, c(-0.6, -0.2, 0, 0.2, 0.6))
abline(h = 0, lty=2)
Figure 2.4: 條件平均值為0檢驗圖2
X<-c(19,17,21,18,15,12,14,20)
Y<-c(1,3,1,1,2,3,2,1)
M1<- lm(Y ~ X)
plot(M1$fitted.values, M1$residuals)
abline(h = 0, lty = 2, lwd = 1.4)
Figure 2.5: 變異數相等假設檢驗圖1
X<-c(19,17,21,18,15,12,14,20)
Y<-c(1,3,1,1,2,3,2,1)
M1<- lm(Y ~ X)
car::spreadLevelPlot(M1)
Figure 2.6: 變異數相等假設檢驗圖2
##
## Suggested power transformation: 0.3847
car::spreadLevelPlot(lm(eruptions~waiting, faithful))
Figure 2.7: 變異數相等假設檢驗圖3
##
## Suggested power transformation: 0.8039
如果違反這個假設,標準誤有可能膨脹,也就是並不有效。
迴歸係數的條件變異數表示如下(後續說明如何得出):
\[\boxed{\rm{Var}[\hat{\beta}_{1}|X]={\frac{\sigma_{u}^{2}}{\sum_{i=1}^{n}(x-\bar{x})^2}}=\frac{\sigma_{u}^{2}}{SST_{x}}}\]
\[\boxed{ \rm{Var}[\hat{\beta}_{0}|X] =\sigma_{u}^{2}(\frac{1}{n}+\frac{\bar{x}^2}{\sum_{i=1}^{n}(x-\bar{x})^2}) =\frac{\sigma^2\sum_{i=1}^{n}x^2}{\sum_{i=1}^{n}(x-\bar{x})^2}}\]
\[\rm{where}\hspace{.4em}\rm{Var}[u|X]=\sigma_{u}^{2}\]
其中\(u\)代表殘差。
當\(\sigma_{u}^{2}\)小的時候,\(Var[\hat{\beta}_{1}|X]\)也會變小,也就是未知的變數對於係數的影響越小,離真實的迴歸線可能越近。
因為\(\sum_{i=1}^{n}(x-\bar{x})^2=(n-1)S_{x}^{2}\),所以 n越大,\(Var[\hat{\beta}_{1}|X]\)越小,可能越近真實的迴歸線。
\(S_{x}^{2}\)越大,\(Var[\hat{\beta}_{1}|X]\)越小,離真實的迴歸線可能越近。
\(Var[\hat{\beta}_{1}|X]\)是\(Var[\beta_{1}|X]\)的估計之一,有可能因為n很大或者是\(Var[X]\)很大,使得\(Var[\hat{\beta}_{1}|X]\)變小
圖2.8似乎顯示殘差值在不同Y固定X的情況下,並不是隨機分佈。用另外一個資料來觀察殘差與預測值的關係:
#relationship between age of units and distances to employment centers.
B1 <- lm(age ~ dis, data=MASS::Boston)
plot(fitted(B1), resid(B1))
abline(0,0, col='#FF11CC')
Figure 2.8: 殘差值與預測值散佈圖1
library(olsrr)
ols_plot_resid_fit(B1)
Figure 2.9: 用olsrr畫殘差值與預測值散佈圖1
set.seed(02138)
X<-rnorm(100, 0, 1); Y<-X+rnorm(100, 0, 1.1)
M1 <- lm(Y~X)
plot(fitted(M1), resid(M1), col='#2200EE')
abline(0,0, col='#AA1100', lty = 2, lwd = 1.5)
Figure 2.10: 殘差值與預測值散佈圖2
set.seed(02138)
X<-rnorm(100, 0, 1); Y<-X^2+runif(100, 0, 2)
M1 <- lm(Y~X)
stargazer::stargazer(M1, type = 'html')
Dependent variable: | |
Y | |
X | -0.258* |
(0.150) | |
Constant | 1.866*** |
(0.149) | |
Observations | 100 |
R2 | 0.029 |
Adjusted R2 | 0.020 |
Residual Std. Error | 1.490 (df = 98) |
F Statistic | 2.971* (df = 1; 98) |
Note: | p<0.1; p<0.05; p<0.01 |
plot(fitted(M1), resid(M1), col='#AE13C0')
abline(0,0, col='#AA1100')
Figure 2.11: 殘差值與預測值散佈圖3
library(sandwich)
## Warning: package 'sandwich' was built under R version 4.3.3
library(lmtest)
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:data.table':
##
## yearmon, yearqtr
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
set.seed(02138)
X<-rnorm(100, 0, 1); Y<-X^2+runif(100, 0, 2)
M1 <- lm(Y~X)
# White/robust standard errors
coeftest(M1, vcov = vcovHC(M1, type = "HC0"))
t test of coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.866 0.146 12.82 <2e-16 ***
X -0.258 0.260 -0.99 0.32
—
Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1
\[Y|X \sim N(\beta_{0}+\beta_{1}X, \hspace{.4em}\sigma_{u}^{2})\]
Figure 2.12: 誤差項的變異數圖1
Figure 2.13: 誤差項的變異數圖2
\[\hat{\beta}_{1}-\beta_{1}/\sqrt{\rm{Var[\hat{\beta}_{1}|X]}}=\frac{\hat{\beta}_{1}-\beta_{1}}{s.e._{\hat{\beta}}}\sim N(0,1)\]
\[\hat{\beta}\sim N(\beta_{1},\hspace{.3em}\rm{Var}[\hat{\beta_{1}}|X])\]
\[\rm{Var}[\hat{\beta_{1}}|X])=\sigma_{u}^2/\sum\limits_{i=1}^{n}(x_{i}-\bar{x})^2\]
\[\frac{\hat{\beta}_{1}-\beta_{1}}{s.e._{\hat{\beta}}}\sim N(0,1)\]
Figure 3.1: 常態分佈抽出的樣本平均值分佈
par(mfrow=c(3,2), mar = c(2, 2, 2, 2))
set.seed(02138)
res <-c()
s <- c()
simulations <- c(10, 100, 500)
samplesize <- c(30, 100)
#loop
for (i in 1:length(simulations)){
for (j in 1:length(samplesize)){
f=function(n = samplesize[j], mu=1, sigma = 1){
s = rnorm(samplesize[j], 1, 1)
(mean(s)-mu)/sqrt((sum(s-mean(s))^2)/sqrt(n))
}
#xvals = seq(-1, 1, 0.1)
hist(UsingR::simple.sim(simulations[i], f, samplesize[j], 1), probability=TRUE, breaks = 40, col='#FF9933', main = paste("N =", samplesize[j], ", Num. of Samples=", simulations[i]),
xlab=expression(mu))
# xvals = seq(-1, 1, .01)
#points(xvals, dnorm(xvals, 0, 1),type="l", lwd=2, col='#FF9933')
}
}
Figure 3.2: 不同樣本規模的樣本平均值分佈
# Set the parameters
lambda <- 0.5 # Rate parameter of the exponential distribution
sample_size <- 100 # Sample size
num_simulations <- 200 # Number of simulations
# Function to estimate lambda from a sample
estimate_lambda <- function(data) {
mean(data) # Estimator of lambda for exponential #distribution is the sample mean (1/lambda)
}
# Simulate data and estimate parameters
lambda_estimates <- replicate(num_simulations, {
# Generate a sample from exponential distribution
sample_data <- rexp(sample_size, rate = lambda)
# Estimate lambda from the sample
estimate_lambda(sample_data)
})
# Plot the distribution of estimated lambdas
hist(lambda_estimates, breaks = 30, main = "Distribution of 200 Estimated Lambdas",
xlab = "Estimated Lambda", col = "#00EEAA", border = "white")
Figure 3.3: 指數分佈抽出的200個樣本平均值分佈
# Set the parameters
lambda <- 0.5 # Rate parameter of the exponential distribution
sample_size <- 100 # Sample size
num_simulations <- 1000 # Number of simulations
# Function to estimate lambda from a sample
estimate_lambda <- function(data) {
mean(data) # Estimator of lambda for exponential #distribution is the sample mean (1/lambda)
}
# Simulate data and estimate parameters
lambda_estimates <- replicate(num_simulations, {
# Generate a sample from exponential distribution
sample_data <- rexp(sample_size, rate = lambda)
# Estimate lambda from the sample
estimate_lambda(sample_data)
})
# Plot the distribution of estimated lambdas
hist(lambda_estimates, breaks = 30, main = "Distribution of 1000 Estimated Lambdas",
xlab = "Estimated Lambda", col = "#112ee1", border = "white")
Figure 3.4: 指數分佈抽出的1000個樣本平均值分佈
\(V[Y]=V[\sum X_{i}]=\sum V[X_{i}]=n\cdot p(1-p)\)
\(V[\frac{Y}{n}]=\frac{1}{n^2}V[Y]=\frac{1}{n^2}np(1-p)=\frac{p(1-p)}{n}\)
set.seed(2019)
results =numeric(0)
k=100; p=0.25; mu=k*p # k is number of trials
for (i in 1:1000) {
S = rbinom(1, k, p)
results[i]=(S-mu)/sqrt(k*p*(1-p)) #
}
hist(results, probability = T, col='#3399FF', breaks=30, main='')
xvals = seq(-3,3,.01)
points(xvals,dnorm(xvals,0,1),type="l", lwd=2, col='#FF9933')
Figure 3.5: 二元分佈抽出的樣本平均值分佈
set.seed(02138)
# 載入套件
library(ggplot2)
set.seed(2025)
# 參數設定
p <- 0.25
n <- 100
n_sim <- 1000
# 模擬抽樣比例
phat <- rbinom(n_sim, n, p) / n
# 標準化
se <- sqrt(p * (1 - p) / n)
z_scores <- (phat - p) / se
# 整理資料框
df <- data.frame(z = z_scores)
# 畫圖
ggplot(df, aes(x = z)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "#66CCFF", color = "white") +
stat_function(fun = dnorm, args = list(mean = 0, sd = 1),
color = "red", size = 1.2) +
labs(
title = "中央極限定理示意:樣本比例的標準化分布",
x = "標準化後的樣本比例 (Z 值)",
y = "密度"
) +
theme_minimal(base_size = 14)
Figure 3.6: 二元分佈樣本統計分佈
假設 \(X_1, X_2, \dots, X_n\) 為獨立而且一致抽樣 (independent and identically distributed, i.i.d.) 的隨機變數,平均值 \(\mu\) 且標準差 \(\sigma\)。
隨機樣本的和表示為:
\[ S_n = \sum_{i=1}^{n} X_i \]
\[\text{SD}\left(\sum_{i=1}^{n} X_{i}\right) = \sigma\sqrt{n}\]
\[ \frac{S_n - n\mu}{\sigma \sqrt{n}} \xrightarrow{d} \mathcal{N}(0, 1) \quad \text{as } n \to \infty \]
set.seed(02138)
# Number of simulations
n_sim <- 10000
# Function to simulate sum of n exponentials
simulate_sum <- function(n, rate = 1) {
replicate(n_sim, sum(rexp(n, rate = rate)))
}
# Simulate 2, 5, and 30 exponential sums
x1 <- simulate_sum(2)
x5 <- simulate_sum(5)
x30 <- simulate_sum(30)
# Plotting
par(mfrow = c(1, 3)) # 3 plots side by side
hist(x1, breaks = 50, col = "skyblue", main = "Sum of 2 Exp(1)", xlab = "Value", probability = TRUE)
curve(dnorm(x, mean = mean(x1), sd = sd(x1)), col = "red", add = TRUE, lwd = 2)
hist(x5, breaks = 50, col = "lightgreen", main = "Sum of 5 Exp(1)", xlab = "Value", probability = TRUE)
curve(dnorm(x, mean = mean(x5), sd = sd(x5)), col = "red", add = TRUE, lwd = 2)
hist(x30, breaks = 50, col = "lightcoral", main = "Sum of 30 Exp(1)", xlab = "Value", probability = TRUE)
curve(dnorm(x, mean = mean(x30), sd = sd(x30)), col = "red", add = TRUE, lwd = 2)
Figure 3.7: 變數和的中央極限定理
\[Y=\hat{\beta}_{0} - \hat{\beta}_{1}X_{1}+u\]
\[\begin{align} Y=5 - 1x+u \tag{3.1} \end{align}\]
\[u\sim N(0,4)\]
set.seed(02138)
#Simulations of Y, X, residuals
y<-rep(NA,40000)
x<-matrix(NA,4000,c(200,200))
for(j in 1:200){
x[,j]<-runif(200,1,10) }
ru<-matrix(NA,4000,c(200,200))
for(j in 1:200){
ru[,j] <- rnorm(200,0,4)}
#Model
y<-5-x+ru
#Simulations of Beta
lm.beta0 <- rep(NA,200); lm.beta1 <- rep(NA,200)
for(j in 1:200){ lm.beta0[j]<-lm(y[,j]~x[,j])$coefficients[1]
lm.beta1[j]<-lm(y[,j]~x[,j])$coefficients[2] }
dt <- data.frame(b1 = lm.beta1, b0 = lm.beta0)
b1<-ggplot2::ggplot(data = dt,aes(x=b1)) +
geom_histogram(bins = 40, fill='#FF6666') +
labs(x = expression(paste(beta,'1'))) +
geom_density(alpha=.9)
b1
Figure 3.8: 迴歸係數長條圖1
b0<-ggplot2::ggplot(data = dt,aes(x=b0)) +
geom_histogram(bins=40, fill='#FF1111') +
labs(x = expression(paste(beta,'0'))) +
geom_density(alpha =.9)
b0
Figure 3.9: 迴歸係數長條圖2
\[ \hat{u}_{i}=y_{i}-\hat{y}_{i}=y_{i}-\hat{\beta}_{0}-\hat{\beta}_{1}x_{i} \]
\[ MSD(\hat{u})\equiv\frac{1}{n}\sum\limits_{i=1}^{n}(\hat{u}_{i}-\bar{\hat{u}})^2=\frac{1}{n}\sum\limits_{i=1}^{n}\hat{u}_{i}^2 \]
\[ \begin{eqnarray} E[MSD(\hat{u})] & = & \frac{n-2}{n}\sigma^{2}_{u}\\ \sigma^{2}_{u} & = &\frac{n}{n-2}MSD(\hat{u}) \\ & =& \frac{n}{n-2}\cdot\frac{1}{n}\sum\limits_{i=1}^{n}\hat{u}_{i}^2 \\ & =& \frac{1}{n-2}\sum\limits_{i=1}^{n}\hat{u}_{i}^2 \end{eqnarray} \]
\[\begin{align*} \sigma^{2}_{u}=\frac{1}{n-2}\sum\limits_{i=1}^{n}\hat{u}_{i}^2 \tag{5.1} \end{align*}\]
\[\begin{align*} \sum(x_{i}-\bar{x})\bar{y}&= (\sum x_{i}-n\bar{x})\bar{y}\\ &=\sum x_{i}\bar{y}-n\frac{\sum x_{i}}{n}\bar{y}\\ &=\sum x_{i}\bar{y}-\sum x_{i}\bar{y}\\ & = 0 \tag{5.2} \end{align*}\]
\[\sum_{i=1}^{n} c=n\cdot c\]
\[\sum_{i=1}^{n} \bar{X}=n\cdot \bar{X}\]
\[\begin{align*} \hat{\beta}_{1}=\beta_{1}+\frac{\sum (x_{i}-\bar{x})u_{i}}{SST_{x}} \tag{5.3} \end{align*}\]
\[\begin{align*} E[\hat{\beta}_{1}|X]&=E[\beta_{1}|X]+E[\frac{\sum (x_{i}-\bar{x})u_{i}}{SST_{x}}|X]\\ &=\beta_{1}+\sum_{i=1}^{n}\frac{\sum (x_{i}-\bar{x})}{SST_{x}}E[u_{i}|X] \\ &=\beta_{1} \end{align*}\]
\[E[\hat{\beta}_{1}]=E[E[\hat{\beta}_{1}|X]]=\beta_{1}\]
\[ \hat{\beta_{0}} = \bar{y} - \hat{\beta_{1}}\bar{x} \]
\[ \hat{y} = \hat{\beta_{0}} + \hat{\beta_{1}}x_{i} \]
\[\hat{\beta}_{1}=\beta_{1}+\frac{\sum (x_{i}-\bar{x})u_{i}}{SST_{x}}\]
\[\begin{align*} V[\hat{\beta}_{1}|X]&=V[\beta_{1}|X]+V[\frac{\sum (x_{i}-\bar{x})u_{i}}{SST_{x}}|X]\\ &=0 + \sum \{\frac{(x_{i}-\bar{x})u_{i}}{SST_{x}}\}^2\hspace{.1em}\rm{Var}[u_{i}|X]\\ &=\frac{\sum (x-\bar{x})^2}{SST_{x}^2}\sigma_{u}^2\\ &=\frac{SST_{x}}{SST_{x}^2}\sigma_{u}^2=\frac{\sigma_{u}^2}{SST_{x}} \tag{5.4} \end{align*}\]
在上面的式子(5.4)的簡化過程中,使用到變異數的特性。因為\(\beta_{1}\)是常數,沒有變異數可言,所以Var[c+x] = Var[c]+Var[x]=0+Var[x]。而且Var\([c\times x]=c^2\)Var[x]。
根據式(5.4),迴歸係數的條件變異數寫成:
\[\begin{align*} \hat{V}[\hat{\beta}_{1}|X] & = \mathcal{\frac{\sigma_{u}^{2}}{\sum_{i=1}^{n}(x-\bar{x})^2}} \\ & = \frac{\sigma_{u}^{2}}{SST_{x}} \tag{5.5} \end{align*}\]
\[\begin{align*} \hat{V}[\hat{\beta}_{0}|X] & = \sigma_{u}^{2}\mathcal{\{\frac{1}{n}+\frac{\bar{x}^2}{\sum_{i=1}^{n}(x-\bar{x})^2}}\}\\ & = \mathcal{\frac{\sigma_{u}^2\sum_{i=1}^{n}x^2}{n\sum_{i=1}^{n}(x-\bar{x})^2}} \tag{5.6} \end{align*}\]
\(\blacksquare\) 上面的方程式(5.6)、(5.5)分別取開根號得到標準誤:\(\sqrt{\hat{V}[\hat{\beta}_{0}|X]}\)以及\(\sqrt{\hat{V}[\hat{\beta}_{1}|X]}\)
\[ u\sim N(0,\sigma_{u}^{2}) \]
\[ \mathrm{Y}|\mathrm{X} \sim N(\beta_{0}+\beta_{1}X, \sigma_{u}^{2}) \]
\[ \sigma^2=\frac{\Sigma(y-\hat{y})^2}{n-2} \] \[ \mathrm{SE}(\hat{\beta_{1}})=\sqrt{\frac{\sigma^2}{\sum(x_{i}-\bar{x})^2}} \]
\[\hat{\beta_{1}}\sim N\Big(\beta_{1},\frac{\sigma^2}{\sum (x_{i}-\bar{x})^2}\Big)\]
\[\hat{\beta_{0}}\sim N\Big(\beta_{0},\frac{\sum x_{i}^2\sigma^2}{n\sum (x_{i}-\bar{x})^2}\Big)\]
\(\it{H}_{0}\): X與Y沒有關係,也就是\(\it{H}_{0}\): \(\beta_{1}=0\)
\(\it{H}_{a}\): X與Y有某種關係,也就是\(\it{H}_{a}\): \(\beta_{1} \neq 0\)
\[ t=\frac{\hat{\beta}_{1}-0}{\mathrm{SE}(\hat{\beta}_{1})} \]
par(bty='n', yaxt='n')
cord.x<-c(1.962, seq(1.962,3,0.01), 3)
cord.x2<-c(-3, seq(-3, -1.962,0.01), -1.962)
cord.y<-c(0, dt(seq(1.962, 3, 0.01), 1000), 0)
cord.z<- c(0, dt(seq(-3, -1.962, 0.01), 1000), 0)
curve(dt(x, 1000),xlim=c(-3,3),
main=expression(paste("Normal Density with"," ", t[alpha/2]=="0.025")) , ylab='', xlab='t value')
polygon(cord.x, cord.y, col="red3")
polygon(cord.x2, cord.z, col="red3" , add=T)
Figure 5.1: t分佈的雙尾拒斥域
par(bty='n', yaxt='n')
cord.x<-c(1.646, seq(1.646,3,0.01), 3)
cord.y<-c(0, dt(seq(1.646, 3, 0.01), 1000), 0)
curve(dt(x, 1000),xlim=c(-3,3),
main=expression(paste("Normal Density with"," ", t[alpha/2]=="0.05")) , ylab='', xlab='t value')
polygon(cord.x, cord.y, col="red3")
Figure 5.2: t分佈的右尾拒斥域
plot(function(x) dt(x, df = 300), -3, 3, ylim = c(0, .5),
main = "t - Density", yaxs = "i", col="white",ylab="Density",
xlab=expression(paste(X %~% tau[n])))
curve(dt(x, df = 3), -3,3, bty="l", col="blue", add=T, lwd=2)
curve(dt(x, df = 1), -3,3, bty="l", col="black", add=T, lwd=2)
curve(dt(x, df = 1000), -3,3, bty="l", col="red", add=T, lwd=2)
curve(dt(x, df = 15), -3,3, bty="l", col="green", add=T, lwd=2)
legend("topright", lty=c(1,1,1,1), lwd=c(2,2,2,2),
legend=c(expression(nu==1, nu==3, nu==15,nu==1000)),
col=c("black", "blue", "green","red"))
Figure 5.3: 不同自由度的t分佈
R
計算\(t\)分布的百分位的機率,也可以計算機率對應的百分位。首先計算特定機率的百分位如表5.1:
qtd <-data.frame(P=c('95%','97.5%','99%'),
t=c(qt(0.950, 1000),qt(0.975, 1000),qt(0.990, 1000)))
newqtd <- qtd %>% knitr::kable("html", caption='累積機率對應的t值') %>%
kableExtra::kable_styling(bootstrap_options = 'striped')
newqtd
P | t |
---|---|
95% | 1.646 |
97.5% | 1.962 |
99% | 2.330 |
tt <- data.frame(p_168=pt(1.68, 1000), p_196=pt(1.96, 1000), p_300=pt(3.00, 1000))
colnames(tt)<-c("t=1.68","t=1.96","t=3.00")
newtt <- tt %>% kable("html", caption='t值對應的百分位') %>%
kable_styling(bootstrap_options = 'striped')
newtt
t=1.68 | t=1.96 | t=3.00 |
---|---|---|
0.9534 | 0.9749 | 0.9986 |
TAB <- data.frame(price=c(300,250,400,550,317,389,425,289,389,559),
bedroom=c(3,3,4,5,4,3,6,3,4,5))
knitr::kable(TAB, format='simple', caption='房價與房間數資料')
price | bedroom |
---|---|
300 | 3 |
250 | 3 |
400 | 4 |
550 | 5 |
317 | 4 |
389 | 3 |
425 | 6 |
289 | 3 |
389 | 4 |
559 | 5 |
pricemodel<-lm(price ~ bedroom, data=TAB)
stargazer::stargazer(pricemodel, title='房價與房間數模型',
type=ifelse(knitr::is_latex_output(),"latex","html"),
label=knitr::opts_current$get("label"))
Dependent variable: | |
price | |
bedroom | 73.100** |
(23.760) | |
Constant | 94.400 |
(97.980) | |
Observations | 10 |
R2 | 0.542 |
Adjusted R2 | 0.485 |
Residual Std. Error | 75.150 (df = 8) |
F Statistic | 9.462** (df = 1; 8) |
Note: | p<0.1; p<0.05; p<0.01 |
T=(100-73.1)/23.76
cat("Test statistic:", T, "\n")
Test statistic: 1.132
pvalue=pt(T, df = 8, lower.tail = T)
cat("p-value:", pvalue)
p-value: 0.8548
結果顯示\(p>0.05\),所以無法拒斥「每增加一間房間售價就可以增加100,000元」的虛無假設。
如果用\(73.1-100=-26.9\)構成一個\(t\)分佈,會發現\(95\%\)的信賴區間包含0,同樣可以得出$=$100的結論。
TAB <- data.frame(elevation=c(600,1000,1250,1600,1800,2100,2500,2900),
temp=c(56,54,56,50,47,49,47,45))
knitr::kable(TAB, format='simple', caption='氣溫與高度資料')
elevation | temp |
---|---|
600 | 56 |
1000 | 54 |
1250 | 56 |
1600 | 50 |
1800 | 47 |
2100 | 49 |
2500 | 47 |
2900 | 45 |
m1<-lm(temp ~ elevation, data=TAB)
stargazer::stargazer(m1, title='氣溫直減率檢定',
type=ifelse(knitr::is_latex_output(),"latex","html"),
label=knitr::opts_current$get("label"))
Dependent variable: | |
temp | |
elevation | -0.005*** |
(0.001) | |
Constant | 59.290*** |
(1.717) | |
Observations | 8 |
R2 | 0.837 |
Adjusted R2 | 0.810 |
Residual Std. Error | 1.879 (df = 6) |
F Statistic | 30.810*** (df = 1; 6) |
Note: | p<0.1; p<0.05; p<0.01 |
T=(-0.005115-(-0.00534))/0.000921
T
[1] 0.2443
pvalue=pt(T, df =6, lower.tail = FALSE)
pvalue
[1] 0.4076
pvalue*2
[1] 0.8151
dt <- data.frame(quantity=c(2537,2,26761,2500,111,1900,439,4,4485,
22778,1354,69,1492,4,2),
value=c(15332,33,122517,12306,569,12350,2525,56,24713,
117794,8578,451,11932,59,23))
knitr::kable(dt, format='simple', caption='吳郭魚的產值與產量資料')
quantity | value |
---|---|
2537 | 15332 |
2 | 33 |
26761 | 122517 |
2500 | 12306 |
111 | 569 |
1900 | 12350 |
439 | 2525 |
4 | 56 |
4485 | 24713 |
22778 | 117794 |
1354 | 8578 |
69 | 451 |
1492 | 11932 |
4 | 59 |
2 | 23 |
m1<-lm(value ~ quantity, data = dt)
stargazer::stargazer(m1, title='吳郭魚的產值與產量模型',
type=ifelse(knitr::is_latex_output(),"latex","html"),
label=knitr::opts_current$get("label"))
Dependent variable: | |
value | |
quantity | 4.788*** |
(0.103) | |
Constant | 1,383.000 |
(950.800) | |
Observations | 15 |
R2 | 0.994 |
Adjusted R2 | 0.994 |
Residual Std. Error | 3,259.000 (df = 13) |
F Statistic | 2,156.000*** (df = 1; 13) |
Note: | p<0.1; p<0.05; p<0.01 |
T=(4.78 - 5.1)/0.1031
T
[1] -3.104
n=nrow(dt)
pvalue=pt(T, df = n-2, lower.tail = TRUE)
pvalue
[1] 0.004193
car
裡面的Salaries
資料分析:library(carData)
fit1 <- with(Salaries, lm(salary ~ sex))
stargazer::stargazer(fit1, title='大學教師的薪水迴歸模型',
type=ifelse(knitr::is_latex_output(),"latex","html"),
label=knitr::opts_current$get("label"))
Dependent variable: | |
salary | |
sexMale | 14,088.000*** |
(5,065.000) | |
Constant | 101,002.000*** |
(4,809.000) | |
Observations | 397 |
R2 | 0.019 |
Adjusted R2 | 0.017 |
Residual Std. Error | 30,035.000 (df = 395) |
F Statistic | 7.738*** (df = 1; 395) |
Note: | p<0.1; p<0.05; p<0.01 |
h0=1-pt(1.96, 395)
h1=1-pt(2.78, 395)
h0; h1
## [1] 0.02535
## [1] 0.002848
ggplot2::ggplot(data=Salaries,
ggplot2::aes(x=sex, y=salary))+
geom_point(col='#bb3311') +
stat_summary(geom = "line", fun = mean, group = 1) +
theme_bw()
Figure 6.1: 大學教師薪水與性別分佈圖
library(readxl)
file <- here::here('Data', 'pressure.xlsx')
pressure <- read_excel(file) #Upload the data
lmTemp = lm(Pressure~Temperature, data = pressure) #Create the linear regression
plot(pressure, pch = 16, col = "blue") #Plot the results
abline(lmTemp) #Add a regression line
Figure 6.2: 氣溫與壓力
plot(lmTemp$residuals, pch = 16, col = "#3333EE")
lmTemp2 = lm(Pressure ~ Temperature + I(Temperature^2), data = pressure) #Create a linear regression with a quadratic coefficient
stargazer(lmTemp2, title = '氣溫與壓力迴歸模型', type=ifelse(knitr::is_latex_output(),"latex","html"),
label=knitr::opts_current$get("label"))
Dependent variable: | |
Pressure | |
Temperature | -1.732*** |
(0.151) | |
I(Temperature2) | 0.052*** |
(0.001) | |
Constant | 33.750*** |
(3.616) | |
Observations | 10 |
R2 | 1.000 |
Adjusted R2 | 0.999 |
Residual Std. Error | 3.074 (df = 7) |
F Statistic | 7,859.000*** (df = 2; 7) |
Note: | p<0.1; p<0.05; p<0.01 |
library(olsrr)
ols_plot_resid_fit(lmTemp2)
set.seed(02138)
X<-rnorm(100, 0, 1); Y<-X+rnorm(100, 0, 1.1)
M1 <- lm(Y~X)
plot(fitted(M1), resid(M1), col='#EE1100')
abline(0,0, col='#0033CC')
Figure 7.1: 殘差值與預測值散佈圖2
已知\(\hat{\beta}_{1}=\frac{\sum (x_{i}-\bar{x})(y_{i}-\bar{y})}{\sum (x_{i}-\bar{x})^2}=\frac{\sum (x_{i}-\bar{x})y_{i}}{SST_{x}}\)。
根據(5.3),迴歸係數\(\hat{\beta}_{1}\)可從上面的程式改寫為:
\[\hat{\beta}_{1}=\beta_{1}+\frac{\sum (x_{i}-\bar{x})u_{i}}{\sum (x_{i}-\bar{x})^2}\]
假設: \[w_{i}\equiv \frac{\sum (x_{i}-\bar{x})}{\sum (x_{i}-\bar{x})^2}\]
根據式(5.4), \[\begin{align*} V[\hat{\beta}_{1}|X]&=V[\beta_{1}|X]+V[\frac{\sum (x_{i}-\bar{x})u_{i}}{SST_{x}}|X]\\ &=0 + \sum \{\frac{(x_{i}-\bar{x})u_{i}}{SST_{x}}\}^2\hspace{.1em}\rm{Var}[u_{i}|X]\\ &=\frac{\sum (x-\bar{x})^2}{SST_{x}^2}\sigma_{u}^2\\ &=\frac{SST_{x}}{SST_{x}^2}\sigma_{u}^2=\frac{\sigma_{u}^2}{SST_{x}} \tag{5.4} \end{align*}\]
在推演過程中,我們假設:
\[\frac{\sum (x-\bar{x})^2}{SST_{x}^2}\sigma_{u}^2\]
\[\frac{\sum (x-\bar{x})^2}{SST_{x}^2}\sigma_{u(i)}^2\]
\[ \text{Var}(\hat{\beta_{1}})=\frac{\sigma_{u(i)}^2}{SST_{x}} \]
\[u_{i}^2=\alpha_{0}+\alpha_{1}z_{i,1}+\ldots +\alpha_{k}z_{i,k}+\nu_{i}\]
\[\textit{H}_{0}:\alpha_{2}=\alpha_{3}=\ldots = \alpha_{k}=0\]
library(lmtest)
# Example data
set.seed(02139)
x1 <- rnorm(200)
x2 <- rnorm(200)
y <- 2*x1 + 3*x2 + rnorm(200)
# Fit linear regression model
model <- lm(y ~ x1 + x2)
# Perform Breusch-Pagan test
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 8.3, df = 2, p-value = 0.02
如果p<0.05,代表違反變異數同質性的假設。
也可用圖7.2檢測。
showtext::showtext_auto()
set.seed(02139)
x1 <- rnorm(200)
x2 <- rnorm(200)
y <- 2*x1 + 3*x2 + rnorm(200)
# Fit linear regression model
model <- lm(y ~ x1 + x2)
# visualization
plot(fitted(model), resid(model), col='#3322cc',
main = '檢測殘差變異數同質性假設散佈圖')
abline(0,0, col='#bb1100')
Figure 7.2: 殘差值與預測值散佈圖3
\[u_{i}^2=\alpha_{0}+\alpha_{1}z_{i,1}+\alpha_{2}z_{i,1}^2+\ldots +\alpha_{k}z_{i,k}+\alpha_{k+1}z_{i,k+1}^2+\nu_{i}\]
\[\textit{H}_{0}:\alpha_{2}=\alpha_{3}=\ldots = \alpha_{k}=0\]
data <- data.frame(y=y, x1=x1, x2=x2)
model2 <- lm(y ~ x1 + x2, data)
u2 <- model2$residuals^2
Ru2<- summary(lm(u2 ~ x1 + x2 + x1^2 + x2^2))$r.squared
LM <- nrow(data)*Ru2
p.value <- 1-pchisq(LM, 4)
cat("White Test's p value:", p.value, '\n')
## White Test's p value: 0.08185
因為p>0.05,所以沒有違反殘差變異數同質性假設。
圖2.8似乎顯示殘差在不同Y固定X的情況下,並不是隨機分佈。我們用Breusch-Pagen test以及White test檢測:
mydata <- pscl::bioChemists
m1 <- lm(ment ~ phd + kid5, data = mydata)
u2 <- m1$residuals^2
Ru2<- summary(lm(u2 ~ mydata$phd + mydata$phd^2 +
mydata$kid5 + mydata$kid5^2))$r.squared
LM <- nrow(mydata)*Ru2
p.value <- 1-pchisq(LM, 4)
cat("White Test's p value:", p.value, '\n')
## White Test's p value: 0.03154
#alternative way
fit_Y <- m1$fitted.values
r2.u <- summary(lm(u2 ~ fit_Y + fit_Y^2))$r.squared
LM <- nrow(mydata)*r2.u
p.value <- 1-pchisq(LM, 2)
cat("White Test's p value:", p.value, '\n')
## White Test's p value: 0.03086
# skedastic package
library(skedastic)
white(m1)
## # A tibble: 1 × 5
## statistic p.value parameter method alternative
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 21.3 0.000278 4 White's Test greater
#Breusch-Pagen test
bptest(m1)
##
## studentized Breusch-Pagan test
##
## data: m1
## BP = 11, df = 2, p-value = 0.005
結果發現p<0.05,所以違反殘差變異數同質性假設。
還有一個Goldfeld-Quandt test,適用在資料可以分成兩組,檢驗兩組的變異數是否相等:
\[F=\frac{\hat{\sigma_{A}}^2/\sigma_{B}^{2}}{\hat{\sigma_{B}}^{2}/\sigma_{B}^{2}}\sim F_{N_{A}-k,N_{B}-k}\]
\[\text{Var}[\hat{\beta_{1}}]=\frac{n}{n-k}(\mathbb{X^{\prime}X})^{-1}\hat{\Sigma}(\mathbb{X^{\prime}X})^{-1}\]
\[\hat{\Sigma}=\sum_{i=1}^{n}\mathbb{x_{i}}{x_{i}^{\prime}}u_{i}^{1}\]
library(haven)
dt <- read_dta("https://grodri.github.io/datasets/effort.dta")
dt$effortg = cut(dt$effort, breaks=c(min(dt$effort),5,15, max(dt$effort)), right=FALSE, include.lowest=TRUE, labels=c("Weak", "Moderate", "Strong"))
#OLS model
OLS1 <- lm(change ~ setting + effortg, data = dt)
library(lmtest)
bptest(OLS1)
##
## studentized Breusch-Pagan test
##
## data: OLS1
## BP = 3.6, df = 3, p-value = 0.3
library(sandwich)
#sandwich
robustmodel <- coeftest(OLS1, vcov = vcovHC(OLS1, type="HC1"))
stargazer::stargazer(OLS1, robustmodel,
title = '出生率的OLS模型與穩健標準誤模型',
type=ifelse(knitr::is_latex_output(),"latex","html"),
label=knitr::opts_current$get("label"))
Dependent variable: | ||
change | ||
OLS | coefficient | |
test | ||
(1) | (2) | |
setting | 0.169 | 0.169*** |
(0.106) | (0.045) | |
effortgModerate | 4.144 | 4.144 |
(3.191) | (3.191) | |
effortgStrong | 19.450*** | 19.450*** |
(3.729) | (2.567) | |
Constant | -5.954 | -5.954** |
(7.166) | (2.708) | |
Observations | 20 | |
R2 | 0.802 | |
Adjusted R2 | 0.764 | |
Residual Std. Error | 5.732 (df = 16) | |
F Statistic | 21.550*** (df = 3; 16) | |
Note: | p<0.1; p<0.05; p<0.01 |
showtext::showtext_auto()
wages <- haven::read_dta("https://grodri.github.io/datasets/wages.dta")
#OLS
m1 <- lm(wages ~ education + workexp + unionmember + south + occupation + female, data = wages)
# visualization
plot(fitted(m1), resid(m1), col='#3322cc',
main = '檢測殘差變異數同質性假設散佈圖')
abline(0,0, col='#bb1100')
Figure 7.3: 檢測殘差變異數同質性假設散佈圖
#OLS model
lm1 <- lm(wages ~ education + workexp + unionmember + south + occupation + female, data = wages)
library(lmtest)
bptest(lm1)
##
## studentized Breusch-Pagan test
##
## data: lm1
## BP = 12, df = 6, p-value = 0.07
library(sandwich)
robustmodel <- coeftest(lm1, vcov = vcovHC(lm1, type="HC1"))
stargazer::stargazer(lm1, robustmodel, type=ifelse(knitr::is_latex_output(),"latex","html"),
label=knitr::opts_current$get("label"), title='薪資的OLS模型與穩健標準誤模型')
Dependent variable: | ||
wages | ||
OLS | coefficient | |
test | ||
(1) | (2) | |
education | 0.904*** | 0.904*** |
(0.081) | (0.094) | |
workexp | 0.104*** | 0.104*** |
(0.017) | (0.019) | |
unionmember | 1.444*** | 1.444*** |
(0.523) | (0.516) | |
south | -0.787* | -0.787* |
(0.427) | (0.414) | |
occupation | -0.062 | -0.062 |
(0.125) | (0.151) | |
female | -2.207*** | -2.207*** |
(0.398) | (0.400) | |
Constant | -3.362** | -3.362* |
(1.466) | (1.750) | |
Observations | 534 | |
R2 | 0.270 | |
Adjusted R2 | 0.261 | |
Residual Std. Error | 4.416 (df = 527) | |
F Statistic | 32.450*** (df = 6; 527) | |
Note: | p<0.1; p<0.05; p<0.01 |
\(\blacksquare\)有關Type I跟Type II error的說明可見Jim Frost (https://statisticsbyjim.com/hypothesis-testing/types-errors-hypothesis-testing/)。
library(kableExtra)
DT<-data.frame(X=c(4, 3, 5, 2, 4, 2, 2, 3, 2, 2, 2, 3, 5, 1, 1),
Y=c(5, 5, 5, 3, 4, 3, 3, 4, 4, 5, 4, 5, 3, 2, 1))
DT %>%
kable(format='simple', caption='習題一資料') %>%
kable_styling(bootstrap_options = 'striped')
## Warning in kable_styling(., bootstrap_options = "striped"): Please specify
## format in kable. kableExtra can customize either HTML or LaTeX outputs. See
## https://haozhu233.github.io/kableExtra/ for details.
X | Y |
---|---|
4 | 5 |
3 | 5 |
5 | 5 |
2 | 3 |
4 | 4 |
2 | 3 |
2 | 3 |
3 | 4 |
2 | 4 |
2 | 5 |
2 | 4 |
3 | 5 |
5 | 3 |
1 | 2 |
1 | 1 |
DT2<-data.frame(X=c(2, 1, 3, 4, 5),
Y=c(3, 4, 5, 4, 4))
DT2 %>%
kable("html") %>%
kable_styling()
X | Y |
---|---|
2 | 3 |
1 | 4 |
3 | 5 |
4 | 4 |
5 | 4 |
\[\rm{y2000} = \beta_{0}+\beta_{1}\rm{y1970}\]
carData::Migration
這筆資料,檢驗migrants
跟distance
之間的迴歸模型,有沒有符合殘差值與預測值無關的假設?
car::spreadLevelPlot
函數確認預測值沒有特殊的分佈。
carData
裡面的Salaries
資料分析,分析大學教師的薪水與服務年資有關嗎?跟性別比起來,哪一個變數與薪水相關比較大?
## 最後更新日期 05/15/2025