knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(randtests)
library(outliers)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(olsrr)
##
## 载入程辑包:'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library(InformationValue)
Duration <- c(2.88,2.51,2.22,2.16,2.05,1.93,1.67,1.49,1.25,1.21,1.15,1.12,0.73)
Pressure <- c(84,86,96,93,94,101,103,102,100,108,104,102,111)
Check independence of the dependent variable (Y)
runs.test(Pressure)
##
## Runs Test
##
## data: Pressure
## statistic = -1.8166, runs = 4, n1 = 6, n2 = 6, n = 12, p-value =
## 0.06928
## alternative hypothesis: nonrandomness
Scatter plot
plot(Duration,Pressure)
We can see that the linear trend is not so obviously.
Check potential outliers:
boxplot(Pressure)
dixon.test(Pressure)
##
## Dixon test for outliers
##
## data: Pressure
## Q = 0.375, p-value = 0.4957
## alternative hypothesis: lowest value 84 is an outlier
So there are no outliers.
Data <- data.frame(Duration,Pressure)
# Build the model
reg <- lm(Pressure~Duration,data = Data)
# Make predictions for individual responses
Pred_band <- predict(reg,interval = "prediction", level = 0.95)
## Warning in predict.lm(reg, interval = "prediction", level = 0.95): predictions on current data refer to _future_ responses
summary(reg)
##
## Call:
## lm(formula = Pressure ~ Duration, data = Data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2511 -1.4156 -0.6546 3.0441 4.6672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 118.807 2.591 45.847 6.50e-14 ***
## Duration -11.645 1.421 -8.197 5.18e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.102 on 11 degrees of freedom
## Multiple R-squared: 0.8593, Adjusted R-squared: 0.8465
## F-statistic: 67.2 on 1 and 11 DF, p-value: 5.177e-06
# Update data frame
Data2 <- cbind(Data,Pred_band)
# Plot
ggplot(Data2,aes(Duration, Pressure))+
geom_point() +
geom_line(aes(y=lwr),color = "red",linetype = "dashed") +
geom_line(aes(y=upr),color = "red",linetype = "dashed") +
geom_smooth(method = lm, se=TRUE)
## `geom_smooth()` using formula 'y ~ x'
par(mfrow = c(2,2))
plot(reg)
For the residual plots, we can see the red line in “Residuals vs Fitted”
are approximate horizontal, which indicate a linear relationship.
Scale-location plot shows data are homoscedasticity and NormaL Q-Q plot
shows the residuals are normally distributed.
1.1 There is an distinct linear relationship between sitting duration
and blood pressure. In my linear regression model, the slope is -11.645,
the intercept is 118.807 with R-squared 0.8593.
1.2
q <- data.frame(Duration = 1.80)
predict(reg,q,interval = "prediction", level = 0.95)
## fit lwr upr
## 1 97.84662 90.75771 104.9355
When duration = 1.8, professor’s blood will be 97.85 mm Hg with 95% confidence interval [90.758,104.936].
# load data
AQI_data <- read.csv(file = "AQI.CSV", header = T)
# Check data
str(AQI_data)
## 'data.frame': 15 obs. of 6 variables:
## $ 锘緿ay : int 1 2 3 4 5 6 7 8 9 10 ...
## $ AQI : int 367 307 280 388 268 347 308 247 289 200 ...
## $ Pressure : num 1000 999 999 1000 998 ...
## $ Temperature: num 20.2 21.1 19.3 17.5 16.9 22.5 13.6 21.2 23.5 18.6 ...
## $ Humidity : int 44 56 20 31 60 48 76 62 60 74 ...
## $ Traffic : int 357 256 270 541 254 337 411 280 281 88 ...
# Plot scatter plots
ggpairs(AQI_data,columns=2:6)
Check independence of the dependent variable (Y)
runs.test(AQI_data$AQI)
##
## Runs Test
##
## data: AQI_data$AQI
## statistic = -0.55635, runs = 7, n1 = 7, n2 = 7, n = 14, p-value = 0.578
## alternative hypothesis: nonrandomness
Check potential outliers:
boxplot(AQI_data$AQI)
dixon.test(AQI_data$AQI)
##
## Dixon test for outliers
##
## data: AQI_data$AQI
## Q = 0.36757, p-value = 0.5359
## alternative hypothesis: lowest value 200 is an outlier
There is a linear relationship between AQI and other observed factors, H0 is then: β1=β2=…=βk=0
full_model <- lm(AQI~Pressure+Temperature+Humidity+Traffic, data = AQI_data)
output <- ols_step_all_possible(full_model)
# Print results from all possible subsets
output
# Plot results from all possible subsets
plot(output)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
# All best subsets approach
ols_step_best_subset(full_model)
Auto selection
# Forward selection
ols_step_forward_aic(full_model, details=F)
##
## Selection Summary
## ------------------------------------------------------------------------
## Variable AIC Sum Sq RSS R-Sq Adj. R-Sq
## ------------------------------------------------------------------------
## Traffic 151.385 28988.211 14221.523 0.67087 0.64555
## Temperature 143.242 35977.293 7232.440 0.83262 0.80472
## ------------------------------------------------------------------------
# Stepwise regression
ols_step_backward_aic(full_model, details=F)
##
##
## Backward Elimination Summary
## ----------------------------------------------------------------------
## Variable AIC RSS Sum Sq R-Sq Adj. R-Sq
## ----------------------------------------------------------------------
## Full Model 147.151 7188.419 36021.314 0.83364 0.76709
## Humidity 145.170 7197.595 36012.138 0.83343 0.78800
## Pressure 143.242 7232.440 35977.293 0.83262 0.80472
## ----------------------------------------------------------------------
# Stepwise regression
ols_step_both_aic(full_model, details=F)
##
##
## Stepwise Summary
## ------------------------------------------------------------------------------------
## Variable Method AIC RSS Sum Sq R-Sq Adj. R-Sq
## ------------------------------------------------------------------------------------
## Traffic addition 151.385 14221.523 28988.211 0.67087 0.64555
## Temperature addition 143.242 7232.440 35977.293 0.83262 0.80472
## ------------------------------------------------------------------------------------
We can see that temperature and traffic are really important factors, We will adopt these two variables as the dependent variables in my model.
Build a best model between AQI and traffic and temperature.
# Make predictions with model
reg <- lm(AQI~Traffic + Temperature, data = AQI_data)
summary(reg)
##
## Call:
## lm(formula = AQI ~ Traffic + Temperature, data = AQI_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.362 -9.479 4.746 12.441 31.618
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30.03569 50.33257 0.597 0.56177
## Traffic 0.49788 0.06445 7.725 5.37e-06 ***
## Temperature 6.31691 1.85501 3.405 0.00522 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.55 on 12 degrees of freedom
## Multiple R-squared: 0.8326, Adjusted R-squared: 0.8047
## F-statistic: 29.85 on 2 and 12 DF, p-value: 2.199e-05
# Make confidence band
predict(reg, interval="confidence", level=0.95)
## fit lwr upr
## 1 335.3821 320.8030 349.9612
## 2 290.7810 274.4043 307.1577
## 3 286.3809 269.5850 303.1768
## 4 409.9373 379.1472 440.7273
## 5 263.2542 239.4515 287.0569
## 6 339.9533 322.5669 357.3398
## 7 320.5763 293.1139 348.0387
## 8 303.3619 288.2942 318.4296
## 9 318.3887 299.7698 337.0076
## 10 191.3441 152.5178 230.1703
## 11 324.8767 302.1715 347.5820
## 12 357.3575 335.6986 379.0164
## 13 326.6194 308.6254 344.6133
## 14 335.2887 305.5389 365.0386
## 15 379.4978 348.2738 410.7219
# Make confidence band
Pred_band <- predict(reg, interval="prediction", level=0.95)
## Warning in predict.lm(reg, interval = "prediction", level = 0.95): predictions on current data refer to _future_ responses
par(mfrow = c(2,2))
plot(reg)
Make prediction at the specific conditions
Condition1
condition1 <- data.frame(Pressure = 1003.0, Temperature = 25.0, Humidity = 70, Traffic = 450)
predict(reg,condition1,interval = "prediction",level = 0.95)
## fit lwr upr
## 1 412.0066 348.3722 475.641
Condition2
condition2 <- data.frame(Pressure = 1001.0, Temperature = 15.0, Humidity = 50, Traffic = 300)
predict(reg,condition2,interval = "prediction",level = 0.95)
## fit lwr upr
## 1 274.1548 214.5787 333.7308
# load data
data(mtcars)
mtcars <- force(mtcars)
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
# Fit the regression model
logistic <- glm( am~hp+wt+cyl, data = mtcars, family = binomial)
summary(logistic)
##
## Call:
## glm(formula = am ~ hp + wt + cyl, family = binomial, data = mtcars)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.17272 -0.14907 -0.01464 0.14116 1.27641
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 19.70288 8.11637 2.428 0.0152 *
## hp 0.03259 0.01886 1.728 0.0840 .
## wt -9.14947 4.15332 -2.203 0.0276 *
## cyl 0.48760 1.07162 0.455 0.6491
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 43.2297 on 31 degrees of freedom
## Residual deviance: 9.8415 on 28 degrees of freedom
## AIC: 17.841
##
## Number of Fisher Scoring iterations: 8
Update model The variable wt has a significant slope, which
have p value 0.0152.
So the updation os logistic regression model is
logistic <- glm( am~wt, data = mtcars, family = binomial)
summary(logistic)
##
## Call:
## glm(formula = am ~ wt, family = binomial, data = mtcars)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.11400 -0.53738 -0.08811 0.26055 2.19931
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 12.040 4.510 2.670 0.00759 **
## wt -4.024 1.436 -2.801 0.00509 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 43.230 on 31 degrees of freedom
## Residual deviance: 19.176 on 30 degrees of freedom
## AIC: 23.176
##
## Number of Fisher Scoring iterations: 6
McFadden’s R2
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(logistic)["McFadden"]
## fitting null model for pseudo-r2
## McFadden
## 0.5564145
Setting probability cutoff
# Find optimal cutoff probability to use to maximize accuracy
predicted_value <- predict(logistic, mtcars, type="response")
optimalCutoff(mtcars$am, predicted_value)
## [1] 0.3174064
Make predictions
car1 <- data.frame(hp = 140,wt = 3000)
predict(logistic,car1,type = "response")
## 1
## 2.220446e-16
car2 <- data.frame(hp = 90,wt = 2000)
predict(logistic,car2,type = "response")
## 1
## 2.220446e-16
s = c(rep(0,100), 10*exp(-(1:100)/20)*cos(2*pi*1:100/4))
x1 = s + rnorm(200)
plot.ts(x1)
s = c(rep(0,100), 10*exp(-(1:100)/200)*cos(2*pi*1:100/4))
x2 = s + rnorm(200)
plot.ts(x2)
The rough amplitude ratios of the first phase P to the second phase S for earthquakes are smaller than for explosions. So model 1 tend to be the earthquake series and model 2 tend to be the explosion series.
# plot and compare the signal modulators
m1 <- c(exp(-(1:100)/20))
m2 <- c(exp(-(1:100)/200))
par(mfrow = c(2,1))
plot(m1)
plot(m2)
The signal modulator 1 has a tail, whose slope is gradually decreased.
And the slope of signal modulator 2 does not show obvious change.
The series xt is not stationary because it is not independent of
time.
### 5.2
\[
\begin{aligned}
x_t &= \beta_1 + \beta_2t + w_t \\
x_{t-1} &= \beta_1 + \beta_2(t-1) +w_{t-1}\\
y_t &= x_t - x_{t-1}
= -\beta + w_t - w_{t-1}
\end{aligned}
\] So \(y_t\) is independent of
time, which means \(y_t\) is
stationary.
### 5.3
The mean of the moving average: \[
\begin{aligned}
v_t &= \frac{1}{2q+1} \sum_{j=-q}^q x_{t-j}\\
&= \frac{1}{2q+1}(\beta_1(2q+1) + \beta_2(\sum_{j=-q}^q (t+j)) \\
&= \frac{1}{2q+1} (\beta_1(2q+1)+\beta_2*t(2q+1))\\
&= \beta_1 +\beta_2 t
\end{aligned}
\] The autocovariance funciton:
\[
\begin{aligned}
acf = \frac{1}{N} \sum_{t=k+1}^N(x_t-\mu)(x_{t-k}-\mu)
\end{aligned}
\]