knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(randtests)
library(outliers)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(olsrr)
## 
## 载入程辑包:'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
library(InformationValue)

Sitting duration and blood pressure

Duration <- c(2.88,2.51,2.22,2.16,2.05,1.93,1.67,1.49,1.25,1.21,1.15,1.12,0.73)
Pressure <- c(84,86,96,93,94,101,103,102,100,108,104,102,111)

1.1 Check data

Check independence of the dependent variable (Y)

runs.test(Pressure)
## 
##  Runs Test
## 
## data:  Pressure
## statistic = -1.8166, runs = 4, n1 = 6, n2 = 6, n = 12, p-value =
## 0.06928
## alternative hypothesis: nonrandomness

Scatter plot

plot(Duration,Pressure)

We can see that the linear trend is not so obviously.

Check potential outliers:

boxplot(Pressure)

dixon.test(Pressure)
## 
##  Dixon test for outliers
## 
## data:  Pressure
## Q = 0.375, p-value = 0.4957
## alternative hypothesis: lowest value 84 is an outlier

So there are no outliers.

1.2 Write down H0 and H1

  • H0: there is no linear relationship between sitting duration and blood pressure (β1=0)
  • H1;there is a linear relationship between sitting duration and blood pressure (β1≠0)

1.3 Test the slope

Data <- data.frame(Duration,Pressure)
# Build the model
reg <- lm(Pressure~Duration,data = Data)
# Make predictions for individual responses
Pred_band <- predict(reg,interval = "prediction", level = 0.95)
## Warning in predict.lm(reg, interval = "prediction", level = 0.95): predictions on current data refer to _future_ responses
summary(reg)
## 
## Call:
## lm(formula = Pressure ~ Duration, data = Data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2511 -1.4156 -0.6546  3.0441  4.6672 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  118.807      2.591  45.847 6.50e-14 ***
## Duration     -11.645      1.421  -8.197 5.18e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.102 on 11 degrees of freedom
## Multiple R-squared:  0.8593, Adjusted R-squared:  0.8465 
## F-statistic:  67.2 on 1 and 11 DF,  p-value: 5.177e-06
# Update data frame
Data2 <- cbind(Data,Pred_band)
# Plot
ggplot(Data2,aes(Duration, Pressure))+
  geom_point() +
  geom_line(aes(y=lwr),color = "red",linetype = "dashed") +
  geom_line(aes(y=upr),color = "red",linetype = "dashed") +
  geom_smooth(method = lm, se=TRUE)
## `geom_smooth()` using formula 'y ~ x'

1.4 Check residuals

par(mfrow = c(2,2))
plot(reg)

For the residual plots, we can see the red line in “Residuals vs Fitted” are approximate horizontal, which indicate a linear relationship. Scale-location plot shows data are homoscedasticity and NormaL Q-Q plot shows the residuals are normally distributed.

1.5 Report the results

1.1 There is an distinct linear relationship between sitting duration and blood pressure. In my linear regression model, the slope is -11.645, the intercept is 118.807 with R-squared 0.8593.
1.2

  q <- data.frame(Duration = 1.80)
  predict(reg,q,interval = "prediction", level = 0.95)
##        fit      lwr      upr
## 1 97.84662 90.75771 104.9355

When duration = 1.8, professor’s blood will be 97.85 mm Hg with 95% confidence interval [90.758,104.936].


AQI near a highway

# load data
AQI_data <- read.csv(file = "AQI.CSV", header = T)
# Check data
str(AQI_data)
## 'data.frame':    15 obs. of  6 variables:
##  $ 锘緿ay     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ AQI        : int  367 307 280 388 268 347 308 247 289 200 ...
##  $ Pressure   : num  1000 999 999 1000 998 ...
##  $ Temperature: num  20.2 21.1 19.3 17.5 16.9 22.5 13.6 21.2 23.5 18.6 ...
##  $ Humidity   : int  44 56 20 31 60 48 76 62 60 74 ...
##  $ Traffic    : int  357 256 270 541 254 337 411 280 281 88 ...

2.1 Check data

# Plot scatter plots
ggpairs(AQI_data,columns=2:6)

Check independence of the dependent variable (Y)

runs.test(AQI_data$AQI)
## 
##  Runs Test
## 
## data:  AQI_data$AQI
## statistic = -0.55635, runs = 7, n1 = 7, n2 = 7, n = 14, p-value = 0.578
## alternative hypothesis: nonrandomness

Check potential outliers:

boxplot(AQI_data$AQI)

dixon.test(AQI_data$AQI)
## 
##  Dixon test for outliers
## 
## data:  AQI_data$AQI
## Q = 0.36757, p-value = 0.5359
## alternative hypothesis: lowest value 200 is an outlier

2.2 Write down H0 and H1

There is a linear relationship between AQI and other observed factors, H0 is then: β1=β2=…=βk=0

2.3 Find the best model

full_model <- lm(AQI~Pressure+Temperature+Humidity+Traffic, data = AQI_data)
output <- ols_step_all_possible(full_model)
# Print results from all possible subsets
output
# Plot results from all possible subsets
plot(output)
## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.
## It is deprecated to specify `guide = FALSE` to remove a guide. Please use `guide = "none"` instead.

# All best subsets approach
ols_step_best_subset(full_model)

Auto selection

# Forward selection
ols_step_forward_aic(full_model, details=F)
## 
##                            Selection Summary                             
## ------------------------------------------------------------------------
## Variable         AIC       Sum Sq         RSS        R-Sq      Adj. R-Sq 
## ------------------------------------------------------------------------
## Traffic        151.385    28988.211    14221.523    0.67087      0.64555 
## Temperature    143.242    35977.293     7232.440    0.83262      0.80472 
## ------------------------------------------------------------------------
# Stepwise regression
ols_step_backward_aic(full_model, details=F)
## 
## 
##                       Backward Elimination Summary                      
## ----------------------------------------------------------------------
## Variable        AIC        RSS        Sum Sq       R-Sq      Adj. R-Sq 
## ----------------------------------------------------------------------
## Full Model    147.151    7188.419    36021.314    0.83364      0.76709 
## Humidity      145.170    7197.595    36012.138    0.83343      0.78800 
## Pressure      143.242    7232.440    35977.293    0.83262      0.80472 
## ----------------------------------------------------------------------
# Stepwise regression
ols_step_both_aic(full_model, details=F)
## 
## 
##                                    Stepwise Summary                                   
## ------------------------------------------------------------------------------------
## Variable        Method       AIC         RSS        Sum Sq       R-Sq      Adj. R-Sq 
## ------------------------------------------------------------------------------------
## Traffic        addition    151.385    14221.523    28988.211    0.67087      0.64555 
## Temperature    addition    143.242     7232.440    35977.293    0.83262      0.80472 
## ------------------------------------------------------------------------------------

We can see that temperature and traffic are really important factors, We will adopt these two variables as the dependent variables in my model.

Build a best model between AQI and traffic and temperature.

# Make predictions with model
reg <- lm(AQI~Traffic + Temperature, data = AQI_data)
summary(reg)
## 
## Call:
## lm(formula = AQI ~ Traffic + Temperature, data = AQI_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.362  -9.479   4.746  12.441  31.618 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 30.03569   50.33257   0.597  0.56177    
## Traffic      0.49788    0.06445   7.725 5.37e-06 ***
## Temperature  6.31691    1.85501   3.405  0.00522 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24.55 on 12 degrees of freedom
## Multiple R-squared:  0.8326, Adjusted R-squared:  0.8047 
## F-statistic: 29.85 on 2 and 12 DF,  p-value: 2.199e-05
# Make confidence band
predict(reg, interval="confidence", level=0.95)
##         fit      lwr      upr
## 1  335.3821 320.8030 349.9612
## 2  290.7810 274.4043 307.1577
## 3  286.3809 269.5850 303.1768
## 4  409.9373 379.1472 440.7273
## 5  263.2542 239.4515 287.0569
## 6  339.9533 322.5669 357.3398
## 7  320.5763 293.1139 348.0387
## 8  303.3619 288.2942 318.4296
## 9  318.3887 299.7698 337.0076
## 10 191.3441 152.5178 230.1703
## 11 324.8767 302.1715 347.5820
## 12 357.3575 335.6986 379.0164
## 13 326.6194 308.6254 344.6133
## 14 335.2887 305.5389 365.0386
## 15 379.4978 348.2738 410.7219
# Make confidence band
Pred_band <- predict(reg, interval="prediction", level=0.95)
## Warning in predict.lm(reg, interval = "prediction", level = 0.95): predictions on current data refer to _future_ responses

4. Check residual

par(mfrow = c(2,2))
plot(reg)

Make prediction at the specific conditions
Condition1

condition1 <- data.frame(Pressure = 1003.0, Temperature = 25.0, Humidity = 70, Traffic = 450)
predict(reg,condition1,interval = "prediction",level = 0.95)
##        fit      lwr     upr
## 1 412.0066 348.3722 475.641

Condition2

condition2 <- data.frame(Pressure = 1001.0, Temperature = 15.0, Humidity = 50, Traffic = 300)
predict(reg,condition2,interval = "prediction",level = 0.95)
##        fit      lwr      upr
## 1 274.1548 214.5787 333.7308

3.Car and its engine

3.1

# load data
data(mtcars)
mtcars <- force(mtcars)
str(mtcars)
## 'data.frame':    32 obs. of  11 variables:
##  $ mpg : num  21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
##  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
##  $ disp: num  160 160 108 258 360 ...
##  $ hp  : num  110 110 93 110 175 105 245 62 95 123 ...
##  $ drat: num  3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
##  $ wt  : num  2.62 2.88 2.32 3.21 3.44 ...
##  $ qsec: num  16.5 17 18.6 19.4 17 ...
##  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
##  $ am  : num  1 1 1 0 0 0 0 0 0 0 ...
##  $ gear: num  4 4 4 3 3 3 3 4 4 4 ...
##  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
# Fit the regression model
logistic <- glm( am~hp+wt+cyl, data = mtcars, family = binomial)
summary(logistic)
## 
## Call:
## glm(formula = am ~ hp + wt + cyl, family = binomial, data = mtcars)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.17272  -0.14907  -0.01464   0.14116   1.27641  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept) 19.70288    8.11637   2.428   0.0152 *
## hp           0.03259    0.01886   1.728   0.0840 .
## wt          -9.14947    4.15332  -2.203   0.0276 *
## cyl          0.48760    1.07162   0.455   0.6491  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 43.2297  on 31  degrees of freedom
## Residual deviance:  9.8415  on 28  degrees of freedom
## AIC: 17.841
## 
## Number of Fisher Scoring iterations: 8

3.2

Update model The variable wt has a significant slope, which have p value 0.0152.
So the updation os logistic regression model is

logistic <- glm( am~wt, data = mtcars, family = binomial)
summary(logistic)
## 
## Call:
## glm(formula = am ~ wt, family = binomial, data = mtcars)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.11400  -0.53738  -0.08811   0.26055   2.19931  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)   12.040      4.510   2.670  0.00759 **
## wt            -4.024      1.436  -2.801  0.00509 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 43.230  on 31  degrees of freedom
## Residual deviance: 19.176  on 30  degrees of freedom
## AIC: 23.176
## 
## Number of Fisher Scoring iterations: 6

McFadden’s R2

library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(logistic)["McFadden"]
## fitting null model for pseudo-r2
##  McFadden 
## 0.5564145

Setting probability cutoff

# Find optimal cutoff probability to use to maximize accuracy
predicted_value <- predict(logistic, mtcars, type="response")
optimalCutoff(mtcars$am, predicted_value)
## [1] 0.3174064

3.3

Make predictions

car1 <- data.frame(hp = 140,wt = 3000)
predict(logistic,car1,type = "response")
##            1 
## 2.220446e-16
car2 <- data.frame(hp = 90,wt = 2000)
predict(logistic,car2,type = "response")
##            1 
## 2.220446e-16

Signal-plus-noise model

4.1

s = c(rep(0,100), 10*exp(-(1:100)/20)*cos(2*pi*1:100/4))
x1 = s + rnorm(200)
plot.ts(x1)

4.2

s = c(rep(0,100), 10*exp(-(1:100)/200)*cos(2*pi*1:100/4))
x2 = s + rnorm(200)
plot.ts(x2)

4.3

The rough amplitude ratios of the first phase P to the second phase S for earthquakes are smaller than for explosions. So model 1 tend to be the earthquake series and model 2 tend to be the explosion series.

# plot and compare the signal modulators
m1 <- c(exp(-(1:100)/20))
m2 <- c(exp(-(1:100)/200))
par(mfrow = c(2,1))
plot(m1)
plot(m2)

The signal modulator 1 has a tail, whose slope is gradually decreased. And the slope of signal modulator 2 does not show obvious change.


Stationary Time Series

5.1

The series xt is not stationary because it is not independent of time.
### 5.2
\[ \begin{aligned} x_t &= \beta_1 + \beta_2t + w_t \\ x_{t-1} &= \beta_1 + \beta_2(t-1) +w_{t-1}\\ y_t &= x_t - x_{t-1} = -\beta + w_t - w_{t-1} \end{aligned} \] So \(y_t\) is independent of time, which means \(y_t\) is stationary.
### 5.3
The mean of the moving average: \[ \begin{aligned} v_t &= \frac{1}{2q+1} \sum_{j=-q}^q x_{t-j}\\ &= \frac{1}{2q+1}(\beta_1(2q+1) + \beta_2(\sum_{j=-q}^q (t+j)) \\ &= \frac{1}{2q+1} (\beta_1(2q+1)+\beta_2*t(2q+1))\\ &= \beta_1 +\beta_2 t \end{aligned} \] The autocovariance funciton:
\[ \begin{aligned} acf = \frac{1}{N} \sum_{t=k+1}^N(x_t-\mu)(x_{t-k}-\mu) \end{aligned} \]