Below is all the packages that we use to analyze
library(readxl)
library(zoo)
library(lmtest)
library(broom)
library(car)
library(carData)
library(whitestrap)
library(tseries)
library(corrplot)
library(ggcorrplot)
library(e1071)
library(tseries)
Read data and assign variables. Converting unit of variables.
data <- read_xlsx("C:/Users/pptha/Downloads/thuktl.xlsx")
REV <- data[[2]] /1000000000000
LBC <- data[[3]] /1000000000000
SC <- data[[4]] /1000000000000
AGE <- data[[5]]
INC <- data[[6]] /100000
COVID <- data[[7]]
df <- data.frame(REV,LBC,SC,AGE,INC,COVID)
Setting up functions to calculate RMSE, MAE, MAPE:
mape <- function(actual, predicted) {
mean(abs((actual - predicted) / actual)) * 100
}
rmse <- function(actual, predicted) {
sqrt(mean((actual - predicted)^2))
}
mae <- function(actual, predicted) {
if(length(actual) != length(predicted)) {
stop("Vectors 'actual' and 'predicted' must have the same length.")
}
mean(abs(actual - predicted))
}
desstat <- function(x){
summaryvec<- c(
mean(x), median(x),max(x),min(x),sd(x),skewness(x),kurtosis(x),jarque.bera.test(REV)$statistic,jarque.bera.test(REV)$p.value
)
return(summaryvec)
}
for(i in 1:6){
print(colnames(df)[i])
print(desstat(df[[i]]))
}
cor_matrix <- cor(df)
p <- ggcorrplot(cor_matrix,
lab = TRUE,
hc.order = TRUE,
type = "full",
colors = c("red", "white", "blue"),
outline.color = "gray")
p
#a: alpha
#b: minimum for R_squared
crawlmodel <- function(a,b,vars,dep){
#There are 5 independent variables and 1 dependent variable => 2^5 - 1 possible models
n <- length(vars)
for (i in 1:(2^n - 1)) {
bits <- as.logical(rev(intToBits(i)[1:n]))
# Convert to binary TRUE/FALSE selector
selected_vars <- vars[bits]
#Select independent variables based on TRUE/FALSE vector
formula_str <- paste(paste(dep),"~", paste(selected_vars, collapse = " + "))
# Create formula: a ~ b + c + d (depending on selected_vars)
model <- lm(as.formula(formula_str), data = df)
# summary the model according to formula
f_stat <- summary(model)$fstatistic
f_p_value <- pf(f_stat["value"], f_stat["numdf"], f_stat["dendf"], lower.tail = FALSE)
#get fstat pvalue
if((resettest(model)$p.value >= a)& (f_p_value <= a) & (summary(model)$r.squared >= b)){
#set conditions for choosing variables
print(formula_str)
print(summary(model))
print(reset(model))
#print the result
}
}
}
varstring <- c( "LBC" ,"SC","AGE", "INC","COVID" )
depvar <- "REV"
crawlmodel(0.05,0.5,varstring,depvar)
[1] "REV ~ COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-10.903 -4.741 0.320 3.937 12.565
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.409 1.331 10.826 9.94e-14 ***
COVID 16.676 1.974 8.448 1.34e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared: 0.6295, Adjusted R-squared: 0.6207
F-statistic: 71.37 on 1 and 42 DF, p-value: 1.338e-10
RESET test
data: model
RESET = 0, df1 = 2, df2 = 40, p-value = 1
When choosing p-value for Ramsay test >= 0.05, only 1 model satisfies. Alternate way is to use backward procedures:
model <- lm( REV ~ LBC + SC + AGE + INC + COVID)
step(model, direction = "backward")
Start: AIC=94.95
REV ~ LBC + SC + AGE + INC + COVID
Df Sum of Sq RSS AIC
- INC 1 0.415 290.28 93.013
- SC 1 9.650 299.52 94.391
<none> 289.87 94.950
- AGE 1 17.094 306.96 95.471
- COVID 1 34.779 324.65 97.936
- LBC 1 107.563 397.43 106.836
Step: AIC=93.01
REV ~ LBC + SC + AGE + COVID
Df Sum of Sq RSS AIC
- SC 1 9.263 299.54 92.395
<none> 290.28 93.013
- COVID 1 35.636 325.92 96.108
- LBC 1 115.653 405.93 105.768
- AGE 1 118.578 408.86 106.084
Step: AIC=92.4
REV ~ LBC + AGE + COVID
Df Sum of Sq RSS AIC
<none> 299.54 92.395
- COVID 1 28.18 327.73 94.351
- LBC 1 233.94 533.48 115.790
- AGE 1 941.67 1241.21 152.945
Call:
lm(formula = REV ~ LBC + AGE + COVID)
Coefficients:
(Intercept) LBC AGE COVID
-27.6397 3.3180 0.7419 -3.2279
=> Result of backward procedure: REV ~ LBC + AGE + COVID
linearmodel <- lm(REV ~ LBC + AGE + COVID)
summary(linearmodel)
Call:
lm(formula = REV ~ LBC + AGE + COVID)
Residuals:
Min 1Q Median 3Q Max
-7.8455 -1.1692 -0.1513 1.6782 6.3617
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -27.63973 3.42949 -8.059 6.57e-10 ***
LBC 3.31798 0.59364 5.589 1.78e-06 ***
AGE 0.74191 0.06616 11.214 6.42e-14 ***
COVID -3.22792 1.66396 -1.940 0.0595 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.737 on 40 degrees of freedom
Multiple R-squared: 0.9378, Adjusted R-squared: 0.9332
F-statistic: 201.2 on 3 and 40 DF, p-value: < 2.2e-16
depvar <- "log(REV)"
varstring <- c("LBC","SC","AGE","INC","COVID")
crawlmodel(0.05,0.5,varstring,depvar)
crawlmodel(0.05,0,varstring,depvar)
[1] "log(REV) ~ COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-1.21109 -0.16666 0.02516 0.30368 0.82952
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.4654 0.1069 23.064 < 2e-16 ***
COVID 0.9648 0.1585 6.085 2.99e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.5237 on 42 degrees of freedom
Multiple R-squared: 0.4686, Adjusted R-squared: 0.4559
F-statistic: 37.03 on 1 and 42 DF, p-value: 2.992e-07
RESET test
data: model
RESET = 0, df1 = 2, df2 = 40, p-value = 1
=> Only 1 model: log(REV) ~ COVID satisfies the conditions, though, R_squared = 0.4686 is not efficient.
#those are varibles that can be log: LBC, SC, INC:
varstring1 <- c( "log(LBC)" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring2 <- c( "LBC" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring3 <- c( "log(LBC)" ,"SC","AGE", "log(INC)","COVID" )
varstring4 <- c( "log(LBC)" ,"log(SC)","AGE", "INC","COVID" )
varstring5 <- c( "LBC" ,"SC","AGE", "log(INC)","COVID" )
varstring6 <- c( "LBC" ,"log(SC)","AGE", "INC","COVID" )
varstring7 <- c( "log(LBC)" ,"SC","AGE", "INC","COVID" )
Each independent variable log options:
depvar <- "REV"
crawlmodel(0.05,0.5,varstring1,depvar)
[1] "REV ~ COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-10.903 -4.741 0.320 3.937 12.565
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.409 1.331 10.826 9.94e-14 ***
COVID 16.676 1.974 8.448 1.34e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared: 0.6295, Adjusted R-squared: 0.6207
F-statistic: 71.37 on 1 and 42 DF, p-value: 1.338e-10
RESET test
data: model
RESET = 0, df1 = 2, df2 = 40, p-value = 1
[1] "REV ~ log(LBC)"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-7.236 -4.528 -2.376 4.146 14.646
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 21.6685 0.9233 23.468 < 2e-16 ***
log(LBC) 9.3428 1.0037 9.309 9.11e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.12 on 42 degrees of freedom
Multiple R-squared: 0.6735, Adjusted R-squared: 0.6658
F-statistic: 86.65 on 1 and 42 DF, p-value: 9.109e-12
RESET test
data: model
RESET = 1.1008, df1 = 2, df2 = 40, p-value = 0.3425
[1] "REV ~ log(LBC) + log(SC) + log(INC) + COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-5.1087 -1.3251 -0.1116 1.3688 6.7143
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -69.3147 25.8577 -2.681 0.01071 *
log(LBC) 1.3545 0.8284 1.635 0.11007
log(SC) 5.4368 1.6110 3.375 0.00168 **
log(INC) 16.4622 5.1727 3.182 0.00286 **
COVID 1.0178 1.2655 0.804 0.42612
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.318 on 39 degrees of freedom
Multiple R-squared: 0.9565, Adjusted R-squared: 0.9521
F-statistic: 214.5 on 4 and 39 DF, p-value: < 2.2e-16
RESET test
data: model
RESET = 3.2247, df1 = 2, df2 = 37, p-value = 0.05117
[1] "REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-5.1321 -1.3081 -0.0483 1.1535 7.3288
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -120.6483 53.8931 -2.239 0.03111 *
log(LBC) 0.9628 0.9019 1.067 0.29250
log(SC) 6.3865 1.8303 3.489 0.00124 **
AGE -0.3420 0.3152 -1.085 0.28480
log(INC) 30.0636 13.5578 2.217 0.03265 *
COVID 2.1681 1.6488 1.315 0.19640
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.313 on 38 degrees of freedom
Multiple R-squared: 0.9578, Adjusted R-squared: 0.9523
F-statistic: 172.6 on 5 and 38 DF, p-value: < 2.2e-16
RESET test
data: model
RESET = 3.1037, df1 = 2, df2 = 36, p-value = 0.05708
=> Model can be chosen are:
[1] “REV ~ log(LBC) + log(SC) + log(INC) + COVID”
[2] “REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID”
For varstring2:
depvar <- "REV"
crawlmodel(0.05,0.5,varstring2,depvar)
[1] "REV ~ COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-10.903 -4.741 0.320 3.937 12.565
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.409 1.331 10.826 9.94e-14 ***
COVID 16.676 1.974 8.448 1.34e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared: 0.6295, Adjusted R-squared: 0.6207
F-statistic: 71.37 on 1 and 42 DF, p-value: 1.338e-10
RESET test
data: model
RESET = 0, df1 = 2, df2 = 40, p-value = 1
[1] "REV ~ LBC + log(SC) + log(INC)"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-5.1546 -1.2258 -0.4381 1.4164 6.9168
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -81.2107 22.1796 -3.662 0.000726 ***
LBC 1.2520 0.6699 1.869 0.068958 .
log(SC) 5.5849 1.4049 3.975 0.000287 ***
log(INC) 18.4620 4.3354 4.258 0.000121 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.292 on 40 degrees of freedom
Multiple R-squared: 0.9564, Adjusted R-squared: 0.9531
F-statistic: 292.5 on 3 and 40 DF, p-value: < 2.2e-16
RESET test
data: model
RESET = 3.2297, df1 = 2, df2 = 38, p-value = 0.05065
[1] "REV ~ LBC + log(SC) + log(INC) + COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-5.1389 -1.2532 -0.3672 1.4695 6.7797
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -76.4451 28.0150 -2.729 0.009487 **
LBC 1.1799 0.7235 1.631 0.110981
log(SC) 5.7110 1.4890 3.835 0.000446 ***
log(INC) 17.5219 5.4940 3.189 0.002812 **
COVID 0.3831 1.3484 0.284 0.777819
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.319 on 39 degrees of freedom
Multiple R-squared: 0.9565, Adjusted R-squared: 0.952
F-statistic: 214.4 on 4 and 39 DF, p-value: < 2.2e-16
RESET test
data: model
RESET = 3.088, df1 = 2, df2 = 37, p-value = 0.05751
[1] "REV ~ LBC + log(SC) + AGE + log(INC) + COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-5.1447 -1.3675 -0.0311 1.2240 7.3622
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -124.4926 53.4899 -2.327 0.025370 *
LBC 0.8220 0.7984 1.030 0.309721
log(SC) 6.6004 1.7097 3.860 0.000426 ***
AGE -0.3367 0.3195 -1.054 0.298594
log(INC) 30.5236 13.5018 2.261 0.029588 *
COVID 1.7109 1.8440 0.928 0.359368
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.315 on 38 degrees of freedom
Multiple R-squared: 0.9577, Adjusted R-squared: 0.9522
F-statistic: 172.2 on 5 and 38 DF, p-value: < 2.2e-16
RESET test
data: model
RESET = 3.1136, df1 = 2, df2 = 36, p-value = 0.05659
=> Model can be chosen are:
[3]“REV ~ LBC + log(SC) + log(INC) + COVID”
[4]“REV ~ LBC + log(SC) + AGE + log(INC) + COVID”
For varstring3:
depvar <- "REV"
crawlmodel(0.05,0.5,varstring3,depvar)
[1] "REV ~ COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-10.903 -4.741 0.320 3.937 12.565
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.409 1.331 10.826 9.94e-14 ***
COVID 16.676 1.974 8.448 1.34e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared: 0.6295, Adjusted R-squared: 0.6207
F-statistic: 71.37 on 1 and 42 DF, p-value: 1.338e-10
RESET test
data: model
RESET = 0, df1 = 2, df2 = 40, p-value = 1
[1] "REV ~ log(LBC)"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-7.236 -4.528 -2.376 4.146 14.646
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 21.6685 0.9233 23.468 < 2e-16 ***
log(LBC) 9.3428 1.0037 9.309 9.11e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.12 on 42 degrees of freedom
Multiple R-squared: 0.6735, Adjusted R-squared: 0.6658
F-statistic: 86.65 on 1 and 42 DF, p-value: 9.109e-12
RESET test
data: model
RESET = 1.1008, df1 = 2, df2 = 40, p-value = 0.3425
=> No model has more than 1 independent variable that satisfies the conditions
For varstring4:
depvar <- "REV"
crawlmodel(0.05,0.5,varstring4,depvar)
[1] "REV ~ COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-10.903 -4.741 0.320 3.937 12.565
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.409 1.331 10.826 9.94e-14 ***
COVID 16.676 1.974 8.448 1.34e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared: 0.6295, Adjusted R-squared: 0.6207
F-statistic: 71.37 on 1 and 42 DF, p-value: 1.338e-10
RESET test
data: model
RESET = 0, df1 = 2, df2 = 40, p-value = 1
[1] "REV ~ log(LBC)"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-7.236 -4.528 -2.376 4.146 14.646
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 21.6685 0.9233 23.468 < 2e-16 ***
log(LBC) 9.3428 1.0037 9.309 9.11e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.12 on 42 degrees of freedom
Multiple R-squared: 0.6735, Adjusted R-squared: 0.6658
F-statistic: 86.65 on 1 and 42 DF, p-value: 9.109e-12
RESET test
data: model
RESET = 1.1008, df1 = 2, df2 = 40, p-value = 0.3425
=> No model has more than 1 independent variable that satisfies the conditions
For varstring5:
depvar <- "REV"
crawlmodel(0.05,0.5,varstring5,depvar)
[1] "REV ~ COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-10.903 -4.741 0.320 3.937 12.565
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.409 1.331 10.826 9.94e-14 ***
COVID 16.676 1.974 8.448 1.34e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared: 0.6295, Adjusted R-squared: 0.6207
F-statistic: 71.37 on 1 and 42 DF, p-value: 1.338e-10
RESET test
data: model
RESET = 0, df1 = 2, df2 = 40, p-value = 1
=> No model has more than 1 independent variable that satisfies the conditions
For varstring6:
depvar <- "REV"
crawlmodel(0.05,0.5,varstring6,depvar)
[1] "REV ~ COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-10.903 -4.741 0.320 3.937 12.565
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.409 1.331 10.826 9.94e-14 ***
COVID 16.676 1.974 8.448 1.34e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared: 0.6295, Adjusted R-squared: 0.6207
F-statistic: 71.37 on 1 and 42 DF, p-value: 1.338e-10
RESET test
data: model
RESET = 0, df1 = 2, df2 = 40, p-value = 1
=> No model has more than 1 independent variable that satisfies the conditions
For varstring7:
depvar <- "REV"
crawlmodel(0.05,0.5,varstring7,depvar)
[1] "REV ~ COVID"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-10.903 -4.741 0.320 3.937 12.565
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 14.409 1.331 10.826 9.94e-14 ***
COVID 16.676 1.974 8.448 1.34e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared: 0.6295, Adjusted R-squared: 0.6207
F-statistic: 71.37 on 1 and 42 DF, p-value: 1.338e-10
RESET test
data: model
RESET = 0, df1 = 2, df2 = 40, p-value = 1
[1] "REV ~ log(LBC)"
Call:
lm(formula = as.formula(formula_str), data = df)
Residuals:
Min 1Q Median 3Q Max
-7.236 -4.528 -2.376 4.146 14.646
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 21.6685 0.9233 23.468 < 2e-16 ***
log(LBC) 9.3428 1.0037 9.309 9.11e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6.12 on 42 degrees of freedom
Multiple R-squared: 0.6735, Adjusted R-squared: 0.6658
F-statistic: 86.65 on 1 and 42 DF, p-value: 9.109e-12
RESET test
data: model
RESET = 1.1008, df1 = 2, df2 = 40, p-value = 0.3425
=> No model has more than 1 independent variable that satisfies the conditions
There are 4 semi-log models that satisfy the conditions:
Overall significant: p-val of F-test <= 0.05
Bypass Ramsay test: p-val >= 0.05
R_squared >= 0.5
(Check out those in crawlmodel function)
These functions are:
[1] “REV ~ log(LBC) + log(SC) + log(INC) + COVID”
[2] “REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID”
[3]“REV ~ LBC + log(SC) + log(INC) + COVID”
[4]“REV ~ LBC + log(SC) + AGE + log(INC) + COVID”
lm1 <- lm(REV ~ log(LBC) + log(SC) + log(INC) + COVID)
summary(lm1)
Call:
lm(formula = REV ~ log(LBC) + log(SC) + log(INC) + COVID)
Residuals:
Min 1Q Median 3Q Max
-5.1087 -1.3251 -0.1116 1.3688 6.7143
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -69.3147 25.8577 -2.681 0.01071 *
log(LBC) 1.3545 0.8284 1.635 0.11007
log(SC) 5.4368 1.6110 3.375 0.00168 **
log(INC) 16.4622 5.1727 3.182 0.00286 **
COVID 1.0178 1.2655 0.804 0.42612
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.318 on 39 degrees of freedom
Multiple R-squared: 0.9565, Adjusted R-squared: 0.9521
F-statistic: 214.5 on 4 and 39 DF, p-value: < 2.2e-16
lm2 <- lm(REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID)
summary(lm2)
Call:
lm(formula = REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID)
Residuals:
Min 1Q Median 3Q Max
-5.1321 -1.3081 -0.0483 1.1535 7.3288
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -120.6483 53.8931 -2.239 0.03111 *
log(LBC) 0.9628 0.9019 1.067 0.29250
log(SC) 6.3865 1.8303 3.489 0.00124 **
AGE -0.3420 0.3152 -1.085 0.28480
log(INC) 30.0636 13.5578 2.217 0.03265 *
COVID 2.1681 1.6488 1.315 0.19640
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.313 on 38 degrees of freedom
Multiple R-squared: 0.9578, Adjusted R-squared: 0.9523
F-statistic: 172.6 on 5 and 38 DF, p-value: < 2.2e-16
lm3 <- lm(REV ~ LBC + log(SC) + log(INC) + COVID)
summary(lm3)
Call:
lm(formula = REV ~ LBC + log(SC) + log(INC) + COVID)
Residuals:
Min 1Q Median 3Q Max
-5.1389 -1.2532 -0.3672 1.4695 6.7797
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -76.4451 28.0150 -2.729 0.009487 **
LBC 1.1799 0.7235 1.631 0.110981
log(SC) 5.7110 1.4890 3.835 0.000446 ***
log(INC) 17.5219 5.4940 3.189 0.002812 **
COVID 0.3831 1.3484 0.284 0.777819
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.319 on 39 degrees of freedom
Multiple R-squared: 0.9565, Adjusted R-squared: 0.952
F-statistic: 214.4 on 4 and 39 DF, p-value: < 2.2e-16
lm4 <- lm(REV ~ LBC + log(SC) + AGE + log(INC) + COVID)
summary(lm4)
Call:
lm(formula = REV ~ LBC + log(SC) + AGE + log(INC) + COVID)
Residuals:
Min 1Q Median 3Q Max
-5.1447 -1.3675 -0.0311 1.2240 7.3622
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -124.4926 53.4899 -2.327 0.025370 *
LBC 0.8220 0.7984 1.030 0.309721
log(SC) 6.6004 1.7097 3.860 0.000426 ***
AGE -0.3367 0.3195 -1.054 0.298594
log(INC) 30.5236 13.5018 2.261 0.029588 *
COVID 1.7109 1.8440 0.928 0.359368
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.315 on 38 degrees of freedom
Multiple R-squared: 0.9577, Adjusted R-squared: 0.9522
F-statistic: 172.2 on 5 and 38 DF, p-value: < 2.2e-16
=> Choose model 1 to analyze as semi-log model because:
Has 2/4 significant variables
Has higher adjusted R-squared than model 3
semilogmodel <- lm(REV ~ log(LBC) + log(SC) + log(INC) + COVID)
summary(semilogmodel)
Call:
lm(formula = REV ~ log(LBC) + log(SC) + log(INC) + COVID)
Residuals:
Min 1Q Median 3Q Max
-5.1087 -1.3251 -0.1116 1.3688 6.7143
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -69.3147 25.8577 -2.681 0.01071 *
log(LBC) 1.3545 0.8284 1.635 0.11007
log(SC) 5.4368 1.6110 3.375 0.00168 **
log(INC) 16.4622 5.1727 3.182 0.00286 **
COVID 1.0178 1.2655 0.804 0.42612
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.318 on 39 degrees of freedom
Multiple R-squared: 0.9565, Adjusted R-squared: 0.9521
F-statistic: 214.5 on 4 and 39 DF, p-value: < 2.2e-16
depvar <- "log(REV)"
varstring1 <- c( "log(LBC)" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring2 <- c( "LBC" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring3 <- c( "log(LBC)" ,"SC","AGE", "log(INC)","COVID" )
varstring4 <- c( "log(LBC)" ,"log(SC)","AGE", "INC","COVID" )
varstring5 <- c( "LBC" ,"SC","AGE", "log(INC)","COVID" )
varstring6 <- c( "LBC" ,"log(SC)","AGE", "INC","COVID" )
varstring7 <- c( "log(LBC)" ,"SC","AGE", "INC","COVID" )
crawlmodel(0.05,0.5,varstring1,depvar)
crawlmodel(0.05,0.5,varstring2,depvar)
crawlmodel(0.05,0.5,varstring3,depvar)
crawlmodel(0.05,0.5,varstring4,depvar)
crawlmodel(0.05,0.5,varstring5,depvar)
crawlmodel(0.05,0.5,varstring6,depvar)
crawlmodel(0.05,0.5,varstring7,depvar)
=> No model with log(REV) satisfies the conditions.
Use backward methods for varstring1:
model <- lm( log(REV) ~ log(LBC) + log(SC)+ AGE+ log(INC) + COVID)
step(model, direction = "backward")
Start: AIC=-209.8
log(REV) ~ log(LBC) + log(SC) + AGE + log(INC) + COVID
Df Sum of Sq RSS AIC
- COVID 1 0.00873 0.29333 -210.47
<none> 0.28460 -209.80
- log(LBC) 1 0.01747 0.30207 -209.18
- log(INC) 1 0.06537 0.34997 -202.70
- AGE 1 0.08430 0.36890 -200.38
- log(SC) 1 1.50187 1.78647 -130.97
Step: AIC=-210.47
log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)
Df Sum of Sq RSS AIC
<none> 0.29333 -210.47
- log(LBC) 1 0.02838 0.32172 -208.40
- log(INC) 1 0.11102 0.40435 -198.34
- AGE 1 0.20941 0.50274 -188.76
- log(SC) 1 1.84466 2.13800 -125.07
Call:
lm(formula = log(REV) ~ log(LBC) + log(SC) + AGE + log(INC))
Coefficients:
(Intercept) log(LBC) log(SC) AGE log(INC)
-4.06729 -0.06260 0.99736 -0.04776 1.74483
Result model:
[1] log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)
Use backward methods for varstring2:
model <- lm( log(REV) ~ LBC+ log(SC)+ AGE + log(INC) + COVID)
step(model, direction = "backward")
Start: AIC=-210.62
log(REV) ~ LBC + log(SC) + AGE + log(INC) + COVID
Df Sum of Sq RSS AIC
- COVID 1 0.00167 0.28103 -212.35
<none> 0.27936 -210.62
- LBC 1 0.02271 0.30207 -209.18
- log(INC) 1 0.06616 0.34552 -203.26
- AGE 1 0.08916 0.36852 -200.43
- log(SC) 1 1.73079 2.01015 -125.78
Step: AIC=-212.35
log(REV) ~ LBC + log(SC) + AGE + log(INC)
Df Sum of Sq RSS AIC
<none> 0.28103 -212.35
- LBC 1 0.04069 0.32172 -208.40
- log(INC) 1 0.09464 0.37567 -201.58
- AGE 1 0.19993 0.48095 -190.71
- log(SC) 1 2.47354 2.75457 -113.92
Call:
lm(formula = log(REV) ~ LBC + log(SC) + AGE + log(INC))
Coefficients:
(Intercept) LBC log(SC) AGE log(INC)
-3.41721 -0.05932 0.98741 -0.04506 1.60610
Result model:
model <- lm( log(REV) ~ LBC+ log(SC)+ AGE + log(INC) + COVID)
step(model, direction = "backward")
Start: AIC=-210.62
log(REV) ~ LBC + log(SC) + AGE + log(INC) + COVID
Df Sum of Sq RSS AIC
- COVID 1 0.00167 0.28103 -212.35
<none> 0.27936 -210.62
- LBC 1 0.02271 0.30207 -209.18
- log(INC) 1 0.06616 0.34552 -203.26
- AGE 1 0.08916 0.36852 -200.43
- log(SC) 1 1.73079 2.01015 -125.78
Step: AIC=-212.35
log(REV) ~ LBC + log(SC) + AGE + log(INC)
Df Sum of Sq RSS AIC
<none> 0.28103 -212.35
- LBC 1 0.04069 0.32172 -208.40
- log(INC) 1 0.09464 0.37567 -201.58
- AGE 1 0.19993 0.48095 -190.71
- log(SC) 1 2.47354 2.75457 -113.92
Call:
lm(formula = log(REV) ~ LBC + log(SC) + AGE + log(INC))
Coefficients:
(Intercept) LBC log(SC) AGE log(INC)
-3.41721 -0.05932 0.98741 -0.04506 1.60610
[2] log(REV) ~ LBC + log(SC) + AGE + log(INC)
Use backward methods for varstring3:
model <- lm( log(REV) ~ log(LBC) + SC + AGE + log(INC) + COVID)
step(model, direction = "backward")
Start: AIC=-129.01
log(REV) ~ log(LBC) + SC + AGE + log(INC) + COVID
Df Sum of Sq RSS AIC
- SC 1 0.00131 1.7865 -130.97
- log(INC) 1 0.00158 1.7867 -130.97
<none> 1.7852 -129.01
- AGE 1 0.09517 1.8803 -128.72
- COVID 1 0.30312 2.0883 -124.11
- log(LBC) 1 1.46901 3.2542 -104.59
Step: AIC=-130.97
log(REV) ~ log(LBC) + AGE + log(INC) + COVID
Df Sum of Sq RSS AIC
- log(INC) 1 0.00239 1.7889 -132.915
<none> 1.7865 -130.974
- AGE 1 0.11348 1.8999 -130.264
- COVID 1 0.35153 2.1380 -125.070
- log(LBC) 1 2.08731 3.8738 -98.918
Step: AIC=-132.91
log(REV) ~ log(LBC) + AGE + COVID
Df Sum of Sq RSS AIC
<none> 1.7889 -132.915
- COVID 1 0.45813 2.2470 -124.882
- log(LBC) 1 2.14712 3.9360 -100.217
- AGE 1 2.97297 4.7618 -91.837
Call:
lm(formula = log(REV) ~ log(LBC) + AGE + COVID)
Coefficients:
(Intercept) log(LBC) AGE COVID
0.20080 0.33513 0.04602 -0.40586
Result model:
[3] log(REV) ~ log(LBC) + AGE + COVID
Use backward methods for varstring4:
model <- lm( log(REV) ~ log(LBC) + log(SC) + AGE + INC + COVID)
step(model, direction = "backward")
Start: AIC=-209.18
log(REV) ~ log(LBC) + log(SC) + AGE + INC + COVID
Df Sum of Sq RSS AIC
- COVID 1 0.00681 0.29546 -210.15
- log(LBC) 1 0.01025 0.29891 -209.64
<none> 0.28866 -209.18
- INC 1 0.06131 0.34997 -202.70
- AGE 1 0.07862 0.36728 -200.58
- log(SC) 1 1.30296 1.59161 -136.06
Step: AIC=-210.15
log(REV) ~ log(LBC) + log(SC) + AGE + INC
Df Sum of Sq RSS AIC
<none> 0.29546 -210.15
- log(LBC) 1 0.01586 0.31133 -209.85
- INC 1 0.10889 0.40435 -198.34
- AGE 1 0.19491 0.49038 -189.86
- log(SC) 1 1.89975 2.19521 -123.91
Call:
lm(formula = log(REV) ~ log(LBC) + log(SC) + AGE + INC)
Coefficients:
(Intercept) log(LBC) log(SC) AGE INC
3.704860 -0.046619 1.099855 -0.052812 0.008271
Result model:
[4] log(REV) ~ log(LBC) + log(SC) + AGE + INC
Use backward methods for varstring5:
model <- lm( log(REV) ~ LBC+ SC+AGE + log(INC) + COVID)
step(model, direction = "backward")
Start: AIC=-123.81
log(REV) ~ LBC + SC + AGE + log(INC) + COVID
Df Sum of Sq RSS AIC
- SC 1 0.00138 2.0102 -125.78
- log(INC) 1 0.00353 2.0123 -125.74
<none> 2.0088 -123.81
- AGE 1 0.12894 2.1377 -123.08
- COVID 1 0.70933 2.7181 -112.51
- LBC 1 1.24539 3.2542 -104.59
Step: AIC=-125.78
log(REV) ~ LBC + AGE + log(INC) + COVID
Df Sum of Sq RSS AIC
- log(INC) 1 0.00485 2.0150 -127.677
<none> 2.0102 -125.783
- AGE 1 0.16271 2.1729 -124.358
- COVID 1 0.74442 2.7546 -113.921
- LBC 1 1.86363 3.8738 -98.918
Step: AIC=-127.68
log(REV) ~ LBC + AGE + COVID
Df Sum of Sq RSS AIC
<none> 2.0150 -127.677
- COVID 1 0.9899 3.0049 -112.093
- LBC 1 1.9210 3.9360 -100.217
- AGE 1 5.4807 7.4957 -71.874
Call:
lm(formula = log(REV) ~ LBC + AGE + COVID)
Coefficients:
(Intercept) LBC AGE COVID
-0.7869 0.3007 0.0566 -0.6050
=> The result is not a log log model
Use backward methods for varstring6:
model <- lm( log(REV) ~ LBC + log(SC) + AGE + INC + COVID )
step(model, direction = "backward")
Start: AIC=-209.56
log(REV) ~ LBC + log(SC) + AGE + INC + COVID
Df Sum of Sq RSS AIC
- COVID 1 0.00216 0.28827 -211.23
- LBC 1 0.01280 0.29891 -209.64
<none> 0.28611 -209.56
- INC 1 0.05940 0.34552 -203.26
- AGE 1 0.08032 0.36644 -200.68
- log(SC) 1 1.48655 1.77267 -131.31
Step: AIC=-211.23
log(REV) ~ LBC + log(SC) + AGE + INC
Df Sum of Sq RSS AIC
<none> 0.28827 -211.23
- LBC 1 0.02305 0.31133 -209.85
- INC 1 0.08739 0.37567 -201.58
- AGE 1 0.17423 0.46250 -192.43
- log(SC) 1 2.61001 2.89828 -111.68
Call:
lm(formula = log(REV) ~ LBC + log(SC) + AGE + INC)
Coefficients:
(Intercept) LBC log(SC) AGE INC
3.715776 -0.045591 1.084177 -0.049574 0.007571
Result model:
[6] log(REV) ~ LBC + log(SC) + AGE + INC
Use backward methods for varstring7:
model <- lm( log(REV) ~ log(LBC) + SC + AGE + INC + COVID )
step(model, direction = "backward")
Start: AIC=-134.91
log(REV) ~ log(LBC) + SC + AGE + INC + COVID
Df Sum of Sq RSS AIC
- SC 1 0.03064 1.5916 -136.06
<none> 1.5610 -134.91
- INC 1 0.22576 1.7867 -130.97
- COVID 1 0.44208 2.0031 -125.94
- AGE 1 0.65774 2.2187 -121.44
- log(LBC) 1 1.03711 2.5981 -114.49
Step: AIC=-136.06
log(REV) ~ log(LBC) + AGE + INC + COVID
Df Sum of Sq RSS AIC
<none> 1.5916 -136.06
- INC 1 0.19724 1.7889 -132.91
- COVID 1 0.60360 2.1952 -123.91
- AGE 1 0.75620 2.3478 -120.95
- log(LBC) 1 1.06801 2.6596 -115.46
Call:
lm(formula = log(REV) ~ log(LBC) + AGE + INC + COVID)
Coefficients:
(Intercept) log(LBC) AGE INC COVID
-0.46016 0.27485 0.09091 -0.01071 -0.48700
Result model:
[7] log(REV) ~ log(LBC) + AGE + INC + COVID
There are 6 result of log-log models, using backward procedure:
[1] log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)
[2] log(REV) ~ LBC + log(SC) + AGE + log(INC)
[3] log(REV) ~ log(LBC) + AGE + COVID
[4] log(REV) ~ log(LBC) + log(SC) + AGE + INC
[5] log(REV) ~ LBC + log(SC) + AGE + INC
[6] log(REV) ~ log(LBC) + AGE + INC + COVID
formulas <- c("log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)",
"log(REV) ~ LBC + log(SC) + AGE + log(INC)",
"log(REV) ~ log(LBC) + AGE + COVID",
"log(REV) ~ log(LBC) + log(SC) + AGE + INC",
"log(REV) ~ LBC + log(SC) + AGE + INC",
"log(REV) ~ log(LBC) + AGE + INC + COVID")
for(i in 1:6){
model <- lm(formula = formulas[i])
print(formulas[i])
print(summary(model))
}
[1] "log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)"
Call:
lm(formula = formulas[i])
Residuals:
Min 1Q Median 3Q Max
-0.263418 -0.043335 0.003576 0.044674 0.213069
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.067292 1.867164 -2.178 0.035491 *
log(LBC) -0.062599 0.032225 -1.943 0.059312 .
log(SC) 0.997361 0.063686 15.661 < 2e-16 ***
AGE -0.047760 0.009051 -5.277 5.22e-06 ***
log(INC) 1.744827 0.454160 3.842 0.000438 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.08673 on 39 degrees of freedom
Multiple R-squared: 0.9865, Adjusted R-squared: 0.9851
F-statistic: 710.6 on 4 and 39 DF, p-value: < 2.2e-16
[1] "log(REV) ~ LBC + log(SC) + AGE + log(INC)"
Call:
lm(formula = formulas[i])
Residuals:
Min 1Q Median 3Q Max
-0.256953 -0.045770 -0.001234 0.041823 0.212744
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.417209 1.834437 -1.863 0.070034 .
LBC -0.059322 0.024964 -2.376 0.022494 *
log(SC) 0.987414 0.053294 18.528 < 2e-16 ***
AGE -0.045058 0.008554 -5.267 5.37e-06 ***
log(INC) 1.606099 0.443176 3.624 0.000827 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.08489 on 39 degrees of freedom
Multiple R-squared: 0.987, Adjusted R-squared: 0.9857
F-statistic: 742.2 on 4 and 39 DF, p-value: < 2.2e-16
[1] "log(REV) ~ log(LBC) + AGE + COVID"
Call:
lm(formula = formulas[i])
Residuals:
Min 1Q Median 3Q Max
-0.40191 -0.13725 0.01743 0.08875 0.42544
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.200801 0.310325 0.647 0.52128
log(LBC) 0.335126 0.048366 6.929 2.36e-08 ***
AGE 0.046018 0.005644 8.153 4.90e-10 ***
COVID -0.405861 0.126806 -3.201 0.00269 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2115 on 40 degrees of freedom
Multiple R-squared: 0.9175, Adjusted R-squared: 0.9113
F-statistic: 148.2 on 3 and 40 DF, p-value: < 2.2e-16
[1] "log(REV) ~ log(LBC) + log(SC) + AGE + INC"
Call:
lm(formula = formulas[i])
Residuals:
Min 1Q Median 3Q Max
-0.279817 -0.053597 -0.001524 0.046229 0.222259
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.704860 0.234488 15.800 < 2e-16 ***
log(LBC) -0.046619 0.032218 -1.447 0.155895
log(SC) 1.099855 0.069456 15.835 < 2e-16 ***
AGE -0.052812 0.010412 -5.072 9.97e-06 ***
INC 0.008271 0.002182 3.791 0.000509 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.08704 on 39 degrees of freedom
Multiple R-squared: 0.9864, Adjusted R-squared: 0.985
F-statistic: 705.4 on 4 and 39 DF, p-value: < 2.2e-16
[1] "log(REV) ~ LBC + log(SC) + AGE + INC"
Call:
lm(formula = formulas[i])
Residuals:
Min 1Q Median 3Q Max
-0.274722 -0.043456 -0.003529 0.044853 0.218597
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.715776 0.230300 16.134 < 2e-16 ***
LBC -0.045591 0.025817 -1.766 0.08523 .
log(SC) 1.084177 0.057697 18.791 < 2e-16 ***
AGE -0.049574 0.010211 -4.855 1.98e-05 ***
INC 0.007571 0.002202 3.438 0.00141 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.08597 on 39 degrees of freedom
Multiple R-squared: 0.9867, Adjusted R-squared: 0.9853
F-statistic: 723.2 on 4 and 39 DF, p-value: < 2.2e-16
[1] "log(REV) ~ log(LBC) + AGE + INC + COVID"
Call:
lm(formula = formulas[i])
Residuals:
Min 1Q Median 3Q Max
-0.40894 -0.15619 0.00504 0.12893 0.38995
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.460162 0.422223 -1.090 0.282465
log(LBC) 0.274845 0.053726 5.116 8.69e-06 ***
AGE 0.090907 0.021119 4.305 0.000109 ***
INC -0.010709 0.004871 -2.198 0.033918 *
COVID -0.487005 0.126633 -3.846 0.000433 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.202 on 39 degrees of freedom
Multiple R-squared: 0.9266, Adjusted R-squared: 0.919
F-statistic: 123 on 4 and 39 DF, p-value: < 2.2e-16
=> Model 1 is the most efficient, with exactly one variable log(LBC) has p-val of t-test = 0.059312 > 0.05. This is acceptable if takes alpha = 6%.
loglogmodel <- lm(log(REV) ~ log(LBC) + log(SC) + AGE + log(INC))
summary(loglogmodel)
Call:
lm(formula = log(REV) ~ log(LBC) + log(SC) + AGE + log(INC))
Residuals:
Min 1Q Median 3Q Max
-0.263418 -0.043335 0.003576 0.044674 0.213069
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.067292 1.867164 -2.178 0.035491 *
log(LBC) -0.062599 0.032225 -1.943 0.059312 .
log(SC) 0.997361 0.063686 15.661 < 2e-16 ***
AGE -0.047760 0.009051 -5.277 5.22e-06 ***
log(INC) 1.744827 0.454160 3.842 0.000438 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.08673 on 39 degrees of freedom
Multiple R-squared: 0.9865, Adjusted R-squared: 0.9851
F-statistic: 710.6 on 4 and 39 DF, p-value: < 2.2e-16
Through testing through formulas, we found the most efficient quadratic model:
quadraticmodel <- lm(REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2))
summary(quadraticmodel )
Call:
lm(formula = REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2))
Residuals:
Min 1Q Median 3Q Max
-5.143 -1.182 -0.321 1.298 7.221
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7.859e+01 1.449e+01 -5.422 3.53e-06 ***
LBC 5.620e-01 8.492e-01 0.662 0.5121
SC 2.357e+00 1.101e+00 2.141 0.0388 *
AGE 2.717e+00 4.837e-01 5.616 1.91e-06 ***
I(INC^2) 3.511e-04 1.460e-04 2.405 0.0212 *
I(AGE^2) -2.249e-02 4.816e-03 -4.671 3.69e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.316 on 38 degrees of freedom
Multiple R-squared: 0.9577, Adjusted R-squared: 0.9521
F-statistic: 172.1 on 5 and 38 DF, p-value: < 2.2e-16
We use this code block to fill in the data of Table 04. Estimated Result of our report
modelana <- function(x){
m <- lm(formula = x)
print(x)
print(summary(m))
print(resettest(m,power = 2:3, type = "fitted"))
print(white_test(m))
print(jarque.bera.test(m$residuals))
print(bgtest(m))
rmse(REV, m$fitted.values)
mae(REV, m$fitted.values)
mape(REV,m$fitted.values)
}
formulas <- c("REV ~ LBC + AGE + COVID",
"REV ~ log(LBC) + log(SC) + log(INC) + COVID",
"log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)",
"REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2)")
for(i in formulas){
modelana(i)
}
[1] "REV ~ LBC + AGE + COVID"
Call:
lm(formula = x)
Residuals:
Min 1Q Median 3Q Max
-7.8455 -1.1692 -0.1513 1.6782 6.3617
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -27.63973 3.42949 -8.059 6.57e-10 ***
LBC 3.31798 0.59364 5.589 1.78e-06 ***
AGE 0.74191 0.06616 11.214 6.42e-14 ***
COVID -3.22792 1.66396 -1.940 0.0595 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.737 on 40 degrees of freedom
Multiple R-squared: 0.9378, Adjusted R-squared: 0.9332
F-statistic: 201.2 on 3 and 40 DF, p-value: < 2.2e-16
RESET test
data: m
RESET = 8.3613, df1 = 2, df2 = 38, p-value = 0.0009788
White's test results
Null hypothesis: Homoskedasticity of the residuals
Alternative hypothesis: Heteroskedasticity of the residuals
Test Statistic: 12.39
P-value: 0.002038
Jarque Bera Test
data: m$residuals
X-squared = 4.9723, df = 2, p-value = 0.08323
Breusch-Godfrey test for serial correlation of order up to 1
data: m
LM test = 1.699, df = 1, p-value = 0.1924
[1] "REV ~ log(LBC) + log(SC) + log(INC) + COVID"
Call:
lm(formula = x)
Residuals:
Min 1Q Median 3Q Max
-5.1087 -1.3251 -0.1116 1.3688 6.7143
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -69.3147 25.8577 -2.681 0.01071 *
log(LBC) 1.3545 0.8284 1.635 0.11007
log(SC) 5.4368 1.6110 3.375 0.00168 **
log(INC) 16.4622 5.1727 3.182 0.00286 **
COVID 1.0178 1.2655 0.804 0.42612
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.318 on 39 degrees of freedom
Multiple R-squared: 0.9565, Adjusted R-squared: 0.9521
F-statistic: 214.5 on 4 and 39 DF, p-value: < 2.2e-16
RESET test
data: m
RESET = 3.2247, df1 = 2, df2 = 37, p-value = 0.05117
White's test results
Null hypothesis: Homoskedasticity of the residuals
Alternative hypothesis: Heteroskedasticity of the residuals
Test Statistic: 6.39
P-value: 0.040873
Jarque Bera Test
data: m$residuals
X-squared = 3.4658, df = 2, p-value = 0.1768
Breusch-Godfrey test for serial correlation of order up to 1
data: m
LM test = 4.1139, df = 1, p-value = 0.04253
[1] "log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)"
Call:
lm(formula = x)
Residuals:
Min 1Q Median 3Q Max
-0.263418 -0.043335 0.003576 0.044674 0.213069
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.067292 1.867164 -2.178 0.035491 *
log(LBC) -0.062599 0.032225 -1.943 0.059312 .
log(SC) 0.997361 0.063686 15.661 < 2e-16 ***
AGE -0.047760 0.009051 -5.277 5.22e-06 ***
log(INC) 1.744827 0.454160 3.842 0.000438 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.08673 on 39 degrees of freedom
Multiple R-squared: 0.9865, Adjusted R-squared: 0.9851
F-statistic: 710.6 on 4 and 39 DF, p-value: < 2.2e-16
RESET test
data: m
RESET = 7.5615, df1 = 2, df2 = 37, p-value = 0.001765
White's test results
Null hypothesis: Homoskedasticity of the residuals
Alternative hypothesis: Heteroskedasticity of the residuals
Test Statistic: 6.91
P-value: 0.03156
Jarque Bera Test
data: m$residuals
X-squared = 4.724, df = 2, p-value = 0.09423
Breusch-Godfrey test for serial correlation of order up to 1
data: m
LM test = 6.1862, df = 1, p-value = 0.01288
[1] "REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2)"
Call:
lm(formula = x)
Residuals:
Min 1Q Median 3Q Max
-5.143 -1.182 -0.321 1.298 7.221
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7.859e+01 1.449e+01 -5.422 3.53e-06 ***
LBC 5.620e-01 8.492e-01 0.662 0.5121
SC 2.357e+00 1.101e+00 2.141 0.0388 *
AGE 2.717e+00 4.837e-01 5.616 1.91e-06 ***
I(INC^2) 3.511e-04 1.460e-04 2.405 0.0212 *
I(AGE^2) -2.249e-02 4.816e-03 -4.671 3.69e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.316 on 38 degrees of freedom
Multiple R-squared: 0.9577, Adjusted R-squared: 0.9521
F-statistic: 172.1 on 5 and 38 DF, p-value: < 2.2e-16
RESET test
data: m
RESET = 2.9558, df1 = 2, df2 = 36, p-value = 0.06478
White's test results
Null hypothesis: Homoskedasticity of the residuals
Alternative hypothesis: Heteroskedasticity of the residuals
Test Statistic: 6.71
P-value: 0.034924
Jarque Bera Test
data: m$residuals
X-squared = 6.634, df = 2, p-value = 0.03626
Breusch-Godfrey test for serial correlation of order up to 1
data: m
LM test = 3.3998, df = 1, p-value = 0.0652
We choose model 4 - quadratic model to be the main model to analyze
summary(quadraticmodel)
Call:
lm(formula = REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2))
Residuals:
Min 1Q Median 3Q Max
-5.143 -1.182 -0.321 1.298 7.221
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -7.859e+01 1.449e+01 -5.422 3.53e-06 ***
LBC 5.620e-01 8.492e-01 0.662 0.5121
SC 2.357e+00 1.101e+00 2.141 0.0388 *
AGE 2.717e+00 4.837e-01 5.616 1.91e-06 ***
I(INC^2) 3.511e-04 1.460e-04 2.405 0.0212 *
I(AGE^2) -2.249e-02 4.816e-03 -4.671 3.69e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.316 on 38 degrees of freedom
Multiple R-squared: 0.9577, Adjusted R-squared: 0.9521
F-statistic: 172.1 on 5 and 38 DF, p-value: < 2.2e-16
Test for omitted data:
fullmodel <- lm(REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2) + COVID)
anova(fullmodel,quadraticmodel)
Analysis of Variance Table
Model 1: REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2) + COVID
Model 2: REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2)
Res.Df RSS Df Sum of Sq F Pr(>F)
1 37 203.77
2 38 203.79 -1 -0.02382 0.0043 0.9479
Anova of REV on COVID:
ano <- aov(REV~COVID, data = df)
summary(ano)
Df Sum Sq Mean Sq F value Pr(>F)
COVID 1 3034 3033.8 71.37 1.34e-10 ***
Residuals 42 1785 42.5
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
resettest(quadraticmodel,power = 2:3, type = "fitted")
RESET test
data: quadraticmodel
RESET = 2.9558, df1 = 2, df2 = 36, p-value = 0.06478
white_test(quadraticmodel)
White's test results
Null hypothesis: Homoskedasticity of the residuals
Alternative hypothesis: Heteroskedasticity of the residuals
Test Statistic: 6.71
P-value: 0.034924
resid <- quadraticmodel$residuals
jarque.bera.test(resid)
Jarque Bera Test
data: resid
X-squared = 6.634, df = 2, p-value = 0.03626
print(mean(resid))
[1] 2.775558e-17
print(median(resid))
[1] -0.320985
print(max(resid))
[1] 7.220575
print(min(resid))
[1] -5.142712
print(sd(resid))
[1] 2.17701
print(skewness(resid))
[1] 0.383959
print(kurtosis(resid))
[1] 1.515737
ggplot(df, aes(resid)) +
geom_histogram(fill = "navy", color = "white", bins = 10) +
theme_minimal() +
labs(title = "Histogram of Residuals", x = "", y = "") +
theme(plot.title = element_text(hjust = 0.5))
bgtest(quadraticmodel, order = 1)
Breusch-Godfrey test for serial correlation of order up to 1
data: quadraticmodel
LM test = 3.3998, df = 1, p-value = 0.0652
bgtest(quadraticmodel, order = 2)
Breusch-Godfrey test for serial correlation of order up to 2
data: quadraticmodel
LM test = 3.562, df = 2, p-value = 0.1685
vif(quadraticmodel)
LBC SC AGE I(INC^2) I(AGE^2)
4.74133 33.79651 309.51893 64.69675 483.30265
m1 <- lm(QUAREV ~ COVID + EXCHANGE + GDPPERCA + WAGE + INCOME)
summary(m1)
vif(m1)