I. Initialization

Below is all the packages that we use to analyze

library(readxl)
library(zoo)
library(lmtest)
library(broom)
library(car)
library(carData)
library(whitestrap)
library(tseries)
library(corrplot)
library(ggcorrplot)
library(e1071)
library(tseries)

Read data and assign variables. Converting unit of variables.

data <- read_xlsx("C:/Users/pptha/Downloads/thuktl.xlsx")
REV <- data[[2]] /1000000000000
LBC <- data[[3]] /1000000000000
SC <- data[[4]] /1000000000000
AGE <- data[[5]]
INC <- data[[6]] /100000
COVID <- data[[7]]
df <- data.frame(REV,LBC,SC,AGE,INC,COVID)

Setting up functions to calculate RMSE, MAE, MAPE:

mape <- function(actual, predicted) {
  mean(abs((actual - predicted) / actual)) * 100
}
rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}
mae <- function(actual, predicted) {
  if(length(actual) != length(predicted)) {
    stop("Vectors 'actual' and 'predicted' must have the same length.")
  }
   mean(abs(actual - predicted))
}

II. Descriptive statistics

desstat <- function(x){
  summaryvec<- c(
    mean(x), median(x),max(x),min(x),sd(x),skewness(x),kurtosis(x),jarque.bera.test(REV)$statistic,jarque.bera.test(REV)$p.value
  )
  return(summaryvec)
}


for(i in 1:6){
  print(colnames(df)[i])
  print(desstat(df[[i]]))
}

III. Correlation matrix

cor_matrix <- cor(df)
p <- ggcorrplot(cor_matrix, 
           lab = TRUE, 
           hc.order = TRUE, 
           type = "full",          
           colors = c("red", "white", "blue"),
           outline.color = "gray")
p

IV. Model choosing

1. Linear model:



#a: alpha
#b: minimum for R_squared 
crawlmodel <- function(a,b,vars,dep){
  #There are 5 independent variables and 1 dependent variable => 2^5 - 1 possible models
  n <- length(vars)
  for (i in 1:(2^n - 1)) {
    bits <- as.logical(rev(intToBits(i)[1:n]))
    # Convert to binary TRUE/FALSE selector
  
    selected_vars <- vars[bits]
    #Select independent variables based on TRUE/FALSE vector
  
    formula_str <- paste(paste(dep),"~", paste(selected_vars, collapse = " + "))  
    # Create formula: a ~ b + c + d (depending on selected_vars)
   
    model <- lm(as.formula(formula_str), data = df)
    # summary the model according to formula
  
    f_stat <- summary(model)$fstatistic
    f_p_value <- pf(f_stat["value"], f_stat["numdf"], f_stat["dendf"], lower.tail = FALSE)
    #get fstat pvalue 
  
    if((resettest(model)$p.value >= a)& (f_p_value <= a) & (summary(model)$r.squared >= b)){
    #set conditions for choosing variables
    
      print(formula_str)
      print(summary(model))
      print(reset(model))
      #print the result
    }
  }
}
varstring <- c( "LBC" ,"SC","AGE", "INC","COVID" )
depvar <- "REV"
crawlmodel(0.05,0.5,varstring,depvar)
[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

When choosing p-value for Ramsay test >= 0.05, only 1 model satisfies. Alternate way is to use backward procedures:

model <- lm( REV ~  LBC + SC + AGE + INC + COVID)
step(model, direction = "backward")
Start:  AIC=94.95
REV ~ LBC + SC + AGE + INC + COVID

        Df Sum of Sq    RSS     AIC
- INC    1     0.415 290.28  93.013
- SC     1     9.650 299.52  94.391
<none>               289.87  94.950
- AGE    1    17.094 306.96  95.471
- COVID  1    34.779 324.65  97.936
- LBC    1   107.563 397.43 106.836

Step:  AIC=93.01
REV ~ LBC + SC + AGE + COVID

        Df Sum of Sq    RSS     AIC
- SC     1     9.263 299.54  92.395
<none>               290.28  93.013
- COVID  1    35.636 325.92  96.108
- LBC    1   115.653 405.93 105.768
- AGE    1   118.578 408.86 106.084

Step:  AIC=92.4
REV ~ LBC + AGE + COVID

        Df Sum of Sq     RSS     AIC
<none>                299.54  92.395
- COVID  1     28.18  327.73  94.351
- LBC    1    233.94  533.48 115.790
- AGE    1    941.67 1241.21 152.945

Call:
lm(formula = REV ~ LBC + AGE + COVID)

Coefficients:
(Intercept)          LBC          AGE        COVID  
   -27.6397       3.3180       0.7419      -3.2279  

=> Result of backward procedure: REV ~ LBC + AGE + COVID

linearmodel <- lm(REV ~ LBC + AGE + COVID)
summary(linearmodel)

Call:
lm(formula = REV ~ LBC + AGE + COVID)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.8455 -1.1692 -0.1513  1.6782  6.3617 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -27.63973    3.42949  -8.059 6.57e-10 ***
LBC           3.31798    0.59364   5.589 1.78e-06 ***
AGE           0.74191    0.06616  11.214 6.42e-14 ***
COVID        -3.22792    1.66396  -1.940   0.0595 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.737 on 40 degrees of freedom
Multiple R-squared:  0.9378,    Adjusted R-squared:  0.9332 
F-statistic: 201.2 on 3 and 40 DF,  p-value: < 2.2e-16

2. Semi-log model

a. Log-Lin:

depvar <- "log(REV)"
varstring <- c("LBC","SC","AGE","INC","COVID")
crawlmodel(0.05,0.5,varstring,depvar)
crawlmodel(0.05,0,varstring,depvar)
[1] "log(REV) ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.21109 -0.16666  0.02516  0.30368  0.82952 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   2.4654     0.1069  23.064  < 2e-16 ***
COVID         0.9648     0.1585   6.085 2.99e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.5237 on 42 degrees of freedom
Multiple R-squared:  0.4686,    Adjusted R-squared:  0.4559 
F-statistic: 37.03 on 1 and 42 DF,  p-value: 2.992e-07


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

=> Only 1 model: log(REV) ~ COVID satisfies the conditions, though, R_squared = 0.4686 is not efficient.

b. Lin - Log:


#those are varibles that can be log: LBC, SC, INC:
varstring1 <- c( "log(LBC)" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring2 <- c( "LBC" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring3 <- c( "log(LBC)" ,"SC","AGE", "log(INC)","COVID" )
varstring4 <- c( "log(LBC)" ,"log(SC)","AGE", "INC","COVID" )
varstring5 <- c( "LBC" ,"SC","AGE", "log(INC)","COVID" )
varstring6 <- c( "LBC" ,"log(SC)","AGE", "INC","COVID" )
varstring7 <- c( "log(LBC)" ,"SC","AGE", "INC","COVID" )

Each independent variable log options:

depvar <- "REV"
crawlmodel(0.05,0.5,varstring1,depvar)
[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

[1] "REV ~ log(LBC)"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-7.236 -4.528 -2.376  4.146 14.646 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  21.6685     0.9233  23.468  < 2e-16 ***
log(LBC)      9.3428     1.0037   9.309 9.11e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.12 on 42 degrees of freedom
Multiple R-squared:  0.6735,    Adjusted R-squared:  0.6658 
F-statistic: 86.65 on 1 and 42 DF,  p-value: 9.109e-12


    RESET test

data:  model
RESET = 1.1008, df1 = 2, df2 = 40, p-value = 0.3425

[1] "REV ~ log(LBC) + log(SC) + log(INC) + COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1087 -1.3251 -0.1116  1.3688  6.7143 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -69.3147    25.8577  -2.681  0.01071 * 
log(LBC)      1.3545     0.8284   1.635  0.11007   
log(SC)       5.4368     1.6110   3.375  0.00168 **
log(INC)     16.4622     5.1727   3.182  0.00286 **
COVID         1.0178     1.2655   0.804  0.42612   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.318 on 39 degrees of freedom
Multiple R-squared:  0.9565,    Adjusted R-squared:  0.9521 
F-statistic: 214.5 on 4 and 39 DF,  p-value: < 2.2e-16


    RESET test

data:  model
RESET = 3.2247, df1 = 2, df2 = 37, p-value = 0.05117

[1] "REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1321 -1.3081 -0.0483  1.1535  7.3288 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) -120.6483    53.8931  -2.239  0.03111 * 
log(LBC)       0.9628     0.9019   1.067  0.29250   
log(SC)        6.3865     1.8303   3.489  0.00124 **
AGE           -0.3420     0.3152  -1.085  0.28480   
log(INC)      30.0636    13.5578   2.217  0.03265 * 
COVID          2.1681     1.6488   1.315  0.19640   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.313 on 38 degrees of freedom
Multiple R-squared:  0.9578,    Adjusted R-squared:  0.9523 
F-statistic: 172.6 on 5 and 38 DF,  p-value: < 2.2e-16


    RESET test

data:  model
RESET = 3.1037, df1 = 2, df2 = 36, p-value = 0.05708

=> Model can be chosen are:

[1] “REV ~ log(LBC) + log(SC) + log(INC) + COVID”

[2] “REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID”

For varstring2:

 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring2,depvar)
[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

[1] "REV ~ LBC + log(SC) + log(INC)"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1546 -1.2258 -0.4381  1.4164  6.9168 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -81.2107    22.1796  -3.662 0.000726 ***
LBC           1.2520     0.6699   1.869 0.068958 .  
log(SC)       5.5849     1.4049   3.975 0.000287 ***
log(INC)     18.4620     4.3354   4.258 0.000121 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.292 on 40 degrees of freedom
Multiple R-squared:  0.9564,    Adjusted R-squared:  0.9531 
F-statistic: 292.5 on 3 and 40 DF,  p-value: < 2.2e-16


    RESET test

data:  model
RESET = 3.2297, df1 = 2, df2 = 38, p-value = 0.05065

[1] "REV ~ LBC + log(SC) + log(INC) + COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1389 -1.2532 -0.3672  1.4695  6.7797 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -76.4451    28.0150  -2.729 0.009487 ** 
LBC           1.1799     0.7235   1.631 0.110981    
log(SC)       5.7110     1.4890   3.835 0.000446 ***
log(INC)     17.5219     5.4940   3.189 0.002812 ** 
COVID         0.3831     1.3484   0.284 0.777819    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.319 on 39 degrees of freedom
Multiple R-squared:  0.9565,    Adjusted R-squared:  0.952 
F-statistic: 214.4 on 4 and 39 DF,  p-value: < 2.2e-16


    RESET test

data:  model
RESET = 3.088, df1 = 2, df2 = 37, p-value = 0.05751

[1] "REV ~ LBC + log(SC) + AGE + log(INC) + COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1447 -1.3675 -0.0311  1.2240  7.3622 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -124.4926    53.4899  -2.327 0.025370 *  
LBC            0.8220     0.7984   1.030 0.309721    
log(SC)        6.6004     1.7097   3.860 0.000426 ***
AGE           -0.3367     0.3195  -1.054 0.298594    
log(INC)      30.5236    13.5018   2.261 0.029588 *  
COVID          1.7109     1.8440   0.928 0.359368    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.315 on 38 degrees of freedom
Multiple R-squared:  0.9577,    Adjusted R-squared:  0.9522 
F-statistic: 172.2 on 5 and 38 DF,  p-value: < 2.2e-16


    RESET test

data:  model
RESET = 3.1136, df1 = 2, df2 = 36, p-value = 0.05659

=> Model can be chosen are:

[3]“REV ~ LBC + log(SC) + log(INC) + COVID”

[4]“REV ~ LBC + log(SC) + AGE + log(INC) + COVID”

For varstring3:

 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring3,depvar)
[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

[1] "REV ~ log(LBC)"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-7.236 -4.528 -2.376  4.146 14.646 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  21.6685     0.9233  23.468  < 2e-16 ***
log(LBC)      9.3428     1.0037   9.309 9.11e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.12 on 42 degrees of freedom
Multiple R-squared:  0.6735,    Adjusted R-squared:  0.6658 
F-statistic: 86.65 on 1 and 42 DF,  p-value: 9.109e-12


    RESET test

data:  model
RESET = 1.1008, df1 = 2, df2 = 40, p-value = 0.3425

=> No model has more than 1 independent variable that satisfies the conditions

For varstring4:

 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring4,depvar)
[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

[1] "REV ~ log(LBC)"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-7.236 -4.528 -2.376  4.146 14.646 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  21.6685     0.9233  23.468  < 2e-16 ***
log(LBC)      9.3428     1.0037   9.309 9.11e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.12 on 42 degrees of freedom
Multiple R-squared:  0.6735,    Adjusted R-squared:  0.6658 
F-statistic: 86.65 on 1 and 42 DF,  p-value: 9.109e-12


    RESET test

data:  model
RESET = 1.1008, df1 = 2, df2 = 40, p-value = 0.3425

=> No model has more than 1 independent variable that satisfies the conditions

For varstring5:

 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring5,depvar)
[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

=> No model has more than 1 independent variable that satisfies the conditions

For varstring6:

 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring6,depvar)
[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

=> No model has more than 1 independent variable that satisfies the conditions

For varstring7:

 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring7,depvar)
[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

[1] "REV ~ log(LBC)"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-7.236 -4.528 -2.376  4.146 14.646 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  21.6685     0.9233  23.468  < 2e-16 ***
log(LBC)      9.3428     1.0037   9.309 9.11e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.12 on 42 degrees of freedom
Multiple R-squared:  0.6735,    Adjusted R-squared:  0.6658 
F-statistic: 86.65 on 1 and 42 DF,  p-value: 9.109e-12


    RESET test

data:  model
RESET = 1.1008, df1 = 2, df2 = 40, p-value = 0.3425

=> No model has more than 1 independent variable that satisfies the conditions

Conclusion:

There are 4 semi-log models that satisfy the conditions:

  • Overall significant: p-val of F-test <= 0.05

  • Bypass Ramsay test: p-val >= 0.05

  • R_squared >= 0.5

(Check out those in crawlmodel function)

These functions are:

[1] “REV ~ log(LBC) + log(SC) + log(INC) + COVID”

[2] “REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID”

[3]“REV ~ LBC + log(SC) + log(INC) + COVID”

[4]“REV ~ LBC + log(SC) + AGE + log(INC) + COVID”

lm1 <- lm(REV ~ log(LBC) + log(SC) + log(INC) + COVID)
summary(lm1)

Call:
lm(formula = REV ~ log(LBC) + log(SC) + log(INC) + COVID)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1087 -1.3251 -0.1116  1.3688  6.7143 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -69.3147    25.8577  -2.681  0.01071 * 
log(LBC)      1.3545     0.8284   1.635  0.11007   
log(SC)       5.4368     1.6110   3.375  0.00168 **
log(INC)     16.4622     5.1727   3.182  0.00286 **
COVID         1.0178     1.2655   0.804  0.42612   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.318 on 39 degrees of freedom
Multiple R-squared:  0.9565,    Adjusted R-squared:  0.9521 
F-statistic: 214.5 on 4 and 39 DF,  p-value: < 2.2e-16
lm2 <- lm(REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID)
summary(lm2)

Call:
lm(formula = REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1321 -1.3081 -0.0483  1.1535  7.3288 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) -120.6483    53.8931  -2.239  0.03111 * 
log(LBC)       0.9628     0.9019   1.067  0.29250   
log(SC)        6.3865     1.8303   3.489  0.00124 **
AGE           -0.3420     0.3152  -1.085  0.28480   
log(INC)      30.0636    13.5578   2.217  0.03265 * 
COVID          2.1681     1.6488   1.315  0.19640   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.313 on 38 degrees of freedom
Multiple R-squared:  0.9578,    Adjusted R-squared:  0.9523 
F-statistic: 172.6 on 5 and 38 DF,  p-value: < 2.2e-16
lm3 <- lm(REV ~ LBC + log(SC) + log(INC) + COVID)
summary(lm3)

Call:
lm(formula = REV ~ LBC + log(SC) + log(INC) + COVID)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1389 -1.2532 -0.3672  1.4695  6.7797 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -76.4451    28.0150  -2.729 0.009487 ** 
LBC           1.1799     0.7235   1.631 0.110981    
log(SC)       5.7110     1.4890   3.835 0.000446 ***
log(INC)     17.5219     5.4940   3.189 0.002812 ** 
COVID         0.3831     1.3484   0.284 0.777819    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.319 on 39 degrees of freedom
Multiple R-squared:  0.9565,    Adjusted R-squared:  0.952 
F-statistic: 214.4 on 4 and 39 DF,  p-value: < 2.2e-16
lm4 <- lm(REV ~ LBC + log(SC) + AGE + log(INC) + COVID)
summary(lm4)

Call:
lm(formula = REV ~ LBC + log(SC) + AGE + log(INC) + COVID)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1447 -1.3675 -0.0311  1.2240  7.3622 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -124.4926    53.4899  -2.327 0.025370 *  
LBC            0.8220     0.7984   1.030 0.309721    
log(SC)        6.6004     1.7097   3.860 0.000426 ***
AGE           -0.3367     0.3195  -1.054 0.298594    
log(INC)      30.5236    13.5018   2.261 0.029588 *  
COVID          1.7109     1.8440   0.928 0.359368    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.315 on 38 degrees of freedom
Multiple R-squared:  0.9577,    Adjusted R-squared:  0.9522 
F-statistic: 172.2 on 5 and 38 DF,  p-value: < 2.2e-16

=> Choose model 1 to analyze as semi-log model because:

  • Has 2/4 significant variables

  • Has higher adjusted R-squared than model 3

semilogmodel <- lm(REV ~ log(LBC) + log(SC) + log(INC) + COVID)
summary(semilogmodel)

Call:
lm(formula = REV ~ log(LBC) + log(SC) + log(INC) + COVID)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1087 -1.3251 -0.1116  1.3688  6.7143 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -69.3147    25.8577  -2.681  0.01071 * 
log(LBC)      1.3545     0.8284   1.635  0.11007   
log(SC)       5.4368     1.6110   3.375  0.00168 **
log(INC)     16.4622     5.1727   3.182  0.00286 **
COVID         1.0178     1.2655   0.804  0.42612   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.318 on 39 degrees of freedom
Multiple R-squared:  0.9565,    Adjusted R-squared:  0.9521 
F-statistic: 214.5 on 4 and 39 DF,  p-value: < 2.2e-16

3. Log - log:

depvar <- "log(REV)"
varstring1 <- c( "log(LBC)" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring2 <- c( "LBC" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring3 <- c( "log(LBC)" ,"SC","AGE", "log(INC)","COVID" )
varstring4 <- c( "log(LBC)" ,"log(SC)","AGE", "INC","COVID" )
varstring5 <- c( "LBC" ,"SC","AGE", "log(INC)","COVID" )
varstring6 <- c( "LBC" ,"log(SC)","AGE", "INC","COVID" )
varstring7 <- c( "log(LBC)" ,"SC","AGE", "INC","COVID" )
crawlmodel(0.05,0.5,varstring1,depvar)
crawlmodel(0.05,0.5,varstring2,depvar)
crawlmodel(0.05,0.5,varstring3,depvar)
crawlmodel(0.05,0.5,varstring4,depvar)
crawlmodel(0.05,0.5,varstring5,depvar)
crawlmodel(0.05,0.5,varstring6,depvar)
crawlmodel(0.05,0.5,varstring7,depvar)

=> No model with log(REV) satisfies the conditions.

Use backward methods for varstring1:

model <- lm( log(REV) ~  log(LBC) + log(SC)+ AGE+ log(INC) + COVID)
step(model, direction = "backward")
Start:  AIC=-209.8
log(REV) ~ log(LBC) + log(SC) + AGE + log(INC) + COVID

           Df Sum of Sq     RSS     AIC
- COVID     1   0.00873 0.29333 -210.47
<none>                  0.28460 -209.80
- log(LBC)  1   0.01747 0.30207 -209.18
- log(INC)  1   0.06537 0.34997 -202.70
- AGE       1   0.08430 0.36890 -200.38
- log(SC)   1   1.50187 1.78647 -130.97

Step:  AIC=-210.47
log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)

           Df Sum of Sq     RSS     AIC
<none>                  0.29333 -210.47
- log(LBC)  1   0.02838 0.32172 -208.40
- log(INC)  1   0.11102 0.40435 -198.34
- AGE       1   0.20941 0.50274 -188.76
- log(SC)   1   1.84466 2.13800 -125.07

Call:
lm(formula = log(REV) ~ log(LBC) + log(SC) + AGE + log(INC))

Coefficients:
(Intercept)     log(LBC)      log(SC)          AGE     log(INC)  
   -4.06729     -0.06260      0.99736     -0.04776      1.74483  

Result model:

[1] log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)

Use backward methods for varstring2:

model <- lm( log(REV) ~  LBC+ log(SC)+ AGE + log(INC) + COVID)
step(model, direction = "backward")
Start:  AIC=-210.62
log(REV) ~ LBC + log(SC) + AGE + log(INC) + COVID

           Df Sum of Sq     RSS     AIC
- COVID     1   0.00167 0.28103 -212.35
<none>                  0.27936 -210.62
- LBC       1   0.02271 0.30207 -209.18
- log(INC)  1   0.06616 0.34552 -203.26
- AGE       1   0.08916 0.36852 -200.43
- log(SC)   1   1.73079 2.01015 -125.78

Step:  AIC=-212.35
log(REV) ~ LBC + log(SC) + AGE + log(INC)

           Df Sum of Sq     RSS     AIC
<none>                  0.28103 -212.35
- LBC       1   0.04069 0.32172 -208.40
- log(INC)  1   0.09464 0.37567 -201.58
- AGE       1   0.19993 0.48095 -190.71
- log(SC)   1   2.47354 2.75457 -113.92

Call:
lm(formula = log(REV) ~ LBC + log(SC) + AGE + log(INC))

Coefficients:
(Intercept)          LBC      log(SC)          AGE     log(INC)  
   -3.41721     -0.05932      0.98741     -0.04506      1.60610  

Result model:

model <- lm( log(REV) ~  LBC+ log(SC)+ AGE + log(INC) + COVID)
step(model, direction = "backward")
Start:  AIC=-210.62
log(REV) ~ LBC + log(SC) + AGE + log(INC) + COVID

           Df Sum of Sq     RSS     AIC
- COVID     1   0.00167 0.28103 -212.35
<none>                  0.27936 -210.62
- LBC       1   0.02271 0.30207 -209.18
- log(INC)  1   0.06616 0.34552 -203.26
- AGE       1   0.08916 0.36852 -200.43
- log(SC)   1   1.73079 2.01015 -125.78

Step:  AIC=-212.35
log(REV) ~ LBC + log(SC) + AGE + log(INC)

           Df Sum of Sq     RSS     AIC
<none>                  0.28103 -212.35
- LBC       1   0.04069 0.32172 -208.40
- log(INC)  1   0.09464 0.37567 -201.58
- AGE       1   0.19993 0.48095 -190.71
- log(SC)   1   2.47354 2.75457 -113.92

Call:
lm(formula = log(REV) ~ LBC + log(SC) + AGE + log(INC))

Coefficients:
(Intercept)          LBC      log(SC)          AGE     log(INC)  
   -3.41721     -0.05932      0.98741     -0.04506      1.60610  

[2] log(REV) ~ LBC + log(SC) + AGE + log(INC)

Use backward methods for varstring3:

model <- lm( log(REV) ~   log(LBC) + SC + AGE + log(INC) + COVID)
step(model, direction = "backward")
Start:  AIC=-129.01
log(REV) ~ log(LBC) + SC + AGE + log(INC) + COVID

           Df Sum of Sq    RSS     AIC
- SC        1   0.00131 1.7865 -130.97
- log(INC)  1   0.00158 1.7867 -130.97
<none>                  1.7852 -129.01
- AGE       1   0.09517 1.8803 -128.72
- COVID     1   0.30312 2.0883 -124.11
- log(LBC)  1   1.46901 3.2542 -104.59

Step:  AIC=-130.97
log(REV) ~ log(LBC) + AGE + log(INC) + COVID

           Df Sum of Sq    RSS      AIC
- log(INC)  1   0.00239 1.7889 -132.915
<none>                  1.7865 -130.974
- AGE       1   0.11348 1.8999 -130.264
- COVID     1   0.35153 2.1380 -125.070
- log(LBC)  1   2.08731 3.8738  -98.918

Step:  AIC=-132.91
log(REV) ~ log(LBC) + AGE + COVID

           Df Sum of Sq    RSS      AIC
<none>                  1.7889 -132.915
- COVID     1   0.45813 2.2470 -124.882
- log(LBC)  1   2.14712 3.9360 -100.217
- AGE       1   2.97297 4.7618  -91.837

Call:
lm(formula = log(REV) ~ log(LBC) + AGE + COVID)

Coefficients:
(Intercept)     log(LBC)          AGE        COVID  
    0.20080      0.33513      0.04602     -0.40586  

Result model:

[3] log(REV) ~ log(LBC) + AGE + COVID

Use backward methods for varstring4:

model <- lm( log(REV) ~  log(LBC) + log(SC) + AGE + INC + COVID)
step(model, direction = "backward")
Start:  AIC=-209.18
log(REV) ~ log(LBC) + log(SC) + AGE + INC + COVID

           Df Sum of Sq     RSS     AIC
- COVID     1   0.00681 0.29546 -210.15
- log(LBC)  1   0.01025 0.29891 -209.64
<none>                  0.28866 -209.18
- INC       1   0.06131 0.34997 -202.70
- AGE       1   0.07862 0.36728 -200.58
- log(SC)   1   1.30296 1.59161 -136.06

Step:  AIC=-210.15
log(REV) ~ log(LBC) + log(SC) + AGE + INC

           Df Sum of Sq     RSS     AIC
<none>                  0.29546 -210.15
- log(LBC)  1   0.01586 0.31133 -209.85
- INC       1   0.10889 0.40435 -198.34
- AGE       1   0.19491 0.49038 -189.86
- log(SC)   1   1.89975 2.19521 -123.91

Call:
lm(formula = log(REV) ~ log(LBC) + log(SC) + AGE + INC)

Coefficients:
(Intercept)     log(LBC)      log(SC)          AGE          INC  
   3.704860    -0.046619     1.099855    -0.052812     0.008271  

Result model:

[4] log(REV) ~ log(LBC) + log(SC) + AGE + INC

Use backward methods for varstring5:

model <- lm( log(REV) ~  LBC+ SC+AGE + log(INC) + COVID)
step(model, direction = "backward")
Start:  AIC=-123.81
log(REV) ~ LBC + SC + AGE + log(INC) + COVID

           Df Sum of Sq    RSS     AIC
- SC        1   0.00138 2.0102 -125.78
- log(INC)  1   0.00353 2.0123 -125.74
<none>                  2.0088 -123.81
- AGE       1   0.12894 2.1377 -123.08
- COVID     1   0.70933 2.7181 -112.51
- LBC       1   1.24539 3.2542 -104.59

Step:  AIC=-125.78
log(REV) ~ LBC + AGE + log(INC) + COVID

           Df Sum of Sq    RSS      AIC
- log(INC)  1   0.00485 2.0150 -127.677
<none>                  2.0102 -125.783
- AGE       1   0.16271 2.1729 -124.358
- COVID     1   0.74442 2.7546 -113.921
- LBC       1   1.86363 3.8738  -98.918

Step:  AIC=-127.68
log(REV) ~ LBC + AGE + COVID

        Df Sum of Sq    RSS      AIC
<none>               2.0150 -127.677
- COVID  1    0.9899 3.0049 -112.093
- LBC    1    1.9210 3.9360 -100.217
- AGE    1    5.4807 7.4957  -71.874

Call:
lm(formula = log(REV) ~ LBC + AGE + COVID)

Coefficients:
(Intercept)          LBC          AGE        COVID  
    -0.7869       0.3007       0.0566      -0.6050  

=> The result is not a log log model

Use backward methods for varstring6:

model <- lm( log(REV) ~  LBC + log(SC) + AGE + INC + COVID )
step(model, direction = "backward")
Start:  AIC=-209.56
log(REV) ~ LBC + log(SC) + AGE + INC + COVID

          Df Sum of Sq     RSS     AIC
- COVID    1   0.00216 0.28827 -211.23
- LBC      1   0.01280 0.29891 -209.64
<none>                 0.28611 -209.56
- INC      1   0.05940 0.34552 -203.26
- AGE      1   0.08032 0.36644 -200.68
- log(SC)  1   1.48655 1.77267 -131.31

Step:  AIC=-211.23
log(REV) ~ LBC + log(SC) + AGE + INC

          Df Sum of Sq     RSS     AIC
<none>                 0.28827 -211.23
- LBC      1   0.02305 0.31133 -209.85
- INC      1   0.08739 0.37567 -201.58
- AGE      1   0.17423 0.46250 -192.43
- log(SC)  1   2.61001 2.89828 -111.68

Call:
lm(formula = log(REV) ~ LBC + log(SC) + AGE + INC)

Coefficients:
(Intercept)          LBC      log(SC)          AGE          INC  
   3.715776    -0.045591     1.084177    -0.049574     0.007571  

Result model:

[6] log(REV) ~ LBC + log(SC) + AGE + INC

Use backward methods for varstring7:

model <- lm( log(REV) ~  log(LBC) + SC + AGE + INC + COVID )
step(model, direction = "backward")
Start:  AIC=-134.91
log(REV) ~ log(LBC) + SC + AGE + INC + COVID

           Df Sum of Sq    RSS     AIC
- SC        1   0.03064 1.5916 -136.06
<none>                  1.5610 -134.91
- INC       1   0.22576 1.7867 -130.97
- COVID     1   0.44208 2.0031 -125.94
- AGE       1   0.65774 2.2187 -121.44
- log(LBC)  1   1.03711 2.5981 -114.49

Step:  AIC=-136.06
log(REV) ~ log(LBC) + AGE + INC + COVID

           Df Sum of Sq    RSS     AIC
<none>                  1.5916 -136.06
- INC       1   0.19724 1.7889 -132.91
- COVID     1   0.60360 2.1952 -123.91
- AGE       1   0.75620 2.3478 -120.95
- log(LBC)  1   1.06801 2.6596 -115.46

Call:
lm(formula = log(REV) ~ log(LBC) + AGE + INC + COVID)

Coefficients:
(Intercept)     log(LBC)          AGE          INC        COVID  
   -0.46016      0.27485      0.09091     -0.01071     -0.48700  

Result model:

[7] log(REV) ~ log(LBC) + AGE + INC + COVID

Conclusion:

There are 6 result of log-log models, using backward procedure:

[1] log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)

[2] log(REV) ~ LBC + log(SC) + AGE + log(INC)

[3] log(REV) ~ log(LBC) + AGE + COVID

[4] log(REV) ~ log(LBC) + log(SC) + AGE + INC

[5] log(REV) ~ LBC + log(SC) + AGE + INC

[6] log(REV) ~ log(LBC) + AGE + INC + COVID

formulas <- c("log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)",
              "log(REV) ~ LBC + log(SC) + AGE + log(INC)",
              "log(REV) ~ log(LBC) + AGE + COVID",
              "log(REV) ~ log(LBC) + log(SC) + AGE + INC",
              "log(REV) ~ LBC + log(SC) + AGE + INC",
              "log(REV) ~ log(LBC) + AGE + INC + COVID")
for(i in 1:6){
  model <- lm(formula = formulas[i])
  print(formulas[i])
  print(summary(model))
}
[1] "log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)"

Call:
lm(formula = formulas[i])

Residuals:
      Min        1Q    Median        3Q       Max 
-0.263418 -0.043335  0.003576  0.044674  0.213069 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -4.067292   1.867164  -2.178 0.035491 *  
log(LBC)    -0.062599   0.032225  -1.943 0.059312 .  
log(SC)      0.997361   0.063686  15.661  < 2e-16 ***
AGE         -0.047760   0.009051  -5.277 5.22e-06 ***
log(INC)     1.744827   0.454160   3.842 0.000438 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.08673 on 39 degrees of freedom
Multiple R-squared:  0.9865,    Adjusted R-squared:  0.9851 
F-statistic: 710.6 on 4 and 39 DF,  p-value: < 2.2e-16

[1] "log(REV) ~ LBC + log(SC) + AGE + log(INC)"

Call:
lm(formula = formulas[i])

Residuals:
      Min        1Q    Median        3Q       Max 
-0.256953 -0.045770 -0.001234  0.041823  0.212744 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -3.417209   1.834437  -1.863 0.070034 .  
LBC         -0.059322   0.024964  -2.376 0.022494 *  
log(SC)      0.987414   0.053294  18.528  < 2e-16 ***
AGE         -0.045058   0.008554  -5.267 5.37e-06 ***
log(INC)     1.606099   0.443176   3.624 0.000827 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.08489 on 39 degrees of freedom
Multiple R-squared:  0.987, Adjusted R-squared:  0.9857 
F-statistic: 742.2 on 4 and 39 DF,  p-value: < 2.2e-16

[1] "log(REV) ~ log(LBC) + AGE + COVID"

Call:
lm(formula = formulas[i])

Residuals:
     Min       1Q   Median       3Q      Max 
-0.40191 -0.13725  0.01743  0.08875  0.42544 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.200801   0.310325   0.647  0.52128    
log(LBC)     0.335126   0.048366   6.929 2.36e-08 ***
AGE          0.046018   0.005644   8.153 4.90e-10 ***
COVID       -0.405861   0.126806  -3.201  0.00269 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.2115 on 40 degrees of freedom
Multiple R-squared:  0.9175,    Adjusted R-squared:  0.9113 
F-statistic: 148.2 on 3 and 40 DF,  p-value: < 2.2e-16

[1] "log(REV) ~ log(LBC) + log(SC) + AGE + INC"

Call:
lm(formula = formulas[i])

Residuals:
      Min        1Q    Median        3Q       Max 
-0.279817 -0.053597 -0.001524  0.046229  0.222259 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.704860   0.234488  15.800  < 2e-16 ***
log(LBC)    -0.046619   0.032218  -1.447 0.155895    
log(SC)      1.099855   0.069456  15.835  < 2e-16 ***
AGE         -0.052812   0.010412  -5.072 9.97e-06 ***
INC          0.008271   0.002182   3.791 0.000509 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.08704 on 39 degrees of freedom
Multiple R-squared:  0.9864,    Adjusted R-squared:  0.985 
F-statistic: 705.4 on 4 and 39 DF,  p-value: < 2.2e-16

[1] "log(REV) ~ LBC + log(SC) + AGE + INC"

Call:
lm(formula = formulas[i])

Residuals:
      Min        1Q    Median        3Q       Max 
-0.274722 -0.043456 -0.003529  0.044853  0.218597 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.715776   0.230300  16.134  < 2e-16 ***
LBC         -0.045591   0.025817  -1.766  0.08523 .  
log(SC)      1.084177   0.057697  18.791  < 2e-16 ***
AGE         -0.049574   0.010211  -4.855 1.98e-05 ***
INC          0.007571   0.002202   3.438  0.00141 ** 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.08597 on 39 degrees of freedom
Multiple R-squared:  0.9867,    Adjusted R-squared:  0.9853 
F-statistic: 723.2 on 4 and 39 DF,  p-value: < 2.2e-16

[1] "log(REV) ~ log(LBC) + AGE + INC + COVID"

Call:
lm(formula = formulas[i])

Residuals:
     Min       1Q   Median       3Q      Max 
-0.40894 -0.15619  0.00504  0.12893  0.38995 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -0.460162   0.422223  -1.090 0.282465    
log(LBC)     0.274845   0.053726   5.116 8.69e-06 ***
AGE          0.090907   0.021119   4.305 0.000109 ***
INC         -0.010709   0.004871  -2.198 0.033918 *  
COVID       -0.487005   0.126633  -3.846 0.000433 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.202 on 39 degrees of freedom
Multiple R-squared:  0.9266,    Adjusted R-squared:  0.919 
F-statistic:   123 on 4 and 39 DF,  p-value: < 2.2e-16

=> Model 1 is the most efficient, with exactly one variable log(LBC) has p-val of t-test = 0.059312 > 0.05. This is acceptable if takes alpha = 6%.

loglogmodel <- lm(log(REV) ~ log(LBC) + log(SC) + AGE + log(INC))
summary(loglogmodel)

Call:
lm(formula = log(REV) ~ log(LBC) + log(SC) + AGE + log(INC))

Residuals:
      Min        1Q    Median        3Q       Max 
-0.263418 -0.043335  0.003576  0.044674  0.213069 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -4.067292   1.867164  -2.178 0.035491 *  
log(LBC)    -0.062599   0.032225  -1.943 0.059312 .  
log(SC)      0.997361   0.063686  15.661  < 2e-16 ***
AGE         -0.047760   0.009051  -5.277 5.22e-06 ***
log(INC)     1.744827   0.454160   3.842 0.000438 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.08673 on 39 degrees of freedom
Multiple R-squared:  0.9865,    Adjusted R-squared:  0.9851 
F-statistic: 710.6 on 4 and 39 DF,  p-value: < 2.2e-16

4. Quadratic functions:

Through testing through formulas, we found the most efficient quadratic model:

quadraticmodel <- lm(REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2))
summary(quadraticmodel )

Call:
lm(formula = REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2))

Residuals:
   Min     1Q Median     3Q    Max 
-5.143 -1.182 -0.321  1.298  7.221 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -7.859e+01  1.449e+01  -5.422 3.53e-06 ***
LBC          5.620e-01  8.492e-01   0.662   0.5121    
SC           2.357e+00  1.101e+00   2.141   0.0388 *  
AGE          2.717e+00  4.837e-01   5.616 1.91e-06 ***
I(INC^2)     3.511e-04  1.460e-04   2.405   0.0212 *  
I(AGE^2)    -2.249e-02  4.816e-03  -4.671 3.69e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.316 on 38 degrees of freedom
Multiple R-squared:  0.9577,    Adjusted R-squared:  0.9521 
F-statistic: 172.1 on 5 and 38 DF,  p-value: < 2.2e-16

MOST EFFICIENT MODEL:

We use this code block to fill in the data of Table 04. Estimated Result of our report

modelana <- function(x){
  m <- lm(formula = x)
  print(x)
  print(summary(m))
  print(resettest(m,power = 2:3, type = "fitted"))
  print(white_test(m))
  print(jarque.bera.test(m$residuals))
  print(bgtest(m))
  rmse(REV, m$fitted.values)
  mae(REV, m$fitted.values)
  mape(REV,m$fitted.values)
}
formulas <- c("REV ~ LBC + AGE + COVID",
              "REV ~ log(LBC) + log(SC) + log(INC) + COVID",
              "log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)",
              "REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2)")
for(i in formulas){
  modelana(i)
}
[1] "REV ~ LBC + AGE + COVID"

Call:
lm(formula = x)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.8455 -1.1692 -0.1513  1.6782  6.3617 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -27.63973    3.42949  -8.059 6.57e-10 ***
LBC           3.31798    0.59364   5.589 1.78e-06 ***
AGE           0.74191    0.06616  11.214 6.42e-14 ***
COVID        -3.22792    1.66396  -1.940   0.0595 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.737 on 40 degrees of freedom
Multiple R-squared:  0.9378,    Adjusted R-squared:  0.9332 
F-statistic: 201.2 on 3 and 40 DF,  p-value: < 2.2e-16


    RESET test

data:  m
RESET = 8.3613, df1 = 2, df2 = 38, p-value = 0.0009788

White's test results

Null hypothesis: Homoskedasticity of the residuals
Alternative hypothesis: Heteroskedasticity of the residuals
Test Statistic: 12.39
P-value: 0.002038

    Jarque Bera Test

data:  m$residuals
X-squared = 4.9723, df = 2, p-value = 0.08323


    Breusch-Godfrey test for serial correlation of order up to 1

data:  m
LM test = 1.699, df = 1, p-value = 0.1924

[1] "REV ~ log(LBC) + log(SC) + log(INC) + COVID"

Call:
lm(formula = x)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1087 -1.3251 -0.1116  1.3688  6.7143 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -69.3147    25.8577  -2.681  0.01071 * 
log(LBC)      1.3545     0.8284   1.635  0.11007   
log(SC)       5.4368     1.6110   3.375  0.00168 **
log(INC)     16.4622     5.1727   3.182  0.00286 **
COVID         1.0178     1.2655   0.804  0.42612   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.318 on 39 degrees of freedom
Multiple R-squared:  0.9565,    Adjusted R-squared:  0.9521 
F-statistic: 214.5 on 4 and 39 DF,  p-value: < 2.2e-16


    RESET test

data:  m
RESET = 3.2247, df1 = 2, df2 = 37, p-value = 0.05117

White's test results

Null hypothesis: Homoskedasticity of the residuals
Alternative hypothesis: Heteroskedasticity of the residuals
Test Statistic: 6.39
P-value: 0.040873

    Jarque Bera Test

data:  m$residuals
X-squared = 3.4658, df = 2, p-value = 0.1768


    Breusch-Godfrey test for serial correlation of order up to 1

data:  m
LM test = 4.1139, df = 1, p-value = 0.04253

[1] "log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)"

Call:
lm(formula = x)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.263418 -0.043335  0.003576  0.044674  0.213069 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -4.067292   1.867164  -2.178 0.035491 *  
log(LBC)    -0.062599   0.032225  -1.943 0.059312 .  
log(SC)      0.997361   0.063686  15.661  < 2e-16 ***
AGE         -0.047760   0.009051  -5.277 5.22e-06 ***
log(INC)     1.744827   0.454160   3.842 0.000438 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.08673 on 39 degrees of freedom
Multiple R-squared:  0.9865,    Adjusted R-squared:  0.9851 
F-statistic: 710.6 on 4 and 39 DF,  p-value: < 2.2e-16


    RESET test

data:  m
RESET = 7.5615, df1 = 2, df2 = 37, p-value = 0.001765

White's test results

Null hypothesis: Homoskedasticity of the residuals
Alternative hypothesis: Heteroskedasticity of the residuals
Test Statistic: 6.91
P-value: 0.03156

    Jarque Bera Test

data:  m$residuals
X-squared = 4.724, df = 2, p-value = 0.09423


    Breusch-Godfrey test for serial correlation of order up to 1

data:  m
LM test = 6.1862, df = 1, p-value = 0.01288

[1] "REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2)"

Call:
lm(formula = x)

Residuals:
   Min     1Q Median     3Q    Max 
-5.143 -1.182 -0.321  1.298  7.221 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -7.859e+01  1.449e+01  -5.422 3.53e-06 ***
LBC          5.620e-01  8.492e-01   0.662   0.5121    
SC           2.357e+00  1.101e+00   2.141   0.0388 *  
AGE          2.717e+00  4.837e-01   5.616 1.91e-06 ***
I(INC^2)     3.511e-04  1.460e-04   2.405   0.0212 *  
I(AGE^2)    -2.249e-02  4.816e-03  -4.671 3.69e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.316 on 38 degrees of freedom
Multiple R-squared:  0.9577,    Adjusted R-squared:  0.9521 
F-statistic: 172.1 on 5 and 38 DF,  p-value: < 2.2e-16


    RESET test

data:  m
RESET = 2.9558, df1 = 2, df2 = 36, p-value = 0.06478

White's test results

Null hypothesis: Homoskedasticity of the residuals
Alternative hypothesis: Heteroskedasticity of the residuals
Test Statistic: 6.71
P-value: 0.034924

    Jarque Bera Test

data:  m$residuals
X-squared = 6.634, df = 2, p-value = 0.03626


    Breusch-Godfrey test for serial correlation of order up to 1

data:  m
LM test = 3.3998, df = 1, p-value = 0.0652

We choose model 4 - quadratic model to be the main model to analyze

V. Model analyze

summary(quadraticmodel)

Call:
lm(formula = REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2))

Residuals:
   Min     1Q Median     3Q    Max 
-5.143 -1.182 -0.321  1.298  7.221 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -7.859e+01  1.449e+01  -5.422 3.53e-06 ***
LBC          5.620e-01  8.492e-01   0.662   0.5121    
SC           2.357e+00  1.101e+00   2.141   0.0388 *  
AGE          2.717e+00  4.837e-01   5.616 1.91e-06 ***
I(INC^2)     3.511e-04  1.460e-04   2.405   0.0212 *  
I(AGE^2)    -2.249e-02  4.816e-03  -4.671 3.69e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.316 on 38 degrees of freedom
Multiple R-squared:  0.9577,    Adjusted R-squared:  0.9521 
F-statistic: 172.1 on 5 and 38 DF,  p-value: < 2.2e-16

Is COVID affecting REV?

Test for omitted data:

fullmodel <- lm(REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2) + COVID)
anova(fullmodel,quadraticmodel)
Analysis of Variance Table

Model 1: REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2) + COVID
Model 2: REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2)
  Res.Df    RSS Df Sum of Sq      F Pr(>F)
1     37 203.77                           
2     38 203.79 -1  -0.02382 0.0043 0.9479

Anova of REV on COVID:

ano <- aov(REV~COVID, data = df)
summary(ano)
            Df Sum Sq Mean Sq F value   Pr(>F)    
COVID        1   3034  3033.8   71.37 1.34e-10 ***
Residuals   42   1785    42.5                     
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

RAMSEY TEST:

resettest(quadraticmodel,power = 2:3, type = "fitted")

    RESET test

data:  quadraticmodel
RESET = 2.9558, df1 = 2, df2 = 36, p-value = 0.06478

WHITE TEST:

white_test(quadraticmodel)
White's test results

Null hypothesis: Homoskedasticity of the residuals
Alternative hypothesis: Heteroskedasticity of the residuals
Test Statistic: 6.71
P-value: 0.034924

NORMAL JB TEST:

resid <- quadraticmodel$residuals
jarque.bera.test(resid)

    Jarque Bera Test

data:  resid
X-squared = 6.634, df = 2, p-value = 0.03626
print(mean(resid))
[1] 2.775558e-17
print(median(resid))
[1] -0.320985
print(max(resid))
[1] 7.220575
print(min(resid))
[1] -5.142712
print(sd(resid))
[1] 2.17701
print(skewness(resid))
[1] 0.383959
print(kurtosis(resid))
[1] 1.515737
ggplot(df, aes(resid)) +
  geom_histogram(fill = "navy", color = "white", bins = 10) +
  theme_minimal() +
  labs(title = "Histogram of Residuals", x = "", y = "") +
  theme(plot.title = element_text(hjust = 0.5))

BG TEST:

bgtest(quadraticmodel, order = 1)

    Breusch-Godfrey test for serial correlation of order up to 1

data:  quadraticmodel
LM test = 3.3998, df = 1, p-value = 0.0652
bgtest(quadraticmodel, order = 2)

    Breusch-Godfrey test for serial correlation of order up to 2

data:  quadraticmodel
LM test = 3.562, df = 2, p-value = 0.1685

VIF:

vif(quadraticmodel)
      LBC        SC       AGE  I(INC^2)  I(AGE^2) 
  4.74133  33.79651 309.51893  64.69675 483.30265 
m1 <- lm(QUAREV ~ COVID + EXCHANGE +  GDPPERCA + WAGE + INCOME)
summary(m1)
vif(m1)
---
title: "Research on factors affecting The gioi di dong revenue"
output: html_notebook
---

# I. Initialization

Below is all the packages that we use to analyze

```{r}
library(readxl)
library(zoo)
library(lmtest)
library(broom)
library(car)
library(carData)
library(whitestrap)
library(tseries)
library(corrplot)
library(ggcorrplot)
library(e1071)
library(tseries)
```

Read data and assign variables. Converting unit of variables.

```{r}
data <- read_xlsx("C:/Users/ppthao/Downloads/thuktl.xlsx")
REV <- data[[2]] /1000000000000
LBC <- data[[3]] /1000000000000
SC <- data[[4]] /1000000000000
AGE <- data[[5]]
INC <- data[[6]] /100000
COVID <- data[[7]]
df <- data.frame(REV,LBC,SC,AGE,INC,COVID)

```

Setting up functions to calculate RMSE, MAE, MAPE:

```{r}
mape <- function(actual, predicted) {
  mean(abs((actual - predicted) / actual)) * 100
}
rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2))
}
mae <- function(actual, predicted) {
  if(length(actual) != length(predicted)) {
    stop("Vectors 'actual' and 'predicted' must have the same length.")
  }
   mean(abs(actual - predicted))
}
```

# II. Descriptive statistics

```{r}
desstat <- function(x){
  summaryvec<- c(
    mean(x), median(x),max(x),min(x),sd(x),skewness(x),kurtosis(x),jarque.bera.test(REV)$statistic,jarque.bera.test(REV)$p.value
  )
  return(summaryvec)
}


for(i in 1:6){
  print(colnames(df)[i])
  print(desstat(df[[i]]))
}
```

# III. Correlation matrix

```{r}
cor_matrix <- cor(df)
p <- ggcorrplot(cor_matrix, 
           lab = TRUE, 
           hc.order = TRUE, 
           type = "full",          
           colors = c("red", "white", "blue"),
           outline.color = "gray")
p

```

# IV. Model choosing

## 1. Linear model:

```{r}


#a: alpha
#b: minimum for R_squared 
crawlmodel <- function(a,b,vars,dep){
  #There are 5 independent variables and 1 dependent variable => 2^5 - 1 possible models
  n <- length(vars)
  for (i in 1:(2^n - 1)) {
    bits <- as.logical(rev(intToBits(i)[1:n]))
    # Convert to binary TRUE/FALSE selector
  
    selected_vars <- vars[bits]
    #Select independent variables based on TRUE/FALSE vector
  
    formula_str <- paste(paste(dep),"~", paste(selected_vars, collapse = " + "))  
    # Create formula: a ~ b + c + d (depending on selected_vars)
   
    model <- lm(as.formula(formula_str), data = df)
    # summary the model according to formula
  
    f_stat <- summary(model)$fstatistic
    f_p_value <- pf(f_stat["value"], f_stat["numdf"], f_stat["dendf"], lower.tail = FALSE)
    #get fstat pvalue 
  
    if((resettest(model)$p.value >= a)& (f_p_value <= a) & (summary(model)$r.squared >= b)){
    #set conditions for choosing variables
    
      print(formula_str)
      print(summary(model))
      print(reset(model))
      #print the result
    }
  }
}
varstring <- c( "LBC" ,"SC","AGE", "INC","COVID" )
depvar <- "REV"
crawlmodel(0.05,0.5,varstring,depvar)

```

When choosing p-value for Ramsay test \>= 0.05, only 1 model satisfies. Alternate way is to use backward procedures:

```{r}
model <- lm( REV ~  LBC + SC + AGE + INC + COVID)
step(model, direction = "backward")
```

=\> Result of backward procedure: REV \~ LBC + AGE + COVID

```{r}
linearmodel <- lm(REV ~ LBC + AGE + COVID)
summary(linearmodel)
```

## 2. Semi-log model

#### a. Log-Lin:

```{r}
depvar <- "log(REV)"
varstring <- c("LBC","SC","AGE","INC","COVID")
crawlmodel(0.05,0.5,varstring,depvar)
crawlmodel(0.05,0,varstring,depvar)
```

=\> Only 1 model: log(REV) \~ COVID satisfies the conditions, though, R_squared = 0.4686 is not efficient.

#### b. Lin - Log:

```{r}

#those are varibles that can be log: LBC, SC, INC:
varstring1 <- c( "log(LBC)" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring2 <- c( "LBC" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring3 <- c( "log(LBC)" ,"SC","AGE", "log(INC)","COVID" )
varstring4 <- c( "log(LBC)" ,"log(SC)","AGE", "INC","COVID" )
varstring5 <- c( "LBC" ,"SC","AGE", "log(INC)","COVID" )
varstring6 <- c( "LBC" ,"log(SC)","AGE", "INC","COVID" )
varstring7 <- c( "log(LBC)" ,"SC","AGE", "INC","COVID" )
```

Each independent variable log options:

```{r}
depvar <- "REV"
crawlmodel(0.05,0.5,varstring1,depvar)
```

=\> Model can be chosen are:

[1] "REV \~ log(LBC) + log(SC) + log(INC) + COVID"

[2] "REV \~ log(LBC) + log(SC) + AGE + log(INC) + COVID"

For varstring2:

```{r}
 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring2,depvar)
```

=\> Model can be chosen are:

[3]"REV \~ LBC + log(SC) + log(INC) + COVID"

[4]"REV \~ LBC + log(SC) + AGE + log(INC) + COVID"

For varstring3:

```{r}
 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring3,depvar)
```

=\> No model has more than 1 independent variable that satisfies the conditions

For varstring4:

```{r}
 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring4,depvar)
```

=\> No model has more than 1 independent variable that satisfies the conditions

For varstring5:

```{r}
 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring5,depvar)
```

=\> No model has more than 1 independent variable that satisfies the conditions

For varstring6:

```{r}
 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring6,depvar)
```

=\> No model has more than 1 independent variable that satisfies the conditions

For varstring7:

```{r}
 depvar <- "REV"
 crawlmodel(0.05,0.5,varstring7,depvar)
```

=\> No model has more than 1 independent variable that satisfies the conditions

#### **Conclusion:**

There are 4 semi-log models that satisfy the conditions:

-   Overall significant: p-val of F-test \<= 0.05

-   Bypass Ramsay test: p-val \>= 0.05

-   R_squared \>= 0.5

(Check out those in `crawlmodel` function)

These functions are:

[1] "REV \~ log(LBC) + log(SC) + log(INC) + COVID"

[2] "REV \~ log(LBC) + log(SC) + AGE + log(INC) + COVID"

[3]"REV \~ LBC + log(SC) + log(INC) + COVID"

[4]"REV \~ LBC + log(SC) + AGE + log(INC) + COVID"

```{r}
lm1 <- lm(REV ~ log(LBC) + log(SC) + log(INC) + COVID)
summary(lm1)
lm2 <- lm(REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID)
summary(lm2)
lm3 <- lm(REV ~ LBC + log(SC) + log(INC) + COVID)
summary(lm3)
lm4 <- lm(REV ~ LBC + log(SC) + AGE + log(INC) + COVID)
summary(lm4)
```

=\> Choose model 1 to analyze as semi-log model because:

-   Has 2/4 significant variables

-   Has higher adjusted R-squared than model 3

```{r}
semilogmodel <- lm(REV ~ log(LBC) + log(SC) + log(INC) + COVID)
summary(semilogmodel)
```

## 3. Log - log:

```{r}
depvar <- "log(REV)"
varstring1 <- c( "log(LBC)" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring2 <- c( "LBC" ,"log(SC)","AGE", "log(INC)","COVID" )
varstring3 <- c( "log(LBC)" ,"SC","AGE", "log(INC)","COVID" )
varstring4 <- c( "log(LBC)" ,"log(SC)","AGE", "INC","COVID" )
varstring5 <- c( "LBC" ,"SC","AGE", "log(INC)","COVID" )
varstring6 <- c( "LBC" ,"log(SC)","AGE", "INC","COVID" )
varstring7 <- c( "log(LBC)" ,"SC","AGE", "INC","COVID" )
crawlmodel(0.05,0.5,varstring1,depvar)
crawlmodel(0.05,0.5,varstring2,depvar)
crawlmodel(0.05,0.5,varstring3,depvar)
crawlmodel(0.05,0.5,varstring4,depvar)
crawlmodel(0.05,0.5,varstring5,depvar)
crawlmodel(0.05,0.5,varstring6,depvar)
crawlmodel(0.05,0.5,varstring7,depvar)
```

=\> No model with log(REV) satisfies the conditions.

Use backward methods for varstring1:

```{r}
model <- lm( log(REV) ~  log(LBC) + log(SC)+ AGE+ log(INC) + COVID)
step(model, direction = "backward")
```

Result model:

[1] log(REV) \~ log(LBC) + log(SC) + AGE + log(INC)

Use backward methods for varstring2:

```{r}
model <- lm( log(REV) ~  LBC+ log(SC)+ AGE + log(INC) + COVID)
step(model, direction = "backward")
```

Result model:

```{r}
model <- lm( log(REV) ~  LBC+ log(SC)+ AGE + log(INC) + COVID)
step(model, direction = "backward")
```

[2] log(REV) \~ LBC + log(SC) + AGE + log(INC)

Use backward methods for varstring3:

```{r}
model <- lm( log(REV) ~   log(LBC) + SC + AGE + log(INC) + COVID)
step(model, direction = "backward")
```

Result model:

[3] log(REV) \~ log(LBC) + AGE + COVID

Use backward methods for varstring4:

```{r}
model <- lm( log(REV) ~  log(LBC) + log(SC) + AGE + INC + COVID)
step(model, direction = "backward")
```

Result model:

[4] log(REV) \~ log(LBC) + log(SC) + AGE + INC

Use backward methods for varstring5:

```{r}
model <- lm( log(REV) ~  LBC+ SC+AGE + log(INC) + COVID)
step(model, direction = "backward")
```

=\> The result is not a log log model

Use backward methods for varstring6:

```{r}
model <- lm( log(REV) ~  LBC + log(SC) + AGE + INC + COVID )
step(model, direction = "backward")
```

Result model:

[6] log(REV) \~ LBC + log(SC) + AGE + INC

Use backward methods for varstring7:

```{r}
model <- lm( log(REV) ~  log(LBC) + SC + AGE + INC + COVID )
step(model, direction = "backward")

```

Result model:

[7] log(REV) \~ log(LBC) + AGE + INC + COVID

#### Conclusion:

There are 6 result of log-log models, using backward procedure:

[1] log(REV) \~ log(LBC) + log(SC) + AGE + log(INC)

[2] log(REV) \~ LBC + log(SC) + AGE + log(INC)

[3] log(REV) \~ log(LBC) + AGE + COVID

[4] log(REV) \~ log(LBC) + log(SC) + AGE + INC

[5] log(REV) \~ LBC + log(SC) + AGE + INC

[6] log(REV) \~ log(LBC) + AGE + INC + COVID

```{r}
formulas <- c("log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)",
              "log(REV) ~ LBC + log(SC) + AGE + log(INC)",
              "log(REV) ~ log(LBC) + AGE + COVID",
              "log(REV) ~ log(LBC) + log(SC) + AGE + INC",
              "log(REV) ~ LBC + log(SC) + AGE + INC",
              "log(REV) ~ log(LBC) + AGE + INC + COVID")
for(i in 1:6){
  model <- lm(formula = formulas[i])
  print(formulas[i])
  print(summary(model))
}


```

=\> Model 1 is the most efficient, with exactly one variable log(LBC) has p-val of t-test = 0.059312 \> 0.05. This is acceptable if takes alpha = 6%.

```{r}
loglogmodel <- lm(log(REV) ~ log(LBC) + log(SC) + AGE + log(INC))
summary(loglogmodel)
```

## 4. Quadratic functions:

Through testing through formulas, we found the most efficient quadratic model:

```{r}
quadraticmodel <- lm(REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2))
summary(quadraticmodel )
```

## MOST EFFICIENT MODEL:

We use this code block to fill in the data of Table 04. Estimated Result of our report

```{r}
modelana <- function(x){
  m <- lm(formula = x)
  print(x)
  print(summary(m))
  print(resettest(m,power = 2:3, type = "fitted"))
  print(white_test(m))
  print(jarque.bera.test(m$residuals))
  print(bgtest(m))
  rmse(REV, m$fitted.values)
  mae(REV, m$fitted.values)
  mape(REV,m$fitted.values)
}
formulas <- c("REV ~ LBC + AGE + COVID",
              "REV ~ log(LBC) + log(SC) + log(INC) + COVID",
              "log(REV) ~ log(LBC) + log(SC) + AGE + log(INC)",
              "REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2)")
for(i in formulas){
  modelana(i)
}

```

We choose model 4 - quadratic model to be the main model to analyze

# V. Model analyze

```{r}
summary(quadraticmodel)

```

#### Is COVID affecting REV?

Test for omitted data:

```{r}
fullmodel <- lm(REV ~ LBC + SC + AGE + I(INC^2) + I(AGE^2) + COVID)
anova(fullmodel,quadraticmodel)
```

Anova of REV on COVID:

```{r}
ano <- aov(REV~COVID, data = df)
summary(ano)
```

#### RAMSEY TEST:

```{r}
resettest(quadraticmodel,power = 2:3, type = "fitted")
```

#### WHITE TEST:

```{r}
white_test(quadraticmodel)
```

#### NORMAL JB TEST:

```{r}
resid <- quadraticmodel$residuals
jarque.bera.test(resid)
print(mean(resid))
print(median(resid))
print(max(resid))
print(min(resid))
print(sd(resid))
print(skewness(resid))
print(kurtosis(resid))

ggplot(df, aes(resid)) +
  geom_histogram(fill = "navy", color = "white", bins = 10) +
  theme_minimal() +
  labs(title = "Histogram of Residuals", x = "", y = "") +
  theme(plot.title = element_text(hjust = 0.5))
```

#### BG TEST:

```{r}
bgtest(quadraticmodel, order = 1)
bgtest(quadraticmodel, order = 2)
```

#### VIF:

```{r}
vif(quadraticmodel)
```

```{r}
model <- lm( QUAREV ~ EXCHANGE + GDPPERCA + INCOME + I(EXCHANGE^2) + I(INCOME^2) + I(QUARTIL^2 ))
summary(model)
reset(model)
vif(model)
mape(QUAREV,model$fitted.values)
white_test(model)
jarque.bera.test(model$residuals)

a <- df1
ans <- 0
for(i in 1:6){
  print(mean(a[[i]]))
  print(median(a[[i]]))
  print(max(a[[i]]))
  print(min(a[[i]]))
  print(sd(a[[i]]))
  print(skewness(a[[i]]))
  print(kurtosis(a[[i]]))
  print(jarque.bera.test(a[[i]])

}

bgtest(model, order = 1)
bgtest(fmodel, order = 2)


ggplot(df, aes(x = fmodel$residuals)) +
  geom_histogram(fill = "navy", color = "white", bins = 10) +
  theme_minimal() +
  labs(title = "Histogram of Residuals", x = "", y = "") +
  theme(plot.title = element_text(hjust = 0.5))
```

```{r}
m1 <- lm(QUAREV ~ COVID + EXCHANGE +  GDPPERCA + WAGE + INCOME)
summary(m1)
vif(m1)
```
