I. Initialization

Below are all the packages used in the analysis:

library(readxl)
library(zoo)
library(lmtest)
library(broom)
library(car)
library(carData)
library(whitestrap)
library(tseries)
library(corrplot)
library(ggcorrplot)
library(e1071)
library(tseries)

Read the data, assign the variables, and convert their units.

# Load the workbook, then pull the model variables out by column position.
data <- read_xlsx("C:/Users/pptha/Downloads/thuktl.xlsx")

# Columns 2-4 are divided by 1e12 and column 6 by 1e5; columns 5 and 7 are
# used exactly as stored.
REV <- data[[2]] / 1e12
LBC <- data[[3]] / 1e12
SC <- data[[4]] / 1e12
AGE <- data[[5]]
INC <- data[[6]] / 1e5
COVID <- data[[7]]

# Collect the converted variables in a single data frame for modelling.
df <- data.frame(REV, LBC, SC, AGE, INC, COVID)

Setting up functions to calculate RMSE, MAE, MAPE:

# Mean absolute percentage error, expressed in percent.
#
# actual:    numeric vector of observed values.
# predicted: numeric vector of predicted values, same length as `actual`.
# Returns a single number: mean(|(actual - predicted) / actual|) * 100.
# Stops if the two vectors differ in length, because R would otherwise recycle
# the shorter one silently and return a meaningless figure.
# Zeros in `actual` produce Inf/NaN because of the division.
mape <- function(actual, predicted) {
  if (length(actual) != length(predicted)) {
    stop("Vectors 'actual' and 'predicted' must have the same length.")
  }
  mean(abs((actual - predicted) / actual)) * 100
}
# Root mean squared error.
#
# actual:    numeric vector of observed values.
# predicted: numeric vector of predicted values, same length as `actual`.
# Returns a single number: sqrt(mean((actual - predicted)^2)).
# Stops if the two vectors differ in length, because R would otherwise recycle
# the shorter one silently and return a meaningless figure.
rmse <- function(actual, predicted) {
  if (length(actual) != length(predicted)) {
    stop("Vectors 'actual' and 'predicted' must have the same length.")
  }
  sqrt(mean((actual - predicted)^2))
}
# Mean absolute error.
#
# actual:    numeric vector of observed values.
# predicted: numeric vector of predicted values.
# Returns the average of |actual - predicted|; stops when the lengths differ.
mae <- function(actual, predicted) {
  lengths_match <- length(actual) == length(predicted)
  if (!lengths_match) {
    stop("Vectors 'actual' and 'predicted' must have the same length.")
  }
  abs_errors <- abs(actual - predicted)
  mean(abs_errors)
}

II. Descriptive statistics

# Descriptive statistics for a single numeric variable.
#
# x: numeric vector.
# Returns a named numeric vector of length 9, in this order: mean, median,
# max, min, standard deviation, skewness, kurtosis, Jarque-Bera statistic and
# Jarque-Bera p-value. The element positions are unchanged; names are added so
# the printed output is self-describing.
desstat <- function(x) {
  # Test the variable that was passed in (previously the global REV was tested
  # for every column) and run the test only once.
  jb <- jarque.bera.test(x)
  c(
    mean = mean(x),
    median = median(x),
    max = max(x),
    min = min(x),
    sd = sd(x),
    skewness = skewness(x),
    kurtosis = kurtosis(x),
    jb_statistic = unname(jb$statistic),
    jb_p_value = unname(jb$p.value)
  )
}


# Print the descriptive statistics of every column of `df`, one column at a
# time. Iterating over the column names instead of a fixed 1:6 keeps the loop
# correct if columns are added to or removed from `df`.
for (col_name in colnames(df)) {
  print(col_name)
  print(desstat(df[[col_name]]))
}

III. Correlation matrix

# Pairwise correlations between all columns of `df`.
cor_matrix <- cor(df)

# Plot the full correlation matrix with labelled cells, reordered via
# hc.order, on a red-white-blue colour scale.
p <- ggcorrplot(
  cor_matrix,
  type = "full",
  hc.order = TRUE,
  lab = TRUE,
  outline.color = "gray",
  colors = c("red", "white", "blue")
)
p

IV. Model choosing

1. Linear model:



# Fit a linear model for every non-empty subset of candidate regressors and
# print the ones that pass the screening rules.
#
# a:    significance level (alpha) used for both tests below.
# b:    minimum R-squared a model must reach to be reported.
# vars: character vector of right-hand-side terms, e.g. c("LBC", "log(SC)").
# dep:  left-hand side of the formula as a string, e.g. "REV" or "log(REV)".
#
# A model is printed when all three conditions hold:
#   * RESET test p-value >= a,
#   * overall F-test p-value <= a,
#   * R-squared >= b.
# Models are fitted on the global data frame `df`. The function is called for
# its printed output and returns NULL invisibly.
crawlmodel <- function(a, b, vars, dep) {
  n <- length(vars)

  # n candidate terms give 2^n - 1 non-empty subsets. seq_len() makes the loop
  # a no-op when `vars` is empty, whereas 1:0 would iterate over c(1, 0) and
  # fail on a malformed formula.
  for (i in seq_len(2^n - 1)) {
    # The low n bits of i act as an include/exclude mask over `vars`.
    bits <- as.logical(rev(intToBits(i)[1:n]))
    selected_vars <- vars[bits]

    # Build "dep ~ term1 + term2 + ..." from the selected terms.
    formula_str <- paste(paste(dep), "~", paste(selected_vars, collapse = " + "))
    model <- lm(as.formula(formula_str), data = df)

    # Summarise and test once, then reuse the results for both the filter and
    # the printed report.
    model_summary <- summary(model)
    reset_result <- resettest(model)

    # p-value of the overall F-test, from the upper tail of the F distribution.
    f_stat <- model_summary$fstatistic
    f_p_value <- pf(
      f_stat["value"], f_stat["numdf"], f_stat["dendf"],
      lower.tail = FALSE
    )

    # All three conditions are scalars, so use short-circuit && rather than
    # the elementwise &.
    if (reset_result$p.value >= a &&
          f_p_value <= a &&
          model_summary$r.squared >= b) {
      print(formula_str)
      print(model_summary)
      print(reset_result)
    }
  }

  invisible(NULL)
}
# Baseline search: REV in levels against every subset of the five regressors,
# with alpha = 0.05 and a minimum R-squared of 0.5.
varstring <- c("LBC", "SC", "AGE", "INC", "COVID")
depvar <- "REV"
crawlmodel(a = 0.05, b = 0.5, vars = varstring, dep = depvar)

[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

When requiring a p-value >= 0.05 for the Ramsey RESET test, only one model satisfies the conditions. An alternative is to use backward elimination:

# Full linear model with all five regressors. No `data =` argument is passed,
# so the variables are the standalone vectors REV, LBC, SC, AGE, INC and COVID.
model <- lm( REV ~  LBC + SC + AGE + INC + COVID)
# Backward selection starting from the full model; the trace it prints compares
# candidate models by AIC.
step(model, direction = "backward")

Start:  AIC=94.95
REV ~ LBC + SC + AGE + INC + COVID

        Df Sum of Sq    RSS     AIC
- INC    1     0.415 290.28  93.013
- SC     1     9.650 299.52  94.391
<none>               289.87  94.950
- AGE    1    17.094 306.96  95.471
- COVID  1    34.779 324.65  97.936
- LBC    1   107.563 397.43 106.836

Step:  AIC=93.01
REV ~ LBC + SC + AGE + COVID

        Df Sum of Sq    RSS     AIC
- SC     1     9.263 299.54  92.395
<none>               290.28  93.013
- COVID  1    35.636 325.92  96.108
- LBC    1   115.653 405.93 105.768
- AGE    1   118.578 408.86 106.084

Step:  AIC=92.4
REV ~ LBC + AGE + COVID

        Df Sum of Sq     RSS     AIC
<none>                299.54  92.395
- COVID  1     28.18  327.73  94.351
- LBC    1    233.94  533.48 115.790
- AGE    1    941.67 1241.21 152.945

Call:
lm(formula = REV ~ LBC + AGE + COVID)

Coefficients:
(Intercept)          LBC          AGE        COVID  
   -27.6397       3.3180       0.7419      -3.2279

=> Result of backward procedure: REV ~ LBC + AGE + COVID

# Refit the specification kept by the backward procedure and show its
# coefficient table and fit statistics.
linearmodel <- lm(REV ~ LBC + AGE + COVID)
summary(linearmodel)


Call:
lm(formula = REV ~ LBC + AGE + COVID)

Residuals:
    Min      1Q  Median      3Q     Max 
-7.8455 -1.1692 -0.1513  1.6782  6.3617 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -27.63973    3.42949  -8.059 6.57e-10 ***
LBC           3.31798    0.59364   5.589 1.78e-06 ***
AGE           0.74191    0.06616  11.214 6.42e-14 ***
COVID        -3.22792    1.66396  -1.940   0.0595 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.737 on 40 degrees of freedom
Multiple R-squared:  0.9378,    Adjusted R-squared:  0.9332 
F-statistic: 201.2 on 3 and 40 DF,  p-value: < 2.2e-16

2. Semi-log model

a. Log-Lin:

# Log-lin specification: log(REV) against every subset of the level regressors.
depvar <- "log(REV)"
varstring <- c("LBC", "SC", "AGE", "INC", "COVID")

# First with a minimum R-squared of 0.5 ...
crawlmodel(a = 0.05, b = 0.5, vars = varstring, dep = depvar)
# ... then with no R-squared floor, so only the two tests filter the models.
crawlmodel(a = 0.05, b = 0, vars = varstring, dep = depvar)

[1] "log(REV) ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.21109 -0.16666  0.02516  0.30368  0.82952 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   2.4654     0.1069  23.064  < 2e-16 ***
COVID         0.9648     0.1585   6.085 2.99e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.5237 on 42 degrees of freedom
Multiple R-squared:  0.4686,    Adjusted R-squared:  0.4559 
F-statistic: 37.03 on 1 and 42 DF,  p-value: 2.992e-07


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

=> Only one model, log(REV) ~ COVID, satisfies the conditions; however, its R-squared of 0.4686 is too low.

b. Lin - Log:


# LBC, SC and INC are the regressors that can be log-transformed. The seven
# vectors below cover every combination with at least one of them in logs.

# All three in logs.
varstring1 <- c("log(LBC)", "log(SC)", "AGE", "log(INC)", "COVID")

# Two of the three in logs.
varstring2 <- c("LBC", "log(SC)", "AGE", "log(INC)", "COVID")
varstring3 <- c("log(LBC)", "SC", "AGE", "log(INC)", "COVID")
varstring4 <- c("log(LBC)", "log(SC)", "AGE", "INC", "COVID")

# Exactly one in logs.
varstring5 <- c("LBC", "SC", "AGE", "log(INC)", "COVID")
varstring6 <- c("LBC", "log(SC)", "AGE", "INC", "COVID")
varstring7 <- c("log(LBC)", "SC", "AGE", "INC", "COVID")

Results for each log-transformation option of the independent variables. For varstring1:

# Lin-log search over the regressor set in varstring1, with REV in levels.
depvar <- "REV"
crawlmodel(a = 0.05, b = 0.5, vars = varstring1, dep = depvar)

[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

[1] "REV ~ log(LBC)"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-7.236 -4.528 -2.376  4.146 14.646 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  21.6685     0.9233  23.468  < 2e-16 ***
log(LBC)      9.3428     1.0037   9.309 9.11e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.12 on 42 degrees of freedom
Multiple R-squared:  0.6735,    Adjusted R-squared:  0.6658 
F-statistic: 86.65 on 1 and 42 DF,  p-value: 9.109e-12


    RESET test

data:  model
RESET = 1.1008, df1 = 2, df2 = 40, p-value = 0.3425

[1] "REV ~ log(LBC) + log(SC) + log(INC) + COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1087 -1.3251 -0.1116  1.3688  6.7143 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept) -69.3147    25.8577  -2.681  0.01071 * 
log(LBC)      1.3545     0.8284   1.635  0.11007   
log(SC)       5.4368     1.6110   3.375  0.00168 **
log(INC)     16.4622     5.1727   3.182  0.00286 **
COVID         1.0178     1.2655   0.804  0.42612   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.318 on 39 degrees of freedom
Multiple R-squared:  0.9565,    Adjusted R-squared:  0.9521 
F-statistic: 214.5 on 4 and 39 DF,  p-value: < 2.2e-16


    RESET test

data:  model
RESET = 3.2247, df1 = 2, df2 = 37, p-value = 0.05117

[1] "REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1321 -1.3081 -0.0483  1.1535  7.3288 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) -120.6483    53.8931  -2.239  0.03111 * 
log(LBC)       0.9628     0.9019   1.067  0.29250   
log(SC)        6.3865     1.8303   3.489  0.00124 **
AGE           -0.3420     0.3152  -1.085  0.28480   
log(INC)      30.0636    13.5578   2.217  0.03265 * 
COVID          2.1681     1.6488   1.315  0.19640   
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.313 on 38 degrees of freedom
Multiple R-squared:  0.9578,    Adjusted R-squared:  0.9523 
F-statistic: 172.6 on 5 and 38 DF,  p-value: < 2.2e-16


    RESET test

data:  model
RESET = 3.1037, df1 = 2, df2 = 36, p-value = 0.05708

=> Models that can be chosen are:

[1] “REV ~ log(LBC) + log(SC) + log(INC) + COVID”

[2] “REV ~ log(LBC) + log(SC) + AGE + log(INC) + COVID”

For varstring2:

 # Lin-log search over the regressor set in varstring2, with REV in levels.
 depvar <- "REV"
 crawlmodel(a = 0.05, b = 0.5, vars = varstring2, dep = depvar)

[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

[1] "REV ~ LBC + log(SC) + log(INC)"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1546 -1.2258 -0.4381  1.4164  6.9168 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -81.2107    22.1796  -3.662 0.000726 ***
LBC           1.2520     0.6699   1.869 0.068958 .  
log(SC)       5.5849     1.4049   3.975 0.000287 ***
log(INC)     18.4620     4.3354   4.258 0.000121 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.292 on 40 degrees of freedom
Multiple R-squared:  0.9564,    Adjusted R-squared:  0.9531 
F-statistic: 292.5 on 3 and 40 DF,  p-value: < 2.2e-16


    RESET test

data:  model
RESET = 3.2297, df1 = 2, df2 = 38, p-value = 0.05065

[1] "REV ~ LBC + log(SC) + log(INC) + COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1389 -1.2532 -0.3672  1.4695  6.7797 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -76.4451    28.0150  -2.729 0.009487 ** 
LBC           1.1799     0.7235   1.631 0.110981    
log(SC)       5.7110     1.4890   3.835 0.000446 ***
log(INC)     17.5219     5.4940   3.189 0.002812 ** 
COVID         0.3831     1.3484   0.284 0.777819    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.319 on 39 degrees of freedom
Multiple R-squared:  0.9565,    Adjusted R-squared:  0.952 
F-statistic: 214.4 on 4 and 39 DF,  p-value: < 2.2e-16


    RESET test

data:  model
RESET = 3.088, df1 = 2, df2 = 37, p-value = 0.05751

[1] "REV ~ LBC + log(SC) + AGE + log(INC) + COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-5.1447 -1.3675 -0.0311  1.2240  7.3622 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -124.4926    53.4899  -2.327 0.025370 *  
LBC            0.8220     0.7984   1.030 0.309721    
log(SC)        6.6004     1.7097   3.860 0.000426 ***
AGE           -0.3367     0.3195  -1.054 0.298594    
log(INC)      30.5236    13.5018   2.261 0.029588 *  
COVID          1.7109     1.8440   0.928 0.359368    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 2.315 on 38 degrees of freedom
Multiple R-squared:  0.9577,    Adjusted R-squared:  0.9522 
F-statistic: 172.2 on 5 and 38 DF,  p-value: < 2.2e-16


    RESET test

data:  model
RESET = 3.1136, df1 = 2, df2 = 36, p-value = 0.05659

=> Models that can be chosen are:

[3]“REV ~ LBC + log(SC) + log(INC) + COVID”

[4]“REV ~ LBC + log(SC) + AGE + log(INC) + COVID”

For varstring3:

 # Lin-log search over the regressor set in varstring3, with REV in levels.
 depvar <- "REV"
 crawlmodel(a = 0.05, b = 0.5, vars = varstring3, dep = depvar)

[1] "REV ~ COVID"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-10.903  -4.741   0.320   3.937  12.565 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   14.409      1.331  10.826 9.94e-14 ***
COVID         16.676      1.974   8.448 1.34e-10 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.52 on 42 degrees of freedom
Multiple R-squared:  0.6295,    Adjusted R-squared:  0.6207 
F-statistic: 71.37 on 1 and 42 DF,  p-value: 1.338e-10


    RESET test

data:  model
RESET = 0, df1 = 2, df2 = 40, p-value = 1

[1] "REV ~ log(LBC)"

Call:
lm(formula = as.formula(formula_str), data = df)

Residuals:
   Min     1Q Median     3Q    Max 
-7.236 -4.528 -2.376  4.146 14.646 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  21.6685     0.9233  23.468  < 2e-16 ***
log(LBC)      9.3428     1.0037   9.309 9.11e-12 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6.12 on 42 degrees of freedom
Multiple R-squared:  0.6735,    Adjusted R-squared:  0.6658 
F-statistic: 86.65 on 1 and 42 DF,  p-value: 9.109e-12


    RESET test

data:  model
RESET = 1.1008, df1 = 2, df2 = 40, p-value = 0.3425