Using Simple Linear Regression

These assumptions will also apply to the exponential and logistic models below, since both can be estimated with OLS after being transformed into a linear form.

Exponential model

y_t = a * exp(b * time)

Applying the logarithm:

ln(y_t) = ln(a) + b * time. This linear form can be estimated using OLS.

sales<-c(1248.2, 1643.2, 2445, 3376.3, 4666.9, 6400.9, 8451.5, 11909.9, 15969.3, 20649, 32602)
tsales<-1:length(sales) #time index 1, 2, ..., 11

dfsales<-data.frame(sales=sales,tsales=tsales)
plot(dfsales)

Clearly, the graph shows a curved pattern. Let's apply the log transformation and fit the regression:

lnsales<-log(dfsales$sales)
dfsales$lnsales<-lnsales #adding the lnsales to our dataframe
head(dfsales)
##    sales tsales  lnsales
## 1 1248.2      1 7.129458
## 2 1643.2      2 7.404401
## 3 2445.0      3 7.801800
## 4 3376.3      4 8.124536
## 5 4666.9      5 8.448250
## 6 6400.9      6 8.764194
plot(dfsales$tsales, dfsales$lnsales)

results2<-lm(lnsales~tsales, data=dfsales) #applying regression
summary(results2)
## 
## Call:
## lm(formula = lnsales ~ tsales, data = dfsales)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.080830 -0.016312  0.008552  0.027310  0.056037 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.81786    0.02747  248.22  < 2e-16 ***
## tsales       0.31984    0.00405   78.98 4.24e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04248 on 9 degrees of freedom
## Multiple R-squared:  0.9986, Adjusted R-squared:  0.9984 
## F-statistic:  6237 on 1 and 9 DF,  p-value: 4.235e-14
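As a quick aside (not in the original example), confidence intervals for the coefficients on the log scale can be obtained with confint(); exponentiating the intercept's interval gives an approximate interval for a on the original scale.

confint(results2) #95% CIs for ln(a) and b on the transformed scale
exp(confint(results2)[1,]) #approximate 95% CI for a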

Now let’s graphically check whether the regression assumptions hold:

plot(results2) #check the fitted vs. residuals plot for the constant error variance assumption, and the normal Q-Q plot for normality

plot(dfsales$tsales, results2$residuals, type="l") #to check independence of error terms

acf(results2$residuals) # we can check acf as well
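Beyond the graphical checks, a couple of formal tests could be run as well; this is a sketch of my own, not part of the original example, and the Box-Ljung lag of 5 is an arbitrary choice.

shapiro.test(results2$residuals) #H0: residuals are normally distributed
Box.test(results2$residuals, lag=5, type="Ljung-Box") #H0: no autocorrelation up to lag 5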

Now we have estimates for the “transformed” data. Let’s transform the estimated coefficients back to the original scale: since we applied the logarithm, the intercept estimate is ln(a), so we exponentiate it to recover a; the slope b is used as is.

a<-exp(results2$coefficients[1])
b<-results2$coefficients[2]
print(a)
## (Intercept) 
##    914.0248
print(b)
##    tsales 
## 0.3198394

Now let us visualize the actual data and superimpose the fitted values.

fit<-a*exp(b*(1:11)) #compute the fitted values from scratch; alternatively, take the fitted values of the linear-form model and exponentiate them
fit1<-exp(results2$fitted.values) #the fitted values of lm() are stored in "fitted.values"

dfsales$fitted<-fit  #add the fitted values to our dataframe
dfsales$fitted_1<-fit1 #this is just equal to fit :)

plot(dfsales$tsales, dfsales$sales)
lines(dfsales$tsales, dfsales$fitted, col="red")

Our model seems pretty accurate. Let’s now compute the in-sample MSE, MAPE, and MAE to quantify the performance of our model.

res<-dfsales$sales-dfsales$fitted
dfsales$res<-res
head(dfsales)
##    sales tsales  lnsales   fitted fitted_1       res
## 1 1248.2      1 7.129458 1258.527 1258.527 -10.32689
## 2 1643.2      2 7.404401 1732.874 1732.874 -89.67406
## 3 2445.0      3 7.801800 2386.006 2386.006  58.99416
## 4 3376.3      4 8.124536 3285.307 3285.307  90.99266
## 5 4666.9      5 8.448250 4523.562 4523.562 143.33843
## 6 6400.9      6 8.764194 6228.522 6228.522 172.37792
n<-length(res) #number of observations (11)
MSE<-sum(res^2)/n
MAPE<-sum(abs(res/dfsales$sales))/n
MAE<-sum(abs(res))/n

accuracy<-data.frame(MSE=MSE, MAPE=MAPE, MAE=MAE)
print(accuracy)
##        MSE       MAPE      MAE
## 1 578048.7 0.03196925 417.8768
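If we plan to compare several trend models, these three metrics can be wrapped in a small helper function. This is a sketch of my own (the function accuracy_metrics is not from the original example); it should reproduce the values above.

accuracy_metrics<-function(actual, fitted){ #hypothetical helper, not in the original
  res<-actual-fitted
  data.frame(MSE=mean(res^2), MAPE=mean(abs(res/actual)), MAE=mean(abs(res)))
}
accuracy_metrics(dfsales$sales, dfsales$fitted)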

We now check whether the error terms of the back-transformed model have constant variance, along with the other diagnostic plots (not the fitted linear form, which we already examined with the plot(results2) function).

#constant variance
plot(dfsales$fitted, dfsales$res) 

#Normal QQ PLOT
qqnorm(dfsales$res)
qqline(dfsales$res)

plot(dfsales$sales, dfsales$res, type="l")

Based on the first graph, it seems that the variance is not constant, and there are some outliers in the normal Q-Q plot.

Note: white noise only requires uncorrelated error terms, and uncorrelated does not in general imply independent (for Gaussian white noise, however, uncorrelated errors are also independent).
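Before moving on to the next model, here is a small sketch of how the back-transformed coefficients could be used to extrapolate the exponential trend one period ahead (t = 12). This forecast is purely illustrative and is not part of the original example.

t_new<-12 #hypothetical next period
a*exp(b*t_new) #illustrative one-step-ahead extrapolation of the exponential trend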

Logistic Model

Our time series trend has an S shape. This is Example 5.3 in our discussion.

y_t = L / (1 + exp(a + b * time + e_t))

loans<-c(2611, 2951.6, 3459.5, 3459.5, 4085.1, 9053.5, 13800, 15125, 18673, 21579, 25787, 25690)
t<-1:length(loans) #I used length() because sometimes I'm careless and miss encoding some data points :D

dfloans<-data.frame(loans=loans,t=t)
plot(dfloans$t, dfloans$loans)
lines(dfloans$t, dfloans$loans, col="red") #connect the points to make the S-shaped trend easier to see

Now let us transform. Since y_t = L / (1 + exp(a + b * time + e_t)), rearranging gives L/y_t - 1 = exp(a + b * time + e_t), so ln(L/y_t - 1) = a + b * time + e_t, which is linear in time. Use L = 30000.

tLoans<-log(30000/dfloans$loans-1)
dfloans$tLoans<-tLoans
head(dfloans)
##    loans t    tLoans
## 1 2611.0 1 2.3504082
## 2 2951.6 2 2.2152805
## 3 3459.5 3 2.0375478
## 4 3459.5 4 2.0375478
## 5 4085.1 5 1.8474719
## 6 9053.5 6 0.8388201
results3<-lm(tLoans~t, data=dfloans)
summary(results3)
## 
## Call:
## lm(formula = tLoans ~ t, data = dfloans)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52302 -0.18074  0.01508  0.08609  0.67393 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.29840    0.21388   15.42 2.68e-08 ***
## t           -0.42497    0.02906  -14.62 4.46e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3475 on 10 degrees of freedom
## Multiple R-squared:  0.9553, Adjusted R-squared:  0.9509 
## F-statistic: 213.8 on 1 and 10 DF,  p-value: 4.462e-08

Now let us compute the fitted values on the original scale.

a<-results3$coefficients[1]
b<-results3$coefficients[2]
fit<-30000/(1+exp(a+b*(1:length(loans)))) #back-transform: plug the estimates into the logistic equation with L=30000
dfloans$fit<-fit
plot(dfloans$t, dfloans$loans, type="l")
lines(dfloans$t, dfloans$fit, type="l", col="red")
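As with the exponential model, we could quantify the in-sample fit; this is a sketch mirroring the earlier MSE/MAPE/MAE computation and is not part of the original example.

res_loans<-dfloans$loans-dfloans$fit #in-sample residuals of the logistic fit
n<-length(res_loans)
data.frame(MSE=sum(res_loans^2)/n, MAPE=sum(abs(res_loans/dfloans$loans))/n, MAE=sum(abs(res_loans))/n)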