Computational Statistics

Ronald Wesonga (Ph.D)

21 Sep 2022

Exploratory Statistics

str(trees)
## 'data.frame':    31 obs. of  3 variables:
##  $ Girth : num  8.3 8.6 8.8 10.5 10.7 10.8 11 11 11.1 11.2 ...
##  $ Height: num  70 65 63 72 81 83 66 75 80 75 ...
##  $ Volume: num  10.3 10.3 10.2 16.4 18.8 19.7 15.6 18.2 22.6 19.9 ...
head(trees)
##   Girth Height Volume
## 1   8.3     70   10.3
## 2   8.6     65   10.3
## 3   8.8     63   10.2
## 4  10.5     72   16.4
## 5  10.7     81   18.8
## 6  10.8     83   19.7
kable(colMeans(trees[,1:3])) # sapply(trees, mean)
x
Girth 13.24839
Height 76.00000
Volume 30.17097
kable(cov(trees[,1:3]))      # sapply(trees, sd)
Girth Height Volume
Girth 9.847914 10.38333 49.88812
Height 10.383333 40.60000 62.66000
Volume 49.888118 62.66000 270.20280
kable(cor(trees[,1:3]))
Girth Height Volume
Girth 1.0000000 0.5192801 0.9671194
Height 0.5192801 1.0000000 0.5982497
Volume 0.9671194 0.5982497 1.0000000
plot(trees, cex = 1.5, pch = 16, col = "red") 

pairs(trees)

scatterplot(trees$Girth ~ trees$Volume, main="Scatterplot")

scatterplot3d(trees[,1:3], main="3D Scatterplot")

# Chernoff Faces
faces(trees[,1:3])

## effect of variables:
##  modified item       Var     
##  "height of face   " "Girth" 
##  "width of face    " "Height"
##  "structure of face" "Volume"
##  "height of mouth  " "Girth" 
##  "width of mouth   " "Height"
##  "smiling          " "Volume"
##  "height of eyes   " "Girth" 
##  "width of eyes    " "Height"
##  "height of hair   " "Volume"
##  "width of hair   "  "Girth" 
##  "style of hair   "  "Height"
##  "height of nose  "  "Volume"
##  "width of nose   "  "Girth" 
##  "width of ear    "  "Height"
##  "height of ear   "  "Volume"
bagplot.pairs(trees, gap = 0, col.baghull = "green")

##    Girth Height Volume
## 1    8.3     70   10.3
## 2    8.6     65   10.3
## 3    8.8     63   10.2
## 4   10.5     72   16.4
## 5   10.7     81   18.8
## 6   10.8     83   19.7
## 7   11.0     66   15.6
## 8   11.0     75   18.2
## 9   11.1     80   22.6
## 10  11.2     75   19.9
## 11  11.3     79   24.2
## 12  11.4     76   21.0
## 13  11.4     76   21.4
## 14  11.7     69   21.3
## 15  12.0     75   19.1
## 16  12.9     74   22.2
## 17  12.9     85   33.8
## 18  13.3     86   27.4
## 19  13.7     71   25.7
## 20  13.8     64   24.9
## 21  14.0     78   34.5
## 22  14.2     80   31.7
## 23  14.5     74   36.3
## 24  16.0     72   38.3
## 25  16.3     77   42.6
## 26  17.3     81   55.4
## 27  17.5     82   55.7
## 28  17.9     80   58.3
## 29  18.0     80   51.5
## 30  18.0     80   51.0
## 31  20.6     87   77.0

Multiple Regression Modeling

m1 = lm(Volume~Girth,data=trees)
summary(m1)
## 
## Call:
## lm(formula = Volume ~ Girth, data = trees)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.065 -3.107  0.152  3.495  9.587 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -36.9435     3.3651  -10.98 7.62e-12 ***
## Girth         5.0659     0.2474   20.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.252 on 29 degrees of freedom
## Multiple R-squared:  0.9353, Adjusted R-squared:  0.9331 
## F-statistic: 419.4 on 1 and 29 DF,  p-value: < 2.2e-16
m2 = lm(Volume~Girth+Height,data=trees)
summary(m2)
## 
## Call:
## lm(formula = Volume ~ Girth + Height, data = trees)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4065 -2.6493 -0.2876  2.2003  8.4847 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -57.9877     8.6382  -6.713 2.75e-07 ***
## Girth         4.7082     0.2643  17.816  < 2e-16 ***
## Height        0.3393     0.1302   2.607   0.0145 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.882 on 28 degrees of freedom
## Multiple R-squared:  0.948,  Adjusted R-squared:  0.9442 
## F-statistic:   255 on 2 and 28 DF,  p-value: < 2.2e-16
m3 = lm(Volume~Girth*Height,data=trees)
summary(m3)
## 
## Call:
## lm(formula = Volume ~ Girth * Height, data = trees)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5821 -1.0673  0.3026  1.5641  4.6649 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  69.39632   23.83575   2.911  0.00713 ** 
## Girth        -5.85585    1.92134  -3.048  0.00511 ** 
## Height       -1.29708    0.30984  -4.186  0.00027 ***
## Girth:Height  0.13465    0.02438   5.524 7.48e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.709 on 27 degrees of freedom
## Multiple R-squared:  0.9756, Adjusted R-squared:  0.9728 
## F-statistic: 359.3 on 3 and 27 DF,  p-value: < 2.2e-16
m4 = lm(log(Volume)~log(Girth)+log(Height),data=trees)
summary(m4)
## 
## Call:
## lm(formula = log(Volume) ~ log(Girth) + log(Height), data = trees)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.168561 -0.048488  0.002431  0.063637  0.129223 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.63162    0.79979  -8.292 5.06e-09 ***
## log(Girth)   1.98265    0.07501  26.432  < 2e-16 ***
## log(Height)  1.11712    0.20444   5.464 7.81e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08139 on 28 degrees of freedom
## Multiple R-squared:  0.9777, Adjusted R-squared:  0.9761 
## F-statistic: 613.2 on 2 and 28 DF,  p-value: < 2.2e-16
confint(m4)
##                 2.5 %    97.5 %
## (Intercept) -8.269912 -4.993322
## log(Girth)   1.828998  2.136302
## log(Height)  0.698353  1.535894

Now add the interaction term.

m5 = lm(log(Volume)~log(Girth)*log(Height),data=trees)
summary(m5)
## 
## Call:
## lm(formula = log(Volume) ~ log(Girth) * log(Height), data = trees)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.165941 -0.048613  0.006384  0.062204  0.132295 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)             -3.6869     7.6996  -0.479    0.636
## log(Girth)               0.7942     3.0910   0.257    0.799
## log(Height)              0.4377     1.7788   0.246    0.808
## log(Girth):log(Height)   0.2740     0.7124   0.385    0.704
## 
## Residual standard error: 0.08265 on 27 degrees of freedom
## Multiple R-squared:  0.9778, Adjusted R-squared:  0.9753 
## F-statistic: 396.4 on 3 and 27 DF,  p-value: < 2.2e-16
m6 = lm(log(Volume)-log((Girth^2)*Height)~1,data=trees)
summary(m6)
## 
## Call:
## lm(formula = log(Volume) - log((Girth^2) * Height) ~ 1, data = trees)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.168446 -0.047355 -0.003518  0.066308  0.136467 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.16917    0.01421  -434.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0791 on 30 degrees of freedom
m7 = lm(Volume~0+I(Girth^2):Height,data=trees)
summary(m7)
## 
## Call:
## lm(formula = Volume ~ 0 + I(Girth^2):Height, data = trees)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6696 -1.0832 -0.3341  1.6045  4.2944 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## I(Girth^2):Height 2.108e-03  2.722e-05   77.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.455 on 30 degrees of freedom
## Multiple R-squared:  0.995,  Adjusted R-squared:  0.9949 
## F-statistic:  5996 on 1 and 30 DF,  p-value: < 2.2e-16

Model Selection

plot(m6)

## hat values (leverages) are all = 0.03225806
##  and there are no factor predictors; no plot no. 5

plot(m7)

shapiro.test(residuals(m6))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(m6)
## W = 0.97013, p-value = 0.5225
shapiro.test(residuals(m7))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(m7)
## W = 0.95846, p-value = 0.2655
summary(m1)
## 
## Call:
## lm(formula = Volume ~ Girth, data = trees)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.065 -3.107  0.152  3.495  9.587 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -36.9435     3.3651  -10.98 7.62e-12 ***
## Girth         5.0659     0.2474   20.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.252 on 29 degrees of freedom
## Multiple R-squared:  0.9353, Adjusted R-squared:  0.9331 
## F-statistic: 419.4 on 1 and 29 DF,  p-value: < 2.2e-16
AIC(m1)
## [1] 181.6447
summary(m2)
## 
## Call:
## lm(formula = Volume ~ Girth + Height, data = trees)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4065 -2.6493 -0.2876  2.2003  8.4847 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -57.9877     8.6382  -6.713 2.75e-07 ***
## Girth         4.7082     0.2643  17.816  < 2e-16 ***
## Height        0.3393     0.1302   2.607   0.0145 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.882 on 28 degrees of freedom
## Multiple R-squared:  0.948,  Adjusted R-squared:  0.9442 
## F-statistic:   255 on 2 and 28 DF,  p-value: < 2.2e-16
AIC(m2)
## [1] 176.91
summary(m3)
## 
## Call:
## lm(formula = Volume ~ Girth * Height, data = trees)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5821 -1.0673  0.3026  1.5641  4.6649 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  69.39632   23.83575   2.911  0.00713 ** 
## Girth        -5.85585    1.92134  -3.048  0.00511 ** 
## Height       -1.29708    0.30984  -4.186  0.00027 ***
## Girth:Height  0.13465    0.02438   5.524 7.48e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.709 on 27 degrees of freedom
## Multiple R-squared:  0.9756, Adjusted R-squared:  0.9728 
## F-statistic: 359.3 on 3 and 27 DF,  p-value: < 2.2e-16
AIC(m3)
## [1] 155.4692
summary(m4)
## 
## Call:
## lm(formula = log(Volume) ~ log(Girth) + log(Height), data = trees)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.168561 -0.048488  0.002431  0.063637  0.129223 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.63162    0.79979  -8.292 5.06e-09 ***
## log(Girth)   1.98265    0.07501  26.432  < 2e-16 ***
## log(Height)  1.11712    0.20444   5.464 7.81e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08139 on 28 degrees of freedom
## Multiple R-squared:  0.9777, Adjusted R-squared:  0.9761 
## F-statistic: 613.2 on 2 and 28 DF,  p-value: < 2.2e-16
AIC(m4)
## [1] -62.71125
summary(m5)
## 
## Call:
## lm(formula = log(Volume) ~ log(Girth) * log(Height), data = trees)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.165941 -0.048613  0.006384  0.062204  0.132295 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)
## (Intercept)             -3.6869     7.6996  -0.479    0.636
## log(Girth)               0.7942     3.0910   0.257    0.799
## log(Height)              0.4377     1.7788   0.246    0.808
## log(Girth):log(Height)   0.2740     0.7124   0.385    0.704
## 
## Residual standard error: 0.08265 on 27 degrees of freedom
## Multiple R-squared:  0.9778, Adjusted R-squared:  0.9753 
## F-statistic: 396.4 on 3 and 27 DF,  p-value: < 2.2e-16
AIC(m5)
## [1] -60.88061
summary(m6)
## 
## Call:
## lm(formula = log(Volume) - log((Girth^2) * Height) ~ 1, data = trees)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.168446 -0.047355 -0.003518  0.066308  0.136467 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6.16917    0.01421  -434.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0791 on 30 degrees of freedom
AIC(m6)
## [1] -66.34198
summary(m7)
## 
## Call:
## lm(formula = Volume ~ 0 + I(Girth^2):Height, data = trees)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6696 -1.0832 -0.3341  1.6045  4.2944 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## I(Girth^2):Height 2.108e-03  2.722e-05   77.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.455 on 30 degrees of freedom
## Multiple R-squared:  0.995,  Adjusted R-squared:  0.9949 
## F-statistic:  5996 on 1 and 30 DF,  p-value: < 2.2e-16
AIC(m7)
## [1] 146.6447

Stepwise Regression

m8 = lm(Fertility~.,data=swiss)
summary(m8)
## 
## Call:
## lm(formula = Fertility ~ ., data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2743  -5.2617   0.5032   4.1198  15.3213 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      66.91518   10.70604   6.250 1.91e-07 ***
## Agriculture      -0.17211    0.07030  -2.448  0.01873 *  
## Examination      -0.25801    0.25388  -1.016  0.31546    
## Education        -0.87094    0.18303  -4.758 2.43e-05 ***
## Catholic          0.10412    0.03526   2.953  0.00519 ** 
## Infant.Mortality  1.07705    0.38172   2.822  0.00734 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.165 on 41 degrees of freedom
## Multiple R-squared:  0.7067, Adjusted R-squared:  0.671 
## F-statistic: 19.76 on 5 and 41 DF,  p-value: 5.594e-10
library(MASS)
summary(stepAIC(m8))
## Start:  AIC=190.69
## Fertility ~ Agriculture + Examination + Education + Catholic + 
##     Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## - Examination       1     53.03 2158.1 189.86
## <none>                          2105.0 190.69
## - Agriculture       1    307.72 2412.8 195.10
## - Infant.Mortality  1    408.75 2513.8 197.03
## - Catholic          1    447.71 2552.8 197.75
## - Education         1   1162.56 3267.6 209.36
## 
## Step:  AIC=189.86
## Fertility ~ Agriculture + Education + Catholic + Infant.Mortality
## 
##                    Df Sum of Sq    RSS    AIC
## <none>                          2158.1 189.86
## - Agriculture       1    264.18 2422.2 193.29
## - Infant.Mortality  1    409.81 2567.9 196.03
## - Catholic          1    956.57 3114.6 205.10
## - Education         1   2249.97 4408.0 221.43
## 
## Call:
## lm(formula = Fertility ~ Agriculture + Education + Catholic + 
##     Infant.Mortality, data = swiss)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.6765  -6.0522   0.7514   3.1664  16.1422 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      62.10131    9.60489   6.466 8.49e-08 ***
## Agriculture      -0.15462    0.06819  -2.267  0.02857 *  
## Education        -0.98026    0.14814  -6.617 5.14e-08 ***
## Catholic          0.12467    0.02889   4.315 9.50e-05 ***
## Infant.Mortality  1.07844    0.38187   2.824  0.00722 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.168 on 42 degrees of freedom
## Multiple R-squared:  0.6993, Adjusted R-squared:  0.6707 
## F-statistic: 24.42 on 4 and 42 DF,  p-value: 1.717e-10

Non-Linear Multiple Regression Models

\(y = \beta_0x_1^{\beta_1}x_2^{\beta_2}+\varepsilon\)

Parameters for non-linear models may be estimated using the nls package in R.

volume = trees$Volume
height = trees$Height
girth = trees$Girth
m9 = nls(volume~beta0*girth^beta1*height^beta2,start=list(beta0=1,beta1=2,beta2=1))
summary(m9)
## 
## Formula: volume ~ beta0 * girth^beta1 * height^beta2
## 
## Parameters:
##       Estimate Std. Error t value Pr(>|t|)    
## beta0 0.001449   0.001367   1.060 0.298264    
## beta1 1.996921   0.082077  24.330  < 2e-16 ***
## beta2 1.087647   0.242159   4.491 0.000111 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.533 on 28 degrees of freedom
## 
## Number of iterations to convergence: 5 
## Achieved convergence tolerance: 8.255e-07

Practical Exercise

Real-life Data Problem

Use your own local data to develop a regression predictive model. Follow the ideas as demonstrated in the lecture to import the data in RStudio, develop your own regression analysis plan and objectives. Analyse the data and write a statistical report in rmarkdown. Submit your well interpreted report by 27 September 2022.