Question 10.1a

Using the same crime data set uscrime.txt as in Questions 8.2 and 9.1, find the best model you can using

  1. a regression tree model

require(ggthemes)
library(tidyverse)
library(magrittr)
library(TTR)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(plotly)
library(fpp2)   
library(forecast)
library(caTools)
library(reshape2)
library(psych) 
require(graphics)
require(Matrix)
library(corrplot)
library(mltools)
library(fBasics)
library(kableExtra)
library(DMwR)
library(caret)
library(gridExtra)
library(DAAG)
library(rpart)
library(randomForest)
library(tree)
library(ISLR)
library(rpart.plot)
library(rattle)
library(data.table)
library(InformationValue)
library("mlr3")
library("mlr3learners")
df <- read.csv(file="uscrime.csv",stringsAsFactors = F, header=T) 
head(df,2)
##      M So   Ed  Po1 Po2    LF   M.F Pop   NW    U1  U2 Wealth Ineq     Prob
## 1 15.1  1  9.1  5.8 5.6 0.510  95.0  33 30.1 0.108 4.1   3940 26.1 0.084602
## 2 14.3  0 11.3 10.3 9.5 0.583 101.2  13 10.2 0.096 3.6   5570 19.4 0.029599
##      Time Crime
## 1 26.2011   791
## 2 25.2999  1635
#Statistical Summary of original data
df_unistats<- basicStats(df)[c("Minimum", "Maximum", "1. Quartile", "3. Quartile", "Mean", "Median",
                               "Variance", "Stdev", "Skewness", "Kurtosis"), ] %>% t() %>% as.data.frame()

kable(df_unistats)
Variable Minimum Maximum 1. Quartile 3. Quartile Mean Median Variance Stdev Skewness Kurtosis
M 11.9000 17.700000 13.000000 14.60000 13.857447 13.6000 1.579454e+00 1.256763 0.821917 0.377753
So 0.0000 1.000000 0.000000 1.00000 0.340426 0.0000 2.294170e-01 0.478975 0.652139 -1.607569
Ed 8.7000 12.200000 9.750000 11.45000 10.563830 10.8000 1.251489e+00 1.118700 -0.318987 -1.149253
Po1 4.5000 16.600000 6.250000 10.45000 8.500000 7.8000 8.832174e+00 2.971897 0.890124 0.162309
Po2 4.1000 15.700000 5.850000 9.70000 8.023404 7.3000 7.818353e+00 2.796132 0.844367 0.008590
LF 0.4800 0.641000 0.530500 0.59300 0.561191 0.5600 1.633000e-03 0.040412 0.270675 -0.888732
M.F 93.4000 107.100000 96.450000 99.20000 98.302128 97.7000 8.683256e+00 2.946737 0.993223 0.652010
Pop 3.0000 168.000000 10.000000 41.50000 36.617021 25.0000 1.449415e+03 38.071188 1.854230 3.078936
NW 0.2000 42.300000 2.400000 13.25000 10.112766 7.6000 1.057377e+02 10.282882 1.379966 1.077648
U1 0.0700 0.142000 0.080500 0.10400 0.095468 0.0920 3.250000e-04 0.018029 0.774876 -0.131208
U2 2.0000 5.800000 2.750000 3.85000 3.397872 3.4000 7.132560e-01 0.844545 0.542264 0.173008
Wealth 2880.0000 6890.000000 4595.000000 5915.00000 5253.829787 5370.0000 9.310502e+05 964.909442 -0.381952 -0.613169
Ineq 12.6000 27.600000 16.550000 22.75000 19.400000 17.6000 1.591696e+01 3.989606 0.367063 -1.138824
Prob 0.0069 0.119804 0.032701 0.05445 0.047091 0.0421 5.170000e-04 0.022737 0.883336 0.749579
Time 12.1996 44.000400 21.600350 30.45075 26.597921 25.8006 5.022408e+01 7.086895 0.371275 -0.413474
Crime 342.0000 1993.000000 658.500000 1057.50000 905.085106 831.0000 1.495854e+05 386.762697 1.053927 0.777628
# Visualizations via Box plots 
meltData <- melt(df)

p <- ggplot(meltData, aes(factor(variable), value)) 
p + geom_boxplot() + facet_wrap(~variable, scale="free")+theme_economist()+scale_colour_economist()

#density plots of original data
density <- df %>%
  gather() %>%                         
  ggplot(aes(value)) +                 
    facet_wrap(~ key, scales = "free") +
    geom_density()+theme_economist()+scale_colour_economist()
density

str(df) #structure of data
## 'data.frame':    47 obs. of  16 variables:
##  $ M     : num  15.1 14.3 14.2 13.6 14.1 12.1 12.7 13.1 15.7 14 ...
##  $ So    : int  1 0 1 0 0 0 1 1 1 0 ...
##  $ Ed    : num  9.1 11.3 8.9 12.1 12.1 11 11.1 10.9 9 11.8 ...
##  $ Po1   : num  5.8 10.3 4.5 14.9 10.9 11.8 8.2 11.5 6.5 7.1 ...
##  $ Po2   : num  5.6 9.5 4.4 14.1 10.1 11.5 7.9 10.9 6.2 6.8 ...
##  $ LF    : num  0.51 0.583 0.533 0.577 0.591 0.547 0.519 0.542 0.553 0.632 ...
##  $ M.F   : num  95 101.2 96.9 99.4 98.5 ...
##  $ Pop   : int  33 13 18 157 18 25 4 50 39 7 ...
##  $ NW    : num  30.1 10.2 21.9 8 3 4.4 13.9 17.9 28.6 1.5 ...
##  $ U1    : num  0.108 0.096 0.094 0.102 0.091 0.084 0.097 0.079 0.081 0.1 ...
##  $ U2    : num  4.1 3.6 3.3 3.9 2 2.9 3.8 3.5 2.8 2.4 ...
##  $ Wealth: int  3940 5570 3180 6730 5780 6890 6200 4720 4210 5260 ...
##  $ Ineq  : num  26.1 19.4 25 16.7 17.4 12.6 16.8 20.6 23.9 17.4 ...
##  $ Prob  : num  0.0846 0.0296 0.0834 0.0158 0.0414 ...
##  $ Time  : num  26.2 25.3 24.3 29.9 21.3 ...
##  $ Crime : int  791 1635 578 1969 1234 682 963 1555 856 705 ...

Regression Trees

set.seed(18)
#fitting the model
m1<- rpart(Crime~ ., data = df,method="anova" )
#summarize model result
summary(m1)
## Call:
## rpart(formula = Crime ~ ., data = df, method = "anova")
##   n= 47 
## 
##           CP nsplit rel error    xerror      xstd
## 1 0.36296293      0 1.0000000 1.0590182 0.2597043
## 2 0.14814320      1 0.6370371 0.8708915 0.1768329
## 3 0.05173165      2 0.4888939 0.9529299 0.2182971
## 4 0.01000000      3 0.4371622 0.9276588 0.2202180
## 
## Variable importance
##    Po1    Po2 Wealth   Ineq   Prob      M     NW    Pop   Time     Ed     LF 
##     17     17     11     11     10     10      9      5      4      4      1 
##     So 
##      1 
## 
## Node number 1: 47 observations,    complexity param=0.3629629
##   mean=905.0851, MSE=146402.7 
##   left son=2 (23 obs) right son=3 (24 obs)
##   Primary splits:
##       Po1    < 7.65      to the left,  improve=0.3629629, (0 missing)
##       Po2    < 7.2       to the left,  improve=0.3629629, (0 missing)
##       Prob   < 0.0418485 to the right, improve=0.3217700, (0 missing)
##       NW     < 7.65      to the left,  improve=0.2356621, (0 missing)
##       Wealth < 6240      to the left,  improve=0.2002403, (0 missing)
##   Surrogate splits:
##       Po2    < 7.2       to the left,  agree=1.000, adj=1.000, (0 split)
##       Wealth < 5330      to the left,  agree=0.830, adj=0.652, (0 split)
##       Prob   < 0.043598  to the right, agree=0.809, adj=0.609, (0 split)
##       M      < 13.25     to the right, agree=0.745, adj=0.478, (0 split)
##       Ineq   < 17.15     to the right, agree=0.745, adj=0.478, (0 split)
## 
## Node number 2: 23 observations,    complexity param=0.05173165
##   mean=669.6087, MSE=33880.15 
##   left son=4 (12 obs) right son=5 (11 obs)
##   Primary splits:
##       Pop < 22.5      to the left,  improve=0.4568043, (0 missing)
##       M   < 14.5      to the left,  improve=0.3931567, (0 missing)
##       NW  < 5.4       to the left,  improve=0.3184074, (0 missing)
##       Po1 < 5.75      to the left,  improve=0.2310098, (0 missing)
##       U1  < 0.093     to the right, improve=0.2119062, (0 missing)
##   Surrogate splits:
##       NW   < 5.4       to the left,  agree=0.826, adj=0.636, (0 split)
##       M    < 14.5      to the left,  agree=0.783, adj=0.545, (0 split)
##       Time < 22.30055  to the left,  agree=0.783, adj=0.545, (0 split)
##       So   < 0.5       to the left,  agree=0.739, adj=0.455, (0 split)
##       Ed   < 10.85     to the right, agree=0.739, adj=0.455, (0 split)
## 
## Node number 3: 24 observations,    complexity param=0.1481432
##   mean=1130.75, MSE=150173.4 
##   left son=6 (10 obs) right son=7 (14 obs)
##   Primary splits:
##       NW   < 7.65      to the left,  improve=0.2828293, (0 missing)
##       M    < 13.05     to the left,  improve=0.2714159, (0 missing)
##       Time < 21.9001   to the left,  improve=0.2060170, (0 missing)
##       M.F  < 99.2      to the left,  improve=0.1703438, (0 missing)
##       Po1  < 10.75     to the left,  improve=0.1659433, (0 missing)
##   Surrogate splits:
##       Ed   < 11.45     to the right, agree=0.750, adj=0.4, (0 split)
##       Ineq < 16.25     to the left,  agree=0.750, adj=0.4, (0 split)
##       Time < 21.9001   to the left,  agree=0.750, adj=0.4, (0 split)
##       Pop  < 30        to the left,  agree=0.708, adj=0.3, (0 split)
##       LF   < 0.5885    to the right, agree=0.667, adj=0.2, (0 split)
## 
## Node number 4: 12 observations
##   mean=550.5, MSE=20317.58 
## 
## Node number 5: 11 observations
##   mean=799.5455, MSE=16315.52 
## 
## Node number 6: 10 observations
##   mean=886.9, MSE=55757.49 
## 
## Node number 7: 14 observations
##   mean=1304.929, MSE=144801.8

The Regression Tree

par(bg = 'lightblue')
fancyRpartPlot(m1, main="Fig1: Decision Tree Graph", palettes=c("Oranges"), type=1)


Observation1:


m1$variable.importance
##       Po1       Po2    Wealth      Ineq      Prob         M        NW       Pop 
## 2497521.7 2497521.7 1628818.5 1602212.0 1520230.6 1388627.8 1245883.8  661770.6 
##      Time        Ed        LF        So 
##  601906.0  569545.9  203872.5  161800.8
  1. When rpart grows a tree, it performs 10-fold (the default) cross-validation on the data

  2. In our case, the lowest cross-validation error (xerror = 0.871) occurred with 1 split

(m1$cptable) # display the CV results
##           CP nsplit rel error    xerror      xstd
## 1 0.36296293      0 1.0000000 1.0590182 0.2597043
## 2 0.14814320      1 0.6370371 0.8708915 0.1768329
## 3 0.05173165      2 0.4888939 0.9529299 0.2182971
## 4 0.01000000      3 0.4371622 0.9276588 0.2202180
plotcp(m1) # visualize cross-validation results

  1. This plot confirms what the CP table already showed: 1 split gives the best result without over-fitting
# create additional plots
par(mfrow=c(1,2)) # two plots on one page
rsq.rpart(m1) # visualize cross-validation results  
## 
## Regression tree:
## rpart(formula = Crime ~ ., data = df, method = "anova")
## 
## Variables actually used in tree construction:
## [1] NW  Po1 Pop
## 
## Root node error: 6880928/47 = 146403
## 
## n= 47 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.362963      0   1.00000 1.05902 0.25970
## 2 0.148143      1   0.63704 0.87089 0.17683
## 3 0.051732      2   0.48889 0.95293 0.21830
## 4 0.010000      3   0.43716 0.92766 0.22022

# find best value of cp
min_cp <- m1$cptable[which.min(m1$cptable[,"xerror"]),"CP"]
#min_cp = 0.148; this is the complexity parameter (cp) at which the lowest xerror occurred
# pruned tree using best cp
m1_prune <- prune(m1, cp = min_cp)
# plot the pruned tree
par(bg = 'lightblue')
fancyRpartPlot(m1_prune, main="Fig2: Pruned Tree", palettes=c("Oranges"), type=1)


Observation2:


  1. Po1 < 7.7

  2. Po1 > 7.7

df.new<-dplyr::filter((df),Po1<7.7)
df.new.greater.7.7<-dplyr::filter((df),Po1>7.7)

prune_lm = lm(Crime ~., data = df.new)# linear regression on Po1 < 7.7, based on the pruned tree above
prune_lm.greater.7.7 = lm(Crime ~., data = df.new.greater.7.7)# linear regression on Po1 > 7.7, based on the pruned tree above

ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  labs(title = paste("Adj R2 = ",signif(summary(fit)$adj.r.squared, 5),
                     "Intercept =",signif(fit$coef[[1]],5 ),
                     " Slope =",signif(fit$coef[[2]], 5),
                     " P =",signif(summary(fit)$coef[2,4], 5)))
}
#plot Po1 < 7.7
ggplotRegression(prune_lm )+theme_economist()+theme(
  plot.title = element_text(color = "red", size = 12, face = "bold"))+ labs(x = "All Data pts with Po1 < 7.7")

#plot Po1 > 7.7
ggplotRegression(prune_lm.greater.7.7)+theme_economist()+theme(
  plot.title = element_text(color = "red", size = 12, face = "bold"))+ labs(x = "All Data pts with Po1 > 7.7")

  1. Based on the pruned tree - residual analysis is shown only for Po1 < 7.7
# Compute the analysis of variance
res.aov.train <- aov(prune_lm , data = df.new)
# Summary of the analysis
summary(res.aov.train)
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## M            1 125028  125028   9.312 0.01854 * 
## So           1  12026   12026   0.896 0.37545   
## Ed           1 190519  190519  14.190 0.00701 **
## Po1          1 124811  124811   9.296 0.01861 * 
## Po2          1   1030    1030   0.077 0.78980   
## LF           1  40827   40827   3.041 0.12472   
## M.F          1   3404    3404   0.254 0.63006   
## Pop          1  17414   17414   1.297 0.29224   
## NW           1  29277   29277   2.181 0.18328   
## U1           1  26226   26226   1.953 0.20493   
## U2           1   1240    1240   0.092 0.77004   
## Wealth       1   6900    6900   0.514 0.49665   
## Ineq         1   1611    1611   0.120 0.73923   
## Prob         1  14494   14494   1.080 0.33335   
## Time         1  90452   90452   6.737 0.03566 * 
## Residuals    7  93984   13426                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances check
plot(res.aov.train, 1)

# 2. Normality of residual check
plot(res.aov.train, 2)


Observation3:

Note: diagnostics for Po1 < 7.7 are shown; those for Po1 > 7.7 were similar, so they are not plotted
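
If one wanted to double-check this, the same diagnostics can be reproduced for the Po1 > 7.7 subset (a minimal sketch reusing the objects defined above):

# same residual checks, this time on the Po1 > 7.7 subset
res.aov.greater <- aov(prune_lm.greater.7.7, data = df.new.greater.7.7)
# 1. Homogeneity of variances check
plot(res.aov.greater, 1)
# 2. Normality of residual check
plot(res.aov.greater, 2)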


From the Pruned Tree

m1_prune$where #this tells us which leaf (row of the frame) each observation falls into
##  [1] 2 3 2 3 3 3 3 3 2 2 3 2 2 2 2 3 2 3 3 3 2 2 3 3 2 3 2 3 3 2 2 3 2 3 3 3 2 2
## [39] 2 3 2 2 2 3 2 3 3
m1_prune$frame #shows the rows (nodes) of the pruned tree
##      var  n wt       dev      yval complexity ncompete nsurrogate
## 1    Po1 47 47 6880927.7  905.0851 0.36296293        4          5
## 2 <leaf> 23 23  779243.5  669.6087 0.05173165        0          0
## 3 <leaf> 24 24 3604162.5 1130.7500 0.14814320        0          0
leaf.2 <-df[which(m1_prune$where==2),]
leaf.3<-df[which(m1_prune$where==3),]

Linear regression analysis on the 2 leaves

m.leaf2<-lm(Crime~., data=leaf.2)
summary(m.leaf2)
## 
## Call:
## lm(formula = Crime ~ ., data = leaf.2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -109.147  -52.803   -6.495   53.784  127.196 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   -48.5477  2044.9766  -0.024   0.9817  
## M              45.8622    58.6256   0.782   0.4597  
## So            380.4815   223.1072   1.705   0.1319  
## Ed            187.9074    89.5799   2.098   0.0741 .
## Po1            -3.5138   157.7513  -0.022   0.9829  
## Po2            44.6382   148.5528   0.300   0.7725  
## LF           1059.3652  1187.9722   0.892   0.4021  
## M.F           -22.5521    21.4677  -1.051   0.3284  
## Pop            10.6413     5.0929   2.089   0.0750 .
## NW              0.1010     7.9019   0.013   0.9902  
## U1           4878.2802  4874.8165   1.001   0.3503  
## U2             -5.5126   133.5094  -0.041   0.9682  
## Wealth         -0.1022     0.1752  -0.583   0.5779  
## Ineq            4.7779    35.5290   0.134   0.8968  
## Prob        -7317.4407  3280.7511  -2.230   0.0609 .
## Time          -20.0603     7.7287  -2.596   0.0357 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115.9 on 7 degrees of freedom
## Multiple R-squared:  0.8794, Adjusted R-squared:  0.6209 
## F-statistic: 3.403 on 15 and 7 DF,  p-value: 0.0541

Observation4:

m2.leaf2 <- lm(Crime~Ed+Pop+Prob+Time,data=leaf.2)
summary(m2.leaf2 )
## 
## Call:
## lm(formula = Crime ~ Ed + Pop + Prob + Time, data = leaf.2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -206.35  -90.22   -7.59   59.64  357.11 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   819.960    515.112   1.592   0.1288   
## Ed              9.499     34.869   0.272   0.7884   
## Pop            11.395      3.229   3.529   0.0024 **
## Prob        -3164.075   2095.755  -1.510   0.1485   
## Time          -12.130      6.830  -1.776   0.0927 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 154.5 on 18 degrees of freedom
## Multiple R-squared:  0.4485, Adjusted R-squared:  0.3259 
## F-statistic: 3.659 on 4 and 18 DF,  p-value: 0.02379

Observation5:

m3.leaf2  <- lm(Crime~Pop,data=leaf.2)
summary(m3.leaf2)
## 
## Call:
## lm(formula = Crime ~ Pop, data = leaf.2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -224.71 -114.53    0.29   60.43  400.75 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  497.836     66.949   7.436  2.6e-07 ***
## Pop            7.540      2.539   2.970  0.00731 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 161.7 on 21 degrees of freedom
## Multiple R-squared:  0.2957, Adjusted R-squared:  0.2622 
## F-statistic: 8.818 on 1 and 21 DF,  p-value: 0.007313
ggplotRegression(m3.leaf2 )+theme_economist()+theme(
  plot.title = element_text(color = "red", size = 12, face = "bold"))

res1.aov.train <- aov(m3.leaf2  , data = leaf.2)
# 1. Homogeneity of variances check
plot(res1.aov.train, 1)

# 2. Normality of residual check
plot(res1.aov.train, 2)


Observation6:


m.leaf3 <- lm(Crime~.,data=leaf.3)
summary(m.leaf3 )
## 
## Call:
## lm(formula = Crime ~ ., data = leaf.3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -206.805 -120.407   -9.489  103.985  248.226 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -8634.1701  2366.4043  -3.649  0.00651 **
## M               5.6032    96.1623   0.058  0.95496   
## So            179.6267   409.5210   0.439  0.67254   
## Ed            263.0845   146.4229   1.797  0.11010   
## Po1           235.2349   166.1289   1.416  0.19452   
## Po2          -140.7023   193.8759  -0.726  0.48869   
## LF           1442.4214  4832.4463   0.298  0.77294   
## M.F            -1.2379    54.8160  -0.023  0.98254   
## Pop            -3.7686     2.8833  -1.307  0.22751   
## NW             -0.5396    24.5039  -0.022  0.98297   
## U1          -3779.9843 10923.3434  -0.346  0.73823   
## U2            163.7106   150.5361   1.088  0.30848   
## Wealth          0.3017     0.2051   1.471  0.17946   
## Ineq          155.3754    65.5077   2.372  0.04511 * 
## Prob        -3624.0711  4381.4724  -0.827  0.43214   
## Time           21.9335    14.6754   1.495  0.17338   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 229.9 on 8 degrees of freedom
## Multiple R-squared:  0.8827, Adjusted R-squared:  0.6626 
## F-statistic: 4.012 on 15 and 8 DF,  p-value: 0.02669

Observation7:

m2.leaf3<- lm(Crime~Ineq,data=leaf.3)
summary(m2.leaf3)
## 
## Call:
## lm(formula = Crime ~ Ineq, data = leaf.3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -612.67 -254.25  -88.83  136.25  918.77 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   702.70     495.52   1.418    0.170
## Ineq           24.44      27.91   0.876    0.391
## 
## Residual standard error: 397.9 on 22 degrees of freedom
## Multiple R-squared:  0.03368,    Adjusted R-squared:  -0.01024 
## F-statistic: 0.7668 on 1 and 22 DF,  p-value: 0.3907

Observation8:


Question 10.1b

  1. a random forest model

Random Forest Model

set.seed(71)
random.forest.model <- randomForest(Crime ~. , data=df,keep.forest=T, importance=TRUE,class=)
print(random.forest.model ) # view results
## 
## Call:
##  randomForest(formula = Crime ~ ., data = df, keep.forest = T,      importance = TRUE, class = ) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 87620.99
##                     % Var explained: 40.15
randomForest::importance(random.forest.model ) # importance of each predictor
##           %IncMSE IncNodePurity
## M       2.1422379      188810.1
## So      1.3842953       22913.7
## Ed      3.3048929      201890.5
## Po1    11.8658584     1141790.9
## Po2    10.1332975     1098083.6
## LF      1.7264668      272905.0
## M.F     1.6662156      312965.8
## Pop     0.8410977      363070.0
## NW      7.1869300      518088.2
## U1      0.3949679      139563.4
## U2      0.9451352      146112.3
## Wealth  4.5875617      642320.7
## Ineq    0.6261387      275573.5
## Prob    9.6015928      872924.5
## Time    3.7374840      217304.9
par(bg = 'lightblue2')
varImpPlot(random.forest.model ) #plot of importance predictors

Prediction and Performance Metrics

ypred.RF <- predict(random.forest.model ) # with no newdata, predict() on a randomForest returns out-of-bag predictions
RSS <- sum((ypred.RF-df$Crime)^2) # the residual sum of squares
TSS <- sum((mean(df$Crime)-df$Crime)^2)#the total sum of squares
print(sprintf("R-Squared of Random Forest Model (All factors & Data Pts) = %0.3f", R.squared.RF<-1-(RSS/TSS) ))
## [1] "R-Squared of Random Forest Model (All factors & Data Pts) = 0.402"
#residual analysis
par(bg = 'lightblue')
plot(df$Crime, scale(ypred.RF-df$Crime), pch = 19, col = "maroon1",main="Random Forest: Residual vs. Fitted")
abline(0,0)


Observation9:


Analysis on the Pruned Tree again using Random Forest

  1. All data points with Po1 < 7.7
# Define training control
set.seed(71)
fit.control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 3)
rf.fit.leaf2 <- caret::train(Crime~.,
                       data = df.new,
                       method = "rf",
                       trControl = fit.control)
# Summarize the results
print(rf.fit.leaf2)
## Random Forest 
## 
## 23 samples
## 15 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 21, 21, 21, 21, 20, 20, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    148.6439  0.7932877  132.1649
##    8    146.8366  0.8270646  127.4077
##   15    148.8677  0.8379726  127.7360
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 8.
  1. All data points with Po1 > 7.7
set.seed(71)
fit.control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 3)
rf.fit.leaf3 <- caret::train(Crime~.,
                       data = df.new.greater.7.7,
                       method = "rf",
                       trControl = fit.control)
# Summarize the results
print(rf.fit.leaf3)
## Random Forest 
## 
## 24 samples
## 15 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 22, 22, 21, 22, 21, 21, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    352.3433  0.8550967  307.8084
##    8    368.5349  0.8468138  321.6500
##   15    390.8036  0.8471077  339.9812
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.

Observation10:

Note: the reported R-squared corresponds to the mtry value with the lowest RMSE
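
As a small sketch of how those values can be pulled out of the caret fits above (reusing rf.fit.leaf2 and rf.fit.leaf3):

# cross-validated metrics at the mtry chosen by caret for each leaf's fit
rf.fit.leaf2$results[rf.fit.leaf2$results$mtry == rf.fit.leaf2$bestTune$mtry, c("mtry", "RMSE", "Rsquared", "MAE")]
rf.fit.leaf3$results[rf.fit.leaf3$results$mtry == rf.fit.leaf3$bestTune$mtry, c("mtry", "RMSE", "Rsquared", "MAE")]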


Summary

- Both the Regression Tree & the Random Forest pointed to Po1 as the defining factor to branch on (see the importance-predictor plots)

Advantage of Regression tree:

- Regression Tree as a root-causing tool

(a) The linear regression on the Po1 > 7.7 leaf produced a negative adjusted R-squared!  This indicated to us that this particular leaf needed attention and should be re-investigated for further action, such as a different model or different factors to regress on

- It is very clear which model is being used, and we know which factor(s) were significant

Disadvantage of Regression tree:

- Can produce overfitting issues when the sample size is small and there are too many predictors

Advantage of Random Forest:

- Random Forest produced better overall results with less overfitting, and its cross-validated R-squared values looked even more impressive (suspiciously so).  As touted, Random Forest can reduce overfitting because of the randomness inherent in the model itself

- A built-in function tells us which variable(s) are important; Po1 came out on top, which coincided with the regression tree

Disadvantage of Random Forest:

- No clear line of sight into which individual models (trees) were used
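
That said, individual trees can still be inspected if desired, though this does not amount to a single interpretable model (a sketch using the forest fit earlier):

# peek at the first tree in the forest; each row is a node with its split variable and split point
head(randomForest::getTree(random.forest.model, k = 1, labelVar = TRUE))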

Question 10.2

Describe a situation or problem from your job, everyday life, current events, etc., for which a logistic regression model would be appropriate. List some (up to 5) predictors that you might use.

The Credit Card Fraud Detection problem is of significant importance to the banking industry because banks each year spend hundreds of millions of dollars due to fraud. When a credit card transaction happens, the bank makes a note of several factors. For instance, the date of the transaction, amount, place, type of purchase, etc. Based on these factors, they develop a Logistic Regression model of whether or not the transaction is a fraud.
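
As a minimal sketch of what such a model could look like (the data frame transactions and its columns below are hypothetical placeholders, not real data):

# hypothetical fraud model: is_fraud is 0/1, predictors are transaction attributes
fraud.model <- glm(is_fraud ~ amount + purchase_type + merchant_location + hour_of_day + days_since_last_txn,
                   family = binomial(link = "logit"), data = transactions)
summary(fraud.model)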


Question 10.3

Logistic Regression

The German Credit Problem:

Note: The original data set had a number of categorical variables, which must be transformed into binary variables[1] so that they can be handled appropriately by logistic regression. The data are organized in the file German Credit.txt[2]


# Loading data
df.log<-read.table("germancredit.txt",sep = " ")
head(df.log,2)
##    V1 V2  V3  V4   V5  V6  V7 V8  V9  V10 V11  V12 V13  V14  V15 V16  V17 V18
## 1 A11  6 A34 A43 1169 A65 A75  4 A93 A101   4 A121  67 A143 A152   2 A173   1
## 2 A12 48 A32 A43 5951 A61 A73  2 A92 A101   2 A121  22 A143 A152   1 A173   1
##    V19  V20 V21
## 1 A192 A201   1
## 2 A191 A201   2
set.seed(713)
newdata <- one_hot(as.data.table(df.log))#one hot encoding the categorical variables
newdata$V21[newdata$V21==1]<-0
newdata$V21[newdata$V21==2]<-1

#Generate a random sample of 70% of the rows
random_row<- sample(1:nrow(newdata), as.integer(0.7*nrow(newdata)), replace=F)
traindata = newdata [random_row,]
#Assign the test data set to the remaining 30% of the original set
testdata = newdata [-random_row,]

Imbalance Check

table(newdata$V21)
## 
##   0   1 
## 700 300

Observation11:


set.seed(713)
log.reg1 <- glm(V21 ~.,family=binomial(link = "logit"),data=traindata)
summary(log.reg1 )
## 
## Call:
## glm(formula = V21 ~ ., family = binomial(link = "logit"), data = traindata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5305  -0.7382  -0.3661   0.7299   2.7427  
## 
## Coefficients: (13 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.135e+00  1.628e+00  -4.996 5.86e-07 ***
## V1_A11       1.756e+00  2.850e-01   6.162 7.16e-10 ***
## V1_A12       1.317e+00  2.740e-01   4.806 1.54e-06 ***
## V1_A13       4.612e-01  4.771e-01   0.967 0.333694    
## V1_A14              NA         NA      NA       NA    
## V2           2.040e-02  1.114e-02   1.831 0.067053 .  
## V3_A30       1.718e+00  5.026e-01   3.418 0.000630 ***
## V3_A31       2.108e+00  5.403e-01   3.901 9.57e-05 ***
## V3_A32       1.115e+00  3.197e-01   3.487 0.000489 ***
## V3_A33       9.977e-01  3.905e-01   2.555 0.010612 *  
## V3_A34              NA         NA      NA       NA    
## V4_A40       5.729e-01  3.913e-01   1.464 0.143099    
## V4_A41      -1.090e+00  5.161e-01  -2.112 0.034660 *  
## V4_A410     -5.138e-01  8.926e-01  -0.576 0.564887    
## V4_A42      -4.636e-01  4.175e-01  -1.110 0.266804    
## V4_A43      -3.511e-01  3.812e-01  -0.921 0.357030    
## V4_A44      -3.046e-01  9.119e-01  -0.334 0.738322    
## V4_A45       6.158e-01  6.760e-01   0.911 0.362366    
## V4_A46       1.331e-01  5.636e-01   0.236 0.813335    
## V4_A48      -1.520e+01  4.497e+02  -0.034 0.973031    
## V4_A49              NA         NA      NA       NA    
## V5           1.544e-04  5.186e-05   2.977 0.002912 ** 
## V6_A61       8.776e-01  3.027e-01   2.899 0.003747 ** 
## V6_A62       3.352e-01  4.175e-01   0.803 0.422059    
## V6_A63       7.555e-01  5.171e-01   1.461 0.143977    
## V6_A64      -3.631e-01  6.639e-01  -0.547 0.584416    
## V6_A65              NA         NA      NA       NA    
## V7_A71       2.559e-01  4.722e-01   0.542 0.587897    
## V7_A72       2.634e-01  3.582e-01   0.735 0.462159    
## V7_A73       4.323e-03  2.945e-01   0.015 0.988287    
## V7_A74      -7.052e-01  3.598e-01  -1.960 0.049988 *  
## V7_A75              NA         NA      NA       NA    
## V8           3.428e-01  1.054e-01   3.253 0.001141 ** 
## V9_A91      -3.057e-03  5.238e-01  -0.006 0.995344    
## V9_A92      -5.761e-02  3.811e-01  -0.151 0.879834    
## V9_A93      -5.984e-01  3.770e-01  -1.587 0.112468    
## V9_A94              NA         NA      NA       NA    
## V10_A101     8.571e-01  5.134e-01   1.670 0.095008 .  
## V10_A102     1.289e+00  6.752e-01   1.909 0.056296 .  
## V10_A103            NA         NA      NA       NA    
## V11         -3.271e-02  1.060e-01  -0.309 0.757674    
## V12_A121    -5.186e-01  5.127e-01  -1.011 0.311834    
## V12_A122    -2.735e-01  4.961e-01  -0.551 0.581474    
## V12_A123    -3.283e-01  4.825e-01  -0.680 0.496222    
## V12_A124            NA         NA      NA       NA    
## V13         -6.069e-03  1.122e-02  -0.541 0.588678    
## V14_A141     4.070e-01  2.867e-01   1.420 0.155658    
## V14_A142     9.262e-01  4.765e-01   1.944 0.051917 .  
## V14_A143            NA         NA      NA       NA    
## V15_A151     3.657e-01  5.688e-01   0.643 0.520241    
## V15_A152     2.017e-01  5.402e-01   0.373 0.708812    
## V15_A153            NA         NA      NA       NA    
## V16          1.592e-01  2.322e-01   0.686 0.493004    
## V17_A171    -9.006e-01  7.506e-01  -1.200 0.230205    
## V17_A172    -2.770e-04  4.069e-01  -0.001 0.999457    
## V17_A173     2.300e-01  3.318e-01   0.693 0.488241    
## V17_A174            NA         NA      NA       NA    
## V18          2.527e-01  2.973e-01   0.850 0.395243    
## V19_A191     1.845e-01  2.460e-01   0.750 0.453258    
## V19_A192            NA         NA      NA       NA    
## V20_A201     2.060e+00  8.978e-01   2.295 0.021751 *  
## V20_A202            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 856.90  on 699  degrees of freedom
## Residual deviance: 630.76  on 651  degrees of freedom
## AIC: 728.76
## 
## Number of Fisher Scoring iterations: 14
  1. From above, the significant predictors are: V1_A11+V1_A12+V3_A30+V3_A31+V3_A32+V3_A33+V4_A41+V5+V6_A61+V7_A74+V8+V20_A201
set.seed(713)
log.reg2 <- glm(V21~ V1_A11+V1_A12+V3_A30+V3_A31+V3_A32+V3_A33+V4_A41+V5+V6_A61+V7_A74+V8+V20_A201 ,family=binomial(link = "logit"),data=traindata)
summary(log.reg2 )
## 
## Call:
## glm(formula = V21 ~ V1_A11 + V1_A12 + V3_A30 + V3_A31 + V3_A32 + 
##     V3_A33 + V4_A41 + V5 + V6_A61 + V7_A74 + V8 + V20_A201, family = binomial(link = "logit"), 
##     data = traindata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2163  -0.7574  -0.4430   0.8829   2.6810  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.307e+00  9.892e-01  -6.376 1.81e-10 ***
## V1_A11       1.629e+00  2.404e-01   6.775 1.24e-11 ***
## V1_A12       1.243e+00  2.352e-01   5.286 1.25e-07 ***
## V3_A30       1.792e+00  4.552e-01   3.936 8.29e-05 ***
## V3_A31       2.085e+00  4.465e-01   4.670 3.01e-06 ***
## V3_A32       1.027e+00  2.447e-01   4.195 2.73e-05 ***
## V3_A33       1.154e+00  3.599e-01   3.207  0.00134 ** 
## V4_A41      -1.109e+00  3.878e-01  -2.860  0.00423 ** 
## V5           1.746e-04  3.608e-05   4.839 1.31e-06 ***
## V6_A61       6.363e-01  2.091e-01   3.043  0.00234 ** 
## V7_A74      -7.138e-01  2.751e-01  -2.595  0.00947 ** 
## V8           2.814e-01  9.051e-02   3.109  0.00188 ** 
## V20_A201     2.095e+00  8.725e-01   2.401  0.01634 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 856.90  on 699  degrees of freedom
## Residual deviance: 680.19  on 687  degrees of freedom
## AIC: 706.19
## 
## Number of Fisher Scoring iterations: 5

As in the case of linear regression, we check for multicollinearity

#calculate the VIF for each predictor variable in the model
 VIF <- function(linear.model, no.intercept=FALSE, all.diagnostics=FALSE, plot=FALSE) {
    require(mctest)
    if(no.intercept==FALSE) design.matrix <- model.matrix(linear.model)[,-1]
    if(no.intercept==TRUE) design.matrix <- model.matrix(linear.model)
    if(plot==TRUE) mc.plot(design.matrix,linear.model$model[1])
    if(all.diagnostics==FALSE) output <- imcdiag(design.matrix,linear.model$model[1], method='VIF')$idiags[,1]
    if(all.diagnostics==TRUE) output <- imcdiag(design.matrix,linear.model$model[1])
    output
 }

#create vector of VIF values
vif_values <- VIF(log.reg2 )
#create horizontal bar chart to display each VIF value
par(bg = 'lightblue')
barplot(vif_values, main = "Fig3: VIF Values on Final Logistic Regression Model", horiz = TRUE, col = "gold",las=2,cex.names=.53,xlim=c(0,5))
#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)


Observation12:


  1. This final reduced model (log.reg2) was selected for prediction
set.seed(713)
predicted <- predict(log.reg2  , testdata, type="response") #prediction based on test data
#optimal prediction probability cutoff for the model
optCutOff <- optimalCutoff(testdata$V21, predicted)[1] 
predicted_roundup <- as.integer(predicted > optCutOff ) #rounding up to 1's and 0's
  1. Confusion Matrix

  2. Misclassification Error Rate: The lower the better

  3. Specificity and Sensitivity

  4. ROC Curve

#Confusion Matrix
confusionMatrix(predicted_roundup, testdata$V21, threshold = optCutOff)
##     0  1
## 0 188 23
## 1  48 41
#Misclassification error
misClassError(testdata$V21, predicted_roundup, threshold = optCutOff)
## [1] 0.2367
#sensitivity & specificity
sensitivity(testdata$V21, predicted_roundup, threshold = optCutOff)
## [1] 0.4606742
specificity(testdata$V21, predicted_roundup, threshold = optCutOff)
## [1] 0.8909953

Observation13:


plotROC(testdata$V21, predicted_roundup) 
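
For comparison, the ROC curve can also be drawn from the raw predicted probabilities rather than the rounded 0/1 labels, which only give a single operating point (a sketch reusing the objects above):

# ROC based on the predicted probabilities instead of the hard classifications
plotROC(testdata$V21, predicted)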

Cost-Sensitive Classification

  1. Costs of a false positive (incorrectly saying an applicant is a good credit risk) outweigh the cost of a false negative (incorrectly saying an applicant is a bad credit risk) by a factor of five
costs = matrix(c(0, 5, 1, 0), nrow = 2)
dimnames(costs) = list(Actual = c("good", "bad"), Predicted= c("good", "bad"))
print(costs) #Cost of misclassifications 5x for FP vs. FN
##       Predicted
## Actual good bad
##   good    0   1
##   bad     5   0

Thresholding

#initialize list 
cost <- vector(mode = "list")
for (i in 1:100){
predicted_roundup <- as.integer(predicted > i/100 )
cm_matrix <- as.matrix(table(testdata$V21 ,predicted_roundup))
#Ensuring NO out of bounds issues while looping
if(nrow(cm_matrix)==2) {fp<-cm_matrix[2,1]} else {fp=0} 
if(ncol(cm_matrix)==2){fn<-cm_matrix[1,2]} else {fn=0}

cost<-c(cost, fn*1+fp*5)
}
#Plot of total cost vs. threshold (%)
par(bg = 'lightblue')
plot(x=seq(0.01,1,by=0.01),y=cost,xlab = "Threshold, %",ylab = "Total Cost of Misclassifications",main = "Fig4: Cost vs Threshold",col='red',pch=16)
grid (10,10, lty = 6, col = "white")

numerator<-which.min(cost)
min.threshold <-numerator/100
print(sprintf("Minimum Threshold due to Uneven Error Cost= %0.3f", min.threshold))
## [1] "Minimum Threshold due to Uneven Error Cost= 0.130"

Summary

- The German Credit data set had binary, numerical, and categorical variables.  Data cleaning consisted of one-hot encoding the categorical variables and recoding the response to 0/1 for ease of analysis downstream

- We ran a total of 2 iterations to get to the most significant predictors; tossing out insignificant ones along the way, we arrived at:

Logistic Model

              Estimate    
(Intercept) -6.307e+00  
V1_A11       1.629e+00  
V1_A12       1.243e+00  
V3_A30       1.792e+00  
V3_A31       2.085e+00  
V3_A32       1.027e+00  
V3_A33       1.154e+00   
V4_A41      -1.109e+00  
V5           1.746e-04  
V6_A61       6.363e-01  
V7_A74      -7.138e-01  
V8           2.814e-01   
V20_A201     2.095e+00   

- Because a false positive costs 5x more than a false negative, we had to fine-tune the threshold; the optimal classification threshold was 13%

- Lastly, the AUC of 65% (although not the best accuracy measure per Sokol) tells us that a randomly chosen person from the 'yes' group would probably receive a higher estimate than a randomly chosen person from the 'no' group