Question 10.1a

Using the same crime data set uscrime.txt as in Questions 8.2 and 9.1, find the best model you can using

  1. a regression tree model

require(ggthemes)
library(tidyverse)
library(magrittr)
library(TTR)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(plotly)
library(fpp2)   
library(forecast)
library(caTools)
library(reshape2)
library(psych) 
require(graphics)
require(Matrix)
library(corrplot)
library(mltools)
library(fBasics)
library(kableExtra)
library(DMwR)
library(caret)
library(gridExtra)
library(DAAG)
library(rpart)
library(randomForest)
library(tree)
library(ISLR)
library(rpart.plot)
library(rattle)
library(data.table)
library(InformationValue)
library("mlr3")
library("mlr3learners")
df <- read.csv(file="uscrime.csv",stringsAsFactors = F, header=T) 
head(df,2)
##      M So   Ed  Po1 Po2    LF   M.F Pop   NW    U1  U2 Wealth Ineq     Prob
## 1 15.1  1  9.1  5.8 5.6 0.510  95.0  33 30.1 0.108 4.1   3940 26.1 0.084602
## 2 14.3  0 11.3 10.3 9.5 0.583 101.2  13 10.2 0.096 3.6   5570 19.4 0.029599
##      Time Crime
## 1 26.2011   791
## 2 25.2999  1635
#Statistical Summary of original data
df_unistats<- basicStats(df)[c("Minimum", "Maximum", "1. Quartile", "3. Quartile", "Mean", "Median",
                               "Variance", "Stdev", "Skewness", "Kurtosis"), ] %>% t() %>% as.data.frame()

kable(df_unistats)
Variable Minimum Maximum 1. Quartile 3. Quartile Mean Median Variance Stdev Skewness Kurtosis
M 11.9000 17.700000 13.000000 14.60000 13.857447 13.6000 1.579454e+00 1.256763 0.821917 0.377753
So 0.0000 1.000000 0.000000 1.00000 0.340426 0.0000 2.294170e-01 0.478975 0.652139 -1.607569
Ed 8.7000 12.200000 9.750000 11.45000 10.563830 10.8000 1.251489e+00 1.118700 -0.318987 -1.149253
Po1 4.5000 16.600000 6.250000 10.45000 8.500000 7.8000 8.832174e+00 2.971897 0.890124 0.162309
Po2 4.1000 15.700000 5.850000 9.70000 8.023404 7.3000 7.818353e+00 2.796132 0.844367 0.008590
LF 0.4800 0.641000 0.530500 0.59300 0.561191 0.5600 1.633000e-03 0.040412 0.270675 -0.888732
M.F 93.4000 107.100000 96.450000 99.20000 98.302128 97.7000 8.683256e+00 2.946737 0.993223 0.652010
Pop 3.0000 168.000000 10.000000 41.50000 36.617021 25.0000 1.449415e+03 38.071188 1.854230 3.078936
NW 0.2000 42.300000 2.400000 13.25000 10.112766 7.6000 1.057377e+02 10.282882 1.379966 1.077648
U1 0.0700 0.142000 0.080500 0.10400 0.095468 0.0920 3.250000e-04 0.018029 0.774876 -0.131208
U2 2.0000 5.800000 2.750000 3.85000 3.397872 3.4000 7.132560e-01 0.844545 0.542264 0.173008
Wealth 2880.0000 6890.000000 4595.000000 5915.00000 5253.829787 5370.0000 9.310502e+05 964.909442 -0.381952 -0.613169
Ineq 12.6000 27.600000 16.550000 22.75000 19.400000 17.6000 1.591696e+01 3.989606 0.367063 -1.138824
Prob 0.0069 0.119804 0.032701 0.05445 0.047091 0.0421 5.170000e-04 0.022737 0.883336 0.749579
Time 12.1996 44.000400 21.600350 30.45075 26.597921 25.8006 5.022408e+01 7.086895 0.371275 -0.413474
Crime 342.0000 1993.000000 658.500000 1057.50000 905.085106 831.0000 1.495854e+05 386.762697 1.053927 0.777628
# Visualizations via Box plots 
meltData <- melt(df)

p <- ggplot(meltData, aes(factor(variable), value)) 
p + geom_boxplot() + facet_wrap(~variable, scale="free")+theme_economist()+scale_colour_economist()

#density plots of original data
density <- df %>%
  gather() %>%                         
  ggplot(aes(value)) +                 
    facet_wrap(~ key, scales = "free") +
    geom_density()+theme_economist()+scale_colour_economist()
density

str(df) #structure of data
## 'data.frame':    47 obs. of  16 variables:
##  $ M     : num  15.1 14.3 14.2 13.6 14.1 12.1 12.7 13.1 15.7 14 ...
##  $ So    : int  1 0 1 0 0 0 1 1 1 0 ...
##  $ Ed    : num  9.1 11.3 8.9 12.1 12.1 11 11.1 10.9 9 11.8 ...
##  $ Po1   : num  5.8 10.3 4.5 14.9 10.9 11.8 8.2 11.5 6.5 7.1 ...
##  $ Po2   : num  5.6 9.5 4.4 14.1 10.1 11.5 7.9 10.9 6.2 6.8 ...
##  $ LF    : num  0.51 0.583 0.533 0.577 0.591 0.547 0.519 0.542 0.553 0.632 ...
##  $ M.F   : num  95 101.2 96.9 99.4 98.5 ...
##  $ Pop   : int  33 13 18 157 18 25 4 50 39 7 ...
##  $ NW    : num  30.1 10.2 21.9 8 3 4.4 13.9 17.9 28.6 1.5 ...
##  $ U1    : num  0.108 0.096 0.094 0.102 0.091 0.084 0.097 0.079 0.081 0.1 ...
##  $ U2    : num  4.1 3.6 3.3 3.9 2 2.9 3.8 3.5 2.8 2.4 ...
##  $ Wealth: int  3940 5570 3180 6730 5780 6890 6200 4720 4210 5260 ...
##  $ Ineq  : num  26.1 19.4 25 16.7 17.4 12.6 16.8 20.6 23.9 17.4 ...
##  $ Prob  : num  0.0846 0.0296 0.0834 0.0158 0.0414 ...
##  $ Time  : num  26.2 25.3 24.3 29.9 21.3 ...
##  $ Crime : int  791 1635 578 1969 1234 682 963 1555 856 705 ...

Regression Trees

set.seed(18)
#fitting the model
m1<- rpart(Crime~ ., data = df,method="anova" )
#summarize model result
summary(m1)
## Call:
## rpart(formula = Crime ~ ., data = df, method = "anova")
##   n= 47 
## 
##           CP nsplit rel error    xerror      xstd
## 1 0.36296293      0 1.0000000 1.0590182 0.2597043
## 2 0.14814320      1 0.6370371 0.8708915 0.1768329
## 3 0.05173165      2 0.4888939 0.9529299 0.2182971
## 4 0.01000000      3 0.4371622 0.9276588 0.2202180
## 
## Variable importance
##    Po1    Po2 Wealth   Ineq   Prob      M     NW    Pop   Time     Ed     LF 
##     17     17     11     11     10     10      9      5      4      4      1 
##     So 
##      1 
## 
## Node number 1: 47 observations,    complexity param=0.3629629
##   mean=905.0851, MSE=146402.7 
##   left son=2 (23 obs) right son=3 (24 obs)
##   Primary splits:
##       Po1    < 7.65      to the left,  improve=0.3629629, (0 missing)
##       Po2    < 7.2       to the left,  improve=0.3629629, (0 missing)
##       Prob   < 0.0418485 to the right, improve=0.3217700, (0 missing)
##       NW     < 7.65      to the left,  improve=0.2356621, (0 missing)
##       Wealth < 6240      to the left,  improve=0.2002403, (0 missing)
##   Surrogate splits:
##       Po2    < 7.2       to the left,  agree=1.000, adj=1.000, (0 split)
##       Wealth < 5330      to the left,  agree=0.830, adj=0.652, (0 split)
##       Prob   < 0.043598  to the right, agree=0.809, adj=0.609, (0 split)
##       M      < 13.25     to the right, agree=0.745, adj=0.478, (0 split)
##       Ineq   < 17.15     to the right, agree=0.745, adj=0.478, (0 split)
## 
## Node number 2: 23 observations,    complexity param=0.05173165
##   mean=669.6087, MSE=33880.15 
##   left son=4 (12 obs) right son=5 (11 obs)
##   Primary splits:
##       Pop < 22.5      to the left,  improve=0.4568043, (0 missing)
##       M   < 14.5      to the left,  improve=0.3931567, (0 missing)
##       NW  < 5.4       to the left,  improve=0.3184074, (0 missing)
##       Po1 < 5.75      to the left,  improve=0.2310098, (0 missing)
##       U1  < 0.093     to the right, improve=0.2119062, (0 missing)
##   Surrogate splits:
##       NW   < 5.4       to the left,  agree=0.826, adj=0.636, (0 split)
##       M    < 14.5      to the left,  agree=0.783, adj=0.545, (0 split)
##       Time < 22.30055  to the left,  agree=0.783, adj=0.545, (0 split)
##       So   < 0.5       to the left,  agree=0.739, adj=0.455, (0 split)
##       Ed   < 10.85     to the right, agree=0.739, adj=0.455, (0 split)
## 
## Node number 3: 24 observations,    complexity param=0.1481432
##   mean=1130.75, MSE=150173.4 
##   left son=6 (10 obs) right son=7 (14 obs)
##   Primary splits:
##       NW   < 7.65      to the left,  improve=0.2828293, (0 missing)
##       M    < 13.05     to the left,  improve=0.2714159, (0 missing)
##       Time < 21.9001   to the left,  improve=0.2060170, (0 missing)
##       M.F  < 99.2      to the left,  improve=0.1703438, (0 missing)
##       Po1  < 10.75     to the left,  improve=0.1659433, (0 missing)
##   Surrogate splits:
##       Ed   < 11.45     to the right, agree=0.750, adj=0.4, (0 split)
##       Ineq < 16.25     to the left,  agree=0.750, adj=0.4, (0 split)
##       Time < 21.9001   to the left,  agree=0.750, adj=0.4, (0 split)
##       Pop  < 30        to the left,  agree=0.708, adj=0.3, (0 split)
##       LF   < 0.5885    to the right, agree=0.667, adj=0.2, (0 split)
## 
## Node number 4: 12 observations
##   mean=550.5, MSE=20317.58 
## 
## Node number 5: 11 observations
##   mean=799.5455, MSE=16315.52 
## 
## Node number 6: 10 observations
##   mean=886.9, MSE=55757.49 
## 
## Node number 7: 14 observations
##   mean=1304.929, MSE=144801.8

The Regression Tree

par(bg = 'lightblue')
fancyRpartPlot(m1, main="Fig1: Decision Tree Graph", palettes=c("Oranges"), type=1)


Observation1:


m1$variable.importance
##       Po1       Po2    Wealth      Ineq      Prob         M        NW       Pop 
## 2497521.7 2497521.7 1628818.5 1602212.0 1520230.6 1388627.8 1245883.8  661770.6 
##      Time        Ed        LF        So 
##  601906.0  569545.9  203872.5  161800.8
  1. When rpart grows a tree, it performs 10-fold (the default) cross-validation on the data

  2. In our case, the lowest cross-validation error (xerror = 0.871) occurred with 1 split

(m1$cptable) # display the CV results
##           CP nsplit rel error    xerror      xstd
## 1 0.36296293      0 1.0000000 1.0590182 0.2597043
## 2 0.14814320      1 0.6370371 0.8708915 0.1768329
## 3 0.05173165      2 0.4888939 0.9529299 0.2182971
## 4 0.01000000      3 0.4371622 0.9276588 0.2202180
plotcp(m1) # visualize cross-validation results

  1. This plot confirms what the CP table already showed: 1 split gives the best result without over-fitting
# create additional plots
par(mfrow=c(1,2)) # two plots on one page
rsq.rpart(m1) # visualize cross-validation results  
## 
## Regression tree:
## rpart(formula = Crime ~ ., data = df, method = "anova")
## 
## Variables actually used in tree construction:
## [1] NW  Po1 Pop
## 
## Root node error: 6880928/47 = 146403
## 
## n= 47 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.362963      0   1.00000 1.05902 0.25970
## 2 0.148143      1   0.63704 0.87089 0.17683
## 3 0.051732      2   0.48889 0.95293 0.21830
## 4 0.010000      3   0.43716 0.92766 0.22022

# find best value of cp
min_cp <- m1$cptable[which.min(m1$cptable[,"xerror"]),"CP"]
#min_cp = 0.148; this is the complexity parameter (cp) at which the lowest xerror occurred
# pruned tree using best cp
m1_prune <- prune(m1, cp = min_cp)
# plot the pruned tree
par(bg = 'lightblue')
fancyRpartPlot(m1_prune, main="Fig2: Pruned Tree", palettes=c("Oranges"), type=1)


Observation2:


  1. Po1 < 7.7

  2. Po1 > 7.7

df.new<-dplyr::filter((df),Po1<7.7)
df.new.greater.7.7<-dplyr::filter((df),Po1>7.7)

prune_lm = lm(Crime ~., data = df.new)# linear regression on Po1 < 7.7, based on the pruned tree above
prune_lm.greater.7.7 = lm(Crime ~., data = df.new.greater.7.7)# linear regression on Po1 > 7.7, based on the pruned tree above

ggplotRegression <- function (fit) {

require(ggplot2)

ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) + 
  geom_point() +
  stat_smooth(method = "lm", col = "red") +
  labs(title = paste("Adj R2 = ",signif(summary(fit)$adj.r.squared, 5),
                     "Intercept =",signif(fit$coef[[1]],5 ),
                     " Slope =",signif(fit$coef[[2]], 5),
                     " P =",signif(summary(fit)$coef[2,4], 5)))
}
#plot Po1 < 7.7
ggplotRegression(prune_lm )+theme_economist()+theme(
  plot.title = element_text(color = "red", size = 12, face = "bold"))+ labs(x = "All Data pts with Po1 < 7.7")

#plot Po1 > 7.7
ggplotRegression(prune_lm.greater.7.7)+theme_economist()+theme(
  plot.title = element_text(color = "red", size = 12, face = "bold"))+ labs(x = "All Data pts with Po1 > 7.7")

  1. Based on the pruned tree - residual analysis is shown only for Po1 < 7.7
# Compute the analysis of variance
res.aov.train <- aov(prune_lm , data = df.new)
# Summary of the analysis
summary(res.aov.train)
##             Df Sum Sq Mean Sq F value  Pr(>F)   
## M            1 125028  125028   9.312 0.01854 * 
## So           1  12026   12026   0.896 0.37545   
## Ed           1 190519  190519  14.190 0.00701 **
## Po1          1 124811  124811   9.296 0.01861 * 
## Po2          1   1030    1030   0.077 0.78980   
## LF           1  40827   40827   3.041 0.12472   
## M.F          1   3404    3404   0.254 0.63006   
## Pop          1  17414   17414   1.297 0.29224   
## NW           1  29277   29277   2.181 0.18328   
## U1           1  26226   26226   1.953 0.20493   
## U2           1   1240    1240   0.092 0.77004   
## Wealth       1   6900    6900   0.514 0.49665   
## Ineq         1   1611    1611   0.120 0.73923   
## Prob         1  14494   14494   1.080 0.33335   
## Time         1  90452   90452   6.737 0.03566 * 
## Residuals    7  93984   13426                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances check
plot(res.aov.train, 1)

# 2. Normality of residual check
plot(res.aov.train, 2)


Observation3:

Note: diagnostics for Po1 < 7.7 are shown; those for Po1 > 7.7 were similar, so they are not plotted
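
If one wanted to double-check this, the same diagnostics can be reproduced for the Po1 > 7.7 subset (a minimal sketch reusing the objects defined above):

# same residual checks, this time on the Po1 > 7.7 subset
res.aov.greater <- aov(prune_lm.greater.7.7, data = df.new.greater.7.7)
# 1. Homogeneity of variances check
plot(res.aov.greater, 1)
# 2. Normality of residual check
plot(res.aov.greater, 2)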


From the Pruned Tree

m1_prune$where #this tells us which leaf (row of the frame) each observation falls into
##  [1] 2 3 2 3 3 3 3 3 2 2 3 2 2 2 2 3 2 3 3 3 2 2 3 3 2 3 2 3 3 2 2 3 2 3 3 3 2 2
## [39] 2 3 2 2 2 3 2 3 3
m1_prune$frame #shows the rows (nodes) of the pruned tree
##      var  n wt       dev      yval complexity ncompete nsurrogate
## 1    Po1 47 47 6880927.7  905.0851 0.36296293        4          5
## 2 <leaf> 23 23  779243.5  669.6087 0.05173165        0          0
## 3 <leaf> 24 24 3604162.5 1130.7500 0.14814320        0          0
leaf.2 <-df[which(m1_prune$where==2),]
leaf.3<-df[which(m1_prune$where==3),]

Linear regression analysis on the 2 leaves

m.leaf2<-lm(Crime~., data=leaf.2)
summary(m.leaf2)
## 
## Call:
## lm(formula = Crime ~ ., data = leaf.2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -109.147  -52.803   -6.495   53.784  127.196 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   -48.5477  2044.9766  -0.024   0.9817  
## M              45.8622    58.6256   0.782   0.4597  
## So            380.4815   223.1072   1.705   0.1319  
## Ed            187.9074    89.5799   2.098   0.0741 .
## Po1            -3.5138   157.7513  -0.022   0.9829  
## Po2            44.6382   148.5528   0.300   0.7725  
## LF           1059.3652  1187.9722   0.892   0.4021  
## M.F           -22.5521    21.4677  -1.051   0.3284  
## Pop            10.6413     5.0929   2.089   0.0750 .
## NW              0.1010     7.9019   0.013   0.9902  
## U1           4878.2802  4874.8165   1.001   0.3503  
## U2             -5.5126   133.5094  -0.041   0.9682  
## Wealth         -0.1022     0.1752  -0.583   0.5779  
## Ineq            4.7779    35.5290   0.134   0.8968  
## Prob        -7317.4407  3280.7511  -2.230   0.0609 .
## Time          -20.0603     7.7287  -2.596   0.0357 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115.9 on 7 degrees of freedom
## Multiple R-squared:  0.8794, Adjusted R-squared:  0.6209 
## F-statistic: 3.403 on 15 and 7 DF,  p-value: 0.0541

Observation4:

m2.leaf2 <- lm(Crime~Ed+Pop+Prob+Time,data=leaf.2)
summary(m2.leaf2 )
## 
## Call:
## lm(formula = Crime ~ Ed + Pop + Prob + Time, data = leaf.2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -206.35  -90.22   -7.59   59.64  357.11 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   819.960    515.112   1.592   0.1288   
## Ed              9.499     34.869   0.272   0.7884   
## Pop            11.395      3.229   3.529   0.0024 **
## Prob        -3164.075   2095.755  -1.510   0.1485   
## Time          -12.130      6.830  -1.776   0.0927 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 154.5 on 18 degrees of freedom
## Multiple R-squared:  0.4485, Adjusted R-squared:  0.3259 
## F-statistic: 3.659 on 4 and 18 DF,  p-value: 0.02379

Observation5:

m3.leaf2  <- lm(Crime~Pop,data=leaf.2)
summary(m3.leaf2)
## 
## Call:
## lm(formula = Crime ~ Pop, data = leaf.2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -224.71 -114.53    0.29   60.43  400.75 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  497.836     66.949   7.436  2.6e-07 ***
## Pop            7.540      2.539   2.970  0.00731 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 161.7 on 21 degrees of freedom
## Multiple R-squared:  0.2957, Adjusted R-squared:  0.2622 
## F-statistic: 8.818 on 1 and 21 DF,  p-value: 0.007313
ggplotRegression(m3.leaf2 )+theme_economist()+theme(
  plot.title = element_text(color = "red", size = 12, face = "bold"))

res1.aov.train <- aov(m3.leaf2  , data = leaf.2)
# 1. Homogeneity of variances check
plot(res1.aov.train, 1)

# 2. Normality of residual check
plot(res1.aov.train, 2)


Observation6:


m.leaf3 <- lm(Crime~.,data=leaf.3)
summary(m.leaf3 )
## 
## Call:
## lm(formula = Crime ~ ., data = leaf.3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -206.805 -120.407   -9.489  103.985  248.226 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept) -8634.1701  2366.4043  -3.649  0.00651 **
## M               5.6032    96.1623   0.058  0.95496   
## So            179.6267   409.5210   0.439  0.67254   
## Ed            263.0845   146.4229   1.797  0.11010   
## Po1           235.2349   166.1289   1.416  0.19452   
## Po2          -140.7023   193.8759  -0.726  0.48869   
## LF           1442.4214  4832.4463   0.298  0.77294   
## M.F            -1.2379    54.8160  -0.023  0.98254   
## Pop            -3.7686     2.8833  -1.307  0.22751   
## NW             -0.5396    24.5039  -0.022  0.98297   
## U1          -3779.9843 10923.3434  -0.346  0.73823   
## U2            163.7106   150.5361   1.088  0.30848   
## Wealth          0.3017     0.2051   1.471  0.17946   
## Ineq          155.3754    65.5077   2.372  0.04511 * 
## Prob        -3624.0711  4381.4724  -0.827  0.43214   
## Time           21.9335    14.6754   1.495  0.17338   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 229.9 on 8 degrees of freedom
## Multiple R-squared:  0.8827, Adjusted R-squared:  0.6626 
## F-statistic: 4.012 on 15 and 8 DF,  p-value: 0.02669

Observation7:

m2.leaf3<- lm(Crime~Ineq,data=leaf.3)
summary(m2.leaf3)
## 
## Call:
## lm(formula = Crime ~ Ineq, data = leaf.3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -612.67 -254.25  -88.83  136.25  918.77 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   702.70     495.52   1.418    0.170
## Ineq           24.44      27.91   0.876    0.391
## 
## Residual standard error: 397.9 on 22 degrees of freedom
## Multiple R-squared:  0.03368,    Adjusted R-squared:  -0.01024 
## F-statistic: 0.7668 on 1 and 22 DF,  p-value: 0.3907

Observation8:


Question 10.1b

  1. a random forest model

Random Forest Model

set.seed(71)
random.forest.model <- randomForest(Crime ~. , data=df,keep.forest=T, importance=TRUE,class=)
print(random.forest.model ) # view results
## 
## Call:
##  randomForest(formula = Crime ~ ., data = df, keep.forest = T,      importance = TRUE, class = ) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 87620.99
##                     % Var explained: 40.15
randomForest::importance(random.forest.model ) # importance of each predictor
##           %IncMSE IncNodePurity
## M       2.1422379      188810.1
## So      1.3842953       22913.7
## Ed      3.3048929      201890.5
## Po1    11.8658584     1141790.9
## Po2    10.1332975     1098083.6
## LF      1.7264668      272905.0
## M.F     1.6662156      312965.8
## Pop     0.8410977      363070.0
## NW      7.1869300      518088.2
## U1      0.3949679      139563.4
## U2      0.9451352      146112.3
## Wealth  4.5875617      642320.7
## Ineq    0.6261387      275573.5
## Prob    9.6015928      872924.5
## Time    3.7374840      217304.9
par(bg = 'lightblue2')
varImpPlot(random.forest.model ) #plot of importance predictors

Prediction and Performance Metrics

ypred.RF <- predict(random.forest.model ) # with no newdata, predict() on a randomForest returns out-of-bag predictions
RSS <- sum((ypred.RF-df$Crime)^2) # the residual sum of squares
TSS <- sum((mean(df$Crime)-df$Crime)^2)#the total sum of squares
print(sprintf("R-Squared of Random Forest Model (All factors & Data Pts) = %0.3f", R.squared.RF<-1-(RSS/TSS) ))
## [1] "R-Squared of Random Forest Model (All factors & Data Pts) = 0.402"
#residual analysis
par(bg = 'lightblue')
plot(df$Crime, scale(ypred.RF-df$Crime), pch = 19, col = "maroon1",main="Random Forest: Residual vs. Fitted")
abline(0,0)


Observation9:


Analysis on the Pruned Tree again using Random Forest

  1. All data points with Po1 < 7.7
# Define training control
set.seed(71)
fit.control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 3)
rf.fit.leaf2 <- caret::train(Crime~.,
                       data = df.new,
                       method = "rf",
                       trControl = fit.control)
# Summarize the results
print(rf.fit.leaf2)
## Random Forest 
## 
## 23 samples
## 15 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 21, 21, 21, 21, 20, 20, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    148.6439  0.7932877  132.1649
##    8    146.8366  0.8270646  127.4077
##   15    148.8677  0.8379726  127.7360
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 8.
  1. All data points with Po1 > 7.7
set.seed(71)
fit.control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 3)
rf.fit.leaf3 <- caret::train(Crime~.,
                       data = df.new.greater.7.7,
                       method = "rf",
                       trControl = fit.control)
# Summarize the results
print(rf.fit.leaf3)
## Random Forest 
## 
## 24 samples
## 15 predictors
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 22, 22, 21, 22, 21, 21, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    2    352.3433  0.8550967  307.8084
##    8    368.5349  0.8468138  321.6500
##   15    390.8036  0.8471077  339.9812
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.

Observation10:

Note: the reported R-squared corresponds to the mtry value with the lowest RMSE
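
As a small sketch of how those values can be pulled out of the caret fits above (reusing rf.fit.leaf2 and rf.fit.leaf3):

# cross-validated metrics at the mtry chosen by caret for each leaf's fit
rf.fit.leaf2$results[rf.fit.leaf2$results$mtry == rf.fit.leaf2$bestTune$mtry, c("mtry", "RMSE", "Rsquared", "MAE")]
rf.fit.leaf3$results[rf.fit.leaf3$results$mtry == rf.fit.leaf3$bestTune$mtry, c("mtry", "RMSE", "Rsquared", "MAE")]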


Summary

- Both the Regression Tree & the Random Forest pointed to Po1 as the defining factor to branch on (see the importance-predictor plots)

Advantage of Regression tree:

- Regression Tree as a root-causing tool

(a) The linear regression on the Po1 > 7.7 leaf produced a negative adjusted R-squared!  This indicated to us that this particular leaf needed attention and should be re-investigated for further action, such as a different model or different factors to regress on

- It is very clear which model is being used, and we know which factor(s) were significant

Disadvantage of Regression tree:

- Can produce overfitting issues when the sample size is small and there are too many predictors

Advantage of Random Forest:

- Random Forest produced better overall results with less overfitting, and its cross-validated R-squared values looked even more impressive (suspiciously so).  As touted, Random Forest can reduce overfitting because of the randomness inherent in the model itself

- A built-in function tells us which variable(s) are important; Po1 came out on top, which coincided with the regression tree

Disadvantage of Random Forest:

- No clear line of sight into which individual models (trees) were used
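
That said, individual trees can still be inspected if desired, though this does not amount to a single interpretable model (a sketch using the forest fit earlier):

# peek at the first tree in the forest; each row is a node with its split variable and split point
head(randomForest::getTree(random.forest.model, k = 1, labelVar = TRUE))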

Question 10.2

Describe a situation or problem from your job, everyday life, current events, etc., for which a logistic regression model would be appropriate. List some (up to 5) predictors that you might use.

The Credit Card Fraud Detection problem is of significant importance to the banking industry because banks each year spend hundreds of millions of dollars due to fraud. When a credit card transaction happens, the bank makes a note of several factors. For instance, the date of the transaction, amount, place, type of purchase, etc. Based on these factors, they develop a Logistic Regression model of whether or not the transaction is a fraud.
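
As a minimal sketch of what such a model could look like (the data frame transactions and its columns below are hypothetical placeholders, not real data):

# hypothetical fraud model: is_fraud is 0/1, predictors are transaction attributes
fraud.model <- glm(is_fraud ~ amount + purchase_type + merchant_location + hour_of_day + days_since_last_txn,
                   family = binomial(link = "logit"), data = transactions)
summary(fraud.model)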


Question 10.3

Logistic Regression

The German Credit Problem:

Note: The original data set had a number of categorical variables, which must be transformed into binary variables[1] so that they can be handled appropriately by logistic regression. The data are organized in the file German Credit.txt[2]


# Loading data
df.log<-read.table("germancredit.txt",sep = " ")
head(df.log,2)
##    V1 V2  V3  V4   V5  V6  V7 V8  V9  V10 V11  V12 V13  V14  V15 V16  V17 V18
## 1 A11  6 A34 A43 1169 A65 A75  4 A93 A101   4 A121  67 A143 A152   2 A173   1
## 2 A12 48 A32 A43 5951 A61 A73  2 A92 A101   2 A121  22 A143 A152   1 A173   1
##    V19  V20 V21
## 1 A192 A201   1
## 2 A191 A201   2
set.seed(713)
newdata <- one_hot(as.data.table(df.log))#one hot encoding the categorical variables
newdata$V21[newdata$V21==1]<-0
newdata$V21[newdata$V21==2]<-1

#Generate a random sample of 70% of the rows
random_row<- sample(1:nrow(newdata), as.integer(0.7*nrow(newdata)), replace=F)
traindata = newdata [random_row,]
#Assign the test data set to the remaining 30% of the original set
testdata = newdata [-random_row,]

Imbalance Check

table(newdata$V21)
## 
##   0   1 
## 700 300

Observation11:


set.seed(713)
log.reg1 <- glm(V21 ~.,family=binomial(link = "logit"),data=traindata)
summary(log.reg1 )
## 
## Call:
## glm(formula = V21 ~ ., family = binomial(link = "logit"), data = traindata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5305  -0.7382  -0.3661   0.7299   2.7427  
## 
## Coefficients: (13 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.135e+00  1.628e+00  -4.996 5.86e-07 ***
## V1_A11       1.756e+00  2.850e-01   6.162 7.16e-10 ***
## V1_A12       1.317e+00  2.740e-01   4.806 1.54e-06 ***
## V1_A13       4.612e-01  4.771e-01   0.967 0.333694    
## V1_A14              NA         NA      NA       NA    
## V2           2.040e-02  1.114e-02   1.831 0.067053 .  
## V3_A30       1.718e+00  5.026e-01   3.418 0.000630 ***
## V3_A31       2.108e+00  5.403e-01   3.901 9.57e-05 ***
## V3_A32       1.115e+00  3.197e-01   3.487 0.000489 ***
## V3_A33       9.977e-01  3.905e-01   2.555 0.010612 *  
## V3_A34              NA         NA      NA       NA    
## V4_A40       5.729e-01  3.913e-01   1.464 0.143099    
## V4_A41      -1.090e+00  5.161e-01  -2.112 0.034660 *  
## V4_A410     -5.138e-01  8.926e-01  -0.576 0.564887    
## V4_A42      -4.636e-01  4.175e-01  -1.110 0.266804    
## V4_A43      -3.511e-01  3.812e-01  -0.921 0.357030    
## V4_A44      -3.046e-01  9.119e-01  -0.334 0.738322    
## V4_A45       6.158e-01  6.760e-01   0.911 0.362366    
## V4_A46       1.331e-01  5.636e-01   0.236 0.813335    
## V4_A48      -1.520e+01  4.497e+02  -0.034 0.973031    
## V4_A49              NA         NA      NA       NA    
## V5           1.544e-04  5.186e-05   2.977 0.002912 ** 
## V6_A61       8.776e-01  3.027e-01   2.899 0.003747 ** 
## V6_A62       3.352e-01  4.175e-01   0.803 0.422059    
## V6_A63       7.555e-01  5.171e-01   1.461 0.143977    
## V6_A64      -3.631e-01  6.639e-01  -0.547 0.584416    
## V6_A65              NA         NA      NA       NA    
## V7_A71       2.559e-01  4.722e-01   0.542 0.587897    
## V7_A72       2.634e-01  3.582e-01   0.735 0.462159    
## V7_A73       4.323e-03  2.945e-01   0.015 0.988287    
## V7_A74      -7.052e-01  3.598e-01  -1.960 0.049988 *  
## V7_A75              NA         NA      NA       NA    
## V8           3.428e-01  1.054e-01   3.253 0.001141 ** 
## V9_A91      -3.057e-03  5.238e-01  -0.006 0.995344    
## V9_A92      -5.761e-02  3.811e-01  -0.151 0.879834    
## V9_A93      -5.984e-01  3.770e-01  -1.587 0.112468    
## V9_A94              NA         NA      NA       NA    
## V10_A101     8.571e-01  5.134e-01   1.670 0.095008 .  
## V10_A102     1.289e+00  6.752e-01   1.909 0.056296 .  
## V10_A103            NA         NA      NA       NA    
## V11         -3.271e-02  1.060e-01  -0.309 0.757674    
## V12_A121    -5.186e-01  5.127e-01  -1.011 0.311834    
## V12_A122    -2.735e-01  4.961e-01  -0.551 0.581474    
## V12_A123    -3.283e-01  4.825e-01  -0.680 0.496222    
## V12_A124            NA         NA      NA       NA    
## V13         -6.069e-03  1.122e-02  -0.541 0.588678    
## V14_A141     4.070e-01  2.867e-01   1.420 0.155658    
## V14_A142     9.262e-01  4.765e-01   1.944 0.051917 .  
## V14_A143            NA         NA      NA       NA    
## V15_A151     3.657e-01  5.688e-01   0.643 0.520241    
## V15_A152     2.017e-01  5.402e-01   0.373 0.708812    
## V15_A153            NA         NA      NA       NA    
## V16          1.592e-01  2.322e-01   0.686 0.493004    
## V17_A171    -9.006e-01  7.506e-01  -1.200 0.230205    
## V17_A172    -2.770e-04  4.069e-01  -0.001 0.999457    
## V17_A173     2.300e-01  3.318e-01   0.693 0.488241    
## V17_A174            NA         NA      NA       NA    
## V18          2.527e-01  2.973e-01   0.850 0.395243    
## V19_A191     1.845e-01  2.460e-01   0.750 0.453258    
## V19_A192            NA         NA      NA       NA    
## V20_A201     2.060e+00  8.978e-01   2.295 0.021751 *  
## V20_A202            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 856.90  on 699  degrees of freedom
## Residual deviance: 630.76  on 651  degrees of freedom
## AIC: 728.76
## 
## Number of Fisher Scoring iterations: 14
  1. From above, the significant predictors are: V1_A11+V1_A12+V3_A30+V3_A31+V3_A32+V3_A33+V4_A41+V5+V6_A61+V7_A74+V8+V20_A201
set.seed(713)
log.reg2 <- glm(V21~ V1_A11+V1_A12+V3_A30+V3_A31+V3_A32+V3_A33+V4_A41+V5+V6_A61+V7_A74+V8+V20_A201 ,family=binomial(link = "logit"),data=traindata)
summary(log.reg2 )
## 
## Call:
## glm(formula = V21 ~ V1_A11 + V1_A12 + V3_A30 + V3_A31 + V3_A32 + 
##     V3_A33 + V4_A41 + V5 + V6_A61 + V7_A74 + V8 + V20_A201, family = binomial(link = "logit"), 
##     data = traindata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2163  -0.7574  -0.4430   0.8829   2.6810  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.307e+00  9.892e-01  -6.376 1.81e-10 ***
## V1_A11       1.629e+00  2.404e-01   6.775 1.24e-11 ***
## V1_A12       1.243e+00  2.352e-01   5.286 1.25e-07 ***
## V3_A30       1.792e+00  4.552e-01   3.936 8.29e-05 ***
## V3_A31       2.085e+00  4.465e-01   4.670 3.01e-06 ***
## V3_A32       1.027e+00  2.447e-01   4.195 2.73e-05 ***
## V3_A33       1.154e+00  3.599e-01   3.207  0.00134 ** 
## V4_A41      -1.109e+00  3.878e-01  -2.860  0.00423 ** 
## V5           1.746e-04  3.608e-05   4.839 1.31e-06 ***
## V6_A61       6.363e-01  2.091e-01   3.043  0.00234 ** 
## V7_A74      -7.138e-01  2.751e-01  -2.595  0.00947 ** 
## V8           2.814e-01  9.051e-02   3.109  0.00188 ** 
## V20_A201     2.095e+00  8.725e-01   2.401  0.01634 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 856.90  on 699  degrees of freedom
## Residual deviance: 680.19  on 687  degrees of freedom
## AIC: 706.19
## 
## Number of Fisher Scoring iterations: 5

As in the case of linear regression, we check for multicollinearity

#calculate the VIF for each predictor variable in the model
 VIF <- function(linear.model, no.intercept=FALSE, all.diagnostics=FALSE, plot=FALSE) {
    require(mctest)
    if(no.intercept==FALSE) design.matrix <- model.matrix(linear.model)[,-1]
    if(no.intercept==TRUE) design.matrix <- model.matrix(linear.model)
    if(plot==TRUE) mc.plot(design.matrix,linear.model$model[1])
    if(all.diagnostics==FALSE) output <- imcdiag(design.matrix,linear.model$model[1], method='VIF')$idiags[,1]
    if(all.diagnostics==TRUE) output <- imcdiag(design.matrix,linear.model$model[1])
    output
 }

#create vector of VIF values
vif_values <- VIF(log.reg2 )
#create horizontal bar chart to display each VIF value
par(bg = 'lightblue')
barplot(vif_values, main = "Fig3: VIF Values on Final Logistic Regression Model", horiz = TRUE, col = "gold",las=2,cex.names=.53,xlim=c(0,5))
#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)


Observation12:


  1. This final reduced model (log.reg2) was selected for prediction
set.seed(713)
predicted <- predict(log.reg2  , testdata, type="response") #prediction based on test data
#optimal prediction probability cutoff for the model
optCutOff <- optimalCutoff(testdata$V21, predicted)[1] 
predicted_roundup <- as.integer(predicted > optCutOff ) #rounding up to 1's and 0's
  1. Confusion Matrix

  2. Misclassification Error Rate: The lower the better

  3. Specificity and Sensitivity

  4. ROC Curve

#Confusion Matrix
confusionMatrix(predicted_roundup, testdata$V21, threshold = optCutOff)
##     0  1
## 0 188 23
## 1  48 41
#Misclassification error
misClassError(testdata$V21, predicted_roundup, threshold = optCutOff)
## [1] 0.2367
#sensitivity & specificity
sensitivity(testdata$V21, predicted_roundup, threshold = optCutOff)
## [1] 0.4606742
specificity(testdata$V21, predicted_roundup, threshold = optCutOff)
## [1] 0.8909953

Observation13:


plotROC(testdata$V21, predicted_roundup) 
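
For comparison, the ROC curve can also be drawn from the raw predicted probabilities rather than the rounded 0/1 labels, which only give a single operating point (a sketch reusing the objects above):

# ROC based on the predicted probabilities instead of the hard classifications
plotROC(testdata$V21, predicted)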

Cost-Sensitive Classification

  1. Costs of a false positive (incorrectly saying an applicant is a good credit risk) outweigh the cost of a false negative (incorrectly saying an applicant is a bad credit risk) by a factor of five
costs = matrix(c(0, 5, 1, 0), nrow = 2)
dimnames(costs) = list(Actual = c("good", "bad"), Predicted= c("good", "bad"))
print(costs) #Cost of misclassifications 5x for FP vs. FN
##       Predicted
## Actual good bad
##   good    0   1
##   bad     5   0

Thresholding

#initialize list 
cost <- vector(mode = "list")
for (i in 1:100){
predicted_roundup <- as.integer(predicted > i/100 )
cm_matrix <- as.matrix(table(testdata$V21 ,predicted_roundup))
#Ensuring NO out of bounds issues while looping
if(nrow(cm_matrix)==2) {fp<-cm_matrix[2,1]} else {fp=0} 
if(ncol(cm_matrix)==2){fn<-cm_matrix[1,2]} else {fn=0}

cost<-c(cost, fn*1+fp*5)
}
#Plot of total cost vs. threshold (%)
par(bg = 'lightblue')
plot(x=seq(0.01,1,by=0.01),y=cost,xlab = "Threshold, %",ylab = "Total Cost of Misclassifications",main = "Fig4: Cost vs Threshold",col='red',pch=16)
grid (10,10, lty = 6, col = "white")

numerator<-which.min(cost)
min.threshold <-numerator/100
print(sprintf("Minimum Threshold due to Uneven Error Cost= %0.3f", min.threshold))
## [1] "Minimum Threshold due to Uneven Error Cost= 0.130"

Summary

- The German Credit data set had binary, numerical, and categorical variables.  Data cleaning consisted of one-hot encoding the categorical variables and recoding the response to 0/1 for ease of analysis downstream

- We ran a total of 2 iterations to get to the most significant predictors; tossing out insignificant ones along the way, we arrived at:

Logistic Model

              Estimate    
(Intercept) -6.307e+00  
V1_A11       1.629e+00  
V1_A12       1.243e+00  
V3_A30       1.792e+00  
V3_A31       2.085e+00  
V3_A32       1.027e+00  
V3_A33       1.154e+00   
V4_A41      -1.109e+00  
V5           1.746e-04  
V6_A61       6.363e-01  
V7_A74      -7.138e-01  
V8           2.814e-01   
V20_A201     2.095e+00   

- Because a false positive costs 5x more than a false negative, we had to fine-tune the threshold; the optimal classification threshold was 13%

- Lastly, the AUC of 65% (although not the best accuracy measure per Sokol) tells us that a randomly chosen person from the 'yes' group would probably receive a higher estimate than a randomly chosen person from the 'no' group