Using the same crime data set uscrime.txt as in Questions 8.2 and 9.1, find the best model you can using (a) a regression tree model, and (b) a random forest model.
require(ggthemes)
library(tidyverse)
library(magrittr)
library(TTR)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(plotly)
library(fpp2)
library(forecast)
library(caTools)
library(reshape2)
library(psych)
require(graphics)
require(Matrix)
library(corrplot)
library(mltools)
library(fBasics)
library(kableExtra)
library(DMwR)
library(caret)
library(gridExtra)
library(DAAG)
library(rpart)
library(randomForest)
library(tree)
library(ISLR)
library(rpart.plot)
library(rattle)
library(data.table)
library(InformationValue)
library("mlr3")
library("mlr3learners")
df <- read.csv(file="uscrime.csv",stringsAsFactors = F, header=T)
head(df,2)
## M So Ed Po1 Po2 LF M.F Pop NW U1 U2 Wealth Ineq Prob
## 1 15.1 1 9.1 5.8 5.6 0.510 95.0 33 30.1 0.108 4.1 3940 26.1 0.084602
## 2 14.3 0 11.3 10.3 9.5 0.583 101.2 13 10.2 0.096 3.6 5570 19.4 0.029599
## Time Crime
## 1 26.2011 791
## 2 25.2999 1635
#Statistical Summary of original data
df_unistats<- basicStats(df)[c("Minimum", "Maximum", "1. Quartile", "3. Quartile", "Mean", "Median",
"Variance", "Stdev", "Skewness", "Kurtosis"), ] %>% t() %>% as.data.frame()
kable(df_unistats)
 | Minimum | Maximum | 1. Quartile | 3. Quartile | Mean | Median | Variance | Stdev | Skewness | Kurtosis |
---|---|---|---|---|---|---|---|---|---|---|
M | 11.9000 | 17.700000 | 13.000000 | 14.60000 | 13.857447 | 13.6000 | 1.579454e+00 | 1.256763 | 0.821917 | 0.377753 |
So | 0.0000 | 1.000000 | 0.000000 | 1.00000 | 0.340426 | 0.0000 | 2.294170e-01 | 0.478975 | 0.652139 | -1.607569 |
Ed | 8.7000 | 12.200000 | 9.750000 | 11.45000 | 10.563830 | 10.8000 | 1.251489e+00 | 1.118700 | -0.318987 | -1.149253 |
Po1 | 4.5000 | 16.600000 | 6.250000 | 10.45000 | 8.500000 | 7.8000 | 8.832174e+00 | 2.971897 | 0.890124 | 0.162309 |
Po2 | 4.1000 | 15.700000 | 5.850000 | 9.70000 | 8.023404 | 7.3000 | 7.818353e+00 | 2.796132 | 0.844367 | 0.008590 |
LF | 0.4800 | 0.641000 | 0.530500 | 0.59300 | 0.561191 | 0.5600 | 1.633000e-03 | 0.040412 | 0.270675 | -0.888732 |
M.F | 93.4000 | 107.100000 | 96.450000 | 99.20000 | 98.302128 | 97.7000 | 8.683256e+00 | 2.946737 | 0.993223 | 0.652010 |
Pop | 3.0000 | 168.000000 | 10.000000 | 41.50000 | 36.617021 | 25.0000 | 1.449415e+03 | 38.071188 | 1.854230 | 3.078936 |
NW | 0.2000 | 42.300000 | 2.400000 | 13.25000 | 10.112766 | 7.6000 | 1.057377e+02 | 10.282882 | 1.379966 | 1.077648 |
U1 | 0.0700 | 0.142000 | 0.080500 | 0.10400 | 0.095468 | 0.0920 | 3.250000e-04 | 0.018029 | 0.774876 | -0.131208 |
U2 | 2.0000 | 5.800000 | 2.750000 | 3.85000 | 3.397872 | 3.4000 | 7.132560e-01 | 0.844545 | 0.542264 | 0.173008 |
Wealth | 2880.0000 | 6890.000000 | 4595.000000 | 5915.00000 | 5253.829787 | 5370.0000 | 9.310502e+05 | 964.909442 | -0.381952 | -0.613169 |
Ineq | 12.6000 | 27.600000 | 16.550000 | 22.75000 | 19.400000 | 17.6000 | 1.591696e+01 | 3.989606 | 0.367063 | -1.138824 |
Prob | 0.0069 | 0.119804 | 0.032701 | 0.05445 | 0.047091 | 0.0421 | 5.170000e-04 | 0.022737 | 0.883336 | 0.749579 |
Time | 12.1996 | 44.000400 | 21.600350 | 30.45075 | 26.597921 | 25.8006 | 5.022408e+01 | 7.086895 | 0.371275 | -0.413474 |
Crime | 342.0000 | 1993.000000 | 658.500000 | 1057.50000 | 905.085106 | 831.0000 | 1.495854e+05 | 386.762697 | 1.053927 | 0.777628 |
# Visualizations via Box plots
meltData <- melt(df)
p <- ggplot(meltData, aes(factor(variable), value))
p + geom_boxplot() + facet_wrap(~variable, scale="free")+theme_economist()+scale_colour_economist()
#density plots of original data
density <- df %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density()+theme_economist()+scale_colour_economist()
density
str(df) #structure of data
## 'data.frame': 47 obs. of 16 variables:
## $ M : num 15.1 14.3 14.2 13.6 14.1 12.1 12.7 13.1 15.7 14 ...
## $ So : int 1 0 1 0 0 0 1 1 1 0 ...
## $ Ed : num 9.1 11.3 8.9 12.1 12.1 11 11.1 10.9 9 11.8 ...
## $ Po1 : num 5.8 10.3 4.5 14.9 10.9 11.8 8.2 11.5 6.5 7.1 ...
## $ Po2 : num 5.6 9.5 4.4 14.1 10.1 11.5 7.9 10.9 6.2 6.8 ...
## $ LF : num 0.51 0.583 0.533 0.577 0.591 0.547 0.519 0.542 0.553 0.632 ...
## $ M.F : num 95 101.2 96.9 99.4 98.5 ...
## $ Pop : int 33 13 18 157 18 25 4 50 39 7 ...
## $ NW : num 30.1 10.2 21.9 8 3 4.4 13.9 17.9 28.6 1.5 ...
## $ U1 : num 0.108 0.096 0.094 0.102 0.091 0.084 0.097 0.079 0.081 0.1 ...
## $ U2 : num 4.1 3.6 3.3 3.9 2 2.9 3.8 3.5 2.8 2.4 ...
## $ Wealth: int 3940 5570 3180 6730 5780 6890 6200 4720 4210 5260 ...
## $ Ineq : num 26.1 19.4 25 16.7 17.4 12.6 16.8 20.6 23.9 17.4 ...
## $ Prob : num 0.0846 0.0296 0.0834 0.0158 0.0414 ...
## $ Time : num 26.2 25.3 24.3 29.9 21.3 ...
## $ Crime : int 791 1635 578 1969 1234 682 963 1555 856 705 ...
set.seed(18)
#fitting the model
m1<- rpart(Crime~ ., data = df,method="anova" )
#summarize model result
summary(m1)
## Call:
## rpart(formula = Crime ~ ., data = df, method = "anova")
## n= 47
##
## CP nsplit rel error xerror xstd
## 1 0.36296293 0 1.0000000 1.0590182 0.2597043
## 2 0.14814320 1 0.6370371 0.8708915 0.1768329
## 3 0.05173165 2 0.4888939 0.9529299 0.2182971
## 4 0.01000000 3 0.4371622 0.9276588 0.2202180
##
## Variable importance
## Po1 Po2 Wealth Ineq Prob M NW Pop Time Ed LF
## 17 17 11 11 10 10 9 5 4 4 1
## So
## 1
##
## Node number 1: 47 observations, complexity param=0.3629629
## mean=905.0851, MSE=146402.7
## left son=2 (23 obs) right son=3 (24 obs)
## Primary splits:
## Po1 < 7.65 to the left, improve=0.3629629, (0 missing)
## Po2 < 7.2 to the left, improve=0.3629629, (0 missing)
## Prob < 0.0418485 to the right, improve=0.3217700, (0 missing)
## NW < 7.65 to the left, improve=0.2356621, (0 missing)
## Wealth < 6240 to the left, improve=0.2002403, (0 missing)
## Surrogate splits:
## Po2 < 7.2 to the left, agree=1.000, adj=1.000, (0 split)
## Wealth < 5330 to the left, agree=0.830, adj=0.652, (0 split)
## Prob < 0.043598 to the right, agree=0.809, adj=0.609, (0 split)
## M < 13.25 to the right, agree=0.745, adj=0.478, (0 split)
## Ineq < 17.15 to the right, agree=0.745, adj=0.478, (0 split)
##
## Node number 2: 23 observations, complexity param=0.05173165
## mean=669.6087, MSE=33880.15
## left son=4 (12 obs) right son=5 (11 obs)
## Primary splits:
## Pop < 22.5 to the left, improve=0.4568043, (0 missing)
## M < 14.5 to the left, improve=0.3931567, (0 missing)
## NW < 5.4 to the left, improve=0.3184074, (0 missing)
## Po1 < 5.75 to the left, improve=0.2310098, (0 missing)
## U1 < 0.093 to the right, improve=0.2119062, (0 missing)
## Surrogate splits:
## NW < 5.4 to the left, agree=0.826, adj=0.636, (0 split)
## M < 14.5 to the left, agree=0.783, adj=0.545, (0 split)
## Time < 22.30055 to the left, agree=0.783, adj=0.545, (0 split)
## So < 0.5 to the left, agree=0.739, adj=0.455, (0 split)
## Ed < 10.85 to the right, agree=0.739, adj=0.455, (0 split)
##
## Node number 3: 24 observations, complexity param=0.1481432
## mean=1130.75, MSE=150173.4
## left son=6 (10 obs) right son=7 (14 obs)
## Primary splits:
## NW < 7.65 to the left, improve=0.2828293, (0 missing)
## M < 13.05 to the left, improve=0.2714159, (0 missing)
## Time < 21.9001 to the left, improve=0.2060170, (0 missing)
## M.F < 99.2 to the left, improve=0.1703438, (0 missing)
## Po1 < 10.75 to the left, improve=0.1659433, (0 missing)
## Surrogate splits:
## Ed < 11.45 to the right, agree=0.750, adj=0.4, (0 split)
## Ineq < 16.25 to the left, agree=0.750, adj=0.4, (0 split)
## Time < 21.9001 to the left, agree=0.750, adj=0.4, (0 split)
## Pop < 30 to the left, agree=0.708, adj=0.3, (0 split)
## LF < 0.5885 to the right, agree=0.667, adj=0.2, (0 split)
##
## Node number 4: 12 observations
## mean=550.5, MSE=20317.58
##
## Node number 5: 11 observations
## mean=799.5455, MSE=16315.52
##
## Node number 6: 10 observations
## mean=886.9, MSE=55757.49
##
## Node number 7: 14 observations
## mean=1304.929, MSE=144801.8
par(bg = 'lightblue')
fancyRpartPlot(m1, main="Fig1: Decision Tree Graph", palettes=c("Oranges"), type=1)
Observation1:
Only 3 predictors were used in the construction of this tree: Po1, Pop and NW.
The tree has 4 terminal nodes and 3 branching (internal) nodes.
m1$variable.importance
## Po1 Po2 Wealth Ineq Prob M NW Pop
## 2497521.7 2497521.7 1628818.5 1602212.0 1520230.6 1388627.8 1245883.8 661770.6
## Time Ed LF So
## 601906.0 569545.9 203872.5 161800.8
When rpart grows a tree it performs k-fold cross-validation on the data (default k = 10).
In our case, the lowest cross-validation error (xerror = 0.871) occurred with 1 split.
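The fold count and the other growing parameters can be made explicit through rpart.control; a minimal sketch, spelling out rpart's documented defaults:
#Sketch: same fit with the cross-validation and growing defaults made explicit
m1.explicit <- rpart(Crime ~ ., data = df, method = "anova",
                     control = rpart.control(xval = 10, minsplit = 20, cp = 0.01))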
(m1$cptable) # display the CV results
## CP nsplit rel error xerror xstd
## 1 0.36296293 0 1.0000000 1.0590182 0.2597043
## 2 0.14814320 1 0.6370371 0.8708915 0.1768329
## 3 0.05173165 2 0.4888939 0.9529299 0.2182971
## 4 0.01000000 3 0.4371622 0.9276588 0.2202180
plotcp(m1) # visualize cross-validation results
# create additional plots
par(mfrow=c(1,2)) # two plots on one page
rsq.rpart(m1) # plot approximate R-squared and relative error vs. number of splits
##
## Regression tree:
## rpart(formula = Crime ~ ., data = df, method = "anova")
##
## Variables actually used in tree construction:
## [1] NW Po1 Pop
##
## Root node error: 6880928/47 = 146403
##
## n= 47
##
## CP nsplit rel error xerror xstd
## 1 0.362963 0 1.00000 1.05902 0.25970
## 2 0.148143 1 0.63704 0.87089 0.17683
## 3 0.051732 2 0.48889 0.95293 0.21830
## 4 0.010000 3 0.43716 0.92766 0.22022
# find best value of cp
min_cp <- m1$cptable[which.min(m1$cptable[,"xerror"]),"CP"]
#min_cp => 0.148, the complexity parameter (cp) value at which the lowest xerror occurred
# pruned tree using best cp
m1_prune <- prune(m1, cp = min_cp)
# plot the pruned tree
par(bg = 'lightblue')
fancyRpartPlot(m1_prune, main="Fig2: Pruned Tree", palettes=c("Oranges"), type=1)
Observation2:
The pruned tree keeps a single split: Po1 < 7.7 on the left branch and Po1 > 7.7 on the right.
df.new <- dplyr::filter(df, Po1 < 7.7)
df.new.greater.7.7 <- dplyr::filter(df, Po1 > 7.7)
prune_lm = lm(Crime ~ ., data = df.new) # linear regression on the Po1 < 7.7 subset from the pruned tree above
prune_lm.greater.7.7 = lm(Crime ~ ., data = df.new.greater.7.7) # linear regression on the Po1 > 7.7 subset from the pruned tree above
ggplotRegression <- function (fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point() +
stat_smooth(method = "lm", col = "red") +
labs(title = paste("Adj R2 = ",signif(summary(fit)$adj.r.squared, 5),
"Intercept =",signif(fit$coef[[1]],5 ),
" Slope =",signif(fit$coef[[2]], 5),
" P =",signif(summary(fit)$coef[2,4], 5)))
}
#plot Po1 < 7.7
ggplotRegression(prune_lm )+theme_economist()+theme(
plot.title = element_text(color = "red", size = 12, face = "bold"))+ labs(x = "All Data pts with P01 < 7.7")
#plot Po1 > 7.7
ggplotRegression(prune_lm.greater.7.7)+theme_economist()+theme(
plot.title = element_text(color = "red", size = 12, face = "bold"))+ labs(x = "All Data pts with P01 > 7.7")
# Compute the analysis of variance
res.aov.train <- aov(prune_lm , data = df.new)
# Summary of the analysis
summary(res.aov.train)
## Df Sum Sq Mean Sq F value Pr(>F)
## M 1 125028 125028 9.312 0.01854 *
## So 1 12026 12026 0.896 0.37545
## Ed 1 190519 190519 14.190 0.00701 **
## Po1 1 124811 124811 9.296 0.01861 *
## Po2 1 1030 1030 0.077 0.78980
## LF 1 40827 40827 3.041 0.12472
## M.F 1 3404 3404 0.254 0.63006
## Pop 1 17414 17414 1.297 0.29224
## NW 1 29277 29277 2.181 0.18328
## U1 1 26226 26226 1.953 0.20493
## U2 1 1240 1240 0.092 0.77004
## Wealth 1 6900 6900 0.514 0.49665
## Ineq 1 1611 1611 0.120 0.73923
## Prob 1 14494 14494 1.080 0.33335
## Time 1 90452 90452 6.737 0.03566 *
## Residuals 7 93984 13426
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances check
plot(res.aov.train, 1)
# 2. Normality of residual check
plot(res.aov.train, 2)
Observation3:
Linear regressions on the two single-split subsets (Po1 < 7.7 and Po1 > 7.7) using all remaining factors.
The overall R-squared looks deceptively good, but the individual predictor p-values for both models are poor, mainly because we have many insignificant predictors and a small sample size.
Note: only the Po1 < 7.7 diagnostics are shown; the Po1 > 7.7 results were similar, so they were not plotted.
Homogeneity of variances looks acceptable, but the Q-Q plot of normality looks poor.
Let's see if we can refine the model by focusing on fewer predictors.
m1_prune$where #this tells us which leaf each observation falls in
## [1] 2 3 2 3 3 3 3 3 2 2 3 2 2 2 2 3 2 3 3 3 2 2 3 3 2 3 2 3 3 2 2 3 2 3 3 3 2 2
## [39] 2 3 2 2 2 3 2 3 3
m1_prune$frame #shows one row per node of the tree
## var n wt dev yval complexity ncompete nsurrogate
## 1 Po1 47 47 6880927.7 905.0851 0.36296293 4 5
## 2 <leaf> 23 23 779243.5 669.6087 0.05173165 0 0
## 3 <leaf> 24 24 3604162.5 1130.7500 0.14814320 0 0
leaf.2 <-df[which(m1_prune$where==2),]
leaf.3<-df[which(m1_prune$where==3),]
m.leaf2<-lm(Crime~., data=leaf.2)
summary(m.leaf2)
##
## Call:
## lm(formula = Crime ~ ., data = leaf.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -109.147 -52.803 -6.495 53.784 127.196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -48.5477 2044.9766 -0.024 0.9817
## M 45.8622 58.6256 0.782 0.4597
## So 380.4815 223.1072 1.705 0.1319
## Ed 187.9074 89.5799 2.098 0.0741 .
## Po1 -3.5138 157.7513 -0.022 0.9829
## Po2 44.6382 148.5528 0.300 0.7725
## LF 1059.3652 1187.9722 0.892 0.4021
## M.F -22.5521 21.4677 -1.051 0.3284
## Pop 10.6413 5.0929 2.089 0.0750 .
## NW 0.1010 7.9019 0.013 0.9902
## U1 4878.2802 4874.8165 1.001 0.3503
## U2 -5.5126 133.5094 -0.041 0.9682
## Wealth -0.1022 0.1752 -0.583 0.5779
## Ineq 4.7779 35.5290 0.134 0.8968
## Prob -7317.4407 3280.7511 -2.230 0.0609 .
## Time -20.0603 7.7287 -2.596 0.0357 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 115.9 on 7 degrees of freedom
## Multiple R-squared: 0.8794, Adjusted R-squared: 0.6209
## F-statistic: 3.403 on 15 and 7 DF, p-value: 0.0541
Observation4:
The full 15-predictor model on the 23 leaf-2 points looks overfit (R-squared = 0.88 but overall p-value = 0.054, with only 7 residual degrees of freedom); only Ed, Pop, Prob and Time show any hint of significance, so we refit with just those.
m2.leaf2 <- lm(Crime~Ed+Pop+Prob+Time,data=leaf.2)
summary(m2.leaf2 )
##
## Call:
## lm(formula = Crime ~ Ed + Pop + Prob + Time, data = leaf.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -206.35 -90.22 -7.59 59.64 357.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 819.960 515.112 1.592 0.1288
## Ed 9.499 34.869 0.272 0.7884
## Pop 11.395 3.229 3.529 0.0024 **
## Prob -3164.075 2095.755 -1.510 0.1485
## Time -12.130 6.830 -1.776 0.0927 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 154.5 on 18 degrees of freedom
## Multiple R-squared: 0.4485, Adjusted R-squared: 0.3259
## F-statistic: 3.659 on 4 and 18 DF, p-value: 0.02379
Observation5:
Of the four remaining predictors, only Pop is clearly significant (p = 0.0024), so we reduce to a single-factor model.
m3.leaf2 <- lm(Crime~Pop,data=leaf.2)
summary(m3.leaf2)
##
## Call:
## lm(formula = Crime ~ Pop, data = leaf.2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -224.71 -114.53 0.29 60.43 400.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 497.836 66.949 7.436 2.6e-07 ***
## Pop 7.540 2.539 2.970 0.00731 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 161.7 on 21 degrees of freedom
## Multiple R-squared: 0.2957, Adjusted R-squared: 0.2622
## F-statistic: 8.818 on 1 and 21 DF, p-value: 0.007313
ggplotRegression(m3.leaf2 )+theme_economist()+theme(
plot.title = element_text(color = "red", size = 12, face = "bold"))
res1.aov.train <- aov(m3.leaf2 , data = leaf.2)
# 1. Homogeneity of variances check
plot(res1.aov.train, 1)
# 2. Normality of residual check
plot(res1.aov.train, 2)
Observation6:
Adjusted R-squared looks more realistic now given the small sample size, and the p-values look good.
Homogeneity of variances looks good, and the Q-Q plot of normality got better, though there is still a bit of fraying at the outer edges.
To make this more pristine we could perform a log transform on the Pop predictor, but we are going to stop here and say this is good enough.
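For completeness, the refit we would have tried looks like this (a sketch, not pursued further here):
#Sketch of the log-transform refit mentioned above (not run)
m4.leaf2 <- lm(Crime ~ log(Pop), data = leaf.2)
summary(m4.leaf2)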
m.leaf3 <- lm(Crime~.,data=leaf.3)
summary(m.leaf3 )
##
## Call:
## lm(formula = Crime ~ ., data = leaf.3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -206.805 -120.407 -9.489 103.985 248.226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8634.1701 2366.4043 -3.649 0.00651 **
## M 5.6032 96.1623 0.058 0.95496
## So 179.6267 409.5210 0.439 0.67254
## Ed 263.0845 146.4229 1.797 0.11010
## Po1 235.2349 166.1289 1.416 0.19452
## Po2 -140.7023 193.8759 -0.726 0.48869
## LF 1442.4214 4832.4463 0.298 0.77294
## M.F -1.2379 54.8160 -0.023 0.98254
## Pop -3.7686 2.8833 -1.307 0.22751
## NW -0.5396 24.5039 -0.022 0.98297
## U1 -3779.9843 10923.3434 -0.346 0.73823
## U2 163.7106 150.5361 1.088 0.30848
## Wealth 0.3017 0.2051 1.471 0.17946
## Ineq 155.3754 65.5077 2.372 0.04511 *
## Prob -3624.0711 4381.4724 -0.827 0.43214
## Time 21.9335 14.6754 1.495 0.17338
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 229.9 on 8 degrees of freedom
## Multiple R-squared: 0.8827, Adjusted R-squared: 0.6626
## F-statistic: 4.012 on 15 and 8 DF, p-value: 0.02669
Observation7:
On the leaf-3 subset (Po1 > 7.7), the full model again looks overfit (R-squared = 0.88, adjusted 0.66); Ineq is the only predictor significant at the 0.05 level, so we refit on Ineq alone.
m2.leaf3<- lm(Crime~Ineq,data=leaf.3)
summary(m2.leaf3)
##
## Call:
## lm(formula = Crime ~ Ineq, data = leaf.3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -612.67 -254.25 -88.83 136.25 918.77
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 702.70 495.52 1.418 0.170
## Ineq 24.44 27.91 0.876 0.391
##
## Residual standard error: 397.9 on 22 degrees of freedom
## Multiple R-squared: 0.03368, Adjusted R-squared: -0.01024
## F-statistic: 0.7668 on 1 and 22 DF, p-value: 0.3907
Observation8:
This leaf-3 terminal model is really bad (the adjusted R-squared is negative)! Unexpected. We may have to throw this model away and find better ones, OR...
The regression tree indicated that leaf 3, the Po1 > 7.7 subset, was a viable data set to regress upon, when in fact it was not. This means we can use the tree's branching as a root-causing tool to flag subsets that need either a different model or different factors.
set.seed(71)
random.forest.model <- randomForest(Crime ~ ., data=df, keep.forest=TRUE, importance=TRUE)
print(random.forest.model ) # view results
##
## Call:
## randomForest(formula = Crime ~ ., data = df, keep.forest = TRUE, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 87620.99
## % Var explained: 40.15
randomForest::importance(random.forest.model ) # importance of each predictor
## %IncMSE IncNodePurity
## M 2.1422379 188810.1
## So 1.3842953 22913.7
## Ed 3.3048929 201890.5
## Po1 11.8658584 1141790.9
## Po2 10.1332975 1098083.6
## LF 1.7264668 272905.0
## M.F 1.6662156 312965.8
## Pop 0.8410977 363070.0
## NW 7.1869300 518088.2
## U1 0.3949679 139563.4
## U2 0.9451352 146112.3
## Wealth 4.5875617 642320.7
## Ineq 0.6261387 275573.5
## Prob 9.6015928 872924.5
## Time 3.7374840 217304.9
par(bg = 'lightblue2')
varImpPlot(random.forest.model ) #plot of importance predictors
ypred.RF <- predict(random.forest.model) #no newdata supplied, so these are out-of-bag predictions
RSS <- sum((ypred.RF-df$Crime)^2) # the residual sum of squares
TSS <- sum((mean(df$Crime)-df$Crime)^2)#the total sum of squares
print(sprintf("R-Squared of Random Forest Model (All factors & Data Pts) = %0.3f", R.squared.RF<-1-(RSS/TSS) ))
## [1] "R-Squared of Random Forest Model (All factors & Data Pts) = 0.402"
#residual analysis
par(bg = 'lightblue')
plot(df$Crime, scale(ypred.RF-df$Crime), pch = 19, col = "maroon1",main="Random Forest: Residual vs. Fitted")
abline(0,0)
Observation9:
Overall R-squared = 0.402 using all data points and all factors, which is very reasonable; since predict() on a randomForest object without new data returns out-of-bag predictions, this is effectively an out-of-sample estimate. The randomness introduced by the Random Forest model really helped in reducing over-fitting.
The residuals look suspect, with a downward drift; heteroscedasticity may be in play due to non-uniform variance.
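A quick numeric companion to the residual plot (an informal sketch, not a formal heteroscedasticity test):
#Informal check: a significant negative correlation between actual Crime and
#the (predicted - actual) residuals backs up the downward drift seen above
cor.test(df$Crime, ypred.RF - df$Crime)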
Cross-validation on the Random Forest model
Repeat the two-leaf analysis, fitting one forest per branch of the pruned split (Po1 < 7.7 and Po1 > 7.7)
# Define training control
set.seed(71)
fit.control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 3)
rf.fit.leaf2 <- caret::train(Crime~.,
data = df.new,
method = "rf",
trControl = fit.control)
# Summarize the results
print(rf.fit.leaf2)
## Random Forest
##
## 23 samples
## 15 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 21, 21, 21, 21, 20, 20, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 148.6439 0.7932877 132.1649
## 8 146.8366 0.8270646 127.4077
## 15 148.8677 0.8379726 127.7360
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 8.
set.seed(71)
fit.control <- caret::trainControl(method = "repeatedcv", number = 10, repeats = 3)
rf.fit.leaf3 <- caret::train(Crime~.,
data = df.new.greater.7.7,
method = "rf",
trControl = fit.control)
# Summarize the results
print(rf.fit.leaf3)
## Random Forest
##
## 24 samples
## 15 predictors
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 22, 22, 21, 22, 21, 21, ...
## Resampling results across tuning parameters:
##
## mtry RMSE Rsquared MAE
## 2 352.3433 0.8550967 307.8084
## 8 368.5349 0.8468138 321.6500
## 15 390.8036 0.8471077 339.9812
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.
Observation10:
Note: the R-squared values reported are those at the lowest RMSE (a small sketch for extracting them programmatically follows this list)
- Both the regression tree and the Random Forest pointed to Po1 as the defining factor to branch on (see the importance predictor plots)
Advantages of the regression tree:
- It works as a root-causing tool: the linear regression on the Po1 > 7.7 leaf produced a negative adjusted R-squared, which flagged that this particular leaf needed attention and should be re-investigated, with either a different model or different factors to regress upon
- It is very clear which model is being used, and we know exactly which factor/s were significant
Disadvantage of the regression tree:
- It can produce overfitting issues when the sample size is small and there are too many predictors
Advantages of the Random Forest:
- It produced better overall results with less overfitting, and its cross-validated R-squared values looked even more impressive (suspiciously so). As touted, Random Forests can reduce overfitting because of the randomness inherent in the model itself
- The built-in importance function tells us which variable/s matter; Po1 topped the list, coinciding with the regression tree
Disadvantage of the Random Forest:
- There is no line of sight into which individual model/s were utilized.
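The sketch referenced above, using caret's documented $results and $bestTune fields to pull out the metrics quoted:
#Sketch: report the resampled metrics for the mtry value caret selected in each fit
best2 <- rf.fit.leaf2$results[rf.fit.leaf2$results$mtry == rf.fit.leaf2$bestTune$mtry, ]
best3 <- rf.fit.leaf3$results[rf.fit.leaf3$results$mtry == rf.fit.leaf3$bestTune$mtry, ]
rbind(leaf2 = best2[, c("RMSE", "Rsquared", "MAE")],
      leaf3 = best3[, c("RMSE", "Rsquared", "MAE")])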
Describe a situation or problem from your job, everyday life, current events, etc., for which a logistic regression model would be appropriate. List some (up to 5) predictors that you might use.
The credit card fraud detection problem is of significant importance to the banking industry because banks lose hundreds of millions of dollars to fraud each year. When a credit card transaction happens, the bank records several factors: for instance, the date of the transaction, the amount, the place, the type of purchase, etc. Based on these predictors, it can develop a logistic regression model of whether or not the transaction is fraudulent.
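As a sketch only (the data frame and its column names here are hypothetical, chosen to mirror the factors above):
#Hypothetical sketch; 'transactions' and its columns are illustrative only
fraud.fit <- glm(is_fraud ~ amount + purchase_type + txn_hour +
                   distance_from_home + txns_last_24h,
                 family = binomial(link = "logit"), data = transactions)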
The German Credit Problem:
Note: the original data set has a number of categorical variables, which must be transformed to binary variables[1] so that they can be handled appropriately by logistic regression. The data is organized in the file germancredit.txt[2]
# Loading data
df.log<-read.table("germancredit.txt",sep = " ")
head(df.log,2)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
## 1 A11 6 A34 A43 1169 A65 A75 4 A93 A101 4 A121 67 A143 A152 2 A173 1
## 2 A12 48 A32 A43 5951 A61 A73 2 A92 A101 2 A121 22 A143 A152 1 A173 1
## V19 V20 V21
## 1 A192 A201 1
## 2 A191 A201 2
set.seed(713)
newdata <- one_hot(as.data.table(df.log))#one hot encoding the categorical variables
newdata$V21[newdata$V21==1]<-0
newdata$V21[newdata$V21==2]<-1
#Generate a random sample of 70% of the rows
random_row <- sample(1:nrow(newdata), as.integer(0.7*nrow(newdata)), replace=F)
traindata = newdata [random_row,]
#Assign the test data set to the remaining 30% of the original set
testdata = newdata [-random_row,]
- Imbalance check
table(newdata$V21)
##
## 0 1
## 700 300
Observation11:
The response is imbalanced: 700 good (0) vs. 300 bad (1) credit records, so raw accuracy will be a misleading metric and the choice of classification threshold will matter.
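One optional way to compensate for the imbalance (a sketch only; the analysis below proceeds unweighted):
#Optional sketch: up-weight the minority (bad, V21 == 1) class by the 700/300
#ratio; glm will warn about non-integer weights but still fits
w <- ifelse(traindata$V21 == 1, 700/300, 1)
log.reg.w <- glm(V21 ~ ., family = binomial(link = "logit"),
                 data = traindata, weights = w)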
set.seed(713)
log.reg1 <- glm(V21 ~.,family=binomial(link = "logit"),data=traindata)
summary(log.reg1 )
##
## Call:
## glm(formula = V21 ~ ., family = binomial(link = "logit"), data = traindata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5305 -0.7382 -0.3661 0.7299 2.7427
##
## Coefficients: (13 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.135e+00 1.628e+00 -4.996 5.86e-07 ***
## V1_A11 1.756e+00 2.850e-01 6.162 7.16e-10 ***
## V1_A12 1.317e+00 2.740e-01 4.806 1.54e-06 ***
## V1_A13 4.612e-01 4.771e-01 0.967 0.333694
## V1_A14 NA NA NA NA
## V2 2.040e-02 1.114e-02 1.831 0.067053 .
## V3_A30 1.718e+00 5.026e-01 3.418 0.000630 ***
## V3_A31 2.108e+00 5.403e-01 3.901 9.57e-05 ***
## V3_A32 1.115e+00 3.197e-01 3.487 0.000489 ***
## V3_A33 9.977e-01 3.905e-01 2.555 0.010612 *
## V3_A34 NA NA NA NA
## V4_A40 5.729e-01 3.913e-01 1.464 0.143099
## V4_A41 -1.090e+00 5.161e-01 -2.112 0.034660 *
## V4_A410 -5.138e-01 8.926e-01 -0.576 0.564887
## V4_A42 -4.636e-01 4.175e-01 -1.110 0.266804
## V4_A43 -3.511e-01 3.812e-01 -0.921 0.357030
## V4_A44 -3.046e-01 9.119e-01 -0.334 0.738322
## V4_A45 6.158e-01 6.760e-01 0.911 0.362366
## V4_A46 1.331e-01 5.636e-01 0.236 0.813335
## V4_A48 -1.520e+01 4.497e+02 -0.034 0.973031
## V4_A49 NA NA NA NA
## V5 1.544e-04 5.186e-05 2.977 0.002912 **
## V6_A61 8.776e-01 3.027e-01 2.899 0.003747 **
## V6_A62 3.352e-01 4.175e-01 0.803 0.422059
## V6_A63 7.555e-01 5.171e-01 1.461 0.143977
## V6_A64 -3.631e-01 6.639e-01 -0.547 0.584416
## V6_A65 NA NA NA NA
## V7_A71 2.559e-01 4.722e-01 0.542 0.587897
## V7_A72 2.634e-01 3.582e-01 0.735 0.462159
## V7_A73 4.323e-03 2.945e-01 0.015 0.988287
## V7_A74 -7.052e-01 3.598e-01 -1.960 0.049988 *
## V7_A75 NA NA NA NA
## V8 3.428e-01 1.054e-01 3.253 0.001141 **
## V9_A91 -3.057e-03 5.238e-01 -0.006 0.995344
## V9_A92 -5.761e-02 3.811e-01 -0.151 0.879834
## V9_A93 -5.984e-01 3.770e-01 -1.587 0.112468
## V9_A94 NA NA NA NA
## V10_A101 8.571e-01 5.134e-01 1.670 0.095008 .
## V10_A102 1.289e+00 6.752e-01 1.909 0.056296 .
## V10_A103 NA NA NA NA
## V11 -3.271e-02 1.060e-01 -0.309 0.757674
## V12_A121 -5.186e-01 5.127e-01 -1.011 0.311834
## V12_A122 -2.735e-01 4.961e-01 -0.551 0.581474
## V12_A123 -3.283e-01 4.825e-01 -0.680 0.496222
## V12_A124 NA NA NA NA
## V13 -6.069e-03 1.122e-02 -0.541 0.588678
## V14_A141 4.070e-01 2.867e-01 1.420 0.155658
## V14_A142 9.262e-01 4.765e-01 1.944 0.051917 .
## V14_A143 NA NA NA NA
## V15_A151 3.657e-01 5.688e-01 0.643 0.520241
## V15_A152 2.017e-01 5.402e-01 0.373 0.708812
## V15_A153 NA NA NA NA
## V16 1.592e-01 2.322e-01 0.686 0.493004
## V17_A171 -9.006e-01 7.506e-01 -1.200 0.230205
## V17_A172 -2.770e-04 4.069e-01 -0.001 0.999457
## V17_A173 2.300e-01 3.318e-01 0.693 0.488241
## V17_A174 NA NA NA NA
## V18 2.527e-01 2.973e-01 0.850 0.395243
## V19_A191 1.845e-01 2.460e-01 0.750 0.453258
## V19_A192 NA NA NA NA
## V20_A201 2.060e+00 8.978e-01 2.295 0.021751 *
## V20_A202 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 856.90 on 699 degrees of freedom
## Residual deviance: 630.76 on 651 degrees of freedom
## AIC: 728.76
##
## Number of Fisher Scoring iterations: 14
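The "(13 not defined because of singularities)" note is the dummy-variable trap: one_hot keeps every level of each categorical, so within each variable the dummies sum to one and are perfectly collinear with the intercept, and glm aliases one level per variable (the NA rows). Keeping the categoricals as factors avoids this, since R's treatment contrasts drop a reference level automatically; a sketch:
#Sketch: let glm build the dummies itself via treatment contrasts,
#which drop one reference level per factor and avoid the NA rows
df.fac <- df.log
df.fac$V21 <- df.fac$V21 - 1   #recode response from 1/2 to 0/1
log.reg.fac <- glm(V21 ~ ., family = binomial(link = "logit"),
                   data = df.fac[random_row, ])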
set.seed(713)
log.reg2 <- glm(V21~ V1_A11+V1_A12+V3_A30+V3_A31+V3_A32+V3_A33+V4_A41+V5+V6_A61+V7_A74+V8+V20_A201 ,family=binomial(link = "logit"),data=traindata)
summary(log.reg2 )
##
## Call:
## glm(formula = V21 ~ V1_A11 + V1_A12 + V3_A30 + V3_A31 + V3_A32 +
## V3_A33 + V4_A41 + V5 + V6_A61 + V7_A74 + V8 + V20_A201, family = binomial(link = "logit"),
## data = traindata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2163 -0.7574 -0.4430 0.8829 2.6810
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.307e+00 9.892e-01 -6.376 1.81e-10 ***
## V1_A11 1.629e+00 2.404e-01 6.775 1.24e-11 ***
## V1_A12 1.243e+00 2.352e-01 5.286 1.25e-07 ***
## V3_A30 1.792e+00 4.552e-01 3.936 8.29e-05 ***
## V3_A31 2.085e+00 4.465e-01 4.670 3.01e-06 ***
## V3_A32 1.027e+00 2.447e-01 4.195 2.73e-05 ***
## V3_A33 1.154e+00 3.599e-01 3.207 0.00134 **
## V4_A41 -1.109e+00 3.878e-01 -2.860 0.00423 **
## V5 1.746e-04 3.608e-05 4.839 1.31e-06 ***
## V6_A61 6.363e-01 2.091e-01 3.043 0.00234 **
## V7_A74 -7.138e-01 2.751e-01 -2.595 0.00947 **
## V8 2.814e-01 9.051e-02 3.109 0.00188 **
## V20_A201 2.095e+00 8.725e-01 2.401 0.01634 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 856.90 on 699 degrees of freedom
## Residual deviance: 680.19 on 687 degrees of freedom
## AIC: 706.19
##
## Number of Fisher Scoring iterations: 5
As with linear regression, we check for multicollinearity.
#calculate the VIF for each predictor variable in the model
VIF <- function(linear.model, no.intercept=FALSE, all.diagnostics=FALSE, plot=FALSE) {
require(mctest)
if(no.intercept==FALSE) design.matrix <- model.matrix(linear.model)[,-1]
if(no.intercept==TRUE) design.matrix <- model.matrix(linear.model)
if(plot==TRUE) mc.plot(design.matrix,linear.model$model[1])
if(all.diagnostics==FALSE) output <- imcdiag(design.matrix,linear.model$model[1], method='VIF')$idiags[,1]
if(all.diagnostics==TRUE) output <- imcdiag(design.matrix,linear.model$model[1])
output
}
#create vector of VIF values
vif_values <- VIF(log.reg2 )
#create horizontal bar chart to display each VIF value
par(bg = 'lightblue')
barplot(vif_values, main = "Fig3: VIF Values on Final Logistic Regression Model", horiz = TRUE, col = "gold",las=2,cex.names=.53,xlim=c(0,5))
#add vertical line at 5
abline(v = 5, lwd = 3, lty = 2)
Observation12:
Fig 3 plots the VIF of each predictor in the reduced model against the common rule-of-thumb cutoff of 5 (dashed line); multicollinearity does not appear to be a concern here.
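For a quicker equivalent check (assuming the car package is available):
#Equivalent one-liner, assuming the car package is installed
car::vif(log.reg2)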
set.seed(713)
predicted <- predict(log.reg2 , testdata, type="response") #prediction based on test data
#optimal prediction probability cutoff for the model
optCutOff <- optimalCutoff(testdata$V21, predicted)[1]
predicted_roundup <- as.integer(predicted > optCutOff ) #rounding up to 1's and 0's
Model evaluation on the test data:
- Confusion matrix
- Misclassification error rate (the lower the better)
- Sensitivity and specificity
- ROC curve
#Confusion Matrix
confusionMatrix(predicted_roundup, testdata$V21, threshold = optCutOff)
## 0 1
## 0 188 23
## 1 48 41
#Misclassification error
misClassError(testdata$V21, predicted_roundup, threshold = optCutOff)
## [1] 0.2367
#sensitivity & specificity
sensitivity(testdata$V21, predicted_roundup, threshold = optCutOff)
## [1] 0.4606742
specificity(testdata$V21, predicted_roundup, threshold = optCutOff)
## [1] 0.8909953
Observation13:
Total accuracy: 76% = (TP+TN)/(TP+TN+FN+FP)
Sensitivity: 46%, while specificity: 89%
Misclassification error rate: 24%
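Accuracy follows directly from the confusion-matrix counts above:
#Accuracy from the confusion matrix counts; equals 1 - misClassError
(188 + 41) / (188 + 23 + 48 + 41)   #0.7633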
plotROC(testdata$V21, predicted_roundup)
costs = matrix(c(0, 5, 1, 0), nrow = 2)
dimnames(costs) = list(Actual = c("good", "bad"), Predicted= c("good", "bad"))
print(costs) #Classifying a bad customer as good costs 5x the reverse error
## Predicted
## Actual good bad
## good 0 1
## bad 5 0
#initialize numeric vector of total costs, one entry per candidate threshold
cost <- c()
for (i in 1:100){
predicted_roundup <- as.integer(predicted > i/100)
cm_matrix <- as.matrix(table(testdata$V21, predicted_roundup))
#index by dimension names so a one-column table cannot be misread
bad.as.good <- if("1" %in% rownames(cm_matrix) && "0" %in% colnames(cm_matrix)) cm_matrix["1","0"] else 0
good.as.bad <- if("0" %in% rownames(cm_matrix) && "1" %in% colnames(cm_matrix)) cm_matrix["0","1"] else 0
cost <- c(cost, good.as.bad*1 + bad.as.good*5)
}
#Plot of total cost vs. threshold
par(bg = 'lightblue')
plot(x=seq(0.01,1,by=0.01),y=cost,xlab = "Threshold, %",ylab = "Total Cost of Misclassifications",main = "Fig4: Cost vs Threshold",col='red',pch=16)
grid (10,10, lty = 6, col = "white")
numerator<-which.min(cost)
min.threshold <-numerator/100
print(sprintf("Minimum Threshold due to Uneven Error Cost= %0.3f", min.threshold))
## [1] "Minimum Threshold due to Uneven Error Cost= 0.130"
- The German Credit data set had binary, numerical and categorical data. The data cleaning step was to one-hot encode all the data, including the response, into binary data points for ease of analysis downstream
- We ran a total of 2 iterations to get to the most significant predictors; tossing out the insignificant ones along the way, we arrived at:
Logistic Model | Estimate |
---|---|
(Intercept) | -6.307e+00 |
V1_A11 | 1.629e+00 |
V1_A12 | 1.243e+00 |
V3_A30 | 1.792e+00 |
V3_A31 | 2.085e+00 |
V3_A32 | 1.027e+00 |
V3_A33 | 1.154e+00 |
V4_A41 | -1.109e+00 |
V5 | 1.746e-04 |
V6_A61 | 6.363e-01 |
V7_A74 | -7.138e-01 |
V8 | 2.814e-01 |
V20_A201 | 2.095e+00 |
- Because classifying a bad customer as good costs 5x more than the reverse error, we had to fine-tune the threshold; the optimal classification threshold came out at 13%
- Lastly, the AUC of 65% (although not the best accuracy measure, per Sokol) tells us that a randomly chosen person from the "bad" group would probably receive a higher predicted score than a randomly chosen person from the "good" group
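As a cross-check, the AUC can also be computed directly on the continuous scores rather than the rounded labels passed to plotROC above (a sketch using InformationValue's AUROC):
#Sketch: AUC on the continuous predicted probabilities
InformationValue::AUROC(testdata$V21, predicted)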