#US ARRESTS DATA
library(datasets)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
USArrests<-tbl_df(USArrests)
## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
USArrests<-as.data.frame(USArrests)
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
?USArrests
## starting httpd help server ...
## done
summary(USArrests)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
#Description
#This data set contains statistics, in arrests per 100,000 residents for assault, murder, and rape in each of the 50 US states in 1973.
# Also given is the percent of the population living in urban areas.
#[1] Murder numeric Murder arrests (per 100,000)
#[2] Assault numeric Assault arrests (per 100,000)
#[3] UrbanPop numeric Percent urban population
#[4] Rape numeric Rape arrests (per 100,000)
#what drives Murder rates up?
#1 Murder as a function of Urban population
ggplot(USArrests,aes(x=UrbanPop,
y=Murder))+
geom_point(col="blue")+
geom_smooth(col="yellow")+
labs(x="Urban pop",
y="Murder per 100 000",
title="Murder per Urban Percent population")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# There seems to be a linear relationship between Murder and Urban Pop percentage
#the plot seems to indicate that as the population percentage increases so do the number of murder cases.
cor(USArrests$Murder,USArrests$UrbanPop)
## [1] 0.06957262
# the correlation between Murder and Pop is 0.06957262 which is positively correlated.
#the strength of the relationship is considerably weak but positive none the less
#2 Assault as a function of Urban population
ggplot(USArrests,aes(x=UrbanPop,
y=Assault))+
geom_point(col="red")+
geom_smooth(col="yellow")+
labs(x="Urban pop",
y="Assault per 100 000",
title="Assault per Urban Percent population")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Assault also displays similar characteristics as Murder in contrast to Population, as the population increaeses the number of Assault cases also increase
cor(USArrests$Assault,USArrests$UrbanPop)
## [1] 0.2588717
# the correlation between the two variables is 0.2588717, which is positive and slightly more correlated that Murder and Pop
# urban pop of about 33% has around 500 0000 cases of assault and an urban pop
# of about 80% has around 34Mil cases of assault.
#3 Rape as a function of Urban population
ggplot(USArrests,aes(x=UrbanPop,
y=Rape))+
geom_point(col="green")+
geom_smooth(col="yellow")+
labs(x="Urban pop",
y="Rape per 100 000",
title="Rape per Urban Percent population")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Rape displays a positive linear correlation with Urban Pop,
#it also seems to increase as population percentage increases with the lowest pop percentage being around 33%
# and the min rape cases at around 1.2Mil
# The highest rape cases are at around 4.6Mil with an urban population percentage of about 82
#clearly as the pop percentage rises so does the number of rape cases.
#4 Murder as a function of Rape per Urban population
ggplot(USArrests,aes(x=Rape,
y=Murder,
col=UrbanPop))+
geom_point()+
geom_smooth(col="yellow")+
labs(x="Rape per 100 000",
y="Murder per 100 000",
title="Murder as a Function of Rape per Urban Percent population")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Now when we compare Murder as a function of rape per Urban Population
# a trend emerges and it becomes clear that an individual is likely to be Murdered if they are raped in,
# an urban pop percentage that is large.
#for instance for every 100 000 Murder cases in an urban population with percentage of about 40% there are also 500 000 rape cases reported in that population percentage
cor(USArrests$Murder,USArrests$Rape)
## [1] 0.5635788
# and the correlation between them is 0.5635788, which is strongly positively correlated, meaning as one increases so does the other
#for example where we have Murder cases around the figures of 1.6M , we aslo have an increase in Rape cases 3.2M, and also the population
#percentage is expected to be higher , 90%.
#5 Murder as a function of Assault per Urban population
ggplot(USArrests,aes(x=Assault,
y=Murder,
col=UrbanPop))+
geom_point()+
geom_smooth(col="red")+
labs(x="Assault per 100 000",
y="Murder per 100 000",
title="Murder as a Function of Assault per Urban Percent population")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Now if we compare Murder Cases as a Function of Assault per Urban Population percentage
#We find that the plot at first sight seems to suggest that the correlation between this two variables is the strongest in our case study.
# with 100K murder cases associated with about 4.8M Assault cases at the lowest Urban pop percentage of 40%
#and steadily rises as one variable increases, for example where we have Murder cases around
# 600K we also have Assault cases around 8M. with the population percentage at about 50-60 percent
#and when the Murder cases are around 1.5M we have Assault cases at around 20M, with the urban population around the 80s.
cor(USArrests$Murder,USArrests$Assault)
## [1] 0.8018733
#as expected the correlation between Murder and Assault is 0.8018733.
#so far the most correlated pairs in our study
# this implies that one is more likely to be Murdered if they are Assaulted than when they are Raped and the population also increseases the odds
#6 Murder as a function of Rape per Assault case
ggplot(USArrests,aes(x=Rape,
y=Murder,
col=Assault))+
geom_point()+
geom_smooth(col="purple")+
labs(x="Rape per 100 000",
y="Murder per 100 000",
title="Murder as a Function of Rape per Assault case")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

cor(USArrests$Murder,USArrests$Rape)
## [1] 0.5635788
# the correlation between Murder and Rape is 0.5635788
#which is strongly positively correlated
#we can say that as the number of rape cases increase so do the number of Murders
#We can say that when Murder cases are at around 100K we expect 500k rape cases per 10M Assault cases
# as we increase one so does the other. Murder at 1.25M we have about 1.6M rape cases per 30M Assault cases
cor(USArrests$Murder,USArrests$Assault)
## [1] 0.8018733
# on the other hand the correlation between Murder and Assault is 0.8018733
# which is very positively correlated and in short we can say that you are likely to be Murdered if your Assaulted rather than Raped
summary(USArrests)
## Murder Assault UrbanPop Rape
## Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
## 1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
## Median : 7.250 Median :159.0 Median :66.00 Median :20.10
## Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
## 3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
## Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
#7 Modeling
# model 1 with all variables included
Md1 <- lm(Murder~Assault+Rape+UrbanPop, data = USArrests)
Md1
##
## Call:
## lm(formula = Murder ~ Assault + Rape + UrbanPop, data = USArrests)
##
## Coefficients:
## (Intercept) Assault Rape UrbanPop
## 3.27664 0.03978 0.06140 -0.05469
summary(Md1)
##
## Call:
## lm(formula = Murder ~ Assault + Rape + UrbanPop, data = USArrests)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.3990 -1.9127 -0.3444 1.2557 7.4279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.276639 1.737997 1.885 0.0657 .
## Assault 0.039777 0.005912 6.729 2.33e-08 ***
## Rape 0.061399 0.055740 1.102 0.2764
## UrbanPop -0.054694 0.027880 -1.962 0.0559 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.574 on 46 degrees of freedom
## Multiple R-squared: 0.6721, Adjusted R-squared: 0.6507
## F-statistic: 31.42 on 3 and 46 DF, p-value: 3.322e-11
#Adjusted R-squared: 0.6507
#model 2
Md2 <-lm(Murder~Assault+Rape,data=USArrests)
Md2
##
## Call:
## lm(formula = Murder ~ Assault + Rape, data = USArrests)
##
## Coefficients:
## (Intercept) Assault Rape
## 0.41886 0.04003 0.02514
summary(Md2)
##
## Call:
## lm(formula = Murder ~ Assault + Rape, data = USArrests)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8667 -1.7653 -0.3746 1.3030 7.8864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.418863 0.976184 0.429 0.670
## Assault 0.040029 0.006087 6.576 3.59e-08 ***
## Rape 0.025142 0.054157 0.464 0.645
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.651 on 47 degrees of freedom
## Multiple R-squared: 0.6446, Adjusted R-squared: 0.6295
## F-statistic: 42.63 on 2 and 47 DF, p-value: 2.76e-11
#Adjusted R-squared: 0.6295
#model 3
Md3 <-lm(Murder~Assault,data = USArrests)
summary(Md3)
##
## Call:
## lm(formula = Murder ~ Assault, data = USArrests)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8528 -1.7456 -0.3979 1.3044 7.9256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.631683 0.854776 0.739 0.464
## Assault 0.041909 0.004507 9.298 2.6e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.629 on 48 degrees of freedom
## Multiple R-squared: 0.643, Adjusted R-squared: 0.6356
## F-statistic: 86.45 on 1 and 48 DF, p-value: 2.596e-12
#Adjusted R-squared: 0.6356
# model 4
Md4 <-lm(Murder~Rape,data = USArrests)
summary(Md4)
##
## Call:
## lm(formula = Murder ~ Rape, data = USArrests)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0900 -2.4025 -0.5731 1.7095 9.3949
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.22367 1.28456 1.731 0.0899 .
## Rape 0.26207 0.05544 4.727 2.03e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.635 on 48 degrees of freedom
## Multiple R-squared: 0.3176, Adjusted R-squared: 0.3034
## F-statistic: 22.34 on 1 and 48 DF, p-value: 2.031e-05
#Adjusted R-squared: 0.3034
#model 5
Md5 <-lm(Murder~UrbanPop,data = USArrests)
summary(Md5)
##
## Call:
## lm(formula = Murder ~ UrbanPop, data = USArrests)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.537 -3.736 -0.779 3.332 9.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.41594 2.90669 2.207 0.0321 *
## UrbanPop 0.02093 0.04333 0.483 0.6312
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.39 on 48 degrees of freedom
## Multiple R-squared: 0.00484, Adjusted R-squared: -0.01589
## F-statistic: 0.2335 on 1 and 48 DF, p-value: 0.6312
#Adjusted R-squared: -0.01589
#so we are going to go with model 1 since its adj R^2 is around 0.65
Md1$fitted
## 1 2 3 4 5 6 7 8
## 10.793487 13.845014 12.499018 9.296908 11.770833 9.501235 4.122251 9.775774
## 9 10 11 12 13 14 15 16
## 14.185141 9.972108 1.807086 5.968315 10.115168 5.505761 3.080437 5.346423
## 17 18 19 20 21 22 23 24
## 5.769092 10.934441 4.267684 13.252220 5.555289 11.527607 3.445667 12.222335
## 25 26 27 28 29 30 31 32
## 8.259884 5.720538 4.955995 11.694674 3.064389 5.887785 12.755499 10.278912
## 33 34 35 36 37 38 39 40
## 15.208861 3.108308 5.261824 6.791813 7.735738 4.469929 5.949135 13.130661
## 41 42 43 44 45 46 47 48
## 5.022175 9.179467 8.462044 5.080455 4.123421 7.307146 6.660358 4.936553
## 49 50
## 2.438163 7.356976
ggplot(USArrests,aes(x=UrbanPop,
y=Md1$fitted,
col=Rape))+
geom_point()+
labs(x="Urban pop",
y="Murder Fitted values",
title="Murder as a function of Urban pop and rape")

# our fitted model seems to display similar traits as the original data,
#it suggest that murder and urban pop are positively correlated and that as we increase the Pop percentage so does the Murder and rape Cases
# original values not fitted ones
ggplot(USArrests,aes(x=UrbanPop,
y=Murder,
col=Rape))+
geom_point()+
labs(x="Urban pop",
y="Murder",
title="Murder as a function of Urban pop and rape")

## bar plot of Murder per Urban population with rape as a selector
ggplot(USArrests,aes(x=UrbanPop,
y=Murder,col=Rape,fill=Rape))+geom_col()

# the bar plot also support the scatter plot findings , in that the higher the urban pop percentage
# the higher the number of Murder cases and rape associated with those population.
#in general we can conclude that More Rapes and Murders occur in large populated cities
## bar plot of Murder per Urban population with Assault as a selector
ggplot(USArrests,aes(x=UrbanPop,
y=Murder,col=Assault,fill=Assault))+geom_col()

#Similarly we see that Murder and Assault also occur more frequently in highly populated places.
## bar plot of Murder per urban population with rape and assault as both selectors
ggplot(USArrests,aes(x=UrbanPop,
y=Murder,
col=Rape,
fill=Assault))+
geom_col()+
labs(x="Population percentage",
y="Murder per 100,000",
title= "Murder as a function of Urban Pop: With Rape & Assault")

# its clear that murder as a function of population percentage per assault and rape cases is linearly related to its population size.
#as we increase in population so does the number of Assault, Rape and Murder cases.
##NOW FOR THE FITTED MURDER VALUES##
ggplot(USArrests,aes(x=UrbanPop,
y=Md1$fitted,
col=Rape,
fill=Assault))+
geom_col()+
labs(x="Population percentage",
y="Murder per 100,000",
title= "Murder as a function of Urban Pop: With Rape & Assault")

# our fitted values also suggest the underlying fact that our original data revealed to us.
# and our model is Md1 <- lm(Murder~Assault+Rape+UrbanPop, data = Arrests).
library(rpart)
library(rattle)
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(RColorBrewer)
library(crossval)
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(vcd)
## Loading required package: grid
library(Metrics)
D1 <-USArrests
d1.ori<-D1
set.seed(99)
tr <- d1.ori[sample(row.names(d1.ori), size = round(nrow(d1.ori)*0.5)), ]
te <- d1.ori[!(row.names(d1.ori) %in% row.names(tr)), ]
#reset the original training and test data
tr1 <- tr
te1 <- te
te2 <-te
#zero r strategy no one will purchase
te2$Murder <- rep(1,nrow(te2))
#building the tree
tr1$Murder = as.factor(tr1$Murder)
fit1 <- rpart(formula=Murder ~ .,
data=tr1,
control=rpart.control(minsplit=20, minbucket=1, cp=0.08))
fit1
## n= 25
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 25 23 13.2 (0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.08 0.04 0.08 0.04)
## 2) UrbanPop>=62 16 14 15.4 (0 0 0.062 0.062 0.062 0.062 0 0.062 0.062 0.062 0.062 0.062 0 0.062 0.062 0.062 0.062 0.062 0 0 0 0.12 0) *
## 3) UrbanPop< 62 9 7 13.2 (0.11 0.11 0 0 0 0 0.11 0 0 0 0 0 0.11 0 0 0 0 0 0.11 0.22 0.11 0 0.11) *
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 2564181 137.0 4591772 245.3 4591772 245.3
## Vcells 4638565 35.4 10146329 77.5 6885330 52.6
fancyRpartPlot(fit1)

printcp(fit1)
##
## Classification tree:
## rpart(formula = Murder ~ ., data = tr1, control = rpart.control(minsplit = 20,
## minbucket = 1, cp = 0.08))
##
## Variables actually used in tree construction:
## [1] UrbanPop
##
## Root node error: 23/25 = 0.92
##
## n= 25
##
## CP nsplit rel error xerror xstd
## 1 0.086957 0 1.00000 1.087 0
## 2 0.080000 1 0.91304 1.087 0
print(fit1)
## n= 25
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 25 23 13.2 (0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.08 0.04 0.08 0.04)
## 2) UrbanPop>=62 16 14 15.4 (0 0 0.062 0.062 0.062 0.062 0 0.062 0.062 0.062 0.062 0.062 0 0.062 0.062 0.062 0.062 0.062 0 0 0 0.12 0) *
## 3) UrbanPop< 62 9 7 13.2 (0.11 0.11 0 0 0 0 0.11 0 0 0 0 0 0.11 0 0 0 0 0 0.11 0.22 0.11 0 0.11) *
plot(fit1)
text(fit1)

fit1$cptable[which.min(fit1$cptable[,"xerror"]),"CP"]
## [1] 0.08695652
#compare with base model
Prediction<-predict(fit1,te1,type="class")
#Update the prediction
te2$Purchase <- Prediction
Pred = factor(as.factor(te2$Murder))
Actual = factor(as.factor(te1$Murder))
table(te1$Murder)
##
## 2.1 2.2 2.6 2.7 3.3 3.4 3.8 4.3 4.4 5.3 5.9 6 6.8 7.9 8.1 8.5
## 1 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1
## 9 9.7 10 12.2 12.7 17.4
## 2 1 1 1 1 1
cm1 = confusionMatrix(Actual,Pred)
cm1
## FP TP TN FN
## 0 25 0 0
## attr(,"negative")
## [1] "control"
diagnosticErrors(cm1)
## acc sens spec ppv npv lor
## 1 1 NaN 1 NaN NaN
## attr(,"negative")
## [1] "control"