us-arrests.R

#US ARRESTS DATA


library(datasets)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(ggplot2)
USArrests<-tbl_df(USArrests)

## Warning: `tbl_df()` is deprecated as of dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

USArrests<-as.data.frame(USArrests)
str(USArrests)

## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...

?USArrests

## starting httpd help server ...

##  done

summary(USArrests)

##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

#Description
#This data set contains statistics, in arrests per 100,000 residents for assault, murder, and rape in each of the 50 US states in 1973.
# Also given is the percent of the population living in urban areas.

#[1]    Murder  numeric Murder arrests (per 100,000)
#[2]    Assault numeric Assault arrests (per 100,000)
#[3]    UrbanPop    numeric Percent urban population
#[4]    Rape    numeric Rape arrests (per 100,000)

#what drives Murder rates up?

#1 Murder as a function of Urban population
ggplot(USArrests,aes(x=UrbanPop,
                     y=Murder))+
  geom_point(col="blue")+
  geom_smooth(col="yellow")+
  labs(x="Urban pop",
       y="Murder per 100 000",
       title="Murder per Urban Percent population")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# There seems to be a linear relationship between Murder and Urban Pop percentage
#the plot seems to indicate that as the population percentage increases so do the number of murder cases.
cor(USArrests$Murder,USArrests$UrbanPop)

## [1] 0.06957262

# the correlation between Murder and Pop is 0.06957262 which is positively correlated.
#the strength of the relationship is considerably weak but positive none the less


#2 Assault as a function of Urban population
ggplot(USArrests,aes(x=UrbanPop,
                     y=Assault))+
  geom_point(col="red")+
  geom_smooth(col="yellow")+
  labs(x="Urban pop",
       y="Assault per 100 000",
       title="Assault per Urban Percent population")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Assault also displays similar characteristics as Murder in contrast to Population, as the population increaeses the number of Assault cases also increase
cor(USArrests$Assault,USArrests$UrbanPop)

## [1] 0.2588717

# the correlation between the two variables is 0.2588717, which is positive and slightly more correlated that Murder and Pop
# urban pop of about 33% has around 500 0000 cases of assault and an urban pop 
# of about 80% has around 34Mil cases of assault.


#3 Rape as a function of Urban population
ggplot(USArrests,aes(x=UrbanPop,
                     y=Rape))+
  geom_point(col="green")+
  geom_smooth(col="yellow")+
  labs(x="Urban pop",
       y="Rape per 100 000",
       title="Rape per Urban Percent population")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Rape displays a positive linear correlation with Urban Pop,
#it also seems to increase as population percentage increases with the lowest pop percentage being around 33%
# and the min rape cases at around 1.2Mil 
# The highest rape cases are at around 4.6Mil with an urban population percentage of about 82
#clearly as the pop percentage rises so does the number of rape cases.


#4 Murder as a function of Rape per Urban population
ggplot(USArrests,aes(x=Rape,
                     y=Murder,
                     col=UrbanPop))+
  geom_point()+
  geom_smooth(col="yellow")+
  labs(x="Rape per 100 000",
       y="Murder per 100 000",
       title="Murder as a Function of Rape per Urban Percent population")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Now when we compare Murder as a function of rape per Urban Population
# a trend emerges and it becomes clear that an individual is likely to be Murdered if they are raped in,
# an urban pop percentage that is large. 
#for instance for every 100 000 Murder cases in an urban population with percentage of about 40% there are also 500 000 rape cases reported in that population percentage
cor(USArrests$Murder,USArrests$Rape)

## [1] 0.5635788

# and the correlation between them is 0.5635788, which is strongly positively correlated, meaning as one increases so does the other
#for example where we have Murder cases around the figures of 1.6M , we aslo have an increase in Rape cases 3.2M, and also the population
#percentage is expected to be higher , 90%.


#5 Murder as a function of Assault per Urban population
ggplot(USArrests,aes(x=Assault,
                     y=Murder,
                     col=UrbanPop))+
  geom_point()+
  geom_smooth(col="red")+
  labs(x="Assault per 100 000",
       y="Murder per 100 000",
       title="Murder as a Function of Assault per Urban Percent population")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Now if we compare Murder Cases as a Function of Assault per Urban Population percentage
#We find that the plot at first sight seems to suggest that the correlation between this two variables is the strongest in our case study.
# with 100K murder cases associated with about 4.8M Assault cases at the lowest Urban pop percentage of 40%
#and steadily rises as one variable increases, for example where we have Murder cases around
# 600K we also have Assault cases around 8M. with the population percentage at about 50-60 percent
#and when the Murder cases are around 1.5M we have Assault cases at around 20M, with the urban population around the 80s.
cor(USArrests$Murder,USArrests$Assault)

## [1] 0.8018733

#as expected the correlation between Murder and Assault is 0.8018733. 
#so far the most correlated pairs in our study 
# this implies that one is more likely to be Murdered if they are Assaulted than when they are Raped and the population also increseases the odds


#6 Murder as a function of Rape per Assault case
ggplot(USArrests,aes(x=Rape,
                     y=Murder,
                     col=Assault))+
  geom_point()+
  geom_smooth(col="purple")+
  labs(x="Rape per 100 000",
       y="Murder per 100 000",
       title="Murder as a Function of Rape per Assault case")

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

cor(USArrests$Murder,USArrests$Rape)

## [1] 0.5635788

# the correlation between Murder and Rape is 0.5635788
#which is strongly positively correlated
#we can say that as the number of rape cases increase so do the number of Murders
#We can say that when Murder cases are at around 100K we expect 500k rape cases per 10M Assault cases
# as we increase one so does the other. Murder at 1.25M we have about 1.6M rape cases per 30M Assault cases
cor(USArrests$Murder,USArrests$Assault)

## [1] 0.8018733

# on the other hand the correlation between Murder and Assault is 0.8018733
# which is very positively correlated and in short we can say that you are likely to be Murdered if your Assaulted rather than Raped
summary(USArrests)

##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

#7 Modeling 
# model 1 with all variables included
Md1 <- lm(Murder~Assault+Rape+UrbanPop, data = USArrests)
Md1

## 
## Call:
## lm(formula = Murder ~ Assault + Rape + UrbanPop, data = USArrests)
## 
## Coefficients:
## (Intercept)      Assault         Rape     UrbanPop  
##     3.27664      0.03978      0.06140     -0.05469

summary(Md1)

## 
## Call:
## lm(formula = Murder ~ Assault + Rape + UrbanPop, data = USArrests)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.3990 -1.9127 -0.3444  1.2557  7.4279 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.276639   1.737997   1.885   0.0657 .  
## Assault      0.039777   0.005912   6.729 2.33e-08 ***
## Rape         0.061399   0.055740   1.102   0.2764    
## UrbanPop    -0.054694   0.027880  -1.962   0.0559 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.574 on 46 degrees of freedom
## Multiple R-squared:  0.6721, Adjusted R-squared:  0.6507 
## F-statistic: 31.42 on 3 and 46 DF,  p-value: 3.322e-11

#Adjusted R-squared:  0.6507

#model 2 
Md2 <-lm(Murder~Assault+Rape,data=USArrests)
Md2

## 
## Call:
## lm(formula = Murder ~ Assault + Rape, data = USArrests)
## 
## Coefficients:
## (Intercept)      Assault         Rape  
##     0.41886      0.04003      0.02514

summary(Md2)

## 
## Call:
## lm(formula = Murder ~ Assault + Rape, data = USArrests)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8667 -1.7653 -0.3746  1.3030  7.8864 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.418863   0.976184   0.429    0.670    
## Assault     0.040029   0.006087   6.576 3.59e-08 ***
## Rape        0.025142   0.054157   0.464    0.645    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.651 on 47 degrees of freedom
## Multiple R-squared:  0.6446, Adjusted R-squared:  0.6295 
## F-statistic: 42.63 on 2 and 47 DF,  p-value: 2.76e-11

#Adjusted R-squared:  0.6295

#model 3 
Md3 <-lm(Murder~Assault,data = USArrests)
summary(Md3)

## 
## Call:
## lm(formula = Murder ~ Assault, data = USArrests)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8528 -1.7456 -0.3979  1.3044  7.9256 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.631683   0.854776   0.739    0.464    
## Assault     0.041909   0.004507   9.298  2.6e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.629 on 48 degrees of freedom
## Multiple R-squared:  0.643,  Adjusted R-squared:  0.6356 
## F-statistic: 86.45 on 1 and 48 DF,  p-value: 2.596e-12

#Adjusted R-squared:  0.6356 

# model 4
Md4 <-lm(Murder~Rape,data = USArrests)
summary(Md4)

## 
## Call:
## lm(formula = Murder ~ Rape, data = USArrests)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0900 -2.4025 -0.5731  1.7095  9.3949 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.22367    1.28456   1.731   0.0899 .  
## Rape         0.26207    0.05544   4.727 2.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.635 on 48 degrees of freedom
## Multiple R-squared:  0.3176, Adjusted R-squared:  0.3034 
## F-statistic: 22.34 on 1 and 48 DF,  p-value: 2.031e-05

#Adjusted R-squared:  0.3034

#model 5 
Md5 <-lm(Murder~UrbanPop,data = USArrests)
summary(Md5)

## 
## Call:
## lm(formula = Murder ~ UrbanPop, data = USArrests)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -6.537 -3.736 -0.779  3.332  9.728 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  6.41594    2.90669   2.207   0.0321 *
## UrbanPop     0.02093    0.04333   0.483   0.6312  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.39 on 48 degrees of freedom
## Multiple R-squared:  0.00484,    Adjusted R-squared:  -0.01589 
## F-statistic: 0.2335 on 1 and 48 DF,  p-value: 0.6312

#Adjusted R-squared:  -0.01589 

#so we are going to go with model 1 since its adj R^2 is around 0.65

Md1$fitted

##         1         2         3         4         5         6         7         8 
## 10.793487 13.845014 12.499018  9.296908 11.770833  9.501235  4.122251  9.775774 
##         9        10        11        12        13        14        15        16 
## 14.185141  9.972108  1.807086  5.968315 10.115168  5.505761  3.080437  5.346423 
##        17        18        19        20        21        22        23        24 
##  5.769092 10.934441  4.267684 13.252220  5.555289 11.527607  3.445667 12.222335 
##        25        26        27        28        29        30        31        32 
##  8.259884  5.720538  4.955995 11.694674  3.064389  5.887785 12.755499 10.278912 
##        33        34        35        36        37        38        39        40 
## 15.208861  3.108308  5.261824  6.791813  7.735738  4.469929  5.949135 13.130661 
##        41        42        43        44        45        46        47        48 
##  5.022175  9.179467  8.462044  5.080455  4.123421  7.307146  6.660358  4.936553 
##        49        50 
##  2.438163  7.356976

ggplot(USArrests,aes(x=UrbanPop,
                     y=Md1$fitted,
                     col=Rape))+
  geom_point()+
  labs(x="Urban pop",
       y="Murder Fitted values",
       title="Murder as a function of Urban pop and rape")

# our fitted model seems to display similar traits as the original data,
#it suggest that murder and urban pop are positively correlated and that as we increase the Pop percentage so does the Murder and rape Cases

# original values not fitted ones
ggplot(USArrests,aes(x=UrbanPop,
                     y=Murder,
                     col=Rape))+
  geom_point()+
  labs(x="Urban pop",
       y="Murder",
       title="Murder as a function of Urban pop and rape")

## bar plot of Murder per Urban population with rape as a selector
ggplot(USArrests,aes(x=UrbanPop,
                     y=Murder,col=Rape,fill=Rape))+geom_col()

# the bar plot also support the scatter plot findings , in that the higher the urban pop percentage
# the higher the number of Murder cases and rape associated with those population.
#in general we can conclude that More Rapes and Murders occur in large populated cities 

## bar plot of Murder per Urban population with Assault as a selector
ggplot(USArrests,aes(x=UrbanPop,
                     y=Murder,col=Assault,fill=Assault))+geom_col()

#Similarly we see that Murder and Assault also occur more frequently in highly populated places.

## bar plot of Murder per urban population with rape and assault as both selectors
ggplot(USArrests,aes(x=UrbanPop,
                     y=Murder,
                     col=Rape,
                     fill=Assault))+
  geom_col()+
  labs(x="Population percentage",
       y="Murder per 100,000",
       title= "Murder as a function of Urban Pop: With Rape & Assault")

# its clear that murder as a function of population percentage per assault and rape cases is linearly related to its population size.
#as we increase in population so does the number of Assault, Rape and Murder cases.

##NOW FOR THE FITTED MURDER VALUES##
ggplot(USArrests,aes(x=UrbanPop,
                     y=Md1$fitted,
                     col=Rape,
                     fill=Assault))+
  geom_col()+
  labs(x="Population percentage",
       y="Murder per 100,000",
       title= "Murder as a function of Urban Pop: With Rape & Assault")

# our fitted values also suggest the underlying fact that our original data revealed to us. 
# and our model is Md1 <- lm(Murder~Assault+Rape+UrbanPop, data = Arrests).


library(rpart)
library(rattle)

## Loading required package: bitops

## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

library(rpart.plot)
library(RColorBrewer)
library(crossval)
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(vcd)

## Loading required package: grid

library(Metrics)

D1 <-USArrests
d1.ori<-D1
set.seed(99)

tr <- d1.ori[sample(row.names(d1.ori), size = round(nrow(d1.ori)*0.5)), ]
te <- d1.ori[!(row.names(d1.ori) %in% row.names(tr)), ]

#reset the original training and test data
tr1 <- tr
te1  <- te
te2 <-te

#zero r strategy no one will purchase
te2$Murder <- rep(1,nrow(te2))

#building the tree
tr1$Murder = as.factor(tr1$Murder)
fit1 <- rpart(formula=Murder ~ .,
              data=tr1,
              control=rpart.control(minsplit=20, minbucket=1, cp=0.08))
fit1

## n= 25 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 25 23 13.2 (0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.08 0.04 0.08 0.04)  
##   2) UrbanPop>=62 16 14 15.4 (0 0 0.062 0.062 0.062 0.062 0 0.062 0.062 0.062 0.062 0.062 0 0.062 0.062 0.062 0.062 0.062 0 0 0 0.12 0) *
##   3) UrbanPop< 62 9  7 13.2 (0.11 0.11 0 0 0 0 0.11 0 0 0 0 0 0.11 0 0 0 0 0 0.11 0.22 0.11 0 0.11) *

gc()

##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2564181 137.0    4591772 245.3  4591772 245.3
## Vcells 4638565  35.4   10146329  77.5  6885330  52.6

fancyRpartPlot(fit1)

printcp(fit1)

## 
## Classification tree:
## rpart(formula = Murder ~ ., data = tr1, control = rpart.control(minsplit = 20, 
##     minbucket = 1, cp = 0.08))
## 
## Variables actually used in tree construction:
## [1] UrbanPop
## 
## Root node error: 23/25 = 0.92
## 
## n= 25 
## 
##         CP nsplit rel error xerror xstd
## 1 0.086957      0   1.00000  1.087    0
## 2 0.080000      1   0.91304  1.087    0

print(fit1)

## n= 25 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 25 23 13.2 (0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.04 0.08 0.04 0.08 0.04)  
##   2) UrbanPop>=62 16 14 15.4 (0 0 0.062 0.062 0.062 0.062 0 0.062 0.062 0.062 0.062 0.062 0 0.062 0.062 0.062 0.062 0.062 0 0 0 0.12 0) *
##   3) UrbanPop< 62 9  7 13.2 (0.11 0.11 0 0 0 0 0.11 0 0 0 0 0 0.11 0 0 0 0 0 0.11 0.22 0.11 0 0.11) *

plot(fit1)
text(fit1)

fit1$cptable[which.min(fit1$cptable[,"xerror"]),"CP"]

## [1] 0.08695652

#compare with base model

Prediction<-predict(fit1,te1,type="class")
#Update the prediction

te2$Purchase <- Prediction

Pred = factor(as.factor(te2$Murder))
Actual = factor(as.factor(te1$Murder))
table(te1$Murder)

## 
##  2.1  2.2  2.6  2.7  3.3  3.4  3.8  4.3  4.4  5.3  5.9    6  6.8  7.9  8.1  8.5 
##    1    2    1    1    1    1    1    1    1    1    1    2    1    1    1    1 
##    9  9.7   10 12.2 12.7 17.4 
##    2    1    1    1    1    1

cm1 = confusionMatrix(Actual,Pred)
cm1

## FP TP TN FN 
##  0 25  0  0 
## attr(,"negative")
## [1] "control"

diagnosticErrors(cm1)

##  acc sens spec  ppv  npv  lor 
##    1    1  NaN    1  NaN  NaN 
## attr(,"negative")
## [1] "control"

us-arrests.R

xholi

2021-01-05