Load the libraries needed for the analysis.
library(shiny)
library(ggvis)
library(reshape2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(splines)
library(gam)
## Loading required package: foreach
## Loaded gam 1.14
# library(ISLR)
library(tree)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
Read the data into R and attach the dataset so that its variables can be referenced by name in our analysis.
Detroit_Data_Ben_Gonzalez=read.csv("~/Datasets/Detroit_Data_Ben_Gonzalez.csv",header = T)
attach(Detroit_Data_Ben_Gonzalez)
Look at the variable names in our data.
names(Detroit_Data_Ben_Gonzalez)
## [1] "Year" "FTP" "UEMP" "MAN" "LIC" "GR" "CLEAR" "WM"
## [9] "NMAN" "GOV" "HE" "WE" "HOM" "ACC" "ASR"
Generate a summary of the data to better understand the distribution of each variable.
summary(Detroit_Data_Ben_Gonzalez)
## Year FTP UEMP MAN
## Min. :1961 Min. :260.4 Min. : 3.200 Min. :455.5
## 1st Qu.:1964 1st Qu.:269.8 1st Qu.: 3.900 1st Qu.:535.8
## Median :1967 Median :273.0 Median : 5.200 Median :569.3
## Mean :1967 Mean :304.5 Mean : 5.792 Mean :556.4
## 3rd Qu.:1970 3rd Qu.:341.4 3rd Qu.: 7.100 3rd Qu.:596.9
## Max. :1973 Max. :390.2 Max. :11.000 Max. :613.5
## LIC GR CLEAR WM
## Min. : 156.4 Min. : 180.5 Min. :58.90 Min. :359647
## 1st Qu.: 222.1 1st Qu.: 231.7 1st Qu.:73.90 1st Qu.:401518
## Median : 583.2 Median : 616.5 Median :87.40 Median :448267
## Mean : 537.5 Mean : 545.7 Mean :81.45 Mean :452508
## 3rd Qu.: 794.9 3rd Qu.: 750.4 3rd Qu.:91.00 3rd Qu.:500457
## Max. :1131.2 Max. :1029.8 Max. :94.40 Max. :558724
## NMAN GOV HE WE
## Min. :538.1 Min. :133.9 Min. :2.910 Min. :117.2
## 1st Qu.:591.0 1st Qu.:150.3 1st Qu.:3.230 1st Qu.:141.7
## Median :686.2 Median :187.5 Median :3.600 Median :157.2
## Mean :673.9 Mean :185.8 Mean :3.948 Mean :170.0
## 3rd Qu.:755.3 3rd Qu.:223.8 3rd Qu.:4.470 3rd Qu.:178.7
## Max. :819.8 Max. :230.9 Max. :5.760 Max. :258.1
## HOM ACC ASR
## Min. : 8.52 Min. :39.17 Min. :218.0
## 1st Qu.: 8.90 1st Qu.:44.17 1st Qu.:277.5
## Median :21.36 Median :45.80 Median :306.2
## Mean :25.13 Mean :46.92 Mean :311.9
## 3rd Qu.:37.39 3rd Qu.:50.62 3rd Qu.:323.0
## Max. :52.33 Max. :55.05 Max. :473.0
Our next step is to make some preliminary plots of the data. Plotting helps us understand the variables and the relationships among them; in this exercise all of our variables are quantitative.
## Plots for Detroit Dataset ##########################################
#Plot 1
plot(HOM,FTP,xlab = "Homicides",ylab = "Full Time Police", type = "o")
#Plot 2
plot(HOM,UEMP,xlab = "Homicides",ylab = "Unemployment Percentage", type = "o")
# #Plot 3
# plot(HOM,MAN,xlab = "Homicides",ylab = "Manufacturing Workers", type = "o")
# #Plot 4
# plot(HOM,LIC,xlab = "Homicides",ylab = "Handgun Licenses per 100,000", type = "o")
# #Plot 5
# plot(HOM,GR,xlab = "Homicides",ylab = "Gun Registrations", type = "o")
# #Plot 6
# plot(HOM,CLEAR,xlab = "Homicides",ylab = "% Homicides Cleared by Arrests",main="Homicides Cleared by Arrests", type = "o")
# #Plot 7
# plot(HOM,WM,xlab = "Homicides",ylab = "Number of White Males in Population", type = "o")
# #Plot 8
# plot(HOM,NMAN,xlab = "Homicides",ylab = "Number of Non-Manufacturing Workers", type = "o")
# #Plot 9
# plot(HOM,GOV,xlab = "Homicides",ylab = "Number of Government Workers", type = "o")
# #Plot 10
# plot(HOM,HE,xlab = "Homicides",ylab = "Hourly Earnings", type = "o")
# #Plot 11
# plot(HOM,WE,xlab = "Homicides",ylab = "Weekly Earnings", type = "o")
# #Plot 12
# plot(HOM,ACC,xlab = "Homicides",ylab = "Death Rate in Accidents", type = "o")
# #Plot 13
# plot(HOM,ASR,xlab = "Homicides",ylab = "Number of Assaults", type = "o")
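Rather than plotting each pair one at a time, a scatterplot matrix shows many pairwise relationships at once. A minimal sketch using base R's pairs() on a few columns of interest (the column subset is chosen here only for illustration):
# Scatterplot matrix of homicides against a handful of predictors;
# the subset keeps the matrix readable -- pairs() on the full data also works
pairs(Detroit_Data_Ben_Gonzalez[, c("HOM", "FTP", "UEMP", "LIC", "GR", "CLEAR")])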
Our next step uses a familiar and widely used statistical measure: correlation. Correlations help us quantify the strength of a relationship, but remember that correlation does not imply causation.
## Correlations for Detroit Dataset ##################################
cor(Detroit_Data_Ben_Gonzalez)
## Year FTP UEMP MAN LIC
## Year 1.00000000 0.9080331 -0.023582206 0.7216688 0.7662179
## FTP 0.90803311 1.0000000 0.292549420 0.4182908 0.5685049
## UEMP -0.02358221 0.2925494 1.000000000 -0.6517189 -0.1669183
## MAN 0.72166884 0.4182908 -0.651718924 1.0000000 0.6981753
## LIC 0.76621792 0.5685049 -0.166918327 0.6981753 1.0000000
## GR 0.83840429 0.7023668 -0.032874663 0.6277811 0.9037943
## CLEAR -0.91378188 -0.9742599 -0.305663130 -0.4292950 -0.5548205
## WM -0.99815522 -0.8839437 0.073117734 -0.7528185 -0.7836119
## NMAN 0.99456494 0.8818185 -0.038947215 0.7503422 0.7848187
## GOV 0.98744955 0.8793086 0.007512535 0.7097715 0.8036603
## HE 0.88841231 0.9365054 0.230809741 0.4536647 0.4216193
## WE 0.88091759 0.9222516 0.131281705 0.5023350 0.3909398
## HOM 0.96747586 0.9640610 0.210141746 0.5464227 0.7262884
## ACC 0.02239890 -0.3394657 -0.880520193 0.6503266 0.2183065
## ASR 0.69515696 0.8796429 0.497041538 0.1380582 0.3798743
## GR CLEAR WM NMAN GOV
## Year 0.83840429 -0.9137819 -0.99815522 0.99456494 0.987449550
## FTP 0.70236681 -0.9742599 -0.88394371 0.88181848 0.879308573
## UEMP -0.03287466 -0.3056631 0.07311773 -0.03894721 0.007512535
## MAN 0.62778114 -0.4292950 -0.75281848 0.75034215 0.709771513
## LIC 0.90379432 -0.5548205 -0.78361192 0.78481873 0.803660309
## GR 1.00000000 -0.6831611 -0.84583801 0.84329238 0.862428911
## CLEAR -0.68316109 1.0000000 0.89055542 -0.89159608 -0.893435582
## WM -0.84583801 0.8905554 1.00000000 -0.99426188 -0.988568755
## NMAN 0.84329238 -0.8915961 -0.99426188 1.00000000 0.990035942
## GOV 0.86242891 -0.8934356 -0.98856876 0.99003594 1.000000000
## HE 0.57352189 -0.9574269 -0.86675416 0.86950539 0.857288765
## WE 0.55589546 -0.9362843 -0.86032306 0.85556943 0.826426592
## HOM 0.81628727 -0.9684603 -0.95257479 0.95593474 0.958054560
## ACC 0.03183887 0.3230314 -0.07564225 0.05171332 0.023996537
## ASR 0.54109991 -0.8611881 -0.65310115 0.67455804 0.649769457
## HE WE HOM ACC ASR
## Year 0.8884123 0.8809176 0.9674759 0.02239890 0.6951570
## FTP 0.9365054 0.9222516 0.9640610 -0.33946567 0.8796429
## UEMP 0.2308097 0.1312817 0.2101417 -0.88052019 0.4970415
## MAN 0.4536647 0.5023350 0.5464227 0.65032657 0.1380582
## LIC 0.4216193 0.3909398 0.7262884 0.21830646 0.3798743
## GR 0.5735219 0.5558955 0.8162873 0.03183887 0.5410999
## CLEAR -0.9574269 -0.9362843 -0.9684603 0.32303145 -0.8611881
## WM -0.8667542 -0.8603231 -0.9525748 -0.07564225 -0.6531011
## NMAN 0.8695054 0.8555694 0.9559347 0.05171332 0.6745580
## GOV 0.8572888 0.8264266 0.9580546 0.02399654 0.6497695
## HE 1.0000000 0.9827635 0.9134063 -0.26198453 0.8163920
## WE 0.9827635 1.0000000 0.8881526 -0.19137607 0.8024924
## HOM 0.9134063 0.8881526 1.0000000 -0.20443320 0.8247943
## ACC -0.2619845 -0.1913761 -0.2044332 1.00000000 -0.6254742
## ASR 0.8163920 0.8024924 0.8247943 -0.62547418 1.0000000
cor(HOM,FTP)
## [1] 0.964061
cor(HOM,UEMP)
## [1] 0.2101417
cor(HOM,MAN)
## [1] 0.5464227
cor(HOM,LIC)
## [1] 0.7262884
cor(HOM,GR)
## [1] 0.8162873
cor(HOM,CLEAR)
## [1] -0.9684603
cor(HOM,WM)
## [1] -0.9525748
cor(HOM,NMAN)
## [1] 0.9559347
cor(HOM,GOV)
## [1] 0.9580546
cor(HOM,HE)
## [1] 0.9134063
cor(HOM,WE)
## [1] 0.8881526
cor(HOM,ACC)
## [1] -0.2044332
cor(HOM,ASR)
## [1] 0.8247943
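The one-at-a-time correlations above can also be pulled from the full correlation matrix in a single step; a minimal sketch:
# Correlation of every variable with HOM, extracted from the full matrix
round(cor(Detroit_Data_Ben_Gonzalez)[, "HOM"], 3)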
Simple Linear Regression
In this step we will use simple linear regression to look at the relationship between an independent and a dependent variable in the Detroit dataset. Our dependent variable is HOM (homicides) and our independent variable is FTP (full-time police). The formula HOM ~ FTP regresses HOM onto FTP. We then use the summary function to obtain the coefficients, p-values, and R-squared values.
#Simple Linear Regression for Detroit Dataset #########################
lm.fitdetroit1=lm(HOM~FTP,data = Detroit_Data_Ben_Gonzalez)
summary(lm.fitdetroit1)
##
## Call:
## lm(formula = HOM ~ FTP, data = Detroit_Data_Ben_Gonzalez)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.649 -2.244 -1.258 3.559 8.254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -77.63026 8.63094 -8.994 2.11e-06 ***
## FTP 0.33745 0.02804 12.035 1.13e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.547 on 11 degrees of freedom
## Multiple R-squared: 0.9294, Adjusted R-squared: 0.923
## F-statistic: 144.8 on 1 and 11 DF, p-value: 1.129e-07
summary(lm.fitdetroit1)$r.squared
## [1] 0.9294135
summary(lm.fitdetroit1)$adj.r.squared
## [1] 0.9229966
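A quick way to visualize this fit and check the standard regression diagnostics is to overlay the fitted line on the scatterplot and call the built-in lm diagnostic plots; a minimal sketch:
# Scatterplot of the data with the fitted regression line overlaid
plot(FTP, HOM, xlab = "Full Time Police", ylab = "Homicides")
abline(lm.fitdetroit1, col = "blue")

# Standard diagnostic plots (residuals vs. fitted, Q-Q, etc.)
par(mfrow = c(2, 2))
plot(lm.fitdetroit1)
par(mfrow = c(1, 1))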
Multiple Linear Regression
Now we will use multiple linear regression on our dataset, with one caveat due to the 'high-dimensional' nature of the data: we have more variables (columns) than observations (rows). In most cases we could use the '.' shorthand, lm(HOM~.,data=Detroit_Data_Ben_Gonzalez), to include all independent variables at once. Here that would leave no residual degrees of freedom, so R cannot estimate all of the coefficients, and it is an ill-advised approach for high-dimensional data in general. Instead we will build models by adding variables in smaller groups, as shown in the following code.
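A quick dimension check makes the problem concrete; a minimal sketch:
# The data have 13 rows (years 1961-1973) and 15 columns, so a saturated
# fit such as lm(HOM ~ ., data = Detroit_Data_Ben_Gonzalez) would leave
# no residual degrees of freedom
dim(Detroit_Data_Ben_Gonzalez)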
##Detroit Multiple Linear Regression####################################
detroitlm=lm(HOM~FTP+MAN+LIC+GR+WM+NMAN+GOV+HE+WE+ASR,data = Detroit_Data_Ben_Gonzalez)
summary(detroitlm)
##
## Call:
## lm(formula = HOM ~ FTP + MAN + LIC + GR + WM + NMAN + GOV + HE +
## WE + ASR, data = Detroit_Data_Ben_Gonzalez)
##
## Residuals:
## 1 2 3 4 5 6 7
## 0.134676 -0.119242 0.035018 0.156071 -0.527640 0.528712 -0.262829
## 8 9 10 11 12 13
## 0.140073 -0.115613 -0.167151 0.101167 -0.002143 0.098901
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.021e+02 4.986e+01 -4.054 0.0558 .
## FTP 2.933e-02 2.345e-02 1.251 0.3375
## MAN -1.150e-01 1.832e-02 -6.279 0.0244 *
## LIC 2.704e-02 4.221e-03 6.406 0.0235 *
## GR -4.355e-03 2.362e-03 -1.844 0.2066
## WM 2.385e-04 6.509e-05 3.664 0.0671 .
## NMAN 1.147e-01 3.824e-02 3.000 0.0954 .
## GOV 2.740e-01 9.392e-02 2.917 0.1001
## HE -9.155e+00 3.038e+00 -3.013 0.0947 .
## WE 4.393e-01 7.687e-02 5.714 0.0293 *
## ASR -1.437e-02 1.602e-02 -0.897 0.4644
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6188 on 2 degrees of freedom
## Multiple R-squared: 0.9998, Adjusted R-squared: 0.9986
## F-statistic: 841.1 on 10 and 2 DF, p-value: 0.001188
summary(detroitlm)$r.squared
## [1] 0.9997623
summary(detroitlm)$adj.r.squared
## [1] 0.9985736
detroitlm2=lm(HOM~NMAN+GOV+HE+WE+ACC+ASR,data = Detroit_Data_Ben_Gonzalez)
summary(detroitlm2)
##
## Call:
## lm(formula = HOM ~ NMAN + GOV + HE + WE + ACC + ASR, data = Detroit_Data_Ben_Gonzalez)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.50164 -1.16426 0.04465 1.18885 2.52940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -49.87507 19.82725 -2.515 0.0456 *
## NMAN -0.11003 0.10192 -1.080 0.3218
## GOV 0.56418 0.20446 2.759 0.0329 *
## HE -2.85925 6.45585 -0.443 0.6734
## WE 0.07516 0.13574 0.554 0.5998
## ACC 0.23777 0.58404 0.407 0.6980
## ASR 0.10164 0.05156 1.971 0.0962 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.179 on 6 degrees of freedom
## Multiple R-squared: 0.9912, Adjusted R-squared: 0.9823
## F-statistic: 112.1 on 6 and 6 DF, p-value: 6.829e-06
summary(detroitlm2)$r.squared
## [1] 0.9911547
summary(detroitlm2)$adj.r.squared
## [1] 0.9823094
detroitlm3=lm(HOM~LIC+FTP+GR+HE+WE+NMAN+ASR,data = Detroit_Data_Ben_Gonzalez)
summary(detroitlm3)
##
## Call:
## lm(formula = HOM ~ LIC + FTP + GR + HE + WE + NMAN + ASR, data = Detroit_Data_Ben_Gonzalez)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## 1.04955 0.21107 -1.31036 -1.22069 0.95249 1.12427 -0.33710 0.79344
## 9 10 11 12 13
## -2.99030 -0.30376 2.06887 0.07462 -0.11211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -49.524781 20.527821 -2.413 0.0607 .
## LIC 0.015993 0.008576 1.865 0.1212
## FTP 0.106338 0.057866 1.838 0.1255
## GR 0.002685 0.005468 0.491 0.6441
## HE 8.094982 4.292677 1.886 0.1180
## WE -0.027788 0.088143 -0.315 0.7653
## NMAN -0.001665 0.049411 -0.034 0.9744
## ASR 0.019545 0.020002 0.977 0.3734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.03 on 5 degrees of freedom
## Multiple R-squared: 0.9936, Adjusted R-squared: 0.9847
## F-statistic: 111 on 7 and 5 DF, p-value: 3.512e-05
summary(detroitlm3)$r.squared
## [1] 0.993606
summary(detroitlm3)$adj.r.squared
## [1] 0.9846544
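Before moving to automated subset selection, it is convenient to line up the three hand-picked models by adjusted R-squared; a minimal sketch:
# Compare the three multiple regression fits by adjusted R-squared
c(model1 = summary(detroitlm)$adj.r.squared,
  model2 = summary(detroitlm2)$adj.r.squared,
  model3 = summary(detroitlm3)$adj.r.squared)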
Next we will use forward and backward subset selection, along with an exhaustive (best subset) search. These methods add or remove variables one at a time, or enumerate every subset of a given size, so we can see which variable combinations are most suitable for our model.
Forward Selection Method:
## Detroit Subset Model Selection #########################
library(leaps)
#Model 1
detroitmodelfit.1=regsubsets(HOM~.,data = Detroit_Data_Ben_Gonzalez,method = "forward")
summary(detroitmodelfit.1)
## Subset selection object
## Call: regsubsets.formula(HOM ~ ., data = Detroit_Data_Ben_Gonzalez,
## method = "forward")
## 14 Variables (and intercept)
## Forced in Forced out
## Year FALSE FALSE
## FTP FALSE FALSE
## UEMP FALSE FALSE
## MAN FALSE FALSE
## LIC FALSE FALSE
## GR FALSE FALSE
## CLEAR FALSE FALSE
## WM FALSE FALSE
## NMAN FALSE FALSE
## GOV FALSE FALSE
## HE FALSE FALSE
## WE FALSE FALSE
## ACC FALSE FALSE
## ASR FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## Year FTP UEMP MAN LIC GR CLEAR WM NMAN GOV HE WE ACC ASR
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " "*" " " "*" " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " "*" " " "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) " " "*" " " " " "*" " " "*" " " " " " " "*" " " " " " "
## 5 ( 1 ) " " "*" "*" " " "*" " " "*" " " " " " " "*" " " " " " "
## 6 ( 1 ) " " "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " " "
## 7 ( 1 ) "*" "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " "*"
detroit.summary1=summary(detroitmodelfit.1)
summary(detroit.summary1)
## Length Class Mode
## which 120 -none- logical
## rsq 8 -none- numeric
## rss 8 -none- numeric
## adjr2 8 -none- numeric
## cp 8 -none- numeric
## bic 8 -none- numeric
## outmat 112 -none- character
## obj 28 regsubsets list
Exhaustive (Best Subset) Selection Method, allowing up to 12 variables:
#Model 2
detroitmodelfit.2=regsubsets(HOM~.,data = Detroit_Data_Ben_Gonzalez,nvmax = 12)
detroit.summary2=summary(detroitmodelfit.2)
summary(detroitmodelfit.2)
## Subset selection object
## Call: regsubsets.formula(HOM ~ ., data = Detroit_Data_Ben_Gonzalez,
## nvmax = 12)
## 14 Variables (and intercept)
## Forced in Forced out
## Year FALSE FALSE
## FTP FALSE FALSE
## UEMP FALSE FALSE
## MAN FALSE FALSE
## LIC FALSE FALSE
## GR FALSE FALSE
## CLEAR FALSE FALSE
## WM FALSE FALSE
## NMAN FALSE FALSE
## GOV FALSE FALSE
## HE FALSE FALSE
## WE FALSE FALSE
## ACC FALSE FALSE
## ASR FALSE FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: exhaustive
## Year FTP UEMP MAN LIC GR CLEAR WM NMAN GOV HE WE ACC ASR
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " "*" " " " " " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " "*" " " "*" " " " " " " " " " " " " "*" " " " "
## 4 ( 1 ) " " " " "*" " " "*" " " "*" " " " " " " " " "*" " " " "
## 5 ( 1 ) " " "*" "*" " " "*" " " " " " " " " "*" " " "*" " " " "
## 6 ( 1 ) "*" "*" "*" " " "*" " " "*" " " " " " " " " "*" " " " "
## 7 ( 1 ) " " " " "*" " " "*" "*" " " " " " " "*" "*" "*" "*" " "
## 8 ( 1 ) " " " " "*" "*" "*" "*" " " " " " " "*" "*" "*" "*" " "
## 9 ( 1 ) " " "*" "*" "*" "*" "*" " " " " " " "*" "*" "*" " " "*"
## 10 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" "*" " " "*" " " " " "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*" " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" " "
Backward Selection Method:
#Model 3
detroitmodelfit.3=regsubsets(HOM~.,data = Detroit_Data_Ben_Gonzalez,method = "backward")
detroit.summary3=summary(detroitmodelfit.3)
summary(detroitmodelfit.3)
## Subset selection object
## Call: regsubsets.formula(HOM ~ ., data = Detroit_Data_Ben_Gonzalez,
## method = "backward")
## 14 Variables (and intercept)
## Forced in Forced out
## Year FALSE FALSE
## FTP FALSE FALSE
## UEMP FALSE FALSE
## MAN FALSE FALSE
## LIC FALSE FALSE
## GR FALSE FALSE
## CLEAR FALSE FALSE
## WM FALSE FALSE
## NMAN FALSE FALSE
## GOV FALSE FALSE
## HE FALSE FALSE
## WE FALSE FALSE
## ACC FALSE FALSE
## ASR FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: backward
## Year FTP UEMP MAN LIC GR CLEAR WM NMAN GOV HE WE ACC ASR
## 1 ( 1 ) "*" " " " " " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " "*" " " "*" " " " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " "*" " " "*" " " "*" " " " "
## 7 ( 1 ) "*" " " " " "*" "*" " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" " " " " "*" "*" "*" " " "*" " " "*" "*" "*" " " " "
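The forward and backward searches do not pick the same variables at every size, so it can help to compare their BIC curves before settling on a model; a minimal sketch:
# Overlay the BIC curves from the forward and backward fits
plot(detroit.summary1$bic, type = "o", col = "blue",
     xlab = "Number of Variables", ylab = "BIC")
lines(detroit.summary3$bic, type = "o", col = "red")
legend("topright", legend = c("Forward", "Backward"),
       col = c("blue", "red"), lty = 1)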
In this step we will examine our first model (the forward-selection fit) and its results.
#Detroit Model 1
detroit.summary1=summary(detroitmodelfit.1)
detroit.summary1
## Subset selection object
## Call: regsubsets.formula(HOM ~ ., data = Detroit_Data_Ben_Gonzalez,
## method = "forward")
## 14 Variables (and intercept)
## Forced in Forced out
## Year FALSE FALSE
## FTP FALSE FALSE
## UEMP FALSE FALSE
## MAN FALSE FALSE
## LIC FALSE FALSE
## GR FALSE FALSE
## CLEAR FALSE FALSE
## WM FALSE FALSE
## NMAN FALSE FALSE
## GOV FALSE FALSE
## HE FALSE FALSE
## WE FALSE FALSE
## ACC FALSE FALSE
## ASR FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## Year FTP UEMP MAN LIC GR CLEAR WM NMAN GOV HE WE ACC ASR
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " "*" " " "*" " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " "*" " " "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) " " "*" " " " " "*" " " "*" " " " " " " "*" " " " " " "
## 5 ( 1 ) " " "*" "*" " " "*" " " "*" " " " " " " "*" " " " " " "
## 6 ( 1 ) " " "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " " "
## 7 ( 1 ) "*" "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " "*"
detroit.summary1$rsq
## [1] 0.9379154 0.9895042 0.9934288 0.9958723 0.9974545 0.9992607 0.9995836
## [8] 0.9996476
detroit.summary1$adjr2
## [1] 0.9322714 0.9874051 0.9912384 0.9938084 0.9956363 0.9985215 0.9990006
## [8] 0.9989428
detroit.summary1$rss
## [1] 200.023392 33.815100 21.171091 13.298648 8.201044 2.381723
## [7] 1.341586 1.135364
which.max(detroit.summary1$adjr2)
## [1] 7
detroit.summary1$bic
## [1] -31.00045 -51.54335 -55.06593 -58.54564 -62.26490 -75.77364 -80.67033
## [8] -80.27506
which.min(detroit.summary1$bic)
## [1] 7
plot(detroit.summary1$bic,xlab = "Number of Variables",ylab = "Detroit Model 1 BIC",type = "o")
points(7,detroit.summary1$bic[7],col="blue",cex=2,pch=20)
which.max(detroit.summary1$adjr2)
## [1] 7
plot(detroit.summary1$adjr2,xlab = "Number of Variables",ylab = "Detroit Model 1 Adjusted R-Squared",type = "o")
points(7,detroit.summary1$adjr2[7],col="blue",cex=2,pch=20)
which.min(detroit.summary1$rss)
## [1] 8
plot(detroit.summary1$rss,xlab = "Number of Variables",ylab = "Detroit Model 1 RSS",type = "o")
points(8,detroit.summary1$rss[8],col="blue",cex=2,pch=20)
coef(detroitmodelfit.1,8)
## (Intercept) Year FTP UEMP LIC
## -2.041112e+03 1.035964e+00 2.935713e-02 1.003702e+00 1.584314e-02
## CLEAR HE WE ASR
## -1.946830e-01 -4.522011e-02 1.135722e-01 6.277388e-03
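Since both BIC and adjusted R-squared point to the 7-variable model, its coefficients can be extracted in the same way; a minimal sketch:
# Coefficients of the 7-variable model selected by BIC / adjusted R-squared
coef(detroitmodelfit.1, which.min(detroit.summary1$bic))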
In this step we move beyond strictly linear terms to see what effect more flexible fits have on our dataset. The 'splines' and 'gam' packages allow us to include spline and smooth terms (a generalized additive model) and to further evaluate our data in this manner.
## Detroit GAM (Generalized Additive Model) Models ############################
library(splines)
#install.packages("gam")
library(gam)
## Detroit GAM 1
gam.detroitmodel11=lm(HOM~s(UEMP,2)+s(LIC,1)+WE+s(HE,2)+ASR+s(NMAN,1)+s(MAN,1),data = Detroit_Data_Ben_Gonzalez)
anova(gam.detroitmodel11)
## Analysis of Variance Table
##
## Response: HOM
## Df Sum Sq Mean Sq F value Pr(>F)
## s(UEMP, 2) 1 142.27 142.27 168.7399 4.820e-05 ***
## s(LIC, 1) 1 1921.12 1921.12 2278.5095 7.623e-08 ***
## WE 1 1151.60 1151.60 1365.8342 2.732e-07 ***
## s(HE, 2) 1 0.11 0.11 0.1360 0.7274
## ASR 1 0.02 0.02 0.0190 0.8956
## s(NMAN, 1) 1 0.26 0.26 0.3122 0.6004
## s(MAN, 1) 1 2.18 2.18 2.5908 0.1684
## Residuals 5 4.22 0.84
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gam.detroitmodel11)
##
## Call:
## lm(formula = HOM ~ s(UEMP, 2) + s(LIC, 1) + WE + s(HE, 2) + ASR +
## s(NMAN, 1) + s(MAN, 1), data = Detroit_Data_Ben_Gonzalez)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## -0.54830 0.95624 0.13462 -0.56161 0.13595 0.43332 -1.29799 0.04715
## 9 10 11 12 13
## 0.17535 0.76758 0.13711 -0.36752 -0.01191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -32.657710 10.906308 -2.994 0.03029 *
## s(UEMP, 2) 0.999276 0.368150 2.714 0.04206 *
## s(LIC, 1) 0.022671 0.003572 6.348 0.00143 **
## WE 0.247812 0.054239 4.569 0.00601 **
## s(HE, 2) -1.369245 2.610692 -0.524 0.62236
## ASR -0.002747 0.010348 -0.265 0.80128
## s(NMAN, 1) 0.043101 0.028955 1.489 0.19678
## s(MAN, 1) -0.045099 0.028019 -1.610 0.16840
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9182 on 5 degrees of freedom
## Multiple R-squared: 0.9987, Adjusted R-squared: 0.9969
## F-statistic: 545.2 on 7 and 5 DF, p-value: 6.714e-07
plot(gam.detroitmodel11,se=TRUE,col="blue")
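In the lm() call above each s() term enters as a single linear coefficient (one row per term in the summary). To have the smooths estimated as actual splines, the same style of formula can be passed to gam() from the gam package. A minimal sketch with only a few terms, to preserve degrees of freedom in such a small dataset (the term choice and degrees of freedom here are illustrative, not the author's model):
# Fit a true generalized additive model so the s() smooths are estimated
# as splines; with only 13 observations we keep the model small
gam.detroit.smooth = gam(HOM ~ s(UEMP, 2) + s(LIC, 2) + WE,
                         data = Detroit_Data_Ben_Gonzalez)
summary(gam.detroit.smooth)

# Plot each fitted term with standard-error bands
par(mfrow = c(1, 3))
plot(gam.detroit.smooth, se = TRUE, col = "blue")
par(mfrow = c(1, 1))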
##Interactive Detroit Data Sets###############################################################
library(ggvis)
Detroit_Data_Ben_Gonzalez %>% ggvis(~FTP, ~HOM, fill=~UEMP) %>% layer_points() %>% add_tooltip(function(df) df$UEMP)
This concludes the tutorial on analyzing quantitative variables.