Load the libraries needed for the analysis.
library(shiny)
library(ggvis)
library(reshape2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(splines)
library(gam)
## Loading required package: foreach
## Loaded gam 1.14
# library(ISLR)
library(tree)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
Read the data into R and attach the dataset so that its variables can be referenced by name in our analysis.
Detroit_Data_Ben_Gonzalez=read.csv("~/Datasets/Detroit_Data_Ben_Gonzalez.csv",header = T)
attach(Detroit_Data_Ben_Gonzalez)
Look at the variable names in our data.
names(Detroit_Data_Ben_Gonzalez)
## [1] "Year" "FTP" "UEMP" "MAN" "LIC" "GR" "CLEAR" "WM"
## [9] "NMAN" "GOV" "HE" "WE" "HOM" "ACC" "ASR"
Generate a summary of the data to better understand the distribution of each variable.
summary(Detroit_Data_Ben_Gonzalez)
## Year FTP UEMP MAN
## Min. :1961 Min. :260.4 Min. : 3.200 Min. :455.5
## 1st Qu.:1964 1st Qu.:269.8 1st Qu.: 3.900 1st Qu.:535.8
## Median :1967 Median :273.0 Median : 5.200 Median :569.3
## Mean :1967 Mean :304.5 Mean : 5.792 Mean :556.4
## 3rd Qu.:1970 3rd Qu.:341.4 3rd Qu.: 7.100 3rd Qu.:596.9
## Max. :1973 Max. :390.2 Max. :11.000 Max. :613.5
## LIC GR CLEAR WM
## Min. : 156.4 Min. : 180.5 Min. :58.90 Min. :359647
## 1st Qu.: 222.1 1st Qu.: 231.7 1st Qu.:73.90 1st Qu.:401518
## Median : 583.2 Median : 616.5 Median :87.40 Median :448267
## Mean : 537.5 Mean : 545.7 Mean :81.45 Mean :452508
## 3rd Qu.: 794.9 3rd Qu.: 750.4 3rd Qu.:91.00 3rd Qu.:500457
## Max. :1131.2 Max. :1029.8 Max. :94.40 Max. :558724
## NMAN GOV HE WE
## Min. :538.1 Min. :133.9 Min. :2.910 Min. :117.2
## 1st Qu.:591.0 1st Qu.:150.3 1st Qu.:3.230 1st Qu.:141.7
## Median :686.2 Median :187.5 Median :3.600 Median :157.2
## Mean :673.9 Mean :185.8 Mean :3.948 Mean :170.0
## 3rd Qu.:755.3 3rd Qu.:223.8 3rd Qu.:4.470 3rd Qu.:178.7
## Max. :819.8 Max. :230.9 Max. :5.760 Max. :258.1
## HOM ACC ASR
## Min. : 8.52 Min. :39.17 Min. :218.0
## 1st Qu.: 8.90 1st Qu.:44.17 1st Qu.:277.5
## Median :21.36 Median :45.80 Median :306.2
## Mean :25.13 Mean :46.92 Mean :311.9
## 3rd Qu.:37.39 3rd Qu.:50.62 3rd Qu.:323.0
## Max. :52.33 Max. :55.05 Max. :473.0
Our next step is to make some preliminary plots of the data. Plotting helps us understand the variables and the relationships among them; in this exercise all of our variables are quantitative.
## Plots for Detroit Dataset ##########################################
#Plot 1
plot(HOM,FTP,xlab = "Homicides",ylab = "Full Time Police", type = "o")
#Plot 2
plot(HOM,UEMP,xlab = "Homicides",ylab = "Unemployment Percentage", type = "o")
# #Plot 3
# plot(HOM,MAN,xlab = "Homicides",ylab = "Manufacturing Workers", type = "o")
# #Plot 4
# plot(HOM,LIC,xlab = "Homicides",ylab = "Handgun Licenses per 100,000", type = "o")
# #Plot 5
# plot(HOM,GR,xlab = "Homicides",ylab = "Gun Registrations", type = "o")
# #Plot 6
# plot(HOM,CLEAR,xlab = "Homicides",ylab = "% Homicides Cleared by Arrests",main="Homicides Cleared by Arrests", type = "o")
# #Plot 7
# plot(HOM,WM,xlab = "Homicides",ylab = "Number of White Males in Population", type = "o")
# #Plot 8
# plot(HOM,NMAN,xlab = "Homicides",ylab = "Number of Non-Manufacturing Workers", type = "o")
# #Plot 9
# plot(HOM,GOV,xlab = "Homicides",ylab = "Number of Government Workers", type = "o")
# #Plot 10
# plot(HOM,HE,xlab = "Homicides",ylab = "Hourly Earnings", type = "o")
# #Plot 11
# plot(HOM,WE,xlab = "Homicides",ylab = "Weekly Earnings", type = "o")
# #Plot 12
# plot(HOM,ACC,xlab = "Homicides",ylab = "Death Rate in Accidents", type = "o")
# #Plot 13
# plot(HOM,ASR,xlab = "Homicides",ylab = "Number of Assaults", type = "o")
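Rather than plotting each pair one at a time, a scatterplot matrix shows many pairwise relationships at once. A minimal sketch using base R's pairs() on a few columns of interest (the column subset is chosen here only for illustration):
# Scatterplot matrix of homicides against a handful of predictors;
# the subset keeps the matrix readable -- pairs() on the full data also works
pairs(Detroit_Data_Ben_Gonzalez[, c("HOM", "FTP", "UEMP", "LIC", "GR", "CLEAR")])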
Our next step uses a familiar and widely used statistical measure: correlation. Correlations help us quantify the strength of a relationship, but remember that correlation does not imply causation.
## Correlations for Detroit Dataset ##################################
cor(Detroit_Data_Ben_Gonzalez)
## Year FTP UEMP MAN LIC
## Year 1.00000000 0.9080331 -0.023582206 0.7216688 0.7662179
## FTP 0.90803311 1.0000000 0.292549420 0.4182908 0.5685049
## UEMP -0.02358221 0.2925494 1.000000000 -0.6517189 -0.1669183
## MAN 0.72166884 0.4182908 -0.651718924 1.0000000 0.6981753
## LIC 0.76621792 0.5685049 -0.166918327 0.6981753 1.0000000
## GR 0.83840429 0.7023668 -0.032874663 0.6277811 0.9037943
## CLEAR -0.91378188 -0.9742599 -0.305663130 -0.4292950 -0.5548205
## WM -0.99815522 -0.8839437 0.073117734 -0.7528185 -0.7836119
## NMAN 0.99456494 0.8818185 -0.038947215 0.7503422 0.7848187
## GOV 0.98744955 0.8793086 0.007512535 0.7097715 0.8036603
## HE 0.88841231 0.9365054 0.230809741 0.4536647 0.4216193
## WE 0.88091759 0.9222516 0.131281705 0.5023350 0.3909398
## HOM 0.96747586 0.9640610 0.210141746 0.5464227 0.7262884
## ACC 0.02239890 -0.3394657 -0.880520193 0.6503266 0.2183065
## ASR 0.69515696 0.8796429 0.497041538 0.1380582 0.3798743
## GR CLEAR WM NMAN GOV
## Year 0.83840429 -0.9137819 -0.99815522 0.99456494 0.987449550
## FTP 0.70236681 -0.9742599 -0.88394371 0.88181848 0.879308573
## UEMP -0.03287466 -0.3056631 0.07311773 -0.03894721 0.007512535
## MAN 0.62778114 -0.4292950 -0.75281848 0.75034215 0.709771513
## LIC 0.90379432 -0.5548205 -0.78361192 0.78481873 0.803660309
## GR 1.00000000 -0.6831611 -0.84583801 0.84329238 0.862428911
## CLEAR -0.68316109 1.0000000 0.89055542 -0.89159608 -0.893435582
## WM -0.84583801 0.8905554 1.00000000 -0.99426188 -0.988568755
## NMAN 0.84329238 -0.8915961 -0.99426188 1.00000000 0.990035942
## GOV 0.86242891 -0.8934356 -0.98856876 0.99003594 1.000000000
## HE 0.57352189 -0.9574269 -0.86675416 0.86950539 0.857288765
## WE 0.55589546 -0.9362843 -0.86032306 0.85556943 0.826426592
## HOM 0.81628727 -0.9684603 -0.95257479 0.95593474 0.958054560
## ACC 0.03183887 0.3230314 -0.07564225 0.05171332 0.023996537
## ASR 0.54109991 -0.8611881 -0.65310115 0.67455804 0.649769457
## HE WE HOM ACC ASR
## Year 0.8884123 0.8809176 0.9674759 0.02239890 0.6951570
## FTP 0.9365054 0.9222516 0.9640610 -0.33946567 0.8796429
## UEMP 0.2308097 0.1312817 0.2101417 -0.88052019 0.4970415
## MAN 0.4536647 0.5023350 0.5464227 0.65032657 0.1380582
## LIC 0.4216193 0.3909398 0.7262884 0.21830646 0.3798743
## GR 0.5735219 0.5558955 0.8162873 0.03183887 0.5410999
## CLEAR -0.9574269 -0.9362843 -0.9684603 0.32303145 -0.8611881
## WM -0.8667542 -0.8603231 -0.9525748 -0.07564225 -0.6531011
## NMAN 0.8695054 0.8555694 0.9559347 0.05171332 0.6745580
## GOV 0.8572888 0.8264266 0.9580546 0.02399654 0.6497695
## HE 1.0000000 0.9827635 0.9134063 -0.26198453 0.8163920
## WE 0.9827635 1.0000000 0.8881526 -0.19137607 0.8024924
## HOM 0.9134063 0.8881526 1.0000000 -0.20443320 0.8247943
## ACC -0.2619845 -0.1913761 -0.2044332 1.00000000 -0.6254742
## ASR 0.8163920 0.8024924 0.8247943 -0.62547418 1.0000000
cor(HOM,FTP)
## [1] 0.964061
cor(HOM,UEMP)
## [1] 0.2101417
cor(HOM,MAN)
## [1] 0.5464227
cor(HOM,LIC)
## [1] 0.7262884
cor(HOM,GR)
## [1] 0.8162873
cor(HOM,CLEAR)
## [1] -0.9684603
cor(HOM,WM)
## [1] -0.9525748
cor(HOM,NMAN)
## [1] 0.9559347
cor(HOM,GOV)
## [1] 0.9580546
cor(HOM,HE)
## [1] 0.9134063
cor(HOM,WE)
## [1] 0.8881526
cor(HOM,ACC)
## [1] -0.2044332
cor(HOM,ASR)
## [1] 0.8247943
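The one-at-a-time correlations above can also be pulled from the full correlation matrix in a single step; a minimal sketch:
# Correlation of every variable with HOM, extracted from the full matrix
round(cor(Detroit_Data_Ben_Gonzalez)[, "HOM"], 3)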
Simple Linear Regression
In this step we will use simple linear regression to look at the relationship between an independent and a dependent variable in the Detroit dataset. Our dependent variable is HOM (homicides) and our independent variable is FTP (full-time police). The formula HOM ~ FTP regresses HOM onto FTP. We then use the summary function to obtain the coefficients, p-values, and R-squared values.
#Simple Linear Regression for Detroit Dataset #########################
lm.fitdetroit1=lm(HOM~FTP,data = Detroit_Data_Ben_Gonzalez)
summary(lm.fitdetroit1)
##
## Call:
## lm(formula = HOM ~ FTP, data = Detroit_Data_Ben_Gonzalez)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.649 -2.244 -1.258 3.559 8.254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -77.63026 8.63094 -8.994 2.11e-06 ***
## FTP 0.33745 0.02804 12.035 1.13e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.547 on 11 degrees of freedom
## Multiple R-squared: 0.9294, Adjusted R-squared: 0.923
## F-statistic: 144.8 on 1 and 11 DF, p-value: 1.129e-07
summary(lm.fitdetroit1)$r.squared
## [1] 0.9294135
summary(lm.fitdetroit1)$adj.r.squared
## [1] 0.9229966
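A quick way to visualize this fit and check the standard regression diagnostics is to overlay the fitted line on the scatterplot and call the built-in lm diagnostic plots; a minimal sketch:
# Scatterplot of the data with the fitted regression line overlaid
plot(FTP, HOM, xlab = "Full Time Police", ylab = "Homicides")
abline(lm.fitdetroit1, col = "blue")

# Standard diagnostic plots (residuals vs. fitted, Q-Q, etc.)
par(mfrow = c(2, 2))
plot(lm.fitdetroit1)
par(mfrow = c(1, 1))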
Multiple Linear Regression
Now we will use multiple linear regression on our dataset, with one caveat due to the 'high-dimensional' nature of the data: we have more variables (columns) than observations (rows). In most cases we could use the '.' shorthand, lm(HOM~.,data=Detroit_Data_Ben_Gonzalez), to include all independent variables at once. Here that would leave no residual degrees of freedom, so R cannot estimate all of the coefficients, and it is an ill-advised approach for high-dimensional data in general. Instead we will build models by adding variables in smaller groups, as shown in the following code.
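A quick dimension check makes the problem concrete; a minimal sketch:
# The data have 13 rows (years 1961-1973) and 15 columns, so a saturated
# fit such as lm(HOM ~ ., data = Detroit_Data_Ben_Gonzalez) would leave
# no residual degrees of freedom
dim(Detroit_Data_Ben_Gonzalez)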
##Detroit Multiple Linear Regression####################################
detroitlm=lm(HOM~FTP+MAN+LIC+GR+WM+NMAN+GOV+HE+WE+ASR,data = Detroit_Data_Ben_Gonzalez)
summary(detroitlm)
##
## Call:
## lm(formula = HOM ~ FTP + MAN + LIC + GR + WM + NMAN + GOV + HE +
## WE + ASR, data = Detroit_Data_Ben_Gonzalez)
##
## Residuals:
## 1 2 3 4 5 6 7
## 0.134676 -0.119242 0.035018 0.156071 -0.527640 0.528712 -0.262829
## 8 9 10 11 12 13
## 0.140073 -0.115613 -0.167151 0.101167 -0.002143 0.098901
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.021e+02 4.986e+01 -4.054 0.0558 .
## FTP 2.933e-02 2.345e-02 1.251 0.3375
## MAN -1.150e-01 1.832e-02 -6.279 0.0244 *
## LIC 2.704e-02 4.221e-03 6.406 0.0235 *
## GR -4.355e-03 2.362e-03 -1.844 0.2066
## WM 2.385e-04 6.509e-05 3.664 0.0671 .
## NMAN 1.147e-01 3.824e-02 3.000 0.0954 .
## GOV 2.740e-01 9.392e-02 2.917 0.1001
## HE -9.155e+00 3.038e+00 -3.013 0.0947 .
## WE 4.393e-01 7.687e-02 5.714 0.0293 *
## ASR -1.437e-02 1.602e-02 -0.897 0.4644
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6188 on 2 degrees of freedom
## Multiple R-squared: 0.9998, Adjusted R-squared: 0.9986
## F-statistic: 841.1 on 10 and 2 DF, p-value: 0.001188
summary(detroitlm)$r.squared
## [1] 0.9997623
summary(detroitlm)$adj.r.squared
## [1] 0.9985736
detroitlm2=lm(HOM~NMAN+GOV+HE+WE+ACC+ASR,data = Detroit_Data_Ben_Gonzalez)
summary(detroitlm2)
##
## Call:
## lm(formula = HOM ~ NMAN + GOV + HE + WE + ACC + ASR, data = Detroit_Data_Ben_Gonzalez)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.50164 -1.16426 0.04465 1.18885 2.52940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -49.87507 19.82725 -2.515 0.0456 *
## NMAN -0.11003 0.10192 -1.080 0.3218
## GOV 0.56418 0.20446 2.759 0.0329 *
## HE -2.85925 6.45585 -0.443 0.6734
## WE 0.07516 0.13574 0.554 0.5998
## ACC 0.23777 0.58404 0.407 0.6980
## ASR 0.10164 0.05156 1.971 0.0962 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.179 on 6 degrees of freedom
## Multiple R-squared: 0.9912, Adjusted R-squared: 0.9823
## F-statistic: 112.1 on 6 and 6 DF, p-value: 6.829e-06
summary(detroitlm2)$r.squared
## [1] 0.9911547
summary(detroitlm2)$adj.r.squared
## [1] 0.9823094
detroitlm3=lm(HOM~LIC+FTP+GR+HE+WE+NMAN+ASR,data = Detroit_Data_Ben_Gonzalez)
summary(detroitlm3)
##
## Call:
## lm(formula = HOM ~ LIC + FTP + GR + HE + WE + NMAN + ASR, data = Detroit_Data_Ben_Gonzalez)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## 1.04955 0.21107 -1.31036 -1.22069 0.95249 1.12427 -0.33710 0.79344
## 9 10 11 12 13
## -2.99030 -0.30376 2.06887 0.07462 -0.11211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -49.524781 20.527821 -2.413 0.0607 .
## LIC 0.015993 0.008576 1.865 0.1212
## FTP 0.106338 0.057866 1.838 0.1255
## GR 0.002685 0.005468 0.491 0.6441
## HE 8.094982 4.292677 1.886 0.1180
## WE -0.027788 0.088143 -0.315 0.7653
## NMAN -0.001665 0.049411 -0.034 0.9744
## ASR 0.019545 0.020002 0.977 0.3734
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.03 on 5 degrees of freedom
## Multiple R-squared: 0.9936, Adjusted R-squared: 0.9847
## F-statistic: 111 on 7 and 5 DF, p-value: 3.512e-05
summary(detroitlm3)$r.squared
## [1] 0.993606
summary(detroitlm3)$adj.r.squared
## [1] 0.9846544
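Before moving to automated subset selection, it is convenient to line up the three hand-picked models by adjusted R-squared; a minimal sketch:
# Compare the three multiple regression fits by adjusted R-squared
c(model1 = summary(detroitlm)$adj.r.squared,
  model2 = summary(detroitlm2)$adj.r.squared,
  model3 = summary(detroitlm3)$adj.r.squared)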
Next we will use forward and backward subset selection, along with an exhaustive (best subset) search. These methods add or remove variables one at a time, or enumerate every subset of a given size, so we can see which variable combinations are most suitable for our model.
Forward Selection Method:
## Detroit Subset Model Selection #########################
library(leaps)
#Model 1
detroitmodelfit.1=regsubsets(HOM~.,data = Detroit_Data_Ben_Gonzalez,method = "forward")
summary(detroitmodelfit.1)
## Subset selection object
## Call: regsubsets.formula(HOM ~ ., data = Detroit_Data_Ben_Gonzalez,
## method = "forward")
## 14 Variables (and intercept)
## Forced in Forced out
## Year FALSE FALSE
## FTP FALSE FALSE
## UEMP FALSE FALSE
## MAN FALSE FALSE
## LIC FALSE FALSE
## GR FALSE FALSE
## CLEAR FALSE FALSE
## WM FALSE FALSE
## NMAN FALSE FALSE
## GOV FALSE FALSE
## HE FALSE FALSE
## WE FALSE FALSE
## ACC FALSE FALSE
## ASR FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## Year FTP UEMP MAN LIC GR CLEAR WM NMAN GOV HE WE ACC ASR
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " "*" " " "*" " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " "*" " " "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) " " "*" " " " " "*" " " "*" " " " " " " "*" " " " " " "
## 5 ( 1 ) " " "*" "*" " " "*" " " "*" " " " " " " "*" " " " " " "
## 6 ( 1 ) " " "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " " "
## 7 ( 1 ) "*" "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " "*"
detroit.summary1=summary(detroitmodelfit.1)
summary(detroit.summary1)
## Length Class Mode
## which 120 -none- logical
## rsq 8 -none- numeric
## rss 8 -none- numeric
## adjr2 8 -none- numeric
## cp 8 -none- numeric
## bic 8 -none- numeric
## outmat 112 -none- character
## obj 28 regsubsets list
Exhaustive (Best Subset) Selection Method, allowing up to 12 variables:
#Model 2
detroitmodelfit.2=regsubsets(HOM~.,data = Detroit_Data_Ben_Gonzalez,nvmax = 12)
detroit.summary2=summary(detroitmodelfit.2)
summary(detroitmodelfit.2)
## Subset selection object
## Call: regsubsets.formula(HOM ~ ., data = Detroit_Data_Ben_Gonzalez,
## nvmax = 12)
## 14 Variables (and intercept)
## Forced in Forced out
## Year FALSE FALSE
## FTP FALSE FALSE
## UEMP FALSE FALSE
## MAN FALSE FALSE
## LIC FALSE FALSE
## GR FALSE FALSE
## CLEAR FALSE FALSE
## WM FALSE FALSE
## NMAN FALSE FALSE
## GOV FALSE FALSE
## HE FALSE FALSE
## WE FALSE FALSE
## ACC FALSE FALSE
## ASR FALSE FALSE
## 1 subsets of each size up to 12
## Selection Algorithm: exhaustive
## Year FTP UEMP MAN LIC GR CLEAR WM NMAN GOV HE WE ACC ASR
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " "*" " " " " " " " " " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " "*" " " "*" " " " " " " " " " " " " "*" " " " "
## 4 ( 1 ) " " " " "*" " " "*" " " "*" " " " " " " " " "*" " " " "
## 5 ( 1 ) " " "*" "*" " " "*" " " " " " " " " "*" " " "*" " " " "
## 6 ( 1 ) "*" "*" "*" " " "*" " " "*" " " " " " " " " "*" " " " "
## 7 ( 1 ) " " " " "*" " " "*" "*" " " " " " " "*" "*" "*" "*" " "
## 8 ( 1 ) " " " " "*" "*" "*" "*" " " " " " " "*" "*" "*" "*" " "
## 9 ( 1 ) " " "*" "*" "*" "*" "*" " " " " " " "*" "*" "*" " " "*"
## 10 ( 1 ) "*" "*" "*" "*" " " "*" "*" "*" "*" " " "*" " " " " "*"
## 11 ( 1 ) "*" "*" "*" "*" "*" "*" " " " " "*" "*" "*" "*" "*" " "
## 12 ( 1 ) "*" "*" "*" "*" "*" "*" "*" "*" "*" "*" " " "*" "*" " "
Backward Selection Method:
#Model 3
detroitmodelfit.3=regsubsets(HOM~.,data = Detroit_Data_Ben_Gonzalez,method = "backward")
detroit.summary3=summary(detroitmodelfit.3)
summary(detroitmodelfit.3)
## Subset selection object
## Call: regsubsets.formula(HOM ~ ., data = Detroit_Data_Ben_Gonzalez,
## method = "backward")
## 14 Variables (and intercept)
## Forced in Forced out
## Year FALSE FALSE
## FTP FALSE FALSE
## UEMP FALSE FALSE
## MAN FALSE FALSE
## LIC FALSE FALSE
## GR FALSE FALSE
## CLEAR FALSE FALSE
## WM FALSE FALSE
## NMAN FALSE FALSE
## GOV FALSE FALSE
## HE FALSE FALSE
## WE FALSE FALSE
## ACC FALSE FALSE
## ASR FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: backward
## Year FTP UEMP MAN LIC GR CLEAR WM NMAN GOV HE WE ACC ASR
## 1 ( 1 ) "*" " " " " " " " " " " " " " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " " "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " "*" " " " " "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " "*" "*" " " " " "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " "*" "*" " " " " "*" " " "*" " " " " " " " "
## 6 ( 1 ) "*" " " " " "*" "*" " " " " "*" " " "*" " " "*" " " " "
## 7 ( 1 ) "*" " " " " "*" "*" " " " " "*" " " "*" "*" "*" " " " "
## 8 ( 1 ) "*" " " " " "*" "*" "*" " " "*" " " "*" "*" "*" " " " "
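The forward and backward searches do not pick the same variables at every size, so it can help to compare their BIC curves before settling on a model; a minimal sketch:
# Overlay the BIC curves from the forward and backward fits
plot(detroit.summary1$bic, type = "o", col = "blue",
     xlab = "Number of Variables", ylab = "BIC")
lines(detroit.summary3$bic, type = "o", col = "red")
legend("topright", legend = c("Forward", "Backward"),
       col = c("blue", "red"), lty = 1)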
In this step we will examine our first model (the forward-selection fit) and its results.
#Detroit Model 1
detroit.summary1=summary(detroitmodelfit.1)
detroit.summary1
## Subset selection object
## Call: regsubsets.formula(HOM ~ ., data = Detroit_Data_Ben_Gonzalez,
## method = "forward")
## 14 Variables (and intercept)
## Forced in Forced out
## Year FALSE FALSE
## FTP FALSE FALSE
## UEMP FALSE FALSE
## MAN FALSE FALSE
## LIC FALSE FALSE
## GR FALSE FALSE
## CLEAR FALSE FALSE
## WM FALSE FALSE
## NMAN FALSE FALSE
## GOV FALSE FALSE
## HE FALSE FALSE
## WE FALSE FALSE
## ACC FALSE FALSE
## ASR FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## Year FTP UEMP MAN LIC GR CLEAR WM NMAN GOV HE WE ACC ASR
## 1 ( 1 ) " " " " " " " " " " " " "*" " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " "*" " " "*" " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " "*" " " "*" " " " " " " "*" " " " " " "
## 4 ( 1 ) " " "*" " " " " "*" " " "*" " " " " " " "*" " " " " " "
## 5 ( 1 ) " " "*" "*" " " "*" " " "*" " " " " " " "*" " " " " " "
## 6 ( 1 ) " " "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " " "
## 7 ( 1 ) "*" "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " " "
## 8 ( 1 ) "*" "*" "*" " " "*" " " "*" " " " " " " "*" "*" " " "*"
detroit.summary1$rsq
## [1] 0.9379154 0.9895042 0.9934288 0.9958723 0.9974545 0.9992607 0.9995836
## [8] 0.9996476
detroit.summary1$adjr2
## [1] 0.9322714 0.9874051 0.9912384 0.9938084 0.9956363 0.9985215 0.9990006
## [8] 0.9989428
detroit.summary1$rss
## [1] 200.023392 33.815100 21.171091 13.298648 8.201044 2.381723
## [7] 1.341586 1.135364
which.max(detroit.summary1$adjr2)
## [1] 7
detroit.summary1$bic
## [1] -31.00045 -51.54335 -55.06593 -58.54564 -62.26490 -75.77364 -80.67033
## [8] -80.27506
which.min(detroit.summary1$bic)
## [1] 7
plot(detroit.summary1$bic,xlab = "Number of Variables",ylab = "Detroit Model 1 BIC",type = "o")
points(7,detroit.summary1$bic[7],col="blue",cex=2,pch=20)
which.max(detroit.summary1$adjr2)
## [1] 7
plot(detroit.summary1$adjr2,xlab = "Number of Variables",ylab = "Detroit Model 1 Adjusted R-Squared",type = "o")
points(7,detroit.summary1$adjr2[7],col="blue",cex=2,pch=20)
which.min(detroit.summary1$rss)
## [1] 8
plot(detroit.summary1$rss,xlab = "Number of Variables",ylab = "Detroit Model 1 RSS",type = "o")
points(8,detroit.summary1$rss[8],col="blue",cex=2,pch=20)
coef(detroitmodelfit.1,8)
## (Intercept) Year FTP UEMP LIC
## -2.041112e+03 1.035964e+00 2.935713e-02 1.003702e+00 1.584314e-02
## CLEAR HE WE ASR
## -1.946830e-01 -4.522011e-02 1.135722e-01 6.277388e-03
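Since both BIC and adjusted R-squared point to the 7-variable model, its coefficients can be extracted in the same way; a minimal sketch:
# Coefficients of the 7-variable model selected by BIC / adjusted R-squared
coef(detroitmodelfit.1, which.min(detroit.summary1$bic))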
In this step we move beyond strictly linear terms to see what effect more flexible fits have on our dataset. The 'splines' and 'gam' packages allow us to include spline and smooth terms (a generalized additive model) and to further evaluate our data in this manner.
## Detroit GAM (Generalized Additive Model) Models ############################
library(splines)
#install.packages("gam")
library(gam)
## Detroit GAM 1
gam.detroitmodel11=lm(HOM~s(UEMP,2)+s(LIC,1)+WE+s(HE,2)+ASR+s(NMAN,1)+s(MAN,1),data = Detroit_Data_Ben_Gonzalez)
anova(gam.detroitmodel11)
## Analysis of Variance Table
##
## Response: HOM
## Df Sum Sq Mean Sq F value Pr(>F)
## s(UEMP, 2) 1 142.27 142.27 168.7399 4.820e-05 ***
## s(LIC, 1) 1 1921.12 1921.12 2278.5095 7.623e-08 ***
## WE 1 1151.60 1151.60 1365.8342 2.732e-07 ***
## s(HE, 2) 1 0.11 0.11 0.1360 0.7274
## ASR 1 0.02 0.02 0.0190 0.8956
## s(NMAN, 1) 1 0.26 0.26 0.3122 0.6004
## s(MAN, 1) 1 2.18 2.18 2.5908 0.1684
## Residuals 5 4.22 0.84
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gam.detroitmodel11)
##
## Call:
## lm(formula = HOM ~ s(UEMP, 2) + s(LIC, 1) + WE + s(HE, 2) + ASR +
## s(NMAN, 1) + s(MAN, 1), data = Detroit_Data_Ben_Gonzalez)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## -0.54830 0.95624 0.13462 -0.56161 0.13595 0.43332 -1.29799 0.04715
## 9 10 11 12 13
## 0.17535 0.76758 0.13711 -0.36752 -0.01191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -32.657710 10.906308 -2.994 0.03029 *
## s(UEMP, 2) 0.999276 0.368150 2.714 0.04206 *
## s(LIC, 1) 0.022671 0.003572 6.348 0.00143 **
## WE 0.247812 0.054239 4.569 0.00601 **
## s(HE, 2) -1.369245 2.610692 -0.524 0.62236
## ASR -0.002747 0.010348 -0.265 0.80128
## s(NMAN, 1) 0.043101 0.028955 1.489 0.19678
## s(MAN, 1) -0.045099 0.028019 -1.610 0.16840
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9182 on 5 degrees of freedom
## Multiple R-squared: 0.9987, Adjusted R-squared: 0.9969
## F-statistic: 545.2 on 7 and 5 DF, p-value: 6.714e-07
plot(gam.detroitmodel11,se=TRUE,col="blue")
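In the lm() call above each s() term enters as a single linear coefficient (one row per term in the summary). To have the smooths estimated as actual splines, the same style of formula can be passed to gam() from the gam package. A minimal sketch with only a few terms, to preserve degrees of freedom in such a small dataset (the term choice and degrees of freedom here are illustrative, not the author's model):
# Fit a true generalized additive model so the s() smooths are estimated
# as splines; with only 13 observations we keep the model small
gam.detroit.smooth = gam(HOM ~ s(UEMP, 2) + s(LIC, 2) + WE,
                         data = Detroit_Data_Ben_Gonzalez)
summary(gam.detroit.smooth)

# Plot each fitted term with standard-error bands
par(mfrow = c(1, 3))
plot(gam.detroit.smooth, se = TRUE, col = "blue")
par(mfrow = c(1, 1))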
##Interactive Detroit Data Sets###############################################################
library(ggvis)
Detroit_Data_Ben_Gonzalez %>% ggvis(~FTP, ~HOM, fill=~UEMP) %>% layer_points() %>% add_tooltip(function(df) df$UEMP)
This concludes the tutorial on analyzing quantitative variables.