library(readxl)
#A)
crime <- read_xlsx("crime - 1987.xlsx")
#The data is brought in with labels and saves into the local environment
#B)
#The summary command provides the descriptive statistics for each column
summary(crime)
## county year crmrte prbarr
## Min. : 1.0 Min. :87 Min. :0.005533 Min. : 9.277
## 1st Qu.: 51.5 1st Qu.:87 1st Qu.:0.020604 1st Qu.: 20.495
## Median :103.0 Median :87 Median :0.030002 Median : 27.146
## Mean :100.6 Mean :87 Mean :0.033510 Mean : 29.524
## 3rd Qu.:150.5 3rd Qu.:87 3rd Qu.:0.040249 3rd Qu.: 34.487
## Max. :197.0 Max. :87 Max. :0.098966 Max. :109.091
## prbconv prbpris avgsen polpc
## Min. : 6.838 Min. :15.00 Min. : 5.380 Min. :0.000746
## 1st Qu.: 34.422 1st Qu.:36.42 1st Qu.: 7.375 1st Qu.:0.001238
## Median : 45.170 Median :42.22 Median : 9.110 Median :0.001489
## Mean : 55.086 Mean :41.06 Mean : 9.689 Mean :0.001708
## 3rd Qu.: 58.513 3rd Qu.:45.76 3rd Qu.:11.465 3rd Qu.:0.001885
## Max. :212.121 Max. :60.00 Max. :20.700 Max. :0.009054
## density taxpc west central
## Min. :0.2034 Min. : 25.69 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5472 1st Qu.: 30.73 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.9792 Median : 34.92 Median :0.0000 Median :0.0000
## Mean :1.4379 Mean : 38.16 Mean :0.2333 Mean :0.3778
## 3rd Qu.:1.5693 3rd Qu.: 41.01 3rd Qu.:0.0000 3rd Qu.:1.0000
## Max. :8.8277 Max. :119.76 Max. :1.0000 Max. :1.0000
## east urban pctmin80 pctymle
## Min. :0.0000 Min. :0.00000 Min. : 1.284 Min. :0.06216
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:10.024 1st Qu.:0.07437
## Median :0.0000 Median :0.00000 Median :24.852 Median :0.07770
## Mean :0.3889 Mean :0.08889 Mean :25.713 Mean :0.08403
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:38.183 3rd Qu.:0.08352
## Max. :1.0000 Max. :1.00000 Max. :64.348 Max. :0.24871
## wcon wtuc wtrd wfir
## Min. :193.6 Min. :187.6 Min. :154.2 Min. :170.9
## 1st Qu.:250.8 1st Qu.:374.3 1st Qu.:190.7 1st Qu.:285.6
## Median :281.2 Median :404.8 Median :203.0 Median :317.1
## Mean :285.4 Mean :410.9 Mean :210.9 Mean :321.6
## 3rd Qu.:315.0 3rd Qu.:440.7 3rd Qu.:224.3 3rd Qu.:342.6
## Max. :436.8 Max. :613.2 Max. :354.7 Max. :509.5
## wser wmfg wfed wsta
## Min. : 133.0 Min. :157.4 Min. :326.1 Min. :258.3
## 1st Qu.: 229.3 1st Qu.:288.6 1st Qu.:398.8 1st Qu.:329.3
## Median : 253.1 Median :321.1 Median :448.9 Median :358.4
## Mean : 275.3 Mean :336.0 Mean :442.6 Mean :357.7
## 3rd Qu.: 277.6 3rd Qu.:359.9 3rd Qu.:478.3 3rd Qu.:383.2
## Max. :2177.1 Max. :646.9 Max. :598.0 Max. :499.6
## wloc
## Min. :239.2
## 1st Qu.:297.2
## Median :307.6
## Mean :312.3
## 3rd Qu.:328.8
## Max. :388.1
#The data we are looking at data from 1987
#Most of the individuals do not live in an urban area
#There is large range in policy percentage between .0007 and .009
#There is also a large diversity in the level of population density within the sample
#This will export the summary tables into a csv
write.csv(summary(crime), "sumCrime.csv")
#C)
hist(crime$crmrte)

#the sample of crimes per person is skewed right because it has a long tail to the right
hist(crime$prbarr)

#Here we see that there are some outliers that are high risk for crime, the rest is relatively normally distributed
#D)
library(corrplot)
## corrplot 0.84 loaded
corrplot(cor(crime[,c('crmrte', 'prbarr', 'prbconv', 'prbpris', 'avgsen','polpc')]), method = "square")

#This plot shows us that our crime rate is negatively correlated with
#probability of arrest, and probability of conviction.
#This says that the higher the probability for arrest/conviction the lower the crime rate
#The plot also says there is a positive correlation between the crime rate and the policy percentage
#this is more likely caused by the increased policing leading to more crimes being documented
#Correlation Matrix the plot is based on
cor(crime[,c('crmrte', 'prbarr', 'prbconv', 'prbpris', 'avgsen','polpc')])
## crmrte prbarr prbconv prbpris avgsen
## crmrte 1.00000000 -0.39528491 -0.38596829 0.04799337 0.01979641
## prbarr -0.39528491 1.00000000 -0.05579620 0.04583325 0.17869424
## prbconv -0.38596829 -0.05579620 1.00000000 0.01102265 0.15585232
## prbpris 0.04799337 0.04583325 0.01102265 1.00000000 -0.09468087
## avgsen 0.01979641 0.17869424 0.15585232 -0.09468087 1.00000000
## polpc 0.16727644 0.42593153 0.17188550 0.04813433 0.48816731
## polpc
## crmrte 0.16727644
## prbarr 0.42593153
## prbconv 0.17188550
## prbpris 0.04813433
## avgsen 0.48816731
## polpc 1.00000000
#This writes the correlations into a .csv file
write.csv(cor(crime[,c('crmrte', 'prbarr', 'prbconv', 'prbpris', 'avgsen','polpc')]), "corrMatrix.csv")
#E)
crime.lm <- lm(crmrte ~ prbarr, data = crime)
summary(crime.lm)
##
## Call:
## lm(formula = crmrte ~ prbarr, data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.028061 -0.010754 -0.003170 0.006823 0.057531
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0495201 0.0043716 11.328 < 2e-16 ***
## prbarr -0.0005423 0.0001343 -4.037 0.000115 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01745 on 88 degrees of freedom
## Multiple R-squared: 0.1563, Adjusted R-squared: 0.1467
## F-statistic: 16.3 on 1 and 88 DF, p-value: 0.0001153
#The coefficient shows that a 1 point increase in the probability of
#being arrested for an offense decrease the crimerate by .00054
#This implies that the higher the probability of arrest for a crime
#has little practical significance because 1 point change would lead
#to a fraction of a percentage point change in the crime rate.
#F)
#As I briefly discussed above there is statistical significance in
#the confidence of the coefficient. The coefficient is not practically significant
#A 1 point increase in the arrest rate lowers the crime rate by .00054.
firstQ <- .0206
median <- .0300
intercept <- crime.lm$coefficients[[1]]
b1 <- crime.lm$coefficients[[2]]
firstQx1 <- firstQ/b1 + intercept
medianx1 <- median/b1 + intercept
(medianx1 - firstQx1 - intercept)/b1
## [1] 32056.71
#This shows that to move from the 1st quartile to the median you would
#need to change the probability of arrest by over 30,000 which is not
#possible let alone practical.
#G)
(.05 - .04 - intercept)/b1
## [1] 72.87767
#this says taht we will need to have a 72 point increase in the probability
#of arrest.
#H)
yhat <- crime.lm$fitted.values
crime.residuals <- crime.lm$residuals
plot(yhat, crime.residuals)

#I think that the heteroskedasticity-robust estimator for the
#standard error was justified because it appears the plot shows a
#constant variance
#I)
library(estout)
model1 <- lm(crmrte ~ prbarr, data = crime)
model2 <- lm(crmrte ~ prbarr + density, data = crime)
model3 <- lm(crmrte ~ prbarr + density + urban, data = crime)
model4 <- lm(crmrte ~ prbarr + urban, data = crime)
eststo(model1)
eststo(model2)
eststo(model3)
eststo(model4)
esttab(table = "sideways", file = "sideways",csv = T)
#The R output is messy, for a cleaner view I have attached the .xls
esttab(table = "sideways")
## \def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}
## \begin{sideways}[htbp]
## \centering
##
## \begin{tabular}{l*{4}{c}}
## \hline\hline
## &\multicolumn{1}{c}{crmrte} &\multicolumn{1}{c}{crmrte} &\multicolumn{1}{c}{crmrte} &\multicolumn{1}{c}{crmrte} \\
## \hline
## (Intercept) &0.05\sym{***} &0.029\sym{***} &0.03\sym{***} &0.042\sym{***} \\
## &(0.004) &(0.004) &(0.004) &(0.004) \\
## prbarr &-0.001\sym{***} &0\sym{**} &0\sym{***} &0\sym{***} \\
## &(0) &(0) &(0) &(0) \\
## density & &0.008\sym{***} &0.008\sym{***} & \\
## & &(0.001) &(0.002) & \\
## urban & & &0.005 &0.037\sym{***} \\
## & & &(0.008) &(0.005) \\
## \hline
## $R^2$ &0.156 &0.564 &0.566 &0.453 \\
## $adj.R^2$ &0.147 &0.554 &0.551 &0.44 \\
## $N$ &\multicolumn{1}{c}{90} &\multicolumn{1}{c}{90} &\multicolumn{1}{c}{90} &\multicolumn{1}{c}{90} \\
## \hline\hline
## \multicolumn{5}{l}{\footnotesize Standard errors in parentheses}\\
## \multicolumn{5}{l}{\footnotesize $^{*}$ (p $\le$ 0.1), $^{**}$ (p $\le$ 0.05), $^{***}$ (p $\le$ 0.01)}\\
## \end{tabular}
## \end{sideways}
#J)
#Particularly looking at the R^2 stat there is a large jump from
#the first model to the second. This indicates that there is a much
#the second model does a much better job of explaining the variance.
#We can conclude the first model has significant OVB. Therefore we can
#make no conclusions about causality. The sign of the bias would be
#negative. This shows that the crime rate is not well explained by
#the probability of arrest. Model2 is more likely to be better because
#it includes an omitted variable.
#K)
#Out of models2/3/4 I would use model2. Model3 has a higher R^2 but
#we seet hat there is a 100% increase in SE, which implies a high
#correlation or multicolinearity between the density and urban variables
#The fourth model with just urban has a high coefficient and similar
#R^2. For practical purposes I would use the urban model as it has the
#highest coefficient while remaining statistically significant.
#You may use model2 if you had more data on the population density than
#whether or not the population is "Urban"
library(car)
## Loading required package: carData
vif(model2) #shows little correlation
## prbarr density
## 1.099288 1.099288
vif(model3) #shows higher correlation between density/urban
## prbarr density urban
## 1.104681 3.236428 3.078002
vif(model4) #shows little correlation between prbarr and urban
## prbarr urban
## 1.045477 1.045477
#L)
modelL <- lm(crmrte ~prbarr + density + west + central + east, data = crime)
summary(modelL)
##
## Call:
## lm(formula = crmrte ~ prbarr + density + west + central + east,
## data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.020021 -0.005909 -0.001186 0.004252 0.045020
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.463e-02 3.646e-03 9.498 5.29e-15 ***
## prbarr -2.278e-04 9.198e-05 -2.477 0.015232 *
## density 8.766e-03 8.731e-04 10.041 4.25e-16 ***
## west -1.422e-02 3.141e-03 -4.527 1.93e-05 ***
## central -9.742e-03 2.857e-03 -3.410 0.000996 ***
## east NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01129 on 85 degrees of freedom
## Multiple R-squared: 0.659, Adjusted R-squared: 0.643
## F-statistic: 41.07 on 4 and 85 DF, p-value: < 2.2e-16
#the problem is that there is perfect colinearity, and we need to drop
#a dummy variable.
#M)
modelM <- lm(crmrte ~prbarr + density + west + central, data = crime)
summary(modelM)
##
## Call:
## lm(formula = crmrte ~ prbarr + density + west + central, data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.020021 -0.005909 -0.001186 0.004252 0.045020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.463e-02 3.646e-03 9.498 5.29e-15 ***
## prbarr -2.278e-04 9.198e-05 -2.477 0.015232 *
## density 8.766e-03 8.731e-04 10.041 4.25e-16 ***
## west -1.422e-02 3.141e-03 -4.527 1.93e-05 ***
## central -9.742e-03 2.857e-03 -3.410 0.000996 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01129 on 85 degrees of freedom
## Multiple R-squared: 0.659, Adjusted R-squared: 0.643
## F-statistic: 41.07 on 4 and 85 DF, p-value: < 2.2e-16
#N)
modelN <- lm(crmrte ~prbarr + density + west + east, data = crime)
summary(modelN)
##
## Call:
## lm(formula = crmrte ~ prbarr + density + west + east, data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.020021 -0.005909 -0.001186 0.004252 0.045020
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.489e-02 3.938e-03 6.320 1.16e-08 ***
## prbarr -2.278e-04 9.198e-05 -2.477 0.015232 *
## density 8.766e-03 8.731e-04 10.041 4.25e-16 ***
## west -4.478e-03 3.325e-03 -1.347 0.181699
## east 9.742e-03 2.857e-03 3.410 0.000996 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01129 on 85 degrees of freedom
## Multiple R-squared: 0.659, Adjusted R-squared: 0.643
## F-statistic: 41.07 on 4 and 85 DF, p-value: < 2.2e-16
#In both models east/central have the same estimate/se and p-val
#West changes, as it changes its role in the dummy variable matrix.
#O Re-Scaling
crime$density_1000 <- crime$density/1000
#This shows the population density in thousands rather than individuals
modelO <- lm(crmrte ~ prbarr + density_1000,data = crime)
summary(modelO)
##
## Call:
## lm(formula = crmrte ~ prbarr + density_1000, data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.019344 -0.008184 -0.002470 0.005411 0.051334
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0294066 0.0038684 7.602 3.17e-11 ***
## prbarr -0.0002663 0.0001018 -2.615 0.0105 *
## density_1000 8.3208023 0.9226151 9.019 4.11e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01261 on 87 degrees of freedom
## Multiple R-squared: 0.5639, Adjusted R-squared: 0.5539
## F-statistic: 56.26 on 2 and 87 DF, p-value: < 2.2e-16
summary(model2)
##
## Call:
## lm(formula = crmrte ~ prbarr + density, data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.019344 -0.008184 -0.002470 0.005411 0.051334
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0294066 0.0038684 7.602 3.17e-11 ***
## prbarr -0.0002663 0.0001018 -2.615 0.0105 *
## density 0.0083208 0.0009226 9.019 4.11e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01261 on 87 degrees of freedom
## Multiple R-squared: 0.5639, Adjusted R-squared: 0.5539
## F-statistic: 56.26 on 2 and 87 DF, p-value: < 2.2e-16
#the models are the same mathematically but the interpration is more
#practical, as the coefficient represents a change in a 1000 of population.
#We note that the standard error is also higher because this is rescaling.
#This model shows us for every change in a 1000 of density there is a 8
#point increase in crime rate.