library(readxl)

#A)
crime <- read_xlsx("crime - 1987.xlsx")
#The data is brought in with labels and saves into the local environment

#B)
#The summary command provides the descriptive statistics for each column
summary(crime)
##      county           year        crmrte             prbarr       
##  Min.   :  1.0   Min.   :87   Min.   :0.005533   Min.   :  9.277  
##  1st Qu.: 51.5   1st Qu.:87   1st Qu.:0.020604   1st Qu.: 20.495  
##  Median :103.0   Median :87   Median :0.030002   Median : 27.146  
##  Mean   :100.6   Mean   :87   Mean   :0.033510   Mean   : 29.524  
##  3rd Qu.:150.5   3rd Qu.:87   3rd Qu.:0.040249   3rd Qu.: 34.487  
##  Max.   :197.0   Max.   :87   Max.   :0.098966   Max.   :109.091  
##     prbconv           prbpris          avgsen           polpc         
##  Min.   :  6.838   Min.   :15.00   Min.   : 5.380   Min.   :0.000746  
##  1st Qu.: 34.422   1st Qu.:36.42   1st Qu.: 7.375   1st Qu.:0.001238  
##  Median : 45.170   Median :42.22   Median : 9.110   Median :0.001489  
##  Mean   : 55.086   Mean   :41.06   Mean   : 9.689   Mean   :0.001708  
##  3rd Qu.: 58.513   3rd Qu.:45.76   3rd Qu.:11.465   3rd Qu.:0.001885  
##  Max.   :212.121   Max.   :60.00   Max.   :20.700   Max.   :0.009054  
##     density           taxpc             west           central      
##  Min.   :0.2034   Min.   : 25.69   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.5472   1st Qu.: 30.73   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.9792   Median : 34.92   Median :0.0000   Median :0.0000  
##  Mean   :1.4379   Mean   : 38.16   Mean   :0.2333   Mean   :0.3778  
##  3rd Qu.:1.5693   3rd Qu.: 41.01   3rd Qu.:0.0000   3rd Qu.:1.0000  
##  Max.   :8.8277   Max.   :119.76   Max.   :1.0000   Max.   :1.0000  
##       east            urban            pctmin80         pctymle       
##  Min.   :0.0000   Min.   :0.00000   Min.   : 1.284   Min.   :0.06216  
##  1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:10.024   1st Qu.:0.07437  
##  Median :0.0000   Median :0.00000   Median :24.852   Median :0.07770  
##  Mean   :0.3889   Mean   :0.08889   Mean   :25.713   Mean   :0.08403  
##  3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:38.183   3rd Qu.:0.08352  
##  Max.   :1.0000   Max.   :1.00000   Max.   :64.348   Max.   :0.24871  
##       wcon            wtuc            wtrd            wfir      
##  Min.   :193.6   Min.   :187.6   Min.   :154.2   Min.   :170.9  
##  1st Qu.:250.8   1st Qu.:374.3   1st Qu.:190.7   1st Qu.:285.6  
##  Median :281.2   Median :404.8   Median :203.0   Median :317.1  
##  Mean   :285.4   Mean   :410.9   Mean   :210.9   Mean   :321.6  
##  3rd Qu.:315.0   3rd Qu.:440.7   3rd Qu.:224.3   3rd Qu.:342.6  
##  Max.   :436.8   Max.   :613.2   Max.   :354.7   Max.   :509.5  
##       wser             wmfg            wfed            wsta      
##  Min.   : 133.0   Min.   :157.4   Min.   :326.1   Min.   :258.3  
##  1st Qu.: 229.3   1st Qu.:288.6   1st Qu.:398.8   1st Qu.:329.3  
##  Median : 253.1   Median :321.1   Median :448.9   Median :358.4  
##  Mean   : 275.3   Mean   :336.0   Mean   :442.6   Mean   :357.7  
##  3rd Qu.: 277.6   3rd Qu.:359.9   3rd Qu.:478.3   3rd Qu.:383.2  
##  Max.   :2177.1   Max.   :646.9   Max.   :598.0   Max.   :499.6  
##       wloc      
##  Min.   :239.2  
##  1st Qu.:297.2  
##  Median :307.6  
##  Mean   :312.3  
##  3rd Qu.:328.8  
##  Max.   :388.1
#The data we are looking at data from 1987
#Most of the individuals do not live in an urban area
#There is large range in policy percentage between .0007 and .009
#There is also a large diversity in the level of population density within the sample

#This will export the summary tables into a csv
write.csv(summary(crime), "sumCrime.csv")

#C)
hist(crime$crmrte)

#the sample of crimes per person is skewed right because it has a long tail to the right

hist(crime$prbarr)

#Here we see that there are some outliers that are high risk for crime, the rest is relatively normally distributed

#D)
library(corrplot)
## corrplot 0.84 loaded
corrplot(cor(crime[,c('crmrte', 'prbarr', 'prbconv', 'prbpris', 'avgsen','polpc')]), method = "square")

#This plot shows us that our crime rate is negatively correlated with 
#probability of arrest, and probability of conviction.
#This says that the higher the probability for arrest/conviction the lower the crime rate
#The plot also says there is a positive correlation between the crime rate and the policy percentage
#this is more likely caused by the increased policing leading to more crimes being documented

#Correlation Matrix the plot is based on
cor(crime[,c('crmrte', 'prbarr', 'prbconv', 'prbpris', 'avgsen','polpc')])
##              crmrte      prbarr     prbconv     prbpris      avgsen
## crmrte   1.00000000 -0.39528491 -0.38596829  0.04799337  0.01979641
## prbarr  -0.39528491  1.00000000 -0.05579620  0.04583325  0.17869424
## prbconv -0.38596829 -0.05579620  1.00000000  0.01102265  0.15585232
## prbpris  0.04799337  0.04583325  0.01102265  1.00000000 -0.09468087
## avgsen   0.01979641  0.17869424  0.15585232 -0.09468087  1.00000000
## polpc    0.16727644  0.42593153  0.17188550  0.04813433  0.48816731
##              polpc
## crmrte  0.16727644
## prbarr  0.42593153
## prbconv 0.17188550
## prbpris 0.04813433
## avgsen  0.48816731
## polpc   1.00000000
#This writes the correlations into a .csv file
write.csv(cor(crime[,c('crmrte', 'prbarr', 'prbconv', 'prbpris', 'avgsen','polpc')]), "corrMatrix.csv")

#E)
crime.lm <- lm(crmrte ~ prbarr, data = crime)
summary(crime.lm)
## 
## Call:
## lm(formula = crmrte ~ prbarr, data = crime)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.028061 -0.010754 -0.003170  0.006823  0.057531 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0495201  0.0043716  11.328  < 2e-16 ***
## prbarr      -0.0005423  0.0001343  -4.037 0.000115 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01745 on 88 degrees of freedom
## Multiple R-squared:  0.1563, Adjusted R-squared:  0.1467 
## F-statistic:  16.3 on 1 and 88 DF,  p-value: 0.0001153
#The coefficient shows that a 1 point increase in the probability of 
#being arrested for an offense decrease the crimerate by .00054
#This implies that the higher the probability of arrest for a crime
#has little  practical significance because 1 point change would lead
#to a fraction of a percentage point change in the crime rate.

#F)
#As I briefly discussed above there is statistical significance in 
#the confidence of the coefficient. The coefficient is not practically significant
#A 1 point increase in the arrest rate lowers the crime rate by .00054.
firstQ <- .0206
median <- .0300
intercept  <- crime.lm$coefficients[[1]]
b1 <- crime.lm$coefficients[[2]]
firstQx1 <- firstQ/b1 + intercept
medianx1 <- median/b1 + intercept
(medianx1 - firstQx1 - intercept)/b1
## [1] 32056.71
#This shows that  to move from the 1st quartile to the median you would 
#need to change the probability of arrest by over 30,000 which is not
#possible let alone practical.

#G)
(.05 - .04 - intercept)/b1
## [1] 72.87767
#this says taht we will need to have a 72 point increase in the probability
#of arrest.

#H)
yhat  <- crime.lm$fitted.values
crime.residuals <- crime.lm$residuals
plot(yhat, crime.residuals)

#I think that the heteroskedasticity-robust estimator for the 
#standard error was justified because it appears the  plot shows a
#constant variance

#I)
library(estout)
model1  <- lm(crmrte ~ prbarr, data  = crime)
model2  <- lm(crmrte ~ prbarr + density, data  = crime)
model3  <- lm(crmrte ~ prbarr + density + urban, data  = crime)
model4  <- lm(crmrte ~ prbarr + urban, data  = crime)
eststo(model1)
eststo(model2)
eststo(model3)
eststo(model4)

esttab(table  = "sideways",  file = "sideways",csv = T)
#The R output is messy, for  a cleaner view I have attached the .xls
esttab(table = "sideways")
## \def\sym#1{\ifmmode^{#1}\else\(^{#1}\)\fi}
## \begin{sideways}[htbp]
## \centering
## 
## \begin{tabular}{l*{4}{c}}
## \hline\hline
##   &\multicolumn{1}{c}{crmrte} &\multicolumn{1}{c}{crmrte} &\multicolumn{1}{c}{crmrte} &\multicolumn{1}{c}{crmrte} \\
## \hline
## (Intercept)      &0.05\sym{***}      &0.029\sym{***}         &0.03\sym{***}      &0.042\sym{***} \\
##          &(0.004)        &(0.004)        &(0.004)        &(0.004) \\
## prbarr       &-0.001\sym{***}        &0\sym{**}      &0\sym{***}         &0\sym{***} \\
##          &(0)        &(0)        &(0)        &(0) \\
## density      &       &0.008\sym{***}         &0.008\sym{***}         & \\
##          &       &(0.001)        &(0.002)        & \\
## urban        &       &       &0.005      &0.037\sym{***} \\
##          &       &       &(0.008)        &(0.005) \\
## \hline
## $R^2$        &0.156      &0.564      &0.566      &0.453 \\
## $adj.R^2$        &0.147      &0.554      &0.551      &0.44 \\
## $N$      &\multicolumn{1}{c}{90}         &\multicolumn{1}{c}{90}         &\multicolumn{1}{c}{90}         &\multicolumn{1}{c}{90} \\
## \hline\hline
## \multicolumn{5}{l}{\footnotesize Standard errors in parentheses}\\
## \multicolumn{5}{l}{\footnotesize $^{*}$ (p $\le$ 0.1), $^{**}$ (p $\le$ 0.05), $^{***}$ (p $\le$ 0.01)}\\
## \end{tabular}
## \end{sideways}
#J)
#Particularly looking at the R^2 stat there is  a large jump from 
#the first model to the second. This  indicates that there is a much 
#the second model  does a much better job of explaining the variance.
#We can conclude the first model has significant OVB. Therefore we can
#make  no conclusions about causality. The sign of the bias would be
#negative. This shows that the crime rate is not  well explained by
#the probability of arrest. Model2 is  more likely to be better because
#it includes an omitted variable.

#K)
#Out of models2/3/4 I would use model2. Model3 has  a  higher  R^2 but
#we seet hat  there is a 100% increase in SE, which implies a high
#correlation or multicolinearity between the density and urban variables
#The fourth model with just urban has a high coefficient and similar
#R^2. For practical purposes I would use the urban model as it has the
#highest coefficient while  remaining statistically significant.
#You may use model2 if you had more data on the  population density than
#whether or not the population is "Urban"
library(car)
## Loading required package: carData
vif(model2) #shows little correlation
##   prbarr  density 
## 1.099288 1.099288
vif(model3) #shows higher correlation  between density/urban
##   prbarr  density    urban 
## 1.104681 3.236428 3.078002
vif(model4) #shows little correlation between  prbarr and  urban
##   prbarr    urban 
## 1.045477 1.045477
#L)
modelL <- lm(crmrte ~prbarr + density + west + central  + east, data = crime)
summary(modelL)
## 
## Call:
## lm(formula = crmrte ~ prbarr + density + west + central + east, 
##     data = crime)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.020021 -0.005909 -0.001186  0.004252  0.045020 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.463e-02  3.646e-03   9.498 5.29e-15 ***
## prbarr      -2.278e-04  9.198e-05  -2.477 0.015232 *  
## density      8.766e-03  8.731e-04  10.041 4.25e-16 ***
## west        -1.422e-02  3.141e-03  -4.527 1.93e-05 ***
## central     -9.742e-03  2.857e-03  -3.410 0.000996 ***
## east                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01129 on 85 degrees of freedom
## Multiple R-squared:  0.659,  Adjusted R-squared:  0.643 
## F-statistic: 41.07 on 4 and 85 DF,  p-value: < 2.2e-16
#the problem is that there is perfect colinearity, and we need to drop
#a dummy variable.

#M)
modelM <- lm(crmrte ~prbarr + density + west + central, data = crime)
summary(modelM)
## 
## Call:
## lm(formula = crmrte ~ prbarr + density + west + central, data = crime)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.020021 -0.005909 -0.001186  0.004252  0.045020 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.463e-02  3.646e-03   9.498 5.29e-15 ***
## prbarr      -2.278e-04  9.198e-05  -2.477 0.015232 *  
## density      8.766e-03  8.731e-04  10.041 4.25e-16 ***
## west        -1.422e-02  3.141e-03  -4.527 1.93e-05 ***
## central     -9.742e-03  2.857e-03  -3.410 0.000996 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01129 on 85 degrees of freedom
## Multiple R-squared:  0.659,  Adjusted R-squared:  0.643 
## F-statistic: 41.07 on 4 and 85 DF,  p-value: < 2.2e-16
#N)
modelN <- lm(crmrte ~prbarr + density + west + east, data = crime)
summary(modelN)
## 
## Call:
## lm(formula = crmrte ~ prbarr + density + west + east, data = crime)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.020021 -0.005909 -0.001186  0.004252  0.045020 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.489e-02  3.938e-03   6.320 1.16e-08 ***
## prbarr      -2.278e-04  9.198e-05  -2.477 0.015232 *  
## density      8.766e-03  8.731e-04  10.041 4.25e-16 ***
## west        -4.478e-03  3.325e-03  -1.347 0.181699    
## east         9.742e-03  2.857e-03   3.410 0.000996 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01129 on 85 degrees of freedom
## Multiple R-squared:  0.659,  Adjusted R-squared:  0.643 
## F-statistic: 41.07 on 4 and 85 DF,  p-value: < 2.2e-16
#In both models east/central have the  same estimate/se and p-val
#West  changes, as it changes its role in the dummy variable matrix.

#O Re-Scaling
crime$density_1000 <- crime$density/1000
#This shows the population density in thousands rather than individuals

modelO <- lm(crmrte ~ prbarr + density_1000,data = crime)
summary(modelO)
## 
## Call:
## lm(formula = crmrte ~ prbarr + density_1000, data = crime)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.019344 -0.008184 -0.002470  0.005411  0.051334 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.0294066  0.0038684   7.602 3.17e-11 ***
## prbarr       -0.0002663  0.0001018  -2.615   0.0105 *  
## density_1000  8.3208023  0.9226151   9.019 4.11e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01261 on 87 degrees of freedom
## Multiple R-squared:  0.5639, Adjusted R-squared:  0.5539 
## F-statistic: 56.26 on 2 and 87 DF,  p-value: < 2.2e-16
summary(model2)
## 
## Call:
## lm(formula = crmrte ~ prbarr + density, data = crime)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.019344 -0.008184 -0.002470  0.005411  0.051334 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0294066  0.0038684   7.602 3.17e-11 ***
## prbarr      -0.0002663  0.0001018  -2.615   0.0105 *  
## density      0.0083208  0.0009226   9.019 4.11e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01261 on 87 degrees of freedom
## Multiple R-squared:  0.5639, Adjusted R-squared:  0.5539 
## F-statistic: 56.26 on 2 and 87 DF,  p-value: < 2.2e-16
#the models are the same mathematically but the interpration is more 
#practical, as the coefficient  represents a change in a 1000 of population.
#We note  that the standard error  is also higher because this is rescaling.
#This model shows us for every change in a 1000 of density there is a 8
#point increase in crime rate.