#Libraries for Homework 3
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.4 ✔ purrr 0.3.5
## ✔ tibble 3.2.1 ✔ dplyr 1.1.3
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.4 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library(corrplot)
## corrplot 0.92 loaded
library(RColorBrewer)
library(knitr)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(caret)
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:survival':
##
## cluster
##
## The following object is masked from 'package:purrr':
##
## lift
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(knitr)
library(dplyr)
#Overview DATA 621 – Business Analytics and Data Mining In this homework assignment, you will explore, analyze and model a data set containing information on crime for various neighborhoods of a major city. Each record has a response variable indicating whether or not the crime rate is above the median crime rate (1) or not (0). Your objective is to build a binary logistic regression model on the training data set to predict whether the neighborhood will be at risk for high crime levels. You will provide classifications and probabilities for the evaluation data set using your binary logistic regression model. You can only use the variables given to you (or variables that you derive from the variables provided). Below is a short description of the variables of interest in the data set:
#Data Exploration I begin loading up the libraries for this homework and going through the training data set there are 13 columns and 466 obversations. The evaluation date has 12 columns and 40 observations. I did a data clean to make sure there is no misisng data value in the columns or rows.
crime_train <- read.csv("https://raw.githubusercontent.com/Wilchau/Data_621_Homework_3/main/crime-training-data_modified.csv")
crime_eval <- read.csv("https://raw.githubusercontent.com/Wilchau/Data_621_Homework_3/main/crime-evaluation-data_modified.csv")
#Charachteristics in data: This is the characteristics of the variables in the data set: -zn proportion of residential land zoned for lots over 25,000 sq.ft. -indus proportion of non-retail business acres per town -chas Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) - nox nitrogen oxides concentration (parts per 10 million) - rm average number of rooms per dwelling - age proportion of owner-occupied units built prior to 1940 - dis weighted distances to five Boston employment centers - rad index of accessibility to radial highways - tax full-value property-tax rate per $10,000 - ptratio pupil-teacher ratio by town - lstat % lower status of the population - medv median value of owner-occupied homes in $1000’s - target indicating whether or not the crime rate is above the median crime rate (1) or not (0) response variable
glimpse(crime_train)
## Rows: 466
## Columns: 13
## $ zn <dbl> 0, 0, 0, 30, 0, 0, 0, 0, 0, 80, 22, 0, 0, 22, 0, 0, 100, 20, 0…
## $ indus <dbl> 19.58, 19.58, 18.10, 4.93, 2.46, 8.56, 18.10, 18.10, 5.19, 3.6…
## $ chas <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ nox <dbl> 0.605, 0.871, 0.740, 0.428, 0.488, 0.520, 0.693, 0.693, 0.515,…
## $ rm <dbl> 7.929, 5.403, 6.485, 6.393, 7.155, 6.781, 5.453, 4.519, 6.316,…
## $ age <dbl> 96.2, 100.0, 100.0, 7.8, 92.2, 71.3, 100.0, 100.0, 38.1, 19.1,…
## $ dis <dbl> 2.0459, 1.3216, 1.9784, 7.0355, 2.7006, 2.8561, 1.4896, 1.6582…
## $ rad <int> 5, 5, 24, 6, 3, 5, 24, 24, 5, 1, 7, 5, 24, 7, 3, 3, 5, 5, 24, …
## $ tax <int> 403, 403, 666, 300, 193, 384, 666, 666, 224, 315, 330, 398, 66…
## $ ptratio <dbl> 14.7, 14.7, 20.2, 16.6, 17.8, 20.9, 20.2, 20.2, 20.2, 16.4, 19…
## $ lstat <dbl> 3.70, 26.82, 18.85, 5.19, 4.82, 7.67, 30.59, 36.98, 5.68, 9.25…
## $ medv <dbl> 50.0, 13.4, 15.4, 23.7, 37.9, 26.5, 5.0, 7.0, 22.2, 20.9, 24.8…
## $ target <int> 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0,…
glimpse(crime_eval)
## Rows: 40
## Columns: 12
## $ zn <int> 0, 0, 0, 0, 0, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 22, 90…
## $ indus <dbl> 7.07, 8.14, 8.14, 8.14, 5.96, 5.13, 5.13, 4.49, 4.49, 2.89, 25…
## $ chas <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
## $ nox <dbl> 0.469, 0.538, 0.538, 0.538, 0.499, 0.453, 0.453, 0.449, 0.449,…
## $ rm <dbl> 7.185, 6.096, 6.495, 5.950, 5.850, 5.741, 5.966, 6.630, 6.121,…
## $ age <dbl> 61.1, 84.5, 94.4, 82.0, 41.5, 66.2, 93.4, 56.1, 56.8, 69.6, 97…
## $ dis <dbl> 4.9671, 4.4619, 4.4547, 3.9900, 3.9342, 7.2254, 6.8185, 4.4377…
## $ rad <int> 2, 4, 4, 4, 5, 8, 8, 3, 3, 2, 2, 2, 4, 5, 5, 4, 8, 8, 7, 1, 1,…
## $ tax <int> 242, 307, 307, 307, 279, 284, 284, 247, 247, 276, 188, 188, 43…
## $ ptratio <dbl> 17.8, 21.0, 21.0, 21.0, 19.2, 19.7, 19.7, 18.5, 18.5, 18.0, 19…
## $ lstat <dbl> 4.03, 10.26, 12.80, 27.71, 8.77, 13.15, 14.44, 6.53, 8.44, 11.…
## $ medv <dbl> 34.7, 18.2, 18.4, 13.2, 21.0, 18.7, 16.0, 26.6, 22.2, 21.4, 17…
#Summary: I look into building a histogram plot below to see the characteristics of the data more. Notes: medv and rm are normally distri buted. bi-modal distribution of the variables are: Indus, rad, and tax. Most of the variables are are skewed on either side respectively.
# distribution of the varaibles
crime_train %>%
gather(variable, value, zn:target) %>%
ggplot(., aes(value)) +
geom_density(fill = "steelblue", color="steelblue") +
facet_wrap(~variable, scales ="free", ncol = 4) +
labs(x = element_blank(), y = element_blank())
Boxplot overview: I also plotted out a box plot for an overview to see
if there is any existing correlation between the data and the target
variable, we obtain the values of correlation as well as a
visualization. Going through the graphs we can see that there is a
correlation between nox, age, rad, tax, indus and target variables and
the only negative correlation is variable dis. the rest has no
correlation.
crime_train %>%
dplyr::select(-chas) %>%
gather(key, value, -target) %>%
mutate(key = factor(key),
target = factor(target)) %>%
ggplot(aes(x = key, y = value)) +
geom_boxplot(aes(fill = target)) +
facet_wrap(~ key, scales = 'free', ncol = 3) +
scale_fill_manual(values=c("steelblue", "firebrick")) +
theme_minimal()
kable(sort(cor(dplyr::select(crime_train, target, everything()))[,1], decreasing = T), col.names = c("Correlation")) %>%
kable_styling(full_width = F)
| Correlation | |
|---|---|
| target | 1.0000000 |
| nox | 0.7261062 |
| age | 0.6301062 |
| rad | 0.6281049 |
| tax | 0.6111133 |
| indus | 0.6048507 |
| lstat | 0.4691270 |
| ptratio | 0.2508489 |
| chas | 0.0800419 |
| rm | -0.1525533 |
| medv | -0.2705507 |
| zn | -0.4316818 |
| dis | -0.6186731 |
numeric_values <- crime_train %>% select_if(is.numeric)
train_cor <- cor(numeric_values)
corrplot.mixed(train_cor, tl.col = 'black', tl.pos = 'lt', upper = "number", lower="circle")
#Build Models
lm1 = glm( target ~ zn + indus + nox + rm + age + dis + rad + tax + ptratio + lstat + medv,
family=binomial(),
data=crime_train )
summary(lm1)
##
## Call:
## glm(formula = target ~ zn + indus + nox + rm + age + dis + rad +
## tax + ptratio + lstat + medv, family = binomial(), data = crime_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8538 -0.1411 -0.0014 0.0026 3.4667
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -39.433599 6.470184 -6.095 1.10e-09 ***
## zn -0.072337 0.034688 -2.085 0.03704 *
## indus -0.052045 0.045781 -1.137 0.25561
## nox 47.332410 7.706465 6.142 8.15e-10 ***
## rm -0.611244 0.721987 -0.847 0.39721
## age 0.034866 0.013752 2.535 0.01123 *
## dis 0.716434 0.228385 3.137 0.00171 **
## rad 0.716867 0.160848 4.457 8.32e-06 ***
## tax -0.006894 0.002895 -2.381 0.01726 *
## ptratio 0.377862 0.123881 3.050 0.00229 **
## lstat 0.053818 0.053060 1.014 0.31045
## medv 0.183465 0.068417 2.682 0.00733 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 193.52 on 454 degrees of freedom
## AIC: 217.52
##
## Number of Fisher Scoring iterations: 9
#Part 3 model building The first model: mod2 is the original model keeping all variables!
mod2 = glm( target ~ nox + rad + tax + nox + rad + ptratio, family=binomial(), data=crime_train )
#Part 4 Select Models
summary(mod2)
##
## Call:
## glm(formula = target ~ nox + rad + tax + nox + rad + ptratio,
## family = binomial(), data = crime_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.02152 -0.24513 -0.01788 0.00225 2.65257
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -25.091663 3.559764 -7.049 1.81e-12 ***
## nox 37.867007 4.908192 7.715 1.21e-14 ***
## rad 0.749723 0.135172 5.546 2.92e-08 ***
## tax -0.009304 0.002410 -3.860 0.000113 ***
## ptratio 0.207481 0.087120 2.382 0.017240 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 218.70 on 461 degrees of freedom
## AIC: 228.7
##
## Number of Fisher Scoring iterations: 9
#Model with Univariate For this model I used nox in relation to nox
Model 2:
mod3 = glm( target ~ nox , family = binomial(), data=crime_train)
summary(mod3)
##
## Call:
## glm(formula = target ~ nox, family = binomial(), data = crime_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2456 -0.3759 -0.1675 0.3707 2.5576
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -15.892 1.449 -10.97 <2e-16 ***
## nox 29.375 2.707 10.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 645.88 on 465 degrees of freedom
## Residual deviance: 292.01 on 464 degrees of freedom
## AIC: 296.01
##
## Number of Fisher Scoring iterations: 6
#Part 4 Model selection and Evaluation I would go with the 2nd model: Model with selected variables because the nox and tax variable seem to have a significants in helping to discriminate the high and low crime rate. Also in certain areas when we think of taxes and how this is applicable to how we distrubted tax or government revenue to certain places there is a production of community building when taxes are allocated to certain community.
crime_eval %>% mutate( Idnox = ifelse(nox > .6, "Hi", ifelse( nox < .48, "Low", "Mid"))) -> testing3
testing3 %>% mutate( Idrad = ifelse(rad > 6, "High", "Low"),
Tax2 = (tax - 300)^2 ,
Idptratio = ifelse(ptratio > 18, "High", "Low")
) -> testing
(predictions = predict(mod2, testing))
## 1 2 3 4 5 6 7
## -4.3911111 -0.2196941 -0.2196941 -0.2196941 -1.0597253 -0.4952213 -0.4952213
## 8 9 10 11 12 13 14
## -4.3000141 -4.3000141 -5.5747758 0.6221621 0.6221621 1.8687804 0.8667540
## 15 16 17 18 19 20 21
## 0.8667540 -2.2939978 0.7447855 0.8583865 -2.6305137 -8.6724539 -8.2293653
## 22 23 24 25 26 27 28
## -0.2183180 -0.5040301 -1.2783707 -1.2783707 0.2653754 -4.6986998 18.0845219
## 29 30 31 32 33 34 35
## 14.7900923 13.0103430 18.9175960 18.9175960 18.9175960 18.9175960 18.9175960
## 36 37 38 39 40
## 17.8951869 17.8951869 15.6989005 1.9044566 -0.8271641
write(predictions, "evaluation_predictions.csv")