Data set was procured from: http://college.cengage.com/mathematics/brase/understandable_statistics/7e/students/datasets/mlr/frames/frame.html
There are 7 variables
crime <- read.csv("crime.csv")
Scatterplots of responses and predictors
library(ggplot2); library(ggpubr)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## Loading required package: magrittr
# Plots are paired
# density
p1 <- ggplot(crime, aes(crime_rate)) +
geom_density() +
geom_rug(sides="b")
# police_fund
p2 <- ggplot(crime, aes(y = crime_rate, x = police_fund)) +
geom_point() +
geom_smooth(method = "loess", se = F, color = "blue")
ggarrange(p1, p2,
labels = c("A", "B"),
ncol = 2, nrow = 1)
# ggsave("pics/exp_plot_1.png", width = 14, height = 7, units = "cm")
###
# perc_hs
p1 <- ggplot(crime, aes(y = crime_rate, x = perc_hs)) +
geom_point() +
geom_smooth(method = "loess", se = F, color = "blue")
# perc_teens_no_hs
p2 <- ggplot(crime, aes(y = crime_rate, x = perc_teens_no_hs)) +
geom_point() +
geom_smooth(method = "loess", se = F, color = "orange") +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
ggarrange(p1, p2,
labels = c("C", "D"),
ncol = 2, nrow = 1)
# ggsave("pics/exp_plot_2.png", width = 14, height = 7, units = "cm")
###
# perc_college
p1 <- ggplot(crime, aes(y = crime_rate, x = perc_college)) +
geom_point() +
geom_smooth(method = "loess", se = F, color = "blue")
# perc_adol_in_college
p2 <- ggplot(crime, aes(y = crime_rate, x = perc_adol_in_college)) +
geom_point() +
geom_smooth(method = "loess", se = F, color = "orange") +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
ggarrange(p1, p2,
labels = c("E", "F"),
ncol = 2, nrow = 1)
# ggsave("pics/exp_plot_3.png", width = 14, height = 7, units = "cm")
Correlation coefs
Perhaps perc_hs won’t be a significant variable because it’s correlation with the responses is low, with -.14 correlation when paired with crime_rate, and -.18 when with violent_crime_rate.
Correlation represented as (with Crime rate, with Violent crime rate). Police funding is highly correlated (.53, .51).
Percent of teens (16-19) not in highschool and not graduates is correlated (.32, .29).
Corrgram
library(corrgram)
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
corrgram(crime, order=TRUE,
lower.panel=panel.shade,
upper.panel=panel.cor,
text.panel=panel.txt,
main="Correlation of Crime Data")
crime.fit <- lm(crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_adol_in_college + perc_college, data=crime)
summary(crime.fit)
##
## Call:
## lm(formula = crime_rate ~ police_fund + perc_hs + perc_teens_no_hs +
## perc_adol_in_college + perc_college, data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -346.60 -148.46 -62.43 111.75 788.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 489.649 472.366 1.037 0.305592
## police_fund 10.981 3.078 3.568 0.000884 ***
## perc_hs -6.088 6.544 -0.930 0.357219
## perc_teens_no_hs 5.480 10.053 0.545 0.588428
## perc_adol_in_college 0.377 4.417 0.085 0.932367
## perc_college 5.500 13.754 0.400 0.691150
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 253.2 on 44 degrees of freedom
## Multiple R-squared: 0.3336, Adjusted R-squared: 0.2579
## F-statistic: 4.405 on 5 and 44 DF, p-value: 0.002444
Tranformation of x perc_adol_in_college was highly insignificant, p = .93, trying 1/x and exp(-x)
library(tidyverse)
## Registered S3 method overwritten by 'rvest':
## method from
## read_xml.response xml2
## -- Attaching packages ------------------------------------ tidyverse 1.2.1 --
## v tibble 2.1.1 v purrr 0.3.2
## v tidyr 0.8.3 v dplyr 0.8.0.1
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.1 v forcats 0.4.0
## -- Conflicts --------------------------------------- tidyverse_conflicts() --
## x tidyr::extract() masks magrittr::extract()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::set_names() masks magrittr::set_names()
crime2 <- crime
crime2 <- crime2 %>% mutate(perc_adol_in_college = perc_adol_in_college^(-1))
crime.fit2 <- lm(crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_adol_in_college + perc_college, data=crime2)
summary(crime.fit2)
##
## Call:
## lm(formula = crime_rate ~ police_fund + perc_hs + perc_teens_no_hs +
## perc_adol_in_college + perc_college, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -368.56 -145.30 -33.22 125.38 819.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 741.525 376.687 1.969 0.055325 .
## police_fund 11.768 3.048 3.861 0.000366 ***
## perc_hs -11.301 6.694 -1.688 0.098465 .
## perc_teens_no_hs -4.512 11.564 -0.390 0.698328
## perc_adol_in_college 3144.636 2687.697 1.170 0.248295
## perc_college 9.930 10.376 0.957 0.343781
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 249.4 on 44 degrees of freedom
## Multiple R-squared: 0.3536, Adjusted R-squared: 0.2801
## F-statistic: 4.814 on 5 and 44 DF, p-value: 0.00135
ALL OF THE TRANSFORMS
library(tidyverse)
crime2 <- crime
crime2 <- crime2 %>% mutate(perc_adol_in_college = perc_adol_in_college^(-1),
perc_college = perc_college^(-1),
police_fund = exp(police_fund),
perc_teens_no_hs = log(perc_teens_no_hs),
perc_hs = perc_hs^(-1))
crime.fit2 <- lm(crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_adol_in_college + perc_college, data=crime2)
summary(crime.fit2)
##
## Call:
## lm(formula = crime_rate ~ police_fund + perc_hs + perc_teens_no_hs +
## perc_adol_in_college + perc_college, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -374.29 -146.67 -41.27 104.35 803.69
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.114e+02 2.604e+02 0.428 0.670825
## police_fund 4.239e-35 1.172e-35 3.617 0.000764 ***
## perc_hs 2.831e+04 2.160e+04 1.310 0.196866
## perc_teens_no_hs 1.427e+02 1.578e+02 0.905 0.370623
## perc_adol_in_college 1.564e+03 2.569e+03 0.609 0.545953
## perc_college -4.406e+03 2.469e+03 -1.784 0.081294 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 248.4 on 44 degrees of freedom
## Multiple R-squared: 0.3587, Adjusted R-squared: 0.2858
## F-statistic: 4.922 on 5 and 44 DF, p-value: 0.001156
summary(crime.fit)
##
## Call:
## lm(formula = crime_rate ~ police_fund + perc_hs + perc_teens_no_hs +
## perc_adol_in_college + perc_college, data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -346.60 -148.46 -62.43 111.75 788.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 489.649 472.366 1.037 0.305592
## police_fund 10.981 3.078 3.568 0.000884 ***
## perc_hs -6.088 6.544 -0.930 0.357219
## perc_teens_no_hs 5.480 10.053 0.545 0.588428
## perc_adol_in_college 0.377 4.417 0.085 0.932367
## perc_college 5.500 13.754 0.400 0.691150
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 253.2 on 44 degrees of freedom
## Multiple R-squared: 0.3336, Adjusted R-squared: 0.2579
## F-statistic: 4.405 on 5 and 44 DF, p-value: 0.002444
Transforms didn’t really help the data at all.
# perc_college
crime_original <- crime
crime <- crime2 # I do this bc I didn't want to change the var name in the ggplots
p1 <- ggplot(crime, aes(y = crime_rate, x = perc_college)) +
geom_point() +
geom_smooth(method = "loess", se = F, color = "blue")
# perc_adol_in_college
p2 <- ggplot(crime, aes(y = crime_rate, x = perc_adol_in_college)) +
geom_point() +
geom_smooth(method = "loess", se = F, color = "orange") +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
ggarrange(p1, p2,
labels = c("A", "B"),
ncol = 2, nrow = 1)
# perc_hs
p1 <- ggplot(crime, aes(y = crime_rate, x = perc_hs)) +
geom_point() +
geom_smooth(method = "loess", se = F, color = "blue")
# perc_teens_no_hs
p2 <- ggplot(crime, aes(y = crime_rate, x = perc_teens_no_hs)) +
geom_point() +
geom_smooth(method = "loess", se = F, color = "orange") +
theme(axis.title.y=element_blank(),
axis.text.y=element_blank(),
axis.ticks.y=element_blank())
ggarrange(p1, p2,
labels = c("A", "B"),
ncol = 2, nrow = 1)
# police_fund
p1 <- ggplot(crime, aes(y = crime_rate, x = police_fund)) +
geom_point() +
geom_smooth(method = "loess", se = F, color = "blue")
# density
p2 <- ggplot(crime, aes(crime_rate)) +
geom_density() +
geom_rug(sides="b")
ggarrange(p1, p2,
labels = c("A", "B"),
ncol = 2, nrow = 1)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -1.1176e+035
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 1.2491e+070
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -1.1176e+035
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.1176e+035
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
crime <- crime_original
Stick with no transforms
crime.fit <- lm(crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_adol_in_college + perc_college, data=crime)
summary(crime.fit)
##
## Call:
## lm(formula = crime_rate ~ police_fund + perc_hs + perc_teens_no_hs +
## perc_adol_in_college + perc_college, data = crime)
##
## Residuals:
## Min 1Q Median 3Q Max
## -346.60 -148.46 -62.43 111.75 788.21
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 489.649 472.366 1.037 0.305592
## police_fund 10.981 3.078 3.568 0.000884 ***
## perc_hs -6.088 6.544 -0.930 0.357219
## perc_teens_no_hs 5.480 10.053 0.545 0.588428
## perc_adol_in_college 0.377 4.417 0.085 0.932367
## perc_college 5.500 13.754 0.400 0.691150
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 253.2 on 44 degrees of freedom
## Multiple R-squared: 0.3336, Adjusted R-squared: 0.2579
## F-statistic: 4.405 on 5 and 44 DF, p-value: 0.002444
Lack of Fit Test was not used bc insufficient df (to few obs, too many factors)
Residual variance is not constant and reveals a parabula-like pattern.
Residuals are not normal with large right tail. Fails shapiro-wilk test.
plot(crime.fit, which = c(1,2))
shapiro.test(crime.fit$residuals)
##
## Shapiro-Wilk normality test
##
## data: crime.fit$residuals
## W = 0.90663, p-value = 0.0008015
Y transforms log(y) Looks great!
crime2 <- crime %>% mutate(crime_rate = log(crime_rate))
crime.fit2 <- lm(crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_adol_in_college + perc_college, data=crime2)
summary(crime.fit2)
##
## Call:
## lm(formula = crime_rate ~ police_fund + perc_hs + perc_teens_no_hs +
## perc_adol_in_college + perc_college, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62827 -0.15213 -0.06212 0.20198 0.81508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.418661 0.597608 10.741 7.05e-14 ***
## police_fund 0.012107 0.003894 3.109 0.00328 **
## perc_hs -0.011361 0.008279 -1.372 0.17693
## perc_teens_no_hs 0.008351 0.012719 0.657 0.51486
## perc_adol_in_college -0.000460 0.005589 -0.082 0.93477
## perc_college 0.013275 0.017401 0.763 0.44958
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3204 on 44 degrees of freedom
## Multiple R-squared: 0.3253, Adjusted R-squared: 0.2486
## F-statistic: 4.243 on 5 and 44 DF, p-value: 0.003106
plot(crime.fit2, which = c(1,2))
shapiro.test(crime.fit2$residuals)
##
## Shapiro-Wilk normality test
##
## data: crime.fit2$residuals
## W = 0.97384, p-value = 0.3292
Partial F Test Should perc_adol_in_college be removed? P-value = .9348, so very little information is lost by removing perc_adol_in_college.
full_model <- lm(crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_adol_in_college + perc_college, data=crime2)
reduced_model <- lm(crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_college, data=crime2)
anova(full_model, reduced_model)
## Analysis of Variance Table
##
## Model 1: crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_adol_in_college +
## perc_college
## Model 2: crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_college
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 44 4.5156
## 2 45 4.5163 -1 -0.00069545 0.0068 0.9348
summary(reduced_model)
##
## Call:
## lm(formula = crime_rate ~ police_fund + perc_hs + perc_teens_no_hs +
## perc_college, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.62482 -0.15140 -0.06376 0.20068 0.81293
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.383451 0.412722 15.467 < 2e-16 ***
## police_fund 0.012175 0.003762 3.237 0.00227 **
## perc_hs -0.010963 0.006646 -1.649 0.10602
## perc_teens_no_hs 0.008935 0.010446 0.855 0.39689
## perc_college 0.012298 0.012578 0.978 0.33344
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3168 on 45 degrees of freedom
## Multiple R-squared: 0.3252, Adjusted R-squared: 0.2652
## F-statistic: 5.421 on 4 and 45 DF, p-value: 0.001193
Next, removing perc_teens_no_hs because it’s p-value in the reduced model is highest at .3969. On it’s removal, the partial F returns a p-value of .33. Additionally, the p-value of perc_hs dropped dramatically, to
full_model <- lm(crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_college, data=crime2)
reduced_model <- lm(crime_rate ~ police_fund + perc_hs + perc_college, data=crime2)
anova(full_model, reduced_model)
## Analysis of Variance Table
##
## Model 1: crime_rate ~ police_fund + perc_hs + perc_teens_no_hs + perc_college
## Model 2: crime_rate ~ police_fund + perc_hs + perc_college
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 45 4.5163
## 2 46 4.5898 -1 -0.073427 0.7316 0.3969
summary(reduced_model)
##
## Call:
## lm(formula = crime_rate ~ police_fund + perc_hs + perc_college,
## data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.64544 -0.18131 -0.06119 0.17862 0.80425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.626463 0.298492 22.200 < 2e-16 ***
## police_fund 0.013713 0.003295 4.162 0.000137 ***
## perc_hs -0.012989 0.006192 -2.098 0.041447 *
## perc_college 0.009087 0.011970 0.759 0.451607
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3159 on 46 degrees of freedom
## Multiple R-squared: 0.3142, Adjusted R-squared: 0.2695
## F-statistic: 7.026 on 3 and 46 DF, p-value: 0.0005493
Remove perc_college Looks good, this is final model
full_model <- lm(crime_rate ~ police_fund + perc_hs + perc_college, data=crime2)
reduced_model <- lm(crime_rate ~ police_fund + perc_hs, data=crime2)
anova(full_model, reduced_model)
## Analysis of Variance Table
##
## Model 1: crime_rate ~ police_fund + perc_hs + perc_college
## Model 2: crime_rate ~ police_fund + perc_hs
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 46 4.5898
## 2 47 4.6473 -1 -0.05751 0.5764 0.4516
summary(reduced_model)
##
## Call:
## lm(formula = crime_rate ~ police_fund + perc_hs, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.63723 -0.21609 -0.07197 0.18590 0.80263
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.559607 0.283918 23.104 < 2e-16 ***
## police_fund 0.013860 0.003274 4.233 0.000106 ***
## perc_hs -0.009810 0.004541 -2.160 0.035869 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3144 on 47 degrees of freedom
## Multiple R-squared: 0.3056, Adjusted R-squared: 0.2761
## F-statistic: 10.34 on 2 and 47 DF, p-value: 0.0001894
crime.fit2 <- reduced_model
plot(crime.fit2, which = c(1,2))
shapiro.test(crime.fit2$residuals)
##
## Shapiro-Wilk normality test
##
## data: crime.fit2$residuals
## W = 0.97281, p-value = 0.3
Interpreting transforms: https://www.princeton.edu/~otorres/Stata/inference.htm https://www.cscu.cornell.edu/news/statnews/stnews83.pdf
3d regression plot
Source for 3d plane in plotly: https://stackoverflow.com/questions/38331198/add-regression-plane-to-3d-scatter-plot-in-plotly?utm_medium=organic&utm_source=google_rich_qa&utm_campaign=google_rich_qa
library(plotly); library(reshape2)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Graph Resolution (more important for more complex shapes)
graph_reso <- 0.5 #0.05
# Setup Axis
axis_x <- seq(min(crime2$police_fund), max(crime2$police_fund), by = graph_reso)
axis_y <- seq(min(crime2$perc_hs), max(crime2$perc_hs), by = graph_reso)
# Sample points
crime_fit_surface <- expand.grid(police_fund = axis_x,
perc_hs = axis_y,
KEEP.OUT.ATTRS = F)
crime_fit_surface$crime_rate <- exp(predict.lm(crime.fit2, newdata = crime_fit_surface)) # exp to remove ln() from y
crime_fit_surface <- acast(crime_fit_surface, perc_hs ~ police_fund, value.var = "crime_rate") #y ~ x
# Add non-ln of crime rate to df
crime2$crime_rate1 <- crime$crime_rate
# Plot
p <- plot_ly(data = crime2) %>%
add_trace(x = ~police_fund, y = ~perc_hs, z = ~crime_rate1,
type = "scatter3d", mode = "markers",
opacity = .8) %>%
add_trace(z = crime_fit_surface,
x = axis_x,
y = axis_y,
type = "surface") %>%
layout(scene = list(xaxis = list(title = 'Police Funding'),
yaxis = list(title = 'Percent of 25+ with HS'),
zaxis = list(title = 'Crime Rate')))
p