library(fansi)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
crime <- read_csv('http://datasets.flowingdata.com/crimeRatesByState2005.csv')
##
## -- Column specification --------------------------------------------------------
## cols(
## state = col_character(),
## murder = col_double(),
## forcible_rape = col_double(),
## robbery = col_double(),
## aggravated_assault = col_double(),
## burglary = col_double(),
## larceny_theft = col_double(),
## motor_vehicle_theft = col_double(),
## population = col_double()
## )
Check out the first 10 lines
print(crime)
## # A tibble: 52 x 9
## state murder forcible_rape robbery aggravated_assa~ burglary larceny_theft
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 United ~ 5.6 31.7 141. 291. 727. 2286.
## 2 Alabama 8.2 34.3 141. 248. 954. 2650
## 3 Alaska 4.8 81.1 80.9 465. 622. 2599.
## 4 Arizona 7.5 33.8 144. 327. 948. 2965.
## 5 Arkansas 6.7 42.9 91.1 387. 1085. 2711.
## 6 Califor~ 6.9 26 176. 317. 693. 1916.
## 7 Colorado 3.7 43.4 84.6 265. 745. 2735.
## 8 Connect~ 2.9 20 113 139. 437. 1824.
## 9 Delaware 4.4 44.7 155. 428. 689. 2144
## 10 Distric~ 35.4 30.2 672. 721. 650. 2695.
## # ... with 42 more rows, and 2 more variables: motor_vehicle_theft <dbl>,
## # population <dbl>
ggplot(crime, aes(x = burglary, y = murder)) +
xlab("Burglary rates in each state per 100,000") +
ylab("Murder rates in each state per 100,000")+
geom_point()
ggplot(crime, aes(x = burglary, y = murder)) +
xlab("Burglary rates in each state per 100,000") +
ylab("Murder rates in each state per 100,000")+
geom_point()+
theme_minimal(base_size = 12)
ggplot(crime, aes(x = burglary, y = murder)) +
xlab("Burglary rates in each state per 100,000") +
ylab("Murder rates in each state per 100,000")+
theme_minimal(base_size = 12)
p1 <- ggplot(crime, aes(x = burglary, y = murder)) +
labs(title = "MURDERS VERSUS BURGLARIES IN US STATES PER 100,000",
caption = "Source: U.S. Census Bureau and Nathan Yau") +
xlab("Burglary rates in each state per 100,000") +
ylab ("Murder rates in each state per 100,000") +
theme_minimal(base_size = 12)
p1 + geom_point()
The one point far higher than the rest represents Washington, D.C., which had a much higher murder rate of 35.4. The states with the next highest murder rate at that time were Louisiana and Maryland at 9.9 per 100,000.
crime2 <- crime[crime$state != "District of Columbia",]
crime2 <- crime2[crime2$state != "United States",]
p2 <- ggplot(crime2, aes(x = burglary, y = murder)) +
labs(title = "MURDERS VERSUS BURGLARIES IN US STATES PER 100,000",
caption = "Source: U.S. Census Bureau and Nathan Yau") +
xlab("Burglary rates in each state per 100,000") +
ylab ("Murder rates in each state per 100,000") +
theme_minimal(base_size = 12)
p2 + geom_point()
Now the scatterplot appears to show a positive correlation betwwen murder rates and burglary rates.
p3 <- p2 + xlim(250,1200)+ ylim(0,10)
p3 + geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
p4 <- p3 + geom_point() + geom_smooth(color = "red")
p4
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
p5 <- p3 + geom_point() + geom_smooth(method='lm',formula=y~x)
p5
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
The command se = FALSE takes away the CI band
p6 <- p3 + geom_point() + geom_smooth(method='lm',formula=y~x, se = FALSE)
p6
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
p6a <- p3 + geom_point() + geom_smooth(method='lm',formula=y~x, se = FALSE, linetype= "dotdash")
p6a
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
# Make the line thinner using size=0.3
p6b <- p3 + geom_point() + geom_smooth(method='lm',formula=y~x, se = FALSE, linetype= "dotdash", size=0.3)
p6b
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
## How about making the line thicker by size=1.5
p6c <- p3 + geom_point() + geom_smooth(method='lm',formula=y~x, se = FALSE, linetype= "dotdash", size=1.5)
p6c
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
p6c <- p3 + geom_point() + geom_smooth(method='lm',formula=y~x, se = FALSE, linetype= "dotdash", size=0.3)+
ggtitle("BURGLARIES VERSUS MURDERS IN THE U.S.")
p6c
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
cor(crime2$burglary, crime2$murder)
## [1] 0.6231757
fit1 <- lm(murder ~ burglary, data = crime2)
summary(fit1)
##
## Call:
## lm(formula = murder ~ burglary, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2924 -1.2156 -0.2142 1.1749 5.4978
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.395519 0.825748 0.479 0.634
## burglary 0.006247 0.001132 5.521 1.34e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.87 on 48 degrees of freedom
## Multiple R-squared: 0.3883, Adjusted R-squared: 0.3756
## F-statistic: 30.48 on 1 and 48 DF, p-value: 1.342e-06
Look at the values of (Intercept) and murder. The column, Estimate gives the value you need in your linear model. The column for Pr(>|t|) describes whether the predictor is useful to the model. The more asterisks, the more the variable contributes to the model.
Cor stands for “correlation”. This is a value between (inclusively) -1 and 1. The correlation coefficient tells how strong or weak the correlation is. Values closer to +/- 1 are strong correlation (the sign is determined by the linear slope), values close to +/- 0.5 are weak correlation, and values close to zero have no correlation. The model has the equation: murder = 0.0062(burglary) + 0.396 The slope may be interpreted in the following: For each additional burglary per 100,000, there is a predicted increase of 0.006 murders. The p-value on the right of burglary has 3 asterisks which suggests it is a meaningful variable to explain the linear increase in murders. But we also need to look at the Adjusted R-Squared value. It states that about 38% of the variation in the observations may be explained by the model. In other words, 62% of the variation in the data is likely not explained by this model.
I did not succeed in loading the GGally package
This correlation plot shows similar pairwise results as above, but in a heatmap of correlation values.
#install.packages("DataExplorer")
library(DataExplorer)
plot_correlation(crime2)
## Warning in dummify(data, maxcat = maxcat): Ignored all discrete features since
## `maxcat` set to 20 categories!
Collinearity The key goal of multiple regression analysis is to isolate the relationship between EACH INDEPENDENT VARIABLE and the DEPENDENT VARIABLE. COLLINEARITY means explanatory variables are correlated and thus NOT INDEPENDENT. The more correlated the variables, the more difficult it is to change one variable without changing the other. This is important to keep in mind. The two different matrices gave slightly different correlation information. We are concerned with dependence of 2 or more variables. The two variables with the highest correlation of 0.68 or 0.69 are burglary and larceny_theft.
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
pairs.panels(crime2[3:8], # plot distributions and correlations for all the data
gap = 0,
pch = 21,
lm = TRUE)
With multiple regression, there are several strategies for comparing variable inputs into a model. ## I will show you backward elimination.
In backward elimination, start with all possible predictor variables with your response variable. In this case, we will use: burglary forcible_rape aggravated_assault larceny_theft motor_vehicle_theft Perform a model fit with all predictors. 1. Look at the p-value for each variable - if it is relatively small ( < 0.10), then it is likely contributing to the model. 2. Check out the residual plots. A good model will have a relatively straight horizontal red line across the scatterplot between residuals plotted with fitted values (see below for a good residuals plot). You can also look at the other plots (Normal QQ, Scale-Location, and Residuals vs Leverage), but for now we will focus on the residual vs. fitted plot. The more curved the red line, the more likely that a better model exists. 3. Look at the output for the Adjusted R-Squared value at the bottom of the output. The interpretation is: __% (from the adjusted r-squared value) of the variation in the observations may be explained by this model. The higher the adjusted R-squared value, the better the model. We use the adjusted R-squared value because it compensates for more predictors mathematically increasing the normal R-squared value.
fit2 <- lm(murder ~ robbery + burglary + forcible_rape + aggravated_assault + larceny_theft + motor_vehicle_theft, data = crime2)
summary(fit2)
##
## Call:
## lm(formula = murder ~ robbery + burglary + forcible_rape + aggravated_assault +
## larceny_theft + motor_vehicle_theft, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.7088 -0.7961 -0.0508 0.6757 3.4723
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9940985 1.0835208 0.917 0.364014
## robbery 0.0194331 0.0052193 3.723 0.000567 ***
## burglary 0.0041431 0.0013339 3.106 0.003352 **
## forcible_rape -0.0126884 0.0210395 -0.603 0.549627
## aggravated_assault 0.0045161 0.0023433 1.927 0.060576 .
## larceny_theft -0.0007841 0.0005622 -1.395 0.170246
## motor_vehicle_theft -0.0002426 0.0012751 -0.190 0.849982
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.338 on 43 degrees of freedom
## Multiple R-squared: 0.7193, Adjusted R-squared: 0.6801
## F-statistic: 18.36 on 6 and 43 DF, p-value: 1.949e-10
If we are trying to predict murder rates, then we can see if any of the predictor variables contribute to this model. Note the adjusted R-squared value is 68.01% The only variable that does not appear to be as significant as the others is motor_vehicle_theft. ## So drop that (motor_vehicle_theft) and re-run the model.
fit3 <- lm(murder ~ robbery + burglary + forcible_rape + aggravated_assault + larceny_theft, data = crime2)
summary(fit3)
##
## Call:
## lm(formula = murder ~ robbery + burglary + forcible_rape + aggravated_assault +
## larceny_theft, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6923 -0.7545 -0.0751 0.6404 3.4836
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0611101 1.0134089 1.047 0.300785
## robbery 0.0189486 0.0045060 4.205 0.000126 ***
## burglary 0.0041189 0.0013131 3.137 0.003044 **
## forcible_rape -0.0134321 0.0204456 -0.657 0.514623
## aggravated_assault 0.0046152 0.0022596 2.042 0.047124 *
## larceny_theft -0.0008229 0.0005181 -1.588 0.119349
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.324 on 44 degrees of freedom
## Multiple R-squared: 0.719, Adjusted R-squared: 0.6871
## F-statistic: 22.52 on 5 and 44 DF, p-value: 3.917e-11
The adjusted R-squared value improved slightly to 68.7%.The variable with the largest p value now is forcible_rape.
fit4 <- lm(murder ~ robbery + burglary + aggravated_assault + larceny_theft, data = crime2)
summary(fit4)
##
## Call:
## lm(formula = murder ~ robbery + burglary + aggravated_assault +
## larceny_theft, data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6290 -0.7670 -0.0601 0.4779 3.6348
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7555163 0.8946439 0.844 0.40286
## robbery 0.0201084 0.0041195 4.881 1.36e-05 ***
## burglary 0.0040134 0.0012950 3.099 0.00334 **
## aggravated_assault 0.0039521 0.0020089 1.967 0.05533 .
## larceny_theft -0.0008325 0.0005146 -1.618 0.11268
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.315 on 45 degrees of freedom
## Multiple R-squared: 0.7163, Adjusted R-squared: 0.691
## F-statistic: 28.4 on 4 and 45 DF, p-value: 8.396e-12
Interesting!! The adjusted R-squared went up to 69.1%. The residuals plot looks about the same. One final model - the simplest (parsimonious) by removing larceny_theft.
fit5 <- lm(murder ~ robbery + burglary + aggravated_assault , data = crime2)
summary(fit5)
##
## Call:
## lm(formula = murder ~ robbery + burglary + aggravated_assault,
## data = crime2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6434 -0.7535 -0.0107 0.7229 3.7420
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.330470 0.601764 -0.549 0.5855
## robbery 0.021669 0.004075 5.318 3e-06 ***
## burglary 0.002732 0.001042 2.621 0.0118 *
## aggravated_assault 0.003570 0.002030 1.759 0.0853 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.338 on 46 degrees of freedom
## Multiple R-squared: 0.6998, Adjusted R-squared: 0.6802
## F-statistic: 35.74 on 3 and 46 DF, p-value: 4.451e-12
Interesting! The adjusted R^2 went down to 68% now.
p2 +
geom_point(aes(size = population), color = "red") + xlim(250,1200) + ylim(0,10) +
labs(title = "MURDERS VERSUS BURGLARIES IN US STATES PER 100,000",
caption = "Source: U.S. Census Bureau and Nathan Yau") +
xlab("Burglary rates in each state per 100,000") +
ylab ("Murder rates in each state per 100,000") +
theme_minimal(base_size = 12)
## Warning: Removed 1 rows containing missing values (geom_point).
# Finally, add some interactivity to the plot with plotly
p <- ggplot(crime2, aes(x = burglary, y = murder, size = population, text = paste("state:", state))) +
geom_point(alpha = 0.5, color = "red") + xlim(250,1200) + ylim(0,10) +
ggtitle("BURGLARIES VERSUS MURDERS IN THE U.S.", subtitle = "Sizes of circles are proportional to state populations") +
xlab("Burglary rates in each state per 100,000") +
ylab ("Murder rates in each state per 100,000") +
theme_minimal(base_size = 12)
p <- ggplotly(p)
p