# Tutorial (US Arrests)

library(ggplot2)
library(dplyr)
library(gridExtra)
library(corrplot)

data("USArrests")
## Describe and summarize your assigned data set.
# Leading rows of the data frame (six of them), one row per state.
head(USArrests, n = 6L)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
# Column types and dimensions: 50 observations of 4 numeric/integer variables.
str(USArrests)
## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
# Per-column minimum, quartiles, median, mean and maximum.
summary(USArrests)
##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00
# One box per column, all drawn on a shared axis.
boxplot(USArrests)

# Pairwise correlations between the four columns: computed once, printed,
# then reused for the plot below.
correlation <- cor(USArrests)
correlation
##              Murder   Assault   UrbanPop      Rape
## Murder   1.00000000 0.8018733 0.06957262 0.5635788
## Assault  0.80187331 1.0000000 0.25887170 0.6652412
## UrbanPop 0.06957262 0.2588717 1.00000000 0.4113412
## Rape     0.56357883 0.6652412 0.41134124 1.0000000
# Colour-coded view of the same matrix, with the coefficients and the
# variable labels drawn in black.
corrplot(
  correlation,
  method = "color",
  addCoef.col = "black",
  tl.col = "black"
)

# Distribution of each crime variable: one histogram per column.
# A single loop replaces three copy-pasted hist() calls, and the x-axis gets
# a readable label instead of the default raw expression ("USArrests$Murder").
crime_vars <- c("Murder", "Assault", "Rape")
for (crime in crime_vars) {
  hist(
    USArrests[[crime]],
    breaks = 20,
    main = crime,
    # Same wording as the axis labels of the scatter plots later in the file.
    xlab = paste(crime, "Arrests per 100,000")
  )
}

## Which states have the highest murder and assault rates?
# Row positions ranked from the highest murder value downwards; tied states
# keep their original relative order.
murder_rank <- order(USArrests$Murder, decreasing = TRUE)
head(USArrests[murder_rank, ], n = 6L)
##                Murder Assault UrbanPop Rape
## Georgia          17.4     211       60 25.8
## Mississippi      16.1     259       44 17.1
## Florida          15.4     335       80 31.9
## Louisiana        15.4     249       66 22.2
## South Carolina   14.4     279       48 22.5
## Alabama          13.2     236       58 21.2
# Same ranking, this time on the assault column.
assault_rank <- order(USArrests$Assault, decreasing = TRUE)
head(USArrests[assault_rank, ], n = 6L)
##                Murder Assault UrbanPop Rape
## North Carolina   13.0     337       45 16.1
## Florida          15.4     335       80 31.9
## Maryland         11.3     300       67 27.8
## Arizona           8.1     294       80 31.0
## New Mexico       11.4     285       70 32.1
## South Carolina   14.4     279       48 22.5
## Is there a relationship between urban population percentage and crime rates? Graph your data.

# Scatter plot of one crime column of USArrests against UrbanPop, with a
# fitted straight line (method = "lm") and its confidence band.
#
# crime:      name of a USArrests column, as a single string (e.g. "Murder").
# line_color: colour used for the fitted line.
# Returns the ggplot object; the plots are drawn by grid.arrange() below.
plot_crime_vs_urban <- function(crime, line_color) {
  stopifnot(
    is.character(crime),
    length(crime) == 1L,
    crime %in% names(USArrests)
  )

  # .data[[crime]] looks the column up by its string name inside aes().
  ggplot(USArrests, aes(x = UrbanPop, y = .data[[crime]])) +
    geom_point() +
    # The formula is spelled out so geom_smooth() does not print its
    # "using formula" message for every panel.
    geom_smooth(
      method = "lm",
      formula = y ~ x,
      se = TRUE,
      color = line_color
    ) +
    labs(
      x = "Percent Urban Population",
      y = paste(crime, "Arrests per 100,000")
    ) +
    theme_minimal()
}

us_murder <- plot_crime_vs_urban("Murder", "red")
us_assault <- plot_crime_vs_urban("Assault", "blue")
us_rape <- plot_crime_vs_urban("Rape", "yellow")

# Lay the three panels out in a two-column grid.
grid.arrange(us_murder, us_assault, us_rape, ncol = 2)