USArrests

# I load the data.table library because I want to work efficiently with large data sets.
library(data.table)

# Pull the built-in USArrests data frame (arrest data for different crimes
# across US states) into the workspace.
data("USArrests")

# Show the first six rows to get a quick look at what the data contains.
head(USArrests, n = 6L)

##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7

# Convert the data frame to a data.table. The state names live in the row
# names, so they are carried over into a regular column named "rn".
DT <- as.data.table(USArrests, keep.rownames = "rn")

# The data is in wide form (one column per measure); reshape it to long form
# with one row per state/measure pair.
#
# The id column is named explicitly so melt() does not have to guess it. The
# measure columns mix integer and double, which melt() would otherwise coerce
# implicitly (with a warning), so they are converted to double up front. The
# conversion runs on a copy so that DT itself is left unchanged.
measure_cols <- setdiff(names(DT), "rn")
DT_numeric <- copy(DT)
DT_numeric[, (measure_cols) := lapply(.SD, as.numeric), .SDcols = measure_cols]

# The output column names are set directly in melt() instead of overwriting
# names() afterwards. Note that UrbanPop is still one of the molten measures
# here, so "CrimeType"/"CrimeRate" also cover the urban-population rows.
DTm <- melt(
  DT_numeric,
  id.vars = "rn",
  variable.name = "CrimeType",
  value.name = "CrimeRate"
)

# Rename the id column by name (and by reference) to something more intuitive.
setnames(DTm, "rn", "State")

# UrbanPop represents something different from the crime figures, so it stays
# as an identifier column next to the state and only the crime columns are
# melted.
#
# The crime columns mix integer and double. They are converted to double on a
# copy first, so melt() has nothing left to coerce (and no longer warns) and
# DT itself is left unchanged.
crime_cols <- c("Murder", "Assault", "Rape")
DT_crime <- copy(DT)
DT_crime[, (crime_cols) := lapply(.SD, as.numeric), .SDcols = crime_cols]

DTmu <- melt(
  DT_crime,
  id.vars = c("rn", "UrbanPop"),
  measure.vars = crime_cols,
  variable.name = "CrimeType",
  value.name = "CrimeRate"
)

# Rename by name rather than by position, so the rename does not silently hit
# the wrong column if the column order ever changes.
setnames(DTmu, "rn", "State")

# I like to use intuitive names, so I carefully name each column to avoid confusion later.
# Add up the crime rates of each state into a single figure to get a
# high-level overview per state.
DTmu[, list(TotalCrimeRate = sum(CrimeRate)), by = "State"]

##              State TotalCrimeRate
##             <char>          <num>
##  1:        Alabama          270.4
##  2:         Alaska          317.5
##  3:        Arizona          333.1
##  4:       Arkansas          218.3
##  5:     California          325.6
##  6:       Colorado          250.6
##  7:    Connecticut          124.4
##  8:       Delaware          259.7
##  9:        Florida          382.3
## 10:        Georgia          254.2
## 11:         Hawaii           71.5
## 12:          Idaho          136.8
## 13:       Illinois          283.4
## 14:        Indiana          141.2
## 15:           Iowa           69.5
## 16:         Kansas          139.0
## 17:       Kentucky          135.0
## 18:      Louisiana          286.6
## 19:          Maine           92.9
## 20:       Maryland          339.1
## 21:  Massachusetts          169.7
## 22:       Michigan          302.2
## 23:      Minnesota           89.6
## 24:    Mississippi          292.2
## 25:       Missouri          215.2
## 26:        Montana          131.4
## 27:       Nebraska          122.8
## 28:         Nevada          310.2
## 29:  New Hampshire           68.6
## 30:     New Jersey          185.2
## 31:     New Mexico          328.5
## 32:       New York          291.2
## 33: North Carolina          366.1
## 34:   North Dakota           53.1
## 35:           Ohio          148.7
## 36:       Oklahoma          177.6
## 37:         Oregon          193.2
## 38:   Pennsylvania          127.2
## 39:   Rhode Island          185.7
## 40: South Carolina          315.9
## 41:   South Dakota          102.6
## 42:      Tennessee          228.1
## 43:          Texas          239.2
## 44:           Utah          146.1
## 45:        Vermont           61.4
## 46:       Virginia          185.2
## 47:     Washington          175.2
## 48:  West Virginia           96.0
## 49:      Wisconsin           66.4
## 50:        Wyoming          183.4
##              State TotalCrimeRate

# Pivot the long form data back to wide form for comparison: one row per
# State/UrbanPop pair and one column per crime type.
# value.var is given explicitly so dcast() does not have to guess which column
# holds the values (and therefore no longer prints a message about it).
DTc <- dcast(DTmu, State + UrbanPop ~ CrimeType, value.var = "CrimeRate")

# I realize this step is essential to recover the original structure of the data.
# Group the states by urban population so that states with a similar share of
# urban residents can be compared in terms of crime rates.
#
# include.lowest = TRUE closes the first interval on the left. Without it the
# smallest UrbanPop value sits exactly on the lowest break, falls outside every
# (a, b] interval and ends up in an NA group instead of the first decile.
# The readable decile labels ("1D" ... "10D") are attached in the same call.
decile_breaks <- quantile(DTmu$UrbanPop, probs = seq(0, 1, by = 0.1))
DTmu[, UrbanPopDecile := cut(
  UrbanPop,
  breaks = decile_breaks,
  labels = paste0(seq_len(10L), "D"),
  include.lowest = TRUE
)]

# Total crime rate per crime type within each urban-population decile.
dcast(DTmu, UrbanPopDecile ~ CrimeType, value.var = "CrimeRate", fun.aggregate = sum)

## Key: <UrbanPopDecile>
##     UrbanPopDecile Murder Assault  Rape
##             <fctr>  <num>   <num> <num>
##  1:             1D   41.6     856  73.8
##  2:             2D   35.3     815  94.3
##  3:             3D   22.6     451  67.7
##  4:             4D   54.9     898 106.0
##  5:             5D   42.4     758 107.6
##  6:             6D   43.2    1073 137.4
##  7:             7D   28.3     744  92.0
##  8:             8D   57.9    1384 182.5
##  9:             9D   27.9     547  90.2
## 10:            10D   35.3    1012 110.1

# Average crime rate per crime type within each decile; unlike the totals,
# the means are not affected by how many states fall into a decile.
dcast(DTmu, UrbanPopDecile ~ CrimeType, value.var = "CrimeRate", fun.aggregate = mean)

## Key: <UrbanPopDecile>
##     UrbanPopDecile    Murder  Assault     Rape
##             <fctr>     <num>    <num>    <num>
##  1:             1D  6.933333 142.6667 12.30000
##  2:             2D  8.825000 203.7500 23.57500
##  3:             3D  4.520000  90.2000 13.54000
##  4:             4D 10.980000 179.6000 21.20000
##  5:             5D  7.066667 126.3333 17.93333
##  6:             6D  8.640000 214.6000 27.48000
##  7:             7D  7.075000 186.0000 23.00000
##  8:             8D  8.271429 197.7143 26.07143
##  9:             9D  9.300000 182.3333 30.06667
## 10:            10D  7.060000 202.4000 22.02000

# I appreciate this view because it provides an aggregated summary of crime rates by urban population decile.

# Load necessary libraries
library(ggplot2)
library(data.table)

# Reload USArrests and rebuild DT from it, with the state names kept in the
# "rn" column.
data("USArrests")
DT <- as.data.table(USArrests, keep.rownames = "rn")

# Scatter plot of Assault against Murder, one point per state, coloured by
# UrbanPop.
ggplot(data = DT, mapping = aes(x = Murder, y = Assault, colour = UrbanPop)) +
  # Points of size 3 so the dots are easy to see
  geom_point(size = 3) +
  # Continuous colour scale running from blue (low UrbanPop) to red (high)
  scale_colour_gradient(low = "blue", high = "red") +
  # Minimal theme for a clean look; applied before the theme() tweaks below so
  # that it does not overwrite them
  theme_minimal() +
  theme(
    # Centred, bold plot title
    plot.title = element_text(hjust = 0.5, face = "bold"),
    # Bold axis titles
    axis.title = element_text(face = "bold"),
    # Legend on the right-hand side for readability
    legend.position = "right"
  ) +
  labs(
    title = "Scatter Plot of Assault vs. Murder",
    x = "Murder Rate (per 100,000)",
    y = "Assault Rate (per 100,000)",
    colour = "Urban Population (%)"
  )

USArrests

Avery Holloman

2024-11-05