# I load the data.table library because I want to work efficiently with large data sets.
library(data.table)
# USArrests ships with base R (datasets package); I pull it into the workspace
# so I can analyze arrests for different crimes across US states.
data(USArrests)
# A quick look at the first six rows tells me what I am working with.
head(USArrests, n = 6L)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
# The state names only exist as row names, so while converting to a data.table
# I keep them as a regular column. I spell out the column name "rn" (the name
# data.table would pick by default) because the reshaping steps refer to it.
DT <- as.data.table(x = USArrests, keep.rownames = "rn")
# The data is in wide form; I melt it to long form so that every measurement
# sits in its own row. Every numeric column (UrbanPop included) is melted here.
# I pass id.vars explicitly so melt() does not have to guess which columns
# identify a row (the guess triggers a warning), and I name the variable and
# value columns directly instead of renaming them afterwards.
DTm <- melt(DT, id.vars = "rn",
            variable.name = "CrimeType", value.name = "CrimeRate")
# NOTE: melt() still warns that the measure columns are not all of the same
# type; the non-double columns are coerced so that CrimeRate is of type
# 'double' (see DETAILS in ?melt.data.table).
# I rename the state column by name (not by position) and by reference, so the
# final columns are State, CrimeType and CrimeRate.
setnames(DTm, "rn", "State")
# UrbanPop measures something different from the crime columns, so I keep it as
# an identifier next to the state name and melt only Murder, Assault and Rape.
DTmu <- melt(DT, id.vars = c("rn", "UrbanPop"),
             variable.name = "CrimeType", value.name = "CrimeRate")
## Warning in melt.data.table(DT, id.vars = c("rn", "UrbanPop"), variable.name =
## "CrimeType", : 'measure.vars' [Murder, Assault, Rape, ...] are not all of the
## same type. By order of hierarchy, the molten data value column will be of type
## 'double'. All measure variables not of type 'double' will be coerced too. Check
## DETAILS in ?melt.data.table for more on coercion.
# I like intuitive names, so I rename the row-name column to State. setnames()
# targets the column by its name rather than by its position, and it renames by
# reference instead of assigning through names().
setnames(DTmu, "rn", "State")
# For a high-level overview I add up the melted crime rates (Murder, Assault
# and Rape) within each state; the result is printed, not stored.
DTmu[, list(TotalCrimeRate = sum(CrimeRate)), by = "State"]
## State TotalCrimeRate
## <char> <num>
## 1: Alabama 270.4
## 2: Alaska 317.5
## 3: Arizona 333.1
## 4: Arkansas 218.3
## 5: California 325.6
## 6: Colorado 250.6
## 7: Connecticut 124.4
## 8: Delaware 259.7
## 9: Florida 382.3
## 10: Georgia 254.2
## 11: Hawaii 71.5
## 12: Idaho 136.8
## 13: Illinois 283.4
## 14: Indiana 141.2
## 15: Iowa 69.5
## 16: Kansas 139.0
## 17: Kentucky 135.0
## 18: Louisiana 286.6
## 19: Maine 92.9
## 20: Maryland 339.1
## 21: Massachusetts 169.7
## 22: Michigan 302.2
## 23: Minnesota 89.6
## 24: Mississippi 292.2
## 25: Missouri 215.2
## 26: Montana 131.4
## 27: Nebraska 122.8
## 28: Nevada 310.2
## 29: New Hampshire 68.6
## 30: New Jersey 185.2
## 31: New Mexico 328.5
## 32: New York 291.2
## 33: North Carolina 366.1
## 34: North Dakota 53.1
## 35: Ohio 148.7
## 36: Oklahoma 177.6
## 37: Oregon 193.2
## 38: Pennsylvania 127.2
## 39: Rhode Island 185.7
## 40: South Carolina 315.9
## 41: South Dakota 102.6
## 42: Tennessee 228.1
## 43: Texas 239.2
## 44: Utah 146.1
## 45: Vermont 61.4
## 46: Virginia 185.2
## 47: Washington 175.2
## 48: West Virginia 96.0
## 49: Wisconsin 66.4
## 50: Wyoming 183.4
## State TotalCrimeRate
# I pivot the long data back to wide form for comparison: one row per
# State/UrbanPop pair and one column per crime type.
# value.var is given explicitly so dcast() does not have to guess which column
# holds the values (it otherwise prints a message about the guess), and the
# result no longer depends on how the columns of DTmu happen to be laid out.
DTc <- dcast(DTmu, State + UrbanPop ~ CrimeType, value.var = "CrimeRate")
# This step is what recovers the wide, one-row-per-state structure of the data.
# I now want to explore how states with similar urban populations compare in
# terms of crime rates, so I bin UrbanPop into deciles.
# - include.lowest = TRUE closes the first interval on the left; without it the
#   state with the minimum UrbanPop falls outside every bin and ends up in an
#   <NA> group of its own.
# - labels gives each decile a readable name ("1D" ... "10D") directly, so the
#   factor levels do not have to be overwritten afterwards.
# NOTE: the breaks are computed on the long table, in which every state's
# UrbanPop value appears once per crime type.
DTmu[, UrbanPopDecile := cut(
  UrbanPop,
  breaks = quantile(UrbanPop, probs = seq(0, 1, by = 0.1)),
  labels = paste0(1:10, "D"),
  include.lowest = TRUE
)]
# I want to see how the crime rates are distributed across different deciles.
dcast(DTmu, UrbanPopDecile ~ CrimeType, value.var = "CrimeRate", fun.aggregate = sum)
## Key: <UrbanPopDecile>
## UrbanPopDecile Murder Assault Rape
## <fctr> <num> <num> <num>
## 1: 1D 41.6 856 73.8
## 2: 2D 35.3 815 94.3
## 3: 3D 22.6 451 67.7
## 4: 4D 54.9 898 106.0
## 5: 5D 42.4 758 107.6
## 6: 6D 43.2 1073 137.4
## 7: 7D 28.3 744 92.0
## 8: 8D 57.9 1384 182.5
## 9: 9D 27.9 547 90.2
## 10: 10D 35.3 1012 110.1
# By summarizing the crime data for each decile, I can identify patterns more easily.
dcast(DTmu, UrbanPopDecile ~ CrimeType, value.var = "CrimeRate", fun.aggregate = mean)
## Key: <UrbanPopDecile>
## UrbanPopDecile Murder Assault Rape
## <fctr> <num> <num> <num>
## 1: 1D 6.933333 142.6667 12.30000
## 2: 2D 8.825000 203.7500 23.57500
## 3: 3D 4.520000 90.2000 13.54000
## 4: 4D 10.980000 179.6000 21.20000
## 5: 5D 7.066667 126.3333 17.93333
## 6: 6D 8.640000 214.6000 27.48000
## 7: 7D 7.075000 186.0000 23.00000
## 8: 8D 8.271429 197.7143 26.07143
## 9: 9D 9.300000 182.3333 30.06667
## 10: 10D 7.060000 202.4000 22.02000
# I appreciate this view because it provides an aggregated summary of crime rates by urban population decile.
# Load necessary libraries
library(ggplot2)
library(data.table)
# Start again from the raw USArrests data and turn it into a data.table,
# keeping the state names in the "rn" column.
data(USArrests)
DT <- as.data.table(x = USArrests, keep.rownames = "rn")
# Scatter plot of Assault against Murder; each point is one state and its
# colour encodes UrbanPop.
ggplot(data = DT, mapping = aes(x = Murder, y = Assault, colour = UrbanPop)) +
  # Size 3 makes the dots easier to see.
  geom_point(size = 3) +
  # Continuous gradient from blue (low UrbanPop) to red (high UrbanPop).
  scale_colour_gradient(low = "blue", high = "red") +
  labs(
    title = "Scatter Plot of Assault vs. Murder",
    x = "Murder Rate (per 100,000)",
    y = "Assault Rate (per 100,000)",
    colour = "Urban Population (%)"
  ) +
  # Minimal theme for a clean look; the tweaks below are layered on top of it.
  theme_minimal() +
  theme(
    # Centered, bold title.
    plot.title = element_text(face = "bold", hjust = 0.5),
    # Bold axis titles.
    axis.title = element_text(face = "bold"),
    # Legend on the right for readability.
    legend.position = "right"
  )
