Load and Clean Data

# Import the spreadsheet, replace the single-letter column names with
# descriptive ones, and put county names in title case -- one pipeline
crime_data <- read_excel("Florida County Crime Rates.xlsx") %>%
  rename(Crime = C, Income = I, HighSchoolGrad = HS, UrbanPop = U) %>%
  mutate(County = str_to_title(County))

# Preview the first rows to confirm the new column names
head(crime_data)
## # A tibble: 6 × 5
##   County   Crime Income HighSchoolGrad UrbanPop
##   <chr>    <dbl>  <dbl>          <dbl>    <dbl>
## 1 Alachua    104   22.1           82.7     73.2
## 2 Baker       20   25.8           64.1     21.5
## 3 Bay         64   24.7           74.7     85  
## 4 Bradford    50   24.6           65       23.2
## 5 Brevard     64   30.5           82.3     91.9
## 6 Broward     94   30.6           76.8     98.9
# Per-column summaries (min, quartiles, mean, max for the numeric columns)
summary(crime_data)
##     County              Crime           Income      HighSchoolGrad 
##  Length:67          Min.   :  0.0   Min.   :15.40   Min.   :54.50  
##  Class :character   1st Qu.: 35.5   1st Qu.:21.05   1st Qu.:62.45  
##  Mode  :character   Median : 52.0   Median :24.60   Median :69.00  
##                     Mean   : 52.4   Mean   :24.51   Mean   :69.49  
##                     3rd Qu.: 69.0   3rd Qu.:28.15   3rd Qu.:76.90  
##                     Max.   :128.0   Max.   :35.60   Max.   :84.90  
##     UrbanPop    
##  Min.   : 0.00  
##  1st Qu.:21.60  
##  Median :44.60  
##  Mean   :49.56  
##  3rd Qu.:83.55  
##  Max.   :99.60

Descriptive Statistics

# One-row table of headline statistics; missing values are ignored throughout
summarise(
  crime_data,
  Mean_Crime = mean(Crime, na.rm = TRUE),
  Median_Crime = median(Crime, na.rm = TRUE),
  # range() returns c(min, max), so diff() gives max - min
  Range_Crime = diff(range(Crime, na.rm = TRUE)),
  Mean_Income = mean(Income, na.rm = TRUE),
  Mean_HS = mean(HighSchoolGrad, na.rm = TRUE),
  Mean_Urban = mean(UrbanPop, na.rm = TRUE)
)
## # A tibble: 1 × 6
##   Mean_Crime Median_Crime Range_Crime Mean_Income Mean_HS Mean_Urban
##        <dbl>        <dbl>       <dbl>       <dbl>   <dbl>      <dbl>
## 1       52.4           52         128        24.5    69.5       49.6

Visualizations

# Histogram of Crime Rates
# Distribution of the Crime column across counties, drawn with 20 bins.
ggplot(crime_data, aes(x = Crime)) +
  geom_histogram(fill = "#0073C2FF", color = "white", bins = 20) +
  labs(title = "Distribution of Crime Rates Across Florida Counties",
       x = "Crime Rate (per 1,000 residents)",
       y = "Number of Counties") +  # fixed typo: was "Number off Counties"
  theme_minimal()

# Income (x) against Crime (y), with a fitted straight line overlaid
ggplot(data = crime_data, mapping = aes(x = Income, y = Crime)) +
  geom_point(size = 3, color = "#EFC000FF") +
  # Linear fit only; the confidence band is switched off
  geom_smooth(method = "lm", se = FALSE, color = "black") +
  ggtitle("Income vs. Crime Rate") +
  xlab("Median Income (in $1,000s)") +
  ylab("Crime Rate (per 1,000 residents)") +
  theme_minimal()

Correlation Analysis

# Pairwise correlations among the four numeric variables, using only rows
# with no missing values
cor_matrix <- crime_data %>%
  dplyr::select(Crime, Income, HighSchoolGrad, UrbanPop) %>%
  cor(use = "complete.obs")

# Draw the matrix with the coefficient printed in each cell
ggcorrplot(
  cor_matrix,
  lab = TRUE,
  colors = c("red", "white", "blue"),
  title = "Correlation Matrix of Florida County Variables"
)

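Interpretation (read the signs and magnitudes off the correlation plot above): 1. Crime and Income: Moderate positive correlation (r ≈ 0.43, consistent with the simple regression below, where R² = 0.19 and the slope is positive) -> higher-income counties tend to report higher crime, not lower. 2. Crime and HighSchoolGrad: The expected negative relationship is not supported by the models below (the HighSchoolGrad coefficient is positive when only Income is controlled for), so confirm the sign in the plot before concluding that more education means less crime. 3. Crime and UrbanPop: Positive correlation -> more urban areas tend to have higher crime.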

Regression Models

# Baseline: Crime explained by Income alone
model1 <- lm(formula = Crime ~ Income, data = crime_data)

# Add one further predictor at a time, then both together
model2 <- lm(formula = Crime ~ Income + HighSchoolGrad, data = crime_data)
model3 <- lm(formula = Crime ~ Income + UrbanPop, data = crime_data)
model4 <- lm(formula = Crime ~ Income + HighSchoolGrad + UrbanPop,
             data = crime_data)

# Print each fit in turn, starting with the baseline
summary(model1)
## 
## Call:
## lm(formula = Crime ~ Income, data = crime_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.452 -21.347  -3.102  17.580  69.357 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11.6059    16.7863  -0.691 0.491782    
## Income        2.6115     0.6729   3.881 0.000246 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.6 on 65 degrees of freedom
## Multiple R-squared:  0.1881, Adjusted R-squared:  0.1756 
## F-statistic: 15.06 on 1 and 65 DF,  p-value: 0.0002456
# Model 2: Crime ~ Income + HighSchoolGrad
summary(model2)
## 
## Call:
## lm(formula = Crime ~ Income + HighSchoolGrad, data = crime_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -42.75 -19.61  -4.57  18.52  77.86 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    -46.1094    24.9723  -1.846   0.0695 .
## Income           1.0311     1.0839   0.951   0.3450  
## HighSchoolGrad   1.0540     0.5729   1.840   0.0705 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.14 on 64 degrees of freedom
## Multiple R-squared:  0.2289, Adjusted R-squared:  0.2048 
## F-statistic:   9.5 on 2 and 64 DF,  p-value: 0.000244
# Model 3: Crime ~ Income + UrbanPop
summary(model3)
## 
## Call:
## lm(formula = Crime ~ Income + UrbanPop, data = crime_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.130 -15.590  -6.484  16.595  48.921 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  39.9723    16.3536   2.444   0.0173 *  
## Income       -0.7906     0.8049  -0.982   0.3297    
## UrbanPop      0.6418     0.1110   5.784 2.36e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.91 on 64 degrees of freedom
## Multiple R-squared:  0.4669, Adjusted R-squared:  0.4502 
## F-statistic: 28.02 on 2 and 64 DF,  p-value: 1.815e-09
# Model 4 (full model): Crime ~ Income + HighSchoolGrad + UrbanPop
summary(model4)
## 
## Call:
## lm(formula = Crime ~ Income + HighSchoolGrad + UrbanPop, data = crime_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.407 -15.080  -6.588  16.178  50.125 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     59.7147    28.5895   2.089   0.0408 *  
## Income          -0.3831     0.9405  -0.407   0.6852    
## HighSchoolGrad  -0.4673     0.5544  -0.843   0.4025    
## UrbanPop         0.6972     0.1291   5.399 1.08e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.95 on 63 degrees of freedom
## Multiple R-squared:  0.4728, Adjusted R-squared:  0.4477 
## F-statistic: 18.83 on 3 and 63 DF,  p-value: 7.823e-09
# AIC comparison across the four fits: prints one row per model with its
# parameter count (df) and AIC; the smaller the AIC, the better the model
AIC(model1, model2, model3, model4)
##        df      AIC
## model1  3 628.6045
## model2  4 627.1524
## model3  4 602.4276
## model4  5 603.6764

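Interpretation: 1. On its own, Income has a significant positive relationship with Crime (b = 2.61, p < 0.001), but it is no longer significant once HighSchoolGrad or UrbanPop is added to the model. 2. HighSchoolGrad adds little predictive power: its coefficient is only marginally significant alongside Income (b = 1.05, p ≈ 0.07) and is not significant in the full model (p ≈ 0.40), so these models do not show that crime decreases as education rises. 3. UrbanPop is the only consistently significant predictor (p < 0.001 in both models that include it). The full model (Income + HighSchoolGrad + UrbanPop) has the highest R² (0.473), but the Income + UrbanPop model has the lowest AIC (602.4 vs. 603.7) and the highest adjusted R² (0.450 vs. 0.448).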

Memo to the Chief of the Florida Police Department

## 
## ## Memo
## **To:** Chief, Florida Police Department
## **From:** Emma Valentina Tupone
## **Subject:** Socioeconomic Predictors of Florida County Crime Rates
## 
## Dear Chief,
## 
## Our analysis examined **median income**, **high school graduation rates**, and **urban population percentage** as predictors of crime across Florida counties.
## 
## 1. Counties with **higher median incomes** show higher crime rates when income is considered alone, but this relationship is no longer significant once urbanization is taken into account.
## 2. **Education** shows no reliable independent effect: graduation rates are not a significant predictor once urban population is included.
## 3. More **urbanized counties** tend to have elevated crime rates; this is the strongest and most consistent predictor.
## 
## The **predictive model** combines all three variables (Income + HighSchoolGrad + UrbanPop) and explains approximately **47% of the variance (R^2 = 0.47)** in county-level crime rates.
## 
## **Recommendations:**
## 1. Expand community programs that support education and job training.
## 2. Focus prevention resources in high-urban, low-income areas.
## 3. Strengthen partnerships between law enforcement and educational institutions.
## 
## **Limitations:**
## This analysis identifies correlations, not causation. Other social, cultural, or policing factors may also influence crime rates.
## 
## Sincerely,
## *Emma Valentina Tupone*

Final Question

## 
## ### Based on your analysis, which model best predicts Florida's county-level crime rates, and why?
## 
## The multiple regression model including **Income** and **UrbanPop** best predicts Florida's county-level crime rates.
## It balances accuracy and simplicity: it has the lowest AIC (602.4) and the highest adjusted R^2 (0.450). The full model that also includes **HighSchoolGrad** has a slightly higher R^2 (0.473 vs. 0.467) but a higher AIC (603.7), and HighSchoolGrad is not a significant predictor in it. In both models, UrbanPop is the only significant predictor.
# Load mapping packages
library(maps)
library(ggplot2)

# Get county polygons and keep only the Florida rows
fl_map <- map_data("county") %>%
  filter(region == "florida")

# Clean county names to match dataset (crime_data$County is in title case)
fl_map$subregion <- str_to_title(fl_map$subregion)

# Merge crime data with map data. left_join keeps every map row, so a county
# with no matching name in crime_data gets NA for Crime.
# NOTE(review): names spelled differently in the two sources (spacing,
# punctuation) will not match -- verify that no county is left unmatched.
florida_crime_map <- left_join(fl_map, crime_data, by = c("subregion" = "County"))

# Create heatmap of crime rates
ggplot(florida_crime_map, aes(x = long, y = lat, group = group, fill = Crime)) +
  geom_polygon(color = "white") +
  coord_fixed(1.3) +
  # Unmatched counties (NA) are drawn in light grey
  scale_fill_gradient(low = "#FBE8A6", high = "#F76C6C", na.value = "grey90") +
  labs(
    title = "Florida Crime Rates by County",
    fill = "Crime Rate\n(per 1,000 residents)"
  ) +
  theme_minimal() +
  # theme() has to be chained with `+` after theme_minimal(); written as a
  # separate statement it is never applied to the plot and its value is
  # simply printed to the console.
  theme(
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
    legend.position = "right"
  )

Interpretation: This heatmap highlights which Florida counties experience the highest crime rates. 1. Darker (redder) shades indicate higher crime levels. 2. Lighter (pale yellow) areas represent lower crime rates. 3. Counties drawn in grey have no matching record in the crime dataset.