guns <- read_csv("Guns.csv")
## Rows: 1173 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): state, law
## dbl (12): rownames, year, violent, murder, robbery, prisoners, afam, cauc, m...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Clean / Prepare Data

sum(is.na(guns))  # Total number of missing values
## [1] 0
colSums(is.na(guns))  # Number of missing values per column
##   rownames       year    violent     murder    robbery  prisoners       afam 
##          0          0          0          0          0          0          0 
##       cauc       male population     income    density      state        law 
##          0          0          0          0          0          0          0

Selecting Vairables of Interest for outlier removal

selected_data <- guns[, c("violent", "income")]
print(selected_data)
## # A tibble: 1,173 × 2
##    violent income
##      <dbl>  <dbl>
##  1    414.  9563.
##  2    419.  9932 
##  3    413.  9877.
##  4    448.  9541.
##  5    470.  9548.
##  6    448.  9479.
##  7    416   9783 
##  8    431. 10357.
##  9    458. 10726.
## 10    558  11092.
## # ℹ 1,163 more rows
# Function to identify outliers using the IQR method
identify_outliers <- function(data, variable) {
  Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  outliers <- data[data[[variable]] < lower_bound | data[[variable]] > upper_bound, ]
  return(outliers)
}

# Function to remove outliers
remove_outliers <- function(data, variable) {
  Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  clean_data <- data[data[[variable]] >= lower_bound & data[[variable]] <= upper_bound, ]
  return(clean_data)
}

# Function to perform sensitivity analysis
sensitivity_analysis <- function(original_data, cleaned_data, variable) {
  original_mean <- mean(original_data[[variable]], na.rm = TRUE)
  cleaned_mean <- mean(cleaned_data[[variable]], na.rm = TRUE)
  original_sd <- sd(original_data[[variable]], na.rm = TRUE)
  cleaned_sd <- sd(cleaned_data[[variable]], na.rm = TRUE)
  impact <- data.frame(
    Metric = c("Mean", "Standard Deviation"),
    Original = c(original_mean, original_sd),
    Cleaned = c(cleaned_mean, cleaned_sd)
  )
  return(impact)
}

# Apply functions to each selected variable
for (variable in names(selected_data)) {
  cat("\nAnalyzing variable:", variable, "\n")
  
  # Identify outliers
  outliers <- identify_outliers(selected_data, variable)
  cat("Number of outliers identified:", nrow(outliers), "\n")
  
  # Remove outliers
  cleaned_data <- remove_outliers(selected_data, variable)
  
  # Perform sensitivity analysis
  impact <- sensitivity_analysis(selected_data, cleaned_data, variable)
  print(impact)
  
  # Optionally visualize the distribution
  p1 <- ggplot(selected_data, aes_string(x = variable)) +
    geom_histogram(binwidth = 1, fill = "blue", alpha = 0.5, boundary = 0) +
    labs(title = paste("Distribution of", variable, "with outliers"),
         x = variable, y = "Count") +
    theme_minimal()
  
  p2 <- ggplot(cleaned_data, aes_string(x = variable)) +
    geom_histogram(binwidth = 1, fill = "red", alpha = 0.5, boundary = 0) +
    labs(title = paste("Distribution of", variable, "without outliers"),
         x = variable, y = "Count") +
    theme_minimal()
  
  # Save plots
  ggsave(paste0(variable, "_with_outliers.png"), plot = p1)
  ggsave(paste0(variable, "_without_outliers.png"), plot = p2)
}
## 
## Analyzing variable: violent 
## Number of outliers identified: 26 
##               Metric Original  Cleaned
## 1               Mean 503.0747 470.2028
## 2 Standard Deviation 334.2772 244.3601
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## 
## Analyzing variable: income 
## Number of outliers identified: 25 
##               Metric  Original   Cleaned
## 1               Mean 13724.796 13557.084
## 2 Standard Deviation  2554.542  2307.462
## Saving 7 x 5 in image
## Saving 7 x 5 in image

Variables of Interest

# Select specific columns by name
guns_main <- guns[, c("law", "income", "male", "violent")]

Descriptive Statistics

guns_main_renamed <- guns_main %>% 
  rename(
    "Shall Carry Law" = law,
    "Income Per Capita" = income,
    "Male Population % 10-29" = male,
    "Violent Crime Rate" = violent
  )

datasummary_skim(guns_main_renamed)
tinytable_xfdrw7ucs0puhdt6ty6q
Unique Missing Pct. Mean SD Min Median Max Histogram
Income Per Capita 1170 0 13724.8 2554.5 8554.9 13401.5 23646.7
Male Population % 10-29 1173 0 16.1 1.7 12.2 15.9 22.4
Violent Crime Rate 1101 0 503.1 334.3 47.0 443.0 2921.8
Shall Carry Law N %
no 888 75.7
yes 285 24.3
print(datasummary_skim(guns_main_renamed))
## 
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## |                         | Unique | Missing Pct. | Mean    | SD     | Min    | Median  | Max     | Histogram                                        |
## +=========================+========+==============+=========+========+========+=========+=========+==================================================+
## | Income Per Capita       | 1170   | 0            | 13724.8 | 2554.5 | 8554.9 | 13401.5 | 23646.7 | ![](tinytable_assets/idnbe7vveb6wnhbsjcw95e.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Male Population % 10-29 | 1173   | 0            | 16.1    | 1.7    | 12.2   | 15.9    | 22.4    | ![](tinytable_assets/idn3tpmrbi5sijt94v5ldi.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Violent Crime Rate      | 1101   | 0            | 503.1   | 334.3  | 47.0   | 443.0   | 2921.8  | ![](tinytable_assets/idzectafuhfxcj3fbu6evr.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Shall Carry Law         | N      | %            |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | no                      | 888    | 75.7         |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | yes                     | 285    | 24.3         |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
# Select specific columns by name
guns_main <- guns[, c("law", "income", "male", "violent")]

Clean / Prepare Data

sum(is.na(guns))  # Total number of missing values
## [1] 0
colSums(is.na(guns))  # Number of missing values per column
##   rownames       year    violent     murder    robbery  prisoners       afam 
##          0          0          0          0          0          0          0 
##       cauc       male population     income    density      state        law 
##          0          0          0          0          0          0          0

Selecting Vairables of Interest for outlier removal

selected_data <- guns[, c("violent", "income", "density")]
print(selected_data)
## # A tibble: 1,173 × 3
##    violent income density
##      <dbl>  <dbl>   <dbl>
##  1    414.  9563.  0.0746
##  2    419.  9932   0.0756
##  3    413.  9877.  0.0762
##  4    448.  9541.  0.0768
##  5    470.  9548.  0.0772
##  6    448.  9479.  0.0773
##  7    416   9783   0.0775
##  8    431. 10357.  0.0778
##  9    458. 10726.  0.0782
## 10    558  11092.  0.0786
## # ℹ 1,163 more rows
# Function to identify outliers using the IQR method
identify_outliers <- function(data, variable) {
  Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  outliers <- data[data[[variable]] < lower_bound | data[[variable]] > upper_bound, ]
  return(outliers)
}

# Function to remove outliers
remove_outliers <- function(data, variable) {
  Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  clean_data <- data[data[[variable]] >= lower_bound & data[[variable]] <= upper_bound, ]
  return(clean_data)
}

# Function to perform sensitivity analysis
sensitivity_analysis <- function(original_data, cleaned_data, variable) {
  original_mean <- mean(original_data[[variable]], na.rm = TRUE)
  cleaned_mean <- mean(cleaned_data[[variable]], na.rm = TRUE)
  original_sd <- sd(original_data[[variable]], na.rm = TRUE)
  cleaned_sd <- sd(cleaned_data[[variable]], na.rm = TRUE)
  impact <- data.frame(
    Metric = c("Mean", "Standard Deviation"),
    Original = c(original_mean, original_sd),
    Cleaned = c(cleaned_mean, cleaned_sd)
  )
  return(impact)
}

# Apply functions to each selected variable
for (variable in names(selected_data)) {
  cat("\nAnalyzing variable:", variable, "\n")
  
  # Identify outliers
  outliers <- identify_outliers(selected_data, variable)
  cat("Number of outliers identified:", nrow(outliers), "\n")
  
  # Remove outliers
  cleaned_data <- remove_outliers(selected_data, variable)
  
  # Perform sensitivity analysis
  impact <- sensitivity_analysis(selected_data, cleaned_data, variable)
  print(impact)
  
  # Optionally visualize the distribution
  p1 <- ggplot(selected_data, aes_string(x = variable)) +
    geom_histogram(binwidth = 1, fill = "blue", alpha = 0.5, boundary = 0) +
    labs(title = paste("Distribution of", variable, "with outliers"),
         x = variable, y = "Count") +
    theme_minimal()
  
  p2 <- ggplot(cleaned_data, aes_string(x = variable)) +
    geom_histogram(binwidth = 1, fill = "red", alpha = 0.5, boundary = 0) +
    labs(title = paste("Distribution of", variable, "without outliers"),
         x = variable, y = "Count") +
    theme_minimal()
  
  # Save plots
  ggsave(paste0(variable, "_with_outliers.png"), plot = p1)
  ggsave(paste0(variable, "_without_outliers.png"), plot = p2)
}
## 
## Analyzing variable: violent 
## Number of outliers identified: 26 
##               Metric Original  Cleaned
## 1               Mean 503.0747 470.2028
## 2 Standard Deviation 334.2772 244.3601
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## 
## Analyzing variable: income 
## Number of outliers identified: 25 
##               Metric  Original   Cleaned
## 1               Mean 13724.796 13557.084
## 2 Standard Deviation  2554.542  2307.462
## Saving 7 x 5 in image
## Saving 7 x 5 in image
## 
## Analyzing variable: density 
## Number of outliers identified: 138 
##               Metric  Original    Cleaned
## 1               Mean 0.3520382 0.09607251
## 2 Standard Deviation 1.3554719 0.08987374
## Saving 7 x 5 in image
## Saving 7 x 5 in image
# Select specific columns by name
guns_main <- guns[, c("law", "income", "male", "violent")]
guns_main_renamed <- guns_main %>%
  rename(
    "Shall Carry Law" = law,
    "Income Per Capita" = income,
    "Male Population % 10-29" = male,
    "Violent Crime Rate" = violent
  )

datasummary_skim(guns_main_renamed)
tinytable_y2y80xpt3zv5f4lwu5b8
Unique Missing Pct. Mean SD Min Median Max Histogram
Income Per Capita 1170 0 13724.8 2554.5 8554.9 13401.5 23646.7
Male Population % 10-29 1173 0 16.1 1.7 12.2 15.9 22.4
Violent Crime Rate 1101 0 503.1 334.3 47.0 443.0 2921.8
Shall Carry Law N %
no 888 75.7
yes 285 24.3
print(datasummary_skim(guns_main_renamed))
## 
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## |                         | Unique | Missing Pct. | Mean    | SD     | Min    | Median  | Max     | Histogram                                        |
## +=========================+========+==============+=========+========+========+=========+=========+==================================================+
## | Income Per Capita       | 1170   | 0            | 13724.8 | 2554.5 | 8554.9 | 13401.5 | 23646.7 | ![](tinytable_assets/idgwx5qe74ujy6xnwj5r0r.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Male Population % 10-29 | 1173   | 0            | 16.1    | 1.7    | 12.2   | 15.9    | 22.4    | ![](tinytable_assets/id4stl01hamnx6h5bxcz9f.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Violent Crime Rate      | 1101   | 0            | 503.1   | 334.3  | 47.0   | 443.0   | 2921.8  | ![](tinytable_assets/idgru1sp6i1342yyoss9vp.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Shall Carry Law         | N      | %            |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | no                      | 888    | 75.7         |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | yes                     | 285    | 24.3         |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+

Interaction plot between Violent Crime Rate and Year by Shall Carry Law

interaction_plot <- ggplot(guns, aes(x = year, y = violent, color = as.factor(law), group = law)) +
  #geom_point(alpha = 0.1) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Figure 1: Interaction between Violent Crime Rate and Year by Shall Carry Law",
       x = "Year",
       y = "Violent Crime Rate per 100,000",
       color = "State Shall Carry Law
       Instituted?") +
  theme_minimal() +
  scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes"))

print(interaction_plot)
## `geom_smooth()` using formula = 'y ~ x'

Interaction plot between Violent Crime Rate and Young Male Population Percentage by Shall Carry Law

interaction_plot <- ggplot(guns, aes(x = male, y = violent, color = as.factor(law), group = law)) +
  #geom_point(alpha = 0.1) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Figure 2: Interaction between Violent Crime Rate and Young Male Pop % 
               by Shall Carry Law",
       x = "Young Male Population %",
       y = "Violent Crime Rate per 100,000",
       color = "State Shall Carry Law
       Instituted?") +
  theme_minimal() +
  scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes"))

print(interaction_plot)
## `geom_smooth()` using formula = 'y ~ x'

Interaction plot between Violent Crime Rate and Per-Capita Income by Shall Carry Law

interaction_plot <- ggplot(guns, aes(x = income, y = violent, color = as.factor(law), group = law)) +
  #geom_point(alpha = 0.1) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Figure 3: Interaction between Violent Crime Rate and Per-Capita Income 
               by Shall Carry Law",
       x = "Per-Capita Income",
       y = "Violent Crime Rate per 100,000",
       color = "State Shall Carry Law
       Instituted?") +
  theme_minimal() +
  scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes"))

print(interaction_plot)
## `geom_smooth()` using formula = 'y ~ x'

Initial Regression Analysis

#Looking into assumption violations in performance package

library(performance)
## 
## Attaching package: 'performance'
## The following objects are masked from 'package:modelr':
## 
##     mae, mse, rmse
## The following object is masked from 'package:fixest':
## 
##     r2
# Fit the simple regression model
model1 <- lm(violent ~ law, data = guns)

# Fit the multiple predictor regression model
model2 <- lm(violent ~ law + income + male, data = guns)

# Check assumptions
check_model(model1)         # Check linearity assumption

check_model(model2)

addressing heterodasticity

# Transform the dependent variable using log
model1_log <- lm(log(violent) ~ law, data = guns)
model2_log <- lm(log(violent) ~ law * income, data = guns)
model3_log <- lm(log(violent) ~ law * male, data = guns)

check_model(model1_log)

check_model(model2_log)

Fitting Models

# Fit the simple regression model
#model1 <- lm(violent ~ law, data = guns)

# Fit the multiple predictor regression model
#model2 <- lm(violent ~ law + income + male, data = guns)

# Summarize the models
summary(model1_log)
## 
## Call:
## lm(formula = log(violent) ~ law, data = guns)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.28477 -0.42748  0.04655  0.42172  1.84504 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.13492    0.02072  296.13   <2e-16 ***
## lawyes      -0.44296    0.04203  -10.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6174 on 1171 degrees of freedom
## Multiple R-squared:  0.08664,    Adjusted R-squared:  0.08586 
## F-statistic: 111.1 on 1 and 1171 DF,  p-value: < 2.2e-16
summary(model2_log)
## 
## Call:
## lm(formula = log(violent) ~ law * income, data = guns)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1241 -0.3995  0.0673  0.4158  1.4994 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.950e+00  9.751e-02  50.771  < 2e-16 ***
## lawyes        -1.047e+00  2.750e-01  -3.809 0.000147 ***
## income         8.630e-05  6.966e-06  12.388  < 2e-16 ***
## lawyes:income  4.403e-05  1.983e-05   2.220 0.026609 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5704 on 1169 degrees of freedom
## Multiple R-squared:  0.2216, Adjusted R-squared:  0.2196 
## F-statistic: 110.9 on 3 and 1169 DF,  p-value: < 2.2e-16
summary(model3_log)
## 
## Call:
## lm(formula = log(violent) ~ law * male, data = guns)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1833 -0.4253  0.0648  0.4438  1.7036 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.52921    0.19193  39.229  < 2e-16 ***
## lawyes      -0.07786    0.41620  -0.187    0.852    
## male        -0.08526    0.01167  -7.305 5.13e-13 ***
## lawyes:male -0.03023    0.02681  -1.128    0.260    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5987 on 1169 degrees of freedom
## Multiple R-squared:  0.1426, Adjusted R-squared:  0.1404 
## F-statistic:  64.8 on 3 and 1169 DF,  p-value: < 2.2e-16
# Create detailed regression tables using sjPlot
tab_model(model1_log, model2_log, model3_log,
          show.ci = FALSE,  # Show confidence intervals
          show.se = TRUE,   # Show standard errors
          show.stat = TRUE, # Show statistics (t-value)
          show.p = TRUE,    # Show p-values
          title = "Table 2: Log-Transformed Regression Analysis of Violent Crime Rates",
          dv.labels = c("Model 1", "Model 2", "Model 3"),
          pred.labels = c("(Intercept)", 
                          "Shall Carry States",
                          "Income Per Capita",
                          "Income Per Capita in
                           Shall Carry States",
                          "Male Population % 10-29", 
                          "Male Population % 10-29
                           in Shall Carry States"),
          string.ci = "Conf. Int.", 
          string.se = "Std. Error", 
          string.stat = "t value",
          string.p = "p value")
Table 2: Log-Transformed Regression Analysis of Violent Crime Rates
  Model 1 Model 2 Model 3
Predictors Estimates Std. Error t value p value Estimates Std. Error t value p value Estimates Std. Error t value p value
(Intercept) 6.13 0.02 296.13 <0.001 4.95 0.10 50.77 <0.001 7.53 0.19 39.23 <0.001
Shall Carry States -0.44 0.04 -10.54 <0.001 -1.05 0.27 -3.81 <0.001 -0.08 0.42 -0.19 0.852
Income Per Capita 0.00 0.00 12.39 <0.001
Income Per Capita in Shall Carry States 0.00 0.00 2.22 0.027
Male Population % 10-29 -0.09 0.01 -7.30 <0.001
Male Population % 10-29 in Shall Carry States -0.03 0.03 -1.13 0.260
Observations 1173 1173 1173
R2 / R2 adjusted 0.087 / 0.086 0.222 / 0.220 0.143 / 0.140

#Appendix ChatGPT4 was used as a learning aid in this assingment to troubleshoot error messages and give tips on code.