Clean / Prepare Data

sum(is.na(guns))  # Total number of missing values

## [1] 0

colSums(is.na(guns))  # Number of missing values per column

##   rownames       year    violent     murder    robbery  prisoners       afam 
##          0          0          0          0          0          0          0 
##       cauc       male population     income    density      state        law 
##          0          0          0          0          0          0          0

Selecting Vairables of Interest for outlier removal

selected_data <- guns[, c("violent", "income")]
print(selected_data)

## # A tibble: 1,173 × 2
##    violent income
##      <dbl>  <dbl>
##  1    414.  9563.
##  2    419.  9932 
##  3    413.  9877.
##  4    448.  9541.
##  5    470.  9548.
##  6    448.  9479.
##  7    416   9783 
##  8    431. 10357.
##  9    458. 10726.
## 10    558  11092.
## # ℹ 1,163 more rows

# Function to identify outliers using the IQR method
identify_outliers <- function(data, variable) {
  Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  outliers <- data[data[[variable]] < lower_bound | data[[variable]] > upper_bound, ]
  return(outliers)
}

# Function to remove outliers
remove_outliers <- function(data, variable) {
  Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  clean_data <- data[data[[variable]] >= lower_bound & data[[variable]] <= upper_bound, ]
  return(clean_data)
}

# Function to perform sensitivity analysis
sensitivity_analysis <- function(original_data, cleaned_data, variable) {
  original_mean <- mean(original_data[[variable]], na.rm = TRUE)
  cleaned_mean <- mean(cleaned_data[[variable]], na.rm = TRUE)
  original_sd <- sd(original_data[[variable]], na.rm = TRUE)
  cleaned_sd <- sd(cleaned_data[[variable]], na.rm = TRUE)
  impact <- data.frame(
    Metric = c("Mean", "Standard Deviation"),
    Original = c(original_mean, original_sd),
    Cleaned = c(cleaned_mean, cleaned_sd)
  )
  return(impact)
}

# Apply functions to each selected variable
for (variable in names(selected_data)) {
  cat("\nAnalyzing variable:", variable, "\n")
  
  # Identify outliers
  outliers <- identify_outliers(selected_data, variable)
  cat("Number of outliers identified:", nrow(outliers), "\n")
  
  # Remove outliers
  cleaned_data <- remove_outliers(selected_data, variable)
  
  # Perform sensitivity analysis
  impact <- sensitivity_analysis(selected_data, cleaned_data, variable)
  print(impact)
  
  # Optionally visualize the distribution
  p1 <- ggplot(selected_data, aes_string(x = variable)) +
    geom_histogram(binwidth = 1, fill = "blue", alpha = 0.5, boundary = 0) +
    labs(title = paste("Distribution of", variable, "with outliers"),
         x = variable, y = "Count") +
    theme_minimal()
  
  p2 <- ggplot(cleaned_data, aes_string(x = variable)) +
    geom_histogram(binwidth = 1, fill = "red", alpha = 0.5, boundary = 0) +
    labs(title = paste("Distribution of", variable, "without outliers"),
         x = variable, y = "Count") +
    theme_minimal()
  
  # Save plots
  ggsave(paste0(variable, "_with_outliers.png"), plot = p1)
  ggsave(paste0(variable, "_without_outliers.png"), plot = p2)
}

## 
## Analyzing variable: violent 
## Number of outliers identified: 26 
##               Metric Original  Cleaned
## 1               Mean 503.0747 470.2028
## 2 Standard Deviation 334.2772 244.3601

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Saving 7 x 5 in image
## Saving 7 x 5 in image

## 
## Analyzing variable: income 
## Number of outliers identified: 25 
##               Metric  Original   Cleaned
## 1               Mean 13724.796 13557.084
## 2 Standard Deviation  2554.542  2307.462

## Saving 7 x 5 in image
## Saving 7 x 5 in image

Variables of Interest

# Select specific columns by name
guns_main <- guns[, c("law", "income", "male", "violent")]

Descriptive Statistics

guns_main_renamed <- guns_main %>% 
  rename(
    "Shall Carry Law" = law,
    "Income Per Capita" = income,
    "Male Population % 10-29" = male,
    "Violent Crime Rate" = violent
  )

datasummary_skim(guns_main_renamed)

tinytable_xfdrw7ucs0puhdt6ty6q

	Unique	Missing Pct.	Mean	SD	Min	Median	Max
Income Per Capita	1170	0	13724.8	2554.5	8554.9	13401.5	23646.7
Male Population % 10-29	1173	0	16.1	1.7	12.2	15.9	22.4
Violent Crime Rate	1101	0	503.1	334.3	47.0	443.0	2921.8
Shall Carry Law	N	%
no	888	75.7
yes	285	24.3

print(datasummary_skim(guns_main_renamed))

## 
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## |                         | Unique | Missing Pct. | Mean    | SD     | Min    | Median  | Max     | Histogram                                        |
## +=========================+========+==============+=========+========+========+=========+=========+==================================================+
## | Income Per Capita       | 1170   | 0            | 13724.8 | 2554.5 | 8554.9 | 13401.5 | 23646.7 | ![](tinytable_assets/idnbe7vveb6wnhbsjcw95e.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Male Population % 10-29 | 1173   | 0            | 16.1    | 1.7    | 12.2   | 15.9    | 22.4    | ![](tinytable_assets/idn3tpmrbi5sijt94v5ldi.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Violent Crime Rate      | 1101   | 0            | 503.1   | 334.3  | 47.0   | 443.0   | 2921.8  | ![](tinytable_assets/idzectafuhfxcj3fbu6evr.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Shall Carry Law         | N      | %            |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | no                      | 888    | 75.7         |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | yes                     | 285    | 24.3         |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+

# Select specific columns by name
guns_main <- guns[, c("law", "income", "male", "violent")]

Clean / Prepare Data

sum(is.na(guns))  # Total number of missing values

## [1] 0

colSums(is.na(guns))  # Number of missing values per column

##   rownames       year    violent     murder    robbery  prisoners       afam 
##          0          0          0          0          0          0          0 
##       cauc       male population     income    density      state        law 
##          0          0          0          0          0          0          0

Selecting Vairables of Interest for outlier removal

selected_data <- guns[, c("violent", "income", "density")]
print(selected_data)

## # A tibble: 1,173 × 3
##    violent income density
##      <dbl>  <dbl>   <dbl>
##  1    414.  9563.  0.0746
##  2    419.  9932   0.0756
##  3    413.  9877.  0.0762
##  4    448.  9541.  0.0768
##  5    470.  9548.  0.0772
##  6    448.  9479.  0.0773
##  7    416   9783   0.0775
##  8    431. 10357.  0.0778
##  9    458. 10726.  0.0782
## 10    558  11092.  0.0786
## # ℹ 1,163 more rows

# Function to identify outliers using the IQR method
identify_outliers <- function(data, variable) {
  Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  outliers <- data[data[[variable]] < lower_bound | data[[variable]] > upper_bound, ]
  return(outliers)
}

# Function to remove outliers
remove_outliers <- function(data, variable) {
  Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
  Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  clean_data <- data[data[[variable]] >= lower_bound & data[[variable]] <= upper_bound, ]
  return(clean_data)
}

# Function to perform sensitivity analysis
sensitivity_analysis <- function(original_data, cleaned_data, variable) {
  original_mean <- mean(original_data[[variable]], na.rm = TRUE)
  cleaned_mean <- mean(cleaned_data[[variable]], na.rm = TRUE)
  original_sd <- sd(original_data[[variable]], na.rm = TRUE)
  cleaned_sd <- sd(cleaned_data[[variable]], na.rm = TRUE)
  impact <- data.frame(
    Metric = c("Mean", "Standard Deviation"),
    Original = c(original_mean, original_sd),
    Cleaned = c(cleaned_mean, cleaned_sd)
  )
  return(impact)
}

# Apply functions to each selected variable
for (variable in names(selected_data)) {
  cat("\nAnalyzing variable:", variable, "\n")
  
  # Identify outliers
  outliers <- identify_outliers(selected_data, variable)
  cat("Number of outliers identified:", nrow(outliers), "\n")
  
  # Remove outliers
  cleaned_data <- remove_outliers(selected_data, variable)
  
  # Perform sensitivity analysis
  impact <- sensitivity_analysis(selected_data, cleaned_data, variable)
  print(impact)
  
  # Optionally visualize the distribution
  p1 <- ggplot(selected_data, aes_string(x = variable)) +
    geom_histogram(binwidth = 1, fill = "blue", alpha = 0.5, boundary = 0) +
    labs(title = paste("Distribution of", variable, "with outliers"),
         x = variable, y = "Count") +
    theme_minimal()
  
  p2 <- ggplot(cleaned_data, aes_string(x = variable)) +
    geom_histogram(binwidth = 1, fill = "red", alpha = 0.5, boundary = 0) +
    labs(title = paste("Distribution of", variable, "without outliers"),
         x = variable, y = "Count") +
    theme_minimal()
  
  # Save plots
  ggsave(paste0(variable, "_with_outliers.png"), plot = p1)
  ggsave(paste0(variable, "_without_outliers.png"), plot = p2)
}

## 
## Analyzing variable: violent 
## Number of outliers identified: 26 
##               Metric Original  Cleaned
## 1               Mean 503.0747 470.2028
## 2 Standard Deviation 334.2772 244.3601

## Saving 7 x 5 in image
## Saving 7 x 5 in image

## 
## Analyzing variable: income 
## Number of outliers identified: 25 
##               Metric  Original   Cleaned
## 1               Mean 13724.796 13557.084
## 2 Standard Deviation  2554.542  2307.462

## Saving 7 x 5 in image
## Saving 7 x 5 in image

## 
## Analyzing variable: density 
## Number of outliers identified: 138 
##               Metric  Original    Cleaned
## 1               Mean 0.3520382 0.09607251
## 2 Standard Deviation 1.3554719 0.08987374

## Saving 7 x 5 in image
## Saving 7 x 5 in image

# Select specific columns by name
guns_main <- guns[, c("law", "income", "male", "violent")]

guns_main_renamed <- guns_main %>%
  rename(
    "Shall Carry Law" = law,
    "Income Per Capita" = income,
    "Male Population % 10-29" = male,
    "Violent Crime Rate" = violent
  )

datasummary_skim(guns_main_renamed)

tinytable_y2y80xpt3zv5f4lwu5b8

	Unique	Missing Pct.	Mean	SD	Min	Median	Max
Income Per Capita	1170	0	13724.8	2554.5	8554.9	13401.5	23646.7
Male Population % 10-29	1173	0	16.1	1.7	12.2	15.9	22.4
Violent Crime Rate	1101	0	503.1	334.3	47.0	443.0	2921.8
Shall Carry Law	N	%
no	888	75.7
yes	285	24.3

print(datasummary_skim(guns_main_renamed))

## 
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## |                         | Unique | Missing Pct. | Mean    | SD     | Min    | Median  | Max     | Histogram                                        |
## +=========================+========+==============+=========+========+========+=========+=========+==================================================+
## | Income Per Capita       | 1170   | 0            | 13724.8 | 2554.5 | 8554.9 | 13401.5 | 23646.7 | ![](tinytable_assets/idgwx5qe74ujy6xnwj5r0r.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Male Population % 10-29 | 1173   | 0            | 16.1    | 1.7    | 12.2   | 15.9    | 22.4    | ![](tinytable_assets/id4stl01hamnx6h5bxcz9f.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Violent Crime Rate      | 1101   | 0            | 503.1   | 334.3  | 47.0   | 443.0   | 2921.8  | ![](tinytable_assets/idgru1sp6i1342yyoss9vp.png) |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Shall Carry Law         | N      | %            |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | no                      | 888    | 75.7         |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | yes                     | 285    | 24.3         |         |        |        |         |         |                                                  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+

Interaction plot between Violent Crime Rate and Year by Shall Carry Law

interaction_plot <- ggplot(guns, aes(x = year, y = violent, color = as.factor(law), group = law)) +
  #geom_point(alpha = 0.1) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Figure 1: Interaction between Violent Crime Rate and Year by Shall Carry Law",
       x = "Year",
       y = "Violent Crime Rate per 100,000",
       color = "State Shall Carry Law
       Instituted?") +
  theme_minimal() +
  scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes"))

print(interaction_plot)

## `geom_smooth()` using formula = 'y ~ x'

Interaction plot between Violent Crime Rate and Young Male Population Percentage by Shall Carry Law

interaction_plot <- ggplot(guns, aes(x = male, y = violent, color = as.factor(law), group = law)) +
  #geom_point(alpha = 0.1) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Figure 2: Interaction between Violent Crime Rate and Young Male Pop % 
               by Shall Carry Law",
       x = "Young Male Population %",
       y = "Violent Crime Rate per 100,000",
       color = "State Shall Carry Law
       Instituted?") +
  theme_minimal() +
  scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes"))

print(interaction_plot)

## `geom_smooth()` using formula = 'y ~ x'

Interaction plot between Violent Crime Rate and Per-Capita Income by Shall Carry Law

interaction_plot <- ggplot(guns, aes(x = income, y = violent, color = as.factor(law), group = law)) +
  #geom_point(alpha = 0.1) +
  geom_smooth(method = "loess", se = TRUE) +
  labs(title = "Figure 3: Interaction between Violent Crime Rate and Per-Capita Income 
               by Shall Carry Law",
       x = "Per-Capita Income",
       y = "Violent Crime Rate per 100,000",
       color = "State Shall Carry Law
       Instituted?") +
  theme_minimal() +
  scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes"))

print(interaction_plot)

## `geom_smooth()` using formula = 'y ~ x'

Initial Regression Analysis

#Looking into assumption violations in performance package

library(performance)

## 
## Attaching package: 'performance'

## The following objects are masked from 'package:modelr':
## 
##     mae, mse, rmse

## The following object is masked from 'package:fixest':
## 
##     r2

# Fit the simple regression model
model1 <- lm(violent ~ law, data = guns)

# Fit the multiple predictor regression model
model2 <- lm(violent ~ law + income + male, data = guns)

# Check assumptions
check_model(model1)         # Check linearity assumption

check_model(model2)

addressing heterodasticity

# Transform the dependent variable using log
model1_log <- lm(log(violent) ~ law, data = guns)
model2_log <- lm(log(violent) ~ law * income, data = guns)
model3_log <- lm(log(violent) ~ law * male, data = guns)

check_model(model1_log)

check_model(model2_log)

Fitting Models

# Fit the simple regression model
#model1 <- lm(violent ~ law, data = guns)

# Fit the multiple predictor regression model
#model2 <- lm(violent ~ law + income + male, data = guns)

# Summarize the models
summary(model1_log)

## 
## Call:
## lm(formula = log(violent) ~ law, data = guns)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.28477 -0.42748  0.04655  0.42172  1.84504 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.13492    0.02072  296.13   <2e-16 ***
## lawyes      -0.44296    0.04203  -10.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6174 on 1171 degrees of freedom
## Multiple R-squared:  0.08664,    Adjusted R-squared:  0.08586 
## F-statistic: 111.1 on 1 and 1171 DF,  p-value: < 2.2e-16

summary(model2_log)

## 
## Call:
## lm(formula = log(violent) ~ law * income, data = guns)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1241 -0.3995  0.0673  0.4158  1.4994 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.950e+00  9.751e-02  50.771  < 2e-16 ***
## lawyes        -1.047e+00  2.750e-01  -3.809 0.000147 ***
## income         8.630e-05  6.966e-06  12.388  < 2e-16 ***
## lawyes:income  4.403e-05  1.983e-05   2.220 0.026609 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5704 on 1169 degrees of freedom
## Multiple R-squared:  0.2216, Adjusted R-squared:  0.2196 
## F-statistic: 110.9 on 3 and 1169 DF,  p-value: < 2.2e-16

summary(model3_log)

## 
## Call:
## lm(formula = log(violent) ~ law * male, data = guns)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1833 -0.4253  0.0648  0.4438  1.7036 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.52921    0.19193  39.229  < 2e-16 ***
## lawyes      -0.07786    0.41620  -0.187    0.852    
## male        -0.08526    0.01167  -7.305 5.13e-13 ***
## lawyes:male -0.03023    0.02681  -1.128    0.260    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5987 on 1169 degrees of freedom
## Multiple R-squared:  0.1426, Adjusted R-squared:  0.1404 
## F-statistic:  64.8 on 3 and 1169 DF,  p-value: < 2.2e-16

# Create detailed regression tables using sjPlot
tab_model(model1_log, model2_log, model3_log,
          show.ci = FALSE,  # Show confidence intervals
          show.se = TRUE,   # Show standard errors
          show.stat = TRUE, # Show statistics (t-value)
          show.p = TRUE,    # Show p-values
          title = "Table 2: Log-Transformed Regression Analysis of Violent Crime Rates",
          dv.labels = c("Model 1", "Model 2", "Model 3"),
          pred.labels = c("(Intercept)", 
                          "Shall Carry States",
                          "Income Per Capita",
                          "Income Per Capita in
                           Shall Carry States",
                          "Male Population % 10-29", 
                          "Male Population % 10-29
                           in Shall Carry States"),
          string.ci = "Conf. Int.", 
          string.se = "Std. Error", 
          string.stat = "t value",
          string.p = "p value")

Table 2: Log-Transformed Regression Analysis of Violent Crime Rates
	Model 1				Model 2				Model 3
Predictors	Estimates	Std. Error	t value	p value	Estimates	Std. Error	t value	p value	Estimates	Std. Error	t value	p value
(Intercept)	6.13	0.02	296.13	<0.001	4.95	0.10	50.77	<0.001	7.53	0.19	39.23	<0.001
Shall Carry States	-0.44	0.04	-10.54	<0.001	-1.05	0.27	-3.81	<0.001	-0.08	0.42	-0.19	0.852
Income Per Capita					0.00	0.00	12.39	<0.001
Income Per Capita in Shall Carry States					0.00	0.00	2.22	0.027
Male Population % 10-29									-0.09	0.01	-7.30	<0.001
Male Population % 10-29 in Shall Carry States									-0.03	0.03	-1.13	0.260
Observations	1173				1173				1173
R² / R² adjusted	0.087 / 0.086				0.222 / 0.220				0.143 / 0.140

#Appendix ChatGPT4 was used as a learning aid in this assingment to troubleshoot error messages and give tips on code.

Final Project

Benny Konishi

2024-08-13

Clean / Prepare Data

Selecting Vairables of Interest for outlier removal

Variables of Interest

Descriptive Statistics

Clean / Prepare Data

Selecting Vairables of Interest for outlier removal

Interaction plot between Violent Crime Rate and Year by Shall Carry Law

Interaction plot between Violent Crime Rate and Young Male Population Percentage by Shall Carry Law

Interaction plot between Violent Crime Rate and Per-Capita Income by Shall Carry Law

Initial Regression Analysis

Fitting Models