guns <- read_csv("Guns.csv")
## Rows: 1173 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): state, law
## dbl (12): rownames, year, violent, murder, robbery, prisoners, afam, cauc, m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sum(is.na(guns)) # Total number of missing values
## [1] 0
colSums(is.na(guns)) # Number of missing values per column
## rownames year violent murder robbery prisoners afam
## 0 0 0 0 0 0 0
## cauc male population income density state law
## 0 0 0 0 0 0 0
selected_data <- guns[, c("violent", "income")]
print(selected_data)
## # A tibble: 1,173 × 2
## violent income
## <dbl> <dbl>
## 1 414. 9563.
## 2 419. 9932
## 3 413. 9877.
## 4 448. 9541.
## 5 470. 9548.
## 6 448. 9479.
## 7 416 9783
## 8 431. 10357.
## 9 458. 10726.
## 10 558 11092.
## # ℹ 1,163 more rows
# Function to identify outliers using the IQR method
identify_outliers <- function(data, variable) {
Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliers <- data[data[[variable]] < lower_bound | data[[variable]] > upper_bound, ]
return(outliers)
}
# Function to remove outliers
remove_outliers <- function(data, variable) {
Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
clean_data <- data[data[[variable]] >= lower_bound & data[[variable]] <= upper_bound, ]
return(clean_data)
}
# Function to perform sensitivity analysis
sensitivity_analysis <- function(original_data, cleaned_data, variable) {
original_mean <- mean(original_data[[variable]], na.rm = TRUE)
cleaned_mean <- mean(cleaned_data[[variable]], na.rm = TRUE)
original_sd <- sd(original_data[[variable]], na.rm = TRUE)
cleaned_sd <- sd(cleaned_data[[variable]], na.rm = TRUE)
impact <- data.frame(
Metric = c("Mean", "Standard Deviation"),
Original = c(original_mean, original_sd),
Cleaned = c(cleaned_mean, cleaned_sd)
)
return(impact)
}
# Apply functions to each selected variable
for (variable in names(selected_data)) {
cat("\nAnalyzing variable:", variable, "\n")
# Identify outliers
outliers <- identify_outliers(selected_data, variable)
cat("Number of outliers identified:", nrow(outliers), "\n")
# Remove outliers
cleaned_data <- remove_outliers(selected_data, variable)
# Perform sensitivity analysis
impact <- sensitivity_analysis(selected_data, cleaned_data, variable)
print(impact)
# Optionally visualize the distribution
p1 <- ggplot(selected_data, aes_string(x = variable)) +
geom_histogram(binwidth = 1, fill = "blue", alpha = 0.5, boundary = 0) +
labs(title = paste("Distribution of", variable, "with outliers"),
x = variable, y = "Count") +
theme_minimal()
p2 <- ggplot(cleaned_data, aes_string(x = variable)) +
geom_histogram(binwidth = 1, fill = "red", alpha = 0.5, boundary = 0) +
labs(title = paste("Distribution of", variable, "without outliers"),
x = variable, y = "Count") +
theme_minimal()
# Save plots
ggsave(paste0(variable, "_with_outliers.png"), plot = p1)
ggsave(paste0(variable, "_without_outliers.png"), plot = p2)
}
##
## Analyzing variable: violent
## Number of outliers identified: 26
## Metric Original Cleaned
## 1 Mean 503.0747 470.2028
## 2 Standard Deviation 334.2772 244.3601
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Saving 7 x 5 in image
## Saving 7 x 5 in image
##
## Analyzing variable: income
## Number of outliers identified: 25
## Metric Original Cleaned
## 1 Mean 13724.796 13557.084
## 2 Standard Deviation 2554.542 2307.462
## Saving 7 x 5 in image
## Saving 7 x 5 in image
# Select specific columns by name
guns_main <- guns[, c("law", "income", "male", "violent")]
guns_main_renamed <- guns_main %>%
rename(
"Shall Carry Law" = law,
"Income Per Capita" = income,
"Male Population % 10-29" = male,
"Violent Crime Rate" = violent
)
datasummary_skim(guns_main_renamed)
| Unique | Missing Pct. | Mean | SD | Min | Median | Max | Histogram | |
|---|---|---|---|---|---|---|---|---|
| Income Per Capita | 1170 | 0 | 13724.8 | 2554.5 | 8554.9 | 13401.5 | 23646.7 | |
| Male Population % 10-29 | 1173 | 0 | 16.1 | 1.7 | 12.2 | 15.9 | 22.4 | |
| Violent Crime Rate | 1101 | 0 | 503.1 | 334.3 | 47.0 | 443.0 | 2921.8 | |
| Shall Carry Law | N | % | ||||||
| no | 888 | 75.7 | ||||||
| yes | 285 | 24.3 |
print(datasummary_skim(guns_main_renamed))
##
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | | Unique | Missing Pct. | Mean | SD | Min | Median | Max | Histogram |
## +=========================+========+==============+=========+========+========+=========+=========+==================================================+
## | Income Per Capita | 1170 | 0 | 13724.8 | 2554.5 | 8554.9 | 13401.5 | 23646.7 |  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Male Population % 10-29 | 1173 | 0 | 16.1 | 1.7 | 12.2 | 15.9 | 22.4 |  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Violent Crime Rate | 1101 | 0 | 503.1 | 334.3 | 47.0 | 443.0 | 2921.8 |  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Shall Carry Law | N | % | | | | | | |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | no | 888 | 75.7 | | | | | | |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | yes | 285 | 24.3 | | | | | | |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
# Select specific columns by name
guns_main <- guns[, c("law", "income", "male", "violent")]
sum(is.na(guns)) # Total number of missing values
## [1] 0
colSums(is.na(guns)) # Number of missing values per column
## rownames year violent murder robbery prisoners afam
## 0 0 0 0 0 0 0
## cauc male population income density state law
## 0 0 0 0 0 0 0
selected_data <- guns[, c("violent", "income", "density")]
print(selected_data)
## # A tibble: 1,173 × 3
## violent income density
## <dbl> <dbl> <dbl>
## 1 414. 9563. 0.0746
## 2 419. 9932 0.0756
## 3 413. 9877. 0.0762
## 4 448. 9541. 0.0768
## 5 470. 9548. 0.0772
## 6 448. 9479. 0.0773
## 7 416 9783 0.0775
## 8 431. 10357. 0.0778
## 9 458. 10726. 0.0782
## 10 558 11092. 0.0786
## # ℹ 1,163 more rows
# Function to identify outliers using the IQR method
identify_outliers <- function(data, variable) {
Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
outliers <- data[data[[variable]] < lower_bound | data[[variable]] > upper_bound, ]
return(outliers)
}
# Function to remove outliers
remove_outliers <- function(data, variable) {
Q1 <- quantile(data[[variable]], 0.25, na.rm = TRUE)
Q3 <- quantile(data[[variable]], 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
clean_data <- data[data[[variable]] >= lower_bound & data[[variable]] <= upper_bound, ]
return(clean_data)
}
# Function to perform sensitivity analysis
sensitivity_analysis <- function(original_data, cleaned_data, variable) {
original_mean <- mean(original_data[[variable]], na.rm = TRUE)
cleaned_mean <- mean(cleaned_data[[variable]], na.rm = TRUE)
original_sd <- sd(original_data[[variable]], na.rm = TRUE)
cleaned_sd <- sd(cleaned_data[[variable]], na.rm = TRUE)
impact <- data.frame(
Metric = c("Mean", "Standard Deviation"),
Original = c(original_mean, original_sd),
Cleaned = c(cleaned_mean, cleaned_sd)
)
return(impact)
}
# Apply functions to each selected variable
for (variable in names(selected_data)) {
cat("\nAnalyzing variable:", variable, "\n")
# Identify outliers
outliers <- identify_outliers(selected_data, variable)
cat("Number of outliers identified:", nrow(outliers), "\n")
# Remove outliers
cleaned_data <- remove_outliers(selected_data, variable)
# Perform sensitivity analysis
impact <- sensitivity_analysis(selected_data, cleaned_data, variable)
print(impact)
# Optionally visualize the distribution
p1 <- ggplot(selected_data, aes_string(x = variable)) +
geom_histogram(binwidth = 1, fill = "blue", alpha = 0.5, boundary = 0) +
labs(title = paste("Distribution of", variable, "with outliers"),
x = variable, y = "Count") +
theme_minimal()
p2 <- ggplot(cleaned_data, aes_string(x = variable)) +
geom_histogram(binwidth = 1, fill = "red", alpha = 0.5, boundary = 0) +
labs(title = paste("Distribution of", variable, "without outliers"),
x = variable, y = "Count") +
theme_minimal()
# Save plots
ggsave(paste0(variable, "_with_outliers.png"), plot = p1)
ggsave(paste0(variable, "_without_outliers.png"), plot = p2)
}
##
## Analyzing variable: violent
## Number of outliers identified: 26
## Metric Original Cleaned
## 1 Mean 503.0747 470.2028
## 2 Standard Deviation 334.2772 244.3601
## Saving 7 x 5 in image
## Saving 7 x 5 in image
##
## Analyzing variable: income
## Number of outliers identified: 25
## Metric Original Cleaned
## 1 Mean 13724.796 13557.084
## 2 Standard Deviation 2554.542 2307.462
## Saving 7 x 5 in image
## Saving 7 x 5 in image
##
## Analyzing variable: density
## Number of outliers identified: 138
## Metric Original Cleaned
## 1 Mean 0.3520382 0.09607251
## 2 Standard Deviation 1.3554719 0.08987374
## Saving 7 x 5 in image
## Saving 7 x 5 in image
# Select specific columns by name
guns_main <- guns[, c("law", "income", "male", "violent")]
guns_main_renamed <- guns_main %>%
rename(
"Shall Carry Law" = law,
"Income Per Capita" = income,
"Male Population % 10-29" = male,
"Violent Crime Rate" = violent
)
datasummary_skim(guns_main_renamed)
| Unique | Missing Pct. | Mean | SD | Min | Median | Max | Histogram | |
|---|---|---|---|---|---|---|---|---|
| Income Per Capita | 1170 | 0 | 13724.8 | 2554.5 | 8554.9 | 13401.5 | 23646.7 | |
| Male Population % 10-29 | 1173 | 0 | 16.1 | 1.7 | 12.2 | 15.9 | 22.4 | |
| Violent Crime Rate | 1101 | 0 | 503.1 | 334.3 | 47.0 | 443.0 | 2921.8 | |
| Shall Carry Law | N | % | ||||||
| no | 888 | 75.7 | ||||||
| yes | 285 | 24.3 |
print(datasummary_skim(guns_main_renamed))
##
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | | Unique | Missing Pct. | Mean | SD | Min | Median | Max | Histogram |
## +=========================+========+==============+=========+========+========+=========+=========+==================================================+
## | Income Per Capita | 1170 | 0 | 13724.8 | 2554.5 | 8554.9 | 13401.5 | 23646.7 |  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Male Population % 10-29 | 1173 | 0 | 16.1 | 1.7 | 12.2 | 15.9 | 22.4 |  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Violent Crime Rate | 1101 | 0 | 503.1 | 334.3 | 47.0 | 443.0 | 2921.8 |  |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | Shall Carry Law | N | % | | | | | | |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | no | 888 | 75.7 | | | | | | |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
## | yes | 285 | 24.3 | | | | | | |
## +-------------------------+--------+--------------+---------+--------+--------+---------+---------+--------------------------------------------------+
interaction_plot <- ggplot(guns, aes(x = year, y = violent, color = as.factor(law), group = law)) +
#geom_point(alpha = 0.1) +
geom_smooth(method = "loess", se = TRUE) +
labs(title = "Figure 1: Interaction between Violent Crime Rate and Year by Shall Carry Law",
x = "Year",
y = "Violent Crime Rate per 100,000",
color = "State Shall Carry Law
Instituted?") +
theme_minimal() +
scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes"))
print(interaction_plot)
## `geom_smooth()` using formula = 'y ~ x'
interaction_plot <- ggplot(guns, aes(x = male, y = violent, color = as.factor(law), group = law)) +
#geom_point(alpha = 0.1) +
geom_smooth(method = "loess", se = TRUE) +
labs(title = "Figure 2: Interaction between Violent Crime Rate and Young Male Pop %
by Shall Carry Law",
x = "Young Male Population %",
y = "Violent Crime Rate per 100,000",
color = "State Shall Carry Law
Instituted?") +
theme_minimal() +
scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes"))
print(interaction_plot)
## `geom_smooth()` using formula = 'y ~ x'
interaction_plot <- ggplot(guns, aes(x = income, y = violent, color = as.factor(law), group = law)) +
#geom_point(alpha = 0.1) +
geom_smooth(method = "loess", se = TRUE) +
labs(title = "Figure 3: Interaction between Violent Crime Rate and Per-Capita Income
by Shall Carry Law",
x = "Per-Capita Income",
y = "Violent Crime Rate per 100,000",
color = "State Shall Carry Law
Instituted?") +
theme_minimal() +
scale_color_manual(values = c("blue", "red"), labels = c("No", "Yes"))
print(interaction_plot)
## `geom_smooth()` using formula = 'y ~ x'
#Looking into assumption violations in performance package
library(performance)
##
## Attaching package: 'performance'
## The following objects are masked from 'package:modelr':
##
## mae, mse, rmse
## The following object is masked from 'package:fixest':
##
## r2
# Fit the simple regression model
model1 <- lm(violent ~ law, data = guns)
# Fit the multiple predictor regression model
model2 <- lm(violent ~ law + income + male, data = guns)
# Check assumptions
check_model(model1) # Check linearity assumption
check_model(model2)
addressing heterodasticity
# Transform the dependent variable using log
model1_log <- lm(log(violent) ~ law, data = guns)
model2_log <- lm(log(violent) ~ law * income, data = guns)
model3_log <- lm(log(violent) ~ law * male, data = guns)
check_model(model1_log)
check_model(model2_log)
# Fit the simple regression model
#model1 <- lm(violent ~ law, data = guns)
# Fit the multiple predictor regression model
#model2 <- lm(violent ~ law + income + male, data = guns)
# Summarize the models
summary(model1_log)
##
## Call:
## lm(formula = log(violent) ~ law, data = guns)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.28477 -0.42748 0.04655 0.42172 1.84504
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.13492 0.02072 296.13 <2e-16 ***
## lawyes -0.44296 0.04203 -10.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6174 on 1171 degrees of freedom
## Multiple R-squared: 0.08664, Adjusted R-squared: 0.08586
## F-statistic: 111.1 on 1 and 1171 DF, p-value: < 2.2e-16
summary(model2_log)
##
## Call:
## lm(formula = log(violent) ~ law * income, data = guns)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1241 -0.3995 0.0673 0.4158 1.4994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.950e+00 9.751e-02 50.771 < 2e-16 ***
## lawyes -1.047e+00 2.750e-01 -3.809 0.000147 ***
## income 8.630e-05 6.966e-06 12.388 < 2e-16 ***
## lawyes:income 4.403e-05 1.983e-05 2.220 0.026609 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5704 on 1169 degrees of freedom
## Multiple R-squared: 0.2216, Adjusted R-squared: 0.2196
## F-statistic: 110.9 on 3 and 1169 DF, p-value: < 2.2e-16
summary(model3_log)
##
## Call:
## lm(formula = log(violent) ~ law * male, data = guns)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1833 -0.4253 0.0648 0.4438 1.7036
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.52921 0.19193 39.229 < 2e-16 ***
## lawyes -0.07786 0.41620 -0.187 0.852
## male -0.08526 0.01167 -7.305 5.13e-13 ***
## lawyes:male -0.03023 0.02681 -1.128 0.260
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5987 on 1169 degrees of freedom
## Multiple R-squared: 0.1426, Adjusted R-squared: 0.1404
## F-statistic: 64.8 on 3 and 1169 DF, p-value: < 2.2e-16
# Create detailed regression tables using sjPlot
tab_model(model1_log, model2_log, model3_log,
show.ci = FALSE, # Show confidence intervals
show.se = TRUE, # Show standard errors
show.stat = TRUE, # Show statistics (t-value)
show.p = TRUE, # Show p-values
title = "Table 2: Log-Transformed Regression Analysis of Violent Crime Rates",
dv.labels = c("Model 1", "Model 2", "Model 3"),
pred.labels = c("(Intercept)",
"Shall Carry States",
"Income Per Capita",
"Income Per Capita in
Shall Carry States",
"Male Population % 10-29",
"Male Population % 10-29
in Shall Carry States"),
string.ci = "Conf. Int.",
string.se = "Std. Error",
string.stat = "t value",
string.p = "p value")
| Model 1 | Model 2 | Model 3 | ||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | Std. Error | t value | p value | Estimates | Std. Error | t value | p value | Estimates | Std. Error | t value | p value |
| (Intercept) | 6.13 | 0.02 | 296.13 | <0.001 | 4.95 | 0.10 | 50.77 | <0.001 | 7.53 | 0.19 | 39.23 | <0.001 |
| Shall Carry States | -0.44 | 0.04 | -10.54 | <0.001 | -1.05 | 0.27 | -3.81 | <0.001 | -0.08 | 0.42 | -0.19 | 0.852 |
| Income Per Capita | 0.00 | 0.00 | 12.39 | <0.001 | ||||||||
| Income Per Capita in Shall Carry States | 0.00 | 0.00 | 2.22 | 0.027 | ||||||||
| Male Population % 10-29 | -0.09 | 0.01 | -7.30 | <0.001 | ||||||||
| Male Population % 10-29 in Shall Carry States | -0.03 | 0.03 | -1.13 | 0.260 | ||||||||
| Observations | 1173 | 1173 | 1173 | |||||||||
| R2 / R2 adjusted | 0.087 / 0.086 | 0.222 / 0.220 | 0.143 / 0.140 | |||||||||
#Appendix ChatGPT4 was used as a learning aid in this assingment to troubleshoot error messages and give tips on code.