Loading the dataset and the packages needed for the exploration.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(datasets)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.4.4     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)
library(maps)
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map

Exploring the correlation between population and total murders, which appear to be strongly correlated.

# Read the murders data set from the user's home directory into a tibble.
murders <- read_csv(file = "~/murders.csv")
## Rows: 51 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): state, abb, region
## dbl (2): population, total
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Pearson correlation between state population and total murders.
with(murders, cor(population, total))
## [1] 0.9635956

# Plotting the distribution of total murders by region to see the relationship

# Total murders per row of `murders`, one strip of points per region,
# colored by region.
ggplot(data = murders, aes(x = total, y = region, color = factor(region))) +
  geom_point() +
  # The x aesthetic is `total`, so the axis is labelled as the murder count
  # (it was previously mislabelled "Region").
  labs(title = "Total Murders", x = "Total Murders", y = "Region") +
  scale_color_manual(values = c("red", "blue", "green", "purple"))

From the visualization we can see that the West region has the highest number of murders. Next, I want to understand the distribution among states.

# Number of states to show in the "top N" bar chart.
N <- 10

# Keep the N rows with the largest murder totals. A positive `n` is required
# here: head(x, -N) would instead drop the LAST N rows and keep everything
# else, which is not a "top N" selection.
top_states <- head(arrange(murders, desc(total)), N)

# Horizontal bar chart of the murder totals for the selected states.
ggplot(top_states, aes(x = reorder(state, -total), y = total)) +
  # geom_col() draws bars whose heights are the y values themselves
  # (equivalent to geom_bar(stat = "identity")).
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(
    title = paste("Top", N, "Murder Totals by State (2014)"),
    x = "State",
    y = "Murder Total"
  )

Population clearly has an impact on the number of murders. To better understand the character of each state's data point, I want to compute the murder rate of each individual state per 100,000 residents and plot the clusters to visualize it.

# Murders per 100,000 residents, one value per row of `murders`.
murder_rate <- with(murders, (total / population) * 100000)

# Attach the rate to the data frame as a column of the same name.
murders <- mutate(murders, murder_rate = murder_rate)

# Murder rate per row, one strip of points per region, colored by region.
ggplot(murders, aes(x = murder_rate, y = region, color = factor(region))) +
  geom_point() +
  labs(title = "Total Murders Rate", x = "Murder Rate", y = "Region") +
  scale_color_manual(values = c("red", "blue", "green", "purple"))

The Southern region seems to have more murders per 100,000 than the other regions.

I want to understand more about the distribution within each region, so I want to calculate the z-score.

# Sample mean and standard deviation of the murder rate.
mean_murder_rate <- mean(murder_rate)
sd_murder_rate <- sd(murder_rate)

# Standardize: distance of each rate from the mean, in standard deviations.
centered_rate <- murder_rate - mean_murder_rate
z_scores <- centered_rate / sd_murder_rate
print(z_scores)
##  [1]  0.01844305 -0.04231860  0.34623812  0.16703781  0.24225740 -0.60529343
##  [7] -0.02652691  0.59150707  5.56716958  0.25200063  0.41170537 -0.92199686
## [13] -0.81983639  0.02354746 -0.23983062 -0.85084539 -0.23248671 -0.04312679
## [19]  2.02085353 -0.79435800  0.93470252 -0.39776029  0.56980030 -0.72466598
## [25]  0.51502367  1.05074992 -0.63770849 -0.41813467  0.13490835 -0.97687549
## [31]  0.00769770  0.19323111 -0.04526064  0.08965294 -0.88937503 -0.03745866
## [37]  0.07320842 -0.74892202  0.33330063 -0.51261054  0.69060107 -0.73145567
## [43]  0.27352517  0.17191143 -0.80743030 -1.00137859  0.14065877 -0.56842993
## [49] -0.53825749 -0.43706231 -0.77032620

To better visualize the data, I'll make a plot of the z-scores.

# Store the z-scores alongside the rest of the data.
murders <- mutate(murders, z_scores)

# Color each point by the region recorded for its row. The previous version
# drew group labels with sample(), so the colors were random and carried no
# information about the data.
region_f <- factor(murders$region)
colors <- rainbow(nlevels(region_f))

# With a single numeric vector, plot() puts the element index on the x-axis,
# so the axis is labelled as a row index rather than "Regions".
plot(
  z_scores,
  col = colors[as.integer(region_f)],
  pch = 16,
  xlab = "State (row index)",
  ylab = "Z-Scores",
  main = "Z-Scores for Murders Among Regions"
)

# One legend entry per region, in the same order as the color palette.
legend("topright", legend = levels(region_f), fill = colors, title = "Region")

I also want to plot the density of the z-scores.

# Kernel density estimate of the z-scores, drawn with a custom title.
density_z <- density(z_scores)
plot(
  density_z,
  main = "density plot of z-score"
)

There are regions which have a relatively lower murder rate than the others.

Lastly, I want to build a model to predict murder_rate based on population.

# Simple linear regression of murder rate on population.
model <- lm(formula = murder_rate ~ population, data = murders)

# Coefficients, residual summary and fit statistics.
summary(model)
## 
## Call:
## lm(formula = murder_rate ~ population, data = murders)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2762 -1.4347 -0.2191  0.5561 13.8577 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.575e+00  4.640e-01   5.549 1.15e-06 ***
## population  3.363e-08  5.092e-08   0.661    0.512    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.47 on 49 degrees of freedom
## Multiple R-squared:  0.008825,   Adjusted R-squared:  -0.0114 
## F-statistic: 0.4363 on 1 and 49 DF,  p-value: 0.512

And a plot to visualize the model.

# Pull the population column out as a plain vector for base plotting.
population <- murders[["population"]]

# Scatter of murder rate against population.
plot(
  x = population,
  y = murder_rate,
  xlab = "population",
  ylab = "murder rate",
  main = "Linear regression: murder rate vs population"
)

# Overlay the fitted regression line from `model`.
abline(model, col = "red")

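Conclusion: the total number of murders is strongly correlated with the number of people living in an area (r ≈ 0.96). However, the murder rate per 100,000 residents is not significantly explained by population in this model (p = 0.512, R² ≈ 0.009), so population alone is a poor predictor of the murder rate.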