Loading the dataset and the other packages needed for the exploration.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(datasets)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.4.4 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(corrplot)
## corrplot 0.92 loaded
library(ggplot2)
library(maps)
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
Exploring the correlation between population and total murders, which appear to be strongly correlated.
# Read the state-level murders table (state, abb, region, population, total).
murders <- readr::read_csv(file = "~/murders.csv")
## Rows: 51 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): state, abb, region
## dbl (2): population, total
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Pearson correlation between state population and total murders.
with(murders, cor(population, total))
## [1] 0.9635956
# Plot total murders for each state, grouped by region, to see how the
# totals are distributed across regions.
# The x aesthetic is `total`, so the x-axis is labelled "Total Murders"
# (it was previously mislabelled "Region", duplicating the y-axis label).
ggplot(data = murders, aes(x = total, y = region, color = factor(region))) +
  geom_point() +
  labs(title = "Total Murders", x = "Total Murders", y = "Region") +
  scale_color_manual(values = c("red", "blue", "green", "purple"))
From the visualization we can see that the West region has a higher number
of murders. Next, I want to understand the distribution among states.
# Number of states to show in the bar chart.
N <- 10
# Keep the N states with the highest murder totals.
# head(x, N) returns the first N rows; a negative count (head(x, -N)) would
# instead drop the LAST N rows and keep every other state, contradicting the
# "Top N" title below.
top_states <- head(arrange(murders, desc(total)), N)
# Horizontal bar chart of the murder totals for the selected states.
ggplot(top_states, aes(x = reorder(state, -total), y = total)) +
  geom_col(fill = "skyblue") +
  coord_flip() +
  labs(
    title = paste("Top", N, "Murder Totals by State (2014)"),
    x = "State",
    y = "Murder Total"
  )
Population clearly has an impact on the number of murders. To better
understand the character of each data point among the states, I want to
compute the murder rate for each individual state per 100,000 residents
and plot it by region to visualize it.
# Murders per 100,000 residents for every state.
murder_rate <- with(murders, total / population * 100000)
# Store the rate alongside the original columns (column name: murder_rate).
murders <- murders %>% mutate(murder_rate)
# One point per state: murder rate on x, region on y, coloured by region.
ggplot(murders, aes(x = murder_rate, y = region, color = factor(region))) +
  geom_point() +
  labs(title = "Total Murders Rate", y = "Region", x = "Murder Rate") +
  scale_color_manual(values = c("red", "blue", "green", "purple"))
The southern region seems to have more murders per 100,000 than the
other regions.
I want to understand more about the distribution within each region, so I want to calculate the z-score.
# Standardise the murder rate: subtract the mean, divide by the sample
# standard deviation.
mean_murder_rate <- mean(murder_rate)
sd_murder_rate <- sd(murder_rate)
z_scores <- (murder_rate - mean_murder_rate) / sd_murder_rate
print(z_scores)
## [1] 0.01844305 -0.04231860 0.34623812 0.16703781 0.24225740 -0.60529343
## [7] -0.02652691 0.59150707 5.56716958 0.25200063 0.41170537 -0.92199686
## [13] -0.81983639 0.02354746 -0.23983062 -0.85084539 -0.23248671 -0.04312679
## [19] 2.02085353 -0.79435800 0.93470252 -0.39776029 0.56980030 -0.72466598
## [25] 0.51502367 1.05074992 -0.63770849 -0.41813467 0.13490835 -0.97687549
## [31] 0.00769770 0.19323111 -0.04526064 0.08965294 -0.88937503 -0.03745866
## [37] 0.07320842 -0.74892202 0.33330063 -0.51261054 0.69060107 -0.73145567
## [43] 0.27352517 0.17191143 -0.80743030 -1.00137859 0.14065877 -0.56842993
## [49] -0.53825749 -0.43706231 -0.77032620
To better visualize the data, I'll make a plot of the z-scores.
# Attach the z-scores to the data so each score stays aligned with its state.
murders <- mutate(murders, z_scores)
# Colour every point by the state's actual region. The groups were previously
# drawn at random with sample(), so the colours and the "Cluster" legend
# carried no information about the data.
region_factor <- factor(murders$region)
num_clusters <- nlevels(region_factor)
colors <- rainbow(num_clusters)
clusters <- as.integer(region_factor)
# The x-axis is simply the row position of each state in `murders`, so it is
# labelled as such rather than "Regions".
plot(z_scores,
  col = colors[clusters], pch = 16,
  xlab = "State (row index)", ylab = "Z-Scores",
  main = "Z-Scores for Murders Among Regions"
)
legend("topright",
  legend = levels(region_factor), fill = colors, title = "Region"
)
I also want to plot the density of the z-scores.
# Kernel density estimate of the standardised murder rates, then plot it.
density_z <- stats::density(x = z_scores)
plot(x = density_z, main = "density plot of z-score")
There are regions which have a relatively lower murder rate than the others.
Lastly, I want to build a model to predict murder_rate based on population.
# Simple linear regression of the per-100,000 murder rate on population.
model <- lm(formula = murder_rate ~ population, data = murders)
# Show coefficients, residual summary and fit statistics.
print(summary(model))
##
## Call:
## lm(formula = murder_rate ~ population, data = murders)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2762 -1.4347 -0.2191 0.5561 13.8577
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.575e+00 4.640e-01 5.549 1.15e-06 ***
## population 3.363e-08 5.092e-08 0.661 0.512
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.47 on 49 degrees of freedom
## Multiple R-squared: 0.008825, Adjusted R-squared: -0.0114
## F-statistic: 0.4363 on 1 and 49 DF, p-value: 0.512
And a plot to visualize the model:
# Scatter plot of murder rate against population, with the fitted
# regression line from `model` overlaid in red.
population <- murders[["population"]]
plot(
  x = population, y = murder_rate,
  xlab = "population", ylab = "murder rate",
  main = "Linear regression: murder rate vs population"
)
abline(model, col = "red")
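Conclusion: the total number of murders is strongly correlated with the number of people living in an area (r ≈ 0.96), but the murder rate per 100,000 is not meaningfully explained by population in this model (slope p-value = 0.512, R-squared ≈ 0.009).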