## LIBRARIES
library(smacof)
library(tidyverse)
library(plotly)
library(ggplot2)
library(ggrepel)
library(patchwork)
library(stringi)
## FUNCTIONS
# Negative analogue of %in%
'%!in%' = function(x,y)!('%in%'(x,y))
## DATA IMPORT
# Crimes by severity degree
minor_severity =
read.csv("minor_severity.csv", header = F) %>% rename(minor_severity = V3)
moderate_severity =
read.csv("moderate_severity.csv", header = F) %>% rename(moderate_severity = V3)
serious_severity =
read.csv("serious_severity.csv", header = F) %>% rename(serious_severity = V3)
major_severity =
read.csv("major_severity.csv", header = F) %>% rename(major_severity = V3)
# Crimes by category
extremism =
read.csv("extremism.csv", header = F) %>% rename(extremism = V3)
economical = read.csv("economical.csv", header = F) %>% rename(economical = V3)
drugs = read.csv("drugs.csv", header = F) %>% rename(drugs = V3)
arms = read.csv("arms.csv", header = F) %>% rename(arms = V3)
terrorism = read.csv("terrorism.csv", header = F) %>% rename(terrorism = V3)
against_foreigns =
read.csv("against_foreigns.csv", header = F) %>% rename(against_foreigns = V3)
# Control variables
unsolved =
read.csv2("unsolved.csv", header = F) %>% rename(unsolved = V3) %>% mutate(unsolved = as.numeric(unsolved))
median_income = read.csv("median_income.csv", header = F) %>% rename(median_income = V3)
gini = read.csv("gini.csv", header = F) %>% rename(gini = V3)
poor = read.csv("poor.csv", header = F) %>% rename(poor = V3)
russians = read.csv("russians.csv", header = F) %>% rename(russians = V3) %>% select(-V1)
## DATA MERGE
combined_regions_data = minor_severity %>%
inner_join(moderate_severity, by = c("V1", "V2")) %>%
inner_join(serious_severity, by = c("V1", "V2")) %>%
inner_join(major_severity, by = c("V1", "V2")) %>%
inner_join(extremism, by = c("V1", "V2")) %>%
inner_join(economical, by = c("V1", "V2")) %>%
inner_join(drugs, by = c("V1", "V2")) %>%
inner_join(arms, by = c("V1", "V2")) %>%
inner_join(terrorism, by = c("V1", "V2")) %>%
inner_join(against_foreigns, by = c("V1", "V2")) %>%
inner_join(median_income, by = c("V1", "V2")) %>%
inner_join(unsolved, by = c("V1", "V2")) %>%
inner_join(gini, by = c("V1", "V2")) %>%
inner_join(poor, by = c("V1", "V2")) %>%
inner_join(russians, by = c("V2")) %>%
mutate(total = minor_severity + moderate_severity + serious_severity + major_severity)
## AVERAGE SEVERITY
combined_regions_data = combined_regions_data %>%
mutate(severity_avg =
1*(minor_severity / total) +
2*(moderate_severity / total) +
3*(serious_severity / total) +
4*(major_severity / total))
## DATA PREPARING FOR UNFOLDING
combined_regions_data = combined_regions_data %>%
rename(year = V1, region = V2) %>%
filter(year == 2017) %>%
select(-year)
## MISSING VALUES CHECKING
sum(is.na(combined_regions_data))
# Prints 1
missings_values = which(is.na(combined_regions_data), arr.ind = T)
print(paste0(
combined_regions_data$region[missings_values[1]], ", ", colnames(combined_regions_data)[missings_values[2]]))
# Prints "Алтайский край, poor"
## TRANSLITERATION OF REGION NAMES
combined_regions_data$region = stri_trans_general(combined_regions_data$region, "russian-latin/bgn")
## DELETING UNNECESSARY VARIABLES
combined_regions_data = combined_regions_data %>%
select(-total)
What has been done:
poor variable for Altai region, it is deleted in the
relevant part of analysis.
What determines the composition of crime in a region? My sociological gut feeling provides three features.
The first feature is cultural composition of region. Culture determines cultural apparatus. Cultural apparatus (Mills, 1963), in turn, determine the notion of action, risks and benefits attached to it. So, people from regions that differ in terms of cultural composition can perceive theft and outflowing risk of going to jail in distinct ways. For example, in case of one region, it can be perceived as something vicious and unholy — people will abandon you if they find out that you stole something. In case of another, such a behavior can be perceived oppositely, as a demonstration of positive dexterity and courage — in this case, it can raise your status.
As for the second feature, I denote it using umbrella term of socio-economic conditions. Here, logic is simple as one two three. To commit a crime, the need for benefits should outweigh the risks and there should be suitable object for crime. The first component, the need can be observed in degree of region insecurity, i.e. the share of people who cannot meet their basic needs. Indeed, individual who strive from hunger have more incentive to steal a loaf of bread compared to the person who can buy that loaf without a second thought. As for the risks part, I suppose, that it (at least partly) can be treated as function of region richness: more money, more possibilities to enforce the law (of course, there is factor of corruption, but let’s skip it for now to not overcomplicate the analysis). Last component is the presence of object for crime. I suppose, when everyone is more or less poor as you, you possibilities are not really high compared to the case when there is a noticeable share of those who is way richer than you. In such a vein, this component can be represented by extent of region inequality.
The last feature is the interests of law-enforcement representatives. For example, one of such interest is career advancement. Such advancement, probably, can be facilitated by a) high clearance rate, and b) high weight (= severity) of solved crimes. Because of it, the real severity of crime, and general fact of crime presence may fade into the background.
In my personal experience, Russia exhibits significant variability in all these factors. Because of it, specific compositions of crimes in this country are expected.
But why do we need to look at such crimes compositions? Many reasons. First of all economic downfall is coming — so, it’s better to be prepared and understand which regions provides the greatest opportunities to steal a bread with the smallest risk of getting in jail. Seriously talking, understanding the crimes compositions in the context of regional specificity can be helpful for developing approaches and policies of region development: it will help to handle the root cause, not only the outcome. Also, understanding which regions are similar in terms of crimes composition provides opportunities for testing different approaches and policies simultaneously (almost an experiment, except for the fact that we still can’t control a lot of things).
It’s important to note that crime composition of each region implies 6 data variables representing different categories of crimes. How can we a) see if regions are similar / dissimilar in terms of such compositions, and b) see by what categories regions are similar / dissimilar (that’s why not regular MDS). This is the task that can be accomplished via Unfolding as it is designed exactly for visualizing the patterns and relationships within multi-dimensional data, i.e. exactly the one we have.
In previous part I denoted subject of analysis and theoretical framework (sort of). What will be done next:
To get all neccesary datasets I use unnamed aggregator of official Russian statistical reports (available via: https://russia.duck.consulting/). I use it instead of official resources due its very convenient possibility to download all the data I need. Nevertheless, official resources (like this: http://crimestat.ru/) are still used for data verification. Everything was correct at the moment of work completion.
In compilated dataset there are 11 variables. All of them represent
situation as far 2017 (except for the russians
variable).
First of all, we have variable with the name of region:
region: region name;Next, we have the number of crimes (per 10 thousand people) for 6 categories:
extremism: crimes of a extremist
nature;terrorism: crimes of a terrorist
nature;drugs: crimes related to drug
trafficking;arms: crimes related to illegal arms
trafficking;against_foreigns: crimes against
foreigners and stateless persons;economical: crimes of an economic
nature;We also have two more variables about crimes:
severity: average severity of crimes
in the region. Calculated via the next formula: 1*share of minor
severity crimes + 2*share of moderate severity crimes + 3*share of
serious severity crimes + 4*share of major severity crimes;unsolved: the share of crimes that was
not solved.Some variable to represent socio-economic situation in region:
median_income: median income per
capita;gini: Gini coefficient (used to look
at inequality situation);poor: share of citizens below the
value of the regional subsistence minimum (from the total number of
region residents);And, finally, one variable representing ethnic composition:
russians: share of Russian citizens
(from the total number of region residents; data from 2010 state
census).
Unfolding doesn’t imply serious requirements towards data analysed, only non-negativity and values comparability. Nevertheless, it will be useful to look at the data. First of all, it can help us to detect anomalies and inconsistency with data description. Secondly, it can prepare our expectations towards results of further analysis by providing the look at the values spread, distribution and presence of outliers.
# Looking at structure of our dataset
str(combined_regions_data)
## 'data.frame': 85 obs. of 17 variables:
## $ region : chr "Adygeya" "Altayskiy kray" "Amurskaya oblastʹ" "Arkhangelʹskaya oblastʹ" ...
## $ minor_severity : num 36.4 82.1 88.6 80.8 65.9 ...
## $ moderate_severity: num 34.9 59.8 93.4 58.8 43.2 ...
## $ serious_severity : num 14.4 20.8 42.3 24.5 16.2 ...
## $ major_severity : num 2.96 6.51 6.51 6.96 6.47 7.17 3.01 4.95 5.4 5.49 ...
## $ extremism : num 0.07 0.19 0.02 0.23 0.04 0.04 0.05 0.14 0.03 0.09 ...
## $ economical : num 6.51 4.11 6.22 8.84 9.06 5.44 5.04 5.01 8.52 6.63 ...
## $ drugs : num 5.98 14.3 24.78 7.88 16.02 ...
## $ arms : num 1.26 2.4 5 1.6 0.96 1.34 1.37 2.35 3.32 1.19 ...
## $ terrorism : num 0.4 0 0.02 0.03 0.27 0.11 0.01 0.01 0 0.07 ...
## $ against_foreigns : num 0.71 0.52 1.17 0.35 0.65 0.46 0.66 0.61 1.1 0.96 ...
## $ median_income : num 20280 17415 22619 24395 18355 ...
## $ unsolved : num 32.1 34.9 38.4 39.3 29.2 ...
## $ gini : num 0.389 0.376 0.397 0.383 0.373 0.417 0.396 0.384 0.396 0.358 ...
## $ poor : num 13.1 NA 14.9 14.1 17.3 12.3 7.9 13.3 18.5 13.6 ...
## $ russians : num 63.6 93.9 94.3 95.6 67.6 ...
## $ severity_avg : num 1.82 1.71 1.86 1.75 1.72 ...
# Looking at summary of our dataset
summary(combined_regions_data[, 2:length(colnames(combined_regions_data))])
## minor_severity moderate_severity serious_severity major_severity
## Min. : 9.74 Min. : 6.91 Min. : 4.93 Min. : 2.220
## 1st Qu.: 50.82 1st Qu.: 36.06 1st Qu.:17.30 1st Qu.: 5.000
## Median : 65.77 Median : 44.77 Median :21.12 Median : 6.960
## Mean : 67.43 Mean : 50.15 Mean :21.79 Mean : 7.081
## 3rd Qu.: 85.80 3rd Qu.: 60.33 3rd Qu.:24.30 3rd Qu.: 8.800
## Max. :135.87 Max. :157.28 Max. :61.84 Max. :16.200
##
## extremism economical drugs arms
## Min. :0.0000 Min. : 3.860 Min. : 4.20 Min. :0.390
## 1st Qu.:0.0500 1st Qu.: 5.670 1st Qu.: 9.16 1st Qu.:1.390
## Median :0.1000 Median : 6.570 Median :11.62 Median :1.900
## Mean :0.1232 Mean : 6.899 Mean :13.17 Mean :2.345
## 3rd Qu.:0.1700 3rd Qu.: 8.010 3rd Qu.:15.90 3rd Qu.:2.560
## Max. :0.5500 Max. :15.320 Max. :33.80 Max. :8.630
##
## terrorism against_foreigns median_income unsolved
## Min. :0.000 Min. :0.0300 Min. :12233 Min. :11.99
## 1st Qu.:0.020 1st Qu.:0.4700 1st Qu.:18107 1st Qu.:35.21
## Median :0.050 Median :0.6700 Median :20444 Median :40.32
## Mean :0.154 Mean :0.8413 Mean :22862 Mean :39.26
## 3rd Qu.:0.090 3rd Qu.:1.0900 3rd Qu.:23209 3rd Qu.:43.57
## Max. :1.810 Max. :2.4500 Max. :55579 Max. :67.87
##
## gini poor russians severity_avg
## Min. :0.3350 Min. : 7.50 Min. : 0.7849 Min. :1.591
## 1st Qu.:0.3610 1st Qu.:12.12 1st Qu.:66.0504 1st Qu.:1.721
## Median :0.3730 Median :14.25 Median :88.2531 Median :1.793
## Mean :0.3762 Mean :15.32 Mean :76.4345 Mean :1.794
## 3rd Qu.:0.3900 3rd Qu.:17.52 3rd Qu.:93.6987 3rd Qu.:1.851
## Max. :0.4230 Max. :41.50 Max. :97.2660 Max. :2.214
## NA's :1
# Choosing necessary variables
regions_data_num = combined_regions_data %>%
select(-region)
# Building histograms
ggplot(
gather(regions_data_num), aes(value)) +
geom_histogram(bins = 25) +
facet_wrap(~key, scales = 'free_x') +
ylab("Number of regions") +
xlab("Variable value") +
ggtitle("Histograms for numeric variables") +
theme(
plot.title = element_text(margin = margin(b = 20, unit = "pt"), hjust = 0.5, face = "bold"),
axis.title.y = element_text(margin = margin(r = 10, unit = "pt")),
axis.title.x = element_text(margin = margin(t = 10, unit = "pt")))
# Building density plots
ggplot(
gather(regions_data_num), aes(value)) +
geom_density() +
facet_wrap(~key, scales = 'free') +
ylab("Density of regions") +
xlab("Variable value") +
ggtitle("Density plots for numeric variables") +
theme(
plot.title = element_text(margin = margin(b = 20, unit = "pt"), hjust = 0.5, face = "bold"),
axis.title.y = element_text(margin = margin(r = 10, unit = "pt")),
axis.title.x = element_text(margin = margin(t = 10, unit = "pt")))
# Creating ranks
combined_regions_data[, 6:11] =
apply(combined_regions_data[, 6:11], 2, function(x) round(length(x) - rank(x) + 1, 0))
terrorism and
russians: in these cases the highest share of values are
around the extreme (min or max). It can mean that these variables are
not varied enough to catch patterns with their involvement.
First of all, I build very general Unfolding:
# Data preparation for Unfolding
rownames(combined_regions_data) = combined_regions_data$region
regions_data_work = combined_regions_data
crimes_unf = unfolding(regions_data_work[6:11])
conf_items = as.data.frame(crimes_unf$conf.col)
conf_persons = as.data.frame(crimes_unf$conf.row)
conf_persons$region = regions_data_work$region
ggplot(conf_persons, aes(x = D1, y = D2)) +
geom_point(size = 2.5, alpha = 0.8, aes(text = region)) +
coord_fixed(xlim = c(-1.4, 1.4), ylim = c(-1.4, 1.4)) +
scale_size(range = c(0, 1)) +
labs(
title = "Crime in Russian Regions (Original)",
x = "Dimension 1",
y = "Dimension 2") +
geom_point(shape = 15, size = 2, aes(x = D1, y = D2), color = "purple", conf_items) +
geom_label_repel(
aes(x = D1, y = D2, label = rownames(conf_items)),
conf_items, alpha = 0.8, label.size = 0.2, box.padding = 0.45) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
crimes_unf
##
## Call: unfolding(delta = regions_data_work[6:11])
##
## Model: Rectangular smacof
## Number of subjects: 85
## Number of objects: 6
## Transformation: none
## Conditionality: matrix
##
## Stress-1 value: 0.325963
## Penalized Stress: 8.110287
## Number of iterations: 112
Visualization is already revealing some groupings and associations between categories. But Unfolding fit is not the best: stress value is ~0.326 (poor). It also will be beneficial to look at permutation test (with 100 iterations):
permtest(crimes_unf, nrep = 100, method.dat = "full", verbose = F)
##
## Call: permtest.smacofR(object = crimes_unf, method.dat = "full", nrep = 100,
## verbose = F)
##
## SMACOF Permutation Test
## Number of objects: 85
## Number of replications (permutations): 100
##
## Observed stress value: 0.326
## p-value: <0.001
P < 0.001, it allow us to reject H0. It means that observed configurations is not random, but organised in accordance with certain structure. Good.
We see that MDS has sense (according to permutation test), but fit is not really good (poor stress). What we gonna do now? 2 things:
crimes_unf_ordinal = unfolding(regions_data_work[6:11], type = "ordinal")
conf_items = as.data.frame(crimes_unf_ordinal$conf.col)
conf_persons = as.data.frame(crimes_unf_ordinal$conf.row)
conf_persons$region = regions_data_work$region
ggplot(conf_persons, aes(x = D1, y = D2)) +
geom_point(size = 2.5, alpha = 0.8, aes(text = region)) +
coord_fixed(xlim = c(-1.4, 1.4), ylim = c(-1.4, 1.4)) +
scale_size(range = c(0, 1)) +
labs(
title = "Crime in Russian Regions (Ordinal)",
x = "Dimension 1",
y = "Dimension 2") +
geom_point(shape = 15, size = 2, aes(x = D1, y = D2), color = "purple", conf_items) +
geom_label_repel(
aes(x = D1, y = D2, label = rownames(conf_items)),
conf_items, alpha = 0.8, label.size = 0.2, box.padding = 0.45) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
crimes_unf_ordinal
##
## Call: unfolding(delta = regions_data_work[6:11], type = "ordinal")
##
## Model: Rectangular smacof
## Number of subjects: 85
## Number of objects: 6
## Transformation: ordinalp
## Conditionality: matrix
##
## Stress-1 value: 0.330856
## Penalized Stress: 8.040999
## Number of iterations: 111
Stress become slightly worse (0.331 now vs 0.326 initially), but penalized stress — slightly better (8.04 now vs 8.11 initially). Anyway I will not go further with this type as its grouping are a bit less distinguishable compared to the initial option.
crimes_unf_mspline = unfolding(regions_data_work[6:11], type = "mspline")
conf_items = as.data.frame(crimes_unf_mspline$conf.col)
conf_persons = as.data.frame(crimes_unf_mspline$conf.row)
conf_persons$region = regions_data_work$region
ggplot(conf_persons, aes(x = D1, y = D2)) +
geom_point(size = 2.5, alpha = 0.8, aes(text = region)) +
coord_fixed(xlim = c(-1.4, 1.4), ylim = c(-1.4, 1.4)) +
scale_size(range = c(0, 1)) +
labs(
title = "Crime in Russian Regions (Mspline)",
x = "Dimension 1",
y = "Dimension 2") +
geom_point(shape = 15, size = 2, aes(x = D1, y = D2), color = "purple", conf_items) +
geom_label_repel(
aes(x = D1, y = D2, label = rownames(conf_items)),
conf_items, alpha = 0.8, label.size = 0.2, box.padding = 0.45) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
crimes_unf_mspline
##
## Call: unfolding(delta = regions_data_work[6:11], type = "mspline")
##
## Model: Rectangular smacof
## Number of subjects: 85
## Number of objects: 6
## Transformation: mspline
## Conditionality: matrix
##
## Stress-1 value: 0.332447
## Penalized Stress: 8.034291
## Number of iterations: 111
Similar situation to the ordinal type: stress become slightly worse (0.332 now vs 0.326 initially), penalized stress — slightly better (8.03 now vs 8.11 initially). And, again, I will not go further with this options as its grouping are less distinguishable compared to the initial option.
The initial type of MDS (ratio, I guess) is not differ noticeably compared to the 2 other types (ordinal and mspline) in terms of fit, but its groupings are more easily readable — so, I will continue my journey with it.
To handle outliers, we firstly need to detect them. Let’s try with the plots:
plot(crimes_unf, plot.type = "stressplot")
Oh, that’s option not for us… Let’s build regular table instead with the top-15 regions by stress:
stress_per_point = as.data.frame(crimes_unf$spp.row)
colnames(stress_per_point) = "stress"
stress_per_point$region = rownames(stress_per_point)
stress_per_point = stress_per_point %>%
select(region, stress) %>%
arrange(desc(as.numeric(stress)))
rownames(stress_per_point) = NULL
stress_per_point[1:15, ]
## region stress
## 1 Chukot·skiy avtonomnyy okrug 4.166387
## 2 Ingushetiya 3.603652
## 3 Yakutiya 3.278064
## 4 Novgorodskaya oblastʹ 2.868489
## 5 Astrakhanskaya oblastʹ 2.645768
## 6 Sverdlovskaya oblastʹ 2.444688
## 7 Yevreyskaya avtonomnaya oblastʹ 2.361204
## 8 Omskaya oblastʹ 2.217478
## 9 Respublika Tyva 2.098576
## 10 Chechnya 2.091680
## 11 Yaroslavskaya oblastʹ 2.084594
## 12 Murmanskaya oblastʹ 1.943692
## 13 Nenetskiy avtonomnyy okrug 1.912272
## 14 Chuvashiya 1.896252
## 15 Severnaya Osetiya 1.864978
It is noticeable that changes from point to point become less after 5 observation. So, I will delete 5 observations with the highest stress value. After it I will rerun unfolding and compare its result with the initial option:
regions_data_out = regions_data_work %>%
filter(region %!in%
c("Yakutiya", "Ingushetiya", "Chukot·skiy avtonomnyy okrug",
"Novgorodskaya oblastʹ", "Astrakhanskaya oblastʹ"))
crimes_unf_out = unfolding(regions_data_out[6:11])
conf_items = as.data.frame(crimes_unf_out$conf.col)
conf_persons = as.data.frame(crimes_unf_out$conf.row)
conf_persons$region = regions_data_out$region
ggplot(conf_persons, aes(x = D1, y = D2)) +
geom_point(size = 2.5, alpha = 0.8, aes(text = region)) +
coord_fixed(xlim = c(-1.4, 1.4), ylim = c(-1.4, 1.4)) +
scale_size(range = c(0, 1)) +
labs(
title = "Crime in Russian Regions (Original)",
x = "Dimension 1",
y = "Dimension 2") +
geom_point(shape = 15, size = 2, aes(x = D1, y = D2), color = "purple", conf_items) +
geom_label_repel(
aes(x = D1, y = D2, label = rownames(conf_items)),
conf_items, alpha = 0.8, label.size = 0.2, box.padding = 0.45) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
crimes_unf_out
##
## Call: unfolding(delta = regions_data_out[6:11])
##
## Model: Rectangular smacof
## Number of subjects: 80
## Number of objects: 6
## Transformation: none
## Conditionality: matrix
##
## Stress-1 value: 0.309035
## Penalized Stress: 7.989472
## Number of iterations: 113
Both stress and penalized stress become smaller, but not really noticeably: 0.309 now vs 0.326 initially and 7.99 now vs 8.11 initially. At the same time groupings become way more blurry — so, again, initial option wins.
In this part I will try to explain groupings based on 5 features: share of poor citizens, median income, Gini coefficient, average severity of crimes, here of unsolved crimes and share of Russians. Let’s start!
conf_items = as.data.frame(crimes_unf$conf.col)
conf_persons = as.data.frame(crimes_unf$conf.row)
conf_persons$region = regions_data_work$region
conf_persons$median_income = regions_data_work$median_income
ggplot(conf_persons, aes(x = D1, y = D2)) +
geom_point(size = conf_persons$median_income/8000, alpha = 0.7, aes(color = median_income, text = region)) +
coord_fixed(xlim = c(-1.4, 1.4), ylim = c(-1.4, 1.4)) +
scale_colour_gradient(low = "red", high = "darkgreen") +
scale_size(range = c(0, 1)) +
labs(
title = "Crime in Russian Regions (Median income)",
x = "Dimension 1",
y = "Dimension 2") +
geom_point(shape = 15, size = 2, aes(x = D1, y = D2), conf_items) +
geom_label_repel(aes(x = D1, y = D2, label = rownames(conf_items)),
conf_items, alpha = 0.8, label.size = 0.2, box.padding = 0.4) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Here 2 patterns can be indicated: crimes against foreigners is committed in regions with medium or high median income (just because more foreigners presented due to better economic opportunities, I guess), while crimes of extremist nature — in regions with low median income (no decent crime subject — so, negative emotions are transforming into radical intentions?).
conf_items = as.data.frame(crimes_unf$conf.col)
conf_persons = as.data.frame(crimes_unf$conf.row)
conf_persons$region = regions_data_work$region
conf_persons$gini = regions_data_work$gini
ggplot(conf_persons, aes(x = D1, y = D2)) +
geom_point(size = conf_persons$gini*8, alpha = 0.7, aes(color = gini, text = region)) +
coord_fixed(xlim = c(-1.4, 1.4), ylim = c(-1.4, 1.4)) +
scale_colour_gradient(low = "red", high = "darkgreen") +
scale_size(range = c(0, 1)) +
labs(
title = "Crime in Russian Regions (Gini coefficient)",
x = "Dimension 1",
y = "Dimension 2") +
geom_point(shape = 15, size = 2, aes(x = D1, y = D2), conf_items) +
geom_label_repel(aes(x = D1, y = D2, label = rownames(conf_items)),
conf_items, alpha = 0.8, label.size = 0.2, box.padding = 0.4) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
Similar picture to median income: crimes against foreigners is committed in regions with high Gini coefficient (just because more foreigners presented due to better economic opportunities, I guess), while crimes of extremist nature — in regions with low Gini coefficient (again, no decent crime subject — so, negative emotions are just transforming into radical intentions?).
conf_items = as.data.frame(crimes_unf$conf.col)
conf_persons = as.data.frame(crimes_unf$conf.row)
conf_persons$region = regions_data_work$region
conf_persons$severity_avg = regions_data_work$severity_avg
ggplot(conf_persons, aes(x = D1, y = D2)) +
geom_point(size = conf_persons$severity_avg*2, alpha = 0.7, aes(color = severity_avg, text = region)) +
coord_fixed(xlim = c(-1.4, 1.4), ylim = c(-1.4, 1.4)) +
scale_colour_gradient(low = "darkgreen", high = "red") +
scale_size(range = c(0, 1)) +
labs(
title = "Crime in Russian Regions (Average severity)",
x = "Dimension 1",
y = "Dimension 2") +
geom_point(shape = 15, size = 2, aes(x = D1, y = D2), conf_items) +
geom_label_repel(aes(x = D1, y = D2, label = rownames(conf_items)),
conf_items, alpha = 0.8, label.size = 0.2, box.padding = 0.4) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
No pattern is presented. Probably, due to low variability (range from 1.6 to 2.2, while 1 is min and 4 is max).
conf_items = as.data.frame(crimes_unf$conf.col)
conf_persons = as.data.frame(crimes_unf$conf.row)
conf_persons$region = regions_data_work$region
conf_persons$unsolved = regions_data_work$unsolved
ggplot(conf_persons, aes(x = D1, y = D2)) +
geom_point(size = conf_persons$unsolved/12, alpha = 0.7, aes(color = unsolved, text = region)) +
coord_fixed(xlim = c(-1.4, 1.4), ylim = c(-1.4, 1.4)) +
scale_colour_gradient(low = "darkgreen", high = "red") +
scale_size(range = c(0, 1)) +
labs(
title = "Crime in Russian Regions (Unsolved crimes)",
x = "Dimension 1",
y = "Dimension 2") +
geom_point(shape = 15, size = 2, aes(x = D1, y = D2), conf_items) +
geom_label_repel(aes(x = D1, y = D2, label = rownames(conf_items)),
conf_items, alpha = 0.8, label.size = 0.2, box.padding = 0.4) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
One pattern: we see that regions with low share of unsolved crimes tend to register a lot of crimes of extremist nature (remnants of the stick system? as such crimes are easy to search and fabricate via social media?).
conf_items = as.data.frame(crimes_unf$conf.col)
conf_items$type = rownames(conf_items)
conf_persons = as.data.frame(crimes_unf$conf.row)
conf_persons$region = regions_data_work$region
plot = ggplot(conf_persons, aes(x = D1, y = D2)) +
geom_point(size = 1.5, alpha = 0.8, aes(text = region)) +
coord_fixed(xlim = c(-1.4, 1.4), ylim = c(-1.4, 1.4)) +
labs(
title = "Crime in Russian Regions (General Graph)",
x = "Dimension 1",
y = "Dimension 2") +
geom_point(shape = 15, size = 2, aes(x = D1, y = D2, text = type), color = "purple", conf_items) +
geom_label_repel(
aes(x = D1, y = D2, label = rownames(conf_items)),
conf_items, alpha = 0.8, label.size = 0.2, box.padding = 0.45) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
ggp_build = plotly_build(plot)
ggp_build$layout$height = 800
ggp_build$layout$width = 800
ggp_build$tooltip = text
plot
ggp_build
General observations: