What

R data analysis for:

Crime among immigrant groups in the Netherlands: the predictive validity of country of origin (in prep)

Start

options(digits = 2)
library(pacman)
p_load(kirkegaard, readr, dplyr, rms, googlesheets4, tmap, sf, broom, readxl, writexl)

#default theme
theme_set(theme_bw())

#redefine describe
describe = function(...) psych::describe(...) %>% as.matrix()

Data

#read data from gdrive

#suspect counts
#combined data
crime = read_sheet("https://docs.google.com/spreadsheets/d/1GI_XW5Icj-5cvWA0MZNKZW3VxsYWcmUSZuhseYdsFFA/edit#gid=0", sheet = 1) %>% df_legalize_names() %>% {set_colnames(., names(.) %>% str_to_lower())}

## Using an auto-discovered, cached token.
## To suppress this message, modify your code or options to clearly consent to the use of a cached token.
## See gargle's "Non-interactive auth" vignette for more details:
## https://gargle.r-lib.org/articles/non-interactive-auth.html
## The googlesheets4 package is using a cached token for the.dfx@gmail.com.

## Reading from 'Public preferences and reality: crime rates among 70 immigrant groups in the Netherlands (data)'

## Range "'Combined_crime_rates_long'"

#by generation
crime_gen = read_sheet("https://docs.google.com/spreadsheets/d/1GI_XW5Icj-5cvWA0MZNKZW3VxsYWcmUSZuhseYdsFFA/edit#gid=0", sheet = 2) %>% df_legalize_names() %>% {set_colnames(., names(.) %>% str_to_lower())}

## Reading from 'Public preferences and reality: crime rates among 70 immigrant groups in the Netherlands (data)'

## Range "'Combined_crime_rates_by_generation_long'"

#Dutch native counts
natives = read_sheet("https://docs.google.com/spreadsheets/d/1GI_XW5Icj-5cvWA0MZNKZW3VxsYWcmUSZuhseYdsFFA/edit#gid=0", sheet = 3) %>% df_legalize_names() %>% {set_colnames(., names(.) %>% str_to_lower())}

## Reading from 'Public preferences and reality: crime rates among 70 immigrant groups in the Netherlands (data)'

## Range "'Dutch_rates'"

#national data from 2017 Kirkegaard
#https://openpsych.net/paper/53
natl = read_csv("data/Megadataset_v2.0p.csv") %>% 
  rename(ISO = X1)

## Parsed with column specification:
## cols(
##   .default = col_double(),
##   X1 = col_character(),
##   LVnotes = col_character(),
##   Names = col_character(),
##   Capital = col_character(),
##   se_Land = col_character(),
##   se_abbrev = col_character(),
##   de_Country = col_character(),
##   de_abbrev = col_character(),
##   de_Country_EN = col_character(),
##   de_Western_neighbor = col_logical(),
##   de_NW_European = col_logical(),
##   dk_Origin = col_character(),
##   Country_EN = col_character()
## )

## See spec(...) for full column specifications.

#Rindermann IQs
rindermann = read_excel("data/cognitive_capitalism_appendix.xlsx") %>% 
  df_legalize_names() %>% 
  mutate(IQ_Rindermann = CA_totc,
         ISO = pu_translate(Country))

## No exact match: Antigua-Barbuda

## No exact match: Benin (Dahomey)

## No exact match: Central Afric R

## No exact match: Congo (Brazz)

## No exact match: Dominican Repub

## No exact match: Equat. Guinea

## No exact match: Korea-North

## No exact match: Korea-South

## No exact match: Nether Antilles

## No exact match: Papua N-Guinea

## No exact match: Sao Tome/Princi

## No exact match: St. Kitts & Nevis

## No exact match: St. Vincent/Gre

## No exact match: Trinidad Tobago

## No exact match: United Arab Emi

## Best fuzzy match found: Antigua-Barbuda -> Antigua & Barbuda with distance 3.00

## Best fuzzy match found: Benin (Dahomey) -> Benin (ex Dahomey) with distance 3.00

## Best fuzzy match found: Central Afric R -> Central Africa with distance 2.00

## Best fuzzy match found: Congo (Brazz) -> Congo (Brazz rep) with distance 4.00

## Best fuzzy match found: Dominican Repub -> Dominican Rep. with distance 2.00

## Best fuzzy match found: Equat. Guinea -> Equ. Guinea with distance 2.00

## Best fuzzy match found: Korea-North -> Korea North with distance 1.00

## Best fuzzy match found: Korea-South -> Korea South with distance 1.00

## Best fuzzy match found: Nether Antilles -> Neth. Antilles with distance 2.00

## Best fuzzy match found: Papua N-Guinea -> Papua New Guinea with distance 3.00

## Best fuzzy match found: Sao Tome/Princi -> Sao Tome & Principe with distance 5.00

## Best fuzzy match found: St. Kitts & Nevis -> St. Kitts en Nevis with distance 2.00

## Best fuzzy match found: St. Vincent/Gre -> St. Vincent with distance 4.00

## Best fuzzy match found: Trinidad Tobago -> Trinidad & Tobago with distance 2.00

## Best fuzzy match found: United Arab Emi -> United Arab Emirates with distance 5.00

#data from German study, 2017, Kirkegaard Becker
#https://openpsych.net/paper/50
natl2 = read_rds("data/main.rds") %>% 
  mutate(ISO = de_abbrev)

#old dutch data
olddutch = read_rds("data/Dutch crime study 2015.rds") %>% 
  rownames_to_column("ISO")

#UN regions
chess2019 = read_rds("data/chess_kirkegaard2019.rds")

#preference data
prefs = read_sheet("https://docs.google.com/spreadsheets/d/1BE1GoWvcEojhUNs5r8Dy05cvEk1aFYSfE0dJqybdGhM/edit#gid=508472659") %>% df_legalize_names()

## Reading from 'Enquête immigratiebeleid  (Responses)'

##

#exclusions
prefs_excl = read_sheet("https://docs.google.com/spreadsheets/d/1BE1GoWvcEojhUNs5r8Dy05cvEk1aFYSfE0dJqybdGhM/edit#gid=508472659", sheet = 2) %>% df_legalize_names()

## Reading from 'Enquête immigratiebeleid  (Responses)'

## Range "'Sheet2'"

#remove rejected rows
prefs_orig = prefs
prefs = prefs %>% filter(Wat_is_uw_Prolific_ID %in% (prefs_excl %>% filter(is.na(rejected)) %>% pull(Wat_is_uw_Prolific_ID)))
prefs_excl$rejected %>% sum(na.rm = T)

## [1] 20

prefs %<>% miss_filter(missing = 1)

Recode

#mutate variables for each dataset
#calculate rates, and relative rates to natives
natives %<>% mutate(
    suspect_rate = total_registered_suspects / population_size,
    arrest_rate = total_arrested_suspects / population_size,
    ISO = origin %>% pu_translate(),
    year = as.numeric(year)
)

crime %<>% mutate(
  suspect_rate = total_registered_suspects / population_size,
  arrest_rate = total_arrested_suspects / population_size,
  ISO = origin %>% pu_translate(),
  year = as.numeric(year)
) %>% 
  #add natives
  bind_rows(natives) %>% 
  #relative rates
  plyr::ddply("year", function(dd) {

    dd$suspect_rate_RR = dd$suspect_rate / natives$suspect_rate[natives$year == (dd$year[1])]
    dd$arrest_rate_RR = dd$arrest_rate / natives$arrest_rate[natives$year == (dd$year[1])]
  
  dd
})

## No exact match: Former Czechoslovakia

## No exact match: Congo (Democratic Republic)

## No exact match: Tunesia

## Best fuzzy match found: Former Czechoslovakia -> Czechoslovakia with distance 7.00

## Best fuzzy match found: Congo (Democratic Republic) -> Congo (Zaire Democratic Republic) with distance 6.00

## Best fuzzy match found: Tunesia -> Tunisia with distance 1.00

crime_gen %<>% mutate(
  suspect_rate = total_registered_suspects / population_size,
  arrest_rate = total_arrested_suspects / population_size,
  ISO = origin %>% pu_translate(),
  year = as.numeric(year)
) %>% 
  plyr::ddply("year", function(dd) {

    dd$suspect_rate_RR = dd$suspect_rate / natives$suspect_rate[natives$year == (dd$year[1])]
    dd$arrest_rate_RR = dd$arrest_rate / natives$arrest_rate[natives$year == (dd$year[1])]
  
  dd
})

## No exact match: Congo (Democratic Republic)

## No exact match: Former Czechoslovakia

## No exact match: Tunesia

## Best fuzzy match found: Congo (Democratic Republic) -> Congo (Zaire Democratic Republic) with distance 6.00

## Best fuzzy match found: Former Czechoslovakia -> Czechoslovakia with distance 7.00

## Best fuzzy match found: Tunesia -> Tunisia with distance 1.00

#average across years
#then join with other natl data
crime_avg = crime %>% 
  group_by(origin) %>% 
  dplyr::summarise(
    pop = wtd_mean(population_size),
    suspect_rate = wtd_mean(suspect_rate, population_size),
    arrest_rate = wtd_mean(arrest_rate, population_size),
    ISO = ISO[1]
  )

crime_gen_avg = crime_gen %>% 
  plyr::ddply(c("origin", "generation"), function(dd) {
    tibble(
      pop = wtd_mean(dd$population_size),
      suspect_rate = wtd_mean(dd$suspect_rate, dd$population_size),
      arrest_rate = wtd_mean(dd$arrest_rate, dd$population_size),
    )
  }) %>% 
  gather(key = stat, value = value, pop, suspect_rate, arrest_rate) %>% 
  unite(col = stat, stat, generation) %>% 
  mutate(stat = stat %>% str_replace("first generation", "1st") %>% str_replace("second generation", "2nd")) %>% 
  spread(key = stat, value = value) %>% 
  mutate(
    prop_2nd = pop_2nd / (pop_1st + pop_2nd),
    ISO = pu_translate(origin)
  )

## No exact match: Congo (Democratic Republic)

## No exact match: Former Czechoslovakia

## No exact match: Tunesia

## Best fuzzy match found: Congo (Democratic Republic) -> Congo (Zaire Democratic Republic) with distance 6.00

## Best fuzzy match found: Former Czechoslovakia -> Czechoslovakia with distance 7.00

## Best fuzzy match found: Tunesia -> Tunisia with distance 1.00

#RRs
crime_avg %<>% mutate(
  arrest_rate_RR = arrest_rate / crime_avg$arrest_rate[crime_avg$ISO=="NLD"],
  suspect_rate_RR = suspect_rate / crime_avg$suspect_rate[crime_avg$ISO=="NLD"]
)

crime_gen_avg %<>% mutate(
  arrest_rate_1st_RR = arrest_rate_1st / crime_avg$arrest_rate[crime_avg$ISO=="NLD"],
  arrest_rate_2nd_RR = arrest_rate_2nd / crime_avg$arrest_rate[crime_avg$ISO=="NLD"],
  suspect_rate_1st_RR = suspect_rate_1st / crime_avg$suspect_rate[crime_avg$ISO=="NLD"],
  suspect_rate_2nd_RR = suspect_rate_2nd / crime_avg$suspect_rate[crime_avg$ISO=="NLD"]
)


#join to one main dataset
d = full_join(crime_avg, crime_gen_avg %>% select(-origin), by = "ISO") %>% 
  #join with national datasets
  left_join(natl %>% select(ISO, IslamPewResearch2010, LV2012estimatedIQ), by = "ISO") %>% 
  left_join(natl2 %>% select(ISO, Muslim, IQ), by = "ISO") %>% 
  left_join(rindermann %>% select(ISO, IQ_Rindermann), by = "ISO") %>% 
  #join with old dutch data
  left_join(olddutch %>% dplyr::select(`12_17men`:YoungPersonsCrimeFactor, ISO), by = "ISO") %>% 
  #fill in missing data from the other cols
  mutate(IQ = miss_fill(IQ, LV2012estimatedIQ),
         Muslim = miss_fill(Muslim, IslamPewResearch2010)) %>% 
  left_join(chess2019 %>% select(ISO, UN_continent, UN_macroregion, UN_region), by = "ISO")

#overlap?
#data with generations does not have Netherlands
crime_avg$ISO[!crime_avg$ISO %in% crime_gen_avg$ISO]

## [1] "NLD"

crime_gen_avg$ISO[!crime_gen_avg$ISO %in% crime_avg$ISO]

## character(0)

#add some missing values
imputed_muslim = c("YUG", "SUN", "CSK", "HKG")
#Yugoslavia
d$UN_continent[d$ISO == "YUG"] = "Europe"
d$UN_region[d$ISO == "YUG"] = "Southern Europe"
d$UN_macroregion[d$ISO == "YUG"] = "Southern Europe"
#Soviet Union
d$UN_continent[d$ISO == "SUN"] = "Europe"
d$UN_region[d$ISO == "SUN"] = "Eastern Europe"
d$UN_macroregion[d$ISO == "SUN"] = "Eastern Europe"
#czechoslovakia
d$UN_continent[d$ISO == "CSK"] = "Europe"
d$UN_region[d$ISO == "CSK"] = "Eastern Europe"
d$UN_macroregion[d$ISO == "CSK"] = "Eastern Europe"
#Hong Kong 
d$UN_continent[d$ISO == "HKG"] = "Asia"
d$UN_region[d$ISO == "HKG"] = "Eastern Asia"
d$UN_macroregion[d$ISO == "HKG"] = "Eastern Asia"

#colonial
d$colonial = F
d$colonial[d$ISO %in% c("ANT", "SUR", "IDN")] = T

#prefs
country_vars = names(prefs) %>% str_subset("specifieke_landen") %>% {.[!str_detect(., "Selecteer")]}
country_prefs = prefs[country_vars]
#fix names, translate
colnames(country_prefs) = str_match(colnames(country_prefs), "landen_(.+)")[, 2] %>% pu_translate()

## No exact match: Argentini

## No exact match: Australi

## No exact match: Belgi

## No exact match: Brazili

## No exact match: Democratische_Republiek_van_Congo

## No exact match: Dominicaanse_Republiek

## No exact match: Ethiopi

## No exact match: Filippijnen

## No exact match: Indi

## No exact match: Indonesi

## No exact match: Isra_l

## No exact match: Itali

## No exact match: Voormalig_Joegoslavi

## No exact match: Kaapverdi

## No exact match: Malasia

## No exact match: Nederlandse_Antillen

## No exact match: Nieuw_Zeeland

## No exact match: Roemeni

## No exact match: Sierra_Leone

## No exact match: Somali

## No exact match: Sri_Lanka

## No exact match: Syri

## No exact match: Tunesi

## No exact match: Verenigde_Staten_van_Amerika

## No exact match: Verenigd_Koningkrijk

## No exact match: Zuid_Afrika

## No exact match: Zuid_Korea

## Best fuzzy match found: Argentini -> Argentina with distance 1.00

## Best fuzzy match found: Australi -> Australia with distance 1.00

## Best fuzzy match found: Belgi -> Belgio with distance 1.00

## Best fuzzy match found: Brazili -> Brazil with distance 1.00

## Best fuzzy match found: Democratische_Republiek_van_Congo -> Democratic Republic of Congo with distance 11.00

## Best fuzzy match found: Dominicaanse_Republiek -> Dominicaanse Republiek with distance 1.00

## Best fuzzy match found: Ethiopi -> Ethiopia with distance 1.00

## Best fuzzy match found: Filippijnen -> Filipijnen with distance 1.00

## Best fuzzy match found: Indi -> India with distance 1.00

## Best fuzzy match found: Indonesi -> Indonesia with distance 1.00

## Best fuzzy match found: Isra_l -> Israel with distance 1.00

## Best fuzzy match found: Itali -> Italy with distance 1.00

## Best fuzzy match found: Voormalig_Joegoslavi -> Joegoslavië with distance 11.00

## Best fuzzy match found: Kaapverdi -> Kapp Verde with distance 4.00

## Best fuzzy match found: Malasia -> Malaysia with distance 1.00

## Best fuzzy match found: Nederlandse_Antillen -> Netherlands Antilles with distance 5.00

## Best fuzzy match found: Nieuw_Zeeland -> Nieuw-Zeeland with distance 1.00

## Best fuzzy match found: Roemeni -> Roemenië with distance 1.00

## Best fuzzy match found: Sierra_Leone -> Sierra Leone with distance 1.00

## Best fuzzy match found: Somali -> Somalia with distance 1.00

## Best fuzzy match found: Sri_Lanka -> Sri Lanka with distance 1.00

## Best fuzzy match found: Syri -> Syria with distance 1.00

## Best fuzzy match found: Tunesi -> Tunesië with distance 1.00

## Best fuzzy match found: Verenigde_Staten_van_Amerika -> Verenigde Staten van Amerika with distance 3.00

## Best fuzzy match found: Verenigd_Koningkrijk -> Verenigd Koninkrijk with distance 2.00

## Best fuzzy match found: Zuid_Afrika -> Zuid-Afrika with distance 1.00

## Best fuzzy match found: Zuid_Korea -> Zuid-Korea with distance 1.00

#recode to -1 and +1's
country_prefs_num = country_prefs
for (v in colnames(country_prefs_num)) {
  country_prefs_num[[v]] %<>% mapvalues(from = c("We moeten helemaal geen mensen meer uit dit land toelaten.", "We moeten minder mensen uit dit land toelaten.", "We moeten meer mensen uit dit land toelaten.", "We moeten net zoveel mensen uit dit land toelaten als nu."), to = c(1, 1, -1, -1)) %>% as.numeric()
}

#replace values
prefs$last_vote = prefs$Wat_heeft_u_bij_de_afgelopen_Tweede_Kamerverkiezingen_gestemd
prefs$last_vote %<>% mapvalues(from = c("Forum voor Democratie"), to = c("FvD"))

#weights for representative sample wrt. parties
#https://en.wikipedia.org/wiki/2017_Dutch_general_election#Results
party_votes = c(
  "VVD" = 21.3,
  "PVV" = 13.1,
  "CDA" = 12.4,
  "D66" = 12.2,
  "GroenLinks" = 9.1,
  "SP" = 9.1,
  "PvdA" = 5.7,
  "ChristenUnie" = 3.4,
  "Partij voor de Dieren" = 3.2,
  "50Plus" = 3.1,
  "SGP" = 2.1,
  "DENK" = 2.1,
  "Forum voor Democratie" = 1.8
)

#calculate weights
#start with 0, for non-parties
weights = rep(0, nrow(prefs))
for (p in names(party_votes)) {
  which_in_p = prefs$last_vote == p
  weights[which_in_p] = party_votes[p] / sum(which_in_p)
}

#sums match?
#some parties aren't represented in our data, so we cannot adjust for them
sum(party_votes)

## [1] 99

sum(weights)

## [1] 92

#means by country
country_prefs_avg = tibble(
  ISO = colnames(country_prefs),
  net_opposition_raw = colMeans(country_prefs_num, na.rm = T),
  net_opposition = map_dbl(country_prefs_num, ~wtd_mean(., w = weights)),
  frac_none = map_dbl(country_prefs, ~wtd_mean(.=="We moeten helemaal geen mensen meer uit dit land toelaten.", w = weights)),
  frac_fewer = map_dbl(country_prefs, ~wtd_mean(.=="We moeten minder mensen uit dit land toelaten.", w = weights)),
  frac_same = map_dbl(country_prefs, ~wtd_mean(.=="We moeten net zoveel mensen uit dit land toelaten als nu.", w = weights)),
  frac_more = map_dbl(country_prefs, ~wtd_mean(.=="We moeten meer mensen uit dit land toelaten.", w = weights))
)

#correlations?
country_prefs_avg[-1] %>% wtd.cors()

##                    net_opposition_raw net_opposition frac_none frac_fewer
## net_opposition_raw               1.00           0.96      0.85       0.93
## net_opposition                   0.96           1.00      0.90       0.96
## frac_none                        0.85           0.90      1.00       0.76
## frac_fewer                       0.93           0.96      0.76       1.00
## frac_same                       -0.87          -0.94     -0.89      -0.88
## frac_more                       -0.46          -0.40     -0.26      -0.45
##                    frac_same frac_more
## net_opposition_raw    -0.866    -0.462
## net_opposition        -0.938    -0.399
## frac_none             -0.887    -0.259
## frac_fewer            -0.880    -0.448
## frac_same              1.000     0.055
## frac_more              0.055     1.000

#join
d = left_join(d, country_prefs_avg, by = "ISO")

Results

Descriptives

#count
nrow(d)

## [1] 71

#years covered
crime$year %>% range()

## [1] 2005 2018

#data total
d$pop %>% sum()

## [1] 6547215

#highest factor diff
d$suspect_rate_RR %>% range() %>% 
  {
    print(.)
    .[2] / .[1]}

## [1] 0.22 3.81

## [1] 17

#prefs
d %>% select(net_opposition:frac_more) %>% wtd.cors()

##                net_opposition frac_none frac_fewer frac_same frac_more
## net_opposition           1.00      0.90       0.96    -0.938    -0.399
## frac_none                0.90      1.00       0.76    -0.887    -0.259
## frac_fewer               0.96      0.76       1.00    -0.880    -0.448
## frac_same               -0.94     -0.89      -0.88     1.000     0.055
## frac_more               -0.40     -0.26      -0.45     0.055     1.000

d %>% select(origin, net_opposition:frac_more)

#prefs demographics
prefs %>% filter(last_vote %in% names(party_votes)) %>% pull(last_vote) %>% table2()

#zero cells
(crime$suspect_rate==0) %>% wtd_mean()

## [1] 0.001

(crime$arrest_rate==0) %>% wtd_mean()

## [1] 0.0028

(crime_gen$suspect_rate==0) %>% wtd_mean()

## [1] 0.031

(crime_gen$arrest_rate==0) %>% wtd_mean()

## [1] 0.052

#Muslim and IQ in this sample and elsewhere
d %>% select(Muslim, IQ) %>% wtd.cor()

## Warning in summary.lm(lm(stdz(y, weight = weight) ~ stdz(x, weight = weight), :
## essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(stdz(y, weight = weight) ~ stdz(x, weight = weight), :
## essentially perfect fit: summary may be unreliable

## $correlation
##        Muslim    IQ
## Muslim   1.00 -0.42
## IQ      -0.42  1.00
## 
## $std.err
##         Muslim      IQ
## Muslim 2.6e-17 1.1e-01
## IQ     1.1e-01 7.9e-17
## 
## $t.value
##          Muslim       IQ
## Muslim  3.9e+16 -3.9e+00
## IQ     -3.9e+00  1.3e+16
## 
## $p.value
##         Muslim      IQ
## Muslim 0.00000 0.00026
## IQ     0.00026 0.00000

d %>% select(Muslim, IQ) %>% pairwiseCount()

##        Muslim IQ
## Muslim     71 71
## IQ         71 71

natl %>% select(IslamPewResearch2010, LV2012estimatedIQ) %>% wtd.cor()

## Warning in summary.lm(lm(stdz(y, weight = weight) ~ stdz(x, weight = weight), :
## essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(stdz(y, weight = weight) ~ stdz(x, weight = weight), :
## essentially perfect fit: summary may be unreliable

## $correlation
##                      IslamPewResearch2010 LV2012estimatedIQ
## IslamPewResearch2010                 1.00             -0.27
## LV2012estimatedIQ                   -0.27              1.00
## 
## $std.err
##                      IslamPewResearch2010 LV2012estimatedIQ
## IslamPewResearch2010              3.5e-17           6.9e-02
## LV2012estimatedIQ                 6.9e-02           2.1e-17
## 
## $t.value
##                      IslamPewResearch2010 LV2012estimatedIQ
## IslamPewResearch2010              2.8e+16          -3.9e+00
## LV2012estimatedIQ                -3.9e+00           4.8e+16
## 
## $p.value
##                      IslamPewResearch2010 LV2012estimatedIQ
## IslamPewResearch2010              0.00000           0.00015
## LV2012estimatedIQ                 0.00015           0.00000

natl %>% select(IslamPewResearch2010, LV2012estimatedIQ) %>% pairwiseCount()

##                      IslamPewResearch2010 LV2012estimatedIQ
## IslamPewResearch2010                  232               198
## LV2012estimatedIQ                     198               203

#regional codings
d$UN_continent %>% table2()

d$UN_macroregion %>% table2()

Temporal development of crime

#temporal development
suspects_largest_10 = crime %>% 
  filter(year == 2018) %>% 
  arrange(-population_size) %>% 
  pull(origin) %>% head(10)

crime %>% 
  #largest 10 in 2018
  filter(origin %in% suspects_largest_10) %>% 
  ggplot(aes(year, suspect_rate, color = origin)) +
  geom_line() +
  scale_x_continuous("Year", breaks = seq(2005, 2018, by = 3)) +
  scale_y_continuous("Crime rate")

GG_save("figs/suspect_rate_timeline.png")

#relative rates
crime %>% 
  #largest 10 in 2018
  filter(origin %in% suspects_largest_10) %>% 
  ggplot(aes(year, suspect_rate_RR, color = origin)) +
  geom_line() +
  scale_x_continuous("Year", breaks = seq(2005, 2018, by = 3)) +
  scale_y_continuous("Crime rate")

GG_save("figs/suspect_rate_RR_timeline.png")

#By generation
suspects_gen_largest_10 = crime_gen %>% 
  plyr::ddply(c("origin"), function(dd) {
  tibble(
    origin = dd$origin[1],
    pop = dd$population_size %>% sum()
  )
}) %>% arrange(-pop) %>% pull(origin) %>% head(10)

crime_gen %>% 
  #largest 10 in 2018
  filter(origin %in% suspects_gen_largest_10) %>% 
  ggplot(aes(year, suspect_rate, color = origin, linetype = generation)) +
  geom_line() +
  scale_x_continuous("Year", breaks = seq(2005, 2018, by = 3)) +
  scale_y_continuous("Crime rate")

GG_save("figs/suspect_rate_timeline_gen.png")

#population level
d_years = crime %>% 
  group_by(year) %>% 
  summarise(suspects = sum(total_registered_suspects),
            arrests = sum(total_arrested_suspects),
            population = sum(population_size),
            suspect_rate = suspects/population,
            arrest_rate = arrests/population
            ) %>% 
  mutate(
    suspect_rate_RR = suspect_rate/first(suspect_rate),
    arrest_rate_RR = arrest_rate/first(arrest_rate)
    )

d_years

#within each
d_origins = crime %>% 
  arrange(year) %>% 
  group_by(origin) %>% 
  summarise(
    first = first(suspect_rate),
    last = last(suspect_rate),
    RR = last / first)

d_origins

#descriptives
d_origins$RR[d_origins$origin != "The Netherlands"] %>% describe()

##    vars  n mean   sd median trimmed  mad  min max range skew kurtosis    se
## X1    1 70  0.5 0.23   0.48    0.48 0.14 0.14   2   1.9  3.8       22 0.028

d_origins$RR[d_origins$origin == "The Netherlands"]

## [1] 0.45

d_origins[-1] %>% wtd.cors()

##       first last    RR
## first 1.000 0.93 0.023
## last  0.932 1.00 0.291
## RR    0.023 0.29 1.000

Prediction analysis

#scattersplots
GG_scatter(d, "IQ", "suspect_rate_RR", case_names = "origin") +
  xlab("Origin country national IQ")

GG_save("figs/IQ_crime.png")

GG_scatter(d, "Muslim", "suspect_rate_RR", case_names = "origin", text_pos = "tr") +
  xlab("Origin country Muslim% of population")

GG_save("figs/Muslim_crime.png")


#with weights 
GG_scatter(d, "IQ", "suspect_rate_RR", case_names = "origin", weights = d$pop %>% sqrt()) +
  xlab("Origin country national IQ")

GG_save("figs/IQ_crime_wt.png")

GG_scatter(d, "Muslim", "suspect_rate_RR", case_names = "origin", text_pos = "tr", weights = d$pop %>% sqrt()) +
  xlab("Origin country Muslim% of population")

GG_save("figs/Muslim_crime_wt.png")


#correlations
combine_upperlower(
  wtd.cors(d %>% select(suspect_rate, suspect_rate_1st, suspect_rate_2nd, Muslim, IQ, net_opposition, prop_2nd)),
  wtd.cors(d %>% select(suspect_rate, suspect_rate_1st, suspect_rate_2nd, Muslim, IQ, net_opposition, prop_2nd), weight = d$pop %>% sqrt())
)

##                  suspect_rate suspect_rate_1st suspect_rate_2nd Muslim    IQ
## suspect_rate               NA             0.96            0.897   0.44 -0.66
## suspect_rate_1st        0.946               NA            0.797   0.39 -0.66
## suspect_rate_2nd        0.906             0.76               NA   0.43 -0.59
## Muslim                  0.452             0.27            0.432     NA -0.42
## IQ                     -0.635            -0.56           -0.552  -0.52    NA
## net_opposition          0.458             0.41            0.527   0.68 -0.63
## prop_2nd               -0.021            -0.19           -0.024   0.10  0.18
##                  net_opposition prop_2nd
## suspect_rate               0.57   -0.084
## suspect_rate_1st           0.58   -0.239
## suspect_rate_2nd           0.58   -0.089
## Muslim                     0.66   -0.070
## IQ                        -0.71    0.299
## net_opposition               NA   -0.477
## prop_2nd                  -0.43       NA

#regression
ols(suspect_rate_RR ~ IQ + Muslim, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt())

## Linear Regression Model
##  
##  ols(formula = suspect_rate_RR ~ IQ + Muslim, data = d %>% df_standardize(messages = F), 
##      weights = d$pop %>% sqrt())
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      71    LR chi2     39.22    R2       0.424    
##  sigma9.5523    d.f.            2    R2 adj   0.407    
##  d.f.     68    Pr(> chi2) 0.0000    g        0.789    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.7953 -0.6496 -0.1990  0.1089  2.2776 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept  0.1764 0.0965  1.83 0.0719  
##  IQ        -0.6065 0.1190 -5.10 <0.0001 
##  Muslim     0.1607 0.1020  1.58 0.1198  
##

ols(suspect_rate_RR ~ IQ + Muslim, data = d %>% df_standardize(messages = F))

## Linear Regression Model
##  
##  ols(formula = suspect_rate_RR ~ IQ + Muslim, data = d %>% df_standardize(messages = F))
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      71    LR chi2     44.65    R2       0.467    
##  sigma0.7409    d.f.            2    R2 adj   0.451    
##  d.f.     68    Pr(> chi2) 0.0000    g        0.781    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -1.53838 -0.46878 -0.03552  0.28373  2.48775 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept  0.0000 0.0879  0.00 1.0000  
##  IQ        -0.5738 0.0976 -5.88 <0.0001 
##  Muslim     0.2011 0.0976  2.06 0.0432  
##

#compare models
lrtest(
  ols(suspect_rate_RR ~ IQ, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(suspect_rate_RR ~ IQ + Muslim, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt())
)

## 
## Model 1: suspect_rate_RR ~ IQ
## Model 2: suspect_rate_RR ~ IQ + Muslim
## 
## L.R. Chisq       d.f.          P 
##       2.55       1.00       0.11

#by generation
ols(suspect_rate_1st_RR ~ IQ + Muslim, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt())

## Frequencies of Missing Values Due to Each Variable
## suspect_rate_1st_RR                  IQ              Muslim           (weights) 
##                   1                   0                   0                   0 
## 
## Linear Regression Model
##  
##  ols(formula = suspect_rate_1st_RR ~ IQ + Muslim, data = d %>% 
##      df_standardize(messages = F), weights = d$pop %>% sqrt())
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      70    LR chi2     26.41    R2       0.314    
##  sigma9.6951    d.f.            2    R2 adj   0.294    
##  d.f.     67    Pr(> chi2) 0.0000    g        0.733    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.8586 -0.5475 -0.2512  0.1496  2.9419 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept  0.1621 0.1063  1.52 0.1320  
##  IQ        -0.6356 0.1314 -4.84 <0.0001 
##  Muslim     0.0204 0.1037  0.20 0.8445  
##

ols(suspect_rate_2nd_RR ~ IQ + Muslim, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt())

## Frequencies of Missing Values Due to Each Variable
## suspect_rate_2nd_RR                  IQ              Muslim           (weights) 
##                   1                   0                   0                   0 
## 
## Linear Regression Model
##  
##  ols(formula = suspect_rate_2nd_RR ~ IQ + Muslim, data = d %>% 
##      df_standardize(messages = F), weights = d$pop %>% sqrt())
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs       70    LR chi2     29.69    R2       0.346    
##  sigma10.2171    d.f.            2    R2 adj   0.326    
##  d.f.      67    Pr(> chi2) 0.0000    g        0.782    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.8801 -0.5380 -0.1417  0.1111  2.2376 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept  0.1338 0.1120  1.19 0.2367  
##  IQ        -0.5586 0.1384 -4.04 0.0001  
##  Muslim     0.2240 0.1092  2.05 0.0442  
##

#combined model
#long form the generation data
crime_gen2= crime_gen %>% 
  left_join(d %>% select(ISO, IQ, Muslim), by = "ISO")

ols(suspect_rate ~ IQ + Muslim + generation, data = crime_gen2 %>% df_standardize(messages = F), weights = crime_gen2$population_size %>% sqrt())

## Linear Regression Model
##  
##  ols(formula = suspect_rate ~ IQ + Muslim + generation, data = crime_gen2 %>% 
##      df_standardize(messages = F), weights = crime_gen2$population_size %>% 
##      sqrt())
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1960    LR chi2    742.19    R2       0.315    
##  sigma7.8267    d.f.            3    R2 adj   0.314    
##  d.f.   1956    Pr(> chi2) 0.0000    g        0.713    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -3.1639 -0.5645 -0.1978  0.2104  3.7282 
##  
##  
##                               Coef    S.E.   t      Pr(>|t|)
##  Intercept                    -0.1405 0.0260  -5.41 <0.0001 
##  IQ                           -0.5053 0.0242 -20.87 <0.0001 
##  Muslim                        0.1161 0.0192   6.05 <0.0001 
##  generation=second generation  0.5387 0.0388  13.87 <0.0001 
##

ols(suspect_rate ~ IQ + Muslim + generation, data = crime_gen2 %>% df_standardize(messages = F))

## Linear Regression Model
##  
##  ols(formula = suspect_rate ~ IQ + Muslim + generation, data = crime_gen2 %>% 
##      df_standardize(messages = F))
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs    1960    LR chi2    878.61    R2       0.361    
##  sigma0.7998    d.f.            3    R2 adj   0.360    
##  d.f.   1956    Pr(> chi2) 0.0000    g        0.686    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.95544 -0.43459 -0.08021  0.32838  3.82923 
##  
##  
##                               Coef    S.E.   t      Pr(>|t|)
##  Intercept                    -0.2851 0.0255 -11.16 <0.0001 
##  IQ                           -0.4464 0.0199 -22.45 <0.0001 
##  Muslim                        0.1535 0.0199   7.72 <0.0001 
##  generation=second generation  0.5701 0.0361  15.78 <0.0001 
##

#correlations with old data
wtd.cors(d %>% select(suspect_rate, suspect_rate_1st, suspect_rate_2nd,
                      arrest_rate, arrest_rate_1st, arrest_rate_2nd,
                      `12_17men`:YoungPersonsCrimeFactor)) %>% 
  .[-c(1:6), c(1:6)]

##                         suspect_rate suspect_rate_1st suspect_rate_2nd
## 12_17men                        0.82             0.76             0.87
## 12_17women                      0.75             0.70             0.71
## 18_24men                        0.89             0.80             0.91
## 18_24women                      0.74             0.75             0.68
## 25_44men                        0.90             0.89             0.81
## 25_44women                      0.78             0.86             0.59
## 45_79men                        0.78             0.82             0.66
## 45_79women                      0.50             0.57             0.41
## All                             0.92             0.93             0.82
## 12_17_1gen                      0.89             0.92             0.79
## 18_24_1gen                      0.93             0.93             0.85
## 25_44_1gen                      0.87             0.96             0.69
## 45_79_1gen                      0.72             0.87             0.51
## All_1gen                        0.89             0.97             0.72
## 12_17_2gen                      0.85             0.74             0.90
## 18_24_2gen                      0.83             0.63             0.94
## 25_44_2gen                      0.79             0.59             0.91
## 45_79_2gen                      0.27             0.30             0.15
## All_2gen                        0.81             0.60             0.92
## YoungPersonsCrimeFactor         0.96             0.89             0.95
##                         arrest_rate arrest_rate_1st arrest_rate_2nd
## 12_17men                       0.83            0.77            0.82
## 12_17women                     0.74            0.69            0.65
## 18_24men                       0.90            0.81            0.88
## 18_24women                     0.74            0.75            0.55
## 25_44men                       0.92            0.92            0.76
## 25_44women                     0.79            0.87            0.47
## 45_79men                       0.78            0.82            0.62
## 45_79women                     0.51            0.58            0.34
## All                            0.93            0.94            0.71
## 12_17_1gen                     0.90            0.91            0.77
## 18_24_1gen                     0.94            0.93            0.83
## 25_44_1gen                     0.89            0.97            0.67
## 45_79_1gen                     0.75            0.89            0.49
## All_1gen                       0.91            0.98            0.70
## 12_17_2gen                     0.83            0.72            0.89
## 18_24_2gen                     0.80            0.61            0.94
## 25_44_2gen                     0.78            0.56            0.93
## 45_79_2gen                     0.28            0.32            0.13
## All_2gen                       0.79            0.57            0.94
## YoungPersonsCrimeFactor        0.96            0.88            0.94

Preferences

#scatterplots
GG_scatter(d, "suspect_rate_RR", "net_opposition", case_names = "origin", text_pos = "tl")

GG_save("figs/suspect_opposition.png")

GG_scatter(d, "suspect_rate_RR", "net_opposition", case_names = "origin", text_pos = "tl", weights = d$pop %>% sqrt())

GG_save("figs/suspect_opposition_wt.png")

#without colonials
d %>% filter(!colonial) %>% select(suspect_rate, net_opposition) %>% wtd.cors()

##                suspect_rate net_opposition
## suspect_rate           1.00           0.66
## net_opposition         0.66           1.00

d %>% filter(!colonial) %>% select(suspect_rate, net_opposition, pop) %>% {wtd.cors(., weight = .$pop %>% sqrt())}

##                suspect_rate net_opposition   pop
## suspect_rate           1.00           0.68 -0.29
## net_opposition         0.68           1.00  0.33
## pop                   -0.29           0.33  1.00

#regression models
regs = list(
  ols(net_opposition ~ suspect_rate, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate + colonial, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate + colonial + Muslim, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate + colonial + Muslim + UN_continent, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate + colonial + Muslim + UN_macroregion, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt())
)
names(regs) = 1:length(regs)

#residuals for colonials
regs$`1` %>% {
  y = resid(.) %>% standardize()
  names(y) = d$origin
  y[d$colonial]
}

##            Indonesia Netherlands Antilles             Suriname 
##               -0.084               -2.172               -1.956

#full output
regs

## $`1`
## Frequencies of Missing Values Due to Each Variable
## net_opposition   suspect_rate      (weights) 
##              3              0              0 
## 
## Linear Regression Model
##  
##  ols(formula = net_opposition ~ suspect_rate, data = d %>% df_standardize(messages = F), 
##      weights = d$pop %>% sqrt())
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs       68    LR chi2     15.99    R2       0.210    
##  sigma10.3005    d.f.            1    R2 adj   0.198    
##  d.f.      66    Pr(> chi2) 0.0001    g        0.494    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.7445 -0.7475  0.2077  0.6795  1.6717 
##  
##  
##               Coef    S.E.   t     Pr(>|t|)
##  Intercept    -0.0695 0.1144 -0.61 0.5458  
##  suspect_rate  0.4383 0.1048  4.18 <0.0001 
##  
## 
## $`2`
## Frequencies of Missing Values Due to Each Variable
## net_opposition   suspect_rate       colonial      (weights) 
##              3              0              0              0 
## 
## Linear Regression Model
##  
##  ols(formula = net_opposition ~ suspect_rate + colonial, data = d %>% 
##      df_standardize(messages = F), weights = d$pop %>% sqrt())
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      68    LR chi2     32.95    R2       0.384    
##  sigma9.1623    d.f.            2    R2 adj   0.365    
##  d.f.     65    Pr(> chi2) 0.0000    g        0.618    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.2878 -0.8297  0.0640  0.5951  1.5217 
##  
##  
##               Coef    S.E.   t     Pr(>|t|)
##  Intercept     0.0780 0.1074  0.73 0.4705  
##  suspect_rate  0.5489 0.0967  5.68 <0.0001 
##  colonial     -1.3200 0.3076 -4.29 <0.0001 
##  
## 
## $`3`
## Frequencies of Missing Values Due to Each Variable
## net_opposition   suspect_rate       colonial         Muslim      (weights) 
##              3              0              0              0              0 
## 
## Linear Regression Model
##  
##  ols(formula = net_opposition ~ suspect_rate + colonial + Muslim, 
##      data = d %>% df_standardize(messages = F), weights = d$pop %>% 
##          sqrt())
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      68    LR chi2     70.34    R2       0.645    
##  sigma7.0146    d.f.            3    R2 adj   0.628    
##  d.f.     64    Pr(> chi2) 0.0000    g        0.764    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -0.97919 -0.43661  0.01758  0.61289  1.29841 
##  
##  
##               Coef    S.E.   t     Pr(>|t|)
##  Intercept    -0.0095 0.0832 -0.11 0.9097  
##  suspect_rate  0.3268 0.0808  4.04 0.0001  
##  colonial     -1.1960 0.2362 -5.06 <0.0001 
##  Muslim        0.5034 0.0735  6.85 <0.0001 
##  
## 
## $`4`
## Frequencies of Missing Values Due to Each Variable
## net_opposition   suspect_rate       colonial         Muslim   UN_continent 
##              3              0              0              0              0 
##      (weights) 
##              0 
## 
## Linear Regression Model
##  
##  ols(formula = net_opposition ~ suspect_rate + colonial + Muslim + 
##      UN_continent, data = d %>% df_standardize(messages = F), 
##      weights = d$pop %>% sqrt())
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      68    LR chi2     88.75    R2       0.729    
##  sigma6.3274    d.f.            7    R2 adj   0.697    
##  d.f.     60    Pr(> chi2) 0.0000    g        0.917    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -0.88265 -0.40730 -0.06779  0.29813  1.50423 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -0.3688 0.1360 -2.71 0.0087  
##  suspect_rate           0.3663 0.0990  3.70 0.0005  
##  colonial              -1.1974 0.2484 -4.82 <0.0001 
##  Muslim                 0.2701 0.0939  2.88 0.0055  
##  UN_continent=Africa    0.6648 0.2690  2.47 0.0163  
##  UN_continent=Americas  0.1759 0.2489  0.71 0.4826  
##  UN_continent=Asia      0.8292 0.2221  3.73 0.0004  
##  UN_continent=Oceania  -0.7103 0.5141 -1.38 0.1722  
##  
## 
## $`5`
## Frequencies of Missing Values Due to Each Variable
## net_opposition   suspect_rate       colonial         Muslim UN_macroregion 
##              3              0              0              0              0 
##      (weights) 
##              0 
## 
## Linear Regression Model
##  
##  ols(formula = net_opposition ~ suspect_rate + colonial + Muslim + 
##      UN_macroregion, data = d %>% df_standardize(messages = F), 
##      weights = d$pop %>% sqrt())
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      68    LR chi2    138.53    R2       0.870    
##  sigma4.5828    d.f.           12    R2 adj   0.841    
##  d.f.     55    Pr(> chi2) 0.0000    g        1.058    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -0.92875 -0.23372 -0.05334  0.23475  1.17074 
##  
##  
##                                    Coef    S.E.   t     Pr(>|t|)
##  Intercept                         -0.8761 0.1289 -6.80 <0.0001 
##  suspect_rate                       0.2132 0.0895  2.38 0.0206  
##  colonial                          -1.1158 0.2329 -4.79 <0.0001 
##  Muslim                             0.3454 0.0958  3.61 0.0007  
##  UN_macroregion=Caribbean           0.8884 0.3939  2.26 0.0281  
##  UN_macroregion=Latin America       0.8521 0.2231  3.82 0.0003  
##  UN_macroregion=Africa              1.5499 0.2454  6.32 <0.0001 
##  UN_macroregion=Eastern Asia        0.9529 0.2564  3.72 0.0005  
##  UN_macroregion=Eastern Europe      1.7101 0.2254  7.59 <0.0001 
##  UN_macroregion=MENA                1.1775 0.2907  4.05 0.0002  
##  UN_macroregion=South-Eastern Asia  1.1056 0.2402  4.60 <0.0001 
##  UN_macroregion=Southern Asia       1.4684 0.2756  5.33 <0.0001 
##  UN_macroregion=Southern Europe     0.5369 0.2213  2.43 0.0185  
##

#summarize
regs %>% summarize_models(asterisks_only = F)

Robustness tests

Rindermann IQs

Alternative IQ estimates.

#plots
GG_scatter(d, "IQ_Rindermann", "suspect_rate_RR", case_names = "origin") +
  xlab("Origin country national IQ")

GG_scatter(d, "IQ_Rindermann", "suspect_rate_RR", case_names = "origin", weights = d$pop %>% sqrt()) +
  xlab("Origin country national IQ")

#correlations
wtd.cors(d %>% select(suspect_rate, Muslim, IQ, IQ_Rindermann))

##               suspect_rate Muslim    IQ IQ_Rindermann
## suspect_rate          1.00   0.44 -0.66         -0.65
## Muslim                0.44   1.00 -0.42         -0.38
## IQ                   -0.66  -0.42  1.00          0.98
## IQ_Rindermann        -0.65  -0.38  0.98          1.00

wtd.cors(d %>% select(suspect_rate, Muslim, IQ, IQ_Rindermann), weight = d$pop %>% sqrt())

##               suspect_rate Muslim    IQ IQ_Rindermann
## suspect_rate          1.00   0.45 -0.64         -0.65
## Muslim                0.45   1.00 -0.52         -0.53
## IQ                   -0.64  -0.52  1.00          0.98
## IQ_Rindermann        -0.65  -0.53  0.98          1.00

#regression
ols(suspect_rate ~ IQ_Rindermann + Muslim, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt())

## Frequencies of Missing Values Due to Each Variable
##  suspect_rate IQ_Rindermann        Muslim     (weights) 
##             0             4             0             0 
## 
## Linear Regression Model
##  
##  ols(formula = suspect_rate ~ IQ_Rindermann + Muslim, data = d %>% 
##      df_standardize(messages = F), weights = d$pop %>% sqrt())
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      67    LR chi2     38.98    R2       0.441    
##  sigma9.6493    d.f.            2    R2 adj   0.424    
##  d.f.     64    Pr(> chi2) 0.0000    g        0.786    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.7435 -0.6348 -0.2164  0.1140  2.2975 
##  
##  
##                Coef    S.E.   t     Pr(>|t|)
##  Intercept      0.1635 0.1001  1.63 0.1072  
##  IQ_Rindermann -0.6165 0.1196 -5.15 <0.0001 
##  Muslim         0.1478 0.1050  1.41 0.1642  
##

ols(suspect_rate ~ IQ_Rindermann + Muslim, data = d %>% df_standardize(messages = F))

## Frequencies of Missing Values Due to Each Variable
##  suspect_rate IQ_Rindermann        Muslim 
##             0             4             0 
## 
## Linear Regression Model
##  
##  ols(formula = suspect_rate ~ IQ_Rindermann + Muslim, data = d %>% 
##      df_standardize(messages = F))
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      67    LR chi2     41.75    R2       0.464    
##  sigma0.7603    d.f.            2    R2 adj   0.447    
##  d.f.     64    Pr(> chi2) 0.0000    g        0.794    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.4500 -0.4328 -0.0410  0.3099  2.5330 
##  
##  
##                Coef    S.E.   t     Pr(>|t|)
##  Intercept     -0.0112 0.0929 -0.12 0.9046  
##  IQ_Rindermann -0.5730 0.1014 -5.65 <0.0001 
##  Muslim         0.2278 0.0994  2.29 0.0252  
##

#compare models
lrtest(
  ols(suspect_rate ~ IQ_Rindermann, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(suspect_rate ~ IQ_Rindermann + Muslim, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt())
)

## 
## Model 1: suspect_rate ~ IQ_Rindermann
## Model 2: suspect_rate ~ IQ_Rindermann + Muslim
## 
## L.R. Chisq       d.f.          P 
##       2.04       1.00       0.15

Unweighted Prolific survey

Don’t weigh the Prolific results by demopgrahics (voting choice), but accept the sample skew.

#scatterplots
GG_scatter(d, "suspect_rate_RR", "net_opposition_raw", case_names = "origin", text_pos = "tl")

GG_scatter(d, "suspect_rate_RR", "net_opposition_raw", case_names = "origin", text_pos = "tl", weights = d$pop %>% sqrt())

#correlations
wtd.cors(d %>% select(suspect_rate, Muslim, IQ, net_opposition, net_opposition_raw))

##                    suspect_rate Muslim    IQ net_opposition net_opposition_raw
## suspect_rate               1.00   0.44 -0.66           0.57               0.54
## Muslim                     0.44   1.00 -0.42           0.66               0.63
## IQ                        -0.66  -0.42  1.00          -0.71              -0.68
## net_opposition             0.57   0.66 -0.71           1.00               0.96
## net_opposition_raw         0.54   0.63 -0.68           0.96               1.00

wtd.cors(d %>% select(suspect_rate, Muslim, IQ, net_opposition, net_opposition_raw), weight = d$pop %>% sqrt())

##                    suspect_rate Muslim    IQ net_opposition net_opposition_raw
## suspect_rate               1.00   0.45 -0.64           0.46               0.46
## Muslim                     0.45   1.00 -0.52           0.68               0.69
## IQ                        -0.64  -0.52  1.00          -0.63              -0.60
## net_opposition             0.46   0.68 -0.63           1.00               0.96
## net_opposition_raw         0.46   0.69 -0.60           0.96               1.00

#without colonials
d %>% filter(!colonial) %>% select(suspect_rate, net_opposition, net_opposition_raw) %>% wtd.cors()

##                    suspect_rate net_opposition net_opposition_raw
## suspect_rate               1.00           0.66               0.63
## net_opposition             0.66           1.00               0.96
## net_opposition_raw         0.63           0.96               1.00

d %>% filter(!colonial) %>% select(suspect_rate, net_opposition, net_opposition_raw, pop) %>% {wtd.cors(., weight = .$pop %>% sqrt())}

##                    suspect_rate net_opposition net_opposition_raw   pop
## suspect_rate               1.00           0.68               0.67 -0.29
## net_opposition             0.68           1.00               0.96  0.33
## net_opposition_raw         0.67           0.96               1.00  0.43
## pop                       -0.29           0.33               0.43  1.00

#regression models
regs = list(
  ols(net_opposition_raw ~ suspect_rate, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition_raw ~ suspect_rate + colonial, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition_raw ~ suspect_rate + colonial + Muslim, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition_raw ~ suspect_rate + colonial + Muslim + UN_continent, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition_raw ~ suspect_rate + colonial + Muslim + UN_macroregion, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt())
)
names(regs) = "model" + 1:length(regs)

#residuals for colonials
regs$model1 %>% {
  y = resid(.) %>% standardize()
  names(y) = d$origin
  y[d$colonial]
}

##            Indonesia Netherlands Antilles             Suriname 
##                 0.12                -2.49                -1.52

#full output
regs

## $model1
## Frequencies of Missing Values Due to Each Variable
## net_opposition_raw       suspect_rate          (weights) 
##                  3                  0                  0 
## 
## Linear Regression Model
##  
##  ols(formula = net_opposition_raw ~ suspect_rate, data = d %>% 
##      df_standardize(messages = F), weights = d$pop %>% sqrt())
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs       68    LR chi2     15.96    R2       0.209    
##  sigma10.7751    d.f.            1    R2 adj   0.197    
##  d.f.      66    Pr(> chi2) 0.0001    g        0.516    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.0798 -0.8557  0.2572  0.7076  1.4717 
##  
##  
##               Coef    S.E.   t     Pr(>|t|)
##  Intercept    -0.0344 0.1197 -0.29 0.7747  
##  suspect_rate  0.4579 0.1096  4.18 <0.0001 
##  
## 
## $model2
## Frequencies of Missing Values Due to Each Variable
## net_opposition_raw       suspect_rate           colonial          (weights) 
##                  3                  0                  0                  0 
## 
## Linear Regression Model
##  
##  ols(formula = net_opposition_raw ~ suspect_rate + colonial, data = d %>% 
##      df_standardize(messages = F), weights = d$pop %>% sqrt())
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      68    LR chi2     28.85    R2       0.346    
##  sigma9.8753    d.f.            2    R2 adj   0.326    
##  d.f.     65    Pr(> chi2) 0.0000    g        0.627    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -1.37032 -0.85693  0.04163  0.56805  1.28321 
##  
##  
##               Coef    S.E.   t     Pr(>|t|)
##  Intercept     0.1021 0.1158  0.88 0.3813  
##  suspect_rate  0.5602 0.1042  5.38 <0.0001 
##  colonial     -1.2215 0.3315 -3.68 0.0005  
##  
## 
## $model3
## Frequencies of Missing Values Due to Each Variable
## net_opposition_raw       suspect_rate           colonial             Muslim 
##                  3                  0                  0                  0 
##          (weights) 
##                  0 
## 
## Linear Regression Model
##  
##  ols(formula = net_opposition_raw ~ suspect_rate + colonial + 
##      Muslim, data = d %>% df_standardize(messages = F), weights = d$pop %>% 
##      sqrt())
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      68    LR chi2     65.96    R2       0.621    
##  sigma7.5758    d.f.            3    R2 adj   0.603    
##  d.f.     64    Pr(> chi2) 0.0000    g        0.786    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -1.0517 -0.5740  0.1009  0.5149  1.3823 
##  
##  
##               Coef    S.E.   t     Pr(>|t|)
##  Intercept     0.0081 0.0899  0.09 0.9288  
##  suspect_rate  0.3216 0.0873  3.68 0.0005  
##  colonial     -1.0882 0.2551 -4.27 <0.0001 
##  Muslim        0.5411 0.0794  6.82 <0.0001 
##  
## 
## $model4
## Frequencies of Missing Values Due to Each Variable
## net_opposition_raw       suspect_rate           colonial             Muslim 
##                  3                  0                  0                  0 
##       UN_continent          (weights) 
##                  0                  0 
## 
## Linear Regression Model
##  
##  ols(formula = net_opposition_raw ~ suspect_rate + colonial + 
##      Muslim + UN_continent, data = d %>% df_standardize(messages = F), 
##      weights = d$pop %>% sqrt())
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      68    LR chi2     84.52    R2       0.711    
##  sigma6.8262    d.f.            7    R2 adj   0.678    
##  d.f.     60    Pr(> chi2) 0.0000    g        0.956    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -0.8989 -0.4305 -0.1110  0.2779  1.6667 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -0.4217 0.1467 -2.88 0.0056  
##  suspect_rate           0.3376 0.1068  3.16 0.0025  
##  colonial              -1.1345 0.2680 -4.23 <0.0001 
##  Muslim                 0.3065 0.1012  3.03 0.0036  
##  UN_continent=Africa    0.7877 0.2902  2.71 0.0087  
##  UN_continent=Americas  0.3434 0.2686  1.28 0.2060  
##  UN_continent=Asia      0.9132 0.2396  3.81 0.0003  
##  UN_continent=Oceania  -0.6258 0.5546 -1.13 0.2636  
##  
## 
## $model5
## Frequencies of Missing Values Due to Each Variable
## net_opposition_raw       suspect_rate           colonial             Muslim 
##                  3                  0                  0                  0 
##     UN_macroregion          (weights) 
##                  0                  0 
## 
## Linear Regression Model
##  
##  ols(formula = net_opposition_raw ~ suspect_rate + colonial + 
##      Muslim + UN_macroregion, data = d %>% df_standardize(messages = F), 
##      weights = d$pop %>% sqrt())
##  
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs      68    LR chi2    172.55    R2       0.921    
##  sigma3.7321    d.f.           12    R2 adj   0.904    
##  d.f.     55    Pr(> chi2) 0.0000    g        1.133    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -0.63423 -0.22078 -0.04751  0.11800  0.94697 
##  
##  
##                                    Coef    S.E.   t      Pr(>|t|)
##  Intercept                         -1.0705 0.1050 -10.20 <0.0001 
##  suspect_rate                       0.2092 0.0729   2.87 0.0058  
##  colonial                          -1.0685 0.1897  -5.63 <0.0001 
##  Muslim                             0.2757 0.0780   3.54 0.0008  
##  UN_macroregion=Caribbean           0.8185 0.3207   2.55 0.0135  
##  UN_macroregion=Latin America       1.3221 0.1817   7.28 <0.0001 
##  UN_macroregion=Africa              1.6713 0.1999   8.36 <0.0001 
##  UN_macroregion=Eastern Asia        1.1008 0.2088   5.27 <0.0001 
##  UN_macroregion=Eastern Europe      2.0958 0.1836  11.42 <0.0001 
##  UN_macroregion=MENA                1.7515 0.2368   7.40 <0.0001 
##  UN_macroregion=South-Eastern Asia  1.5170 0.1956   7.76 <0.0001 
##  UN_macroregion=Southern Asia       1.4856 0.2244   6.62 <0.0001 
##  UN_macroregion=Southern Europe     0.5444 0.1802   3.02 0.0038  
##

#summary
regs %>% summarize_models()

Nonlinearities

Add a smoothing line (LOESS, default ggplot2 settings).

#scattersplots
GG_scatter(d, "IQ", "suspect_rate_RR", case_names = "origin") +
  xlab("Origin country national IQ") +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

GG_scatter(d, "Muslim", "suspect_rate_RR", case_names = "origin", text_pos = "tr") +
  xlab("Origin country Muslim% of population") +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#with weights 
GG_scatter(d, "IQ", "suspect_rate_RR", case_names = "origin", weights = d$pop %>% sqrt()) +
  xlab("Origin country national IQ") +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

GG_scatter(d, "Muslim", "suspect_rate_RR", case_names = "origin", text_pos = "tr", weights = d$pop %>% sqrt()) +
  xlab("Origin country Muslim% of population") +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Colonial as interaction

Colonial status as an interaction.

#regression models
regs = list(
  ols(net_opposition ~ suspect_rate, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate * colonial, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate * colonial + Muslim, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate * colonial + Muslim + UN_continent, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate * colonial + Muslim + UN_macroregion, data = d %>% df_standardize(messages = F), weights = d$pop %>% sqrt())
)

#summarize
regs %>% summarize_models(asterisks_only = F)

Without imputed Muslim%

#subset data
d_no_imp = d %>% filter(!ISO %in% imputed_muslim)

#models
regs = list(
  ols(net_opposition ~ suspect_rate, data = d_no_imp %>% df_standardize(messages = F), weights = d_no_imp$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate + colonial, data = d_no_imp %>% df_standardize(messages = F), weights = d_no_imp$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate + colonial + Muslim, data = d_no_imp  %>% df_standardize(messages = F), weights = d_no_imp$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate + colonial + Muslim + UN_continent, data = d_no_imp %>% df_standardize(messages = F), weights = d_no_imp$pop %>% sqrt()),
  ols(net_opposition ~ suspect_rate + colonial + Muslim + UN_macroregion, data = d_no_imp  %>% df_standardize(messages = F), weights = d_no_imp$pop %>% sqrt())
)

#summarize
regs %>% summarize_models()

Alternative crime outcome

#correlations
d %>% select(suspect_rate, suspect_rate_1st, suspect_rate_2nd,
             arrest_rate, arrest_rate_1st, arrest_rate_2nd,
             IQ, Muslim, net_opposition) %>% 
  wtd.cors()

##                  suspect_rate suspect_rate_1st suspect_rate_2nd arrest_rate
## suspect_rate             1.00             0.96             0.90        1.00
## suspect_rate_1st         0.96             1.00             0.80        0.96
## suspect_rate_2nd         0.90             0.80             1.00        0.89
## arrest_rate              1.00             0.96             0.89        1.00
## arrest_rate_1st          0.96             1.00             0.80        0.97
## arrest_rate_2nd          0.79             0.67             0.94        0.79
## IQ                      -0.66            -0.66            -0.59       -0.64
## Muslim                   0.44             0.39             0.43        0.44
## net_opposition           0.57             0.58             0.58        0.55
##                  arrest_rate_1st arrest_rate_2nd    IQ Muslim net_opposition
## suspect_rate                0.96            0.79 -0.66   0.44           0.57
## suspect_rate_1st            1.00            0.67 -0.66   0.39           0.58
## suspect_rate_2nd            0.80            0.94 -0.59   0.43           0.58
## arrest_rate                 0.97            0.79 -0.64   0.44           0.55
## arrest_rate_1st             1.00            0.68 -0.65   0.38           0.56
## arrest_rate_2nd             0.68            1.00 -0.43   0.34           0.47
## IQ                         -0.65           -0.43  1.00  -0.42          -0.71
## Muslim                      0.38            0.34 -0.42   1.00           0.66
## net_opposition              0.56            0.47 -0.71   0.66           1.00

#more decimals
d %>% select(suspect_rate, arrest_rate) %>% wtd.cor() %>% print(digits = 4)

## Warning in summary.lm(lm(stdz(y, weight = weight) ~ stdz(x, weight = weight), :
## essentially perfect fit: summary may be unreliable

## Warning in summary.lm(lm(stdz(y, weight = weight) ~ stdz(x, weight = weight), :
## essentially perfect fit: summary may be unreliable

## $correlation
##              suspect_rate arrest_rate
## suspect_rate       1.0000      0.9952
## arrest_rate        0.9952      1.0000
## 
## $std.err
##              suspect_rate arrest_rate
## suspect_rate    9.827e-17   1.172e-02
## arrest_rate     1.172e-02   2.804e-17
## 
## $t.value
##              suspect_rate arrest_rate
## suspect_rate    1.018e+16   8.490e+01
## arrest_rate     8.490e+01   3.566e+16
## 
## $p.value
##              suspect_rate arrest_rate
## suspect_rate    0.000e+00   1.532e-71
## arrest_rate     1.532e-71   0.000e+00

#models
regs = list(
  ols(net_opposition ~ arrest_rate, data = d_no_imp %>% df_standardize(messages = F), weights = d_no_imp$pop %>% sqrt()),
  ols(net_opposition ~ arrest_rate + colonial, data = d_no_imp %>% df_standardize(messages = F), weights = d_no_imp$pop %>% sqrt()),
  ols(net_opposition ~ arrest_rate + colonial + Muslim, data = d_no_imp  %>% df_standardize(messages = F), weights = d_no_imp$pop %>% sqrt()),
  ols(net_opposition ~ arrest_rate + colonial + Muslim + UN_continent, data = d_no_imp %>% df_standardize(messages = F), weights = d_no_imp$pop %>% sqrt()),
  ols(net_opposition ~ arrest_rate + colonial + Muslim + UN_macroregion, data = d_no_imp  %>% df_standardize(messages = F), weights = d_no_imp$pop %>% sqrt())
)

#summarize
regs %>% summarize_models()

Maps

#this one is missing too many units
# world = spData::world %>% mutate(ISO = pu_translate(name_long))
#http://thematicmapping.org/downloads/world_borders.php
world = sf::read_sf("data/TM_WORLD_BORDERS-0.3/TM_WORLD_BORDERS-0.3.shp")

#regions coding
world2 = left_join(world, chess2019, by = c("ISO3" = "ISO"))

#continents
map_continents = tm_shape(world2) +
  tm_fill("UN_continent", title = "Continents") +
  tm_borders()
map_continents

## Warning: The shape world2 is invalid. See sf::st_is_valid

tmap_save(map_continents, "figs/continents map.png")

## Warning: The shape world2 is invalid. See sf::st_is_valid

## Warning: The shape world2 is invalid. See sf::st_is_valid

## Map saved to /media/truecrypt1/Projects/Dutch immigrant crime 2019/figs/continents map.png

## Resolution: 2992 by 1474 pixels

## Size: 10 by 4.9 inches (300 dpi)

map_macroregions = tm_shape(world2) +
  tm_fill("UN_macroregion", title = "Macroregions") +
  tm_borders()
map_macroregions

## Warning: The shape world2 is invalid. See sf::st_is_valid

tmap_save(map_macroregions, "figs/macroregions map.png")

## Warning: The shape world2 is invalid. See sf::st_is_valid

## Warning: The shape world2 is invalid. See sf::st_is_valid

## Map saved to /media/truecrypt1/Projects/Dutch immigrant crime 2019/figs/macroregions map.png

## Resolution: 2992 by 1474 pixels

## Size: 10 by 4.9 inches (300 dpi)

map_regions = tm_shape(world2) +
  tm_fill("UN_region", title = "Regions") +
  tm_borders()
map_regions

## Warning: The shape world2 is invalid. See sf::st_is_valid

tmap_save(map_regions, "figs/regions map.png")

## Warning: The shape world2 is invalid. See sf::st_is_valid

## Warning: The shape world2 is invalid. See sf::st_is_valid

## Map saved to /media/truecrypt1/Projects/Dutch immigrant crime 2019/figs/regions map.png

## Resolution: 2992 by 1474 pixels

## Size: 10 by 4.9 inches (300 dpi)

#merge some ex-countries for plotting
world %<>% mutate(
  ISO = case_when((ISO3 %in% c("RUS") ~ "SUN"), #soviet is Russia
                  (ISO3 %in% c("CZE", "SVK")) ~ "CSK", #merge czech slovakia
                  (ISO3 %in% c("HRV", "SRB", "SVN", "MKD", "BIH", "MNE")) ~ "YUG", #merge yugoslavia
                  TRUE ~ ISO3
                  )
)

#merge table
d_spatial = full_join(world, d, by = "ISO")

#crime map
crime_map = tm_shape(d_spatial) +
  tm_fill("suspect_rate_RR", title = "Relative crime rate") +
  tm_borders()
crime_map

## Warning: The shape d_spatial is invalid. See sf::st_is_valid

tmap_save(crime_map, "figs/suspect_rate_map.png")

## Warning: The shape d_spatial is invalid. See sf::st_is_valid

## Warning: The shape d_spatial is invalid. See sf::st_is_valid

## Map saved to /media/truecrypt1/Projects/Dutch immigrant crime 2019/figs/suspect_rate_map.png

## Resolution: 2992 by 1474 pixels

## Size: 10 by 4.9 inches (300 dpi)

Print data

#print main variables
d %>% 
  select(origin, UN_continent, UN_macroregion, pop, suspect_rate, suspect_rate_RR, suspect_rate_1st_RR, suspect_rate_2nd_RR, Muslim, IQ, net_opposition)

Dutch immigrant crime 2019