About

The R code for this project was written back in 2015, when I was quite new to R. As a result, it is quite terrible, with lots of DRY violations. I have changed it a little since then to fix bugs introduced when I renamed some functions. There are no serious changes with respect to the overall code structure.

Startup

Load packages and set options.

# Print numbers with 2 significant digits by default
options(digits = 2)
# pacman's p_load() attaches the listed packages (per the pacman docs it also
# installs any that are missing).
# NOTE(review): ggplot(), describe() and GG_save() used below are not loaded
# explicitly here; presumably they come in via kirkegaard. Confirm.
library(pacman)
p_load(kirkegaard, jsonlite, stringi, plyr, devtools,
       VIM, scales, readODS)

Data

Load data.

# Load the name data from names.csv (read as UTF-8, strings kept as character).
# FALSE is spelled out because `F` is an ordinary variable that can be
# reassigned.
names.df <- read.csv(
  "names.csv",
  stringsAsFactors = FALSE,
  encoding = "UTF-8"
)
# Use the `name` column as row names so rows can be indexed by name
rownames(names.df) <- names.df$name

EDA

Plot some things of interest.

# Initial exploratory plots: plain histograms
# Histogram of `number` on a log10 x axis, with breaks at rounded powers of 10
ggplot(names.df, aes(x = number)) +
  geom_histogram() +
  scale_x_log10(breaks = trans_breaks("log10", function(x) round(10^x))) +
  labs(x = "Number of persons", y = "Number of names")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Save the figure
GG_save("figures/number.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Summary statistics for `number`
describe(names.df$number)
##    vars    n mean   sd median trimmed mad min   max range skew kurtosis
## X1    1 2358 2185 5714    309     741 277 100 50094 49994  4.6       24
##     se
## X1 118
# Histogram of the `age` column
ggplot(names.df, aes(x = age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Save the figure
GG_save("figures/age.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of the `own.place` column (rows with missing values are dropped,
# see the warning below)
ggplot(names.df, aes(x = own.place)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 455 rows containing non-finite values (stat_bin).

# Save the figure
GG_save("figures/own_place.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 455 rows containing non-finite values (stat_bin).
# Histogram of the `conviction` column, x axis labelled as a percentage
ggplot(names.df, aes(x = conviction)) +
  geom_histogram() +
  labs(x = "conviction [%]")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Save the figure
GG_save("figures/conviction.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of the `income` column (rows with missing values are dropped,
# see the warning below)
ggplot(names.df, aes(x = income)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 526 rows containing non-finite values (stat_bin).

# Save the figure
GG_save("figures/income.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 526 rows containing non-finite values (stat_bin).
# Income against age, coloured by gender, with one smoother per gender.
# geom_smooth() inherits the colour mapping from the plot-level aes().
ggplot(names.df, aes(x = age, y = income, color = gender)) +
  geom_point() +
  geom_smooth() +
  labs(y = "Monthly income in DKK")
## `geom_smooth()` using method = 'gam'
## Warning: Removed 526 rows containing non-finite values (stat_smooth).
## Warning: Removed 526 rows containing missing values (geom_point).

# Save the figure
GG_save("figures/age_gender_income.png")
## `geom_smooth()` using method = 'gam'
## Warning: Removed 526 rows containing non-finite values (stat_smooth).

## Warning: Removed 526 rows containing missing values (geom_point).
# Conviction against age, coloured by gender, with one smoother per gender.
# geom_smooth() inherits the colour mapping from the plot-level aes().
ggplot(names.df, aes(x = age, y = conviction, color = gender)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'

# Save the figure
GG_save("figures/age_gender_conviction.png")
## `geom_smooth()` using method = 'gam'
# `married` against age, coloured by gender, with one smoother per gender.
# geom_smooth() inherits the colour mapping from the plot-level aes().
ggplot(names.df, aes(x = age, y = married, color = gender)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 455 rows containing non-finite values (stat_smooth).
## Warning: Removed 455 rows containing missing values (geom_point).

# Save the figure
GG_save("figures/age_gender_married.png")
## `geom_smooth()` using method = 'gam'
## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).
# `own.place` against age, coloured by gender, with one smoother per gender.
# geom_smooth() inherits the colour mapping from the plot-level aes().
ggplot(names.df, aes(x = age, y = own.place, color = gender)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).

# Save the figure
GG_save("figures/age_gender_ownPlace.png")
## `geom_smooth()` using method = 'gam'
## Warning: Removed 455 rows containing non-finite values (stat_smooth).

## Warning: Removed 455 rows containing missing values (geom_point).
# `no.job` against age, coloured by gender, with one smoother per gender.
# geom_smooth() inherits the colour mapping from the plot-level aes().
ggplot(names.df, aes(x = age, y = no.job, color = gender)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'gam'
## Warning: Removed 526 rows containing non-finite values (stat_smooth).
## Warning: Removed 526 rows containing missing values (geom_point).