Required packages

The below packages are used to preprocess the data and generate the report.

library(readr)
library(magrittr)
library(tidyr)
library(dplyr)
library(lubridate)
library(stringr)

Executive Summary

The steps undertaken to find the data, pre-process it and generate the report were:

Data

The two datasets used in this report were taken from the website Kaggle.

1.metal_bands_2017.csv Dataset

URL https://www.kaggle.com/mrpantherson/metal-by-nation?select=metal_bands_2017.csv

Data Description :
The dataset with 5000 Observations and 6 attributes, is a list of metal music bands from around the world, with their origin and count of fans as of 2017.

2.world_population_1960_2015.csv Dataset

URL :https://www.kaggle.com/mrpantherson/metal-by-nation?select=world_population_1960_2015.csv

Data Description:
The dataset contains population of countries from 1960 to 2015 and has 57 attributes and 264 observations.

# Loading data

world <- read_csv("world_population_1960_2015.csv")

dim(world) 
## [1] 264  57
str(world) 
## tibble [264 x 57] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Country Name: chr [1:264] "Aruba" "Andorra" "Afghanistan" "Angola" ...
##  $ 1960        : num [1:264] 54208 13414 8994793 5270844 1608800 ...
##  $ 1961        : num [1:264] 55435 14376 9164945 5367287 1659800 ...
##  $ 1962        : num [1:264] 56226 15376 9343772 5465905 1711319 ...
##  $ 1963        : num [1:264] 56697 16410 9531555 5565808 1762621 ...
##  $ 1964        : num [1:264] 57029 17470 9728645 5665701 1814135 ...
##  $ 1965        : num [1:264] 57360 18551 9935358 5765025 1864791 ...
##  $ 1966        : num [1:264] 57712 19646 10148841 5863568 1914573 ...
##  $ 1967        : num [1:264] 58049 20755 10368600 5962831 1965598 ...
##  $ 1968        : num [1:264] 58385 21888 10599790 6066094 2022272 ...
##  $ 1969        : num [1:264] 58724 23061 10849510 6177703 2081695 ...
##  $ 1970        : num [1:264] 59065 24279 11121097 6300969 2135479 ...
##  $ 1971        : num [1:264] 59438 25560 11412821 6437645 2187853 ...
##  $ 1972        : num [1:264] 59849 26892 11716896 6587647 2243126 ...
##  $ 1973        : num [1:264] 60239 28231 12022514 6750215 2296752 ...
##  $ 1974        : num [1:264] 60525 29514 12315553 6923749 2350124 ...
##  $ 1975        : num [1:264] 60655 30706 12582954 7107334 2404831 ...
##  $ 1976        : num [1:264] 60589 31781 12831361 7299508 2458526 ...
##  $ 1977        : num [1:264] 60366 32769 13056499 7501320 2513546 ...
##  $ 1978        : num [1:264] 60106 33746 13222547 7717139 2566266 ...
##  $ 1979        : num [1:264] 59978 34819 13283279 7952882 2617832 ...
##  $ 1980        : num [1:264] 60096 36063 13211412 8211950 2671997 ...
##  $ 1981        : num [1:264] 60567 37502 12996923 8497950 2726056 ...
##  $ 1982        : num [1:264] 61344 39112 12667001 8807511 2784278 ...
##  $ 1983        : num [1:264] 62204 40862 12279095 9128655 2843960 ...
##  $ 1984        : num [1:264] 62831 42704 11912510 9444918 2904429 ...
##  $ 1985        : num [1:264] 63028 44597 11630498 9745209 2964762 ...
##  $ 1986        : num [1:264] 62644 46515 11438949 10023700 3022635 ...
##  $ 1987        : num [1:264] 61835 48458 11337932 10285712 3083605 ...
##  $ 1988        : num [1:264] 61077 50431 11375768 10544904 3142336 ...
##  $ 1989        : num [1:264] 61032 52449 11608351 10820992 3227943 ...
##  $ 1990        : num [1:264] 62148 54511 12067570 11127870 3286542 ...
##  $ 1991        : num [1:264] 64623 56674 12789374 11472173 3266790 ...
##  $ 1992        : num [1:264] 68235 58904 13745630 11848971 3247039 ...
##  $ 1993        : num [1:264] 72498 61003 14824371 12246786 3227287 ...
##  $ 1994        : num [1:264] 76700 62707 15869967 12648483 3207536 ...
##  $ 1995        : num [1:264] 80326 63854 16772522 13042666 3187784 ...
##  $ 1996        : num [1:264] 83195 64291 17481800 13424813 3168033 ...
##  $ 1997        : num [1:264] 85447 64147 18034130 13801868 3148281 ...
##  $ 1998        : num [1:264] 87276 63888 18511480 14187710 3128530 ...
##  $ 1999        : num [1:264] 89004 64161 19038420 14601983 3108778 ...
##  $ 2000        : num [1:264] 90858 65399 19701940 15058638 3089027 ...
##  $ 2001        : num [1:264] 92894 67770 20531160 15562791 3060173 ...
##  $ 2002        : num [1:264] 94995 71046 21487079 16109696 3051010 ...
##  $ 2003        : num [1:264] 97015 74783 22507368 16691395 3039616 ...
##  $ 2004        : num [1:264] 98742 78337 23499850 17295500 3026939 ...
##  $ 2005        : num [1:264] 100031 81223 24399948 17912942 3011487 ...
##  $ 2006        : num [1:264] 100830 83373 25183615 18541467 2992547 ...
##  $ 2007        : num [1:264] 101218 84878 25877544 19183907 2970017 ...
##  $ 2008        : num [1:264] 101342 85616 26528741 19842251 2947314 ...
##  $ 2009        : num [1:264] 101416 85474 27207291 20520103 2927519 ...
##  $ 2010        : num [1:264] 101597 84419 27962207 21219954 2913021 ...
##  $ 2011        : num [1:264] 101936 82326 28809167 21942296 2904780 ...
##  $ 2012        : num [1:264] 102393 79316 29726803 22685632 2900247 ...
##  $ 2013        : num [1:264] 102921 75902 30682500 23448202 2896652 ...
##  $ 2014        : num [1:264] 103441 72786 31627506 24227524 2893654 ...
##  $ 2015        : num [1:264] 103889 70473 32526562 25021974 2889167 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Country Name` = col_character(),
##   ..   `1960` = col_double(),
##   ..   `1961` = col_double(),
##   ..   `1962` = col_double(),
##   ..   `1963` = col_double(),
##   ..   `1964` = col_double(),
##   ..   `1965` = col_double(),
##   ..   `1966` = col_double(),
##   ..   `1967` = col_double(),
##   ..   `1968` = col_double(),
##   ..   `1969` = col_double(),
##   ..   `1970` = col_double(),
##   ..   `1971` = col_double(),
##   ..   `1972` = col_double(),
##   ..   `1973` = col_double(),
##   ..   `1974` = col_double(),
##   ..   `1975` = col_double(),
##   ..   `1976` = col_double(),
##   ..   `1977` = col_double(),
##   ..   `1978` = col_double(),
##   ..   `1979` = col_double(),
##   ..   `1980` = col_double(),
##   ..   `1981` = col_double(),
##   ..   `1982` = col_double(),
##   ..   `1983` = col_double(),
##   ..   `1984` = col_double(),
##   ..   `1985` = col_double(),
##   ..   `1986` = col_double(),
##   ..   `1987` = col_double(),
##   ..   `1988` = col_double(),
##   ..   `1989` = col_double(),
##   ..   `1990` = col_double(),
##   ..   `1991` = col_double(),
##   ..   `1992` = col_double(),
##   ..   `1993` = col_double(),
##   ..   `1994` = col_double(),
##   ..   `1995` = col_double(),
##   ..   `1996` = col_double(),
##   ..   `1997` = col_double(),
##   ..   `1998` = col_double(),
##   ..   `1999` = col_double(),
##   ..   `2000` = col_double(),
##   ..   `2001` = col_double(),
##   ..   `2002` = col_double(),
##   ..   `2003` = col_double(),
##   ..   `2004` = col_double(),
##   ..   `2005` = col_double(),
##   ..   `2006` = col_double(),
##   ..   `2007` = col_double(),
##   ..   `2008` = col_double(),
##   ..   `2009` = col_double(),
##   ..   `2010` = col_double(),
##   ..   `2011` = col_double(),
##   ..   `2012` = col_double(),
##   ..   `2013` = col_double(),
##   ..   `2014` = col_double(),
##   ..   `2015` = col_double()
##   .. )
head(world)
metal <- read_csv("metal_bands_2017.csv")

dim(metal) 
## [1] 5000    7
str(metal)
## tibble [5,000 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ X1       : num [1:5000] 0 1 2 3 4 5 6 7 8 9 ...
##  $ band_name: chr [1:5000] "Iron Maiden" "Opeth" "Metallica" "Megadeth" ...
##  $ fans     : num [1:5000] 4195 4147 3712 3105 3054 ...
##  $ formed   : num [1:5000] 1975 1990 1981 1983 1988 ...
##  $ origin   : chr [1:5000] "United Kingdom" "Sweden" "USA" "USA" ...
##  $ split    : chr [1:5000] "-" "1990" "-" "1983" ...
##  $ style    : chr [1:5000] "New wave of british heavy,Heavy" "Extreme progressive,Progressive rock,Progressive" "Heavy,Bay area thrash" "Thrash,Heavy,Hard rock" ...
##  - attr(*, "problems")= tibble [4 x 5] (S3: tbl_df/tbl/data.frame)
##   ..$ row     : int [1:4] 4443 4600 4822 4949
##   ..$ col     : chr [1:4] "formed" "formed" "formed" "formed"
##   ..$ expected: chr [1:4] "a double" "a double" "a double" "a double"
##   ..$ actual  : chr [1:4] "-" "-" "-" "-"
##   ..$ file    : chr [1:4] "'metal_bands_2017.csv'" "'metal_bands_2017.csv'" "'metal_bands_2017.csv'" "'metal_bands_2017.csv'"
##  - attr(*, "spec")=
##   .. cols(
##   ..   X1 = col_double(),
##   ..   band_name = col_character(),
##   ..   fans = col_double(),
##   ..   formed = col_double(),
##   ..   origin = col_character(),
##   ..   split = col_character(),
##   ..   style = col_character()
##   .. )
head(metal)

Tidy & Manipulate Data I

The world population dataset violates the “Tidy Data Principles”, by having variables as column headers. This is resolved by using the gather function from the tidyr package.

# Tidy Data Principles check

world1 <- world %>% gather(key = "year", value = "count", -"Country Name") 
world2 <- aggregate(count ~ `Country Name`, world1, sum)

world <- data.frame(world2,stringsAsFactors = FALSE)

head(world)

Understand

# Dataset description

str(world)
## 'data.frame':    263 obs. of  2 variables:
##  $ Country.Name: chr  "Afghanistan" "Albania" "Algeria" "American Samoa" ...
##  $ count       : num  9.20e+08 1.51e+08 1.36e+09 2.35e+06 2.78e+06 ...
#
metal<-data.frame(metal,stringsAsFactors = FALSE)

metal <- metal %>% select(`band_name`, `fans`, `formed`, `origin`, `split`)

metal$formed <- year(as.Date(as.character(metal$formed), format="%Y"))

metal$split[metal$split != "-"] <- "Not Split"
metal$split[metal$split == "-"] <- "Split"

metal$split <- factor(c("Split", "Not Split"), levels = c("Not Split", "Split"));
str(metal)
## 'data.frame':    5000 obs. of  5 variables:
##  $ band_name: chr  "Iron Maiden" "Opeth" "Metallica" "Megadeth" ...
##  $ fans     : num  4195 4147 3712 3105 3054 ...
##  $ formed   : num  1975 1990 1981 1983 1988 ...
##  $ origin   : chr  "United Kingdom" "Sweden" "USA" "USA" ...
##  $ split    : Factor w/ 2 levels "Not Split","Split": 2 1 2 1 2 1 2 1 2 1 ...

Scan I

# Scan the data for missing values, special values and obvious errors
sapply(metal, function(x) sum(is.na(x)))
## band_name      fans    formed    origin     split 
##         0         0         4         8         0
metal <- metal %>% na.omit()
metal <- metal %>% filter(metal$origin != "Taiwan")
metal <- metal %>% filter(metal$fans != 0)

sapply(metal, function(x) sum( is.infinite(x)))
## band_name      fans    formed    origin     split 
##         0         0         0         0         0
sapply(metal, function(x) sum( is.nan(x)))
## band_name      fans    formed    origin     split 
##         0         0         0         0         0
head(metal)

Data

Before merging the data is processed by matching the country names in both datasets and removing bands with no fans.

# Merging the 2 datasets

metal$origin <- metal$origin %>% str_replace_all(c("Korea, South" = "South Korea", "Jersey" = "USA",  "The Netherlands" = "Netherlands", "Kyrgyzstan" = "Kazakhstan"))


metal <- metal %>% mutate(origin = strsplit(as.character(origin), ",")) %>% unnest(origin)
metal$origin  <- trimws(metal$origin)

metal <- metal[!duplicated(metal), ]

# finding mismatched country names
diff <- setdiff(metal$origin, world$`Country.Name`)
diff
## [1] "USA"         "Russia"      "Iran"        "UAE"         "Egypt"      
## [6] "Syria"       "South Korea" "Macedonia"   "Venezuela"
world$`Country.Name` <- world$`Country.Name` %>% str_replace_all(c("United States" = "USA", "Russian Federation" = "Russia", "Iran, Islamic Rep." = "Iran", "United Arab Emirates" = "UAE", 
                                                                   "Egypt, Arab Rep." = "Egypt", "Syrian Arab Republic" = "Syria", "Korea, Rep." = "South Korea", "Macedonia, FYR" = "Macedonia", 
                                                                   "Venezuela, RB" = "Venezuela"))

diff <- setdiff(metal$origin, world$`Country.Name`)
diff
## character(0)
#merge
data <- metal %>% left_join(world, c('origin' = 'Country.Name'))
dim(data)
## [1] 4943    6

Tidy & Manipulate Data II

#  relative count of fans wrt population
data <- mutate(data, 'fans_prop' = fans/count*100000)

head(data)

Scan II

# Identifying Outliers

par(mfrow=c(2,2))
boxplot(data$fans,main="Fans")
boxplot(data$count,main="Population")
boxplot(data$fans_prop,main="Proportion")

#Dealing with outliers
cap <- function(x){
  quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ))
  x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
  x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
  x
}

data$fans <- data$fans %>%cap
data$count <- data$count %>%  cap
data$fans_prop <- data$fans_prop %>%  cap


par(mfrow=c(2,2))

boxplot(data$fans,main="Fans")
boxplot(data$count,main="Population")
boxplot(data$fans_prop,main="Proportion")

Transform

# Checking Normality of the numeric attribute

data$fans %>% hist(freq=FALSE, main="Frequency Distribution",xlab = "Fans prop")
data$fans  %>% mean() %>% abline(v=., col='red', lw=2)
data$fans %>% median() %>% abline(v=., col='green', lw=2)
lines(seq(-15, 0), dnorm(seq(-15,0), mean(data$fans), sd(data$fans )), col="blue")

# Applying appropriate data transformation

log(data$fans) %>% hist(freq=FALSE, xlim=c(-1,6),  main="Frequency Distribution",xlab = "Fans prop")
log(data$fans) %>% mean() %>% abline(v=., col='red', lw=2)
log(data$fans) %>% median() %>% abline(v=., col='green', lw=2)
lines(seq(-1, 6), dnorm(seq(-1,6), mean(log(data$fans)), sd(log(data$fans) )), col="blue")

Reference