The below packages are used to preprocess the data and generate the report.
library(readr)
library(magrittr)
library(tidyr)
library(dplyr)
library(lubridate)
library(stringr)
The steps undertaken to find the data, pre-process it and generate the report were:
The two datasets used in this report were taken from the website Kaggle.
1.metal_bands_2017.csv Dataset
URL https://www.kaggle.com/mrpantherson/metal-by-nation?select=metal_bands_2017.csv
Data Description :
The dataset with 5000 Observations and 6 attributes, is a list of metal music bands from around the world, with their origin and count of fans as of 2017.
2.world_population_1960_2015.csv Dataset
URL :https://www.kaggle.com/mrpantherson/metal-by-nation?select=world_population_1960_2015.csv
Data Description:
The dataset contains population of countries from 1960 to 2015 and has 57 attributes and 264 observations.
Country Name: Name of the country
19960-2015: population in each country for the respective year
The datasets are imported from the csv file into R using readr functions which has default stringAsFactors as False to avoid unwanted conversion of strings to factors
Dataset structures, dimension and first 6 observations are displayed
# Loading data
world <- read_csv("world_population_1960_2015.csv")
dim(world)
## [1] 264 57
str(world)
## tibble [264 x 57] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Country Name: chr [1:264] "Aruba" "Andorra" "Afghanistan" "Angola" ...
## $ 1960 : num [1:264] 54208 13414 8994793 5270844 1608800 ...
## $ 1961 : num [1:264] 55435 14376 9164945 5367287 1659800 ...
## $ 1962 : num [1:264] 56226 15376 9343772 5465905 1711319 ...
## $ 1963 : num [1:264] 56697 16410 9531555 5565808 1762621 ...
## $ 1964 : num [1:264] 57029 17470 9728645 5665701 1814135 ...
## $ 1965 : num [1:264] 57360 18551 9935358 5765025 1864791 ...
## $ 1966 : num [1:264] 57712 19646 10148841 5863568 1914573 ...
## $ 1967 : num [1:264] 58049 20755 10368600 5962831 1965598 ...
## $ 1968 : num [1:264] 58385 21888 10599790 6066094 2022272 ...
## $ 1969 : num [1:264] 58724 23061 10849510 6177703 2081695 ...
## $ 1970 : num [1:264] 59065 24279 11121097 6300969 2135479 ...
## $ 1971 : num [1:264] 59438 25560 11412821 6437645 2187853 ...
## $ 1972 : num [1:264] 59849 26892 11716896 6587647 2243126 ...
## $ 1973 : num [1:264] 60239 28231 12022514 6750215 2296752 ...
## $ 1974 : num [1:264] 60525 29514 12315553 6923749 2350124 ...
## $ 1975 : num [1:264] 60655 30706 12582954 7107334 2404831 ...
## $ 1976 : num [1:264] 60589 31781 12831361 7299508 2458526 ...
## $ 1977 : num [1:264] 60366 32769 13056499 7501320 2513546 ...
## $ 1978 : num [1:264] 60106 33746 13222547 7717139 2566266 ...
## $ 1979 : num [1:264] 59978 34819 13283279 7952882 2617832 ...
## $ 1980 : num [1:264] 60096 36063 13211412 8211950 2671997 ...
## $ 1981 : num [1:264] 60567 37502 12996923 8497950 2726056 ...
## $ 1982 : num [1:264] 61344 39112 12667001 8807511 2784278 ...
## $ 1983 : num [1:264] 62204 40862 12279095 9128655 2843960 ...
## $ 1984 : num [1:264] 62831 42704 11912510 9444918 2904429 ...
## $ 1985 : num [1:264] 63028 44597 11630498 9745209 2964762 ...
## $ 1986 : num [1:264] 62644 46515 11438949 10023700 3022635 ...
## $ 1987 : num [1:264] 61835 48458 11337932 10285712 3083605 ...
## $ 1988 : num [1:264] 61077 50431 11375768 10544904 3142336 ...
## $ 1989 : num [1:264] 61032 52449 11608351 10820992 3227943 ...
## $ 1990 : num [1:264] 62148 54511 12067570 11127870 3286542 ...
## $ 1991 : num [1:264] 64623 56674 12789374 11472173 3266790 ...
## $ 1992 : num [1:264] 68235 58904 13745630 11848971 3247039 ...
## $ 1993 : num [1:264] 72498 61003 14824371 12246786 3227287 ...
## $ 1994 : num [1:264] 76700 62707 15869967 12648483 3207536 ...
## $ 1995 : num [1:264] 80326 63854 16772522 13042666 3187784 ...
## $ 1996 : num [1:264] 83195 64291 17481800 13424813 3168033 ...
## $ 1997 : num [1:264] 85447 64147 18034130 13801868 3148281 ...
## $ 1998 : num [1:264] 87276 63888 18511480 14187710 3128530 ...
## $ 1999 : num [1:264] 89004 64161 19038420 14601983 3108778 ...
## $ 2000 : num [1:264] 90858 65399 19701940 15058638 3089027 ...
## $ 2001 : num [1:264] 92894 67770 20531160 15562791 3060173 ...
## $ 2002 : num [1:264] 94995 71046 21487079 16109696 3051010 ...
## $ 2003 : num [1:264] 97015 74783 22507368 16691395 3039616 ...
## $ 2004 : num [1:264] 98742 78337 23499850 17295500 3026939 ...
## $ 2005 : num [1:264] 100031 81223 24399948 17912942 3011487 ...
## $ 2006 : num [1:264] 100830 83373 25183615 18541467 2992547 ...
## $ 2007 : num [1:264] 101218 84878 25877544 19183907 2970017 ...
## $ 2008 : num [1:264] 101342 85616 26528741 19842251 2947314 ...
## $ 2009 : num [1:264] 101416 85474 27207291 20520103 2927519 ...
## $ 2010 : num [1:264] 101597 84419 27962207 21219954 2913021 ...
## $ 2011 : num [1:264] 101936 82326 28809167 21942296 2904780 ...
## $ 2012 : num [1:264] 102393 79316 29726803 22685632 2900247 ...
## $ 2013 : num [1:264] 102921 75902 30682500 23448202 2896652 ...
## $ 2014 : num [1:264] 103441 72786 31627506 24227524 2893654 ...
## $ 2015 : num [1:264] 103889 70473 32526562 25021974 2889167 ...
## - attr(*, "spec")=
## .. cols(
## .. `Country Name` = col_character(),
## .. `1960` = col_double(),
## .. `1961` = col_double(),
## .. `1962` = col_double(),
## .. `1963` = col_double(),
## .. `1964` = col_double(),
## .. `1965` = col_double(),
## .. `1966` = col_double(),
## .. `1967` = col_double(),
## .. `1968` = col_double(),
## .. `1969` = col_double(),
## .. `1970` = col_double(),
## .. `1971` = col_double(),
## .. `1972` = col_double(),
## .. `1973` = col_double(),
## .. `1974` = col_double(),
## .. `1975` = col_double(),
## .. `1976` = col_double(),
## .. `1977` = col_double(),
## .. `1978` = col_double(),
## .. `1979` = col_double(),
## .. `1980` = col_double(),
## .. `1981` = col_double(),
## .. `1982` = col_double(),
## .. `1983` = col_double(),
## .. `1984` = col_double(),
## .. `1985` = col_double(),
## .. `1986` = col_double(),
## .. `1987` = col_double(),
## .. `1988` = col_double(),
## .. `1989` = col_double(),
## .. `1990` = col_double(),
## .. `1991` = col_double(),
## .. `1992` = col_double(),
## .. `1993` = col_double(),
## .. `1994` = col_double(),
## .. `1995` = col_double(),
## .. `1996` = col_double(),
## .. `1997` = col_double(),
## .. `1998` = col_double(),
## .. `1999` = col_double(),
## .. `2000` = col_double(),
## .. `2001` = col_double(),
## .. `2002` = col_double(),
## .. `2003` = col_double(),
## .. `2004` = col_double(),
## .. `2005` = col_double(),
## .. `2006` = col_double(),
## .. `2007` = col_double(),
## .. `2008` = col_double(),
## .. `2009` = col_double(),
## .. `2010` = col_double(),
## .. `2011` = col_double(),
## .. `2012` = col_double(),
## .. `2013` = col_double(),
## .. `2014` = col_double(),
## .. `2015` = col_double()
## .. )
head(world)
metal <- read_csv("metal_bands_2017.csv")
dim(metal)
## [1] 5000 7
str(metal)
## tibble [5,000 x 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ X1 : num [1:5000] 0 1 2 3 4 5 6 7 8 9 ...
## $ band_name: chr [1:5000] "Iron Maiden" "Opeth" "Metallica" "Megadeth" ...
## $ fans : num [1:5000] 4195 4147 3712 3105 3054 ...
## $ formed : num [1:5000] 1975 1990 1981 1983 1988 ...
## $ origin : chr [1:5000] "United Kingdom" "Sweden" "USA" "USA" ...
## $ split : chr [1:5000] "-" "1990" "-" "1983" ...
## $ style : chr [1:5000] "New wave of british heavy,Heavy" "Extreme progressive,Progressive rock,Progressive" "Heavy,Bay area thrash" "Thrash,Heavy,Hard rock" ...
## - attr(*, "problems")= tibble [4 x 5] (S3: tbl_df/tbl/data.frame)
## ..$ row : int [1:4] 4443 4600 4822 4949
## ..$ col : chr [1:4] "formed" "formed" "formed" "formed"
## ..$ expected: chr [1:4] "a double" "a double" "a double" "a double"
## ..$ actual : chr [1:4] "-" "-" "-" "-"
## ..$ file : chr [1:4] "'metal_bands_2017.csv'" "'metal_bands_2017.csv'" "'metal_bands_2017.csv'" "'metal_bands_2017.csv'"
## - attr(*, "spec")=
## .. cols(
## .. X1 = col_double(),
## .. band_name = col_character(),
## .. fans = col_double(),
## .. formed = col_double(),
## .. origin = col_character(),
## .. split = col_character(),
## .. style = col_character()
## .. )
head(metal)
The world population dataset violates the “Tidy Data Principles”, by having variables as column headers. This is resolved by using the gather function from the tidyr package.
# Tidy Data Principles check
world1 <- world %>% gather(key = "year", value = "count", -"Country Name")
world2 <- aggregate(count ~ `Country Name`, world1, sum)
world <- data.frame(world2,stringsAsFactors = FALSE)
head(world)
# Dataset description
str(world)
## 'data.frame': 263 obs. of 2 variables:
## $ Country.Name: chr "Afghanistan" "Albania" "Algeria" "American Samoa" ...
## $ count : num 9.20e+08 1.51e+08 1.36e+09 2.35e+06 2.78e+06 ...
#
metal<-data.frame(metal,stringsAsFactors = FALSE)
metal <- metal %>% select(`band_name`, `fans`, `formed`, `origin`, `split`)
metal$formed <- year(as.Date(as.character(metal$formed), format="%Y"))
metal$split[metal$split != "-"] <- "Not Split"
metal$split[metal$split == "-"] <- "Split"
metal$split <- factor(c("Split", "Not Split"), levels = c("Not Split", "Split"));
str(metal)
## 'data.frame': 5000 obs. of 5 variables:
## $ band_name: chr "Iron Maiden" "Opeth" "Metallica" "Megadeth" ...
## $ fans : num 4195 4147 3712 3105 3054 ...
## $ formed : num 1975 1990 1981 1983 1988 ...
## $ origin : chr "United Kingdom" "Sweden" "USA" "USA" ...
## $ split : Factor w/ 2 levels "Not Split","Split": 2 1 2 1 2 1 2 1 2 1 ...
# Scan the data for missing values, special values and obvious errors
sapply(metal, function(x) sum(is.na(x)))
## band_name fans formed origin split
## 0 0 4 8 0
metal <- metal %>% na.omit()
metal <- metal %>% filter(metal$origin != "Taiwan")
metal <- metal %>% filter(metal$fans != 0)
sapply(metal, function(x) sum( is.infinite(x)))
## band_name fans formed origin split
## 0 0 0 0 0
sapply(metal, function(x) sum( is.nan(x)))
## band_name fans formed origin split
## 0 0 0 0 0
head(metal)
Before merging the data is processed by matching the country names in both datasets and removing bands with no fans.
# Merging the 2 datasets
metal$origin <- metal$origin %>% str_replace_all(c("Korea, South" = "South Korea", "Jersey" = "USA", "The Netherlands" = "Netherlands", "Kyrgyzstan" = "Kazakhstan"))
metal <- metal %>% mutate(origin = strsplit(as.character(origin), ",")) %>% unnest(origin)
metal$origin <- trimws(metal$origin)
metal <- metal[!duplicated(metal), ]
# finding mismatched country names
diff <- setdiff(metal$origin, world$`Country.Name`)
diff
## [1] "USA" "Russia" "Iran" "UAE" "Egypt"
## [6] "Syria" "South Korea" "Macedonia" "Venezuela"
world$`Country.Name` <- world$`Country.Name` %>% str_replace_all(c("United States" = "USA", "Russian Federation" = "Russia", "Iran, Islamic Rep." = "Iran", "United Arab Emirates" = "UAE",
"Egypt, Arab Rep." = "Egypt", "Syrian Arab Republic" = "Syria", "Korea, Rep." = "South Korea", "Macedonia, FYR" = "Macedonia",
"Venezuela, RB" = "Venezuela"))
diff <- setdiff(metal$origin, world$`Country.Name`)
diff
## character(0)
#merge
data <- metal %>% left_join(world, c('origin' = 'Country.Name'))
dim(data)
## [1] 4943 6
# relative count of fans wrt population
data <- mutate(data, 'fans_prop' = fans/count*100000)
head(data)
# Identifying Outliers
par(mfrow=c(2,2))
boxplot(data$fans,main="Fans")
boxplot(data$count,main="Population")
boxplot(data$fans_prop,main="Proportion")
#Dealing with outliers
cap <- function(x){
quantiles <- quantile( x, c(.05, 0.25, 0.75, .95 ))
x[ x < quantiles[2] - 1.5*IQR(x) ] <- quantiles[1]
x[ x > quantiles[3] + 1.5*IQR(x) ] <- quantiles[4]
x
}
data$fans <- data$fans %>%cap
data$count <- data$count %>% cap
data$fans_prop <- data$fans_prop %>% cap
par(mfrow=c(2,2))
boxplot(data$fans,main="Fans")
boxplot(data$count,main="Population")
boxplot(data$fans_prop,main="Proportion")
# Checking Normality of the numeric attribute
data$fans %>% hist(freq=FALSE, main="Frequency Distribution",xlab = "Fans prop")
data$fans %>% mean() %>% abline(v=., col='red', lw=2)
data$fans %>% median() %>% abline(v=., col='green', lw=2)
lines(seq(-15, 0), dnorm(seq(-15,0), mean(data$fans), sd(data$fans )), col="blue")
# Applying appropriate data transformation
log(data$fans) %>% hist(freq=FALSE, xlim=c(-1,6), main="Frequency Distribution",xlab = "Fans prop")
log(data$fans) %>% mean() %>% abline(v=., col='red', lw=2)
log(data$fans) %>% median() %>% abline(v=., col='green', lw=2)
lines(seq(-1, 6), dnorm(seq(-1,6), mean(log(data$fans)), sd(log(data$fans) )), col="blue")