Dataset source: https://github.com/RMHogervorst/unicorns_on_unicycles

Import Data:

library(readxl)
observations <- read_excel("~/public datset/githubs/unicorns_on_unicycles-master/observations.xlsx")
View(observations)

sales <- read_excel("~/public datset/githubs/unicorns_on_unicycles-master/sales.xlsx")
View(sales)

Examining & Cleaning Data:

We noticed above that both datasets observations & sales have 42 obs. & a haeder. Let’s explore further.

head(observations)

## # A tibble: 6 x 3
##   countryname  year   pop
##   <chr>       <dbl> <dbl>
## 1 Austria      1670  85.0
## 2 Austria      1671  83.0
## 3 Austria      1674  75.0
## 4 Austria      1675  82.0
## 5 Austria      1676  79.0
## 6 Austria      1677  70.0

str(observations)                   #so it has 3 variables: 1 of class chr & 2 num; their names listed.

## Classes 'tbl_df', 'tbl' and 'data.frame':    42 obs. of  3 variables:
##  $ countryname: chr  "Austria" "Austria" "Austria" "Austria" ...
##  $ year       : num  1670 1671 1674 1675 1676 ...
##  $ pop        : num  85 83 75 82 79 70 81 80 70 79 ...

str(sales)                          #for similar information of sales dataset.

## Classes 'tbl_df', 'tbl' and 'data.frame':    42 obs. of  8 variables:
##  $ name_of_country   : chr  "AUSTRIA" "AUSTRIA" "AUSTRIA" "AUSTRIA" ...
##  $ year              : num  1670 1671 1674 1675 1676 ...
##  $ bikes             : num  60 59 53 58 56 50 57 57 49 56 ...
##  $ X__1              : logi  NA NA NA NA NA NA ...
##  $ X__2              : logi  NA NA NA NA NA NA ...
##  $ name_of_country__1: chr  "AUSTRIA" "AUSTRIA" "AUSTRIA" "AUSTRIA" ...
##  $ year__1           : num  1670 1671 1674 1675 1676 ...
##  $ total_turnover    : num  5274 5186 4659 5098 4922 ...

#confirm wheather columns 4:5 of sales have all NAs & no relevant data. 
sum(is.na(sales$X__1))

## [1] 42

sum(is.na(sales$X__2))

## [1] 42

# so all relevant data of sales is in columns 1:3 & 6:8.
# countryname in observationns & name_of_country(also  name_of_country_1) in sales look similar

identical(observations$countryname,sales$name_of_country)  #difference in lower & upper case letters

## [1] FALSE

identical(sales$name_of_country,sales$name_of_country__1)  #same obs.

## [1] TRUE

table(observations$countryname)

## 
##     Austria      France     Germany Netherlands Switzerland 
##           8           7           9           9           9

table(sales$name_of_country)

## 
##     AUSTRIA      FRANCE     GERMANY NETHERLANDS SWITZERLAND 
##           8           7           9           9           9

table(sales$name_of_country__1)

## 
##     AUSTRIA      FRANCE     GERMANY NETHERLANDS SWITZERLAND 
##           8           7           9           9           9

Combining the two datasets:

Here it’s clear that 1 set of obs. in observations & two set of obs. in sales are all corresponding to the same 42 obs.(years) in 5 countries. (And also that Austria & France subsets have respectively 1 & 2 obs. less than the other 3 counties.) Therefore we combine the two in one dataframe obs:

obs<-data.frame(country<-observations$countryname,year<-observations$year,pop<-observations$pop,bikes<-sales$bikes,turnover<-sales$total_turnover)
View(obs)
names(obs)

## [1] "country....observations.countryname"
## [2] "year....observations.year"          
## [3] "pop....observations.pop"            
## [4] "bikes....sales.bikes"               
## [5] "turnover....sales.total_turnover"

names(obs)<-substr(names(obs),1,c(7,4,3,5,8))
names(obs)

## [1] "country"  "year"     "pop"      "bikes"    "turnover"

View(obs)

Now that we have our smart dataset, lets play with it. First of all,we’ll plot our variables of interst: ## bikes & pop (unicorn population), country wise.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

opts = levels((obs$country))
par(mfrow=c(3,2)) # all plots on one page 
  for(i in 1:length(opts)){ 
   par(pch=16,col=rainbow(length(opts))[i]) # plotting symbol and color
  heading = paste("country =",opts[i]) 
  
  obs%>%filter(country==opts[i])%>%select(pop,bikes)%>%plot(main=heading) 
}

Surprisingly there is a linear relationship between the two. Next I’ll plot:

bikes vs turnover for all countries.

opts = levels((obs$country))
par(mfrow=c(3,2),col="blue",pch=17) # all plots on one page 
  for(i in 1:length(opts)){ 
   #par(pch=16,col=rainbow(3)[i]) # plotting symbol and color
  heading = paste("country =",opts[i]) 
  
  obs%>%filter(country==opts[i])%>%select(bikes,turnover)%>%plot(main=heading) 
}

As expected, the two are related. Now we’ll plot:

year & bikes.

par(mfrow=c(3,2),col="purple",pch=22) # all plots on one page 
  for(i in 1:length(opts)){ 
   #par(pch=16,col=rainbow(3)[i]) # plotting symbol and color
  heading = paste("country =",opts[i]) 
  
  obs%>%filter(country==opts[i])%>%select(year,bikes)%>%plot(main=heading) 
}

This is interesting, there is seems no relation between year & bikes. So this makes relation between bikes & unicorn population, observed above even important. We can also study some other parameters.

country wise disribution of unicorn in 1670s:

 boxplot(obs$pop~country,col=rainbow(length(levels(obs$country))),main="country wise disribution of unicorn in 1670s*", sub="*observations for some years are missing", ylab="unicorn population")

country wise disribution of bikes sold in 1670s:

boxplot(obs$bikes~country,col=rainbow(length(levels(obs$country))),main="country wise disribution of bikes sold in 1670s*", sub="*observations for some years are missing", ylab="number of bikes sold")

note:

I have not excluded obs for year 1670, which is available only for Austria, because data is already very scanty and don’t want to loose any information. More over emphasis is on finding relationship between the two variables and not on comparisions betweeb the countries.

table(obs$year)

## 
## 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 
##    1    3    3    4    4    5    5    5    5    3    4

Conclusion:

The data suggests reation between unicorn populations and unicycles in 1670s in the countries.

Data Cleaning & Data Wrangling exercise

Dr. Arvind Singla

12 March 2018