First we load the csv data into R from Github. We also load the required packages.
library(RCurl)
## Warning: package 'RCurl' was built under R version 3.2.4
## Loading required package: bitops
library(tidyr)
## Warning: package 'tidyr' was built under R version 3.2.3
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
##
## complete
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
url = 'https://raw.githubusercontent.com/cyadusha/oil/master/oil.csv'
x = getURL(url)
oil = read.csv(file = textConnection(x), header = TRUE)
We write the following if loop nested within the for loop to replace the empty spaces in the first column with the month names.
for (i in 1:14){
if (i %% 2 == 0){
oil[i,1] = oil[i-1,1]
}
}
Now we gather all of the Brand names into one column and separate the numeric column into two columns - “Purchased” and “Consumed”.
oil = oil %>% gather(Brand, N, Caltex:Mobil, na.rm = T) %>% separate(N, c("Purchased", "Consumed"))
## Warning: attributes are not identical across measure variables; they will
## be dropped
Even though we are not interested in the opening balances, we replace all of the “000’s” in the “Consumed” column with just “0”. For this we have to load the stringr package.
library(stringr)
oil[,5] = str_replace_all(oil[,5], "000", "0")
Now, we convert both the columns into vectors that contain numeric values.
oil$Purchased = as.numeric(oil$Purchased)
oil$Consumed = as.numeric(oil$Consumed)
Because we are not interested in the opening balance, we filter out all of the rows which have “Open” as the month name.
oil = filter(oil, Month != "Open")
Now to compute the closing balance by category and brand we use group_by, summarise, and sum commands.
closingbalance = oil %>% group_by(Category, Brand) %>% summarise(Purchased = sum(Purchased), Consumed = sum(Consumed), Balance = Purchased - Consumed)
Because we want these values to be read as numeric values, we use the data.frame command.
closingbalance = data.frame(closingbalance)
Now to find the most consumed brand in each category, we first take the closing balance data frame and use the group_by, summarise, and max commands. The which.max command not only renders the maximum value but the entire row that contains the maximum value we would be looking for.
mostconsumed = closingbalance %>% group_by(Category) %>% summarise(Brand = Brand[which.max(Consumed)], Consumed = max(Consumed))
data.frame(mostconsumed)
## Category Brand Consumed
## 1 Engine Oil Gulf 761
## 2 GearBox Oil Caltex 746
Now, we use the following command from the string package to convert the closing balance vector into the original format given.
closingbalance[,5] = str_c(closingbalance[,5], sep = ":", "000")
closingbalance[,5]
## [1] "190:000" "103:000" "204:000" "177:000" "287:000" "178:000"