Goal: Learn R language just like any foreign languages
About the data: FuelCheck provides real-time information about fuel prices at service stations across NSW. The dataset contains fuel price and other attributes for October 2017 only.
FuelCheck enables NSW motorists to:
Find the cheapest fuel being sold anywhere in NSW
Get directions to any service station in NSW
Search for fuel by type (E10, Regular, LPG, etc) or brand
Submit a complaint to NSW Fair Trading if the price at the pump doesn’t match what is shown on FuelCheck.
The data we will be working on is for October 2017 NSW only.
Load Packages
library(tidyverse)
library(lubridate)
library(DT)
library(broom)
library(knitr)
library(viridis) #scale color
library(forcats) #factors
library(glue)
library(ggmap)
library(leaflet)
library(rgdal)
library(tmap)
library(broom)
library(leaflet.extras)
library(emo)Load Data
data1 <- read_csv("D:/r datawarehouse/petrol/October-2017.csv")Check for any missing values
colSums(is.na(data1))## ServiceStationName Address Suburb
## 0 0 0
## Postcode Brand FuelCode
## 0 0 0
## PriceUpdatedDate Price
## 0 0
There are no missing values in the dataset.
There are 60045 rows and 8 columns in the dataset.
datatable(head(data1, 2), options = list(pagelength = 5, scrollx = T), caption = "Displaying only first 2 rows of the dataset")Below is the summary of each variable and its class assigned to it. We can clearly see that PriceUpdatedDate is in character class. We will assign correct class to each variables.
summary(data1)## ServiceStationName Address Suburb Postcode
## Length:60045 Length:60045 Length:60045 Min. :1579
## Class :character Class :character Class :character 1st Qu.:2142
## Mode :character Mode :character Mode :character Median :2204
## Mean :2319
## 3rd Qu.:2557
## Max. :4383
## Brand FuelCode PriceUpdatedDate Price
## Length:60045 Length:60045 Length:60045 Min. : 53.9
## Class :character Class :character Class :character 1st Qu.:127.8
## Mode :character Mode :character Mode :character Median :136.9
## Mean :136.4
## 3rd Qu.:145.9
## Max. :177.7
counts <- as.data.frame(matrix(NA, nrow = 1, ncol = 8))
names(counts) <- names(data1)
for(i in seq_along(data1)){
counts[[i]] <- length(unique(data1[[i]]))
}
kable(counts, caption = "Count of unique values")| ServiceStationName | Address | Suburb | Postcode | Brand | FuelCode | PriceUpdatedDate | Price |
|---|---|---|---|---|---|---|---|
| 1794 | 1926 | 1058 | 473 | 20 | 9 | 11965 | 613 |
This should help us in assigning correct class to each variable. Brand,FuelCode and PostCode should be changed to factors. PriceUpdatedDate as Date and Time.
We’ll create a new variable Day_time which will convert the PriceUpdatedDate variable to date and time. This will also preserve the original data.
data1 <- data1 %>%
mutate_at(vars(Brand,FuelCode,Postcode), funs(as.factor(.)))
data1$Day_time <- dmy_hm(data1$PriceUpdatedDate)data1$Date <- lubridate::date(data1$Day_time)data1$Day <- wday(data1$Day_time, label = T, abbr = T)data1 <- data1 %>%
mutate(Price_std = Price/100)We will extract geospatial location of each petrol station from google. We will then plot those location on a map provided by leaflet.
data1$id <- row.names(data1)
Fuel_stations <- data1 %>%
group_by(Address) %>%
summarise(n = n()) %>%
arrange(-n) %>%
mutate(id = row.names(.))
# geocodings <- geocode(Fuel_stations$Address)
# write_csv(geocodings, "D:/r datawarehouse/petrol/geocodings.csv")
# geocode_na <- is.na(geocodings$lon)
geocodings <- read_csv("D:/r datawarehouse/petrol/geocodings.csv")
geocodings$id <- row.names(geocodings)
Fuel_stations <- Fuel_stations %>%
mutate_at(vars(Address), funs(str_replace_all(., "\\s\\(.*\\)", ""))) %>%
mutate_at(vars(Address), funs(str_replace_all(., "Cnr Warnoon & Palm Avenue, Leeton NSW 2705", "Warnoon & Palm Avenue, Leeton NSW 2705"))) %>%
mutate_at(vars(Address), funs(str_replace_all(., "Cnr Zouch and Edward Streets, Young NSW 2594", "Zouch and Edward Streets, Young NSW 2594"))) %>%
mutate_at(vars(Address), funs(str_replace_all(., "Cnr Fitzroy and Cobra Streets, South Dubbo NSW 2830", "Fitzroy and Cobra Streets, South Dubbo NSW 2830")))
# na_fuel_station <- Fuel_stations$Address[is.na(geocodings$lon)]
# na_geocoding <- geocode(na_fuel_station)
# write_csv(na_geocoding, "D:/r datawarehouse/petrol/na_geocoding.csv")
na_geocoding <- read_csv("D:/r datawarehouse/petrol/na_geocoding.csv")
# names(na_geocoding) <- c("lon.x", "lat.x","id")
geocodings$id <- as.numeric(geocodings$id)
# geocoding_na_row <- geocodings %>%
# filter(is.na(lon))
# na_geocoding$id <- geocoding_na_row$id
geocodings <- geocodings %>%
left_join(na_geocoding, by=c("id" = "id"))
geocodings$lon.x[is.na(geocodings$lon.x)] <- geocodings$lon.y[is.na(geocodings$lon.x)]
geocodings$lat.x[is.na(geocodings$lat.x)] <- geocodings$lat.y[is.na(geocodings$lat.x)]
geocodings <- geocodings %>%
select(1:3)
Fuel_stations$id <- as.numeric(Fuel_stations$id)
Fuel_stations<- Fuel_stations %>%
left_join(geocodings, by=c("id"="id"))
Fuel_stations <- Fuel_stations %>%
select(-c(2:3))
data1 <- data1 %>%
mutate_at(vars(Address), funs(str_replace_all(., "\\s\\(.*\\)", ""))) %>%
mutate_at(vars(Address), funs(str_replace_all(., "Cnr Warnoon & Palm Avenue, Leeton NSW 2705", "Warnoon & Palm Avenue, Leeton NSW 2705"))) %>%
mutate_at(vars(Address), funs(str_replace_all(., "Cnr Zouch and Edward Streets, Young NSW 2594", "Zouch and Edward Streets, Young NSW 2594"))) %>%
mutate_at(vars(Address), funs(str_replace_all(., "Cnr Fitzroy and Cobra Streets, South Dubbo NSW 2830", "Fitzroy and Cobra Streets, South Dubbo NSW 2830")))
data1 <- data1 %>%
left_join(Fuel_stations, by = c("Address", "Address"))
# Fuel_stations <- Fuel_stations %>%
# mutate_at(vars(Address), funs(str_replace_all(., "\\s\\(.*\\)", ""))) %>%
# mutate_at(vars(Address), funs(str_replace_all(., "^Cnr |^Cnr. ", "")))
# Fuel_stations$Address[str_detect(Fuel_stations$Address, ("^Cnr |^Cnr. "))]
# x <- as.data.frame(Fuel_stations$Address[geocode_na])
#
# names(x) <- "var"
#
# x_2<- x %>%
# mutate_at(vars(var), funs(str_replace_all(., "\\s*\\(.*\\)", "")))
# geocodings$lon[Fuel_stations$Address %>%
# str_detect("574 Great Western Hwy")]
#
#
# geocode("574 Great Western Hwy, Werrington NSW 2747")
# sum(is.na(geocodings))
#
# x1 <- data1[1:4,]
# # x<- geocode(x1$Address)
#
# x1$id <- row.names(x1)
# x$id <- row.names(x)
#
# x1 %>%
# left_join(x)data1 %>%
ggplot(aes(lon.x, lat.x))+
geom_point()Ok! It didn’t go out smoothly. Straight away we can see that top left cluster are incorrect GPS coordinates. Australia should be at the bottom right.
After tweaking here and there I still couldn’t fix the GPS cordinates. I will not spend further time for this since it is not important.
data1$edited_address <- glue("{data1$Address}, Australia")
wrong_gps <- data1 %>%
filter(lon.x < 100) %>%
group_by(edited_address) %>%
count()
# getcode_wrong_gps <- geocode(wrong_gps$edited_address)
# write_csv(getcode_wrong_gps, "D:/r datawarehouse/petrol/getcode_wrong_gps.csv")
getcode_wrong_gps <- read_csv("D:/r datawarehouse/petrol/getcode_wrong_gps.csv")
wrong_gps$lon.z <- getcode_wrong_gps$lon
wrong_gps$lat.z <- getcode_wrong_gps$lat
wrong_gps <- wrong_gps %>%
select(1) %>%
mutate(identifier = "A")
data1 <- data1 %>%
left_join(wrong_gps, by = c("edited_address", "edited_address"))
data1$lon.x[!is.na(data1$identifier)] <- NA
data1$lat.x[!is.na(data1$identifier)] <- NA
# data1 %>%
# filter(identifier == "A")
# wrong_gps %>%
# left_join(data1, by = c("edited_address", "edited_address"))
#
#
# leaflet() %>%
# addTiles() %>%
# addMarkers(data = wrong_gps,
# lng = ~lon.z,
# lat = ~lat.z,
# clusterOptions = markerClusterOptions())
# Ok, even after adding Australia at the end of the Service station address google is still giving me GPS coordinates of Non AU. I am not #going to spend any further time correcting it.All NSW Fuel service station location:
# NSW_geo <- geocode("New South Wales, Australia")
# rm(NSW_geo)
NSW_gps <- data.frame(lon = 146.9211, lat = -31.25322)
# map_nsw <- get_map(NSW_gps)
# map_data <- readOGR(dsn = "Aus_map/AUS_adm1.shp")
service_stat <- data1 %>%
group_by(Address, lon.x, lat.x, ServiceStationName) %>%
summarise(n = n())
leaflet() %>%
addTiles() %>%
addMarkers(data = service_stat,lat= ~lat.x,lng = ~lon.x,clusterOptions = markerClusterOptions(),popup =as.character(service_stat$ServiceStationName)) %>%
addProviderTiles(providers$OpenStreetMap) %>%
addScaleBar()# ==================== Google map
# g <- get_map(NSW_gps, zoom = 6, source = "google")
#
# ggmap(g)+
# geom_point(data = data1, aes(lon.x, lat.x))+
# facet_wrap(~FuelCode)