R Cheatsheets: https://rstudio.com/resources/cheatsheets/

2. Manipulating data

2.1 factor vs string

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the bicycle survey data.
# Read without an explicit encoding, the first header comes back mangled as
# `ï..Unique_ID`, which is the signature of a UTF-8 byte-order mark being
# folded into the column name. Declaring "UTF-8-BOM" makes R drop the mark,
# so the first column is named `Unique_ID` instead.
Bicycle <- read.csv("data/Bicycle.csv", fileEncoding = "UTF-8-BOM")


# DS_LOCATION is not a factor, so asking for its levels gives NULL.
levels(Bicycle$DS_LOCATION)
## NULL
# The survey number is read in as an integer column.
class(Bicycle$NB_TRAFFIC_SURVEY)
## [1] "integer"
# Convert the survey number into a factor so it is treated as categorical.
Bicycle$NB_TRAFFIC_SURVEY <- as.factor(Bicycle$NB_TRAFFIC_SURVEY)

Bicycle$NB_TRAFFIC_SURVEY %>% levels()
##  [1] "9288" "9289" "9290" "9291" "9292" "9293" "9294" "9295" "9296" "9297"
## [11] "9298" "9299" "9300" "9301" "9302" "9303" "9304" "9305" "9306" "9307"
## [21] "9308" "9309" "9310" "9311" "9312" "9313" "9314" "9315" "9316"
Bicycle$day %>% levels()
## NULL
# Make day an ordered factor whose levels run from Sunday to Saturday.
Bicycle$day <- factor(
  Bicycle$day,
  levels = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"),
  ordered = TRUE
)


#install.packages("forcats")
library(forcats)
# Re-order the day levels so that the week starts on Monday.
Bicycle$day <- Bicycle$day %>%
  fct_relevel("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")

# Relabel weekend exactly once. factor() compares the values with `levels`
# as character strings, so no separate as.factor() step is required.
Bicycle$weekend <- Bicycle$weekend %>%
  factor(levels = c("FALSE", "TRUE"), labels = c("Weekday", "Weekend"))

Bicycle$weekend %>% levels()
## [1] "Weekday" "Weekend"
# forcats alternative to the relabelling above. Use one or the other: once the
# levels have been renamed, "TRUE"/"FALSE" no longer exist and fct_recode()
# only warns "Unknown levels in `f`: TRUE, FALSE" without changing anything.
# Bicycle$weekend <- Bicycle$weekend %>%
#   as.factor() %>%
#   fct_recode("Weekday" = "FALSE", "Weekend" = "TRUE")

# Turn the quarter codes 1-4 into an ordered factor with readable labels.
Bicycle$Quarter <- Bicycle$Quarter %>%
  factor(
    levels = c(1, 2, 3, 4),
    labels = c("1st Quarter", "2nd Quarter", "3rd Quarter", "4th Quarter"),
    ordered = TRUE
  )

2.2 Filtering & Selecting

# 2.2.1 Filtering
# Keep only the rows whose Season is "Summer", then preview the result.
Bicycle_Summer <- filter(Bicycle, Season == "Summer")
Bicycle_Summer %>% head()
##   ï..Unique_ID NB_TRAFFIC_SURVEY NB_LOCATION_TRAFFIC_SURVEY
## 1           15              9306                          1
## 2           16              9306                          1
## 3           17              9306                          1
## 4           18              9306                          1
## 5           19              9306                          1
## 6           20              9306                          1
##                      Sort.Des
## 1 Flemington Rd NW Bound Lane
## 2 Flemington Rd NW Bound Lane
## 3 Flemington Rd NW Bound Lane
## 4 Flemington Rd NW Bound Lane
## 5 Flemington Rd NW Bound Lane
## 6 Flemington Rd NW Bound Lane
##                                            DS_LOCATION DT_ANALYSIS_SUMMARY
## 1 (BIKE LANE)FLEMINGTON RD NW BD 10M SE OF DRYBURGH ST          11/12/2010
## 2 (BIKE LANE)FLEMINGTON RD NW BD 10M SE OF DRYBURGH ST          12/12/2010
## 3 (BIKE LANE)FLEMINGTON RD NW BD 10M SE OF DRYBURGH ST          13/12/2010
## 4 (BIKE LANE)FLEMINGTON RD NW BD 10M SE OF DRYBURGH ST          14/12/2010
## 5 (BIKE LANE)FLEMINGTON RD NW BD 10M SE OF DRYBURGH ST          15/12/2010
## 6 (BIKE LANE)FLEMINGTON RD NW BD 10M SE OF DRYBURGH ST          16/12/2010
##   NB_YEAR NB_MONTH NB_WEEKDAY_NONHOL_QTR CT_VOLUME_AMPEAK CT_VOLUME_PMPEAK
## 1    2010       12                     0                9               24
## 2    2010       12                     0               10               17
## 3    2010       12                     0               15              101
## 4    2010       12                     0               10              135
## 5    2010       12                     0               12               99
## 6    2010       12                     0               13               95
##   CT_VOLUME_4HOUR_OFFPEAK CT_VOLUME_12HOUR CT_VOLUME_24HOUR DS_HOLIDAY
## 1                      34              116              179           
## 2                      26               97              133           
## 3                      42              325              386           
## 4                      46              350              435           
## 5                      41              307              385           
## 6                      28              303              396           
##   NB_SEASONALITY_PERIOD NB_TYPE_PERIOD Primary weekend     Quarter Season
## 1                    19              1   FALSE Weekend 4th Quarter Summer
## 2                    19              1   FALSE Weekend 4th Quarter Summer
## 3                    20              3   FALSE Weekday 4th Quarter Summer
## 4                    20              3   FALSE Weekday 4th Quarter Summer
## 5                    20              3   FALSE Weekday 4th Quarter Summer
## 6                    20              3   FALSE Weekday 4th Quarter Summer
##   Cyclying.Season day
## 1         Cycling Sat
## 2         Cycling Sun
## 3         Cycling Mon
## 4         Cycling Tue
## 5         Cycling Wed
## 6         Cycling Thu
# Season is a character column, so summary() reports length, class and mode.
summary(Bicycle_Summer$Season)
##    Length     Class      Mode 
##     13757 character character
# Keep rows whose Season is either "Summer" or "Spring".
Bicycle_Summer_Spring <- Bicycle %>%
  filter(Season %in% c("Summer", "Spring"))
summary(Bicycle_Summer_Spring$Season)
##    Length     Class      Mode 
##     29011 character character
# Same two seasons, restricted to the years 2009 and later.
Bicycle_Summer_Spring_2009 <- Bicycle %>%
  filter(Season %in% c("Summer", "Spring"), NB_YEAR >= 2009)

# Season-by-year counts for the full data, then for the filtered subset.
table(Bicycle$Season, Bicycle$NB_YEAR)
##         
##          2005 2006 2007 2008 2009 2010 2011 2012 2013
##   Autumn   14 1028 1480 1566 1781 1557 2457 3317    0
##   Spring   40 1335 1508 1815 1597 1671 3697 3591    0
##   Summer  238 1244 1472 1578 1669 1671 2574 3059  252
##   Winter   20 1020 1518 1868 1643 1765 3592 3366    0
table(
  Bicycle_Summer_Spring_2009$Season,
  Bicycle_Summer_Spring_2009$NB_YEAR
)
##         
##          2009 2010 2011 2012 2013
##   Spring 1597 1671 3697 3591    0
##   Summer 1669 1671 2574 3059  252
# 2.2.2 Selecting

# Keep three columns only; dplyr:: is spelled out so that this select() is
# used even if another attached package defines a function of the same name.
Bicycle_volume <- dplyr::select(
  Bicycle,
  Sort.Des, DT_ANALYSIS_SUMMARY, CT_VOLUME_24HOUR
)

# Preview the first rows of the reduced data frame.
head(Bicycle_volume)
##                      Sort.Des DT_ANALYSIS_SUMMARY CT_VOLUME_24HOUR
## 1      St Georges St Hawthorn          09/06/2007              480
## 2 Flemington Rd NW Bound Lane          17/03/2008              654
## 3 Flemington Rd NW Bound Lane          19/03/2008              794
## 4 Flemington Rd NW Bound Lane          20/03/2008              732
## 5 Flemington Rd NW Bound Lane          21/03/2008              221
## 6 Flemington Rd NW Bound Lane          22/03/2008              271
# 2.2.3 Adding a new variable
# Collapse the seven day levels into a two-level weekday/weekend factor.
# fct_collapse() takes one vector of old levels per new level, which avoids
# repeating the "new" = "old" pair for every single day.
Bicycle$New_weekday_variable <- Bicycle$day %>%
  fct_collapse(
    weekday = c("Mon", "Tue", "Wed", "Thu", "Fri"),
    weekend = c("Sat", "Sun")
  )
# Check
table(Bicycle$New_weekday_variable, Bicycle$day)
##          
##            Mon  Tue  Wed  Thu  Fri  Sat  Sun
##   weekday 8109 8125 8131 8087 8098    0    0
##   weekend    0    0    0    0    0 8267 8186
# peak = (AM peak volume + PM peak volume) divided by the 24-hour volume.
Bicycle <- Bicycle %>%
  mutate(peak = (CT_VOLUME_AMPEAK + CT_VOLUME_PMPEAK) / CT_VOLUME_24HOUR)

# Base R equivalent of the mutate() call above. It yields the same column, so
# run only one of the two:
# Bicycle$peak <- (Bicycle$CT_VOLUME_AMPEAK + Bicycle$CT_VOLUME_PMPEAK) /
#   Bicycle$CT_VOLUME_24HOUR

# 2.2.4 Ordering datasets

# Sort the rows by Sort.Des, then NB_YEAR, then NB_MONTH, and preview them.
Bicycle_sorted <- arrange(Bicycle, Sort.Des, NB_YEAR, NB_MONTH)
head(Bicycle_sorted)
##   ï..Unique_ID NB_TRAFFIC_SURVEY NB_LOCATION_TRAFFIC_SURVEY
## 1        17386              9316                          1
## 2        17387              9316                          1
## 3        17388              9316                          1
## 4        17389              9316                          1
## 5        17390              9316                          1
## 6        17391              9316                          1
##                                 Sort.Des
## 1 Albert St E Bound Lane, Melbourne City
## 2 Albert St E Bound Lane, Melbourne City
## 3 Albert St E Bound Lane, Melbourne City
## 4 Albert St E Bound Lane, Melbourne City
## 5 Albert St E Bound Lane, Melbourne City
## 6 Albert St E Bound Lane, Melbourne City
##                                    DS_LOCATION DT_ANALYSIS_SUMMARY NB_YEAR
## 1 (BIKELANE) ALBERT ST EB 50M E OF MORRISON PL          23/06/2011    2011
## 2 (BIKELANE) ALBERT ST EB 50M E OF MORRISON PL          24/06/2011    2011
## 3 (BIKELANE) ALBERT ST EB 50M E OF MORRISON PL          25/06/2011    2011
## 4 (BIKELANE) ALBERT ST EB 50M E OF MORRISON PL          26/06/2011    2011
## 5 (BIKELANE) ALBERT ST EB 50M E OF MORRISON PL          27/06/2011    2011
## 6 (BIKELANE) ALBERT ST EB 50M E OF MORRISON PL          28/06/2011    2011
##   NB_MONTH NB_WEEKDAY_NONHOL_QTR CT_VOLUME_AMPEAK CT_VOLUME_PMPEAK
## 1        6                    42               31              117
## 2        6                    42               34               87
## 3        6                     0               16               27
## 4        6                     0               10               29
## 5        6                    42               42              115
## 6        6                    42               36              118
##   CT_VOLUME_4HOUR_OFFPEAK CT_VOLUME_12HOUR CT_VOLUME_24HOUR DS_HOLIDAY
## 1                      32              345              418           
## 2                      39              352              403           
## 3                      67              148              183           
## 4                      36              122              156           
## 5                      30              414              489           
## 6                      33              392              474           
##   NB_SEASONALITY_PERIOD NB_TYPE_PERIOD Primary weekend     Quarter Season
## 1                    12              1    TRUE Weekday 2nd Quarter Winter
## 2                    12              1    TRUE Weekday 2nd Quarter Winter
## 3                    12              1    TRUE Weekend 2nd Quarter Winter
## 4                    12              1    TRUE Weekend 2nd Quarter Winter
## 5                    12              1    TRUE Weekday 2nd Quarter Winter
## 6                    12              1    TRUE Weekday 2nd Quarter Winter
##   Cyclying.Season day New_weekday_variable      peak
## 1     Non-Cycling Thu              weekday 0.3540670
## 2     Non-Cycling Fri              weekday 0.3002481
## 3     Non-Cycling Sat              weekend 0.2349727
## 4     Non-Cycling Sun              weekend 0.2500000
## 5     Non-Cycling Mon              weekday 0.3210634
## 6     Non-Cycling Tue              weekday 0.3248945

2.3 Dates and times

# Before parsing, the date column is plain text.
class(Bicycle$DT_ANALYSIS_SUMMARY)
## [1] "character"
# Look at the first few raw values to see their format.
head(Bicycle$DT_ANALYSIS_SUMMARY)
## [1] "09/06/2007" "17/03/2008" "19/03/2008" "20/03/2008" "21/03/2008"
## [6] "22/03/2008"
# install.packages("lubridate")
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:dplyr':
## 
##     intersect, setdiff, union
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Parse the day/month/year strings into Date values.
Bicycle$DT_ANALYSIS_SUMMARY <- dmy(Bicycle$DT_ANALYSIS_SUMMARY)
class(Bicycle$DT_ANALYSIS_SUMMARY)
## [1] "Date"
# Extract the labelled weekday, the labelled month and the year.
head(wday(Bicycle$DT_ANALYSIS_SUMMARY, label = TRUE))
## [1] Sat Mon Wed Thu Fri Sat
## Levels: Sun < Mon < Tue < Wed < Thu < Fri < Sat
head(month(Bicycle$DT_ANALYSIS_SUMMARY, label = TRUE))
## [1] Jun Mar Mar Mar Mar Mar
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
head(year(Bicycle$DT_ANALYSIS_SUMMARY))
## [1] 2007 2008 2008 2008 2008 2008
# Clock times held as plain text.
times <- c("02:02:57", "02:03:03", "02:03:05", "02:03:13", "02:03:13")
class(times)
## [1] "character"
# Parse the hours:minutes:seconds strings into a lubridate Period.
times <- hms(times)

class(times)
## [1] "Period"
## attr(,"package")
## [1] "lubridate"
# Seconds and minutes components of each time.
second(times)
## [1] 57  3  5 13 13
minute(times)
## [1] 2 3 3 3 3
# Gap between the first two times, with each one passed through seconds()
# before subtracting.
seconds(times[2]) - seconds(times[1])
## Note: method with signature 'Period#ANY' chosen for function '-',
##  target signature 'Period#Period'.
##  "ANY#Period" would also be valid
## [1] "6S"
library(dplyr)
library(forcats)
library(ggplot2)
# Load the Australian population demographics data.
AusPopDemographics <- read.csv("data/AusPopDemographics.csv")

# Keep the rows for the five age bands from "25 - 29" up to "45 - 49".
mid_49 <- AusPopDemographics %>%
  filter(Age %in% c("25 - 29", "30 - 34", "35 - 39", "40 - 44", "45 - 49"))

library(forcats)
# Merge the five age bands from "25 - 29" to "45 - 49" into one "25 - 49"
# level; every other Age level is left as it is. fct_collapse() lists the old
# levels once, replacing five repeated recode pairs and the trailing comma
# that was left dangling in the call.
AusPopDemographics$Age <- AusPopDemographics$Age %>%
  fct_collapse(
    "25 - 49" = c("25 - 29", "30 - 34", "35 - 39", "40 - 44", "45 - 49")
  )



# Scatter plot of Population against Year, coloured by Age.
ggplot(AusPopDemographics, aes(x = Year, y = Population, colour = Age)) +
  geom_point()