Recitation 04

Load Packages

library(dplyr)
library(ggplot2)
library(statsr)
library(gridExtra)
library(lubridate)

Let’s clean the global environment before moving further

rm(list=ls())
dev.off
## function (which = dev.cur()) 
## {
##     if (which == 1) 
##         stop("cannot shut down device 1 (the null device)")
##     .External(C_devoff, as.integer(which))
##     dev.cur()
## }
## <bytecode: 0x7fc9583775f0>
## <environment: namespace:grDevices>
cat("\014")

Loading the data

citi1 <- read.csv("~/Downloads/Datasets/citi1.csv")
citi2 <- read.csv("~/Downloads/Datasets/citi2.csv")
biking_weather <- read.csv("~/Downloads/Datasets/biking.weather.csv")
auto <- read.csv("~/Downloads/Datasets/automobiles.csv")
aq<-airquality

Understanding the data

dim(aq)
## [1] 153   6

We can see that the dataset provided consists of 153 rows and 6 column variables.

str(aq)
## 'data.frame':    153 obs. of  6 variables:
##  $ Ozone  : int  41 36 12 18 NA 28 23 19 8 NA ...
##  $ Solar.R: int  190 118 149 313 NA NA 299 99 19 194 ...
##  $ Wind   : num  7.4 8 12.6 11.5 14.3 14.9 8.6 13.8 20.1 8.6 ...
##  $ Temp   : int  67 72 74 62 56 66 65 59 61 69 ...
##  $ Month  : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ Day    : int  1 2 3 4 5 6 7 8 9 10 ...

We can see that all the columns are numerical variables with few missing values

summary(aq)
##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
## 

Adding Basic Functionalities

A basic function that returns the mean, meadian and standard deviation of a variable

explore<-function(x){
  data<-c("Mean"=mean(x, na.rm=T),
          "Median"=median(x, na.rm =T), 
          "Standard Deviation" = sd(x, na.rm =T))
  return(data)
}
apply(aq,2, explore)
##                       Ozone   Solar.R     Wind     Temp    Month      Day
## Mean               42.12931 185.93151 9.957516 77.88235 6.993464 15.80392
## Median             31.50000 205.00000 9.700000 79.00000 7.000000 16.00000
## Standard Deviation 32.98788  90.05842 3.523001  9.46527 1.416522  8.86452

Including Plots

You can also embed plots, for example:

Seeing how aggregate funciton should work in case of no missing values

max(aq$Temp)
## [1] 97

In case of missing value we get NA, this can be avoided by using the argiment na.rm = T

max(aq$Solar.R)
## [1] NA
max(aq$Solar.R, na.rm = T) 
## [1] 334

Only selecting the rows that have non NA values of Solar.R attribute

max(aq$Solar.R[!is.na(aq$Solar.R)]) 
## [1] 334

Using subset to establish the same task

aq<-subset(aq,!is.na(Solar.R)) 

Using the package tidyr to make our job easier

aq<- tidyr::drop_na(airquality, Ozone) 

Let’s look at the number of rows that were dropped

dim(airquality)
## [1] 153   6
dim(aq)
## [1] 116   6
sum(is.na(airquality$Ozone))
## [1] 37

Imputing missing values with mean

aq$Ozone[is.na(aq$Ozone)]<-mean(aq$Ozone, na.rm=T)

Imputing missing values with median

aq$Ozone[is.na(aq$Ozone)]<-median(aq$Ozone, na.rm=T)

Imputing missing values with mode

aq$Ozone[is.na(aq$Ozone)]<-mode(aq$Ozone)

Let’s look at the transform function

?transform
mean(auto$height)
## [1] 53.72488
# tf$height <- auto$height - 10
tf <- transform(auto, height = height - 10)
mean(tf$height)
## [1] 43.72488

MERGE AND JOIN

Union - same format (number and types of columns and data, removes duplicates)

citi_union <- union(citi1, citi2, all = TRUE)
nrow(citi_union) 
## [1] 29

We can see that the union operation removes eleven duplicates

Intersect - keep only those rows from both sets where all values are same

citi_intersect <- merge(citi1, citi2)
nrow(citi_intersect)
## [1] 11

setdiff - rows that are in var1 but not in var 2, i.e. var1 - var2

?dplyr::setdiff
citi_setdiff <- setdiff(citi2, citi1)
nrow(citi_setdiff)
## [1] 9

JOINS

inner join or natural join - selects rows from both datasets where the specified column value is same

citi_subset <- citi1[1:20,]
citi_subset$start_date <- date(citi_subset$starttime)
citi_subset$start_date
##  [1] "2020-01-01" "2020-01-02" "2020-01-02" "2020-01-02" "2020-01-02"
##  [6] "2020-01-02" "2020-01-02" "2020-01-01" "2020-01-02" "2020-01-02"
## [11] "2020-01-02" "2020-01-02" "2020-01-02" "2020-01-02" "2020-01-02"
## [16] "2020-01-02" "2020-01-02" "2020-01-02" "2020-01-01" "2020-01-01"

merging citi_subset with weather data on dates

?merge
inner <- merge(citi_subset, biking_weather, by.x = "start_date", by.y = "Day")
head(inner)
##  [1] start_date              X.x                     tripduration           
##  [4] starttime               stoptime                start.station.id       
##  [7] start.station.name      start.station.latitude  start.station.longitude
## [10] end.station.id          end.station.name        end.station.latitude   
## [13] end.station.longitude   bikeid                  usertype               
## [16] birth.year              gender                  age                    
## [19] X.y                     Time                    Weather                
## [22] Temp                   
## <0 rows> (or 0-length row.names)
timeOfDay <- function(t){ifelse(t<7,"night",ifelse(t<12, "morning",ifelse(t<18,"afternoon","evening")))}
citi_prepped <- transform(citi1,Day=substr(starttime,1,10),Time=timeOfDay(as.numeric(substr(starttime,12,13))))
head(citi_prepped)
##       X tripduration                starttime                 stoptime
## 1 11657         2411 2020-01-01 16:09:49.4430 2020-01-01 16:50:01.1920
## 2 37306         1456 2020-01-02 14:34:13.8300 2020-01-02 14:58:30.3640
## 3 47314         1151 2020-01-02 17:35:29.8280 2020-01-02 17:54:41.6850
## 4 30290           85 2020-01-02 11:18:16.5240 2020-01-02 11:19:41.8230
## 5 23729          623 2020-01-02 08:34:04.6280 2020-01-02 08:44:28.1090
## 6 46530          457 2020-01-02 17:24:36.7720 2020-01-02 17:32:14.5040
##   start.station.id           start.station.name start.station.latitude
## 1              528              2 Ave & E 31 St               40.74291
## 2             3323 W 106 St & Central Park West               40.79819
## 3              305              E 58 St & 3 Ave               40.76096
## 4             3289      W 90 St & Amsterdam Ave               40.79018
## 5             2008        Little West St & 1 Pl               40.70569
## 6             3467       W Broadway & Spring St               40.72495
##   start.station.longitude end.station.id
## 1               -73.97706            340
## 2               -73.96059           3374
## 3               -73.96724            379
## 4               -73.97289           3293
## 5               -74.01678            316
## 6               -74.00166            297
##                                end.station.name end.station.latitude
## 1                       Madison St & Clinton St             40.71269
## 2 Central Park North & Adam Clayton Powell Blvd             40.79948
## 3                               W 31 St & 7 Ave             40.74916
## 4                            W 92 St & Broadway             40.79210
## 5                        Fulton St & William St             40.70956
## 6                               E 15 St & 3 Ave             40.73423
##   end.station.longitude bikeid   usertype birth.year gender age        Day
## 1             -73.98776  19416 Subscriber       1979      2  41 2020-01-01
## 2             -73.95561  39503 Subscriber       1994      2  26 2020-01-02
## 3             -73.99160  16974 Subscriber       1969      1  51 2020-01-02
## 4             -73.97390  38973 Subscriber       1980      1  40 2020-01-02
## 5             -74.00654  17759 Subscriber       1972      1  48 2020-01-02
## 6             -73.98692  28733 Subscriber       1969      0  51 2020-01-02
##        Time
## 1 afternoon
## 2 afternoon
## 3 afternoon
## 4   morning
## 5   morning
## 6 afternoon

Natural Join (equality on common fields/columns)

biketrip_weather <- merge(citi_prepped,biking_weather) 
head(biketrip_weather)
##  [1] X                       Day                     Time                   
##  [4] tripduration            starttime               stoptime               
##  [7] start.station.id        start.station.name      start.station.latitude 
## [10] start.station.longitude end.station.id          end.station.name       
## [13] end.station.latitude    end.station.longitude   bikeid                 
## [16] usertype                birth.year              gender                 
## [19] age                     Weather                 Temp                   
## <0 rows> (or 0-length row.names)

By default, merge will do a join (combine rows with matching values) for all matching values in columns with the same name.

If the columns do not have the same name in the two data frames, use by.x and by.y

For example, let’s say we want to use the end time of the trip for the join, not the starttime. Let’s create a new column.

citi_prepped$endTime <- timeOfDay(as.numeric(citi_prepped$stoptime[!is.na(citi_prepped$stoptime)]))

Equi Join (equality on selected fields/columns)

biketrip_weather <- merge(citi_prepped,biking_weather, by.x=c("Day","endTime"),by.y=c("Day","Time")) 

only joins when the days are the same, ignore the time. Look at the result. Why is it this way. Does it make sense?

biketrip_weather <- merge(citi_prepped,biking_weather,by.x="Day",by.y="Day") 
tail(biketrip_weather)
##           Day   X.x tripduration                starttime
## 75 2020-01-02 25158          312 2020-01-02 08:54:33.1590
## 76 2020-01-02 25158          312 2020-01-02 08:54:33.1590
## 77 2020-01-02 18998          174 2020-01-02 06:19:06.7760
## 78 2020-01-02 18998          174 2020-01-02 06:19:06.7760
## 79 2020-01-02 18998          174 2020-01-02 06:19:06.7760
## 80 2020-01-02 18998          174 2020-01-02 06:19:06.7760
##                    stoptime start.station.id    start.station.name
## 75 2020-01-02 08:59:45.5780              237       E 11 St & 2 Ave
## 76 2020-01-02 08:59:45.5780              237       E 11 St & 2 Ave
## 77 2020-01-02 06:22:01.3830              195 Liberty St & Broadway
## 78 2020-01-02 06:22:01.3830              195 Liberty St & Broadway
## 79 2020-01-02 06:22:01.3830              195 Liberty St & Broadway
## 80 2020-01-02 06:22:01.3830              195 Liberty St & Broadway
##    start.station.latitude start.station.longitude end.station.id
## 75               40.73047               -73.98672           3260
## 76               40.73047               -73.98672           3260
## 77               40.70906               -74.01043           3690
## 78               40.70906               -74.01043           3690
## 79               40.70906               -74.01043           3690
## 80               40.70906               -74.01043           3690
##           end.station.name end.station.latitude end.station.longitude bikeid
## 75 Mercer St & Bleecker St             40.72706             -73.99662  17790
## 76 Mercer St & Bleecker St             40.72706             -73.99662  17790
## 77     Park Pl & Church St             40.71334             -74.00936  41620
## 78     Park Pl & Church St             40.71334             -74.00936  41620
## 79     Park Pl & Church St             40.71334             -74.00936  41620
## 80     Park Pl & Church St             40.71334             -74.00936  41620
##      usertype birth.year gender age  Time.x endTime X.y    Time.y Weather Temp
## 75 Subscriber       1990      1  30 morning      NA   7 afternoon    rain   45
## 76 Subscriber       1990      1  30 morning      NA   8   evening    rain   40
## 77 Subscriber       1967      1  53   night      NA   5     night  cloudy   45
## 78 Subscriber       1967      1  53   night      NA   6   morning     sun   40
## 79 Subscriber       1967      1  53   night      NA   7 afternoon    rain   45
## 80 Subscriber       1967      1  53   night      NA   8   evening    rain   40

outer joins - these can be left outer or right outer and represent in set

theory notation: left outer is all rows where the specified column values is same in both datasets plus all rows from the variable on the left

left <- merge(citi_prepped, biking_weather, by.x = "Day", by.y = "Day", all.x = TRUE)
tail(left)
##           Day   X.x tripduration                starttime
## 75 2020-01-02 25158          312 2020-01-02 08:54:33.1590
## 76 2020-01-02 25158          312 2020-01-02 08:54:33.1590
## 77 2020-01-02 18998          174 2020-01-02 06:19:06.7760
## 78 2020-01-02 18998          174 2020-01-02 06:19:06.7760
## 79 2020-01-02 18998          174 2020-01-02 06:19:06.7760
## 80 2020-01-02 18998          174 2020-01-02 06:19:06.7760
##                    stoptime start.station.id    start.station.name
## 75 2020-01-02 08:59:45.5780              237       E 11 St & 2 Ave
## 76 2020-01-02 08:59:45.5780              237       E 11 St & 2 Ave
## 77 2020-01-02 06:22:01.3830              195 Liberty St & Broadway
## 78 2020-01-02 06:22:01.3830              195 Liberty St & Broadway
## 79 2020-01-02 06:22:01.3830              195 Liberty St & Broadway
## 80 2020-01-02 06:22:01.3830              195 Liberty St & Broadway
##    start.station.latitude start.station.longitude end.station.id
## 75               40.73047               -73.98672           3260
## 76               40.73047               -73.98672           3260
## 77               40.70906               -74.01043           3690
## 78               40.70906               -74.01043           3690
## 79               40.70906               -74.01043           3690
## 80               40.70906               -74.01043           3690
##           end.station.name end.station.latitude end.station.longitude bikeid
## 75 Mercer St & Bleecker St             40.72706             -73.99662  17790
## 76 Mercer St & Bleecker St             40.72706             -73.99662  17790
## 77     Park Pl & Church St             40.71334             -74.00936  41620
## 78     Park Pl & Church St             40.71334             -74.00936  41620
## 79     Park Pl & Church St             40.71334             -74.00936  41620
## 80     Park Pl & Church St             40.71334             -74.00936  41620
##      usertype birth.year gender age  Time.x endTime X.y    Time.y Weather Temp
## 75 Subscriber       1990      1  30 morning      NA   7 afternoon    rain   45
## 76 Subscriber       1990      1  30 morning      NA   8   evening    rain   40
## 77 Subscriber       1967      1  53   night      NA   5     night  cloudy   45
## 78 Subscriber       1967      1  53   night      NA   6   morning     sun   40
## 79 Subscriber       1967      1  53   night      NA   7 afternoon    rain   45
## 80 Subscriber       1967      1  53   night      NA   8   evening    rain   40

right outer is all rows where the specified column values is same in both datasets plus all rows from the variable on the right

right <- merge(citi_prepped, biking_weather, by.x = "Day", by.y = "Day", all.y = TRUE)
tail(right)
##           Day   X.x tripduration                starttime
## 79 2020-01-02 18998          174 2020-01-02 06:19:06.7760
## 80 2020-01-02 18998          174 2020-01-02 06:19:06.7760
## 81 2020-01-03    NA           NA                     <NA>
## 82 2020-01-03    NA           NA                     <NA>
## 83 2020-01-03    NA           NA                     <NA>
## 84 2020-01-03    NA           NA                     <NA>
##                    stoptime start.station.id    start.station.name
## 79 2020-01-02 06:22:01.3830              195 Liberty St & Broadway
## 80 2020-01-02 06:22:01.3830              195 Liberty St & Broadway
## 81                     <NA>               NA                  <NA>
## 82                     <NA>               NA                  <NA>
## 83                     <NA>               NA                  <NA>
## 84                     <NA>               NA                  <NA>
##    start.station.latitude start.station.longitude end.station.id
## 79               40.70906               -74.01043           3690
## 80               40.70906               -74.01043           3690
## 81                     NA                      NA             NA
## 82                     NA                      NA             NA
## 83                     NA                      NA             NA
## 84                     NA                      NA             NA
##       end.station.name end.station.latitude end.station.longitude bikeid
## 79 Park Pl & Church St             40.71334             -74.00936  41620
## 80 Park Pl & Church St             40.71334             -74.00936  41620
## 81                <NA>                   NA                    NA     NA
## 82                <NA>                   NA                    NA     NA
## 83                <NA>                   NA                    NA     NA
## 84                <NA>                   NA                    NA     NA
##      usertype birth.year gender age Time.x endTime X.y    Time.y Weather Temp
## 79 Subscriber       1967      1  53  night      NA   7 afternoon    rain   45
## 80 Subscriber       1967      1  53  night      NA   8   evening    rain   40
## 81       <NA>         NA     NA  NA   <NA>      NA   9     night    rain   36
## 82       <NA>         NA     NA  NA   <NA>      NA  10   morning  cloudy   45
## 83       <NA>         NA     NA  NA   <NA>      NA  11 afternoon     sun   55
## 84       <NA>         NA     NA  NA   <NA>      NA  12   evening     sun   50