homework 8

Who rides longer? Men or women?

I am trying to see the relationship between gender and the length of duration(in seconds) for a trip. The data can be found here: https://www.citibikenyc.com/system-data

library(readr)
library(nlme)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:nlme':
## 
##     collapse

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(magrittr)
library(tidyr)

## 
## Attaching package: 'tidyr'

## The following object is masked from 'package:magrittr':
## 
##     extract

library(haven)
library(lmerTest)

## Loading required package: lme4

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following object is masked from 'package:tidyr':
## 
##     expand

## 
## Attaching package: 'lme4'

## The following object is masked from 'package:nlme':
## 
##     lmList

## 
## Attaching package: 'lmerTest'

## The following object is masked from 'package:lme4':
## 
##     lmer

## The following object is masked from 'package:stats':
## 
##     step

library(ggplot2)
library(texreg)

## Version:  1.36.23
## Date:     2017-03-03
## Author:   Philip Leifeld (University of Glasgow)
## 
## Please cite the JSS article in your publications -- see citation("texreg").

## 
## Attaching package: 'texreg'

## The following object is masked from 'package:tidyr':
## 
##     extract

## The following object is masked from 'package:magrittr':
## 
##     extract

bike=read_csv("/Users/Jessica/Desktop/bike.csv")

## Parsed with column specification:
## cols(
##   tripduration = col_double(),
##   starttime = col_datetime(format = ""),
##   stoptime = col_datetime(format = ""),
##   `start station id` = col_double(),
##   `start station name` = col_character(),
##   `start station latitude` = col_double(),
##   `start station longitude` = col_double(),
##   `end station id` = col_double(),
##   `end station name` = col_character(),
##   `end station latitude` = col_double(),
##   `end station longitude` = col_double(),
##   bikeid = col_double(),
##   usertype = col_character(),
##   `birth year` = col_double(),
##   gender = col_double()
## )

## Warning: 8 parsing failures.
##   row              col expected actual                              file
## 40002 start station id a double   NULL '/Users/Jessica/Desktop/bike.csv'
## 40002 end station id   a double   NULL '/Users/Jessica/Desktop/bike.csv'
## 41131 start station id a double   NULL '/Users/Jessica/Desktop/bike.csv'
## 41131 end station id   a double   NULL '/Users/Jessica/Desktop/bike.csv'
## 41510 start station id a double   NULL '/Users/Jessica/Desktop/bike.csv'
## ..... ................ ........ ...... .................................
## See problems(...) for more details.

head(bike)

## # A tibble: 6 x 15
##   tripduration starttime           stoptime            `start station …
##          <dbl> <dttm>              <dttm>                         <dbl>
## 1          219 2019-02-01 00:00:06 2019-02-01 00:03:46             3494
## 2          143 2019-02-01 00:00:28 2019-02-01 00:02:51              438
## 3          296 2019-02-01 00:01:13 2019-02-01 00:06:10             3571
## 4          478 2019-02-01 00:01:14 2019-02-01 00:09:12              167
## 5          225 2019-02-01 00:01:49 2019-02-01 00:05:34             3458
## 6          457 2019-02-01 00:02:04 2019-02-01 00:09:41             3078
## # … with 11 more variables: `start station name` <chr>, `start station
## #   latitude` <dbl>, `start station longitude` <dbl>, `end station
## #   id` <dbl>, `end station name` <chr>, `end station latitude` <dbl>,
## #   `end station longitude` <dbl>, bikeid <dbl>, usertype <chr>, `birth
## #   year` <dbl>, gender <dbl>

bike$`start station id`=as.character(bike$`start station id`)
bike$gender=as.character(bike$gender)
bike$`birth year`=as.character(bike$`birth year`)
#Removing unknown gender
bike=filter(bike,gender!=0)
#Converting to minutes
bike$tripduration=bike$tripduration/60
bike=filter(bike, "tripduration">200)
bike=rename(bike,"stationName"=`start station name`)
bike=filter(bike,`start station id`!=3426)

head(bike)

## # A tibble: 6 x 15
##   tripduration starttime           stoptime            `start station …
##          <dbl> <dttm>              <dttm>              <chr>           
## 1         3.65 2019-02-01 00:00:06 2019-02-01 00:03:46 3494            
## 2         2.38 2019-02-01 00:00:28 2019-02-01 00:02:51 438             
## 3         4.93 2019-02-01 00:01:13 2019-02-01 00:06:10 3571            
## 4         7.97 2019-02-01 00:01:14 2019-02-01 00:09:12 167             
## 5         3.75 2019-02-01 00:01:49 2019-02-01 00:05:34 3458            
## 6         7.62 2019-02-01 00:02:04 2019-02-01 00:09:41 3078            
## # … with 11 more variables: stationName <chr>, `start station
## #   latitude` <dbl>, `start station longitude` <dbl>, `end station
## #   id` <dbl>, `end station name` <chr>, `end station latitude` <dbl>,
## #   `end station longitude` <dbl>, bikeid <dbl>, usertype <chr>, `birth
## #   year` <chr>, gender <chr>

A look at the groups which are CitiBike Stations where males/females come from and the number of stations

head(unique(bike$stationName))

## [1] "E 115 St & Lexington Ave" "St Marks Pl & 1 Ave"     
## [3] "Bedford Ave & Bergen St"  "E 39 St & 3 Ave"         
## [5] "W 55 St & 6 Ave"          "Broadway & Roebling St"

tail(unique(bike$stationName))

## [1] "E 2 St & Avenue C"         "NYCBS Depot - GOW"        
## [3] "34 St & 35 Ave"            "Carroll St & Franklin Ave"
## [5] "23 Ave & 27 St"            "27 Ave & 9 St"

cat("The number of stations/groupings is",length(unique(bike$stationName)))

## The number of stations/groupings is 756

Linear regression of sex on trip duration: Complete Pooling

Gender=1 is Male; Gender=2 is Female

pooling=lm(tripduration~gender,data=bike)
summary(pooling)

## 
## Call:
## lm(formula = tripduration ~ gender, data = bike)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##    -15     -8     -5      1  32446 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  12.0552     0.7239  16.654  < 2e-16 ***
## gender2       4.1966     1.5132   2.773  0.00555 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 159.4 on 62902 degrees of freedom
## Multiple R-squared:  0.0001223,  Adjusted R-squared:  0.0001064 
## F-statistic: 7.692 on 1 and 62902 DF,  p-value: 0.00555

Interpreting the complete pooling linear model:

Based on complete pooling where we run a linear model that treats gender at an individual level, with no grouping, we can see that our coefficient on gender is statistically significant from 0 at the 1% level. Compared to males, females on average ride for about 4.2 minutes longer. Males ride for about 12.1 minutes and females for about 16.3 minutes.

Based on this, I would conclude that females seem to be more active around NYC than males.

No-Pooling model

bike2<-bike%>%
  mutate(gender=factor(ifelse(gender==1,"Male",
                       ifelse(gender==2,"Female",NA))),
         tripduration = as.numeric(tripduration))%>%
  rename("ID"=`start station id`)
 bike2= filter(bike2,"tripduration"> 200)

IDWith1Gender<-bike2%>%
  group_by(ID,gender)%>%
  summarize(n=n())%>%
  tidyr::spread(gender, n)%>%
  filter(is.na(Female) | is.na(Male))

dcoef <- bike2 %>% 
  filter (!ID %in% IDWith1Gender$ID)%>%
    group_by(`ID`) %>% 
    do(mod = lm(tripduration~gender, data = .))

coef <- dcoef %>% do(data.frame(intc = coef(.$mod)[1]))
ggplot(coef, aes(x = intc)) + geom_histogram(binwidth = 0.5)

Here are plots of the intercept for our regression for each station. The intercept represents the average trip duration for males at each station. Here we can see that the mode average duration for men is about 10 minutes. But, we see wide variation at stations, ranging up to almost 30 minutes,despite few outliers.

dcoef <- bike2 %>% 
  filter (!ID %in% IDWith1Gender$ID)%>%
    group_by(`ID`) %>% 
    do(mod = lm(tripduration~gender, data = .))
coef <- dcoef %>% do(data.frame(genderc = coef(.$mod)[2]))
ggplot(coef, aes(x = genderc)) + geom_histogram(binwidth = 0.5)

Here we can see that there is also variation between female ride duration across stations. Some stations have female riders that ride on average up to about 10 more minutes than the men at that station. Some stations even have men which ride more on average than women–something that is lost when we do complete pooling! However, most stations have female riders that ride about 1 minute or so longer than the men at those stations. Based on these visualizations and the before-mentioned analysis, we can see how there is variation in duration of riding by gender across different stations.

Linear Mixed-Effects Model: Random Intercept

randomInt=lme(tripduration ~ gender, data = bike, random = ~1|stationName, method = "ML")
summary(randomInt)

## Linear mixed-effects model fit by maximum likelihood
##  Data: bike 
##        AIC      BIC    logLik
##   816566.9 816603.1 -408279.5
## 
## Random effects:
##  Formula: ~1 | stationName
##         (Intercept) Residual
## StdDev:    1.422374 159.4194
## 
## Fixed effects: tripduration ~ gender 
##                 Value Std.Error    DF   t-value p-value
## (Intercept) 12.056891  0.727185 62147 16.580225  0.0000
## gender2      4.197874  1.513292 62147  2.774001  0.0055
##  Correlation: 
##         (Intr)
## gender2 -0.476
## 
## Standardized Within-Group Residuals:
##           Min            Q1           Med            Q3           Max 
##  -0.108985362  -0.049952023  -0.030264641   0.003433652 203.510253236 
## 
## Number of Observations: 62904
## Number of Groups: 756

Here we can see that we have 756 different stations (groups) which do have variation between one another when it comes to ride duration. We can see this from the standard deviation between the stations for men of about 1.4.

Interpreting the results:

Ceteris paribus, females on average ride a citibike for about 1.42 minutes longer than males do. Males on average ride their bikes for about 12.1 minutes while males ride on average for about 16.3 minutes. This interpretation takes into account variation between the different stations, and we can see that the coefficients, compare to total pooling, increase slightly. This means that there was some variation in stations that was not captured when doing the total pooling method.

Linear Mixed-Effects Model: Random Slope

randomSlope=lme(tripduration ~ gender, data = bike, random = ~gender|stationName, method = "ML")
summary(randomSlope)

## Linear mixed-effects model fit by maximum likelihood
##  Data: bike 
##        AIC      BIC    logLik
##   815961.8 816016.1 -407974.9
## 
## Random effects:
##  Formula: ~gender | stationName
##  Structure: General positive-definite, Log-Cholesky parametrization
##             StdDev      Corr  
## (Intercept)   0.1980755 (Intr)
## gender2      46.1245928 0.963 
## Residual    157.8506120       
## 
## Fixed effects: tripduration ~ gender 
##                 Value Std.Error    DF   t-value p-value
## (Intercept) 12.054059 0.7167572 62147 16.817494  0.0000
## gender2      4.472063 2.4360886 62147  1.835756  0.0664
##  Correlation: 
##         (Intr)
## gender2 -0.286
## 
## Standardized Within-Group Residuals:
##           Min            Q1           Med            Q3           Max 
##  -5.532166198  -0.046550978  -0.027141617   0.006807261 200.110791147 
## 
## Number of Observations: 62904
## Number of Groups: 756

Interpretation:

Ceteris paribus, females on average ride for 4.5 minutes longer than males do. Males on average ride for about 12.1 minutes while females ride for about 16.6 minutes. By allowing for gender variation across stations we have slightly increased the duration difference between genders when compared to the random slope model.