read in the file

# Load the tab-delimited data file, then list each column with its type
bike_sharing_data <- read.delim(file = "bike_sharing_data.txt")
str(object = bike_sharing_data)
## 'data.frame':    17379 obs. of  13 variables:
##  $ datetime  : chr  "1/1/2011 0:00" "1/1/2011 1:00" "1/1/2011 2:00" "1/1/2011 3:00" ...
##  $ season    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ workingday: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ weather   : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ temp      : num  9.84 9.02 9.02 9.84 9.84 ...
##  $ atemp     : num  14.4 13.6 13.6 14.4 14.4 ...
##  $ humidity  : chr  "81" "80" "80" "75" ...
##  $ windspeed : num  0 0 0 0 0 ...
##  $ casual    : int  3 8 5 3 0 0 2 1 1 8 ...
##  $ registered: int  13 32 27 10 1 1 0 2 7 6 ...
##  $ count     : int  16 40 32 13 1 1 2 3 8 14 ...
##  $ sources   : chr  "ad campaign" "www.yahoo.com" "www.google.fi" "AD campaign" ...

summarize the data

# Per-column summary: quartiles for numeric columns, length/class for text
summary(object = bike_sharing_data)
##    datetime             season         holiday          workingday    
##  Length:17379       Min.   :1.000   Min.   :0.00000   Min.   :0.0000  
##  Class :character   1st Qu.:2.000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Mode  :character   Median :3.000   Median :0.00000   Median :1.0000  
##                     Mean   :2.502   Mean   :0.02877   Mean   :0.6827  
##                     3rd Qu.:3.000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##                     Max.   :4.000   Max.   :1.00000   Max.   :1.0000  
##     weather           temp           atemp         humidity        
##  Min.   :1.000   Min.   : 0.82   Min.   : 0.00   Length:17379      
##  1st Qu.:1.000   1st Qu.:13.94   1st Qu.:16.66   Class :character  
##  Median :1.000   Median :20.50   Median :24.24   Mode  :character  
##  Mean   :1.425   Mean   :20.38   Mean   :23.79                     
##  3rd Qu.:2.000   3rd Qu.:27.06   3rd Qu.:31.06                     
##  Max.   :4.000   Max.   :41.00   Max.   :50.00                     
##    windspeed          casual         registered        count    
##  Min.   : 0.000   Min.   :  0.00   Min.   :  0.0   Min.   :  1  
##  1st Qu.: 7.002   1st Qu.:  4.00   1st Qu.: 36.0   1st Qu.: 42  
##  Median :12.998   Median : 16.00   Median :116.0   Median :141  
##  Mean   :12.737   Mean   : 34.48   Mean   :152.5   Mean   :187  
##  3rd Qu.:16.998   3rd Qu.: 46.00   3rd Qu.:217.0   3rd Qu.:277  
##  Max.   :56.997   Max.   :367.00   Max.   :886.0   Max.   :977  
##    sources         
##  Length:17379      
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

preview the data

# Show the first six rows
head(x = bike_sharing_data, n = 6L)
##        datetime season holiday workingday weather temp  atemp humidity
## 1 1/1/2011 0:00      1       0          0       1 9.84 14.395       81
## 2 1/1/2011 1:00      1       0          0       1 9.02 13.635       80
## 3 1/1/2011 2:00      1       0          0       1 9.02 13.635       80
## 4 1/1/2011 3:00      1       0          0       1 9.84 14.395       75
## 5 1/1/2011 4:00      1       0          0       1 9.84 14.395       75
## 6 1/1/2011 5:00      1       0          0       2 9.84 12.880       75
##   windspeed casual registered count       sources
## 1    0.0000      3         13    16   ad campaign
## 2    0.0000      8         32    40 www.yahoo.com
## 3    0.0000      5         27    32 www.google.fi
## 4    0.0000      3         10    13   AD campaign
## 5    0.0000      0          1     1       Twitter
## 6    6.0032      0          1     1  www.bing.com
# Show the last six rows (original row numbers are kept)
tail(x = bike_sharing_data, n = 6L)
##               datetime season holiday workingday weather  temp  atemp humidity
## 17374 12/31/2012 18:00      1       0          1       2 10.66 13.635       48
## 17375 12/31/2012 19:00      1       0          1       2 10.66 12.880       60
## 17376 12/31/2012 20:00      1       0          1       2 10.66 12.880       60
## 17377 12/31/2012 21:00      1       0          1       1 10.66 12.880       60
## 17378 12/31/2012 22:00      1       0          1       1 10.66 13.635       56
## 17379 12/31/2012 23:00      1       0          1       1 10.66 13.635       65
##       windspeed casual registered count       sources
## 17374    8.9981     10        326   336 facebook page
## 17375   11.0014      5        206   211          <NA>
## 17376   11.0014      4        140   144   AD campaign
## 17377   11.0014      3         96    99   AD campaign
## 17378    8.9981      4         90    94   ad campaign
## 17379    8.9981      3         50    53        direct

create a frequency table of the sources column (NA entries are not counted)

# Count rows per distinct value of `sources`, largest count first.
# table() leaves NA out by default.
sort(
  table(bike_sharing_data[["sources"]]),
  decreasing = TRUE
)
## 
##      ad campaign    www.yahoo.com           direct     www.bing.com 
##             3472             1705             1610             1595 
## www.google.co.uk    facebook page      AD campaign          Twitter 
##             1553             1551              894              890 
##      Twitter          Ad Campaign    www.google.fi   www.google.com 
##              855              851              828              527 
##             blog 
##              494

indexing

# Row 8, every column (empty column index selects all columns)
bike_sharing_data[8, ]
##        datetime season holiday workingday weather temp atemp humidity windspeed
## 8 1/1/2011 7:00      1       0          0       1  8.2 12.88       86         0
##   casual registered count       sources
## 8      1          2     3 www.yahoo.com
# Single cell: row 8, column 9 (windspeed is the ninth column)
bike_sharing_data[8, 9]
## [1] 0

subsetting

# Keep only the rows whose season code equals 1
cold_season_data <- subset(
  x = bike_sharing_data,
  subset = season == 1
)
head(x = cold_season_data, n = 6L)
##        datetime season holiday workingday weather temp  atemp humidity
## 1 1/1/2011 0:00      1       0          0       1 9.84 14.395       81
## 2 1/1/2011 1:00      1       0          0       1 9.02 13.635       80
## 3 1/1/2011 2:00      1       0          0       1 9.02 13.635       80
## 4 1/1/2011 3:00      1       0          0       1 9.84 14.395       75
## 5 1/1/2011 4:00      1       0          0       1 9.84 14.395       75
## 6 1/1/2011 5:00      1       0          0       2 9.84 12.880       75
##   windspeed casual registered count       sources
## 1    0.0000      3         13    16   ad campaign
## 2    0.0000      8         32    40 www.yahoo.com
## 3    0.0000      5         27    32 www.google.fi
## 4    0.0000      3         10    13   AD campaign
## 5    0.0000      0          1     1       Twitter
## 6    6.0032      0          1     1  www.bing.com
# Rows that satisfy both conditions: weather code 1 and more than 500 rentals
good_weather_high_count <- subset(
  x = bike_sharing_data,
  subset = weather == 1 & count > 500
)
head(x = good_weather_high_count, n = 6L)
##             datetime season holiday workingday weather  temp  atemp humidity
## 2660 4/25/2011 17:00      2       0          1       1 28.70 32.575       54
## 2684 4/26/2011 17:00      2       0          1       1 27.88 31.820       61
## 2732 4/28/2011 17:00      2       0          1       1 26.24 31.060       47
## 2828  5/2/2011 17:00      2       0          1       1 27.06 31.060       65
## 2852  5/3/2011 17:00      2       0          1       1 28.70 32.575       54
## 2853  5/3/2011 18:00      2       0          1       1 28.70 32.575       48
##      windspeed casual registered count          sources
## 2660   19.9995     66        464   530 www.google.co.uk
## 2684   23.9994     69        477   546    www.yahoo.com
## 2732   23.9994     57        444   501    www.google.fi
## 2828   12.9980     65        472   537    www.yahoo.com
## 2852   31.0009     53        464   517      ad campaign
## 2853   27.9993     59        485   544      Twitter
# `humidity` was read in as character (see the str() output above), so
# `humidity > 90` on the raw column compares strings rather than numbers:
# a value such as "100" sorts before "90" and would be dropped, and
# non-numeric text can sort after "90" and be wrongly included.
# Convert to numeric before comparing. Entries that are not plain numbers
# become NA (R emits a coercion warning, which is left visible on purpose),
# and which() skips NA comparisons instead of returning all-NA rows.
high_humidity_data <- bike_sharing_data[
  which(as.numeric(bike_sharing_data$humidity) > 90),
]
head(high_humidity_data)
##           datetime season holiday workingday weather  temp  atemp humidity
## 23  1/1/2011 22:00      1       0          0       2 16.40 20.455       94
## 26   1/2/2011 1:00      1       0          0       2 18.04 21.970       94
## 28   1/2/2011 3:00      1       0          0       2 18.86 22.725       94
## 29   1/2/2011 4:00      1       0          0       2 18.86 22.725       94
## 170  1/8/2011 8:00      1       0          0       3  6.56  9.090       93
## 171  1/8/2011 9:00      1       0          0       3  6.56  9.090       93
##     windspeed casual registered count          sources
## 23    15.0013     11         17    28      Ad Campaign
## 26    16.9979      1         16    17      Twitter    
## 28    12.9980      2          4     6 www.google.co.uk
## 29    12.9980      2          1     3      ad campaign
## 170    7.0015      0         15    15   www.google.com
## 171    7.0015      0         20    20          Twitter